diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index d065c554bbf4..daf38f2523fc 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -61,39 +61,39 @@ jobs: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - - name: Test arrow-buffer with all features + - name: Test arrow-buffer run: cargo test -p arrow-buffer --all-features - - name: Test arrow-data with all features + - name: Test arrow-data run: cargo test -p arrow-data --all-features - - name: Test arrow-schema with all features + - name: Test arrow-schema run: cargo test -p arrow-schema --all-features - - name: Test arrow-array with all features + - name: Test arrow-array run: cargo test -p arrow-array --all-features - - name: Test arrow-select with all features + - name: Test arrow-select run: cargo test -p arrow-select --all-features - - name: Test arrow-cast with all features + - name: Test arrow-cast run: cargo test -p arrow-cast --all-features - - name: Test arrow-ipc with all features + - name: Test arrow-ipc run: cargo test -p arrow-ipc --all-features - - name: Test arrow-csv with all features + - name: Test arrow-csv run: cargo test -p arrow-csv --all-features - - name: Test arrow-json with all features + - name: Test arrow-json run: cargo test -p arrow-json --all-features - - name: Test arrow-avro with all features + - name: Test arrow-avro run: cargo test -p arrow-avro --all-features - - name: Test arrow-string with all features + - name: Test arrow-string run: cargo test -p arrow-string --all-features - - name: Test arrow-ord with all features + - name: Test arrow-ord run: cargo test -p arrow-ord --all-features - - name: Test arrow-arith with all features + - name: Test arrow-arith run: cargo test -p arrow-arith --all-features - - name: Test arrow-row with all features + - name: Test arrow-row run: cargo test -p arrow-row --all-features - - name: Test arrow-integration-test with all features + - name: Test arrow-integration-test run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features run: cargo test -p arrow - - name: Test arrow with all features except pyarrow + - name: Test arrow except pyarrow run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz - name: Run examples run: | @@ -163,37 +163,139 @@ jobs: uses: ./.github/actions/setup-builder - name: Setup Clippy run: rustup component add clippy - - name: Clippy arrow-buffer with all features - run: cargo clippy -p arrow-buffer --all-targets --all-features -- -D warnings - - name: Clippy arrow-data with all features - run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings - - name: Clippy arrow-schema with all features - run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings - - name: Clippy arrow-array with all features - run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings - - name: Clippy arrow-select with all features - run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - - name: Clippy arrow-cast with all features - run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings - - name: Clippy arrow-ipc with all features - run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings - - name: Clippy arrow-csv with all features - run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - - name: Clippy arrow-json with all features - run: cargo clippy -p arrow-json 
--all-targets --all-features -- -D warnings - - name: Clippy arrow-avro with all features - run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings - - name: Clippy arrow-string with all features - run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - - name: Clippy arrow-ord with all features - run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings - - name: Clippy arrow-arith with all features - run: cargo clippy -p arrow-arith --all-targets --all-features -- -D warnings - - name: Clippy arrow-row with all features - run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - - name: Clippy arrow with all features - run: cargo clippy -p arrow --all-features --all-targets -- -D warnings - - name: Clippy arrow-integration-test with all features - run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - - name: Clippy arrow-integration-testing with all features - run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings + - name: Clippy arrow-buffer + run: | + mod=arrow-buffer + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-data + run: | + mod=arrow-data + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-schema + run: | + mod=arrow-schema + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-array + run: | + mod=arrow-array + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-select + run: | + mod=arrow-select + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-cast + run: | + mod=arrow-cast + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ipc + run: | + mod=arrow-ipc + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. 
+ cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-csv + run: | + mod=arrow-csv + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-json + run: | + mod=arrow-json + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-avro + run: | + mod=arrow-avro + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-string + run: | + mod=arrow-string + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ord + run: | + mod=arrow-ord + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-arith + run: | + mod=arrow-arith + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-row + run: | + mod=arrow-row + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow + run: | + mod=arrow + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-test + run: | + mod=arrow-integration-test + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. 
+ cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-testing + run: | + mod=arrow-integration-testing + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 59501b5addfe..93f809aaabd4 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -141,6 +141,7 @@ jobs: echo "LOCALSTACK_CONTAINER=$(docker run -d -p 4566:4566 localstack/localstack:4.0.3)" >> $GITHUB_ENV echo "EC2_METADATA_CONTAINER=$(docker run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2)" >> $GITHUB_ENV aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket + aws --endpoint-url=http://localhost:4566 s3api create-bucket --bucket test-object-lock --object-lock-enabled-for-bucket aws --endpoint-url=http://localhost:4566 dynamodb create-table --table-name test-table --key-schema AttributeName=path,KeyType=HASH AttributeName=etag,KeyType=RANGE --attribute-definitions AttributeName=path,AttributeType=S AttributeName=etag,AttributeType=S --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 KMS_KEY=$(aws --endpoint-url=http://localhost:4566 kms create-key --description "test key") diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index f1ce5fa857d2..2b288445c74b 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::*; -use crate::types::Int32Type; use crate::StructArray; +use crate::{ + builder::*, + types::{Int16Type, Int32Type, Int64Type, Int8Type}, +}; use arrow_buffer::NullBufferBuilder; use arrow_schema::{DataType, Fields, IntervalUnit, SchemaBuilder, TimeUnit}; use std::sync::Arc; @@ -290,29 +292,42 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilder> { t => panic!("The field of Map data type {t:?} should has a child Struct field"), }, DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), - DataType::Dictionary(key_type, value_type) if **key_type == DataType::Int32 => { - match &**value_type { - DataType::Utf8 => { - let dict_builder: StringDictionaryBuilder<Int32Type> = - StringDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::LargeUtf8 => { - let dict_builder: LargeStringDictionaryBuilder<Int32Type> = - LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::Binary => { - let dict_builder: BinaryDictionaryBuilder<Int32Type> = - BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::LargeBinary => { - let dict_builder: LargeBinaryDictionaryBuilder<Int32Type> = - LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) + t @ DataType::Dictionary(key_type, value_type) => { + macro_rules!
dict_builder { + ($key_type:ty) => { + match &**value_type { + DataType::Utf8 => { + let dict_builder: StringDictionaryBuilder<$key_type> = + StringDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::LargeUtf8 => { + let dict_builder: LargeStringDictionaryBuilder<$key_type> = + LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::Binary => { + let dict_builder: BinaryDictionaryBuilder<$key_type> = + BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::LargeBinary => { + let dict_builder: LargeBinaryDictionaryBuilder<$key_type> = + LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + t => panic!("Dictionary value type {t:?} is not currently supported"), + } + }; + } + match &**key_type { + DataType::Int8 => dict_builder!(Int8Type), + DataType::Int16 => dict_builder!(Int16Type), + DataType::Int32 => dict_builder!(Int32Type), + DataType::Int64 => dict_builder!(Int64Type), + _ => { + panic!("Data type {t:?} with key type {key_type:?} is not currently supported") } - t => panic!("Unsupported dictionary value type {t:?} is not currently supported"), } } t => panic!("Data type {t:?} is not currently supported"), @@ -430,12 +445,14 @@ impl StructBuilder { #[cfg(test)] mod tests { + use std::any::type_name; + use super::*; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::Field; - use crate::array::Array; + use crate::{array::Array, types::ArrowDictionaryKeyType}; #[test] fn test_struct_array_builder() { @@ -690,10 +707,31 @@ mod tests { } #[test] - fn test_struct_array_builder_from_dictionary_type() { + fn test_struct_array_builder_from_dictionary_type_int8_key() { + test_struct_array_builder_from_dictionary_type_inner::<Int8Type>(DataType::Int8); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int16_key() { + test_struct_array_builder_from_dictionary_type_inner::<Int16Type>(DataType::Int16); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int32_key() { + test_struct_array_builder_from_dictionary_type_inner::<Int32Type>(DataType::Int32); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int64_key() { + test_struct_array_builder_from_dictionary_type_inner::<Int64Type>(DataType::Int64); + } + + fn test_struct_array_builder_from_dictionary_type_inner<K: ArrowDictionaryKeyType>( + key_type: DataType, + ) { let dict_field = Field::new( "f1", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8)), false, ); let fields = vec![dict_field.clone()]; @@ -701,10 +739,14 @@ mod tests { let cloned_dict_field = dict_field.clone(); let expected_child_dtype = dict_field.data_type(); let mut struct_builder = StructBuilder::from_fields(vec![cloned_dict_field], 5); - struct_builder - .field_builder::<StringDictionaryBuilder<Int32Type>>(0) - .expect("Builder should be StringDictionaryBuilder") - .append_value("dict string"); + let Some(dict_builder) = struct_builder.field_builder::<StringDictionaryBuilder<K>>(0) + else { + panic!( + "Builder should be StringDictionaryBuilder<{}>", + type_name::<K>() + ) + }; + dict_builder.append_value("dict string"); struct_builder.append(true); let array = struct_builder.finish(); @@ -714,13 +756,15 @@ mod tests { } #[test] - #[should_panic(expected = "Data type Dictionary(Int16, Utf8) is not currently supported")] + #[should_panic( + expected = "Data type Dictionary(UInt64, Utf8) with key type UInt64 is not currently supported" + )] fn
test_struct_array_builder_from_schema_unsupported_type() { let fields = vec![ - Field::new("f1", DataType::Int16, false), + Field::new("f1", DataType::UInt64, false), Field::new( "f2", - DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), false, ), ]; @@ -729,7 +773,7 @@ mod tests { #[test] - #[should_panic(expected = "Unsupported dictionary value type Int32 is not currently supported")] + #[should_panic(expected = "Dictionary value type Int32 is not currently supported")] fn test_struct_array_builder_from_dict_with_unsupported_value_type() { let fields = vec![Field::new( "f1", diff --git a/arrow-array/src/ffi_stream.rs b/arrow-array/src/ffi_stream.rs index 0d4a3f3b39a7..3d4e89e80b89 100644 --- a/arrow-array/src/ffi_stream.rs +++ b/arrow-array/src/ffi_stream.rs @@ -379,21 +379,6 @@ impl RecordBatchReader for ArrowArrayStreamReader { } } -/// Exports a record batch reader to raw pointer of the C Stream Interface provided by the consumer. -/// -/// # Safety -/// Assumes that the pointer represents valid C Stream Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. -#[deprecated(since = "50.0.0", note = "Use FFI_ArrowArrayStream::new")] -pub unsafe fn export_reader_into_raw( - reader: Box<dyn RecordBatchReader + Send>, - out_stream: *mut FFI_ArrowArrayStream, -) { - let stream = FFI_ArrowArrayStream::new(reader); - - std::ptr::write_unaligned(out_stream, stream); -} - #[cfg(test)] mod tests { use super::*; diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index dc0de7736bd9..d0c8ffa39783 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -20,7 +20,7 @@ use std::fmt::Debug; use std::ptr::NonNull; use std::sync::Arc; -use crate::alloc::{Allocation, Deallocation, ALIGNMENT}; +use crate::alloc::{Allocation, Deallocation}; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; use crate::BufferBuilder; use crate::{bit_util, bytes::Bytes, native::ArrowNativeType}; @@ -99,26 +99,6 @@ impl Buffer { buffer.into() } - /// Creates a buffer from an existing aligned memory region (must already be byte-aligned), this - /// `Buffer` will free this piece of memory when dropped. - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. - #[deprecated(since = "50.0.0", note = "Use Buffer::from_vec")] - pub unsafe fn from_raw_parts(ptr: NonNull<u8>, len: usize, capacity: usize) -> Self { - assert!(len <= capacity); - let layout = Layout::from_size_align(capacity, ALIGNMENT).unwrap(); - Buffer::build_with_arguments(ptr, len, Deallocation::Standard(layout)) - } - /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting /// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero. /// @@ -322,6 +302,8 @@ impl Buffer { /// Returns `MutableBuffer` for mutating the buffer if this buffer is not shared.
/// Returns `Err` if this is shared or its allocation is from an external source or /// it is not allocated with alignment [`ALIGNMENT`] + /// + /// [`ALIGNMENT`]: crate::alloc::ALIGNMENT pub fn into_mutable(self) -> Result<MutableBuffer, Self> { let ptr = self.ptr; let length = self.length; diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 4729779c43ba..8823924eb55b 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -35,7 +35,6 @@ bench = false [dependencies] arrow-array = { workspace = true } -arrow-buffer = { workspace = true } arrow-cast = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } @@ -45,6 +44,7 @@ lazy_static = { version = "1.4", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] +arrow-buffer = { workspace = true } tempfile = "3.3" futures = "0.3" tokio = { version = "1.27", default-features = false, features = ["io-util"] } diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 9bdb80ef31ca..d3d518316397 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -241,7 +241,7 @@ pub struct Format { } impl Format { - /// Specify whether the CSV file has a header, defaults to `true` + /// Specify whether the CSV file has a header, defaults to `false` /// /// When `true`, the first row of the CSV file is treated as a header row pub fn with_header(mut self, has_header: bool) -> Self { diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 012d3947f02b..fbb295036a9b 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -43,11 +43,11 @@ base64 = { version = "0.22", default-features = false, features = ["std"] } bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } -paste = { version = "1.0" } +paste = { version = "1.0", optional = true } prost = { version = "0.13.1", default-features = false, features = ["prost-derive"] } # For Timestamp type prost-types = { version = "0.13.1", default-features = false } -tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"], optional = true } tonic = { version = "0.12.3", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies @@ -61,11 +61,10 @@ all-features = true [features] default = [] -flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell"] +flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell", "dep:paste"] tls = ["tonic/tls"] - # Enable CLI tools -cli = ["dep:anyhow", "arrow-array/chrono-tz", "arrow-cast/prettyprint", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["arrow-array/chrono-tz", "arrow-cast/prettyprint", "tonic/tls-webpki-roots", "dep:anyhow", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber"] [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } @@ -75,6 +74,9 @@ http-body = "1.0.0" hyper-util = "0.1" pin-project-lite = "0.2" tempfile = "3.3" +tracing-log = { version = "0.2" } +tracing-subscriber = { version = "0.3.1", default-features = false,
features = ["ansi", "env-filter", "fmt"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } tokio-stream = { version = "0.1", features = ["net"] } tower = { version = "0.5.0", features = ["util"] } uuid = { version = "1.10.0", features = ["v4"] } diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index ccd32b433cf1..6358227a8912 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,5 +32,5 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -prost-build = { version = "=0.13.3", default-features = false } +prost-build = { version = "=0.13.4", default-features = false } tonic-build = { version = "=0.12.3", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 9f18416c06ec..81fa99faeb93 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -38,6 +38,8 @@ //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html #![allow(rustdoc::invalid_html_tags)] #![warn(missing_docs)] +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 0ebbc05a8b83..8654b4b92734 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -36,18 +36,17 @@ logging = ["tracing-subscriber"] [dependencies] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json", "ffi"] } arrow-flight = { path = "../arrow-flight", default-features = false } -arrow-buffer = { path = "../arrow-buffer", default-features = false } arrow-integration-test = { path = "../arrow-integration-test", default-features = false } -async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } prost = { version = "0.13", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } -tokio = { version = "1.0", default-features = false } +tokio = { version = "1.0", default-features = false, features = [ "rt-multi-thread"] } tonic = { version = "0.12", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } [dev-dependencies] +arrow-buffer = { path = "../arrow-buffer", default-features = false } tempfile = { version = "3", default-features = false } diff --git a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs index 3e027faef91f..661f0a047db4 100644 --- a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs +++ b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. 
+// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::Result; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::StreamWriter; diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index cc3dd2110e36..6a901cc63bab 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; diff --git a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs index 07ac5c7ddd42..8b4bb332781c 100644 --- a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs +++ b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use std::io; use arrow::error::Result; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs index b8bbb952837b..0d16fe3b403f 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-client.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-client.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_client_scenarios; use clap::Parser; type Error = Box<dyn std::error::Error + Send + Sync + 'static>; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-server.rs b/arrow-integration-testing/src/bin/flight-test-integration-server.rs index 5310d07d4f8e..94be71309799 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-server.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-server.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_server_scenarios; use clap::Parser; diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index c8ce01e9f13b..e669690ef4f5 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -17,6 +17,8 @@ //!
Common code used in the integration test binaries +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] #![warn(missing_docs)] use serde_json::Value; diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index a2894c020296..8d74d2f97d72 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -39,7 +39,7 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } -half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] +half = { version = "2.1", default-features = false, features = ["num-traits"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 3754afb4dbc6..90d99684d265 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -33,12 +33,6 @@ name = "arrow_row" path = "src/lib.rs" bench = false -[target.'cfg(target_arch = "wasm32")'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } - [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 628d4a683cac..1e1f9fbde0e4 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -47,3 +47,8 @@ features = ["ffi"] [dev-dependencies] serde_json = "1.0" bincode = { version = "1.3.3", default-features = false } +criterion = { version = "0.5", default-features = false } + +[[bench]] +name = "ffi" +harness = false \ No newline at end of file diff --git a/arrow-schema/benches/ffi.rs b/arrow-schema/benches/ffi.rs new file mode 100644 index 000000000000..1285acb883ea --- /dev/null +++ b/arrow-schema/benches/ffi.rs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_schema::ffi::FFI_ArrowSchema; +use arrow_schema::{DataType, Field}; +use criterion::*; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let fields = vec![ + Arc::new(Field::new("c1", DataType::Utf8, false)), + Arc::new(Field::new("c2", DataType::Utf8, false)), + Arc::new(Field::new("c3", DataType::Utf8, false)), + Arc::new(Field::new("c4", DataType::Utf8, false)), + Arc::new(Field::new("c5", DataType::Utf8, false)), + ]; + let data_type = DataType::Struct(fields.into()); + c.bench_function("ffi_arrow_schema_try_from", |b| { + b.iter(|| FFI_ArrowSchema::try_from(&data_type)); + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 70650d769cf6..08b1e6d0b186 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -38,6 +38,7 @@ use crate::{ ArrowError, DataType, Field, FieldRef, IntervalUnit, Schema, TimeUnit, UnionFields, UnionMode, }; use bitflags::bitflags; +use std::borrow::Cow; use std::sync::Arc; use std::{ collections::HashMap, @@ -685,57 +686,59 @@ impl TryFrom<&DataType> for FFI_ArrowSchema { } } -fn get_format_string(dtype: &DataType) -> Result<String, ArrowError> { +fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError> { match dtype { - DataType::Null => Ok("n".to_string()), - DataType::Boolean => Ok("b".to_string()), - DataType::Int8 => Ok("c".to_string()), - DataType::UInt8 => Ok("C".to_string()), - DataType::Int16 => Ok("s".to_string()), - DataType::UInt16 => Ok("S".to_string()), - DataType::Int32 => Ok("i".to_string()), - DataType::UInt32 => Ok("I".to_string()), - DataType::Int64 => Ok("l".to_string()), - DataType::UInt64 => Ok("L".to_string()), - DataType::Float16 => Ok("e".to_string()), - DataType::Float32 => Ok("f".to_string()), - DataType::Float64 => Ok("g".to_string()), - DataType::BinaryView => Ok("vz".to_string()), - DataType::Binary => Ok("z".to_string()), - DataType::LargeBinary => Ok("Z".to_string()), - DataType::Utf8View => Ok("vu".to_string()), - DataType::Utf8 => Ok("u".to_string()), - DataType::LargeUtf8 => Ok("U".to_string()), - DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")), - DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")), - DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")), - DataType::Decimal256(precision, scale) => Ok(format!("d:{precision},{scale},256")), - DataType::Date32 => Ok("tdD".to_string()), - DataType::Date64 => Ok("tdm".to_string()), - DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()), - DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".to_string()), - DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".to_string()), - DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".to_string()), - DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".to_string()), - DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".to_string()), - DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".to_string()), - DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".to_string()), - DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{tz}")), - DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{tz}")), - DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{tz}")), - DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{tz}")), - DataType::Duration(TimeUnit::Second) => Ok("tDs".to_string()), - DataType::Duration(TimeUnit::Millisecond) =>
Ok("tDm".to_string()), - DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()), - DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".to_string()), - DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".to_string()), - DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".to_string()), - DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".to_string()), - DataType::List(_) => Ok("+l".to_string()), - DataType::LargeList(_) => Ok("+L".to_string()), - DataType::Struct(_) => Ok("+s".to_string()), - DataType::Map(_, _) => Ok("+m".to_string()), - DataType::RunEndEncoded(_, _) => Ok("+r".to_string()), + DataType::Null => Ok("n".into()), + DataType::Boolean => Ok("b".into()), + DataType::Int8 => Ok("c".into()), + DataType::UInt8 => Ok("C".into()), + DataType::Int16 => Ok("s".into()), + DataType::UInt16 => Ok("S".into()), + DataType::Int32 => Ok("i".into()), + DataType::UInt32 => Ok("I".into()), + DataType::Int64 => Ok("l".into()), + DataType::UInt64 => Ok("L".into()), + DataType::Float16 => Ok("e".into()), + DataType::Float32 => Ok("f".into()), + DataType::Float64 => Ok("g".into()), + DataType::BinaryView => Ok("vz".into()), + DataType::Binary => Ok("z".into()), + DataType::LargeBinary => Ok("Z".into()), + DataType::Utf8View => Ok("vu".into()), + DataType::Utf8 => Ok("u".into()), + DataType::LargeUtf8 => Ok("U".into()), + DataType::FixedSizeBinary(num_bytes) => Ok(Cow::Owned(format!("w:{num_bytes}"))), + DataType::FixedSizeList(_, num_elems) => Ok(Cow::Owned(format!("+w:{num_elems}"))), + DataType::Decimal128(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale}"))), + DataType::Decimal256(precision, scale) => { + Ok(Cow::Owned(format!("d:{precision},{scale},256"))) + } + DataType::Date32 => Ok("tdD".into()), + DataType::Date64 => Ok("tdm".into()), + DataType::Time32(TimeUnit::Second) => Ok("tts".into()), + DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".into()), + DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".into()), + DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".into()), + DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".into()), + DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".into()), + DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".into()), + DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".into()), + DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(Cow::Owned(format!("tss:{tz}"))), + DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(Cow::Owned(format!("tsm:{tz}"))), + DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(Cow::Owned(format!("tsu:{tz}"))), + DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(Cow::Owned(format!("tsn:{tz}"))), + DataType::Duration(TimeUnit::Second) => Ok("tDs".into()), + DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".into()), + DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".into()), + DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".into()), + DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".into()), + DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".into()), + DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()), + DataType::List(_) => Ok("+l".into()), + DataType::LargeList(_) => Ok("+L".into()), + DataType::Struct(_) => Ok("+s".into()), + DataType::Map(_, _) => Ok("+m".into()), + DataType::RunEndEncoded(_, _) => Ok("+r".into()), DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type), DataType::Union(fields, mode) => { let formats = fields @@ -743,8 +746,8 @@ fn 
get_format_string(dtype: &DataType) -> Result<String, ArrowError> { .map(|(t, _)| t.to_string()) .collect::<Vec<_>>(); match mode { - UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))), - UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))), + UnionMode::Dense => Ok(Cow::Owned(format!("{}:{}", "+ud", formats.join(",")))), + UnionMode::Sparse => Ok(Cow::Owned(format!("{}:{}", "+us", formats.join(",")))), } } other => Err(ArrowError::CDataInterface(format!( diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index b4c4d4e59ecb..904b933cd299 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -18,7 +18,7 @@ use std::ops::Deref; use std::sync::Arc; -use crate::{ArrowError, DataType, Field, FieldRef, SchemaBuilder}; +use crate::{ArrowError, DataType, Field, FieldRef}; /// A cheaply cloneable, owned slice of [`FieldRef`] /// @@ -256,33 +256,6 @@ impl Fields { .collect(); Ok(filtered) } - - /// Remove a field by index and return it. - /// - /// # Panic - /// - /// Panics if `index` is out of bounds. - /// - /// # Example - /// ``` - /// use arrow_schema::{DataType, Field, Fields}; - /// let mut fields = Fields::from(vec![ - /// Field::new("a", DataType::Boolean, false), - /// Field::new("b", DataType::Int8, false), - /// Field::new("c", DataType::Utf8, false), - /// ]); - /// assert_eq!(fields.len(), 3); - /// assert_eq!(fields.remove(1), Field::new("b", DataType::Int8, false).into()); - /// assert_eq!(fields.len(), 2); - /// ``` - #[deprecated(since = "50.0.0", note = "Use SchemaBuilder::remove")] - #[doc(hidden)] - pub fn remove(&mut self, index: usize) -> FieldRef { - let mut builder = SchemaBuilder::from(Fields::from(&*self.0)); - let field = builder.remove(index); - *self = builder.finish().fields; - field - } } impl Default for Fields { diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index c5c22b527136..47c22e2a9318 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -434,33 +434,6 @@ impl Schema { .iter() .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()) } - - /// Remove field by index and return it. Recommend to use [`SchemaBuilder`] - /// if you are looking to remove multiple columns, as this will save allocations. - /// - /// # Panic - /// - /// Panics if `index` is out of bounds.
- /// - /// # Example - /// - /// ``` - /// use arrow_schema::{DataType, Field, Schema}; - /// let mut schema = Schema::new(vec![ - /// Field::new("a", DataType::Boolean, false), - /// Field::new("b", DataType::Int8, false), - /// Field::new("c", DataType::Utf8, false), - /// ]); - /// assert_eq!(schema.fields.len(), 3); - /// assert_eq!(schema.remove(1), Field::new("b", DataType::Int8, false).into()); - /// assert_eq!(schema.fields.len(), 2); - /// ``` - #[deprecated(since = "50.0.0", note = "Use SchemaBuilder::remove")] - #[doc(hidden)] - #[allow(deprecated)] - pub fn remove(&mut self, index: usize) -> FieldRef { - self.fields.remove(index) - } } impl fmt::Display for Schema { diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 32b2384afe7b..8860cd61c5b3 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -56,8 +56,6 @@ arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.23", default-features = false, optional = true } -chrono = { workspace = true, optional = true } - [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] @@ -72,7 +70,7 @@ prettyprint = ["arrow-cast/prettyprint"] # not the core arrow code itself. Be aware that `rand` must be kept as # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. -test_utils = ["rand", "dep:chrono"] +test_utils = ["dep:rand"] pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index ec7990d3d764..5c4fcff13dee 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -250,6 +250,9 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("cast decimal128 to decimal128 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5))) }); + c.bench_function("cast decimal128 to decimal128 512 lower precision", |b| { + b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(6, 5))) + }); c.bench_function("cast decimal128 to decimal256 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal256(50, 5))) }); diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a04733625135..bcc8e0b92243 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -77,6 +77,8 @@ http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" regex = "1.11.1" +# The "gzip" feature for reqwest is enabled for an integration test. 
+reqwest = { version = "0.12", features = ["gzip"] } http = "1.1.0" [[test]] diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 47249685b7bb..81015e82b39c 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -29,7 +29,7 @@ use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::s3::{ CompleteMultipartUpload, CompleteMultipartUploadResult, CopyPartResult, - InitiateMultipartUploadResult, ListResponse, + InitiateMultipartUploadResult, ListResponse, PartMetadata, }; use crate::client::GetOptionsExt; use crate::multipart::PartId; @@ -62,6 +62,7 @@ use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; const SHA256_CHECKSUM: &str = "x-amz-checksum-sha256"; const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; +const ALGORITHM: &str = "x-amz-checksum-algorithm"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] @@ -390,10 +391,9 @@ impl<'a> Request<'a> { let payload_sha256 = sha256.finish(); if let Some(Checksum::SHA256) = self.config.checksum { - self.builder = self.builder.header( - "x-amz-checksum-sha256", - BASE64_STANDARD.encode(payload_sha256), - ); + self.builder = self + .builder + .header(SHA256_CHECKSUM, BASE64_STANDARD.encode(payload_sha256)); } self.payload_sha256 = Some(payload_sha256); } @@ -617,8 +617,15 @@ impl S3Client { location: &Path, opts: PutMultipartOpts, ) -> Result<MultipartId> { - let response = self - .request(Method::POST, location) + let mut request = self.request(Method::POST, location); + if let Some(algorithm) = self.config.checksum { + match algorithm { + Checksum::SHA256 => { + request = request.header(ALGORITHM, "SHA256"); + } + } + } + let response = request .query(&[("uploads", "")]) .with_encryption_headers() .with_attributes(opts.attributes) @@ -669,8 +676,13 @@ impl S3Client { request = request.with_encryption_headers(); } let response = request.send().await?; + let checksum_sha256 = response + .headers() + .get(SHA256_CHECKSUM) + .and_then(|v| v.to_str().ok()) + .map(|v| v.to_string()); - let content_id = match is_copy { + let e_tag = match is_copy { false => get_etag(response.headers()).context(MetadataSnafu)?, true => { let response = response @@ -682,6 +694,17 @@ impl S3Client { response.e_tag } }; + + let content_id = if self.config.checksum == Some(Checksum::SHA256) { + let meta = PartMetadata { + e_tag, + checksum_sha256, + }; + quick_xml::se::to_string(&meta).unwrap() + } else { + e_tag + }; + Ok(PartId { content_id }) } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index d7c8c9b546eb..a7f9264a6815 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -493,6 +493,66 @@ mod tests { const NON_EXISTENT_NAME: &str = "nonexistentname"; + #[tokio::test] + async fn write_multipart_file_with_signature() { + maybe_skip_integration!(); + + let store = AmazonS3Builder::from_env() + .with_checksum_algorithm(Checksum::SHA256) + .build() + .unwrap(); + + let str = "test.bin"; + let path = Path::parse(str).unwrap(); + let opts = PutMultipartOpts::default(); + let mut upload = store.put_multipart_opts(&path, opts).await.unwrap(); + + upload + .put_part(PutPayload::from(vec![0u8; 10_000_000])) + .await + .unwrap(); + upload + .put_part(PutPayload::from(vec![0u8; 5_000_000])) + .await + .unwrap(); + + let res = upload.complete().await.unwrap(); + assert!(res.e_tag.is_some(), "Should have valid etag"); + + store.delete(&path).await.unwrap(); + } + + #[tokio::test] + async fn
write_multipart_file_with_signature_object_lock() { + maybe_skip_integration!(); + + let bucket = "test-object-lock"; + let store = AmazonS3Builder::from_env() + .with_bucket_name(bucket) + .with_checksum_algorithm(Checksum::SHA256) + .build() + .unwrap(); + + let str = "test.bin"; + let path = Path::parse(str).unwrap(); + let opts = PutMultipartOpts::default(); + let mut upload = store.put_multipart_opts(&path, opts).await.unwrap(); + + upload + .put_part(PutPayload::from(vec![0u8; 10_000_000])) + .await + .unwrap(); + upload + .put_part(PutPayload::from(vec![0u8; 5_000_000])) + .await + .unwrap(); + + let res = upload.complete().await.unwrap(); + assert!(res.e_tag.is_some(), "Should have valid etag"); + + store.delete(&path).await.unwrap(); + } #[tokio::test] async fn s3_test() { maybe_skip_integration!(); diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 76d1c1f22f58..1b7ce5aa7a78 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -671,6 +671,10 @@ impl ClientOptions { builder = builder.danger_accept_invalid_certs(true) } + // Reqwest will remove the `Content-Length` header if it is configured to + // transparently decompress the body via the non-default `gzip` feature. + builder = builder.no_gzip(); + builder .https_only(!self.allow_http.get()?) .build() diff --git a/object_store/src/client/s3.rs b/object_store/src/client/s3.rs index dba752cb1251..7fe956b2376e 100644 --- a/object_store/src/client/s3.rs +++ b/object_store/src/client/s3.rs @@ -106,14 +106,32 @@ pub(crate) struct CompleteMultipartUpload { pub part: Vec<MultipartPart>, } +#[derive(Serialize, Deserialize)] +pub(crate) struct PartMetadata { + pub e_tag: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_sha256: Option<String>, +} impl From<Vec<PartId>> for CompleteMultipartUpload { fn from(value: Vec<PartId>) -> Self { let part = value .into_iter() .enumerate() - .map(|(part_number, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, + .map(|(part_idx, part)| { + let md = match quick_xml::de::from_str::<PartMetadata>(&part.content_id) { + Ok(md) => md, + // fallback to old way + Err(_) => PartMetadata { + e_tag: part.content_id.clone(), + checksum_sha256: None, + }, + }; + MultipartPart { + e_tag: md.e_tag, + part_number: part_idx + 1, + checksum_sha256: md.checksum_sha256, + } }) .collect(); Self { part } @@ -126,6 +144,9 @@ pub(crate) struct MultipartPart { pub e_tag: String, #[serde(rename = "PartNumber")] pub part_number: usize, + #[serde(rename = "ChecksumSHA256")] + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_sha256: Option<String>, } #[derive(Debug, Deserialize)] diff --git a/object_store/tests/http.rs new file mode 100644 index 000000000000..a9b3145bb660 --- /dev/null +++ b/object_store/tests/http.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the HTTP store implementation + +#[cfg(feature = "http")] +use object_store::{http::HttpBuilder, path::Path, GetOptions, GetRange, ObjectStore}; + +/// Tests that even when reqwest has the `gzip` feature enabled, the HTTP store +/// does not error on a missing `Content-Length` header. +#[tokio::test] +#[cfg(feature = "http")] +async fn test_http_store_gzip() { + let http_store = HttpBuilder::new() + .with_url("https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main") + .build() + .unwrap(); + + let _ = http_store + .get_opts( + &Path::parse("LICENSE.txt").unwrap(), + GetOptions { + range: Some(GetRange::Bounded(0..100)), + ..Default::default() + }, + ) + .await + .unwrap(); +} diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8d0be5f9f8c4..16de0ba78989 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -574,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// anticipated encoded size. #[cfg(feature = "arrow")] pub(crate) fn get_estimated_total_bytes(&self) -> u64 { - self.column_metrics.total_bytes_written + self.data_pages + .iter() + .map(|page| page.data().len() as u64) + .sum::<u64>() + + self.column_metrics.total_bytes_written + self.encoder.estimated_data_page_size() as u64 + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64 } @@ -3422,6 +3426,26 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); } + #[test] + #[cfg(feature = "arrow")] + fn test_column_writer_get_estimated_total_bytes() { + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = get_test_column_writer::<Int32Type>(page_writer, 0, 0, props); + assert_eq!(writer.get_estimated_total_bytes(), 0); + + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_one_page = writer.get_estimated_total_bytes(); + assert_eq!(size_with_one_page, 20); + + writer.write_batch(&[5, 6, 7, 8], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_two_pages = writer.get_estimated_total_bytes(); + // different pages have different compressed lengths + assert_eq!(size_with_two_pages, 20 + 21); + } + fn write_multiple_pages<T: DataType>( column_descr: &Arc<ColumnDescr>, pages: &[&[Option<T::T>]],