From db9084e74cc5869727e9e51ca0ddd5a6d386c271 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 21 Dec 2022 09:38:30 +0000 Subject: [PATCH] Split out arrow-row (#2594) (#3375) * Split out arrow-row (#2594) * Fix CI * Fix doc * More SortOptions to arrow_schema --- .github/workflows/arrow.yml | 5 ++ .github/workflows/integration.yml | 3 +- Cargo.toml | 1 + arrow-ord/src/sort.rs | 21 +------ arrow-row/Cargo.toml | 61 ++++++++++++++++++ .../src/row => arrow-row/src}/dictionary.rs | 9 ++- {arrow/src/row => arrow-row/src}/fixed.rs | 8 +-- {arrow/src/row => arrow-row/src}/interner.rs | 0 arrow/src/row/mod.rs => arrow-row/src/lib.rs | 62 ++++++++----------- {arrow/src/row => arrow-row/src}/list.rs | 5 +- {arrow/src/row => arrow-row/src}/variable.rs | 9 ++- arrow-schema/src/lib.rs | 19 ++++++ arrow/Cargo.toml | 2 + arrow/src/lib.rs | 2 +- dev/release/README.md | 3 +- 15 files changed, 134 insertions(+), 76 deletions(-) create mode 100644 arrow-row/Cargo.toml rename {arrow/src/row => arrow-row/src}/dictionary.rs (97%) rename {arrow/src/row => arrow-row/src}/fixed.rs (98%) rename {arrow/src/row => arrow-row/src}/interner.rs (100%) rename arrow/src/row/mod.rs => arrow-row/src/lib.rs (98%) rename {arrow/src/row => arrow-row/src}/list.rs (98%) rename {arrow/src/row => arrow-row/src}/variable.rs (97%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 458e0e0a149..e0db2c08812 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -35,6 +35,7 @@ on: - arrow-ipc/** - arrow-json/** - arrow-ord/** + - arrow-row/** - arrow-schema/** - arrow-select/** - arrow-string/** @@ -76,6 +77,8 @@ jobs: run: cargo test -p arrow-string --all-features - name: Test arrow-ord with all features except SIMD run: cargo test -p arrow-ord --features dyn_cmp_dict + - name: Test arrow-row with all features + run: cargo test -p arrow-row --all-features - name: Test arrow-integration-test with all 
features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -196,5 +199,7 @@ jobs: run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow-ord with all features except SIMD run: cargo clippy -p arrow-ord --all-targets --features dyn_cmp_dict -- -D warnings + - name: Clippy arrow-row with all features + run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 526106bfe7c..0975c11d52f 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -33,11 +33,12 @@ on: - arrow-integration-test/** - arrow-integration-testing/** - arrow-ipc/** - - arrow-ord/** - arrow-json/** + - arrow-ord/** - arrow-pyarrow-integration-testing/** - arrow-schema/** - arrow-select/** + - arrow-row/** - arrow-string/** - arrow/** diff --git a/Cargo.toml b/Cargo.toml index c123106c6f7..fb072f7d346 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ members = [ "arrow-ipc", "arrow-json", "arrow-ord", + "arrow-row", "arrow-schema", "arrow-select", "arrow-string", diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index a2035988fe2..d13a7a03de9 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -27,6 +27,8 @@ use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use arrow_select::take::take; use std::cmp::Ordering; +pub use arrow_schema::SortOptions; + /// Sort the `ArrayRef` using `SortOptions`. /// /// Performs a sort on values and indices. 
Nulls are ordered according @@ -366,25 +368,6 @@ pub fn sort_to_indices( }) } -/// Options that define how sort kernels should behave -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct SortOptions { - /// Whether to sort in descending order - pub descending: bool, - /// Whether to sort nulls first - pub nulls_first: bool, -} - -impl Default for SortOptions { - fn default() -> Self { - Self { - descending: false, - // default to nulls first to match spark's behavior - nulls_first: true, - } - } -} - /// Sort boolean values /// /// when a limit is present, the sort is pair-comparison based as k-select might be more efficient, diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml new file mode 100644 index 00000000000..4741c9d5840 --- /dev/null +++ b/arrow-row/Cargo.toml @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-row" +version = "29.0.0" +description = "Arrow row format" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_row" +path = "src/lib.rs" +bench = false + +[target.'cfg(target_arch = "wasm32")'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } + +[dependencies] +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } + +half = { version = "2.1", default-features = false } +hashbrown = { version = "0.13", default-features = false } + +[dev-dependencies] +arrow-cast = { version = "29.0.0", path = "../arrow-cast" } +arrow-ord = { version = "29.0.0", path = "../arrow-ord" } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } + +[features] + diff --git a/arrow/src/row/dictionary.rs b/arrow-row/src/dictionary.rs similarity index 97% rename from arrow/src/row/dictionary.rs rename to arrow-row/src/dictionary.rs index 82169a37d35..0da6c68d168 100644 --- a/arrow/src/row/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -15,17 +15,16 @@ // specific language governing permissions and limitations // under the License. 
-use crate::compute::SortOptions; -use crate::row::fixed::{FixedLengthEncoding, FromSlice}; -use crate::row::interner::{Interned, OrderPreservingInterner}; -use crate::row::{null_sentinel, Rows}; +use crate::fixed::{FixedLengthEncoding, FromSlice}; +use crate::interner::{Interned, OrderPreservingInterner}; +use crate::{null_sentinel, Rows}; use arrow_array::builder::*; use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::{ArrowNativeType, MutableBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::{ArrowError, DataType, SortOptions}; use std::collections::hash_map::Entry; use std::collections::HashMap; diff --git a/arrow/src/row/fixed.rs b/arrow-row/src/fixed.rs similarity index 98% rename from arrow/src/row/fixed.rs rename to arrow-row/src/fixed.rs index 03c53c99479..159eba9adf1 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -16,14 +16,12 @@ // under the License. 
use crate::array::PrimitiveArray; -use crate::compute::SortOptions; -use crate::datatypes::ArrowPrimitiveType; -use crate::row::{null_sentinel, Rows}; +use crate::{null_sentinel, Rows}; use arrow_array::builder::BufferBuilder; -use arrow_array::{BooleanArray, FixedSizeBinaryArray}; +use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::DataType; +use arrow_schema::{DataType, SortOptions}; use half::f16; pub trait FromSlice { diff --git a/arrow/src/row/interner.rs b/arrow-row/src/interner.rs similarity index 100% rename from arrow/src/row/interner.rs rename to arrow-row/src/interner.rs diff --git a/arrow/src/row/mod.rs b/arrow-row/src/lib.rs similarity index 98% rename from arrow/src/row/mod.rs rename to arrow-row/src/lib.rs index bf58cf2f01e..cf23e6e5c3b 100644 --- a/arrow/src/row/mod.rs +++ b/arrow-row/src/lib.rs @@ -50,7 +50,7 @@ //! # Basic Example //! ``` //! # use std::sync::Arc; -//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_row::{RowConverter, SortField}; //! # use arrow_array::{ArrayRef, Int32Array, StringArray}; //! # use arrow_array::cast::{as_primitive_array, as_string_array}; //! # use arrow_array::types::Int32Type; @@ -102,7 +102,7 @@ //! The row format can also be used to implement a fast multi-column / lexicographic sort //! //! ``` -//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_row::{RowConverter, SortField}; //! # use arrow_array::{ArrayRef, UInt32Array}; //! fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { //! let fields = arrays @@ -117,11 +117,11 @@ //! } //! ``` //! -//! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] -//! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] -//! 
[normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] -//! [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] -//! [`lexsort`]: crate::compute::kernels::sort::lexsort +//! [non-comparison sorts]: https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts +//! [radix sort]: https://en.wikipedia.org/wiki/Radix_sort +//! [normalized for sorting]: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf +//! [`memcmp`]: https://www.man7.org/linux/man-pages/man3/memcmp.3.html +//! [`lexsort`]: https://docs.rs/arrow-ord/latest/arrow_ord/sort/fn.lexsort.html //! [compared]: PartialOrd //! [compare]: PartialOrd @@ -131,18 +131,16 @@ use std::sync::Arc; use arrow_array::cast::*; use arrow_array::*; +use arrow_buffer::ArrowNativeType; use arrow_data::ArrayDataBuilder; +use arrow_schema::*; -use crate::compute::SortOptions; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::row::dictionary::{ +use crate::dictionary::{ compute_dictionary_mapping, decode_dictionary, encode_dictionary, }; -use crate::row::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; -use crate::row::interner::OrderPreservingInterner; -use crate::row::variable::{decode_binary, decode_string}; -use crate::{downcast_dictionary_array, downcast_primitive_array}; +use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; +use crate::interner::OrderPreservingInterner; +use crate::variable::{decode_binary, decode_string}; mod dictionary; mod fixed; @@ -437,7 +435,7 @@ enum Codec { } impl Codec { - fn new(sort_field: &SortField) -> Result { + fn new(sort_field: &SortField) -> Result { match &sort_field.data_type { DataType::Dictionary(_, _) => Ok(Self::Dictionary(Default::default())), d if !d.is_nested() => Ok(Self::Stateless), @@ -485,7 +483,7 @@ impl Codec { } } - fn encoder(&mut self, array: &dyn Array) -> Result> { + fn encoder(&mut self, 
array: &dyn Array) -> Result, ArrowError> { match self { Codec::Stateless => Ok(Encoder::Stateless), Codec::Dictionary(interner) => { @@ -577,7 +575,7 @@ impl SortField { impl RowConverter { /// Create a new [`RowConverter`] with the provided schema - pub fn new(fields: Vec) -> Result { + pub fn new(fields: Vec) -> Result { if !Self::supports_fields(&fields) { return Err(ArrowError::NotYetImplemented(format!( "Row format support not yet implemented for: {:?}", @@ -585,7 +583,7 @@ impl RowConverter { ))); } - let codecs = fields.iter().map(Codec::new).collect::>()?; + let codecs = fields.iter().map(Codec::new).collect::>()?; Ok(Self { fields: fields.into(), codecs, @@ -617,7 +615,7 @@ impl RowConverter { /// # Panics /// /// Panics if the schema of `columns` does not match that provided to [`RowConverter::new`] - pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result { + pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result { if columns.len() != self.fields.len() { return Err(ArrowError::InvalidArgumentError(format!( "Incorrect number of arrays provided to RowConverter, expected {} got {}", @@ -640,7 +638,7 @@ impl RowConverter { } codec.encoder(column.as_ref()) }) - .collect::>>()?; + .collect::, _>>()?; let config = RowConfig { fields: Arc::clone(&self.fields), @@ -671,7 +669,7 @@ impl RowConverter { /// # Panics /// /// Panics if the rows were not produced by this [`RowConverter`] - pub fn convert_rows<'a, I>(&self, rows: I) -> Result> + pub fn convert_rows<'a, I>(&self, rows: I) -> Result, ArrowError> where I: IntoIterator>, { @@ -703,7 +701,7 @@ impl RowConverter { &self, rows: &mut [&[u8]], validate_utf8: bool, - ) -> Result> { + ) -> Result, ArrowError> { self.fields .iter() .zip(&self.codecs) @@ -1196,7 +1194,7 @@ unsafe fn decode_column( rows: &mut [&[u8]], codec: &Codec, validate_utf8: bool, -) -> Result { +) -> Result { let options = field.options; let array: ArrayRef = match codec { @@ -1255,24 +1253,18 @@ unsafe fn decode_column( 
mod tests { use std::sync::Arc; - use arrow_array::builder::{ - FixedSizeBinaryBuilder, GenericListBuilder, Int32Builder, - }; use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; use rand::{thread_rng, Rng}; - use arrow_array::NullArray; + use arrow_array::builder::*; + use arrow_array::types::*; + use arrow_array::*; + use arrow_buffer::i256; use arrow_buffer::Buffer; + use arrow_cast::display::array_value_to_string; use arrow_ord::sort::{LexicographicalComparator, SortColumn, SortOptions}; - use crate::array::{ - BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray, - Int16Array, Int32Array, OffsetSizeTrait, PrimitiveArray, - PrimitiveDictionaryBuilder, StringArray, - }; - use crate::util::display::array_value_to_string; - use super::*; #[test] diff --git a/arrow/src/row/list.rs b/arrow-row/src/list.rs similarity index 98% rename from arrow/src/row/list.rs rename to arrow-row/src/list.rs index e5ea5c2a04c..dcd247be1a7 100644 --- a/arrow/src/row/list.rs +++ b/arrow-row/src/list.rs @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::compute::SortOptions; -use crate::row::{RowConverter, Rows, SortField}; +use crate::{RowConverter, Rows, SortField}; use arrow_array::builder::BufferBuilder; use arrow_array::{Array, GenericListArray, OffsetSizeTrait}; use arrow_data::ArrayDataBuilder; -use arrow_schema::ArrowError; +use arrow_schema::{ArrowError, SortOptions}; use std::ops::Range; pub fn compute_lengths( diff --git a/arrow/src/row/variable.rs b/arrow-row/src/variable.rs similarity index 97% rename from arrow/src/row/variable.rs rename to arrow-row/src/variable.rs index 9162f231203..c927f76963a 100644 --- a/arrow/src/row/variable.rs +++ b/arrow-row/src/variable.rs @@ -15,14 +15,13 @@ // specific language governing permissions and limitations // under the License. 
-use crate::compute::SortOptions; -use crate::row::{null_sentinel, Rows}; -use crate::util::bit_util::ceil; +use crate::{null_sentinel, Rows}; use arrow_array::builder::BufferBuilder; -use arrow_array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait}; +use arrow_array::*; +use arrow_buffer::bit_util::ceil; use arrow_buffer::MutableBuffer; use arrow_data::ArrayDataBuilder; -use arrow_schema::DataType; +use arrow_schema::{DataType, SortOptions}; /// The block size of the variable length encoding pub const BLOCK_SIZE: usize = 32; diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index 34030f2d356..c2b1aba3b92 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -25,3 +25,22 @@ mod field; pub use field::*; mod schema; pub use schema::*; + +/// Options that define the sort order of a given column +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct SortOptions { + /// Whether to sort in descending order + pub descending: bool, + /// Whether to sort nulls first + pub nulls_first: bool, +} + +impl Default for SortOptions { + fn default() -> Self { + Self { + descending: false, + // default to nulls first to match spark's behavior + nulls_first: true, + } + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 0954909a099..772c1be7745 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -53,9 +53,11 @@ arrow-data = { version = "29.0.0", path = "../arrow-data" } arrow-ipc = { version = "29.0.0", path = "../arrow-ipc", optional = true } arrow-json = { version = "29.0.0", path = "../arrow-json", optional = true } arrow-ord = { version = "29.0.0", path = "../arrow-ord" } +arrow-row = { version = "29.0.0", path = "../arrow-row" } arrow-schema = { version = "29.0.0", path = "../arrow-schema" } arrow-select = { version = "29.0.0", path = "../arrow-select" } arrow-string = { version = "29.0.0", path = "../arrow-string" } + rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } 
num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index d57168dc9ea..8611acf52fe 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -328,7 +328,7 @@ pub mod pyarrow; pub mod record_batch { pub use arrow_array::{RecordBatch, RecordBatchOptions, RecordBatchReader}; } -pub mod row; pub use arrow_array::temporal_conversions; +pub use arrow_row as row; pub mod tensor; pub mod util; diff --git a/dev/release/README.md b/dev/release/README.md index 75849641d8b..a18d8a4992c 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -258,13 +258,12 @@ Rust Arrow Crates: (cd arrow-array && cargo publish) (cd arrow-select && cargo publish) (cd arrow-cast && cargo publish) -(cd arrow-string && cargo publish) -(cd arrow-ord && cargo publish) (cd arrow-ipc && cargo publish) (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) (cd arrow-ord && cargo publish) (cd arrow-string && cargo publish) +(cd arrow-row && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish)