From 7f867b4b9cf88ce002ee62121661a83462ceabb5 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 10 Dec 2024 17:56:39 +0000 Subject: [PATCH 1/3] wip --- Cargo.lock | 20 +++++++++++++++++++ Cargo.toml | 1 + vortex-dtype/Cargo.toml | 1 + vortex-dtype/src/dtype.rs | 19 +++++++++--------- vortex-dtype/src/extension.rs | 8 +++++--- vortex-dtype/src/field.rs | 5 +++-- vortex-dtype/src/serde/flatbuffers/project.rs | 8 +++----- vortex-dtype/src/serde/proto.rs | 2 +- 8 files changed, 44 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fcead9eef4..bf1ec0b4ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2367,6 +2367,25 @@ version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" +[[package]] +name = "inline-array" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b663811b69874e8b05e9257d21b3cf18ca8380684cad0be4339270d00fbd643b" +dependencies = [ + "serde", +] + +[[package]] +name = "inline-str" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c4bdffe182ba907c0d9908eff54b0cdd1c752a8f0e5b44b80347a26bc9a3aa3" +dependencies = [ + "inline-array", + "serde", +] + [[package]] name = "instant" version = "0.1.13" @@ -4831,6 +4850,7 @@ dependencies = [ "arrow-array", "flatbuffers", "half", + "inline-str", "itertools 0.13.0", "num-traits", "prost", diff --git a/Cargo.toml b/Cargo.toml index 74407ee34f..03fb5c54db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,6 +90,7 @@ hashbrown = "0.15.0" homedir = "0.3.3" humansize = "2.1.3" indicatif = "0.17.8" +inline-str = "0.3" itertools = "0.13.0" jiff = "0.1.8" libfuzzer-sys = "0.4" diff --git a/vortex-dtype/Cargo.toml b/vortex-dtype/Cargo.toml index 88791b10dc..e0433cb943 100644 --- a/vortex-dtype/Cargo.toml +++ b/vortex-dtype/Cargo.toml @@ -24,6 +24,7 @@ arrow-array = { workspace = true } flatbuffers = { workspace = true, optional = true } half = { workspace = true, features = ["num-traits"] } itertools = { workspace = true } +inline-str = { workspace = true, features = ["serde"] } num-traits = { workspace = true } prost = { workspace = true, optional = true } serde = { workspace = true, optional = true, features = ["rc", "derive"] } diff --git a/vortex-dtype/src/dtype.rs b/vortex-dtype/src/dtype.rs index db5a994c3c..7c673af1d4 100644 --- a/vortex-dtype/src/dtype.rs +++ b/vortex-dtype/src/dtype.rs @@ -2,6 +2,7 @@ use std::fmt::{Debug, Display, Formatter}; use std::hash::Hash; use std::sync::Arc; +use inline_str::InlineStr; use itertools::Itertools; use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexResult}; use DType::*; @@ -11,7 +12,7 @@ use crate::nullability::Nullability; use crate::{ExtDType, PType}; /// A name for a field in a struct -pub type FieldName = Arc; +pub type FieldName = InlineStr; /// An ordered list of field names in a struct pub type FieldNames = Arc<[FieldName]>; @@ -189,7 +190,7 @@ pub struct FieldInfo<'a> { /// The position index of the field within the enclosing struct pub index: usize, /// The name of the field - pub name: Arc, + pub name: FieldName, /// The dtype of the field pub dtype: &'a DType, } @@ -218,7 +219,7 @@ impl StructDType { /// Find the index of a field by name /// Returns `None` if the field is not found pub fn find_name(&self, name: &str) -> Option { - self.names.iter().position(|n| n.as_ref() == name) + self.names.iter().position(|n| n == &name) } /// Get information about the referenced field, either by name or index @@ -308,27 +309,27 @@ mod test { let sdt = dtype.as_struct().unwrap(); assert_eq!(sdt.names().len(), 2); assert_eq!(sdt.dtypes().len(), 2); - assert_eq!(sdt.names()[0], "A".into()); - assert_eq!(sdt.names()[1], "B".into()); + assert_eq!(sdt.names()[0], "A"); + assert_eq!(sdt.names()[1], "B"); assert_eq!(sdt.dtypes()[0], a_type); assert_eq!(sdt.dtypes()[1], b_type); let proj = sdt .project(&[Field::Index(1), Field::Name("A".into())]) .unwrap(); - assert_eq!(proj.names()[0], "B".into()); + assert_eq!(proj.names()[0], "B"); assert_eq!(proj.dtypes()[0], b_type); - assert_eq!(proj.names()[1], "A".into()); + assert_eq!(proj.names()[1], "A"); assert_eq!(proj.dtypes()[1], a_type); let field_info = sdt.field_info(&Field::Name("B".into())).unwrap(); assert_eq!(field_info.index, 1); - assert_eq!(field_info.name, "B".into()); + assert_eq!(field_info.name, "B"); assert_eq!(field_info.dtype, &b_type); let field_info = sdt.field_info(&Field::Index(0)).unwrap(); assert_eq!(field_info.index, 0); - assert_eq!(field_info.name, "A".into()); + assert_eq!(field_info.name, "A"); assert_eq!(field_info.dtype, &a_type); assert!(sdt.field_info(&Field::Index(2)).is_err()); diff --git a/vortex-dtype/src/extension.rs b/vortex-dtype/src/extension.rs index 066c3df0af..c0f9563c6e 100644 --- a/vortex-dtype/src/extension.rs +++ b/vortex-dtype/src/extension.rs @@ -1,17 +1,19 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; +use inline_str::InlineStr; + use crate::{DType, Nullability}; /// A unique identifier for an extension type #[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)] #[cfg_attr(feature = "serde", derive(::serde::Serialize, ::serde::Deserialize))] -pub struct ExtID(Arc); +pub struct ExtID(InlineStr); impl ExtID { /// Constructs a new `ExtID` from a string - pub fn new(value: Arc) -> Self { - Self(value) + pub fn new(value: impl AsRef) -> Self { + Self(InlineStr::from(value.as_ref())) } } diff --git a/vortex-dtype/src/field.rs b/vortex-dtype/src/field.rs index 7fb264282d..7191121f9f 100644 --- a/vortex-dtype/src/field.rs +++ b/vortex-dtype/src/field.rs @@ -5,16 +5,17 @@ use core::fmt; use std::fmt::{Display, Formatter}; -use std::sync::Arc; use itertools::Itertools; +use crate::FieldName; + /// A selector for a field in a struct #[derive(Clone, Debug, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum Field { /// A field selector by name - Name(Arc), + Name(FieldName), /// A field selector by index (position) Index(usize), } diff --git a/vortex-dtype/src/serde/flatbuffers/project.rs b/vortex-dtype/src/serde/flatbuffers/project.rs index 0f44205975..abed4c17fe 100644 --- a/vortex-dtype/src/serde/flatbuffers/project.rs +++ b/vortex-dtype/src/serde/flatbuffers/project.rs @@ -1,9 +1,7 @@ -use std::sync::Arc; - use vortex_error::{vortex_err, VortexResult}; use crate::field::Field; -use crate::{flatbuffers as fb, DType, StructDType}; +use crate::{flatbuffers as fb, DType, FieldName, StructDType}; /// Convert name references in projection list into index references. /// @@ -42,7 +40,7 @@ pub fn project_and_deserialize( .ok_or_else(|| vortex_err!("The top-level type should be a struct"))?; let nullability = fb_struct.nullable().into(); - let (names, dtypes): (Vec>, Vec) = projection + let (names, dtypes): (Vec, Vec) = projection .iter() .map(|f| resolve_field(fb_struct, f)) .map(|idx| idx.and_then(|i| read_field(fb_struct, i))) @@ -56,7 +54,7 @@ pub fn project_and_deserialize( )) } -fn read_field(fb_struct: fb::Struct_, idx: usize) -> VortexResult<(Arc, DType)> { +fn read_field(fb_struct: fb::Struct_, idx: usize) -> VortexResult<(FieldName, DType)> { let name = fb_struct .names() .ok_or_else(|| vortex_err!("Missing field names"))? diff --git a/vortex-dtype/src/serde/proto.rs b/vortex-dtype/src/serde/proto.rs index 83b8f13965..f1549fb77c 100644 --- a/vortex-dtype/src/serde/proto.rs +++ b/vortex-dtype/src/serde/proto.rs @@ -79,7 +79,7 @@ impl From<&DType> for pb::DType { nullable: (*n).into(), }), DType::Struct(s, n) => DtypeType::Struct(pb::Struct { - names: s.names().iter().map(|s| s.as_ref().to_string()).collect(), + names: s.names().iter().map(|s| s.to_string()).collect(), dtypes: s.dtypes().iter().map(Into::into).collect(), nullable: (*n).into(), }), From 9a6ff5ed850783e8091f3c2056b4205b6d83107f Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 10 Dec 2024 18:25:04 +0000 Subject: [PATCH 2/3] . --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- bench-vortex/src/clickbench.rs | 10 +++++----- bench-vortex/src/tpch/mod.rs | 10 +++++----- vortex-array/src/array/extension/mod.rs | 2 +- vortex-array/src/variants.rs | 2 +- vortex-dtype/src/arbitrary.rs | 5 ++++- vortex-expr/src/lib.rs | 2 +- vortex-expr/src/select.rs | 4 ++-- vortex-file/src/read/cache.rs | 10 +++++----- vortex-scalar/src/display.rs | 4 ++-- 11 files changed, 29 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bf1ec0b4ce..0d8eaea2d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2378,9 +2378,9 @@ dependencies = [ [[package]] name = "inline-str" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c4bdffe182ba907c0d9908eff54b0cdd1c752a8f0e5b44b80347a26bc9a3aa3" +checksum = "820bac9c34967d698aff083835f1de980b3e7e95d513d6ff7d780dfd97953a8b" dependencies = [ "inline-array", "serde", diff --git a/Cargo.toml b/Cargo.toml index 03fb5c54db..5759cd1571 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,7 +90,7 @@ hashbrown = "0.15.0" homedir = "0.3.3" humansize = "2.1.3" indicatif = "0.17.8" -inline-str = "0.3" +inline-str = "0.4" itertools = "0.13.0" jiff = "0.1.8" libfuzzer-sys = "0.4" diff --git a/bench-vortex/src/clickbench.rs b/bench-vortex/src/clickbench.rs index 268338e710..68285c01d5 100644 --- a/bench-vortex/src/clickbench.rs +++ b/bench-vortex/src/clickbench.rs @@ -10,7 +10,7 @@ use datafusion::prelude::{ParquetReadOptions, SessionContext}; use tokio::fs::{create_dir_all, OpenOptions}; use vortex::aliases::hash_map::HashMap; use vortex::array::{ChunkedArray, StructArray}; -use vortex::dtype::DType; +use vortex::dtype::{DType, FieldName}; use vortex::error::vortex_err; use vortex::file::{VortexFileWriter, VORTEX_FILE_EXTENSION}; use vortex::sampling_compressor::SamplingCompressor; @@ -169,8 +169,8 @@ pub async fn register_vortex_files( .map(|a| a.unwrap().into_struct().unwrap()) .collect::>(); - let mut arrays_map: HashMap, Vec> = HashMap::default(); - let mut types_map: HashMap, DType> = HashMap::default(); + let mut arrays_map: HashMap> = HashMap::default(); + let mut types_map: HashMap = HashMap::default(); for st in sts.into_iter() { let struct_dtype = st.dtype().as_struct().unwrap(); @@ -189,12 +189,12 @@ pub async fn register_vortex_files( .fields() .iter() .map(|field| { - let name: Arc = field.name().as_str().into(); + let name: FieldName = field.name().as_str().into(); let dtype = types_map[&name].clone(); let chunks = arrays_map.remove(&name).unwrap(); let chunked_child = ChunkedArray::try_new(chunks, dtype).unwrap(); - (name, chunked_child.into_array()) + (name.to_string(), chunked_child.into_array()) }) .collect::>(); diff --git a/bench-vortex/src/tpch/mod.rs b/bench-vortex/src/tpch/mod.rs index bc4136cc46..ca3ab34276 100644 --- a/bench-vortex/src/tpch/mod.rs +++ b/bench-vortex/src/tpch/mod.rs @@ -15,7 +15,7 @@ use tokio::fs::OpenOptions; use vortex::aliases::hash_map::HashMap; use vortex::array::{ChunkedArray, StructArray}; use vortex::arrow::FromArrowArray; -use vortex::dtype::DType; +use vortex::dtype::{DType, FieldName}; use vortex::file::{VortexFileWriter, VORTEX_FILE_EXTENSION}; use vortex::sampling_compressor::SamplingCompressor; use vortex::variants::StructArrayTrait; @@ -225,8 +225,8 @@ async fn register_vortex_file( .map(|a| a.unwrap().into_struct().unwrap()) .collect::>(); - let mut arrays_map: HashMap, Vec> = HashMap::default(); - let mut types_map: HashMap, DType> = HashMap::default(); + let mut arrays_map: HashMap> = HashMap::default(); + let mut types_map: HashMap = HashMap::default(); for st in sts.into_iter() { let struct_dtype = st.dtype().as_struct().unwrap(); @@ -245,7 +245,7 @@ async fn register_vortex_file( .fields() .iter() .map(|field| { - let name: Arc = field.name().as_str().into(); + let name = FieldName::from(field.name().as_str()); let dtype = types_map[&name].clone(); let chunks = arrays_map.remove(&name).unwrap(); let mut chunked_child = ChunkedArray::try_new(chunks, dtype).unwrap(); @@ -255,7 +255,7 @@ async fn register_vortex_file( .unwrap() } - (name, chunked_child.into_array()) + (name.to_string(), chunked_child.into_array()) }) .collect::>(); diff --git a/vortex-array/src/array/extension/mod.rs b/vortex-array/src/array/extension/mod.rs index 845d7b3d8e..5ad8b51985 100644 --- a/vortex-array/src/array/extension/mod.rs +++ b/vortex-array/src/array/extension/mod.rs @@ -125,7 +125,7 @@ mod tests { #[test] fn compute_statistics() { let ext_dtype = Arc::new(ExtDType::new( - ExtID::new("timestamp".into()), + ExtID::new("timestamp"), DType::from(PType::I64).into(), None, )); diff --git a/vortex-array/src/variants.rs b/vortex-array/src/variants.rs index 4e5e7ee6f2..f828f1c85a 100644 --- a/vortex-array/src/variants.rs +++ b/vortex-array/src/variants.rs @@ -237,7 +237,7 @@ pub trait StructArrayTrait: ArrayTrait { let field_idx = self .names() .iter() - .position(|field_name| field_name.as_ref() == name); + .position(|field_name| *field_name == name); field_idx.and_then(|field_idx| self.field(field_idx)) } diff --git a/vortex-dtype/src/arbitrary.rs b/vortex-dtype/src/arbitrary.rs index 161141dae6..be2e08488f 100644 --- a/vortex-dtype/src/arbitrary.rs +++ b/vortex-dtype/src/arbitrary.rs @@ -63,7 +63,10 @@ impl<'a> Arbitrary<'a> for StructDType { fn random_struct_dtype(u: &mut Unstructured<'_>, depth: u8) -> Result { let field_count = u.choose_index(3)?; let names: FieldNames = (0..field_count) - .map(|_| FieldName::arbitrary(u)) + .map(|_| { + let s = String::arbitrary(u)?; + Ok(FieldName::from(s)) + }) .collect::>>()?; let dtypes = (0..names.len()) .map(|_| random_dtype(u, depth)) diff --git a/vortex-expr/src/lib.rs b/vortex-expr/src/lib.rs index ed8f4f9511..f2756f8be1 100644 --- a/vortex-expr/src/lib.rs +++ b/vortex-expr/src/lib.rs @@ -198,7 +198,7 @@ mod tests { Literal::new_expr(Scalar::struct_( DType::Struct( StructDType::new( - Arc::from([Arc::from("dog"), Arc::from("cat")]), + Arc::from(["dog".into(), "cat".into()]), vec![ DType::Primitive(PType::U32, Nullability::NonNullable), DType::Utf8(Nullability::NonNullable) diff --git a/vortex-expr/src/select.rs b/vortex-expr/src/select.rs index bf043c4b70..f95e2d0ef6 100644 --- a/vortex-expr/src/select.rs +++ b/vortex-expr/src/select.rs @@ -108,7 +108,7 @@ mod tests { let select = Select::include(vec![Field::from("a")]); let selected = select.evaluate(st.as_ref()).unwrap(); let selected_names = selected.as_struct_array().unwrap().names().clone(); - assert_eq!(selected_names.as_ref(), &["a".into()]); + assert_eq!(selected_names.as_ref(), &["a"]); } #[test] @@ -117,6 +117,6 @@ mod tests { let select = Select::exclude(vec![Field::from("a")]); let selected = select.evaluate(st.as_ref()).unwrap(); let selected_names = selected.as_struct_array().unwrap().names().clone(); - assert_eq!(selected_names.as_ref(), &["b".into()]); + assert_eq!(selected_names.as_ref(), &["b"]); } } diff --git a/vortex-file/src/read/cache.rs b/vortex-file/src/read/cache.rs index 561e7b3185..3ae33657ba 100644 --- a/vortex-file/src/read/cache.rs +++ b/vortex-file/src/read/cache.rs @@ -224,16 +224,16 @@ fn field_names(bytes: &[u8], dtype_field: &SerializedDTypeField) -> VortexResult .ok_or_else(|| vortex_err!("Not a struct dtype"))?; match dtype_field { SerializedDTypeField::Projection(projection) => match projection { - Projection::All => Ok(names.iter().map(Arc::from).collect()), + Projection::All => Ok(names.iter().map(Into::into).collect()), Projection::Flat(fields) => fields .iter() .map(|f| resolve_field(struct_field, f)) - .map(|idx| idx.map(|i| Arc::from(names.get(i)))) + .map(|idx| idx.map(|i| names.get(i).into())) .collect(), }, - SerializedDTypeField::Field(f) => Ok(Arc::new([Arc::from( - names.get(resolve_field(struct_field, f)?), - )])), + SerializedDTypeField::Field(f) => Ok(Arc::new([names + .get(resolve_field(struct_field, f)?) + .into()])), } } diff --git a/vortex-scalar/src/display.rs b/vortex-scalar/src/display.rs index 0e8b3bf37a..2f3a46b162 100644 --- a/vortex-scalar/src/display.rs +++ b/vortex-scalar/src/display.rs @@ -179,7 +179,7 @@ mod tests { fn dtype() -> DType { DType::Struct( StructDType::new( - Arc::new([Arc::from("foo")]), + Arc::new(["foo".into()]), vec![DType::Primitive(PType::U32, Nullable)], ), Nullable, @@ -207,7 +207,7 @@ mod tests { fn dtype() -> DType { DType::Struct( StructDType::new( - Arc::new([Arc::from("foo"), Arc::from("bar")]), + Arc::new(["foo".into(), "bar".into()]), vec![ DType::Bool(Nullable), DType::Primitive(PType::U32, Nullable), From cb639090b84be800e45a00351c4cabefb239b5d2 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 11 Dec 2024 11:19:11 +0000 Subject: [PATCH 3/3] Fix tests --- vortex-dtype/src/extension.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-dtype/src/extension.rs b/vortex-dtype/src/extension.rs index c0f9563c6e..c4f7c11d09 100644 --- a/vortex-dtype/src/extension.rs +++ b/vortex-dtype/src/extension.rs @@ -93,7 +93,7 @@ impl ExtDType { /// // Make a new extension type that encodes the unit for a set of nullable `f64`. /// pub fn create_temperature_type(unit: TemperatureUnit) -> ExtDType { /// ExtDType::new( - /// ExtID::new("vortex.temperature".into()), + /// ExtID::new("vortex.temperature"), /// Arc::new(DType::Primitive(PType::F64, Nullability::Nullable)), /// Some(ExtMetadata::new([unit as u8].into())) /// )