Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Even smaller field names! #1639

Draft
wants to merge 3 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ hashbrown = "0.15.0"
homedir = "0.3.3"
humansize = "2.1.3"
indicatif = "0.17.8"
inline-str = "0.4"
itertools = "0.13.0"
jiff = "0.1.8"
libfuzzer-sys = "0.4"
Expand Down
10 changes: 5 additions & 5 deletions bench-vortex/src/clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use datafusion::prelude::{ParquetReadOptions, SessionContext};
use tokio::fs::{create_dir_all, OpenOptions};
use vortex::aliases::hash_map::HashMap;
use vortex::array::{ChunkedArray, StructArray};
use vortex::dtype::DType;
use vortex::dtype::{DType, FieldName};
use vortex::error::vortex_err;
use vortex::file::{VortexFileWriter, VORTEX_FILE_EXTENSION};
use vortex::sampling_compressor::SamplingCompressor;
Expand Down Expand Up @@ -169,8 +169,8 @@ pub async fn register_vortex_files(
.map(|a| a.unwrap().into_struct().unwrap())
.collect::<Vec<_>>();

let mut arrays_map: HashMap<Arc<str>, Vec<ArrayData>> = HashMap::default();
let mut types_map: HashMap<Arc<str>, DType> = HashMap::default();
let mut arrays_map: HashMap<FieldName, Vec<ArrayData>> = HashMap::default();
let mut types_map: HashMap<FieldName, DType> = HashMap::default();

for st in sts.into_iter() {
let struct_dtype = st.dtype().as_struct().unwrap();
Expand All @@ -189,12 +189,12 @@ pub async fn register_vortex_files(
.fields()
.iter()
.map(|field| {
let name: Arc<str> = field.name().as_str().into();
let name: FieldName = field.name().as_str().into();
let dtype = types_map[&name].clone();
let chunks = arrays_map.remove(&name).unwrap();
let chunked_child = ChunkedArray::try_new(chunks, dtype).unwrap();

(name, chunked_child.into_array())
(name.to_string(), chunked_child.into_array())
})
.collect::<Vec<_>>();

Expand Down
10 changes: 5 additions & 5 deletions bench-vortex/src/tpch/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use tokio::fs::OpenOptions;
use vortex::aliases::hash_map::HashMap;
use vortex::array::{ChunkedArray, StructArray};
use vortex::arrow::FromArrowArray;
use vortex::dtype::DType;
use vortex::dtype::{DType, FieldName};
use vortex::file::{VortexFileWriter, VORTEX_FILE_EXTENSION};
use vortex::sampling_compressor::SamplingCompressor;
use vortex::variants::StructArrayTrait;
Expand Down Expand Up @@ -225,8 +225,8 @@ async fn register_vortex_file(
.map(|a| a.unwrap().into_struct().unwrap())
.collect::<Vec<_>>();

let mut arrays_map: HashMap<Arc<str>, Vec<ArrayData>> = HashMap::default();
let mut types_map: HashMap<Arc<str>, DType> = HashMap::default();
let mut arrays_map: HashMap<FieldName, Vec<ArrayData>> = HashMap::default();
let mut types_map: HashMap<FieldName, DType> = HashMap::default();

for st in sts.into_iter() {
let struct_dtype = st.dtype().as_struct().unwrap();
Expand All @@ -245,7 +245,7 @@ async fn register_vortex_file(
.fields()
.iter()
.map(|field| {
let name: Arc<str> = field.name().as_str().into();
let name = FieldName::from(field.name().as_str());
let dtype = types_map[&name].clone();
let chunks = arrays_map.remove(&name).unwrap();
let mut chunked_child = ChunkedArray::try_new(chunks, dtype).unwrap();
Expand All @@ -255,7 +255,7 @@ async fn register_vortex_file(
.unwrap()
}

(name, chunked_child.into_array())
(name.to_string(), chunked_child.into_array())
})
.collect::<Vec<_>>();

Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/extension/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ mod tests {
#[test]
fn compute_statistics() {
let ext_dtype = Arc::new(ExtDType::new(
ExtID::new("timestamp".into()),
ExtID::new("timestamp"),
DType::from(PType::I64).into(),
None,
));
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/variants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ pub trait StructArrayTrait: ArrayTrait {
let field_idx = self
.names()
.iter()
.position(|field_name| field_name.as_ref() == name);
.position(|field_name| *field_name == name);

field_idx.and_then(|field_idx| self.field(field_idx))
}
Expand Down
1 change: 1 addition & 0 deletions vortex-dtype/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ arrow-array = { workspace = true }
flatbuffers = { workspace = true, optional = true }
half = { workspace = true, features = ["num-traits"] }
itertools = { workspace = true }
inline-str = { workspace = true, features = ["serde"] }
num-traits = { workspace = true }
prost = { workspace = true, optional = true }
serde = { workspace = true, optional = true, features = ["rc", "derive"] }
Expand Down
5 changes: 4 additions & 1 deletion vortex-dtype/src/arbitrary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ impl<'a> Arbitrary<'a> for StructDType {
fn random_struct_dtype(u: &mut Unstructured<'_>, depth: u8) -> Result<StructDType> {
let field_count = u.choose_index(3)?;
let names: FieldNames = (0..field_count)
.map(|_| FieldName::arbitrary(u))
.map(|_| {
let s = String::arbitrary(u)?;
Ok(FieldName::from(s))
})
.collect::<Result<Arc<_>>>()?;
let dtypes = (0..names.len())
.map(|_| random_dtype(u, depth))
Expand Down
19 changes: 10 additions & 9 deletions vortex-dtype/src/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::fmt::{Debug, Display, Formatter};
use std::hash::Hash;
use std::sync::Arc;

use inline_str::InlineStr;
use itertools::Itertools;
use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexResult};
use DType::*;
Expand All @@ -11,7 +12,7 @@ use crate::nullability::Nullability;
use crate::{ExtDType, PType};

/// A name for a field in a struct
pub type FieldName = Arc<str>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So Arc<str> has 24 bytes overhead (usize ptr, usize len, usize refcount), if I'm following inline-array docs then InlineStr should have what, 2 bytes of overhead in the worst cast?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it can be worse if we clone a ton, but realistically yes.

pub type FieldName = InlineStr;
/// An ordered list of field names in a struct
pub type FieldNames = Arc<[FieldName]>;

Expand Down Expand Up @@ -189,7 +190,7 @@ pub struct FieldInfo<'a> {
/// The position index of the field within the enclosing struct
pub index: usize,
/// The name of the field
pub name: Arc<str>,
pub name: FieldName,
/// The dtype of the field
pub dtype: &'a DType,
}
Expand Down Expand Up @@ -218,7 +219,7 @@ impl StructDType {
/// Find the index of a field by name
/// Returns `None` if the field is not found
pub fn find_name(&self, name: &str) -> Option<usize> {
self.names.iter().position(|n| n.as_ref() == name)
self.names.iter().position(|n| n == &name)
}

/// Get information about the referenced field, either by name or index
Expand Down Expand Up @@ -308,27 +309,27 @@ mod test {
let sdt = dtype.as_struct().unwrap();
assert_eq!(sdt.names().len(), 2);
assert_eq!(sdt.dtypes().len(), 2);
assert_eq!(sdt.names()[0], "A".into());
assert_eq!(sdt.names()[1], "B".into());
assert_eq!(sdt.names()[0], "A");
assert_eq!(sdt.names()[1], "B");
assert_eq!(sdt.dtypes()[0], a_type);
assert_eq!(sdt.dtypes()[1], b_type);

let proj = sdt
.project(&[Field::Index(1), Field::Name("A".into())])
.unwrap();
assert_eq!(proj.names()[0], "B".into());
assert_eq!(proj.names()[0], "B");
assert_eq!(proj.dtypes()[0], b_type);
assert_eq!(proj.names()[1], "A".into());
assert_eq!(proj.names()[1], "A");
assert_eq!(proj.dtypes()[1], a_type);

let field_info = sdt.field_info(&Field::Name("B".into())).unwrap();
assert_eq!(field_info.index, 1);
assert_eq!(field_info.name, "B".into());
assert_eq!(field_info.name, "B");
assert_eq!(field_info.dtype, &b_type);

let field_info = sdt.field_info(&Field::Index(0)).unwrap();
assert_eq!(field_info.index, 0);
assert_eq!(field_info.name, "A".into());
assert_eq!(field_info.name, "A");
assert_eq!(field_info.dtype, &a_type);

assert!(sdt.field_info(&Field::Index(2)).is_err());
Expand Down
10 changes: 6 additions & 4 deletions vortex-dtype/src/extension.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
use std::fmt::{Display, Formatter};
use std::sync::Arc;

use inline_str::InlineStr;

use crate::{DType, Nullability};

/// A unique identifier for an extension type
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)]
#[cfg_attr(feature = "serde", derive(::serde::Serialize, ::serde::Deserialize))]
pub struct ExtID(Arc<str>);
pub struct ExtID(InlineStr);

impl ExtID {
/// Constructs a new `ExtID` from a string
pub fn new(value: Arc<str>) -> Self {
Self(value)
pub fn new(value: impl AsRef<str>) -> Self {
Self(InlineStr::from(value.as_ref()))
}
}

Expand Down Expand Up @@ -91,7 +93,7 @@ impl ExtDType {
/// // Make a new extension type that encodes the unit for a set of nullable `f64`.
/// pub fn create_temperature_type(unit: TemperatureUnit) -> ExtDType {
/// ExtDType::new(
/// ExtID::new("vortex.temperature".into()),
/// ExtID::new("vortex.temperature"),
/// Arc::new(DType::Primitive(PType::F64, Nullability::Nullable)),
/// Some(ExtMetadata::new([unit as u8].into()))
/// )
Expand Down
5 changes: 3 additions & 2 deletions vortex-dtype/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@

use core::fmt;
use std::fmt::{Display, Formatter};
use std::sync::Arc;

use itertools::Itertools;

use crate::FieldName;

/// A selector for a field in a struct
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum Field {
/// A field selector by name
Name(Arc<str>),
Name(FieldName),
/// A field selector by index (position)
Index(usize),
}
Expand Down
8 changes: 3 additions & 5 deletions vortex-dtype/src/serde/flatbuffers/project.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
use std::sync::Arc;

use vortex_error::{vortex_err, VortexResult};

use crate::field::Field;
use crate::{flatbuffers as fb, DType, StructDType};
use crate::{flatbuffers as fb, DType, FieldName, StructDType};

/// Convert name references in projection list into index references.
///
Expand Down Expand Up @@ -42,7 +40,7 @@ pub fn project_and_deserialize(
.ok_or_else(|| vortex_err!("The top-level type should be a struct"))?;
let nullability = fb_struct.nullable().into();

let (names, dtypes): (Vec<Arc<str>>, Vec<DType>) = projection
let (names, dtypes): (Vec<FieldName>, Vec<DType>) = projection
.iter()
.map(|f| resolve_field(fb_struct, f))
.map(|idx| idx.and_then(|i| read_field(fb_struct, i)))
Expand All @@ -56,7 +54,7 @@ pub fn project_and_deserialize(
))
}

fn read_field(fb_struct: fb::Struct_, idx: usize) -> VortexResult<(Arc<str>, DType)> {
fn read_field(fb_struct: fb::Struct_, idx: usize) -> VortexResult<(FieldName, DType)> {
let name = fb_struct
.names()
.ok_or_else(|| vortex_err!("Missing field names"))?
Expand Down
2 changes: 1 addition & 1 deletion vortex-dtype/src/serde/proto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl From<&DType> for pb::DType {
nullable: (*n).into(),
}),
DType::Struct(s, n) => DtypeType::Struct(pb::Struct {
names: s.names().iter().map(|s| s.as_ref().to_string()).collect(),
names: s.names().iter().map(|s| s.to_string()).collect(),
dtypes: s.dtypes().iter().map(Into::into).collect(),
nullable: (*n).into(),
}),
Expand Down
2 changes: 1 addition & 1 deletion vortex-expr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ mod tests {
Literal::new_expr(Scalar::struct_(
DType::Struct(
StructDType::new(
Arc::from([Arc::from("dog"), Arc::from("cat")]),
Arc::from(["dog".into(), "cat".into()]),
vec![
DType::Primitive(PType::U32, Nullability::NonNullable),
DType::Utf8(Nullability::NonNullable)
Expand Down
4 changes: 2 additions & 2 deletions vortex-expr/src/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ mod tests {
let select = Select::include(vec![Field::from("a")]);
let selected = select.evaluate(st.as_ref()).unwrap();
let selected_names = selected.as_struct_array().unwrap().names().clone();
assert_eq!(selected_names.as_ref(), &["a".into()]);
assert_eq!(selected_names.as_ref(), &["a"]);
}

#[test]
Expand All @@ -117,6 +117,6 @@ mod tests {
let select = Select::exclude(vec![Field::from("a")]);
let selected = select.evaluate(st.as_ref()).unwrap();
let selected_names = selected.as_struct_array().unwrap().names().clone();
assert_eq!(selected_names.as_ref(), &["b".into()]);
assert_eq!(selected_names.as_ref(), &["b"]);
}
}
10 changes: 5 additions & 5 deletions vortex-file/src/read/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,16 +224,16 @@ fn field_names(bytes: &[u8], dtype_field: &SerializedDTypeField) -> VortexResult
.ok_or_else(|| vortex_err!("Not a struct dtype"))?;
match dtype_field {
SerializedDTypeField::Projection(projection) => match projection {
Projection::All => Ok(names.iter().map(Arc::from).collect()),
Projection::All => Ok(names.iter().map(Into::into).collect()),
Projection::Flat(fields) => fields
.iter()
.map(|f| resolve_field(struct_field, f))
.map(|idx| idx.map(|i| Arc::from(names.get(i))))
.map(|idx| idx.map(|i| names.get(i).into()))
.collect(),
},
SerializedDTypeField::Field(f) => Ok(Arc::new([Arc::from(
names.get(resolve_field(struct_field, f)?),
)])),
SerializedDTypeField::Field(f) => Ok(Arc::new([names
.get(resolve_field(struct_field, f)?)
.into()])),
}
}

Expand Down
4 changes: 2 additions & 2 deletions vortex-scalar/src/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ mod tests {
fn dtype() -> DType {
DType::Struct(
StructDType::new(
Arc::new([Arc::from("foo")]),
Arc::new(["foo".into()]),
vec![DType::Primitive(PType::U32, Nullable)],
),
Nullable,
Expand Down Expand Up @@ -207,7 +207,7 @@ mod tests {
fn dtype() -> DType {
DType::Struct(
StructDType::new(
Arc::new([Arc::from("foo"), Arc::from("bar")]),
Arc::new(["foo".into(), "bar".into()]),
vec![
DType::Bool(Nullable),
DType::Primitive(PType::U32, Nullable),
Expand Down
Loading