Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proposal for non-static encodings #2066

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions bench-vortex/benches/datafusion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ use vortex_datafusion::memory::VortexMemTable;

pub static CTX: LazyLock<Context> = LazyLock::new(|| {
Context::default().with_encodings([
&BitPackedEncoding as EncodingRef,
&DictEncoding,
&FoREncoding,
&DeltaEncoding,
Arc::new(BitPackedEncoding) as EncodingRef,
Arc::new(DictEncoding),
Arc::new(FoREncoding),
Arc::new(DeltaEncoding),
])
});

Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pub static CTX: LazyLock<ContextRef> = LazyLock::new(|| {
Arc::new(
Context::default()
.with_encodings(SamplingCompressor::default().used_encodings())
.with_encoding(&DeltaEncoding),
.with_encoding(Arc::new(DeltaEncoding)),
)
});

Expand Down
16 changes: 9 additions & 7 deletions fuzz/fuzz_targets/array_ops.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#![no_main]

use std::sync::Arc;

use libfuzzer_sys::{fuzz_target, Corpus};
use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::{
Expand Down Expand Up @@ -44,14 +46,14 @@ fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus {
// TODO(robert): Ideally we'd preserve the encoding perfectly but this is close enough
let mut sorted = sort_canonical_array(&current_array);
if !HashSet::from([
&PrimitiveEncoding as EncodingRef,
&VarBinEncoding,
&VarBinViewEncoding,
&BoolEncoding,
&StructEncoding,
&ListEncoding,
Arc::new(PrimitiveEncoding) as EncodingRef,
Arc::new(VarBinEncoding),
Arc::new(VarBinViewEncoding),
Arc::new(BoolEncoding),
Arc::new(StructEncoding),
Arc::new(ListEncoding),
])
.contains(&current_array.encoding())
.contains(current_array.encoding())
{
sorted =
fuzz_compress(&sorted, &SamplingCompressor::default()).unwrap_or(sorted);
Expand Down
2 changes: 1 addition & 1 deletion fuzz/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ fn random_value_from_list(u: &mut Unstructured<'_>, vec: &[usize]) -> Result<usi

const ALL_ACTIONS: RangeInclusive<usize> = 0..=4;

fn actions_for_encoding(encoding: EncodingRef) -> HashSet<usize> {
fn actions_for_encoding(encoding: &EncodingRef) -> HashSet<usize> {
if ListEncoding::ID == encoding.id() {
// compress, slice
vec![0, 1].into_iter().collect()
Expand Down
8 changes: 5 additions & 3 deletions vortex-array/src/canonical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -523,18 +523,20 @@ impl IntoCanonical for ArrayData {
if !self.is_canonical() && self.len() > 1 {
log::trace!("Canonicalizing array with encoding {:?}", self.encoding());
}
self.encoding().into_canonical(self)
self.encoding().clone().into_canonical(self)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can save on a bunch of these clones by changing into_canonical/into_arrow to take self: Arc<Self>

}

fn into_arrow(self) -> VortexResult<ArrayRef>
where
Self: Sized,
{
self.encoding().into_arrow(self)
self.encoding().clone().into_arrow(self)
}

fn into_arrow_with_data_type(self, data_type: &DataType) -> VortexResult<ArrayRef> {
self.encoding().into_arrow_with_data_type(self, data_type)
self.encoding()
.clone()
.into_arrow_with_data_type(self, data_type)
}
}

Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ where
.as_any()
.downcast_ref::<E>()
.ok_or_else(|| vortex_err!("Mismatched encoding"))?;
let array = <E::Array as TryFrom<ArrayData>>::try_from(array)?;
let array = <E::Array as TryFrom<ArrayData>>::try_from(array.clone())?;
LikeFn::like(encoding, array, pattern, options)
}
}
Expand Down
26 changes: 13 additions & 13 deletions vortex-array/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::array::{
PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
};
use crate::encoding::opaque::OpaqueEncoding;
use crate::encoding::EncodingRef;
use crate::encoding::{Encoding, EncodingRef};

/// A mapping between an encoding's ID to an [`EncodingRef`], used to have a shared view of all available encoding schemes.
#[derive(Debug, Clone)]
Expand Down Expand Up @@ -43,7 +43,7 @@ impl Context {
// OpaqueEncoding however must be created dynamically, since we do not know ahead
// of time which of the ~65,000 unknown code IDs we will end up seeing. Thus, we
// allocate (and leak) 2 bytes of memory to create a new encoding.
Box::leak(Box::new(OpaqueEncoding(encoding_id)))
Arc::new(OpaqueEncoding(encoding_id))
})
}
}
Expand All @@ -52,17 +52,17 @@ impl Default for Context {
fn default() -> Self {
Self {
encodings: [
&NullEncoding as EncodingRef,
&BoolEncoding,
&PrimitiveEncoding,
&StructEncoding,
&ListEncoding,
&VarBinEncoding,
&VarBinViewEncoding,
&ExtensionEncoding,
&SparseEncoding,
&ConstantEncoding,
&ChunkedEncoding,
Arc::new(NullEncoding) as EncodingRef,
Arc::new(BoolEncoding),
Arc::new(PrimitiveEncoding),
Arc::new(StructEncoding),
Arc::new(ListEncoding),
Arc::new(VarBinEncoding),
Arc::new(VarBinViewEncoding),
Arc::new(ExtensionEncoding),
Arc::new(SparseEncoding),
Arc::new(ConstantEncoding),
Arc::new(ChunkedEncoding),
]
.into_iter()
.map(|e| (e.id().code(), e))
Expand Down
18 changes: 13 additions & 5 deletions vortex-array/src/data/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::fmt::{Display, Formatter};
use std::ops::Deref;
use std::sync::{Arc, RwLock};

use itertools::Itertools;
Expand Down Expand Up @@ -140,10 +141,10 @@ impl ArrayData {
}

/// Return the array's encoding
pub fn encoding(&self) -> EncodingRef {
pub fn encoding(&self) -> &EncodingRef {
match &self.0 {
InnerArrayData::Owned(d) => d.encoding,
InnerArrayData::Viewed(v) => v.encoding,
InnerArrayData::Owned(d) => &d.encoding,
InnerArrayData::Viewed(v) => &v.encoding,
}
}

Expand Down Expand Up @@ -401,12 +402,19 @@ impl<T: AsRef<ArrayData>> ArrayLen for T {
impl<A: AsRef<ArrayData>> ArrayValidity for A {
/// Return whether the element at the given index is valid (true) or null (false).
fn is_valid(&self, index: usize) -> bool {
ValidityVTable::<ArrayData>::is_valid(self.as_ref().encoding(), self.as_ref(), index)
ValidityVTable::<ArrayData>::is_valid(
self.as_ref().encoding().deref(),
self.as_ref(),
index,
)
}

/// Return the logical validity of the array.
fn logical_validity(&self) -> LogicalValidity {
ValidityVTable::<ArrayData>::logical_validity(self.as_ref().encoding(), self.as_ref())
ValidityVTable::<ArrayData>::logical_validity(
self.as_ref().encoding().deref(),
self.as_ref(),
)
}
}

Expand Down
7 changes: 4 additions & 3 deletions vortex-array/src/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use std::any::Any;
use std::fmt::{Debug, Display, Formatter};
use std::hash::{Hash, Hasher};
use std::sync::{Arc, LazyLock};

use crate::compute::ComputeVTable;
use crate::stats::StatisticsVTable;
Expand Down Expand Up @@ -63,14 +64,14 @@ impl AsRef<str> for EncodingId {
}

/// Marker trait for array encodings with their associated Array type.
pub trait Encoding: 'static {
pub trait Encoding: 'static + Send + Sync {
const ID: EncodingId;

type Array;
type Metadata: SerializeMetadata + DeserializeMetadata;
}

pub type EncodingRef = &'static dyn EncodingVTable;
pub type EncodingRef = Arc<dyn EncodingVTable>;

/// Object-safe encoding trait for an array.
pub trait EncodingVTable:
Expand Down Expand Up @@ -105,7 +106,7 @@ impl Hash for dyn EncodingVTable + '_ {
}

pub trait ArrayEncodingRef {
fn encoding(&self) -> EncodingRef;
fn encoding(&self) -> &EncodingRef;
}

#[doc = "Encoding ID constants for all Vortex-provided encodings"]
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/encoding/opaque.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::any::Any;
use std::fmt::{Debug, Display, Formatter};
use std::sync::Arc;

use arrow_array::ArrayRef;
use vortex_error::{vortex_bail, vortex_panic, VortexResult};
Expand Down
10 changes: 7 additions & 3 deletions vortex-array/src/macros.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
//! The core Vortex macro to create new encodings and array types.

use std::fmt::{Display, Formatter};
use std::sync::{Arc, LazyLock};

use crate::array::StructMetadata;
use crate::array::{PrimitiveEncoding, StructMetadata};
use crate::encoding::{ArrayEncodingRef, Encoding, EncodingRef};
use crate::{ArrayData, ArrayMetadata, ToArrayData};

Expand Down Expand Up @@ -49,7 +50,7 @@ macro_rules! impl_encoding {
use $crate::SerializeMetadata;

Self::try_from($crate::ArrayData::try_new_owned(
&[<$Name Encoding>],
_REF.clone(),
dtype,
len,
metadata.serialize()?,
Expand Down Expand Up @@ -114,8 +115,11 @@ macro_rules! impl_encoding {
#[derive(std::fmt::Debug)]
pub struct [<$Name Encoding>];

static _REF: std::sync::LazyLock<$crate::encoding::EncodingRef> = std::sync::LazyLock::new(|| std::sync::Arc::new([<$Name Encoding>]));

impl $crate::encoding::Encoding for [<$Name Encoding>] {
const ID: $crate::encoding::EncodingId = $crate::encoding::EncodingId::new($id, $code);

type Array = [<$Name Array>];
type Metadata = $Metadata;
}
Expand All @@ -135,7 +139,7 @@ macro_rules! impl_encoding {
}

impl<T: AsRef<ArrayData>> ArrayEncodingRef for T {
fn encoding(&self) -> EncodingRef {
fn encoding(&self) -> &EncodingRef {
self.as_ref().encoding()
}
}
Expand Down
8 changes: 5 additions & 3 deletions vortex-sampling-compressor/src/compressors/alp.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use vortex_alp::{
alp_encode_components, match_each_alp_float_ptype, ALPArray, ALPEncoding, ALPRDEncoding,
};
Expand Down Expand Up @@ -76,10 +78,10 @@ impl EncodingCompressor for ALPCompressor {

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([
&ALPEncoding as EncodingRef,
Arc::new(ALPEncoding) as EncodingRef,
// ALP-RD + BitPacking possibly used for patches
&ALPRDEncoding,
&BitPackedEncoding,
Arc::new(ALPRDEncoding),
Arc::new(BitPackedEncoding),
])
}
}
5 changes: 4 additions & 1 deletion vortex-sampling-compressor/src/compressors/alp_rd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,10 @@ impl EncodingCompressor for ALPRDCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&ALPRDEncoding as EncodingRef, &BitPackedEncoding])
HashSet::from([
Arc::new(ALPRDEncoding) as EncodingRef,
Arc::new(BitPackedEncoding),
])
}
}

Expand Down
5 changes: 4 additions & 1 deletion vortex-sampling-compressor/src/compressors/bitpacked.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#![allow(clippy::cast_possible_truncation)]

use std::sync::Arc;

use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::PrimitiveArray;
use vortex_array::encoding::EncodingRef;
Expand Down Expand Up @@ -156,7 +159,7 @@ impl EncodingCompressor for BitPackedCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&BitPackedEncoding as EncodingRef])
HashSet::from([Arc::new(BitPackedEncoding) as EncodingRef])
}
}

Expand Down
4 changes: 3 additions & 1 deletion vortex-sampling-compressor/src/compressors/constant.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::{ConstantArray, ConstantEncoding};
use vortex_array::encoding::{Encoding, EncodingRef};
Expand Down Expand Up @@ -47,6 +49,6 @@ impl EncodingCompressor for ConstantCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&ConstantEncoding as EncodingRef])
HashSet::from([Arc::new(ConstantEncoding) as EncodingRef])
}
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::TemporalArray;
use vortex_array::encoding::{Encoding, EncodingRef};
Expand Down Expand Up @@ -77,6 +79,6 @@ impl EncodingCompressor for DateTimePartsCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&DateTimePartsEncoding as EncodingRef])
HashSet::from([Arc::new(DateTimePartsEncoding) as EncodingRef])
}
}
4 changes: 3 additions & 1 deletion vortex-sampling-compressor/src/compressors/delta.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::PrimitiveArray;
use vortex_array::encoding::{Encoding, EncodingRef};
Expand Down Expand Up @@ -62,6 +64,6 @@ impl EncodingCompressor for DeltaCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&DeltaEncoding as EncodingRef])
HashSet::from([Arc::new(DeltaEncoding) as EncodingRef])
}
}
4 changes: 3 additions & 1 deletion vortex-sampling-compressor/src/compressors/dict.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::{PrimitiveEncoding, VarBinEncoding, VarBinViewEncoding};
use vortex_array::encoding::{Encoding, EncodingRef};
Expand Down Expand Up @@ -71,6 +73,6 @@ impl EncodingCompressor for DictCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&DictEncoding as EncodingRef])
HashSet::from([Arc::new(DictEncoding) as EncodingRef])
}
}
4 changes: 3 additions & 1 deletion vortex-sampling-compressor/src/compressors/for.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::PrimitiveArray;
use vortex_array::encoding::{Encoding, EncodingRef};
Expand Down Expand Up @@ -75,6 +77,6 @@ impl EncodingCompressor for FoRCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&FoREncoding as EncodingRef])
HashSet::from([Arc::new(FoREncoding) as EncodingRef])
}
}
2 changes: 1 addition & 1 deletion vortex-sampling-compressor/src/compressors/fsst.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,6 @@ impl EncodingCompressor for FSSTCompressor {
}

fn used_encodings(&self) -> HashSet<EncodingRef> {
HashSet::from([&FSSTEncoding as EncodingRef])
HashSet::from([Arc::new(FSSTEncoding) as EncodingRef])
}
}
Loading
Loading