Skip to content

Commit

Permalink
refactor(rust): Use row encoding in asof join
Browse files Browse the repository at this point in the history
  • Loading branch information
orlp committed Oct 7, 2024
1 parent 018dfd1 commit 359fc11
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 325 deletions.
19 changes: 0 additions & 19 deletions crates/polars-core/src/frame/column/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -677,25 +677,6 @@ impl Column {
.vec_hash_combine(build_hasher, hashes)
}

/// Checks the element at `idx_self` of this column against the element at
/// `idx_other` of `other` by delegating to the `equal_element` method of the
/// materialized series of both columns.
///
/// # Safety
///
/// Indexes need to be in bounds.
pub(crate) unsafe fn equal_element(
    &self,
    idx_self: usize,
    idx_other: usize,
    other: &Column,
) -> bool {
    // @scalar-opt
    let lhs = self.as_materialized_series();
    let rhs = other.as_materialized_series();
    // SAFETY: the caller guarantees that both indexes are in bounds.
    unsafe { lhs.equal_element(idx_self, idx_other, rhs) }
}

pub fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> {
// @scalar-opt
self.into_materialized_series()
Expand Down
25 changes: 0 additions & 25 deletions crates/polars-core/src/hashing/identity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,28 +33,3 @@ impl Hasher for IdHasher {
}

pub type IdBuildHasher = BuildHasherDefault<IdHasher>;

/// Contains an idx of a row in a DataFrame and the precomputed hash of that row.
///
/// That hash still needs to be used to create another hash to be able to resize hashmaps
/// without accidental quadratic behavior. So do not use an Identity function!
#[derive(Debug)]
pub struct IdxHash {
    /// Index of the row in the Series / DataFrame.
    pub idx: IdxSize,
    /// Precomputed hash of the row's value.
    pub hash: u64,
}

impl IdxHash {
    /// Pairs a row index with its precomputed hash.
    #[inline]
    pub(crate) fn new(idx: IdxSize, hash: u64) -> Self {
        Self { idx, hash }
    }
}

impl Hash for IdxHash {
    /// Writes only the precomputed `hash` field into `hasher`; `idx` is not hashed.
    fn hash<S>(&self, hasher: &mut S)
    where
        S: Hasher,
    {
        hasher.write_u64(self.hash);
    }
}
76 changes: 1 addition & 75 deletions crates/polars-core/src/hashing/mod.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
mod identity;
pub(crate) mod vector_hasher;

use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::hash::{BuildHasherDefault, Hash, Hasher};

use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
pub use identity::*;
pub use vector_hasher::*;

use crate::prelude::*;

// Hash combine function taken from C++'s Boost library.
#[inline]
pub fn _boost_hash_combine(l: u64, r: u64) -> u64 {
Expand All @@ -19,73 +15,3 @@ pub fn _boost_hash_combine(l: u64, r: u64) -> u64 {
// We must strike a balance between cache
// Overallocation seems a lot more expensive than resizing so we start reasonable small.
pub const _HASHMAP_INIT_SIZE: usize = 512;

/// Row-equality predicate used as the comparison function in the hashmap.
///
/// Equality over several keys is an AND operation, so its probability of success
/// declines rapidly with the number of keys. Rather than copying a full row from both
/// sides and comparing afterwards, the rows are compared column by column and the
/// check stops at the first column whose elements differ.
///
/// Returns `true` when every column of `keys` reports the elements at `idx_a` and
/// `idx_b` as equal (trivially `true` when there are no columns).
///
/// # Safety
/// Doesn't check any bounds
#[inline]
pub(crate) unsafe fn compare_df_rows(keys: &DataFrame, idx_a: usize, idx_b: usize) -> bool {
    keys.get_columns().iter().all(|column| {
        // SAFETY: the caller guarantees that `idx_a` and `idx_b` are in bounds.
        unsafe { column.equal_element(idx_a, idx_b, column) }
    })
}

/// Populate a multiple key hashmap with row indexes.
///
/// The table stores row indexes rather than the keys themselves (which could be very
/// large). The original DataFrame is passed by reference so that, when two entries
/// share a hash, the rows the indexes point to can be compared for equality.
///
/// * `hash_tbl` - table to update.
/// * `idx` - index of the row being inserted.
/// * `original_h` - precomputed hash of that row.
/// * `keys` - key columns; never inserted, only used for the equality check.
/// * `vacant_fn` - produces the value stored when no equal row is present yet.
/// * `occupied_fn` - receives a mutable reference to the value of an existing equal row.
pub fn populate_multiple_key_hashmap<V, H, F, G>(
    hash_tbl: &mut HashMap<IdxHash, V, H>,
    idx: IdxSize,
    original_h: u64,
    keys: &DataFrame,
    vacant_fn: G,
    mut occupied_fn: F,
) where
    G: Fn() -> V,
    F: FnMut(&mut V),
    H: BuildHasher,
{
    // Probe predicate: uses the stored idx to look up the row in the original
    // DataFrame and compare it with the row being inserted. This does not invalidate
    // the hashmap, as the equality function is not used during rehashing/resize (the
    // keys are then already known to be unique); it is only needed for insertion and
    // probing.
    let is_same_row = |candidate: &IdxHash| {
        // Compare the hash values first, before we incur a cache miss.
        candidate.hash == original_h && {
            let stored_idx = candidate.idx as usize;
            // SAFETY:
            // indices in a group_by operation are always in bounds.
            unsafe { compare_df_rows(keys, stored_idx, idx as usize) }
        }
    };

    match hash_tbl.raw_entry_mut().from_hash(original_h, is_same_row) {
        RawEntryMut::Occupied(mut slot) => {
            let (_, value) = slot.get_key_value_mut();
            occupied_fn(value);
        },
        RawEntryMut::Vacant(slot) => {
            let key = IdxHash::new(idx, original_h);
            slot.insert_hashed_nocheck(original_h, key, vacant_fn());
        },
    }
}
Loading

0 comments on commit 359fc11

Please sign in to comment.