From c9f80d10e3041423ef30fe9595518be5efbf9cf8 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 14 Nov 2024 08:07:04 -0500 Subject: [PATCH] switch to ahash for perf (#543) --- Cargo.lock | 12 ++---------- Cargo.toml | 4 +++- vegafusion-common/Cargo.toml | 6 +++--- vegafusion-common/src/data/table.rs | 5 +++-- vegafusion-core/Cargo.toml | 2 +- vegafusion-core/src/data/dataset.rs | 14 ++++---------- vegafusion-core/src/task_graph/graph.rs | 8 +++----- vegafusion-python/Cargo.toml | 3 --- vegafusion-runtime/Cargo.toml | 1 - 9 files changed, 19 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f132b38a..7c82d61b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1563,12 +1563,6 @@ dependencies = [ "byteorder", ] -[[package]] -name = "deterministic-hash" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51a9f34e639edf43f709706fc3016ccc7ce5a819d339fd23665e4385af8e93f0" - [[package]] name = "difflib" version = "0.4.0" @@ -4535,7 +4529,6 @@ version = "2.0.0-a0" dependencies = [ "arrow", "async-trait", - "deterministic-hash", "env_logger", "lazy_static", "log", @@ -4558,6 +4551,7 @@ dependencies = [ name = "vegafusion-common" version = "2.0.0-a0" dependencies = [ + "ahash", "arrow", "base64 0.21.7", "chrono", @@ -4567,7 +4561,6 @@ dependencies = [ "datafusion-functions-nested", "datafusion-proto", "datafusion-proto-common", - "deterministic-hash", "jni", "object_store", "prost", @@ -4583,12 +4576,12 @@ dependencies = [ name = "vegafusion-core" version = "2.0.0-a0" dependencies = [ + "ahash", "async-trait", "bytes", "chrono", "chrono-tz 0.9.0", "datafusion-common", - "deterministic-hash", "itertools 0.10.5", "json-patch", "lazy_static", @@ -4636,7 +4629,6 @@ dependencies = [ "datafusion-optimizer", "datafusion-physical-expr", "datafusion-proto", - "deterministic-hash", "env_logger", "float-cmp", "futures", diff --git a/Cargo.toml b/Cargo.toml index 594fbdaa9..2c4443919 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,6 @@ resolver = "2" members = [ "vegafusion-common", "vegafusion-core", "vegafusion-runtime", "vegafusion-python", "vegafusion-wasm", "vegafusion-server", "examples/rust-examples",] [workspace.dependencies] -deterministic-hash = "1.0.1" async-trait = "0.1.73" futures = "0.3.21" url = "2.3.1" @@ -39,6 +38,9 @@ version = "0.51.0" version = "0.4.35" default-features = false +[workspace.dependencies.ahash] +version = "0.8.11" + [workspace.dependencies.chrono-tz] version = "0.9.0" features = [ "case-insensitive", "filter-by-regex",] diff --git a/vegafusion-common/Cargo.toml b/vegafusion-common/Cargo.toml index fe312bc11..348e2ccd7 100644 --- a/vegafusion-common/Cargo.toml +++ b/vegafusion-common/Cargo.toml @@ -14,9 +14,6 @@ proto = ["datafusion-proto", "datafusion-proto-common", "prost"] [dependencies] thiserror = "^1.0.29" -[dependencies.deterministic-hash] -workspace = true - [dependencies.chrono] workspace = true optional = true @@ -25,6 +22,9 @@ optional = true workspace = true optional = true +[dependencies.ahash] +workspace = true + [dependencies.serde_json] workspace = true default-features = false diff --git a/vegafusion-common/src/data/table.rs b/vegafusion-common/src/data/table.rs index 5c4f8de3b..223dbb4d1 100644 --- a/vegafusion-common/src/data/table.rs +++ b/vegafusion-common/src/data/table.rs @@ -1,5 +1,6 @@ use datafusion_common::ScalarValue; +use ahash::RandomState; use arrow::{ array::{ArrayData, ArrayRef, StructArray, UInt32Array}, compute::concat_batches, @@ -7,6 +8,7 @@ use arrow::{ ipc::{reader::StreamReader, writer::StreamWriter}, record_batch::RecordBatch, }; +use std::hash::BuildHasher; use crate::{ data::{ORDER_COL, ORDER_COL_DTYPE}, @@ -19,7 +21,6 @@ use arrow::array::{ }; #[cfg(feature = "prettyprint")] use arrow::util::pretty::pretty_format_batches; -use std::hash::DefaultHasher; use std::{ hash::{Hash, Hasher}, io::Cursor, @@ -434,7 +435,7 @@ impl VegaFusionTable { } pub fn get_hash(&self) -> u64 { - let mut hasher = deterministic_hash::DeterministicHasher::new(DefaultHasher::new()); + let mut hasher = RandomState::with_seed(123).build_hasher(); self.hash(&mut hasher); hasher.finish() } diff --git a/vegafusion-core/Cargo.toml b/vegafusion-core/Cargo.toml index 622c12d5c..279bf32ef 100644 --- a/vegafusion-core/Cargo.toml +++ b/vegafusion-core/Cargo.toml @@ -25,7 +25,7 @@ log = "0.4.22" [dependencies.lazy_static] workspace = true -[dependencies.deterministic-hash] +[dependencies.ahash] workspace = true [dependencies.prost] diff --git a/vegafusion-core/src/data/dataset.rs b/vegafusion-core/src/data/dataset.rs index adefaa0cf..691b10cf8 100644 --- a/vegafusion-core/src/data/dataset.rs +++ b/vegafusion-core/src/data/dataset.rs @@ -1,6 +1,4 @@ use crate::error::Result; -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; @@ -14,11 +12,9 @@ impl VegaFusionDataset { pub fn fingerprint(&self) -> String { match self { VegaFusionDataset::Table { hash, .. } => hash.to_string(), - VegaFusionDataset::Plan { plan } => { - let mut hasher = deterministic_hash::DeterministicHasher::new(DefaultHasher::new()); - plan.hash(&mut hasher); - hasher.finish().to_string() - } + VegaFusionDataset::Plan { plan } => ahash::RandomState::with_seed(123) + .hash_one(plan) + .to_string(), } } @@ -29,9 +25,7 @@ impl VegaFusionDataset { pub fn from_table_ipc_bytes(ipc_bytes: &[u8]) -> Result { // Hash ipc bytes - let mut hasher = deterministic_hash::DeterministicHasher::new(DefaultHasher::new()); - ipc_bytes.hash(&mut hasher); - let hash = hasher.finish(); + let hash = ahash::RandomState::with_seed(123).hash_one(&ipc_bytes); let table = VegaFusionTable::from_ipc_bytes(ipc_bytes)?; Ok(Self::Table { table, hash }) } diff --git a/vegafusion-core/src/task_graph/graph.rs b/vegafusion-core/src/task_graph/graph.rs index 2b1a1d2dd..29ee2cae3 100644 --- a/vegafusion-core/src/task_graph/graph.rs +++ b/vegafusion-core/src/task_graph/graph.rs @@ -14,9 +14,8 @@ use crate::task_graph::task_value::TaskValue; use crate::proto::gen::tasks::task::TaskKind; use crate::proto::gen::tasks::task_value::Data; use crate::proto::gen::tasks::TaskValue as ProtoTaskValue; -use std::collections::hash_map::DefaultHasher; use std::convert::TryFrom; -use std::hash::{Hash, Hasher}; +use std::hash::{BuildHasher, Hash, Hasher}; struct PetgraphEdge { output_var: Option, @@ -208,8 +207,7 @@ impl TaskGraph { let mut id_fingerprints: Vec = Vec::with_capacity(self.nodes.len()); for (i, node) in self.nodes.iter().enumerate() { let task = node.task(); - let mut hasher = deterministic_hash::DeterministicHasher::new(DefaultHasher::new()); - + let mut hasher = ahash::RandomState::with_seed(123).build_hasher(); if let TaskKind::Value(value) = task.task_kind() { // Only hash the distinction between Scalar and Table, not the value itself. // The state fingerprint takes the value into account. @@ -249,7 +247,7 @@ impl TaskGraph { let mut state_fingerprints: Vec = Vec::with_capacity(self.nodes.len()); for (i, node) in self.nodes.iter().enumerate() { let task = node.task(); - let mut hasher = deterministic_hash::DeterministicHasher::new(DefaultHasher::new()); + let mut hasher = ahash::RandomState::with_seed(123).build_hasher(); if matches!(task.task_kind(), TaskKind::Value(_)) { // Hash the task with inline TaskValue diff --git a/vegafusion-python/Cargo.toml b/vegafusion-python/Cargo.toml index 04e9ef971..bb56dc5ed 100644 --- a/vegafusion-python/Cargo.toml +++ b/vegafusion-python/Cargo.toml @@ -34,9 +34,6 @@ workspace = true workspace = true features = ["tls"] -[dependencies.deterministic-hash] -version = "1.0.1" - [dependencies.serde] version = "1.0.137" features = ["derive"] diff --git a/vegafusion-runtime/Cargo.toml b/vegafusion-runtime/Cargo.toml index 3492b4c15..12b5ad5e4 100644 --- a/vegafusion-runtime/Cargo.toml +++ b/vegafusion-runtime/Cargo.toml @@ -43,7 +43,6 @@ async-lock = "2.8.0" tempfile = "3.3.0" futures-util = "0.3.21" bytes = "1.1.0" -deterministic-hash = "1.0.1" log = "0.4.17" env_logger = "0.10.0" ordered-float = "3.6.0"