diff --git a/src/distance/angular.rs b/src/distance/angular.rs index 2e2520f6..c13c0b4e 100644 --- a/src/distance/angular.rs +++ b/src/distance/angular.rs @@ -20,12 +20,13 @@ pub enum Angular {} #[repr(C)] #[derive(Pod, Zeroable, Debug, Clone, Copy)] pub struct NodeHeaderAngular { - norm: f32, + pub(super) norm: f32, } impl Distance for Angular { type Header = NodeHeaderAngular; type VectorCodec = f32; + type ExactDistanceTrait = Angular; fn name() -> &'static str { "angular" diff --git a/src/distance/binary_quantized_angular.rs b/src/distance/binary_quantized_angular.rs new file mode 100644 index 00000000..c60d1507 --- /dev/null +++ b/src/distance/binary_quantized_angular.rs @@ -0,0 +1,140 @@ +use std::borrow::Cow; + +use bytemuck::{Pod, Zeroable}; +use rand::Rng; + +use super::{two_means_binary_quantized as two_means, Angular, NodeHeaderAngular}; +use crate::distance::Distance; +use crate::node::Leaf; +use crate::parallel::ImmutableSubsetLeafs; +use crate::unaligned_vector::{self, BinaryQuantized, UnalignedVector}; + +/// The Cosine similarity is a measure of similarity between two +/// non-zero vectors defined in an inner product space. Cosine similarity +/// is the cosine of the angle between the vectors. +#[derive(Debug, Clone)] +pub enum BinaryQuantizedAngular {} + +/// The header of BinaryQuantizedAngular leaf nodes. +#[repr(C)] +#[derive(Pod, Zeroable, Debug, Clone, Copy)] +pub struct NodeHeaderBinaryQuantizedAngular { + norm: f32, +} + +impl Distance for BinaryQuantizedAngular { + type Header = NodeHeaderBinaryQuantizedAngular; + type VectorCodec = unaligned_vector::BinaryQuantized; + type ExactDistanceTrait = Angular; + + fn name() -> &'static str { + "binary quantized angular" + } + + fn new_header(vector: &UnalignedVector) -> Self::Header { + NodeHeaderBinaryQuantizedAngular { norm: Self::norm_no_header(vector) } + } + + fn built_distance(p: &Leaf, q: &Leaf) -> f32 { + let pn = p.header.norm; + let qn = q.header.norm; + let pq = dot_product(&p.vector, &q.vector); + let pnqn = pn * qn; + if pnqn != 0.0 { + let cos = pq / pnqn; + // cos is [-1; 1] + // cos = 0. -> 0.5 + // cos = -1. -> 1.0 + // cos = 1. -> 0.0 + (1.0 - cos) / 2.0 + } else { + 0.0 + } + } + + /// Normalizes the distance returned by the distance method. + fn normalized_distance(d: f32, _dimensions: usize) -> f32 { + d + } + + fn norm_no_header(v: &UnalignedVector) -> f32 { + dot_product(v, v).sqrt() + } + + fn init(node: &mut Leaf) { + node.header.norm = dot_product(&node.vector, &node.vector).sqrt(); + } + + fn create_split<'a, R: Rng>( + children: &'a ImmutableSubsetLeafs, + rng: &mut R, + ) -> heed::Result::VectorCodec>>> + { + let [node_p, node_q] = two_means::(rng, children, true)?; + let vector: Vec = + node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); + let unaligned_vector = UnalignedVector::from_vec(vector); + let mut normal = Leaf { header: NodeHeaderAngular { norm: 0.0 }, vector: unaligned_vector }; + Angular::normalize(&mut normal); + + Ok(normal.vector) + } + + fn margin_no_header( + p: &UnalignedVector, + q: &UnalignedVector, + ) -> f32 { + dot_product(p, q) + } +} + +fn bits(mut word: u8) -> [f32; 8] { + let mut ret = [0.0; 8]; + for i in 0..8 { + let bit = word & 1; + word >>= 1; + if bit == 0 { + ret[i] = -1.0; + // ret[i] = 0.0; + } else { + ret[i] = 1.0; + } + } + + ret +} + +fn dot_product(u: &UnalignedVector, v: &UnalignedVector) -> f32 { + // /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s. + // This may or may not impact relevancy since the 1s will be added to every vector. + // u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::() as f32 + + u.as_bytes() + .iter() + .zip(v.as_bytes()) + .flat_map(|(u, v)| { + let u = bits(*u); + let v = bits(*v); + u.into_iter().zip(v).map(|(u, v)| u * v) + }) + .sum::() +} + +fn squared_euclidean_distance( + u: &UnalignedVector, + v: &UnalignedVector, +) -> f32 { + // /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s. + // This may or may not impact relevancy since the 1s will be added to every vector. + // u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::() as f32 + + u.as_bytes() + .iter() + .zip(v.as_bytes()) + .flat_map(|(u, v)| { + let u = bits(*u); + let v = bits(*v); + u.into_iter().zip(v).map(|(u, v)| (u - v) * (u - v)) + }) + .sum::() +} diff --git a/src/distance/binary_quantized_euclidean.rs b/src/distance/binary_quantized_euclidean.rs index 724fb0ed..6fe552fa 100644 --- a/src/distance/binary_quantized_euclidean.rs +++ b/src/distance/binary_quantized_euclidean.rs @@ -27,6 +27,7 @@ pub struct NodeHeaderBinaryQuantizedEuclidean { impl Distance for BinaryQuantizedEuclidean { type Header = NodeHeaderBinaryQuantizedEuclidean; type VectorCodec = unaligned_vector::BinaryQuantized; + type ExactDistanceTrait = Euclidean; fn name() -> &'static str { "binary quantized euclidean" @@ -54,7 +55,8 @@ impl Distance for BinaryQuantizedEuclidean { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result::VectorCodec>>> + { let [node_p, node_q] = two_means::(rng, children, false)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); diff --git a/src/distance/binary_quantized_manhattan.rs b/src/distance/binary_quantized_manhattan.rs index 3a724f10..8a6460b4 100644 --- a/src/distance/binary_quantized_manhattan.rs +++ b/src/distance/binary_quantized_manhattan.rs @@ -26,6 +26,7 @@ pub struct NodeHeaderBinaryQuantizedManhattan { impl Distance for BinaryQuantizedManhattan { type Header = NodeHeaderBinaryQuantizedManhattan; type VectorCodec = unaligned_vector::BinaryQuantized; + type ExactDistanceTrait = Manhattan; fn name() -> &'static str { "binary quantized manhattan" @@ -54,7 +55,8 @@ impl Distance for BinaryQuantizedManhattan { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result::VectorCodec>>> + { let [node_p, node_q] = two_means::(rng, children, false)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); diff --git a/src/distance/dot_product.rs b/src/distance/dot_product.rs index 6a948c98..e309ac36 100644 --- a/src/distance/dot_product.rs +++ b/src/distance/dot_product.rs @@ -31,6 +31,7 @@ pub struct NodeHeaderDotProduct { impl Distance for DotProduct { type Header = NodeHeaderDotProduct; type VectorCodec = f32; + type ExactDistanceTrait = Self; fn name() -> &'static str { "dot-product" diff --git a/src/distance/euclidean.rs b/src/distance/euclidean.rs index 1c0b2f54..1cae9c84 100644 --- a/src/distance/euclidean.rs +++ b/src/distance/euclidean.rs @@ -28,6 +28,7 @@ pub struct NodeHeaderEuclidean { impl Distance for Euclidean { type Header = NodeHeaderEuclidean; type VectorCodec = f32; + type ExactDistanceTrait = Self; fn name() -> &'static str { "euclidean" diff --git a/src/distance/manhattan.rs b/src/distance/manhattan.rs index ae4ba4d5..5139ca2f 100644 --- a/src/distance/manhattan.rs +++ b/src/distance/manhattan.rs @@ -27,6 +27,7 @@ pub struct NodeHeaderManhattan { impl Distance for Manhattan { type Header = NodeHeaderManhattan; type VectorCodec = f32; + type ExactDistanceTrait = Self; fn name() -> &'static str { "manhattan" diff --git a/src/distance/mod.rs b/src/distance/mod.rs index 7191d60b..0bf1c862 100644 --- a/src/distance/mod.rs +++ b/src/distance/mod.rs @@ -42,6 +42,8 @@ pub trait Distance: Send + Sync + Sized + Clone + fmt::Debug + 'static { /// A header structure with informations related to the type Header: Pod + Zeroable + fmt::Debug; type VectorCodec: UnalignedVectorCodec; + /// The trait used to compute the split nodes and internal distance in arroy + type ExactDistanceTrait: Distance; fn name() -> &'static str; @@ -95,7 +97,7 @@ pub trait Distance: Send + Sync + Sized + Clone + fmt::Debug + 'static { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>>; + ) -> heed::Result::VectorCodec>>>; fn margin(p: &Leaf, q: &Leaf) -> f32 { Self::margin_no_header(&p.vector, &q.vector) @@ -107,11 +109,13 @@ pub trait Distance: Send + Sync + Sized + Clone + fmt::Debug + 'static { ) -> f32; fn side( - normal_plane: &UnalignedVector, + normal_plane: &UnalignedVector<::VectorCodec>, node: &Leaf, rng: &mut R, ) -> Side { - let dot = Self::margin_no_header(&node.vector, normal_plane); + let node = node.vector.iter().collect(); + let node = UnalignedVector::from_vec(node); + let dot = Self::ExactDistanceTrait::margin_no_header(&node, normal_plane); if dot > 0.0 { Side::Right } else if dot < 0.0 { diff --git a/src/node.rs b/src/node.rs index b2bc1cc8..a57452fb 100644 --- a/src/node.rs +++ b/src/node.rs @@ -115,7 +115,7 @@ impl fmt::Debug for ItemIds<'_> { pub struct SplitPlaneNormal<'a, D: Distance> { pub left: NodeId, pub right: NodeId, - pub normal: Cow<'a, UnalignedVector>, + pub normal: Cow<'a, UnalignedVector<::VectorCodec>>, } impl fmt::Debug for SplitPlaneNormal<'_, D> { @@ -178,11 +178,17 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec { [SPLIT_PLANE_NORMAL_TAG, bytes @ ..] => { let (left, bytes) = NodeId::from_bytes(bytes); let (right, bytes) = NodeId::from_bytes(bytes); - Ok(Node::SplitPlaneNormal(SplitPlaneNormal { - normal: UnalignedVector::::from_bytes(bytes)?, - left, - right, - })) + Ok( + Node::SplitPlaneNormal( + SplitPlaneNormal { + normal: UnalignedVector::< + ::VectorCodec, + >::from_bytes(bytes)?, + left, + right, + }, + ), + ) } [DESCENDANTS_TAG, bytes @ ..] => Ok(Node::Descendants(Descendants { descendants: Cow::Owned(RoaringBitmap::deserialize_from(bytes)?), diff --git a/src/reader.rs b/src/reader.rs index f9e2ba05..5fd40a98 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -215,6 +215,11 @@ impl<'t, D: Distance> Reader<'t, D> { if self.items.is_empty() { return Ok(Vec::new()); } + let exact_query_vector = query_leaf.vector.iter().collect(); + let exact_query_vector = + UnalignedVector::<::VectorCodec>::from_vec( + exact_query_vector, + ); // Since the datastructure describes a kind of btree, the capacity is something in the order of: // The number of root nodes + log2 of the total number of vectors. let mut queue = @@ -246,7 +251,8 @@ impl<'t, D: Distance> Reader<'t, D> { } } Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => { - let margin = D::margin_no_header(&normal, &query_leaf.vector); + let margin = + ::margin_no_header(&normal, &exact_query_vector); queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Left)), left)); queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Right)), right)); } diff --git a/src/unaligned_vector/binary_quantized.rs b/src/unaligned_vector/binary_quantized.rs index a8eb607d..24decef9 100644 --- a/src/unaligned_vector/binary_quantized.rs +++ b/src/unaligned_vector/binary_quantized.rs @@ -79,6 +79,7 @@ impl Iterator for BinaryQuantizedIterator<'_> { if bit == 0 { Some(-1.0) + // Some(0.0) } else { Some(1.0) }