Skip to content

Commit

Permalink
feat(storage): support vector block (#871)
Browse files Browse the repository at this point in the history
We added a new block type that stores fixed-size chunk data. The
in-memory array type is added in
#869.

#864

---------

Signed-off-by: Alex Chi Z <[email protected]>
Signed-off-by: Runji Wang <[email protected]>
Signed-off-by: Alex Chi <[email protected]>
Co-authored-by: Runji Wang <[email protected]>
  • Loading branch information
skyzh and wangrunji0408 authored Jan 26, 2025
1 parent 210c770 commit 1d646b8
Show file tree
Hide file tree
Showing 15 changed files with 632 additions and 17 deletions.
6 changes: 5 additions & 1 deletion src/array/var_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use std::borrow::Borrow;
use std::fmt::{Display, Write};
use std::hash::Hash;
use std::marker::PhantomData;
use std::mem;

Expand All @@ -20,7 +21,10 @@ pub struct VarArray<T: ValueRef<U> + ?Sized, U: PrimitiveValueType = u8> {
_type: PhantomData<T>,
}

pub trait PrimitiveValueType: Send + Sync + 'static + Copy + Clone + Default {}
pub trait PrimitiveValueType:
Send + Sync + 'static + Copy + Clone + Default + PartialEq + Eq + Hash
{
}

impl PrimitiveValueType for u8 {}
impl PrimitiveValueType for F64 {}
Expand Down
4 changes: 4 additions & 0 deletions src/storage/secondary/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ mod primitive_block_builder;
mod primitive_block_iterator;
mod rle_block_builder;
mod rle_block_iterator;
mod vector_block_builder;
mod vector_block_iterator;

use bitvec::prelude::{BitVec, Lsb0};
pub use blob_block_builder::*;
Expand All @@ -38,6 +40,8 @@ pub use block_index_builder::*;
use bytes::{Buf, BufMut, Bytes};
use risinglight_proto::rowset::block_checksum::ChecksumType;
use risinglight_proto::rowset::block_index::BlockType;
pub use vector_block_builder::*;
pub use vector_block_iterator::*;

use super::StorageResult;
use crate::array::Array;
Expand Down
139 changes: 139 additions & 0 deletions src/storage/secondary/block/vector_block_builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Copyright 2024 RisingLight Project Authors. Licensed under Apache-2.0.

use bitvec::prelude::{BitVec, Lsb0};
use bytes::BufMut;
use risinglight_proto::rowset::BlockStatistics;

use super::super::statistics::StatisticsBuilder;
use super::{BlockBuilder, NonNullableBlockBuilder};
use crate::array::VectorArray;
use crate::types::VectorRef;

/// Encodes fixed-chunk data into a block. The data layout is
/// ```plain
/// | data | data | data | element_size |
/// ```
/// The `element_size` is the size for each vector element, and the data is aligned to the
/// `element_size`. The length of each element is `element_size * std::mem::size_of::<f64>()`.
pub struct PlainVectorBlockBuilder {
data: Vec<u8>,
element_size: Option<usize>,
target_size: usize,
}

impl PlainVectorBlockBuilder {
pub fn new(target_size: usize) -> Self {
let data = Vec::with_capacity(target_size);
Self {
data,
element_size: None,
target_size,
}
}
}

impl PlainVectorBlockBuilder {
fn update_element_size(&mut self, new_element_size: usize) {
if let Some(element_size) = self.element_size {
assert_eq!(element_size, new_element_size);
}
self.element_size = Some(new_element_size);
}
}

impl NonNullableBlockBuilder<VectorArray> for PlainVectorBlockBuilder {
fn append_value(&mut self, item: &VectorRef) {
for i in item.iter() {
self.data.extend_from_slice(&i.to_le_bytes());
}
self.update_element_size(item.len());
}

fn append_default(&mut self) {
panic!("PlainVectorBlockBuilder does not support append_default");
}

fn get_statistics_with_bitmap(&self, selection: &BitVec<u8, Lsb0>) -> Vec<BlockStatistics> {
let selection_empty = selection.is_empty();
let mut stats_builder = StatisticsBuilder::new();
let element_size = self.element_size.unwrap();
let item_cnt = self.data.len() / element_size / std::mem::size_of::<f64>();
for idx in 0..item_cnt {
let begin_pos = idx * element_size * std::mem::size_of::<f64>();
let end_pos = begin_pos + element_size * std::mem::size_of::<f64>();

if selection_empty || selection[idx] {
stats_builder.add_item(Some(&self.data[begin_pos..end_pos]));
}
}
stats_builder.get_statistics()
}

fn estimated_size_with_next_item(&self, next_item: &Option<&VectorRef>) -> usize {
self.estimated_size()
+ next_item
.map(|x| x.len() * std::mem::size_of::<f64>())
.unwrap_or(0)
+ std::mem::size_of::<u32>()
}

fn is_empty(&self) -> bool {
self.data.is_empty()
}
}

impl BlockBuilder<VectorArray> for PlainVectorBlockBuilder {
fn append(&mut self, item: Option<&VectorRef>) {
match item {
Some(item) => {
self.append_value(item);
}
None => {
self.append_default();
}
}
}

fn estimated_size(&self) -> usize {
self.data.len() + std::mem::size_of::<u32>() // element_size
}

fn should_finish(&self, next_item: &Option<&VectorRef>) -> bool {
!self.is_empty() && self.estimated_size_with_next_item(next_item) > self.target_size
}

fn get_statistics(&self) -> Vec<BlockStatistics> {
self.get_statistics_with_bitmap(&BitVec::new())
}

fn finish(self) -> Vec<u8> {
let mut encoded_data = vec![];
encoded_data.extend(self.data);
encoded_data.put_u32(self.element_size.unwrap() as u32); // so that we can likely get vectors aligned
encoded_data
}

fn get_target_size(&self) -> usize {
self.target_size
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_build_vector() {
let mut builder = PlainVectorBlockBuilder::new(128);
builder.append(Some(VectorRef::new(&[1.0.into(), 2.0.into(), 3.0.into()])));
builder.append(Some(VectorRef::new(&[4.0.into(), 5.0.into(), 6.0.into()])));
builder.append_value(VectorRef::new(&[7.0.into(), 8.0.into(), 9.0.into()]));
assert_eq!(builder.estimated_size(), 3 * 3 * 8 + 4);
assert!(!builder.should_finish(&Some(VectorRef::new(&[
10.0.into(),
11.0.into(),
12.0.into()
]))));
builder.finish();
}
}
159 changes: 159 additions & 0 deletions src/storage/secondary/block/vector_block_iterator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// Copyright 2024 RisingLight Project Authors. Licensed under Apache-2.0.

use bytes::Buf;

use super::{Block, BlockIterator, NonNullableBlockIterator};
use crate::array::{ArrayBuilder, VectorArray, VectorArrayBuilder};
use crate::types::{VectorRef, F64};

/// Scans one or several arrays from the block content.
pub struct PlainVectorBlockIterator {
/// Block content
block: Block,

/// Total count of elements in block
row_count: usize,

/// Indicates the beginning row of the next batch
next_row: usize,

/// Fixed-size buffer for vector data
vec_buffer: Vec<F64>,

/// The size for each vector element
element_size: usize,
}

impl PlainVectorBlockIterator {
pub fn new(block: Block, row_count: usize) -> Self {
let element_size =
(&block[block.len() - std::mem::size_of::<u32>()..block.len()]).get_u32() as usize;

Self {
block,
row_count,
next_row: 0,
vec_buffer: Vec::new(),
element_size,
}
}
}

impl NonNullableBlockIterator<VectorArray> for PlainVectorBlockIterator {
fn next_batch_non_null(
&mut self,
expected_size: Option<usize>,
builder: &mut VectorArrayBuilder,
) -> usize {
if self.next_row >= self.row_count {
return 0;
}

// TODO(chi): error handling on corrupted block

let mut cnt = 0;
let data_buffer = &self.block[..];

loop {
if let Some(expected_size) = expected_size {
assert!(expected_size > 0);
if cnt >= expected_size {
break;
}
}

if self.next_row >= self.row_count {
break;
}

let from = self.next_row * self.element_size * std::mem::size_of::<f64>();
let to = from + self.element_size * std::mem::size_of::<f64>();
assert!((to - from) % std::mem::size_of::<f64>() == 0);
self.vec_buffer.clear();
self.vec_buffer
.reserve(self.element_size * std::mem::size_of::<f64>());
let mut buf = &data_buffer[from..to];
for _ in 0..self.element_size {
self.vec_buffer.push(F64::from(buf.get_f64_le()));
}
builder.push(Some(VectorRef::new(&self.vec_buffer)));

cnt += 1;
self.next_row += 1;
}

cnt
}
}

impl BlockIterator<VectorArray> for PlainVectorBlockIterator {
fn next_batch(
&mut self,
expected_size: Option<usize>,
builder: &mut VectorArrayBuilder,
) -> usize {
self.next_batch_non_null(expected_size, builder)
}

fn skip(&mut self, cnt: usize) {
self.next_row += cnt;
}

fn remaining_items(&self) -> usize {
self.row_count - self.next_row
}
}

#[cfg(test)]
mod tests {
use bytes::Bytes;

use super::*;
use crate::array::{ArrayBuilder, ArrayToVecExt, VectorArrayBuilder};
use crate::storage::secondary::block::{BlockBuilder, PlainVectorBlockBuilder};
use crate::storage::secondary::BlockIterator;
use crate::types::Vector;

#[test]
fn test_scan_vector() {
let mut builder = PlainVectorBlockBuilder::new(128);
let input = [
Some(Vector::new(vec![1.0, 2.0, 3.0])),
Some(Vector::new(vec![4.0, 5.0, 6.0])),
Some(Vector::new(vec![7.0, 8.0, 9.0])),
];

input
.iter()
.for_each(|v| builder.append(v.as_ref().map(|v| v.as_ref())));
let data = builder.finish();

let mut scanner = PlainVectorBlockIterator::new(Bytes::from(data), 3);

let mut builder = VectorArrayBuilder::new();

scanner.skip(1);
assert_eq!(scanner.remaining_items(), 2);

assert_eq!(scanner.next_batch(Some(1), &mut builder), 1);
assert_eq!(
builder.finish().to_vec(),
vec![Some(
VectorRef::new(&[F64::from(4.0), F64::from(5.0), F64::from(6.0)]).to_vector()
)]
);

let mut builder = VectorArrayBuilder::new();
assert_eq!(scanner.next_batch(Some(2), &mut builder), 1);

assert_eq!(
builder.finish().to_vec(),
vec![Some(
VectorRef::new(&[F64::from(7.0), F64::from(8.0), F64::from(9.0)]).to_vector()
)]
);

let mut builder = VectorArrayBuilder::new();
assert_eq!(scanner.next_batch(None, &mut builder), 0);
}
}
17 changes: 10 additions & 7 deletions src/storage/secondary/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,34 @@
mod blob_column_builder;
mod blob_column_factory;
mod char_column_builder;
mod char_column_factory;
mod column_builder;
mod column_iterator;
mod concrete_column_iterator;
mod primitive_column_builder;
mod primitive_column_factory;
mod row_handler_column_iterator;
mod vector_column_builder;
mod vector_column_factory;

use std::future::Future;
use std::io::{Read, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::sync::{Arc, Mutex};

pub use blob_column_factory::*;
use bytes::Bytes;
pub use char_column_factory::*;
pub use column_builder::*;
pub use column_iterator::*;
pub use concrete_column_iterator::*;
use moka::future::Cache;
pub use primitive_column_builder::*;
pub use primitive_column_factory::*;
use risinglight_proto::rowset::BlockIndex;
pub use row_handler_column_iterator::*;
mod char_column_factory;
use std::os::unix::fs::FileExt;
use std::sync::{Arc, Mutex};

use bytes::Bytes;
pub use char_column_factory::*;
use moka::future::Cache;
pub use vector_column_builder::*;
pub use vector_column_factory::*;

use super::block::BLOCK_META_CHECKSUM_SIZE;
use super::{Block, BlockCacheKey, BlockMeta, ColumnIndex, BLOCK_META_SIZE};
Expand Down
Loading

0 comments on commit 1d646b8

Please sign in to comment.