Update tantivy version #5863

Status: Open. Wants to merge 1 commit into base branch main.
47 changes: 24 additions & 23 deletions quickwit/Cargo.lock

Generated file; the Cargo.lock diff is not rendered.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
@@ -347,7 +347,7 @@ quickwit-serve = { path = "quickwit-serve" }
quickwit-storage = { path = "quickwit-storage" }
quickwit-telemetry = { path = "quickwit-telemetry" }

tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "80f5f1e", default-features = false, features = [
tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "b621e4da", default-features = false, features = [
"lz4-compression",
"mmap",
"quickwit",
118 changes: 73 additions & 45 deletions quickwit/quickwit-indexing/src/actors/packager.rs
@@ -32,10 +32,14 @@ use quickwit_proto::search::{
};
use tantivy::index::FieldMetadata;
use tantivy::schema::{FieldType, Type};
use tantivy::{InvertedIndexReader, ReloadPolicy, SegmentMeta};
use tantivy::{ByteCount, Index, InvertedIndexReader, ReloadPolicy, SegmentMeta};
use tokio::runtime::Handle;
use tracing::{debug, info, instrument, warn};

const LIMIT_PACKAGED_FIELDS: usize = 1_000;

Suggested change
const LIMIT_PACKAGED_FIELDS: usize = 1_000;
const QW_MAX_PACKAGED_FIELDS_CAPS_ENV_DEFAULT: usize = 1_000;


const QW_MAX_PACKAGED_FIELDS_CAPS_ENV_KEY: &str = "QW_MAX_PACKAGED_FIELDS_CAPS";

/// Maximum distinct values allowed for a tag field within a split.
const MAX_VALUES_PER_TAG_FIELD: usize = if cfg!(any(test, feature = "testsuite")) {
6
@@ -268,6 +272,41 @@ fn try_extract_terms(
Ok(terms)
}

fn total_num_bytes(field_metadata: &FieldMetadata) -> u64 {

Review comment: list_fields also returns the dynamic fields nested inside a JSON field. How does total_num_bytes work with those: does it return 0, or the size of the parent field?
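
A quick probe for this (a reviewer sketch, not part of the PR; assumes an `index: &Index` that already holds documents with a dynamic JSON field, and uses only APIs that appear elsewhere in this diff):

let fields_metadata = index.fields_metadata()?;
for field_metadata in &fields_metadata {
    // A sub-field whose size counters are all None would report 0 bytes here.
    println!(
        "{} -> {} bytes",
        field_metadata.field_name,
        total_num_bytes(field_metadata)
    );
}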

let num_bytes =
|bytes_opt: &Option<ByteCount>| bytes_opt.map(|b| b.get_bytes()).unwrap_or(0u64);
num_bytes(&field_metadata.term_dictionary_size)
+ num_bytes(&field_metadata.postings_size)
+ num_bytes(&field_metadata.positions_size)
+ num_bytes(&field_metadata.fast_size)
}

fn build_list_fields(index: &Index) -> anyhow::Result<ListFields> {
let fields_metadata: Vec<FieldMetadata> = index.fields_metadata()?;
let num_fields = fields_metadata.len();

let max_packaged_field_caps =
quickwit_common::get_from_env(QW_MAX_PACKAGED_FIELDS_CAPS_ENV_KEY, 1_000);

Suggested change
quickwit_common::get_from_env(QW_MAX_PACKAGED_FIELDS_CAPS_ENV_KEY, 1_000);
quickwit_common::get_from_env(QW_MAX_PACKAGED_FIELDS_CAPS_ENV_KEY, QW_MAX_PACKAGED_FIELDS_CAPS_ENV_DEFAULT);

let fields_within_limit: Vec<FieldMetadata> = if num_fields > max_packaged_field_caps {
info!(
"truncate list field information num_fields={num_fields} limit={LIMIT_PACKAGED_FIELDS}"
);
fields_metadata
.into_iter()
.k_largest_by_key(LIMIT_PACKAGED_FIELDS, total_num_bytes)

Suggested change
.k_largest_by_key(LIMIT_PACKAGED_FIELDS, total_num_bytes)
.k_largest_by_key(max_packaged_field_caps, total_num_bytes)
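
For context, itertools' k_largest_by_key keeps only the k entries with the largest key, which is what bounds the packaged field list here. A standalone sketch (not Quickwit code):

use itertools::Itertools;

// Keep the two entries with the largest byte count.
let sizes = vec![("a", 10u64), ("b", 300u64), ("c", 42u64)];
let top2: Vec<(&str, u64)> = sizes
    .into_iter()
    .k_largest_by_key(2, |&(_, bytes)| bytes)
    .collect();
let names: Vec<&str> = top2.iter().map(|&(name, _)| name).collect();
assert!(names.contains(&"b") && names.contains(&"c"));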

.collect()
} else {
fields_metadata
};

let fields = fields_within_limit
.iter()
.map(field_metadata_to_list_field_serialized)
.collect::<Vec<_>>();

Ok(ListFields { fields })
}

fn create_packaged_split(
segment_metas: &[SegmentMeta],
split: IndexedSplit,
@@ -286,8 +325,6 @@ fn create_packaged_split(
.reload_policy(ReloadPolicy::Manual)
.try_into()?;

let fields_metadata = split.index.fields_metadata()?;

let mut tags = BTreeSet::default();
for named_field in tag_fields {
let inverted_indexes = index_reader
@@ -314,7 +351,8 @@
build_hotcache(split.split_scratch_directory.path(), &mut hotcache_bytes)?;
ctx.record_progress();

let serialized_split_fields = serialize_field_metadata(&fields_metadata);
let list_fields: ListFields = build_list_fields(&split.index)?;
let serialized_split_fields = serialize_split_fields(&list_fields);

let packaged_split = PackagedSplit {
serialized_split_fields,
@@ -327,18 +365,6 @@
Ok(packaged_split)
}

/// Serializes the Split fields.
///
/// `fields_metadata` has to be sorted.
fn serialize_field_metadata(fields_metadata: &[FieldMetadata]) -> Vec<u8> {
let fields = fields_metadata
.iter()
.map(field_metadata_to_list_field_serialized)
.collect::<Vec<_>>();

serialize_split_fields(ListFields { fields })
}

fn tantivy_type_to_list_field_type(typ: Type) -> ListFieldType {
match typ {
Type::Str => ListFieldType::Str,
@@ -360,8 +386,8 @@ fn field_metadata_to_list_field_serialized(
ListFieldsEntryResponse {
field_name: field_metadata.field_name.to_string(),
field_type: tantivy_type_to_list_field_type(field_metadata.typ) as i32,
searchable: field_metadata.indexed,
aggregatable: field_metadata.fast,
searchable: field_metadata.is_indexed(),
aggregatable: field_metadata.is_fast(),
index_ids: Vec::new(),
non_searchable_index_ids: Vec::new(),
non_aggregatable_index_ids: Vec::new(),
@@ -385,45 +411,47 @@ mod tests {
use quickwit_proto::search::{ListFieldsEntryResponse, deserialize_split_fields};
use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId};
use tantivy::directory::MmapDirectory;
use tantivy::schema::{FAST, NumericOptions, STRING, Schema, TEXT, Type};
use tantivy::schema::{FAST, NumericOptions, STRING, Schema, TEXT};
use tantivy::{DateTime, IndexBuilder, IndexSettings, doc};
use tracing::Span;

use super::*;
use crate::models::{PublishLock, SplitAttrs};

#[test]
fn serialize_field_metadata_test() {
let fields_metadata = vec![
FieldMetadata {
field_name: "test".to_string(),
typ: Type::Str,
indexed: true,
stored: true,
fast: true,
},
FieldMetadata {
field_name: "test2".to_string(),
typ: Type::Str,
indexed: true,
stored: false,
fast: false,
},
FieldMetadata {
field_name: "test3".to_string(),
typ: Type::U64,
indexed: true,
stored: false,
fast: true,
},
];
fn test_serialize_split_fields() {
let list_fields = ListFields {
fields: vec![
ListFieldsEntryResponse {
field_name: "test".to_string(),
field_type: ListFieldType::Str as i32,
searchable: true,
aggregatable: true,
..Default::default()
},
ListFieldsEntryResponse {
field_name: "test2".to_string(),
field_type: ListFieldType::Str as i32,
searchable: true,
aggregatable: false,
..Default::default()
},
ListFieldsEntryResponse {
field_name: "test3".to_string(),
field_type: ListFieldType::Str as i32,
searchable: true,
aggregatable: true,
..Default::default()
},
],
};

let out = serialize_field_metadata(&fields_metadata);
let out = serialize_split_fields(&list_fields);

let deserialized: Vec<ListFieldsEntryResponse> =
deserialize_split_fields(&mut &out[..]).unwrap().fields;

assert_eq!(fields_metadata.len(), deserialized.len());
assert_eq!(list_fields.fields.len(), deserialized.len());
assert_eq!(deserialized[0].field_name, "test");
assert_eq!(deserialized[0].field_type, ListFieldType::Str as i32);
assert!(deserialized[0].searchable);
2 changes: 1 addition & 1 deletion quickwit/quickwit-proto/src/search/mod.rs
@@ -227,7 +227,7 @@ impl PartialHit {
/// Serializes the Split fields.
///
/// `fields_metadata` has to be sorted.
pub fn serialize_split_fields(list_fields: ListFields) -> Vec<u8> {
pub fn serialize_split_fields(list_fields: &ListFields) -> Vec<u8> {
let payload = list_fields.encode_to_vec();
let compression_level = 3;
let payload_compressed = zstd::stream::encode_all(&mut &payload[..], compression_level)
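
For reference, a round-trip sketch of the two helpers this PR touches (a minimal illustration, assuming both functions are exported from quickwit_proto::search, as the test imports above suggest):

use quickwit_proto::search::{ListFields, deserialize_split_fields, serialize_split_fields};

// Serialize an (empty) field list into the zstd-compressed payload and read it back.
let list_fields = ListFields { fields: Vec::new() };
let bytes: Vec<u8> = serialize_split_fields(&list_fields);
let decoded: ListFields = deserialize_split_fields(&mut &bytes[..]).expect("valid payload");
assert!(decoded.fields.is_empty());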
14 changes: 7 additions & 7 deletions quickwit/quickwit-search/src/list_fields.rs
@@ -70,15 +70,15 @@ pub async fn get_fields_from_split(
Ordering::Equal => left.field_type.cmp(&right.field_type),
other => other,
});
let list_fields = ListFields {
fields: list_fields,
};
// Put result into cache
searcher_context.list_fields_cache.put(
split_and_footer_offsets.clone(),
ListFields {
fields: list_fields.clone(),
},
);
searcher_context
.list_fields_cache
.put(split_and_footer_offsets.clone(), &list_fields);

Ok(Box::new(list_fields.into_iter()))
Ok(Box::new(list_fields.fields.into_iter()))
}

/// `current_group` needs to contain at least one element.
5 changes: 2 additions & 3 deletions quickwit/quickwit-search/src/list_fields_cache.rs
@@ -41,9 +41,8 @@ impl ListFieldsCache {
deserialize_split_fields(encoded_result).ok()
}

pub fn put(&self, split_info: SplitIdAndFooterOffsets, list_fields: ListFields) {
pub fn put(&self, split_info: SplitIdAndFooterOffsets, list_fields: &ListFields) {
let key = CacheKey::from_split_meta(split_info);

let encoded_result = serialize_split_fields(list_fields);
self.content.put(key, OwnedBytes::new(encoded_result));
}
@@ -110,7 +109,7 @@ mod tests {
fields: vec![result.clone()],
};

cache.put(split_1.clone(), list_fields.clone());
cache.put(split_1.clone(), &list_fields);
assert_eq!(cache.get(split_1.clone()).unwrap(), list_fields);
assert!(cache.get(split_2).is_none());
}