Skip to content

Commit

Permalink
Add MinimumShouldMatchIterator (infiniflow#2056)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Add MinimumShouldMatchIterator.
Speed up the fulltext filter when the minimum_should_match option is set.

Issue link: infiniflow#1862

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
- [x] Performance Improvement
  • Loading branch information
yangzq50 authored and vsian committed Oct 18, 2024
1 parent f28b8d2 commit f258385
Show file tree
Hide file tree
Showing 25 changed files with 463 additions and 166 deletions.
4 changes: 3 additions & 1 deletion src/common/stl.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,12 @@ export namespace std {
using std::isnan;
using std::log2;
using std::make_heap;
using std::push_heap;
using std::pop_heap;
using std::max_element;
using std::min_element;
using std::nearbyint;
using std::partial_sort;
using std::pop_heap;
using std::pow;
using std::reduce;
using std::remove_if;
Expand Down Expand Up @@ -291,6 +292,7 @@ export namespace std {
using std::conditional_t;
using std::remove_pointer_t;
using std::remove_reference_t;
using std::derived_from;

using std::function;
using std::monostate;
Expand Down
60 changes: 19 additions & 41 deletions src/executor/operator/physical_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ class FilterIterator final : public DocIterator {
// Forwards the new score threshold unchanged to the wrapped query iterator.
void UpdateScoreThreshold(float threshold) override {
    query_iterator_->UpdateScoreThreshold(threshold);
}

// for minimum_should_match parameter
// Reports the leaf count of the wrapped query iterator.
u32 LeafCount() const override {
    return query_iterator_->LeafCount();
}
// Reports the match count of the wrapped query iterator.
u32 MatchCount() const override {
    return query_iterator_->MatchCount();
}

void PrintTree(std::ostream &os, const String &prefix, bool is_final) const override {
Expand Down Expand Up @@ -144,19 +143,24 @@ struct FilterQueryNode final : public QueryNode {
query_tree_ = std::move(new_query_tree);
}

// Leaf count of this filter node is whatever the wrapped query tree reports.
uint32_t LeafCount() const override {
    return query_tree_->LeafCount();
}

// Handles a pushed-down weight factor by calling MultiplyWeight(factor) on
// this node; query_tree_ is not touched here.
void PushDownWeight(float factor) override {
    MultiplyWeight(factor);
}

std::unique_ptr<DocIterator>
CreateSearch(const TableEntry *table_entry, const IndexReader &index_reader, EarlyTermAlgo early_term_algo) const override {
std::unique_ptr<DocIterator> CreateSearch(const TableEntry *table_entry,
const IndexReader &index_reader,
const EarlyTermAlgo early_term_algo,
const u32 minimum_should_match) const override {
assert(common_query_filter_ != nullptr);
if (!common_query_filter_->AlwaysTrue() && common_query_filter_->filter_result_count_ == 0)
return nullptr;
auto search_iter = query_tree_->CreateSearch(table_entry, index_reader, early_term_algo);
auto search_iter = query_tree_->CreateSearch(table_entry, index_reader, early_term_algo, minimum_should_match);
if (!search_iter) {
return nullptr;
}
if (common_query_filter_->AlwaysTrue())
if (common_query_filter_->AlwaysTrue()) {
return search_iter;
}
return MakeUnique<FilterIterator>(common_query_filter_, std::move(search_iter));
}

Expand Down Expand Up @@ -186,20 +190,18 @@ void ASSERT_FLOAT_EQ(float bar, u32 i, float a, float b) {
}
}

template <bool use_minimum_should_match>
void ExecuteFTSearchT(UniquePtr<DocIterator> &et_iter, FullTextScoreResultHeap &result_heap, u32 &blockmax_loop_cnt, const u32 minimum_should_match) {
void ExecuteFTSearch(UniquePtr<DocIterator> &et_iter, FullTextScoreResultHeap &result_heap, u32 &blockmax_loop_cnt) {
// et_iter is nullptr if fulltext index is present but there's no data
if (et_iter == nullptr) {
LOG_DEBUG(fmt::format("et_iter is nullptr"));
return;
}
while (true) {
++blockmax_loop_cnt;
bool ok = et_iter->Next();
if (!ok) [[unlikely]] {
break;
}
if constexpr (use_minimum_should_match) {
assert(minimum_should_match >= 2);
if (et_iter->MatchCount() < minimum_should_match) {
continue;
}
}
RowID id = et_iter->DocID();
float et_score = et_iter->BM25Score();
if (SHOULD_LOG_DEBUG()) {
Expand All @@ -219,30 +221,6 @@ void ExecuteFTSearchT(UniquePtr<DocIterator> &et_iter, FullTextScoreResultHeap &
}
}

// Entry point for a full-text scan over et_iter.
//
// Resolves minimum_should_match_option into a numeric threshold (using the
// iterator's leaf count) and then dispatches to the matching instantiation
// of ExecuteFTSearchT:
//   - threshold <= 1 (or no option given): ExecuteFTSearchT<false>, threshold passed as 0
//   - threshold >= 2:                      ExecuteFTSearchT<true>, threshold passed through
// Returns immediately (after a debug log) when et_iter is null.
void ExecuteFTSearch(UniquePtr<DocIterator> &et_iter,
                     FullTextScoreResultHeap &result_heap,
                     u32 &blockmax_loop_cnt,
                     const MinimumShouldMatchOption &minimum_should_match_option) {
    // et_iter is nullptr if fulltext index is present but there's no data
    if (et_iter == nullptr) {
        LOG_DEBUG(fmt::format("et_iter is nullptr"));
        return;
    }

    // Stays 0 when the caller supplied no minimum_should_match option.
    u32 required_matches = 0;
    if (const bool has_option = !minimum_should_match_option.empty(); has_option) {
        const auto leaf_count = et_iter->LeafCount();
        required_matches = GetMinimumShouldMatchParameter(minimum_should_match_option, leaf_count);
    }

    if (required_matches >= 2) {
        // The match-count check is only needed from 2 upwards.
        ExecuteFTSearchT<true>(et_iter, result_heap, blockmax_loop_cnt, required_matches);
        return;
    }
    // A threshold of 0 or 1 needs no minimum_should_match handling.
    ExecuteFTSearchT<false>(et_iter, result_heap, blockmax_loop_cnt, 0);
}

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
Expand Down Expand Up @@ -310,13 +288,13 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator
full_text_query_context.query_tree_ = MakeUnique<FilterQueryNode>(common_query_filter_.get(), std::move(query_tree_));

if (use_block_max_iter) {
et_iter = query_builder.CreateSearch(full_text_query_context, early_term_algo_);
et_iter = query_builder.CreateSearch(full_text_query_context, early_term_algo_, minimum_should_match_option_);
// et_iter is nullptr if fulltext index is present but there's no data
if (et_iter != nullptr)
et_iter->UpdateScoreThreshold(begin_threshold_);
}
if (use_ordinary_iter) {
doc_iterator = query_builder.CreateSearch(full_text_query_context, EarlyTermAlgo::kNaive);
doc_iterator = query_builder.CreateSearch(full_text_query_context, EarlyTermAlgo::kNaive, minimum_should_match_option_);
}

// 3 full text search
Expand All @@ -331,7 +309,7 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator
#ifdef INFINITY_DEBUG
auto blockmax_begin_ts = std::chrono::high_resolution_clock::now();
#endif
ExecuteFTSearch(et_iter, result_heap, blockmax_loop_cnt, minimum_should_match_option_);
ExecuteFTSearch(et_iter, result_heap, blockmax_loop_cnt);
result_heap.Sort();
blockmax_result_count = result_heap.GetResultSize();
#ifdef INFINITY_DEBUG
Expand All @@ -346,7 +324,7 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator
#ifdef INFINITY_DEBUG
auto ordinary_begin_ts = std::chrono::high_resolution_clock::now();
#endif
ExecuteFTSearch(doc_iterator, result_heap, ordinary_loop_cnt, minimum_should_match_option_);
ExecuteFTSearch(doc_iterator, result_heap, ordinary_loop_cnt);
result_heap.Sort();
ordinary_result_count = result_heap.GetResultSize();
#ifdef INFINITY_DEBUG
Expand Down
50 changes: 27 additions & 23 deletions src/planner/optimizer/index_scan/index_filter_evaluators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,35 +451,35 @@ UniquePtr<IndexFilterEvaluatorSecondary> IndexFilterEvaluatorSecondary::Make(con
}
}

// Replaces query_tree_ with its optimized form and, when a
// minimum_should_match option is present, resolves it to a number in
// minimum_should_match_ using the optimized tree's leaf count.
//
// Must run at most once: a second call reports an unrecoverable error.
// The after_optimize_ flag is set last (release order), after the tree and
// the resolved value have been written.
void IndexFilterEvaluatorFulltext::OptimizeQueryTree() {
    const bool already_optimized = after_optimize_.test(std::memory_order_acquire);
    if (already_optimized) {
        UnrecoverableError(std::format("{}: Already optimized!", __func__));
    }

    auto optimized_tree = QueryNode::GetOptimizedQueryTree(std::move(query_tree_));
    query_tree_ = std::move(optimized_tree);

    if (const bool has_option = !minimum_should_match_option_.empty(); has_option) {
        const auto optimized_leaf_count = query_tree_->LeafCount();
        minimum_should_match_ = GetMinimumShouldMatchParameter(minimum_should_match_option_, optimized_leaf_count);
    }

    after_optimize_.test_and_set(std::memory_order_release);
}

Bitmask IndexFilterEvaluatorFulltext::Evaluate(const SegmentID segment_id, const SegmentOffset segment_row_count, Txn *txn) const {
if (!after_optimize_.test(std::memory_order_acquire)) {
UnrecoverableError(std::format("{}: Not optimized!", __func__));
}
Bitmask result(segment_row_count);
result.SetAllFalse();
const RowID begin_rowid(segment_id, 0);
const RowID end_rowid(segment_id, segment_row_count);
if (const auto ft_iter = query_tree_->CreateSearch(table_entry_, index_reader_, early_term_algo_); ft_iter && ft_iter->Next(begin_rowid)) {
u32 minimum_should_match_val = 0;
if (!minimum_should_match_option_.empty()) {
const auto leaf_count = ft_iter->LeafCount();
minimum_should_match_val = GetMinimumShouldMatchParameter(minimum_should_match_option_, leaf_count);
}
if (minimum_should_match_val <= 1) {
// no need for minimum_should_match
while (ft_iter->DocID() < end_rowid) {
result.SetTrue(ft_iter->DocID().segment_offset_);
ft_iter->Next();
}
} else {
// now minimum_should_match_val >= 2
// use minimum_should_match
while (ft_iter->DocID() < end_rowid) {
if (ft_iter->MatchCount() >= minimum_should_match_val) {
result.SetTrue(ft_iter->DocID().segment_offset_);
}
ft_iter->Next();
}
if (const auto ft_iter = query_tree_->CreateSearch(table_entry_, index_reader_, early_term_algo_, minimum_should_match_);
ft_iter && ft_iter->Next(begin_rowid)) {
while (ft_iter->DocID() < end_rowid) {
result.SetTrue(ft_iter->DocID().segment_offset_);
ft_iter->Next();
}
result.RunOptimize();
}
result.RunOptimize();
return result;
}

Expand All @@ -503,9 +503,13 @@ Bitmask IndexFilterEvaluatorAND::Evaluate(const SegmentID segment_id, const Segm
const auto &roaring_end = result.End();
Bitmask new_result(segment_row_count);
new_result.SetAllFalse();
if (!fulltext_evaluator_->after_optimize_.test(std::memory_order_acquire)) {
UnrecoverableError(std::format("{}: Not optimized!", __func__));
}
const auto ft_iter = fulltext_evaluator_->query_tree_->CreateSearch(fulltext_evaluator_->table_entry_,
fulltext_evaluator_->index_reader_,
fulltext_evaluator_->early_term_algo_);
fulltext_evaluator_->early_term_algo_,
fulltext_evaluator_->minimum_should_match_);
if (ft_iter) {
const RowID end_rowid(segment_id, segment_row_count);
while (roaring_begin != roaring_end && ft_iter->Next(RowID(segment_id, *roaring_begin))) {
Expand Down
7 changes: 3 additions & 4 deletions src/planner/optimizer/index_scan/index_filter_evaluators.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ export struct IndexFilterEvaluatorFulltext final : IndexFilterEvaluator {
IndexReader index_reader_;
UniquePtr<QueryNode> query_tree_;
MinimumShouldMatchOption minimum_should_match_option_;
u32 minimum_should_match_ = 0;
std::atomic_flag after_optimize_ = {};

IndexFilterEvaluatorFulltext(const FilterFulltextExpression *src_filter_fulltext_expression,
const TableEntry *table_entry,
Expand All @@ -102,10 +104,7 @@ export struct IndexFilterEvaluatorFulltext final : IndexFilterEvaluator {
minimum_should_match_option_(std::move(minimum_should_match_option)) {}
Bitmask Evaluate(SegmentID segment_id, SegmentOffset segment_row_count, Txn *txn) const override;
// True when a minimum_should_match option was supplied (the stored option is non-empty).
bool HaveMinimumShouldMatchOption() const {
    const bool option_missing = minimum_should_match_option_.empty();
    return !option_missing;
}
void OptimizeQueryTree() {
auto new_query_tree = QueryNode::GetOptimizedQueryTree(std::move(query_tree_));
query_tree_ = std::move(new_query_tree);
}
void OptimizeQueryTree();
};

export UniquePtr<IndexFilterEvaluator> IndexFilterEvaluatorBuildFromAnd(Vector<UniquePtr<IndexFilterEvaluator>> candidates);
Expand Down
16 changes: 1 addition & 15 deletions src/storage/invertedindex/search/and_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,7 @@ AndIterator::AndIterator(Vector<UniquePtr<DocIterator>> iterators) : MultiDocIte
++fixed_match_count_;
break;
}
case DocIteratorType::kAndIterator:
case DocIteratorType::kAndNotIterator:
case DocIteratorType::kFilterIterator: {
UnrecoverableError("Wrong optimization result");
break;
}
case DocIteratorType::kOrIterator:
case DocIteratorType::kBMMIterator:
case DocIteratorType::kBMWIterator: {
default: {
dyn_match_ids_.push_back(i);
break;
}
Expand Down Expand Up @@ -124,12 +116,6 @@ void AndIterator::UpdateScoreThreshold(float threshold) {
}
}

// Leaf count of this node: the sum of LeafCount() over all children
// (0 when there are no children).
u32 AndIterator::LeafCount() const {
    u32 total_leaves = 0;
    for (const auto &child : children_) {
        total_leaves += child->LeafCount();
    }
    return total_leaves;
}

u32 AndIterator::MatchCount() const {
if (DocID() == INVALID_ROWID) {
return 0;
Expand Down
2 changes: 0 additions & 2 deletions src/storage/invertedindex/search/and_iterator.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ public:

void UpdateScoreThreshold(float threshold) override;

u32 LeafCount() const override;

u32 MatchCount() const override;

private:
Expand Down
2 changes: 0 additions & 2 deletions src/storage/invertedindex/search/and_not_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,6 @@ float AndNotIterator::BM25Score() { return children_[0]->BM25Score(); }

// Only the first child receives the threshold; the remaining children are not updated here.
void AndNotIterator::UpdateScoreThreshold(float threshold) {
    const auto &first_child = children_[0];
    first_child->UpdateScoreThreshold(threshold);
}

// Leaf count is taken from the first child alone.
u32 AndNotIterator::LeafCount() const {
    const auto &first_child = children_[0];
    return first_child->LeafCount();
}

// Match count is taken from the first child alone.
u32 AndNotIterator::MatchCount() const {
    const auto &first_child = children_[0];
    return first_child->MatchCount();
}

} // namespace infinity
2 changes: 0 additions & 2 deletions src/storage/invertedindex/search/and_not_iterator.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ public:

void UpdateScoreThreshold(float threshold) override;

u32 LeafCount() const override;

u32 MatchCount() const override;

};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ void BlockMaxMaxscoreIterator::UpdateScoreThreshold(const float threshold) {
threshold_ = threshold;
}

// Not supported for this iterator: every call reports an unrecoverable error.
// NOTE(review): the trailing return presumably only satisfies the signature,
// assuming UnrecoverableError does not return normally — confirm.
u32 BlockMaxMaxscoreIterator::LeafCount() const {
    UnrecoverableError("BMM not supported now");
    const u32 unsupported_result = 0;
    return unsupported_result;
}

u32 BlockMaxMaxscoreIterator::MatchCount() const {
UnrecoverableError("BMM not supported now");
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ public:

float BM25Score() override;

u32 LeafCount() const override;

u32 MatchCount() const override;

private:
Expand Down
6 changes: 0 additions & 6 deletions src/storage/invertedindex/search/blockmax_wand_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,12 +257,6 @@ float BlockMaxWandIterator::BM25Score() {
return sum_score;
}

// Leaf count of this node: the sum of LeafCount() over all children
// (0 when there are no children).
u32 BlockMaxWandIterator::LeafCount() const {
    u32 total_leaves = 0;
    for (const auto &child : children_) {
        total_leaves += child->LeafCount();
    }
    return total_leaves;
}

u32 BlockMaxWandIterator::MatchCount() const {
u32 count = 0;
if (const auto current_doc_id = DocID(); current_doc_id != INVALID_ROWID) {
Expand Down
4 changes: 1 addition & 3 deletions src/storage/invertedindex/search/blockmax_wand_iterator.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import internal_types;
namespace infinity {

// Refers to https://engineering.nyu.edu/~suel/papers/bmw.pdf
export class BlockMaxWandIterator final : public MultiDocIterator {
export class BlockMaxWandIterator : public MultiDocIterator {
public:
explicit BlockMaxWandIterator(Vector<UniquePtr<DocIterator>> &&iterators);

Expand All @@ -41,8 +41,6 @@ public:

float BM25Score() override;

u32 LeafCount() const override;

u32 MatchCount() const override;

private:
Expand Down
2 changes: 1 addition & 1 deletion src/storage/invertedindex/search/doc_iterator.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ export enum class DocIteratorType : u8 {
kAndIterator,
kAndNotIterator,
kOrIterator,
kMinimumShouldMatchIterator,
kBMMIterator,
kBMWIterator,
kFilterIterator,
Expand Down Expand Up @@ -73,7 +74,6 @@ public:
virtual void UpdateScoreThreshold(float threshold) = 0;

// for minimum_should_match parameter
virtual u32 LeafCount() const = 0;
virtual u32 MatchCount() const = 0;

// print the query tree, for debugging
Expand Down
Loading

0 comments on commit f258385

Please sign in to comment.