diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp index aaa6df569e17c6..a1ac87797ded4c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp @@ -69,5 +69,10 @@ void QueryHelper::collect_range(const IndexQueryContextPtr& context, } } +bool QueryHelper::is_simple_phrase(const std::vector& term_infos) { + return std::ranges::all_of(term_infos, + [](const auto& term_info) { return term_info.is_single_term(); }); +} + #include "common/compile_check_end.h" } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h index b7e5d51abcd172..319554ec46a1f9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h @@ -37,6 +37,8 @@ class QueryHelper { const DocRange& doc_range); static void collect_range(const IndexQueryContextPtr& context, const SimilarityPtr& similarity, const DocRange& doc_range); + + static bool is_simple_phrase(const std::vector& term_infos); }; #include "common/compile_check_end.h" diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h index 55cddec4551790..e24bd09b134ae5 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h @@ -32,35 +32,36 @@ class DocSet { virtual ~DocSet() = default; virtual uint32_t advance() { - throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, - "advance() method not implemented in base DocSet class"); + throw Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, + "advance() method not implemented in base DocSet class"); } virtual 
uint32_t seek(uint32_t target) { - throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, - "seek() method not implemented in base DocSet class"); + throw Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, + "seek() method not implemented in base DocSet class"); } virtual uint32_t doc() const { - throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, - "doc() method not implemented in base DocSet class"); + throw Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, + "doc() method not implemented in base DocSet class"); } virtual uint32_t size_hint() const { - throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, - "size_hint() method not implemented in base DocSet class"); + throw Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, + "size_hint() method not implemented in base DocSet class"); } virtual uint32_t freq() const { - throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, - "freq() method not implemented in base DocSet class"); + throw Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, + "freq() method not implemented in base DocSet class"); } virtual uint32_t norm() const { - throw doris::Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, - "norm() method not implemented in base DocSet class"); + throw Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, + "norm() method not implemented in base DocSet class"); } }; +using DocSetPtr = std::shared_ptr; class MockDocSet : public DocSet { public: @@ -77,53 +78,114 @@ class MockDocSet : public DocSet { } } - uint32_t advance() override { + MockDocSet(std::vector docs, std::map> doc_positions, + uint32_t size_hint_val = 0, uint32_t norm_val = 1) + : _docs(std::move(docs)), + _doc_positions(std::move(doc_positions)), + _size_hint_val(size_hint_val), + _norm_val(norm_val) { + if (_docs.empty()) { + _current_doc = TERMINATED; + } else { + std::ranges::sort(_docs.begin(), _docs.end()); + _current_doc = _docs[0]; + } + if (_size_hint_val == 0) { + _size_hint_val = 
static_cast(_docs.size()); + } + } + + // Basic TermIterator-style interface (foundation methods) + bool next() { if (_docs.empty() || _index >= _docs.size()) { _current_doc = TERMINATED; - return TERMINATED; + return false; } ++_index; if (_index >= _docs.size()) { _current_doc = TERMINATED; - return TERMINATED; + return false; } _current_doc = _docs[_index]; - return _current_doc; + return true; } - uint32_t seek(uint32_t target) override { + bool skipTo(uint32_t target) { if (_docs.empty() || _index >= _docs.size()) { _current_doc = TERMINATED; - return TERMINATED; + return false; } if (_current_doc >= target) { - return _current_doc; + return true; } auto it = std::lower_bound(_docs.begin() + _index, _docs.end(), target); if (it == _docs.end()) { _index = _docs.size(); _current_doc = TERMINATED; - return TERMINATED; + return false; } _index = static_cast(it - _docs.begin()); _current_doc = *it; + return true; + } + + uint32_t docFreq() const { return _size_hint_val; } + + // DocSet virtual interface (built on top of basic methods) + uint32_t advance() override { + next(); + return _current_doc; + } + + uint32_t seek(uint32_t target) override { + skipTo(target); return _current_doc; } uint32_t doc() const override { return _current_doc; } - uint32_t size_hint() const override { return _size_hint_val; } + uint32_t size_hint() const override { return docFreq(); } uint32_t norm() const override { return _norm_val; } + uint32_t freq() const override { + if (_current_doc == TERMINATED) { + return 0; + } + auto it = _doc_positions.find(_current_doc); + if (it != _doc_positions.end()) { + return static_cast(it->second.size()); + } + return 1; + } + + void append_positions_with_offset(uint32_t offset, std::vector& output) { + if (_current_doc == TERMINATED) { + return; + } + auto it = _doc_positions.find(_current_doc); + if (it != _doc_positions.end()) { + size_t prev_size = output.size(); + output.reserve(prev_size + it->second.size()); + for (uint32_t pos : 
it->second) { + output.push_back(offset + pos); + } + } + } + + void positions_with_offset(uint32_t offset, std::vector& output) { + output.clear(); + append_positions_with_offset(offset, output); + } + private: std::vector _docs; + std::map> _doc_positions; size_t _index = 0; uint32_t _current_doc = TERMINATED; uint32_t _size_hint_val = 0; uint32_t _norm_val = 1; }; - using MockDocSetPtr = std::shared_ptr; } // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection.cpp index b15c6ec7a1fb97..0198c80241ee0e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection.cpp @@ -147,29 +147,17 @@ Intersection::docset_mut_specialized(size_t ord) { } } -template class Intersection; -template class Intersection; - -// create -template std::enable_if_t< - std::is_same_v, - IntersectionPtr> -Intersection::create< - PositionPostingsWithOffsetPtr>(std::vector& docsets); - -template std::enable_if_t, - IntersectionPtr> -Intersection::create( - std::vector& docsets); - -// docset_mut_specialized -template std::enable_if_t< - std::is_same_v, - PositionPostingsWithOffsetPtr&> -Intersection::docset_mut_specialized< - PositionPostingsWithOffsetPtr>(size_t ord); - -template std::enable_if_t, MockDocSetPtr&> -Intersection::docset_mut_specialized(size_t ord); +#define INSTANTIATE_INTERSECTION(T) \ + template class Intersection; \ + template std::enable_if_t, IntersectionPtr> \ + Intersection::create(std::vector & docsets); \ + template std::enable_if_t, T&> \ + Intersection::docset_mut_specialized(size_t ord); + +INSTANTIATE_INTERSECTION(std::shared_ptr>) +INSTANTIATE_INTERSECTION(std::shared_ptr>) +INSTANTIATE_INTERSECTION(MockDocSetPtr) + +#undef INSTANTIATE_INTERSECTION } // namespace 
doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h new file mode 100644 index 00000000000000..bc0fda8b6f72f4 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h @@ -0,0 +1,77 @@ +// be/src/olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include + +#include "common/exception.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h" +#include "roaring/roaring.hh" + +namespace doris::segment_v2::inverted_index::query_v2 { + +template +class NullableScorer : public Scorer { +public: + NullableScorer(ScorerPtrT inner_scorer, std::shared_ptr null_bitmap) + : _inner_scorer(std::move(inner_scorer)), _null_bitmap(std::move(null_bitmap)) {} + ~NullableScorer() override = default; + + uint32_t advance() override { return _inner_scorer->advance(); } + uint32_t seek(uint32_t target) override { return _inner_scorer->seek(target); } + uint32_t doc() const override { return _inner_scorer->doc(); } + uint32_t size_hint() const override { return _inner_scorer->size_hint(); } + float score() override { return _inner_scorer->score(); } + + bool has_null_bitmap(const NullBitmapResolver* /*resolver*/ = nullptr) override { return true; } + + const roaring::Roaring* get_null_bitmap( + const NullBitmapResolver* /*resolver*/ = nullptr) override { + return _null_bitmap.get(); + } + +private: + ScorerPtrT _inner_scorer; + std::shared_ptr _null_bitmap; +}; +using NullableScorerPtr = std::shared_ptr>; + +template +inline ScorerPtr make_nullable_scorer(ScorerPtrT inner_scorer, const std::string& logical_field, + const NullBitmapResolver* resolver) { + if (!inner_scorer) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "make_nullable_scorer: inner_scorer must not be null"); + } + + auto null_bitmap = FieldNullBitmapFetcher::fetch(resolver, logical_field, inner_scorer.get()); + + if (!null_bitmap || null_bitmap->isEmpty()) { + return inner_scorer; + } + + return std::make_shared>(std::move(inner_scorer), + std::move(null_bitmap)); +} + +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_query.h new file mode 100644 index 00000000000000..ddbfd1ef0bbace --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_query.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "olap/rowset/segment_v2/index_query_context.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_weight.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/query.h" +#include "olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.h" + +namespace doris::segment_v2::inverted_index::query_v2 { + +class MultiPhraseQuery : public Query { +public: + MultiPhraseQuery(IndexQueryContextPtr context, std::wstring field, + std::vector term_infos) + : _context(std::move(context)), + _field(std::move(field)), + _term_infos(std::move(term_infos)) {} + ~MultiPhraseQuery() override = default; + + WeightPtr weight(bool enable_scoring) override { + if (_term_infos.size() < 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "Multi-phrase query requires at least 2 terms, got {}", + _term_infos.size()); + } + + SimilarityPtr bm25_similarity; + if (enable_scoring) { + bm25_similarity = std::make_shared(); + std::vector all_terms; + for (const auto& term_info : _term_infos) { + if (term_info.is_single_term()) { + all_terms.push_back(StringHelper::to_wstring(term_info.get_single_term())); + } else { + for (const auto& term : term_info.get_multi_terms()) { + all_terms.push_back(StringHelper::to_wstring(term)); + } + } + } + bm25_similarity->for_terms(_context, _field, all_terms); + } + + return std::make_shared(_context, _field, _term_infos, bm25_similarity, + enable_scoring, _nullable); + } + +private: + IndexQueryContextPtr _context; + + std::wstring _field; + std::vector _term_infos; + bool _nullable = true; +}; + +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_weight.h new file mode 100644 index 00000000000000..981ab0148be77c --- /dev/null +++ 
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_weight.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/rowset/segment_v2/index_query_context.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h" +#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" + +namespace doris::segment_v2::inverted_index::query_v2 { + +constexpr uint32_t SPARSE_TERM_DOC_THRESHOLD = 100; + +class MultiPhraseWeight : public Weight { +public: + MultiPhraseWeight(IndexQueryContextPtr context, std::wstring field, + std::vector term_infos, SimilarityPtr similarity, + bool enable_scoring, bool nullable) + : _context(std::move(context)), + _field(std::move(field)), + _term_infos(std::move(term_infos)), + 
_similarity(std::move(similarity)), + _enable_scoring(enable_scoring), + _nullable(nullable) {} + ~MultiPhraseWeight() override = default; + + ScorerPtr scorer(const QueryExecutionContext& ctx, const std::string& binding_key) override { + auto scorer = phrase_scorer(ctx, binding_key); + if (_nullable) { + auto logical_field = logical_field_or_fallback(ctx, binding_key, _field); + return make_nullable_scorer(scorer, logical_field, ctx.null_resolver); + } + return scorer; + } + +private: + ScorerPtr phrase_scorer(const QueryExecutionContext& ctx, const std::string& binding_key) { + auto reader = lookup_reader(_field, ctx, binding_key); + if (!reader) { + throw Exception(ErrorCode::NOT_FOUND, "Reader not found for field '{}'", + StringHelper::to_string(_field)); + } + + std::vector> term_postings_list; + for (const auto& term_info : _term_infos) { + size_t offset = term_info.position; + if (term_info.is_single_term()) { + auto posting = + create_position_posting(reader.get(), _field, term_info.get_single_term(), + _enable_scoring, _context->io_ctx); + if (posting) { + if (posting->size_hint() <= SPARSE_TERM_DOC_THRESHOLD) { /* sparse term: preload docs and positions into memory, same rule as the multi-term branch below */ + auto loaded_posting = LoadedPostings::load(*posting); + term_postings_list.emplace_back(offset, std::move(loaded_posting)); + } else { + term_postings_list.emplace_back(offset, std::move(posting)); + } + } else { + return std::make_shared(); + } + } else { + const auto& terms = term_info.get_multi_terms(); + std::vector postings; + for (const auto& term : terms) { + auto posting = create_position_posting(reader.get(), _field, term, + _enable_scoring, _context->io_ctx); + if (posting) { + if (posting->size_hint() <= SPARSE_TERM_DOC_THRESHOLD) { + postings.push_back(LoadedPostings::load(*posting)); + } else { + postings.push_back(posting); + } + } + } + if (postings.empty()) { + return std::make_shared(); + } + auto union_posting = SimpleUnion::create(std::move(postings)); + term_postings_list.emplace_back(offset, std::move(union_posting)); + } + }
+ return PhraseScorer::create(term_postings_list, _similarity, 0); + } + + IndexQueryContextPtr _context; + + std::wstring _field; + std::vector _term_infos; + SimilarityPtr _similarity; + bool _enable_scoring = false; + bool _nullable = true; +}; + +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h index f46ac1793b758a..521f4112eb7dab 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h @@ -26,29 +26,43 @@ namespace doris::segment_v2::inverted_index::query_v2 { class PhraseQuery : public Query { public: - PhraseQuery(IndexQueryContextPtr context, std::wstring field, std::vector terms) - : _context(std::move(context)), _field(std::move(field)), _terms(std::move(terms)) {} + PhraseQuery(IndexQueryContextPtr context, std::wstring field, std::vector term_infos) + : _context(std::move(context)), + _field(std::move(field)), + _term_infos(std::move(term_infos)) {} ~PhraseQuery() override = default; WeightPtr weight(bool enable_scoring) override { - if (_terms.size() < 2) { - throw Exception(ErrorCode::INVALID_ARGUMENT, "Phrase query requires at least 2 terms"); + if (_term_infos.size() < 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "Phrase query requires at least 2 terms, got {}", _term_infos.size()); } SimilarityPtr bm25_similarity; if (enable_scoring) { bm25_similarity = std::make_shared(); - bm25_similarity->for_terms(_context, _field, _terms); + std::vector all_terms; + for (const auto& term_info : _term_infos) { + if (term_info.is_single_term()) { + all_terms.push_back(StringHelper::to_wstring(term_info.get_single_term())); + } else { + for (const auto& term : term_info.get_multi_terms()) { + 
all_terms.push_back(StringHelper::to_wstring(term)); + } + } + } + bm25_similarity->for_terms(_context, _field, all_terms); } - return std::make_shared(_context, _field, _terms, bm25_similarity, - enable_scoring); + return std::make_shared(_context, _field, _term_infos, bm25_similarity, + enable_scoring, _nullable); } private: IndexQueryContextPtr _context; std::wstring _field; - std::vector _terms; + std::vector _term_infos; + bool _nullable = true; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp index d1f766a665de7d..2aa64fff95c111 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp @@ -183,6 +183,7 @@ bool PhraseScorer::intersection_exists(const std::vector& l return false; } -template class PhraseScorer; +template class PhraseScorer; +template class PhraseScorer; } // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h index 339b4fdc99f766..0a12fd117d3a16 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h @@ -18,9 +18,7 @@ #pragma once #include "olap/rowset/segment_v2/index_query_context.h" -#include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_scorer.h" -#include "olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h" -#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h" +#include 
"olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h" @@ -28,41 +26,27 @@ namespace doris::segment_v2::inverted_index::query_v2 { +constexpr uint32_t LOADED_POSTINGS_DOC_FREQ_THRESHOLD = 100; + class PhraseWeight : public Weight { public: - PhraseWeight(IndexQueryContextPtr context, std::wstring field, std::vector terms, - SimilarityPtr similarity, bool enable_scoring) + PhraseWeight(IndexQueryContextPtr context, std::wstring field, std::vector term_infos, + SimilarityPtr similarity, bool enable_scoring, bool nullable) : _context(std::move(context)), _field(std::move(field)), - _terms(std::move(terms)), + _term_infos(std::move(term_infos)), _similarity(std::move(similarity)), - _enable_scoring(enable_scoring) {} + _enable_scoring(enable_scoring), + _nullable(nullable) {} ~PhraseWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& ctx, const std::string& binding_key) override { - auto phrase = phrase_scorer(ctx, binding_key); - auto logical_field = logical_field_or_fallback(ctx, binding_key, _field); - auto null_bitmap = FieldNullBitmapFetcher::fetch(ctx, logical_field); - - auto doc_bitset = std::make_shared(); - if (phrase) { - uint32_t doc = phrase->doc(); - if (doc == TERMINATED) { - doc = phrase->advance(); - } - while (doc != TERMINATED) { - doc_bitset->add(doc); - doc = phrase->advance(); - } - } - - auto bit_set = - std::make_shared(std::move(doc_bitset), std::move(null_bitmap)); - if (!phrase) { - return bit_set; + auto scorer = phrase_scorer(ctx, binding_key); + if (_nullable) { + auto logical_field = logical_field_or_fallback(ctx, binding_key, _field); + return make_nullable_scorer(scorer, logical_field, ctx.null_resolver); } - // Wrap with const score for consistency with other non-scoring paths - 
return std::make_shared>(std::move(bit_set)); + return scorer; } private: @@ -73,29 +57,28 @@ class PhraseWeight : public Weight { StringHelper::to_string(_field)); } - std::vector> term_postings_list; - for (size_t offset = 0; offset < _terms.size(); ++offset) { - const auto& term = _terms[offset]; - auto t = make_term_ptr(_field.c_str(), term.c_str()); - auto iter = make_term_positions_ptr(reader.get(), t.get(), _enable_scoring, - _context->io_ctx); - if (iter) { - auto segment_postings = - std::make_shared>(std::move(iter)); - term_postings_list.emplace_back(offset, std::move(segment_postings)); + std::vector> term_postings_list; + for (const auto& term_info : _term_infos) { + size_t offset = term_info.position; + auto posting = + create_position_posting(reader.get(), _field, term_info.get_single_term(), + _enable_scoring, _context->io_ctx); + if (posting) { + term_postings_list.emplace_back(offset, std::move(posting)); } else { - return nullptr; + return std::make_shared(); } } - return PhraseScorer::create(term_postings_list, _similarity, 0); + return PhraseScorer::create(term_postings_list, _similarity, 0); } IndexQueryContextPtr _context; std::wstring _field; - std::vector _terms; + std::vector _term_infos; SimilarityPtr _similarity; bool _enable_scoring = false; + bool _nullable = true; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/postings_with_offset.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/postings_with_offset.h index 0302bc36081c8e..48ff4244c70544 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/postings_with_offset.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/postings_with_offset.h @@ -19,7 +19,6 @@ #include "olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" -#include 
"olap/rowset/segment_v2/inverted_index_common.h" namespace doris::segment_v2::inverted_index::query_v2 { @@ -48,7 +47,4 @@ class PostingsWithOffset : public DocSet { template using PostingsWithOffsetPtr = std::shared_ptr>; -using PositionPostings = std::shared_ptr>; -using PositionPostingsWithOffsetPtr = std::shared_ptr>; - } // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.cpp new file mode 100644 index 00000000000000..1639c4e0f4ec8a --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.cpp @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.h" + +#include + +namespace doris::segment_v2::inverted_index::query_v2 { + +LoadedPostings::LoadedPostings(std::vector doc_ids, + std::vector> positions) + : _doc_ids(std::move(doc_ids)) { + if (_doc_ids.empty()) { + _cursor = 0; + return; + } + + size_t total_positions = 0; + for (const auto& pos : positions) { + total_positions += pos.size(); + } + + _positions.reserve(total_positions); + _position_offsets.reserve(positions.size() + 1); + + for (const auto& pos : positions) { + _position_offsets.push_back(static_cast(_positions.size())); + _positions.insert(_positions.end(), pos.begin(), pos.end()); + } + _position_offsets.push_back(static_cast(_positions.size())); +} + +template +LoadedPostingsPtr LoadedPostings::load(TPostings& segment_postings) { + auto loaded = std::make_shared(); + + uint32_t num_docs = segment_postings.size_hint(); + loaded->_doc_ids.reserve(num_docs); + loaded->_position_offsets.reserve(num_docs + 1); + + while (segment_postings.doc() != TERMINATED) { + loaded->_position_offsets.push_back(static_cast(loaded->_positions.size())); + loaded->_doc_ids.push_back(segment_postings.doc()); + segment_postings.append_positions_with_offset(0, loaded->_positions); + segment_postings.advance(); + } + loaded->_position_offsets.push_back(static_cast(loaded->_positions.size())); + loaded->_cursor = 0; + + return loaded; +} + +uint32_t LoadedPostings::advance() { + ++_cursor; + if (_cursor >= _doc_ids.size()) { + _cursor = _doc_ids.size(); + return TERMINATED; + } + return doc(); +} + +uint32_t LoadedPostings::seek(uint32_t target) { + if (_doc_ids.empty() || _cursor >= _doc_ids.size()) { + _cursor = _doc_ids.size(); + return TERMINATED; + } + + if (_doc_ids[_cursor] >= target) { + return _doc_ids[_cursor]; + } + + auto it = std::lower_bound(_doc_ids.begin() + _cursor, _doc_ids.end(), target); + if (it == _doc_ids.end()) { + _cursor = _doc_ids.size(); + return TERMINATED; 
+ } + + _cursor = static_cast(it - _doc_ids.begin()); + return *it; +} + +uint32_t LoadedPostings::doc() const { + if (_cursor >= _doc_ids.size()) { + return TERMINATED; + } + return _doc_ids[_cursor]; +} + +uint32_t LoadedPostings::size_hint() const { + return static_cast(_doc_ids.size()); +} + +uint32_t LoadedPostings::freq() const { + if (_cursor >= _doc_ids.size()) { + return 0; + } + uint32_t start = _position_offsets[_cursor]; + uint32_t end = _position_offsets[_cursor + 1]; + return end - start; +} + +uint32_t LoadedPostings::norm() const { + return 1; +} + +void LoadedPostings::append_positions_with_offset(uint32_t offset, std::vector& output) { + if (_cursor >= _doc_ids.size()) { + return; + } + + size_t start = _position_offsets[_cursor]; + size_t end = _position_offsets[_cursor + 1]; + + for (size_t i = start; i < end; ++i) { + output.push_back(_positions[i] + offset); + } +} + +template LoadedPostingsPtr LoadedPostings::load>( + SegmentPostings& segment_postings); + +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.h new file mode 100644 index 00000000000000..09820ead5d4f76 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" + +namespace doris::segment_v2::inverted_index::query_v2 { + +class LoadedPostings; +using LoadedPostingsPtr = std::shared_ptr; + +class LoadedPostings final : public Postings { +public: + LoadedPostings() = default; + LoadedPostings(std::vector doc_ids, std::vector> positions); + ~LoadedPostings() override = default; + + template + static LoadedPostingsPtr load(TPostings& segment_postings); + + uint32_t advance() override; + uint32_t seek(uint32_t target) override; + uint32_t doc() const override; + uint32_t size_hint() const override; + uint32_t freq() const override; + uint32_t norm() const override; + + void append_positions_with_offset(uint32_t offset, std::vector& output) override; + +private: + std::vector _doc_ids; + std::vector _position_offsets; + std::vector _positions; + size_t _cursor = 0; +}; + +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h index cce83b6e1e77e7..e5075511c675cb 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h @@ -33,7 +33,7 @@ class RegexpQuery : public Query { WeightPtr weight(bool enable_scoring) override { return std::make_shared(std::move(_context), 
std::move(_field), - std::move(_pattern), enable_scoring); + std::move(_pattern), enable_scoring, _nullable); } private: @@ -41,6 +41,7 @@ class RegexpQuery : public Query { std::wstring _field; std::string _pattern; + bool _nullable = true; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp index f70b5be77c4734..456cb702d3233a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp @@ -26,29 +26,36 @@ #include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h" -#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" #include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" -#include "olap/rowset/segment_v2/inverted_index_iterator.h" CL_NS_USE(index) namespace doris::segment_v2::inverted_index::query_v2 { RegexpWeight::RegexpWeight(IndexQueryContextPtr context, std::wstring field, std::string pattern, - bool enable_scoring) + bool enable_scoring, bool nullable) : _context(std::move(context)), _field(std::move(field)), _pattern(std::move(pattern)), - _enable_scoring(enable_scoring) { + _enable_scoring(enable_scoring), + _nullable(nullable) { // _max_expansions = _context->runtime_state->query_options().inverted_index_max_expansions; } ScorerPtr RegexpWeight::scorer(const QueryExecutionContext& context, const std::string& binding_key) { - auto logical_field = logical_field_or_fallback(context, binding_key, _field); - VLOG_DEBUG << 
"RegexpWeight::scorer() called - pattern=" << _pattern << ", logical_field='" - << logical_field << "'"; + auto scorer = regexp_scorer(context, binding_key); + if (_nullable) { + auto logical_field = logical_field_or_fallback(context, binding_key, _field); + return make_nullable_scorer(scorer, logical_field, context.null_resolver); + } + return scorer; +} + +ScorerPtr RegexpWeight::regexp_scorer(const QueryExecutionContext& context, + const std::string& binding_key) { auto prefix = get_regex_prefix(_pattern); hs_database_t* database = nullptr; @@ -81,10 +88,7 @@ ScorerPtr RegexpWeight::scorer(const QueryExecutionContext& context, hs_free_database(database); if (matching_terms.empty()) { - // Even when there are no matching terms, we must honor NULL semantics for the field. - auto empty_true = std::make_shared(); - auto null_bitmap = FieldNullBitmapFetcher::fetch(context, logical_field); - return std::make_shared(std::move(empty_true), std::move(null_bitmap)); + return std::make_shared(); } auto doc_bitset = std::make_shared(); @@ -101,8 +105,7 @@ ScorerPtr RegexpWeight::scorer(const QueryExecutionContext& context, } } - auto null_bitmap = FieldNullBitmapFetcher::fetch(context, logical_field); - auto bit_set = std::make_shared(doc_bitset, null_bitmap); + auto bit_set = std::make_shared(doc_bitset); auto const_score = std::make_shared>(std::move(bit_set)); return const_score; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h index d85d42a6d298e0..b58d124ed112ae 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h @@ -28,12 +28,14 @@ namespace doris::segment_v2::inverted_index::query_v2 { class RegexpWeight : public Weight { public: RegexpWeight(IndexQueryContextPtr context, std::wstring field, std::string 
pattern, - bool enable_scoring); + bool enable_scoring, bool nullable); ~RegexpWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& context, const std::string& binding_key) override; private: + ScorerPtr regexp_scorer(const QueryExecutionContext& context, const std::string& binding_key); + std::optional get_regex_prefix(const std::string& pattern); void collect_matching_terms(const QueryExecutionContext& context, const std::string& binding_key, std::vector& terms, @@ -45,6 +47,7 @@ class RegexpWeight : public Weight { std::wstring _field; std::string _pattern; bool _enable_scoring = false; + bool _nullable = true; int32_t _max_expansions = 50; }; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/scorer.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/scorer.h index c5e4ea6319bab5..794acde282a75e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/scorer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/scorer.h @@ -20,6 +20,8 @@ #include #include +#include "common/exception.h" +#include "common/status.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h" namespace roaring { @@ -49,6 +51,7 @@ class Scorer : public DocSet { virtual float score() = 0; virtual bool has_null_bitmap(const NullBitmapResolver* /*resolver*/ = nullptr) { return false; } + virtual const roaring::Roaring* get_null_bitmap( const NullBitmapResolver* /*resolver*/ = nullptr) { return nullptr; @@ -67,14 +70,6 @@ class EmptyScorer : public Scorer { uint32_t size_hint() const override { return 0; } float score() override { return 0.0F; } - - bool has_null_bitmap(const NullBitmapResolver* /*resolver*/ = nullptr) override { - return false; - } - const roaring::Roaring* get_null_bitmap( - const NullBitmapResolver* /*resolver*/ = nullptr) override { - return nullptr; - } }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h index 2f65bdf1cd1d7d..91083fbaf24075 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h @@ -18,14 +18,29 @@ #pragma once #include "olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h" +#include "olap/rowset/segment_v2/inverted_index_common.h" namespace doris::segment_v2::inverted_index::query_v2 { -template -class SegmentPostingsBase : public DocSet { +class Postings : public DocSet { +public: + Postings() = default; + ~Postings() override = default; + + virtual void positions_with_offset(uint32_t offset, std::vector& output) { + output.clear(); + append_positions_with_offset(offset, output); + } + + virtual void append_positions_with_offset(uint32_t offset, std::vector& output) = 0; +}; +using PostingsPtr = std::shared_ptr; + +template +class SegmentPostingsBase : public Postings { public: SegmentPostingsBase() = default; - SegmentPostingsBase(TermIterator iter) : _iter(std::move(iter)) { + SegmentPostingsBase(TCLuceneIter iter) : _iter(std::move(iter)) { if (_iter->next()) { int32_t d = _iter->doc(); _doc = d >= INT_MAX ? 
TERMINATED : d; @@ -52,38 +67,40 @@ class SegmentPostingsBase : public DocSet { } uint32_t doc() const override { return _doc; } - uint32_t size_hint() const override { return _iter->docFreq(); } - uint32_t freq() const override { return _iter->freq(); } - uint32_t norm() const override { return _iter->norm(); } + void append_positions_with_offset(uint32_t offset, std::vector& output) override { + throw Exception(doris::ErrorCode::NOT_IMPLEMENTED_ERROR, + "This posting type does not support position information"); + } + protected: - TermIterator _iter; + TCLuceneIter _iter; private: uint32_t _doc = TERMINATED; }; +template +using SegmentPostingsBasePtr = std::shared_ptr>; -template -class SegmentPostings final : public SegmentPostingsBase { +template +class SegmentPostings final : public SegmentPostingsBase { public: - SegmentPostings(TermIterator iter) : SegmentPostingsBase(std::move(iter)) {} - - void positions_with_offset(uint32_t offset, std::vector& output) { - output.clear(); - append_positions_with_offset(offset, output); - } + SegmentPostings(TCLuceneIter iter) : SegmentPostingsBase(std::move(iter)) {} +}; +using TermPostingsPtr = std::shared_ptr>; +using PositionPostingsPtr = std::shared_ptr>; - void append_positions_with_offset(uint32_t offset, std::vector& output) { - static_assert( - requires(TermIterator it) { - it->freq(); - it->nextPosition(); - }, "TermIterator must expose freq() and nextPosition()"); +template <> +class SegmentPostings final : public SegmentPostingsBase { +public: + SegmentPostings(TermPositionsPtr iter) + : SegmentPostingsBase(std::move(iter)) {} - auto freq = this->_iter->freq(); + void append_positions_with_offset(uint32_t offset, std::vector& output) override { + auto freq = this->freq(); size_t prev_len = output.size(); output.resize(prev_len + freq); for (int32_t i = 0; i < freq; ++i) { @@ -93,17 +110,17 @@ class SegmentPostings final : public SegmentPostingsBase { } }; -template -class NoScoreSegmentPosting final : public 
SegmentPostingsBase { +template +class NoScoreSegmentPosting final : public SegmentPostingsBase { public: - NoScoreSegmentPosting(TermIterator iter) : SegmentPostingsBase(std::move(iter)) {} + NoScoreSegmentPosting(TCLuceneIter iter) : SegmentPostingsBase(std::move(iter)) {} uint32_t freq() const override { return 1; } uint32_t norm() const override { return 1; } }; -template -class EmptySegmentPosting final : public SegmentPostingsBase { +template +class EmptySegmentPosting final : public SegmentPostingsBase { public: EmptySegmentPosting() = default; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.cpp new file mode 100644 index 00000000000000..e997e262a5828e --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.cpp @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.h" + +#include + +#include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" + +namespace doris::segment_v2::inverted_index::query_v2 { + +template +SimpleUnionPtr SimpleUnion::create(std::vector docsets) { + return std::make_shared>(std::move(docsets)); +} + +template +SimpleUnion::SimpleUnion(std::vector docsets) + : _docsets(std::move(docsets)), _doc(0) { + _docsets.erase( + std::remove_if(_docsets.begin(), _docsets.end(), + [](const TDocSet& docset) { return docset->doc() == TERMINATED; }), + _docsets.end()); + + initialize_first_doc_id(); +} + +template +void SimpleUnion::initialize_first_doc_id() { + uint32_t next_doc = TERMINATED; + + for (const auto& docset : _docsets) { + next_doc = std::min(next_doc, docset->doc()); + } + _doc = next_doc; +} + +template +uint32_t SimpleUnion::advance_to_next() { + uint32_t next_doc = TERMINATED; + + for (auto& docset : _docsets) { + if (docset->doc() <= _doc) { + docset->advance(); + } + next_doc = std::min(next_doc, docset->doc()); + } + _doc = next_doc; + return _doc; +} + +template +uint32_t SimpleUnion::advance() { + advance_to_next(); + return _doc; +} + +template +uint32_t SimpleUnion::seek(uint32_t target) { + _doc = TERMINATED; + for (auto& docset : _docsets) { + if (docset->doc() < target) { + docset->seek(target); + } + if (docset->doc() < _doc) { + _doc = docset->doc(); + } + } + return _doc; +} + +template +uint32_t SimpleUnion::doc() const { + return _doc; +} + +template +uint32_t SimpleUnion::size_hint() const { + uint32_t max_hint = 0; + for (const auto& docset : _docsets) { + max_hint = std::max(max_hint, docset->size_hint()); + } + return max_hint; +} + +template +uint32_t SimpleUnion::freq() const { + uint32_t total_freq = 0; + for (const auto& docset : _docsets) { + if (docset->doc() == _doc) { + total_freq += docset->freq(); + } + } + return total_freq; +} + +template +uint32_t SimpleUnion::norm() const { + 
for (const auto& docset : _docsets) { + if (docset->doc() == _doc) { + return docset->norm(); + } + } + return 1; +} + +template +void SimpleUnion::append_positions_with_offset(uint32_t offset, + std::vector& output) { + size_t initial_size = output.size(); + + for (auto& docset : _docsets) { + if (docset->doc() == _doc) { + docset->append_positions_with_offset(offset, output); + } + } + + if (output.size() > initial_size) { + std::sort(output.begin() + initial_size, output.end()); + auto last = std::unique(output.begin() + initial_size, output.end()); + output.erase(last, output.end()); + } +} + +template class SimpleUnion; +template class SimpleUnion; +template class SimpleUnion; + +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.h new file mode 100644 index 00000000000000..362a07674bf6e8 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" + +namespace doris::segment_v2::inverted_index::query_v2 { + +template +class SimpleUnion; + +template +using SimpleUnionPtr = std::shared_ptr>; + +template +class SimpleUnion final : public Postings { +public: + explicit SimpleUnion(std::vector docsets); + ~SimpleUnion() override = default; + + static SimpleUnionPtr create(std::vector docsets); + + uint32_t advance() override; + uint32_t seek(uint32_t target) override; + uint32_t doc() const override; + uint32_t size_hint() const override; + uint32_t freq() const override; + uint32_t norm() const override; + + void append_positions_with_offset(uint32_t offset, std::vector& output) override; + + size_t num_docsets() const { return _docsets.size(); } + +private: + void initialize_first_doc_id(); + uint32_t advance_to_next(); + + std::vector _docsets; + uint32_t _doc; +}; + +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h index 17d8d11cbc7def..ffd0bebb01b213 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h @@ -23,11 +23,17 @@ #include #include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" +#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" namespace lucene::index { class IndexReader; } +namespace doris::io { +struct IOContext; +} // namespace doris::io + namespace doris::segment_v2::inverted_index::query_v2 { struct FieldBindingContext { @@ -100,6 +106,31 @@ class Weight { } return nullptr; } + + TermPostingsPtr create_term_posting(lucene::index::IndexReader* reader, + const std::wstring& field, const std::string& term, + 
bool enable_scoring, const io::IOContext* io_ctx) const { + auto term_wstr = StringHelper::to_wstring(term); + auto t = make_term_ptr(field.c_str(), term_wstr.c_str()); + auto iter = make_term_doc_ptr(reader, t.get(), enable_scoring, io_ctx); + if (iter) { + return std::make_shared>(std::move(iter)); + } + return nullptr; + } + + PositionPostingsPtr create_position_posting(lucene::index::IndexReader* reader, + const std::wstring& field, const std::string& term, + bool enable_scoring, + const io::IOContext* io_ctx) const { + auto term_wstr = StringHelper::to_wstring(term); + auto t = make_term_ptr(field.c_str(), term_wstr.c_str()); + auto iter = make_term_positions_ptr(reader, t.get(), enable_scoring, io_ctx); + if (iter) { + return std::make_shared>(std::move(iter)); + } + return nullptr; + } }; using WeightPtr = std::shared_ptr; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h index 8b71ab9c0d4327..f5ad988c6ede19 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h @@ -35,7 +35,7 @@ class WildcardQuery : public Query { WeightPtr weight(bool enable_scoring) override { return std::make_shared(std::move(_context), std::move(_field), - std::move(_pattern), enable_scoring); + std::move(_pattern), enable_scoring, _nullable); } private: @@ -43,6 +43,7 @@ class WildcardQuery : public Query { std::wstring _field; std::string _pattern; + bool _nullable = true; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h index b906605db296b7..22de6e3b16dd6f 100644 --- 
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h @@ -29,18 +29,19 @@ namespace doris::segment_v2::inverted_index::query_v2 { class WildcardWeight : public Weight { public: WildcardWeight(IndexQueryContextPtr context, std::wstring field, std::string pattern, - bool enable_scoring) + bool enable_scoring, bool nullable) : _context(std::move(context)), _field(std::move(field)), _pattern(std::move(pattern)), - _enable_scoring(enable_scoring) {} + _enable_scoring(enable_scoring), + _nullable(nullable) {} ~WildcardWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& ctx, const std::string& binding_key) override { std::string regex_pattern = wildcard_to_regex(_pattern); auto regexp_weight = std::make_shared( - _context, std::move(_field), std::move(regex_pattern), _enable_scoring); + _context, std::move(_field), std::move(regex_pattern), _enable_scoring, _nullable); return regexp_weight->scorer(ctx, binding_key); } @@ -60,6 +61,7 @@ class WildcardWeight : public Weight { std::wstring _field; std::string _pattern; bool _enable_scoring = false; + bool _nullable = true; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/vec/functions/function_search.cpp b/be/src/vec/functions/function_search.cpp index 95e0f868a486b7..0c5a3dab1a1b5b 100644 --- a/be/src/vec/functions/function_search.cpp +++ b/be/src/vec/functions/function_search.cpp @@ -35,9 +35,11 @@ #include "olap/rowset/segment_v2/index_file_reader.h" #include "olap/rowset/segment_v2/index_query_context.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h" +#include "olap/rowset/segment_v2/inverted_index/query/query_helper.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_query.h" #include 
"olap/rowset/segment_v2/inverted_index/query_v2/operator.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h" @@ -564,21 +566,32 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, << "', returning empty BitSetQuery"; *out = std::make_shared(roaring::Roaring()); return Status::OK(); + } else if (term_infos.size() == 1) { + if (term_infos.size() == 1) { + const auto& term_info = term_infos[0]; + if (term_info.is_single_term()) { + std::wstring term_wstr = + StringHelper::to_wstring(term_info.get_single_term()); + *out = std::make_shared(context, field_wstr, + term_wstr); + } else { + query_v2::BooleanQuery::Builder builder(query_v2::OperatorType::OP_OR); + for (const auto& term : term_info.get_multi_terms()) { + std::wstring term_wstr = StringHelper::to_wstring(term); + builder.add(make_term_query(term_wstr), binding.binding_key); + } + *out = builder.build(); + } + } + } else { + if (QueryHelper::is_simple_phrase(term_infos)) { + *out = std::make_shared(context, field_wstr, term_infos); + } else { + *out = std::make_shared(context, field_wstr, + term_infos); + } } - if (term_infos.size() == 1) { - std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); - *out = make_term_query(term_wstr); - return Status::OK(); - } - - std::vector terms; - for (const auto& term_info : term_infos) { - terms.push_back(StringHelper::to_wstring(term_info.get_single_term())); - } - *out = std::make_shared(context, field_wstr, terms); - VLOG_DEBUG << "search: Built PhraseQuery for field=" << field_name << " with " - << terms.size() << " terms"; return Status::OK(); } if (clause_type == "MATCH") { diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/doc_set_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/doc_set_test.cpp index 143f6232b0f31a..8f45e876ff1532 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/doc_set_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/doc_set_test.cpp @@ -19,222 +19,519 @@ #include -#include "common/exception.h" - -namespace doris { - -using namespace segment_v2::inverted_index::query_v2; - -class DocSetTest : public testing::Test { -public: - void SetUp() override {} - void TearDown() override {} -}; - -TEST_F(DocSetTest, TerminatorConstant) { - EXPECT_EQ(TERMINATED, static_cast(INT_MAX)); -} - -TEST_F(DocSetTest, AdvanceNotImplemented) { - DocSet ds; - try { - (void)ds.advance(); - FAIL() << "Expected doris::Exception for NOT_IMPLEMENTED_ERROR"; - } catch (const Exception& e) { - EXPECT_EQ(e.code(), ErrorCode::NOT_IMPLEMENTED_ERROR); - } catch (...) { - FAIL() << "Expected doris::Exception"; - } -} - -TEST_F(DocSetTest, SeekNotImplemented) { - DocSet ds; - try { - (void)ds.seek(10); - FAIL() << "Expected doris::Exception for NOT_IMPLEMENTED_ERROR"; - } catch (const Exception& e) { - EXPECT_EQ(e.code(), ErrorCode::NOT_IMPLEMENTED_ERROR); - } catch (...) { - FAIL() << "Expected doris::Exception"; - } -} - -TEST_F(DocSetTest, DocNotImplemented) { - DocSet ds; - try { - (void)ds.doc(); - FAIL() << "Expected doris::Exception for NOT_IMPLEMENTED_ERROR"; - } catch (const Exception& e) { - EXPECT_EQ(e.code(), ErrorCode::NOT_IMPLEMENTED_ERROR); - } catch (...) { - FAIL() << "Expected doris::Exception"; - } -} - -TEST_F(DocSetTest, SizeHintNotImplemented) { - DocSet ds; - try { - (void)ds.size_hint(); - FAIL() << "Expected doris::Exception for NOT_IMPLEMENTED_ERROR"; - } catch (const Exception& e) { - EXPECT_EQ(e.code(), ErrorCode::NOT_IMPLEMENTED_ERROR); - } catch (...) 
{ - FAIL() << "Expected doris::Exception"; - } -} - -TEST_F(DocSetTest, FreqNotImplemented) { - DocSet ds; - try { - (void)ds.freq(); - FAIL() << "Expected doris::Exception for NOT_IMPLEMENTED_ERROR"; - } catch (const Exception& e) { - EXPECT_EQ(e.code(), ErrorCode::NOT_IMPLEMENTED_ERROR); - } catch (...) { - FAIL() << "Expected doris::Exception"; - } -} - -TEST_F(DocSetTest, NormNotImplemented) { - DocSet ds; - try { - (void)ds.norm(); - FAIL() << "Expected doris::Exception for NOT_IMPLEMENTED_ERROR"; - } catch (const Exception& e) { - EXPECT_EQ(e.code(), ErrorCode::NOT_IMPLEMENTED_ERROR); - } catch (...) { - FAIL() << "Expected doris::Exception"; - } -} - -// MockDocSet tests -TEST_F(DocSetTest, MockDocSetEmptyDocs) { - MockDocSet ds({}); - EXPECT_EQ(ds.doc(), TERMINATED); - EXPECT_EQ(ds.advance(), TERMINATED); - EXPECT_EQ(ds.seek(10), TERMINATED); - EXPECT_EQ(ds.size_hint(), 0); - EXPECT_EQ(ds.norm(), 1); -} - -TEST_F(DocSetTest, MockDocSetSingleDoc) { - MockDocSet ds({5}); - EXPECT_EQ(ds.doc(), 5); - EXPECT_EQ(ds.size_hint(), 1); - EXPECT_EQ(ds.norm(), 1); - EXPECT_EQ(ds.advance(), TERMINATED); - EXPECT_EQ(ds.doc(), TERMINATED); -} - -TEST_F(DocSetTest, MockDocSetAdvance) { - MockDocSet ds({1, 5, 10, 15, 20}); - EXPECT_EQ(ds.doc(), 1); - EXPECT_EQ(ds.advance(), 5); - EXPECT_EQ(ds.doc(), 5); - EXPECT_EQ(ds.advance(), 10); - EXPECT_EQ(ds.advance(), 15); - EXPECT_EQ(ds.advance(), 20); - EXPECT_EQ(ds.advance(), TERMINATED); - EXPECT_EQ(ds.doc(), TERMINATED); -} - -TEST_F(DocSetTest, MockDocSetSeekExactMatch) { - MockDocSet ds({1, 5, 10, 15, 20}); - EXPECT_EQ(ds.doc(), 1); - EXPECT_EQ(ds.seek(10), 10); - EXPECT_EQ(ds.doc(), 10); - EXPECT_EQ(ds.seek(20), 20); - EXPECT_EQ(ds.doc(), 20); -} - -TEST_F(DocSetTest, MockDocSetSeekNextHigher) { - MockDocSet ds({1, 5, 10, 15, 20}); - EXPECT_EQ(ds.doc(), 1); - EXPECT_EQ(ds.seek(7), 10); - EXPECT_EQ(ds.doc(), 10); - EXPECT_EQ(ds.seek(12), 15); - EXPECT_EQ(ds.doc(), 15); -} - -TEST_F(DocSetTest, MockDocSetSeekBeyondLast) { - 
MockDocSet ds({1, 5, 10, 15, 20}); - EXPECT_EQ(ds.seek(25), TERMINATED); - EXPECT_EQ(ds.doc(), TERMINATED); -} - -TEST_F(DocSetTest, MockDocSetSeekCurrentDoc) { - MockDocSet ds({1, 5, 10, 15, 20}); - EXPECT_EQ(ds.doc(), 1); - EXPECT_EQ(ds.seek(1), 1); - EXPECT_EQ(ds.doc(), 1); - EXPECT_EQ(ds.advance(), 5); - EXPECT_EQ(ds.seek(5), 5); - EXPECT_EQ(ds.doc(), 5); -} - -TEST_F(DocSetTest, MockDocSetSeekBeforeCurrent) { - MockDocSet ds({1, 5, 10, 15, 20}); - EXPECT_EQ(ds.advance(), 5); - EXPECT_EQ(ds.doc(), 5); - // Seeking to a value less than current should return current - EXPECT_EQ(ds.seek(3), 5); - EXPECT_EQ(ds.doc(), 5); -} - -TEST_F(DocSetTest, MockDocSetUnsortedInput) { - // MockDocSet should sort the input - MockDocSet ds({20, 5, 15, 1, 10}); - EXPECT_EQ(ds.doc(), 1); - EXPECT_EQ(ds.advance(), 5); - EXPECT_EQ(ds.advance(), 10); - EXPECT_EQ(ds.advance(), 15); - EXPECT_EQ(ds.advance(), 20); - EXPECT_EQ(ds.advance(), TERMINATED); -} +#include +#include +#include -TEST_F(DocSetTest, MockDocSetCustomSizeHint) { - MockDocSet ds({1, 2, 3}, 100); - EXPECT_EQ(ds.size_hint(), 100); -} - -TEST_F(DocSetTest, MockDocSetDefaultSizeHint) { - MockDocSet ds({1, 2, 3, 4, 5}); - EXPECT_EQ(ds.size_hint(), 5); +namespace doris::segment_v2::inverted_index::query_v2 { + +class DocSetTest : public testing::Test {}; + +TEST_F(DocSetTest, test_docset_base_class_exceptions) { + DocSet docset; + + EXPECT_THROW(docset.advance(), Exception); + EXPECT_THROW(docset.seek(0), Exception); + EXPECT_THROW(docset.doc(), Exception); + EXPECT_THROW(docset.size_hint(), Exception); + EXPECT_THROW(docset.freq(), Exception); + EXPECT_THROW(docset.norm(), Exception); +} + +TEST_F(DocSetTest, test_mockdocset_constructor1_empty_docs) { + std::vector docs; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), TERMINATED); + EXPECT_EQ(mock_docset.size_hint(), 0); + EXPECT_EQ(mock_docset.norm(), 1); +} + +TEST_F(DocSetTest, test_mockdocset_constructor1_with_docs) { + std::vector docs = {5, 2, 8, 1, 
9}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.size_hint(), 5); + EXPECT_EQ(mock_docset.norm(), 1); +} + +TEST_F(DocSetTest, test_mockdocset_constructor1_with_size_hint) { + std::vector docs = {3, 1, 4}; + MockDocSet mock_docset(docs, 10); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.size_hint(), 10); + EXPECT_EQ(mock_docset.norm(), 1); +} + +TEST_F(DocSetTest, test_mockdocset_constructor1_with_norm) { + std::vector docs = {2, 5}; + MockDocSet mock_docset(docs, 0, 5); + + EXPECT_EQ(mock_docset.doc(), 2); + EXPECT_EQ(mock_docset.size_hint(), 2); + EXPECT_EQ(mock_docset.norm(), 5); +} + +TEST_F(DocSetTest, test_mockdocset_constructor1_with_all_params) { + std::vector docs = {7, 3, 9}; + MockDocSet mock_docset(docs, 20, 3); + + EXPECT_EQ(mock_docset.doc(), 3); + EXPECT_EQ(mock_docset.size_hint(), 20); + EXPECT_EQ(mock_docset.norm(), 3); +} + +TEST_F(DocSetTest, test_mockdocset_constructor2_empty_docs) { + std::vector docs; + std::map> doc_positions; + MockDocSet mock_docset(docs, doc_positions); + + EXPECT_EQ(mock_docset.doc(), TERMINATED); + EXPECT_EQ(mock_docset.size_hint(), 0); + EXPECT_EQ(mock_docset.norm(), 1); +} + +TEST_F(DocSetTest, test_mockdocset_constructor2_with_docs_and_positions) { + std::vector docs = {5, 2, 8}; + std::map> doc_positions; + doc_positions[2] = {10, 20}; + doc_positions[5] = {15}; + doc_positions[8] = {30, 40, 50}; + + MockDocSet mock_docset(docs, doc_positions); + + EXPECT_EQ(mock_docset.doc(), 2); + EXPECT_EQ(mock_docset.size_hint(), 3); + EXPECT_EQ(mock_docset.norm(), 1); + EXPECT_EQ(mock_docset.freq(), 2); +} + +TEST_F(DocSetTest, test_mockdocset_constructor2_with_size_hint) { + std::vector docs = {1, 3, 5}; + std::map> doc_positions; + MockDocSet mock_docset(docs, doc_positions, 15); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.size_hint(), 15); + EXPECT_EQ(mock_docset.norm(), 1); +} + +TEST_F(DocSetTest, test_mockdocset_constructor2_with_norm) { + 
std::vector docs = {4, 6}; + std::map> doc_positions; + MockDocSet mock_docset(docs, doc_positions, 0, 7); + + EXPECT_EQ(mock_docset.doc(), 4); + EXPECT_EQ(mock_docset.size_hint(), 2); + EXPECT_EQ(mock_docset.norm(), 7); +} + +TEST_F(DocSetTest, test_mockdocset_constructor2_with_all_params) { + std::vector docs = {2, 4, 6, 8}; + std::map> doc_positions; + doc_positions[2] = {1, 2}; + MockDocSet mock_docset(docs, doc_positions, 25, 4); + + EXPECT_EQ(mock_docset.doc(), 2); + EXPECT_EQ(mock_docset.size_hint(), 25); + EXPECT_EQ(mock_docset.norm(), 4); + EXPECT_EQ(mock_docset.freq(), 2); +} + +TEST_F(DocSetTest, test_next_empty_docs) { + std::vector docs; + MockDocSet mock_docset(docs); + + EXPECT_FALSE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_next_single_doc) { + std::vector docs = {5}; + MockDocSet mock_docset(docs); + + EXPECT_FALSE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_next_multiple_docs) { + std::vector docs = {1, 3, 5, 7}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_TRUE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), 3); + EXPECT_TRUE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), 5); + EXPECT_TRUE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), 7); + EXPECT_FALSE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_next_at_boundary) { + std::vector docs = {2, 4}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 2); + EXPECT_TRUE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), 4); + EXPECT_FALSE(mock_docset.next()); + EXPECT_FALSE(mock_docset.next()); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_skipto_empty_docs) { + std::vector docs; + MockDocSet mock_docset(docs); + + EXPECT_FALSE(mock_docset.skipTo(5)); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_skipto_current_doc_already_at_target) { 
+ std::vector docs = {1, 3, 5, 7}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_TRUE(mock_docset.skipTo(1)); + EXPECT_EQ(mock_docset.doc(), 1); +} + +TEST_F(DocSetTest, test_skipto_current_doc_after_target) { + std::vector docs = {1, 3, 5, 7}; + MockDocSet mock_docset(docs); + + mock_docset.seek(5); + EXPECT_EQ(mock_docset.doc(), 5); + EXPECT_TRUE(mock_docset.skipTo(3)); + EXPECT_EQ(mock_docset.doc(), 5); +} + +TEST_F(DocSetTest, test_skipto_find_target) { + std::vector docs = {1, 3, 5, 7, 9}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_TRUE(mock_docset.skipTo(5)); + EXPECT_EQ(mock_docset.doc(), 5); +} + +TEST_F(DocSetTest, test_skipto_target_not_found) { + std::vector docs = {1, 3, 5}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_FALSE(mock_docset.skipTo(10)); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_skipto_find_target_between_docs) { + std::vector docs = {1, 3, 5, 7}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_TRUE(mock_docset.skipTo(4)); + EXPECT_EQ(mock_docset.doc(), 5); +} + +TEST_F(DocSetTest, test_skipto_after_next) { + std::vector docs = {1, 3, 5, 7}; + MockDocSet mock_docset(docs); + + mock_docset.next(); + EXPECT_EQ(mock_docset.doc(), 3); + EXPECT_TRUE(mock_docset.skipTo(6)); + EXPECT_EQ(mock_docset.doc(), 7); +} + +TEST_F(DocSetTest, test_skipto_at_end) { + std::vector docs = {1, 3}; + MockDocSet mock_docset(docs); + + mock_docset.next(); + EXPECT_EQ(mock_docset.doc(), 3); + EXPECT_FALSE(mock_docset.skipTo(5)); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_docfreq) { + std::vector docs = {1, 2, 3}; + MockDocSet mock_docset(docs, 42); + + EXPECT_EQ(mock_docset.docFreq(), 42); +} + +TEST_F(DocSetTest, test_advance) { + std::vector docs = {1, 3, 5}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.advance(), 3); + 
EXPECT_EQ(mock_docset.doc(), 3); + EXPECT_EQ(mock_docset.advance(), 5); + EXPECT_EQ(mock_docset.doc(), 5); + EXPECT_EQ(mock_docset.advance(), TERMINATED); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_advance_empty) { + std::vector docs; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.advance(), TERMINATED); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_seek) { + std::vector docs = {1, 3, 5, 7}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.seek(5), 5); + EXPECT_EQ(mock_docset.doc(), 5); + EXPECT_EQ(mock_docset.seek(1), 5); + EXPECT_EQ(mock_docset.doc(), 5); +} + +TEST_F(DocSetTest, test_seek_empty) { + std::vector docs; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.seek(5), TERMINATED); + EXPECT_EQ(mock_docset.doc(), TERMINATED); +} + +TEST_F(DocSetTest, test_seek_not_found) { + std::vector docs = {1, 3, 5}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.seek(10), TERMINATED); + EXPECT_EQ(mock_docset.doc(), TERMINATED); } -TEST_F(DocSetTest, MockDocSetCustomNorm) { - MockDocSet ds({1, 2, 3}, 0, 42); - EXPECT_EQ(ds.norm(), 42); +TEST_F(DocSetTest, test_doc) { + std::vector docs = {2, 4, 6}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 2); + mock_docset.next(); + EXPECT_EQ(mock_docset.doc(), 4); + mock_docset.seek(6); + EXPECT_EQ(mock_docset.doc(), 6); } -TEST_F(DocSetTest, MockDocSetDefaultNorm) { - MockDocSet ds({1, 2, 3}); - EXPECT_EQ(ds.norm(), 1); +TEST_F(DocSetTest, test_size_hint) { + std::vector docs = {1, 2}; + MockDocSet mock_docset(docs, 50); + + EXPECT_EQ(mock_docset.size_hint(), 50); } -TEST_F(DocSetTest, MockDocSetAdvanceAfterSeek) { - MockDocSet ds({1, 5, 10, 15, 20, 25, 30}); - EXPECT_EQ(ds.seek(10), 10); - EXPECT_EQ(ds.advance(), 15); - EXPECT_EQ(ds.advance(), 20); - EXPECT_EQ(ds.seek(25), 25); - EXPECT_EQ(ds.advance(), 30); - EXPECT_EQ(ds.advance(), TERMINATED); +TEST_F(DocSetTest, test_norm) { 
+ std::vector docs = {1}; + MockDocSet mock_docset(docs, 0, 99); + + EXPECT_EQ(mock_docset.norm(), 99); } -TEST_F(DocSetTest, MockDocSetSeekAfterTerminated) { - MockDocSet ds({1, 2, 3}); - EXPECT_EQ(ds.advance(), 2); - EXPECT_EQ(ds.advance(), 3); - EXPECT_EQ(ds.advance(), TERMINATED); - EXPECT_EQ(ds.seek(100), TERMINATED); +TEST_F(DocSetTest, test_freq_terminated) { + std::vector docs; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.freq(), 0); } -TEST_F(DocSetTest, MockDocSetAdvanceAfterTerminated) { - MockDocSet ds({1, 2}); - EXPECT_EQ(ds.advance(), 2); - EXPECT_EQ(ds.advance(), TERMINATED); - EXPECT_EQ(ds.advance(), TERMINATED); +TEST_F(DocSetTest, test_freq_with_positions) { + std::vector docs = {1, 2, 3}; + std::map> doc_positions; + doc_positions[1] = {10, 20, 30}; + doc_positions[2] = {15}; + doc_positions[3] = {25, 35}; + + MockDocSet mock_docset(docs, doc_positions); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.freq(), 3); + + mock_docset.seek(2); + EXPECT_EQ(mock_docset.freq(), 1); + + mock_docset.seek(3); + EXPECT_EQ(mock_docset.freq(), 2); +} + +TEST_F(DocSetTest, test_freq_without_positions) { + std::vector docs = {1, 2}; + std::map> doc_positions; + MockDocSet mock_docset(docs, doc_positions); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.freq(), 1); + + mock_docset.seek(2); + EXPECT_EQ(mock_docset.freq(), 1); +} + +TEST_F(DocSetTest, test_freq_partial_positions) { + std::vector docs = {1, 2, 3}; + std::map> doc_positions; + doc_positions[2] = {10, 20}; + + MockDocSet mock_docset(docs, doc_positions); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.freq(), 1); + + mock_docset.seek(2); + EXPECT_EQ(mock_docset.freq(), 2); + + mock_docset.seek(3); + EXPECT_EQ(mock_docset.freq(), 1); +} + +TEST_F(DocSetTest, test_append_positions_with_offset_terminated) { + std::vector docs; + MockDocSet mock_docset(docs); + + std::vector output; + mock_docset.append_positions_with_offset(100, output); + + 
EXPECT_TRUE(output.empty()); +} + +TEST_F(DocSetTest, test_append_positions_with_offset_with_positions) { + std::vector docs = {1, 2}; + std::map> doc_positions; + doc_positions[1] = {10, 20}; + doc_positions[2] = {30, 40, 50}; + + MockDocSet mock_docset(docs, doc_positions); + + std::vector output; + output.push_back(5); + + mock_docset.append_positions_with_offset(100, output); + + EXPECT_EQ(output.size(), 3); + EXPECT_EQ(output[0], 5); + EXPECT_EQ(output[1], 110); + EXPECT_EQ(output[2], 120); +} + +TEST_F(DocSetTest, test_append_positions_with_offset_multiple_appends) { + std::vector docs = {1, 2}; + std::map> doc_positions; + doc_positions[1] = {10}; + doc_positions[2] = {20}; + + MockDocSet mock_docset(docs, doc_positions); + + std::vector output; + mock_docset.append_positions_with_offset(100, output); + + EXPECT_EQ(output.size(), 1); + EXPECT_EQ(output[0], 110); + + mock_docset.seek(2); + mock_docset.append_positions_with_offset(200, output); + + EXPECT_EQ(output.size(), 2); + EXPECT_EQ(output[0], 110); + EXPECT_EQ(output[1], 220); +} + +TEST_F(DocSetTest, test_append_positions_with_offset_no_positions) { + std::vector docs = {1}; + std::map> doc_positions; + MockDocSet mock_docset(docs, doc_positions); + + std::vector output; + output.push_back(5); + + mock_docset.append_positions_with_offset(100, output); + + EXPECT_EQ(output.size(), 1); + EXPECT_EQ(output[0], 5); +} + +TEST_F(DocSetTest, test_positions_with_offset) { + std::vector docs = {1}; + std::map> doc_positions; + doc_positions[1] = {10, 20, 30}; + + MockDocSet mock_docset(docs, doc_positions); + + std::vector output; + output.push_back(999); + + mock_docset.positions_with_offset(100, output); + + EXPECT_EQ(output.size(), 3); + EXPECT_EQ(output[0], 110); + EXPECT_EQ(output[1], 120); + EXPECT_EQ(output[2], 130); +} + +TEST_F(DocSetTest, test_positions_with_offset_terminated) { + std::vector docs; + MockDocSet mock_docset(docs); + + std::vector output; + output.push_back(999); + + 
mock_docset.positions_with_offset(100, output); + + EXPECT_TRUE(output.empty()); +} + +TEST_F(DocSetTest, test_positions_with_offset_empty_positions) { + std::vector docs = {1}; + std::map> doc_positions; + MockDocSet mock_docset(docs, doc_positions); + + std::vector output; + output.push_back(999); + + mock_docset.positions_with_offset(100, output); + + EXPECT_TRUE(output.empty()); +} + +TEST_F(DocSetTest, test_shared_ptr_types) { + std::vector docs = {1, 2, 3}; + DocSetPtr docset_ptr = std::make_shared(docs); + + EXPECT_EQ(docset_ptr->doc(), 1); + EXPECT_EQ(docset_ptr->advance(), 2); + + MockDocSetPtr mock_ptr = std::make_shared(docs); + EXPECT_EQ(mock_ptr->doc(), 1); +} + +TEST_F(DocSetTest, test_unsorted_docs_auto_sorted) { + std::vector docs = {9, 1, 5, 3, 7}; + MockDocSet mock_docset(docs); + + EXPECT_EQ(mock_docset.doc(), 1); + EXPECT_EQ(mock_docset.advance(), 3); + EXPECT_EQ(mock_docset.advance(), 5); + EXPECT_EQ(mock_docset.advance(), 7); + EXPECT_EQ(mock_docset.advance(), 9); +} + +TEST_F(DocSetTest, test_skipto_from_middle) { + std::vector docs = {1, 3, 5, 7, 9}; + MockDocSet mock_docset(docs); + + mock_docset.seek(5); + EXPECT_EQ(mock_docset.doc(), 5); + EXPECT_TRUE(mock_docset.skipTo(7)); + EXPECT_EQ(mock_docset.doc(), 7); + EXPECT_FALSE(mock_docset.skipTo(10)); + EXPECT_EQ(mock_docset.doc(), TERMINATED); } -} // namespace doris +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/loaded_postings_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/loaded_postings_test.cpp new file mode 100644 index 00000000000000..553dcfcd1a91c6 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/loaded_postings_test.cpp @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/query_v2/postings/loaded_postings.h" + +#include + +namespace doris { + +using namespace segment_v2::inverted_index::query_v2; + +class LoadedPostingsTest : public testing::Test { +public: + void SetUp() override {} + void TearDown() override {} +}; + +TEST_F(LoadedPostingsTest, EmptyPostings) { + LoadedPostings postings; + EXPECT_EQ(postings.doc(), TERMINATED); + EXPECT_EQ(postings.advance(), TERMINATED); + EXPECT_EQ(postings.size_hint(), 0); + EXPECT_EQ(postings.freq(), 0); +} + +TEST_F(LoadedPostingsTest, BasicAdvance) { + std::vector doc_ids; + for (uint32_t i = 0; i < 1024; ++i) { + doc_ids.push_back(i * 3); + } + std::vector> positions(1024); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 0); + EXPECT_EQ(postings.advance(), 3); + EXPECT_EQ(postings.doc(), 3); +} + +TEST_F(LoadedPostingsTest, SeekExactMatch) { + std::vector doc_ids; + for (uint32_t i = 0; i < 1024; ++i) { + doc_ids.push_back(i * 3); + } + std::vector> positions(1024); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 0); + EXPECT_EQ(postings.seek(15), 15); + EXPECT_EQ(postings.doc(), 15); + EXPECT_EQ(postings.seek(300), 300); + EXPECT_EQ(postings.doc(), 300); +} + +TEST_F(LoadedPostingsTest, SeekBeyondEnd) { + std::vector doc_ids; + for (uint32_t i = 0; i < 1024; ++i) { + doc_ids.push_back(i * 3); + } + 
std::vector> positions(1024); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.seek(6000), TERMINATED); +} + +TEST_F(LoadedPostingsTest, SeekNonExactMatch) { + std::vector doc_ids; + for (uint32_t i = 0; i < 1024; ++i) { + doc_ids.push_back(i * 3); + } + std::vector> positions(1024); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 0); + // Seek to 14, should land on 15 (next available) + EXPECT_EQ(postings.seek(14), 15); + EXPECT_EQ(postings.doc(), 15); +} + +TEST_F(LoadedPostingsTest, WithPositions) { + std::vector doc_ids; + for (uint32_t i = 0; i < 1024; ++i) { + doc_ids.push_back(i * 3); + } + + std::vector> positions(1024); + positions[0] = {1, 2, 3}; + positions[1] = {30}; + positions[2] = {10}; + positions[4] = {50}; + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 0); + + std::vector pos; + postings.positions_with_offset(0, pos); + EXPECT_EQ(pos.size(), 3); + EXPECT_EQ(pos[0], 1); + EXPECT_EQ(pos[1], 2); + EXPECT_EQ(pos[2], 3); + + EXPECT_EQ(postings.advance(), 3); + EXPECT_EQ(postings.doc(), 3); + + postings.positions_with_offset(0, pos); + EXPECT_EQ(pos.size(), 1); + EXPECT_EQ(pos[0], 30); +} + +TEST_F(LoadedPostingsTest, TermFreq) { + std::vector doc_ids = {0, 3, 6, 9}; + std::vector> positions(4); + positions[0] = {1, 2, 3}; // freq = 3 + positions[1] = {30}; // freq = 1 + positions[2] = {10, 20}; // freq = 2 + positions[3] = {50, 60, 70, 80}; // freq = 4 + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 0); + EXPECT_EQ(postings.freq(), 3); + EXPECT_EQ(postings.freq(), 3); + + postings.advance(); + EXPECT_EQ(postings.doc(), 3); + EXPECT_EQ(postings.freq(), 1); + + postings.advance(); + EXPECT_EQ(postings.doc(), 6); + EXPECT_EQ(postings.freq(), 2); + + postings.advance(); + EXPECT_EQ(postings.doc(), 9); + EXPECT_EQ(postings.freq(), 4); +} + +TEST_F(LoadedPostingsTest, AppendPositionsWithOffset) { + std::vector doc_ids = {0, 3}; + std::vector> 
positions(2); + positions[0] = {1, 2, 3}; + positions[1] = {10, 20}; + + LoadedPostings postings(doc_ids, positions); + + std::vector output; + postings.append_positions_with_offset(100, output); + + EXPECT_EQ(output.size(), 3); + EXPECT_EQ(output[0], 101); + EXPECT_EQ(output[1], 102); + EXPECT_EQ(output[2], 103); + + // Append more from the same doc + postings.append_positions_with_offset(200, output); + EXPECT_EQ(output.size(), 6); + EXPECT_EQ(output[3], 201); + EXPECT_EQ(output[4], 202); + EXPECT_EQ(output[5], 203); +} + +TEST_F(LoadedPostingsTest, PositionsWithOffset) { + std::vector doc_ids = {0, 3}; + std::vector> positions(2); + positions[0] = {1, 2, 3}; + positions[1] = {10, 20}; + + LoadedPostings postings(doc_ids, positions); + + std::vector output; + postings.positions_with_offset(100, output); + + EXPECT_EQ(output.size(), 3); + EXPECT_EQ(output[0], 101); + EXPECT_EQ(output[1], 102); + EXPECT_EQ(output[2], 103); +} + +TEST_F(LoadedPostingsTest, SizeHint) { + std::vector doc_ids = {1, 5, 10, 15, 20}; + std::vector> positions(5); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.size_hint(), 5); +} + +TEST_F(LoadedPostingsTest, Norm) { + std::vector doc_ids = {1, 5}; + std::vector> positions(2); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.norm(), 1); +} + +TEST_F(LoadedPostingsTest, SingleDocument) { + std::vector doc_ids = {42}; + std::vector> positions(1); + positions[0] = {1, 5, 10}; + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 42); + EXPECT_EQ(postings.freq(), 3); + EXPECT_EQ(postings.size_hint(), 1); + EXPECT_EQ(postings.advance(), TERMINATED); +} + +TEST_F(LoadedPostingsTest, SeekAfterAdvance) { + std::vector doc_ids = {1, 5, 10, 15, 20, 25, 30}; + std::vector> positions(7); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.seek(10), 10); + EXPECT_EQ(postings.advance(), 15); + EXPECT_EQ(postings.advance(), 20); + EXPECT_EQ(postings.seek(25), 25); 
+ EXPECT_EQ(postings.advance(), 30); + EXPECT_EQ(postings.advance(), TERMINATED); +} + +TEST_F(LoadedPostingsTest, SeekToCurrentDoc) { + std::vector doc_ids = {1, 5, 10, 15, 20}; + std::vector> positions(5); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 1); + EXPECT_EQ(postings.seek(1), 1); + EXPECT_EQ(postings.doc(), 1); + + EXPECT_EQ(postings.advance(), 5); + EXPECT_EQ(postings.seek(5), 5); + EXPECT_EQ(postings.doc(), 5); +} + +TEST_F(LoadedPostingsTest, SeekBeforeCurrent) { + std::vector doc_ids = {1, 5, 10, 15, 20}; + std::vector> positions(5); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.advance(), 5); + EXPECT_EQ(postings.doc(), 5); + + // Seeking to a value less than current should return current + EXPECT_EQ(postings.seek(3), 5); + EXPECT_EQ(postings.doc(), 5); +} + +TEST_F(LoadedPostingsTest, NoPositions) { + std::vector doc_ids = {1, 5, 10}; + std::vector> positions(3); // All empty + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.doc(), 1); + EXPECT_EQ(postings.freq(), 0); + + std::vector pos; + postings.positions_with_offset(0, pos); + EXPECT_EQ(pos.size(), 0); +} + +TEST_F(LoadedPostingsTest, AdvanceToEnd) { + std::vector doc_ids = {1, 2, 3}; + std::vector> positions(3); + + LoadedPostings postings(doc_ids, positions); + + EXPECT_EQ(postings.advance(), 2); + EXPECT_EQ(postings.advance(), 3); + EXPECT_EQ(postings.advance(), TERMINATED); + EXPECT_EQ(postings.advance(), TERMINATED); // Stay at TERMINATED +} + +} // namespace doris \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/multi_phrase_query_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/multi_phrase_query_test.cpp new file mode 100644 index 00000000000000..ca5dba0aeac7c1 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/multi_phrase_query_test.cpp @@ -0,0 +1,794 @@ +// Licensed to the Apache Software Foundation (ASF) under one 
+// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_query.h" + +#include + +#include +#include +#include + +#include "common/status.h" +#include "olap/rowset/segment_v2/index_query_context.h" +#include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h" +#include "olap/rowset/segment_v2/inverted_index/query/query_info.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_weight.h" +#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" + +CL_NS_USE(search) +CL_NS_USE(store) +CL_NS_USE(index) + +namespace doris::segment_v2 { + +using namespace inverted_index; + +class MultiPhraseQueryV2Test : public testing::Test { +public: + const std::string kTestDir = "./ut_dir/multi_phrase_query_test"; + + void SetUp() override { + auto st = io::global_local_filesystem()->delete_directory(kTestDir); + ASSERT_TRUE(st.ok()) << st; + st = io::global_local_filesystem()->create_directory(kTestDir); + ASSERT_TRUE(st.ok()) << st; + std::string field_name = "content"; + create_test_index(field_name, kTestDir); + } + + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); + } + +private: + void 
create_test_index(const std::string& field_name, const std::string& dir) { + std::vector test_data = {"the quick brown fox jumps over the lazy dog", + "quick brown dogs are running fast", + "the brown cat sleeps peacefully", + "lazy dogs and quick cats", + "the lazy dog is very lazy", + "quick fox and brown bear", + "the quick brown horse runs", + "dogs and cats are pets", + "the fox is quick and brown", + "brown foxes jump over fences", + "lazy cat sleeps all day", + "quick brown fox in the forest", + "the dog barks loudly", + "brown and white dogs", + "quick movements of animals", + "the lazy afternoon", + "brown fox runs quickly", + "the quick test", + "brown lazy fox", + "quick brown lazy dog", + "fast quick animal runs", + "the speedy brown fox jumps", + "rapid brown dogs run"}; + + CustomAnalyzerConfig::Builder builder; + builder.with_tokenizer_config("standard", {}); + auto custom_analyzer_config = builder.build(); + auto custom_analyzer = CustomAnalyzer::build_custom_analyzer(custom_analyzer_config); + + auto* indexwriter = + _CLNEW lucene::index::IndexWriter(dir.c_str(), custom_analyzer.get(), true); + indexwriter->setMaxBufferedDocs(100); + indexwriter->setRAMBufferSizeMB(-1); + indexwriter->setMaxFieldLength(0x7FFFFFFFL); + indexwriter->setMergeFactor(1000000000); + indexwriter->setUseCompoundFile(false); + + auto char_string_reader = std::make_shared>(); + + auto* doc = _CLNEW lucene::document::Document(); + int32_t field_config = lucene::document::Field::STORE_NO; + field_config |= lucene::document::Field::INDEX_NONORMS; + field_config |= lucene::document::Field::INDEX_TOKENIZED; + auto field_name_w = std::wstring(field_name.begin(), field_name.end()); + auto* field = _CLNEW lucene::document::Field(field_name_w.c_str(), field_config); + field->setOmitTermFreqAndPositions(false); + doc->add(*field); + + for (const auto& data : test_data) { + char_string_reader->init(data.data(), data.size(), false); + auto* stream = 
custom_analyzer->reusableTokenStream(field->name(), char_string_reader); + field->setValue(stream); + indexwriter->addDocument(doc); + } + + indexwriter->close(); + _CLLDELETE(indexwriter); + _CLLDELETE(doc); + } +}; + +static std::shared_ptr make_shared_reader( + lucene::index::IndexReader* raw_reader) { + return {raw_reader, [](lucene::index::IndexReader* reader) { + if (reader != nullptr) { + reader->close(); + _CLDELETE(reader); + } + }}; +} + +// Test basic multi-phrase query construction with single terms at each position +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_construction_single_terms) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + std::wstring field = StringHelper::to_wstring("content"); + + // Create term_infos with single terms at each position + std::vector term_infos; + TermInfo term1; + term1.term = std::string("quick"); + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("brown"); + term2.position = 1; + term_infos.push_back(term2); + + // Test query construction + auto query = std::make_shared(context, field, term_infos); + ASSERT_NE(query, nullptr); + + // Test weight creation without scoring + auto weight = query->weight(false); + ASSERT_NE(weight, nullptr); + + // Verify weight is of correct type + auto multi_phrase_weight = std::dynamic_pointer_cast(weight); + ASSERT_NE(multi_phrase_weight, nullptr); +} + +// Test multi-phrase query construction with multiple terms at one position +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_construction_multi_terms) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + std::wstring field = StringHelper::to_wstring("content"); + + // Create term_infos with multiple terms at first position (quick OR fast OR speedy) + std::vector 
term_infos; + TermInfo term1; + term1.term = std::vector {"quick", "fast", "speedy"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("brown"); + term2.position = 1; + term_infos.push_back(term2); + + // Test query construction + auto query = std::make_shared(context, field, term_infos); + ASSERT_NE(query, nullptr); + + // Test weight creation + auto weight = query->weight(false); + ASSERT_NE(weight, nullptr); + + auto multi_phrase_weight = std::dynamic_pointer_cast(weight); + ASSERT_NE(multi_phrase_weight, nullptr); +} + +// Test multi-phrase query with empty terms (should throw exception) +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_empty_terms) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + std::wstring field = StringHelper::to_wstring("content"); + std::vector term_infos; // Empty term_infos + + auto query = std::make_shared(context, field, term_infos); + ASSERT_NE(query, nullptr); + + // Should throw exception when creating weight with empty terms + EXPECT_THROW({ auto weight = query->weight(false); }, Exception); +} + +// Test multi-phrase query with single term (should throw exception) +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_single_term) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + std::wstring field = StringHelper::to_wstring("content"); + + std::vector term_infos; + TermInfo term1; + term1.term = std::string("fox"); + term1.position = 0; + term_infos.push_back(term1); + + auto query = std::make_shared(context, field, term_infos); + ASSERT_NE(query, nullptr); + + // Should throw exception when creating weight with single term (requires at least 2 terms) + EXPECT_THROW({ auto weight = query->weight(false); }, Exception); +} + +// Test multi-phrase query execution with 
two single terms +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_two_single_terms) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + std::vector term_infos; + TermInfo term1; + term1.term = std::string("quick"); + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("brown"); + term2.position = 1; + term_infos.push_back(term2); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + // Should match documents containing "quick brown" + EXPECT_GT(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with alternatives at first position +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_with_alternatives) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + // Match "(quick OR fast) brown" - should match more documents + 
std::vector term_infos; + TermInfo term1; + term1.term = std::vector {"quick", "fast"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("brown"); + term2.position = 1; + term_infos.push_back(term2); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + // Should match documents containing either "quick brown" or "fast brown" + EXPECT_GT(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with alternatives at multiple positions +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_multiple_alternatives) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + // Match "(quick OR fast) (brown OR lazy)" + std::vector term_infos; + TermInfo term1; + term1.term = std::vector {"quick", "fast"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::vector {"brown", "lazy"}; + term2.position = 1; + term_infos.push_back(term2); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + 
exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + // Should match documents containing combinations: "quick brown", "quick lazy", "fast brown", or "fast lazy" + EXPECT_GT(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with three positions +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_three_positions) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + // Match "(quick OR fast) brown (fox OR dog)" + std::vector term_infos; + TermInfo term1; + term1.term = std::vector {"quick", "fast"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("brown"); + term2.position = 1; + term_infos.push_back(term2); + + TermInfo term3; + term3.term = std::vector {"fox", "dog"}; + term3.position = 2; + term_infos.push_back(term3); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + // Should match 
documents containing various combinations + EXPECT_GE(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with non-matching phrase +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_no_matches) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + // Terms that don't appear together in sequence + std::vector term_infos; + TermInfo term1; + term1.term = std::vector {"purple", "orange"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("elephant"); + term2.position = 1; + term_infos.push_back(term2); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + EXPECT_EQ(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with scoring enabled +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_with_scoring) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + 
std::wstring field = StringHelper::to_wstring("content"); + + std::vector term_infos; + TermInfo term1; + term1.term = std::vector {"quick", "fast"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("brown"); + term2.position = 1; + term_infos.push_back(term2); + + // Fill collection statistics for scoring + context->collection_statistics->_total_num_docs = reader_holder->numDocs(); + context->collection_statistics->_total_num_tokens[field] = reader_holder->numDocs() * 8; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("quick")] = 10; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("fast")] = 5; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("brown")] = 10; + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(true); // Enable scoring + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + float total_score = 0.0F; + uint32_t count = 0; + while (doc != query_v2::TERMINATED) { + float score = scorer->score(); + EXPECT_GE(score, 0.0F) << "Score should be non-negative"; + total_score += score; + result.add(doc); + ++count; + doc = scorer->advance(); + } + + if (count > 0) { + EXPECT_GT(total_score, 0.0F) << "Total score should be positive"; + } + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with binding key +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_with_binding_key) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + 
auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + std::vector term_infos; + TermInfo term1; + term1.term = std::string("lazy"); + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::vector {"dog", "cat"}; + term2.position = 1; + term_infos.push_back(term2); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + + std::string binding_key = "content#0"; + exec_ctx.reader_bindings[binding_key] = reader_holder; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx, binding_key); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + EXPECT_GE(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with all positions having alternatives +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_all_positions_alternatives) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + // All positions have alternatives + std::vector term_infos; + TermInfo term1; + term1.term = std::vector {"quick", "fast", "speedy", "rapid"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::vector {"brown", "lazy"}; + term2.position = 1; + 
term_infos.push_back(term2); + + TermInfo term3; + term3.term = std::vector {"fox", "dog", "dogs"}; + term3.position = 2; + term_infos.push_back(term3); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + // Should match various combinations + EXPECT_GE(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query destructor (coverage) +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_destructor) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + std::wstring field = StringHelper::to_wstring("content"); + + std::vector term_infos; + TermInfo term1; + term1.term = std::vector {"test", "example"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("phrase"); + term2.position = 1; + term_infos.push_back(term2); + + { + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + ASSERT_NE(weight, nullptr); + // Query and weight will be destroyed at scope exit + } + // If we reach here without crash, destructor works correctly + SUCCEED(); +} + +// Test multi-phrase query with longer phrase (4+ positions) +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_long_phrase) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto 
reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + // Match "the (quick OR fast) brown fox jumps" + std::vector term_infos; + TermInfo term1; + term1.term = std::string("the"); + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::vector {"quick", "fast"}; + term2.position = 1; + term_infos.push_back(term2); + + TermInfo term3; + term3.term = std::string("brown"); + term3.position = 2; + term_infos.push_back(term3); + + TermInfo term4; + term4.term = std::string("fox"); + term4.position = 3; + term_infos.push_back(term4); + + TermInfo term5; + term5.term = std::string("jumps"); + term5.position = 4; + term_infos.push_back(term5); + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(false); + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + roaring::Roaring result; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + result.add(doc); + doc = scorer->advance(); + } + + EXPECT_GE(result.cardinality(), 0); + + _CLDECDELETE(dir); +} + +// Test multi-phrase query with BM25 similarity +TEST_F(MultiPhraseQueryV2Test, test_multi_phrase_query_bm25_similarity) { + auto context = std::make_shared(); + context->collection_statistics = std::make_shared(); + context->collection_similarity = std::make_shared(); + + auto* dir = FSDirectory::getDirectory(kTestDir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + ASSERT_TRUE(reader_holder != nullptr); + + std::wstring field = StringHelper::to_wstring("content"); + + std::vector term_infos; + TermInfo term1; + term1.term = 
std::vector {"quick", "fast"}; + term1.position = 0; + term_infos.push_back(term1); + + TermInfo term2; + term2.term = std::string("brown"); + term2.position = 1; + term_infos.push_back(term2); + + TermInfo term3; + term3.term = std::vector {"fox", "dog"}; + term3.position = 2; + term_infos.push_back(term3); + + // Setup statistics for BM25 + context->collection_statistics->_total_num_docs = reader_holder->numDocs(); + context->collection_statistics->_total_num_tokens[field] = reader_holder->numDocs() * 8; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("quick")] = 10; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("fast")] = 5; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("brown")] = 10; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("fox")] = 8; + context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("dog")] = 8; + + auto query = std::make_shared(context, field, term_infos); + auto weight = query->weight(true); // Enable scoring + + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + + uint32_t doc = scorer->doc(); + bool found_match = false; + while (doc != query_v2::TERMINATED) { + float score = scorer->score(); + EXPECT_GE(score, 0.0F) << "BM25 score should be non-negative"; + found_match = true; + doc = scorer->advance(); + } + + if (found_match) { + SUCCEED() << "Found matches with BM25 scoring"; + } + + _CLDECDELETE(dir); +} + +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query_test.cpp index 
74831ab6917718..43cf0ed6197a5d 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query_test.cpp @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -27,8 +26,8 @@ #include "common/status.h" #include "olap/rowset/segment_v2/index_query_context.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h" +#include "olap/rowset/segment_v2/inverted_index/query/query_info.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h" -#include "olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.h" #include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" CL_NS_USE(search) @@ -136,8 +135,17 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_construction) { std::vector terms = {StringHelper::to_wstring("quick"), StringHelper::to_wstring("brown")}; + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + // Test query construction - auto query = std::make_shared(context, field, terms); + auto query = std::make_shared(context, field, term_infos); ASSERT_NE(query, nullptr); // Test weight creation without scoring @@ -178,9 +186,9 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_empty_terms) { context->collection_similarity = std::make_shared(); std::wstring field = StringHelper::to_wstring("content"); - std::vector terms; // Empty terms + std::vector term_infos; // Empty term_infos - auto query = std::make_shared(context, field, terms); + auto query = std::make_shared(context, field, term_infos); ASSERT_NE(query, nullptr); // Should throw exception when creating weight with empty terms @@ -201,7 +209,16 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_two_terms) { std::vector terms = 
{StringHelper::to_wstring("quick"), StringHelper::to_wstring("brown")}; - auto query = std::make_shared(context, field, terms); + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(false); query_v2::QueryExecutionContext exec_ctx; @@ -240,7 +257,16 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_three_terms) { StringHelper::to_wstring("brown"), StringHelper::to_wstring("fox")}; - auto query = std::make_shared(context, field, terms); + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(false); query_v2::QueryExecutionContext exec_ctx; @@ -273,7 +299,16 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_single_term) { std::wstring field = StringHelper::to_wstring("content"); std::vector terms = {StringHelper::to_wstring("fox")}; - auto query = std::make_shared(context, field, terms); + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + + auto query = std::make_shared(context, field, term_infos); ASSERT_NE(query, nullptr); // Should throw exception when creating weight with single term (phrase requires at least 2 terms) @@ -294,7 +329,16 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_no_matches) { std::vector terms = {StringHelper::to_wstring("purple"), 
StringHelper::to_wstring("elephant")}; - auto query = std::make_shared(context, field, terms); + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(false); query_v2::QueryExecutionContext exec_ctx; @@ -331,13 +375,22 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_scoring) { std::vector terms = {StringHelper::to_wstring("quick"), StringHelper::to_wstring("brown")}; + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + // Fill collection statistics for scoring context->collection_statistics->_total_num_docs = reader_holder->numDocs(); context->collection_statistics->_total_num_tokens[field] = reader_holder->numDocs() * 8; context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("quick")] = 10; context->collection_statistics->_term_doc_freqs[field][StringHelper::to_wstring("brown")] = 10; - auto query = std::make_shared(context, field, terms); + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(true); query_v2::QueryExecutionContext exec_ctx; @@ -382,7 +435,16 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_with_binding_key) { std::vector terms = {StringHelper::to_wstring("lazy"), StringHelper::to_wstring("dog")}; - auto query = std::make_shared(context, field, terms); + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + 
term_infos.push_back(term_info); + } + + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(false); query_v2::QueryExecutionContext exec_ctx; @@ -418,8 +480,17 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_destructor) { std::vector terms = {StringHelper::to_wstring("test"), StringHelper::to_wstring("phrase")}; + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + { - auto query = std::make_shared(context, field, terms); + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(false); ASSERT_NE(weight, nullptr); // Query and weight will be destroyed at scope exit @@ -444,7 +515,16 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_long_phrase) { StringHelper::to_wstring("brown"), StringHelper::to_wstring("fox"), StringHelper::to_wstring("jumps")}; - auto query = std::make_shared(context, field, terms); + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(false); query_v2::QueryExecutionContext exec_ctx; @@ -482,7 +562,16 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_terms_not_in_sequence) { std::vector terms = {StringHelper::to_wstring("dog"), StringHelper::to_wstring("fox")}; - auto query = std::make_shared(context, field, terms); + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + + auto 
query = std::make_shared(context, field, term_infos); auto weight = query->weight(false); query_v2::QueryExecutionContext exec_ctx; @@ -521,6 +610,15 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_bm25_similarity) { StringHelper::to_wstring("brown"), StringHelper::to_wstring("fox")}; + std::vector term_infos; + term_infos.reserve(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermInfo term_info; + term_info.term = StringHelper::to_string(terms[i]); + term_info.position = static_cast(i); + term_infos.push_back(term_info); + } + // Setup statistics for BM25 context->collection_statistics->_total_num_docs = reader_holder->numDocs(); context->collection_statistics->_total_num_tokens[field] = reader_holder->numDocs() * 8; @@ -528,7 +626,7 @@ TEST_F(PhraseQueryV2Test, test_phrase_query_bm25_similarity) { context->collection_statistics->_term_doc_freqs[field][term] = 5; } - auto query = std::make_shared(context, field, terms); + auto query = std::make_shared(context, field, term_infos); auto weight = query->weight(true); // Enable scoring query_v2::QueryExecutionContext exec_ctx; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings_test.cpp index f6fef2dae73da1..10567b0bf4856d 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/segment_postings_test.cpp @@ -17,149 +17,318 @@ #include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h" +#include #include -#include +#include #include #include -namespace doris { +#include "CLucene/index/DocRange.h" -using namespace segment_v2::inverted_index::query_v2; +namespace doris::segment_v2::inverted_index::query_v2 { -class SegmentPostingsTest : public ::testing::Test { +class MockTermDocs : public lucene::index::TermDocs { public: - void SetUp() override {} - void TearDown() override {} 
+ MockTermDocs(std::vector docs, std::vector freqs, std::vector norms, + int32_t doc_freq) + : _docs(std::move(docs)), + _freqs(std::move(freqs)), + _norms(std::move(norms)), + _doc_freq(doc_freq) {} + + void seek(lucene::index::Term* term) override {} + void seek(lucene::index::TermEnum* termEnum) override {} + + int32_t doc() const override { + if (_index >= 0 && _index < static_cast(_docs.size())) { + return _docs[_index]; + } + return INT_MAX; + } + + int32_t freq() const override { + if (_index >= 0 && _index < static_cast(_freqs.size())) { + return _freqs[_index]; + } + return 0; + } + + int32_t norm() const override { + if (_index >= 0 && _index < static_cast(_norms.size())) { + return _norms[_index]; + } + return 1; + } + + bool next() override { + if (_index + 1 < static_cast(_docs.size())) { + ++_index; + return true; + } + return false; + } + + int32_t read(int32_t*, int32_t*, int32_t) override { return 0; } + int32_t read(int32_t*, int32_t*, int32_t*, int32_t) override { return 0; } + bool readRange(DocRange*) override { return false; } + + bool skipTo(const int32_t target) override { + auto size = static_cast(_docs.size()); + while (_index + 1 < size && _docs[_index + 1] < target) { + ++_index; + } + if (_index + 1 < size) { + ++_index; + return true; + } + return false; + } + + void close() override {} + lucene::index::TermPositions* __asTermPositions() override { return nullptr; } + int32_t docFreq() override { return _doc_freq; } + +private: + std::vector _docs; + std::vector _freqs; + std::vector _norms; + int32_t _doc_freq; + int32_t _index = -1; }; -class FakeIter { +class MockTermPositions : public lucene::index::TermPositions { public: - struct Entry { - int32_t d; - int32_t f; - int32_t n; - }; + MockTermPositions(std::vector docs, std::vector freqs, + std::vector norms, std::vector> positions, + int32_t doc_freq) + : _docs(std::move(docs)), + _freqs(std::move(freqs)), + _norms(std::move(norms)), + _positions(std::move(positions)), + 
_doc_freq(doc_freq) {} + + void seek(lucene::index::Term* term) override {} + void seek(lucene::index::TermEnum* termEnum) override {} - explicit FakeIter(std::vector postings) : _postings(std::move(postings)) {} + int32_t doc() const override { + if (_index >= 0 && _index < static_cast(_docs.size())) { + return _docs[_index]; + } + return INT_MAX; + } + + int32_t freq() const override { + if (_index >= 0 && _index < static_cast(_freqs.size())) { + return _freqs[_index]; + } + return 0; + } + + int32_t norm() const override { + if (_index >= 0 && _index < static_cast(_norms.size())) { + return _norms[_index]; + } + return 1; + } - bool next() { - if (_idx + 1 < static_cast(_postings.size())) { - ++_idx; + bool next() override { + if (_index + 1 < static_cast(_docs.size())) { + ++_index; + _pos_index = 0; return true; } return false; } - bool skipTo(uint32_t target) { - int32_t start = std::max(_idx, 0); - for (int32_t j = start; j < static_cast(_postings.size()); ++j) { - if (static_cast(_postings[j].d) >= target) { - _idx = j; - return true; - } + int32_t read(int32_t*, int32_t*, int32_t) override { return 0; } + int32_t read(int32_t*, int32_t*, int32_t*, int32_t) override { return 0; } + bool readRange(DocRange*) override { return false; } + + bool skipTo(const int32_t target) override { + auto size = static_cast(_docs.size()); + while (_index + 1 < size && _docs[_index + 1] < target) { + ++_index; + } + _pos_index = 0; + if (_index + 1 < size) { + ++_index; + return true; } return false; } - int32_t doc() const { return _postings[_idx].d; } - int32_t freq() const { return _postings[_idx].f; } - int32_t norm() const { return _postings[_idx].n; } - uint32_t docFreq() const { return static_cast(_postings.size()); } + void close() override {} + lucene::index::TermPositions* __asTermPositions() override { return this; } + lucene::index::TermDocs* __asTermDocs() override { return this; } + + int32_t nextPosition() override { + if (_index >= 0 && _index < 
static_cast(_positions.size()) && + _pos_index < _positions[_index].size()) { + return _positions[_index][_pos_index++]; + } + return 0; + } + + int32_t getPayloadLength() const override { return 0; } + uint8_t* getPayload(uint8_t*) override { return nullptr; } + bool isPayloadAvailable() const override { return false; } + int32_t docFreq() override { return _doc_freq; } private: - std::vector _postings; - int32_t _idx = -1; + std::vector _docs; + std::vector _freqs; + std::vector _norms; + std::vector> _positions; + int32_t _doc_freq; + int32_t _index = -1; + size_t _pos_index = 0; }; -TEST_F(SegmentPostingsTest, BasicIterationAndScore) { - using IterPtr = std::shared_ptr; - std::vector data = {{1, 2, 10}, {3, 1, 7}, {5, 4, 5}}; - auto iter = std::make_shared(data); +class SegmentPostingsTest : public testing::Test {}; + +TEST_F(SegmentPostingsTest, test_postings_positions_with_offset) { + class TestPostings : public Postings { + public: + void append_positions_with_offset(uint32_t offset, std::vector& output) override { + output.push_back(offset + 10); + output.push_back(offset + 20); + } + }; + + TestPostings postings; + std::vector output = {999}; + postings.positions_with_offset(100, output); - SegmentPostings sp(iter); + EXPECT_EQ(output.size(), 2); + EXPECT_EQ(output[0], 110); + EXPECT_EQ(output[1], 120); +} - EXPECT_EQ(sp.size_hint(), 3u); - EXPECT_EQ(sp.doc(), 1u); - EXPECT_EQ(sp.freq(), 2); - EXPECT_EQ(sp.norm(), 10); +TEST_F(SegmentPostingsTest, test_segment_postings_base_default_constructor) { + SegmentPostingsBase base; + EXPECT_EQ(base.doc(), TERMINATED); +} - EXPECT_EQ(sp.advance(), 3u); - EXPECT_EQ(sp.doc(), 3u); - EXPECT_EQ(sp.freq(), 1); - EXPECT_EQ(sp.norm(), 7); +TEST_F(SegmentPostingsTest, test_segment_postings_base_constructor_next_true) { + TermDocsPtr ptr(new MockTermDocs({1, 3, 5}, {2, 4, 6}, {1, 1, 1}, 3)); + SegmentPostingsBase base(std::move(ptr)); - EXPECT_EQ(sp.advance(), 5u); - EXPECT_EQ(sp.doc(), 5u); - EXPECT_EQ(sp.freq(), 4); - 
EXPECT_EQ(sp.norm(), 5); + EXPECT_EQ(base.doc(), 1); + EXPECT_EQ(base.size_hint(), 3); + EXPECT_EQ(base.freq(), 2); + EXPECT_EQ(base.norm(), 1); +} - EXPECT_EQ(sp.advance(), TERMINATED); - EXPECT_EQ(sp.doc(), TERMINATED); +TEST_F(SegmentPostingsTest, test_segment_postings_base_constructor_next_false) { + TermDocsPtr ptr(new MockTermDocs({}, {}, {}, 0)); + SegmentPostingsBase base(std::move(ptr)); + + EXPECT_EQ(base.doc(), TERMINATED); } -TEST_F(SegmentPostingsTest, SeekBehavior) { - using IterPtr = std::shared_ptr; - std::vector data = {{2, 1, 1}, {10, 2, 3}, {15, 3, 9}}; - auto iter = std::make_shared(data); +TEST_F(SegmentPostingsTest, test_segment_postings_base_constructor_doc_int_max) { + TermDocsPtr ptr(new MockTermDocs({INT_MAX}, {1}, {1}, 1)); + SegmentPostingsBase base(std::move(ptr)); + + EXPECT_EQ(base.doc(), TERMINATED); +} - SegmentPostings sp(iter); +TEST_F(SegmentPostingsTest, test_segment_postings_base_advance_success) { + TermDocsPtr ptr(new MockTermDocs({1, 3, 5}, {2, 4, 6}, {1, 1, 1}, 3)); + SegmentPostingsBase base(std::move(ptr)); - EXPECT_EQ(sp.doc(), 2u); - EXPECT_EQ(sp.seek(0), 2u); - EXPECT_EQ(sp.seek(2), 2u); - EXPECT_EQ(sp.seek(3), 10u); - EXPECT_EQ(sp.seek(10), 10u); - EXPECT_EQ(sp.seek(11), 15u); - EXPECT_EQ(sp.seek(100), TERMINATED); - EXPECT_EQ(sp.doc(), TERMINATED); + EXPECT_EQ(base.doc(), 1); + EXPECT_EQ(base.advance(), 3); + EXPECT_EQ(base.advance(), 5); } -TEST_F(SegmentPostingsTest, NoScoreSegmentPostingAlwaysOne) { - using IterPtr = std::shared_ptr; - std::vector data = {{1, 100, 200}, {2, 300, 400}}; - auto iter = std::make_shared(data); +TEST_F(SegmentPostingsTest, test_segment_postings_base_advance_end) { + TermDocsPtr ptr(new MockTermDocs({1}, {2}, {1}, 1)); + SegmentPostingsBase base(std::move(ptr)); - NoScoreSegmentPosting sp(iter); - EXPECT_EQ(sp.doc(), 1u); - EXPECT_EQ(sp.freq(), 1); - EXPECT_EQ(sp.norm(), 1); + EXPECT_EQ(base.advance(), TERMINATED); +} - EXPECT_EQ(sp.advance(), 2u); - EXPECT_EQ(sp.freq(), 1); - 
EXPECT_EQ(sp.norm(), 1); +TEST_F(SegmentPostingsTest, test_segment_postings_base_seek_target_le_doc) { + TermDocsPtr ptr(new MockTermDocs({1, 3, 5}, {2, 4, 6}, {1, 1, 1}, 3)); + SegmentPostingsBase base(std::move(ptr)); - EXPECT_EQ(sp.advance(), TERMINATED); - EXPECT_EQ(sp.doc(), TERMINATED); + EXPECT_EQ(base.seek(0), 1); + EXPECT_EQ(base.seek(1), 1); } -TEST_F(SegmentPostingsTest, EmptySegmentPostingAlwaysTerminated) { - EmptySegmentPosting> sp; - EXPECT_EQ(sp.size_hint(), 0u); - EXPECT_EQ(sp.doc(), TERMINATED); - EXPECT_EQ(sp.advance(), TERMINATED); - EXPECT_EQ(sp.seek(123), TERMINATED); - EXPECT_EQ(sp.freq(), 1); - EXPECT_EQ(sp.norm(), 1); +TEST_F(SegmentPostingsTest, test_segment_postings_base_seek_skipTo_success) { + TermDocsPtr ptr(new MockTermDocs({1, 3, 5, 7}, {2, 4, 6, 8}, {1, 1, 1, 1}, 4)); + SegmentPostingsBase base(std::move(ptr)); + + EXPECT_EQ(base.seek(5), 5); +} + +TEST_F(SegmentPostingsTest, test_segment_postings_base_seek_skipTo_fail) { + TermDocsPtr ptr(new MockTermDocs({1, 3, 5}, {2, 4, 6}, {1, 1, 1}, 3)); + SegmentPostingsBase base(std::move(ptr)); + + EXPECT_EQ(base.seek(10), TERMINATED); +} + +TEST_F(SegmentPostingsTest, test_segment_postings_base_append_positions_exception) { + TermDocsPtr ptr(new MockTermDocs({1}, {2}, {1}, 1)); + SegmentPostingsBase base(std::move(ptr)); + + std::vector output; + EXPECT_THROW(base.append_positions_with_offset(0, output), Exception); +} + +TEST_F(SegmentPostingsTest, test_segment_postings_termdocs) { + TermDocsPtr ptr(new MockTermDocs({1, 3}, {2, 4}, {1, 1}, 2)); + SegmentPostings postings(std::move(ptr)); + + EXPECT_EQ(postings.doc(), 1); + EXPECT_EQ(postings.size_hint(), 2); +} + +TEST_F(SegmentPostingsTest, test_segment_postings_termpositions) { + TermPositionsPtr ptr( + new MockTermPositions({1, 3}, {2, 3}, {1, 1}, {{10, 20}, {30, 40, 50}}, 2)); + SegmentPostings postings(std::move(ptr)); + + EXPECT_EQ(postings.doc(), 1); + EXPECT_EQ(postings.freq(), 2); +} + +TEST_F(SegmentPostingsTest, 
test_segment_postings_termpositions_append_positions) { + TermPositionsPtr ptr( + new MockTermPositions({1, 3}, {2, 3}, {1, 1}, {{10, 20}, {30, 40, 50}}, 2)); + SegmentPostings postings(std::move(ptr)); + + std::vector output = {999}; + postings.append_positions_with_offset(100, output); + + EXPECT_EQ(output.size(), 3); + EXPECT_EQ(output[0], 999); + EXPECT_EQ(output[1], 110); + EXPECT_EQ(output[2], 120); } -TEST_F(SegmentPostingsTest, ConstructorWithEmptyIterator) { - using IterPtr = std::shared_ptr; - std::vector data; - auto iter = std::make_shared(data); +TEST_F(SegmentPostingsTest, test_no_score_segment_posting) { + TermDocsPtr ptr(new MockTermDocs({1, 3}, {5, 7}, {10, 20}, 2)); + NoScoreSegmentPosting posting(std::move(ptr)); - SegmentPostings sp(iter); - EXPECT_EQ(sp.size_hint(), 0u); - EXPECT_EQ(sp.doc(), TERMINATED); + EXPECT_EQ(posting.doc(), 1); + EXPECT_EQ(posting.freq(), 1); + EXPECT_EQ(posting.norm(), 1); } -TEST_F(SegmentPostingsTest, IntMaxDocBecomesTerminatedOnInit) { - using IterPtr = std::shared_ptr; - std::vector data = {{INT_MAX, 1, 1}}; - auto iter = std::make_shared(data); +TEST_F(SegmentPostingsTest, test_empty_segment_posting) { + EmptySegmentPosting posting; - SegmentPostings sp(iter); - EXPECT_EQ(sp.doc(), TERMINATED); + EXPECT_EQ(posting.doc(), TERMINATED); + EXPECT_EQ(posting.size_hint(), 0); + EXPECT_EQ(posting.freq(), 1); + EXPECT_EQ(posting.norm(), 1); + EXPECT_EQ(posting.advance(), TERMINATED); + EXPECT_EQ(posting.seek(100), TERMINATED); } -} // namespace doris \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/simple_union_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/simple_union_test.cpp new file mode 100644 index 00000000000000..7a5fd6272f4a69 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/simple_union_test.cpp @@ -0,0 +1,568 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/query_v2/union/simple_union.h" + +#include + +#include "olap/rowset/segment_v2/inverted_index/query_v2/doc_set.h" + +namespace doris { + +using namespace segment_v2::inverted_index::query_v2; + +class SimpleUnionTest : public testing::Test { +public: + void SetUp() override {} + void TearDown() override {} +}; + +TEST_F(SimpleUnionTest, EmptyUnion) { + std::vector docsets; + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), TERMINATED); + EXPECT_EQ(union_docset->advance(), TERMINATED); + EXPECT_EQ(union_docset->size_hint(), 0); +} + +TEST_F(SimpleUnionTest, SingleDocSet) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10, 15})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->advance(), 15); + EXPECT_EQ(union_docset->advance(), TERMINATED); +} + +TEST_F(SimpleUnionTest, TwoDisjointDocSets) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 3, 5})); + 
docsets.push_back(std::make_shared(std::vector {2, 4, 6})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->advance(), 2); + EXPECT_EQ(union_docset->advance(), 3); + EXPECT_EQ(union_docset->advance(), 4); + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->advance(), 6); + EXPECT_EQ(union_docset->advance(), TERMINATED); +} + +TEST_F(SimpleUnionTest, TwoOverlappingDocSets) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 3, 5, 7})); + docsets.push_back(std::make_shared(std::vector {3, 5, 9})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + // Union should return unique docs in order: 1, 3, 5, 7, 9 + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->advance(), 3); + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->advance(), 7); + EXPECT_EQ(union_docset->advance(), 9); + EXPECT_EQ(union_docset->advance(), TERMINATED); +} + +TEST_F(SimpleUnionTest, MultipleDocSets) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 10, 20})); + docsets.push_back(std::make_shared(std::vector {5, 15, 25})); + docsets.push_back(std::make_shared(std::vector {3, 13, 23})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->advance(), 3); + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->advance(), 13); + EXPECT_EQ(union_docset->advance(), 15); + EXPECT_EQ(union_docset->advance(), 20); + EXPECT_EQ(union_docset->advance(), 23); + EXPECT_EQ(union_docset->advance(), 25); + EXPECT_EQ(union_docset->advance(), TERMINATED); +} + +TEST_F(SimpleUnionTest, SeekToExactDoc) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10, 15})); + docsets.push_back(std::make_shared(std::vector {2, 6, 12, 16})); + + auto union_docset = 
SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->seek(10), 10); + EXPECT_EQ(union_docset->doc(), 10); +} + +TEST_F(SimpleUnionTest, SeekToNonExistentDoc) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10, 15})); + docsets.push_back(std::make_shared(std::vector {2, 6, 12, 16})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + // Seek to 7 should return the next available doc (10) + EXPECT_EQ(union_docset->seek(7), 10); + EXPECT_EQ(union_docset->doc(), 10); +} + +TEST_F(SimpleUnionTest, SeekBeyondAllDocs) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10})); + docsets.push_back(std::make_shared(std::vector {2, 6, 12})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->seek(100), TERMINATED); + EXPECT_EQ(union_docset->doc(), TERMINATED); +} + +TEST_F(SimpleUnionTest, SeekThenAdvance) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10, 15, 20})); + docsets.push_back(std::make_shared(std::vector {2, 6, 12, 16, 22})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->seek(8), 10); + EXPECT_EQ(union_docset->advance(), 12); + EXPECT_EQ(union_docset->advance(), 15); + EXPECT_EQ(union_docset->advance(), 16); +} + +TEST_F(SimpleUnionTest, SizeHint) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 2, 3}, 10)); + docsets.push_back(std::make_shared(std::vector {4, 5}, 20)); + docsets.push_back(std::make_shared(std::vector {6, 7, 8, 9}, 15)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + // Size hint should be the maximum of all docsets (20) + EXPECT_EQ(union_docset->size_hint(), 20); +} + +TEST_F(SimpleUnionTest, OneEmptyDocSet) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10})); + 
docsets.push_back(std::make_shared(std::vector {})); // Empty + docsets.push_back(std::make_shared(std::vector {2, 6, 12})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + // Should work correctly, ignoring the empty docset + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->advance(), 2); + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->advance(), 6); + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->advance(), 12); + EXPECT_EQ(union_docset->advance(), TERMINATED); +} + +TEST_F(SimpleUnionTest, AllEmptyDocSets) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {})); + docsets.push_back(std::make_shared(std::vector {})); + docsets.push_back(std::make_shared(std::vector {})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), TERMINATED); + EXPECT_EQ(union_docset->advance(), TERMINATED); +} + +TEST_F(SimpleUnionTest, IdenticalDocSets) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10})); + docsets.push_back(std::make_shared(std::vector {1, 5, 10})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + // Should return each doc once even though they appear in both sets + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->advance(), TERMINATED); +} + +TEST_F(SimpleUnionTest, LargeNumberOfDocSets) { + std::vector docsets; + // Create 10 docsets with docs at multiples of their index + for (int i = 1; i <= 10; ++i) { + std::vector docs; + for (int j = 1; j <= 5; ++j) { + docs.push_back(i * j); + } + docsets.push_back(std::make_shared(docs)); + } + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + // Should start with smallest doc (1) + EXPECT_EQ(union_docset->doc(), 1); + + // Count total unique docs + int count = 1; + while (union_docset->advance() != TERMINATED) { + 
count++; + } + + // We should have multiple docs (exact count depends on overlaps) + EXPECT_GT(count, 10); +} + +TEST_F(SimpleUnionTest, SeekToCurrentDoc) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10, 15})); + docsets.push_back(std::make_shared(std::vector {2, 6, 12, 16})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->advance(), 2); + EXPECT_EQ(union_docset->doc(), 2); + + // Seek to current doc should stay at current + EXPECT_EQ(union_docset->seek(2), 2); + EXPECT_EQ(union_docset->doc(), 2); +} + +TEST_F(SimpleUnionTest, SeekBackwards) { + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10, 15})); + docsets.push_back(std::make_shared(std::vector {2, 6, 12, 16})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->seek(10), 10); + EXPECT_EQ(union_docset->doc(), 10); + + // Seek backwards should return next available doc (which is 10 or later) + EXPECT_EQ(union_docset->seek(5), 10); + EXPECT_EQ(union_docset->doc(), 10); +} + +TEST_F(SimpleUnionTest, FreqWithSingleDocSet) { + std::map> positions1 = { + {1, {0, 5, 10}}, // doc 1 has 3 positions + {5, {2, 8}}, // doc 5 has 2 positions + {10, {1, 3, 7, 9}} // doc 10 has 4 positions + }; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10}, positions1)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->freq(), 3); + + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->freq(), 2); + + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->freq(), 4); +} + +TEST_F(SimpleUnionTest, FreqWithMultipleDocSetsDisjoint) { + std::map> positions1 = {{1, {0, 2}}, {5, {1, 3, 5}}}; + std::map> positions2 = {{10, {0, 1, 2}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5}, 
positions1)); + 
docsets.push_back(std::make_shared(std::vector {10}, positions2)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->freq(), 2); + + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->freq(), 3); + + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->freq(), 3); +} + +TEST_F(SimpleUnionTest, FreqWithOverlappingDocs) { + std::map> positions1 = {{1, {0, 2}}, {5, {1, 3, 5}}}; + std::map> positions2 = {{5, {2, 4}}, {10, {0, 1, 2}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5}, positions1)); + docsets.push_back(std::make_shared(std::vector {5, 10}, positions2)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->freq(), 2); + + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->freq(), 5); // 3 + 2 = 5 + + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->freq(), 3); +} + +TEST_F(SimpleUnionTest, FreqWithThreeDocSets) { + std::map> positions1 = {{5, {0, 1}}}; + std::map> positions2 = {{5, {2, 3, 4}}}; + std::map> positions3 = {{5, {5}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {5}, positions1)); + docsets.push_back(std::make_shared(std::vector {5}, positions2)); + docsets.push_back(std::make_shared(std::vector {5}, positions3)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 5); + EXPECT_EQ(union_docset->freq(), 6); // 2 + 3 + 1 = 6 +} + +TEST_F(SimpleUnionTest, FreqAfterSeek) { + std::map> positions = { + {1, {0, 1}}, {5, {2, 3, 4}}, {10, {5, 6}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5, 10}, positions)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->seek(5), 5); + EXPECT_EQ(union_docset->freq(), 3); +} + +TEST_F(SimpleUnionTest, 
NormWithSingleDocSet) { + std::vector docsets; + docsets.push_back(std::make_shared( + std::vector {1, 5, 10}, std::map> {}, 0, 42)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->norm(), 42); + + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->norm(), 42); +} + +TEST_F(SimpleUnionTest, NormReturnsFirstMatchingDocSet) { + std::vector docsets; + docsets.push_back(std::make_shared( + std::vector {1, 5}, std::map> {}, 0, 10)); + docsets.push_back(std::make_shared( + std::vector {5, 10}, std::map> {}, 0, 20)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->norm(), 10); + + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->norm(), 10); + + EXPECT_EQ(union_docset->advance(), 10); + EXPECT_EQ(union_docset->norm(), 20); +} + +TEST_F(SimpleUnionTest, NormWithDifferentNorms) { + std::vector docsets; + docsets.push_back(std::make_shared( + std::vector {1, 3}, std::map> {}, 0, 5)); + docsets.push_back(std::make_shared( + std::vector {2, 4}, std::map> {}, 0, 15)); + docsets.push_back(std::make_shared( + std::vector {3, 5}, std::map> {}, 0, 25)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + EXPECT_EQ(union_docset->norm(), 5); + + EXPECT_EQ(union_docset->advance(), 2); + EXPECT_EQ(union_docset->norm(), 15); + + EXPECT_EQ(union_docset->advance(), 3); + EXPECT_EQ(union_docset->norm(), 5); + + EXPECT_EQ(union_docset->advance(), 4); + EXPECT_EQ(union_docset->norm(), 15); + + EXPECT_EQ(union_docset->advance(), 5); + EXPECT_EQ(union_docset->norm(), 25); +} + +TEST_F(SimpleUnionTest, AppendPositionsSingleDocSet) { + std::map> positions = {{1, {0, 5, 10}}, {5, {2, 8, 15}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5}, positions)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + 
+ EXPECT_EQ(union_docset->doc(), 1); + + std::vector output; + union_docset->append_positions_with_offset(0, output); + EXPECT_EQ(output, (std::vector {0, 5, 10})); + + EXPECT_EQ(union_docset->advance(), 5); + output.clear(); + union_docset->append_positions_with_offset(0, output); + EXPECT_EQ(output, (std::vector {2, 8, 15})); +} + +TEST_F(SimpleUnionTest, AppendPositionsWithOffset) { + std::map> positions = {{1, {0, 2, 4}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1}, positions)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + + std::vector output; + union_docset->append_positions_with_offset(10, output); + EXPECT_EQ(output, (std::vector {10, 12, 14})); +} + +TEST_F(SimpleUnionTest, AppendPositionsMergeAndDeduplicate) { + std::map> positions1 = {{5, {1, 3, 5, 7}}}; + std::map> positions2 = {{5, {3, 5, 9, 11}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {5}, positions1)); + docsets.push_back(std::make_shared(std::vector {5}, positions2)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 5); + + std::vector output; + union_docset->append_positions_with_offset(0, output); + + EXPECT_EQ(output, (std::vector {1, 3, 5, 7, 9, 11})); +} + +TEST_F(SimpleUnionTest, AppendPositionsMultipleCalls) { + std::map> positions = {{1, {0, 5}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1}, positions)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + + std::vector output = {100, 200}; + union_docset->append_positions_with_offset(0, output); + + EXPECT_EQ(output, (std::vector {100, 200, 0, 5})); +} + +TEST_F(SimpleUnionTest, AppendPositionsThreeDocSets) { + std::map> positions1 = {{10, {0, 10}}}; + std::map> positions2 = {{10, {5, 15}}}; + std::map> positions3 = {{10, {10, 20}}}; + + std::vector docsets; + 
docsets.push_back(std::make_shared(std::vector {10}, positions1)); + docsets.push_back(std::make_shared(std::vector {10}, positions2)); + docsets.push_back(std::make_shared(std::vector {10}, positions3)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 10); + + std::vector output; + union_docset->append_positions_with_offset(0, output); + + EXPECT_EQ(output, (std::vector {0, 5, 10, 15, 20})); +} + +TEST_F(SimpleUnionTest, AppendPositionsWithDifferentOffsets) { + std::map> positions1 = {{5, {0, 1}}}; + std::map> positions2 = {{5, {2, 3}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {5}, positions1)); + docsets.push_back(std::make_shared(std::vector {5}, positions2)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 5); + + std::vector output; + union_docset->append_positions_with_offset(100, output); + + EXPECT_EQ(output, (std::vector {100, 101, 102, 103})); +} + +TEST_F(SimpleUnionTest, AppendPositionsPartialMatch) { + std::map> positions1 = {{1, {0, 1}}, {5, {2, 3}}}; + std::map> positions2 = {{1, {4, 5}}, {10, {6, 7}}}; + + std::vector docsets; + docsets.push_back(std::make_shared(std::vector {1, 5}, positions1)); + docsets.push_back(std::make_shared(std::vector {1, 10}, positions2)); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + std::vector output1; + union_docset->append_positions_with_offset(0, output1); + EXPECT_EQ(output1, (std::vector {0, 1, 4, 5})); + + EXPECT_EQ(union_docset->advance(), 5); + std::vector output2; + union_docset->append_positions_with_offset(0, output2); + EXPECT_EQ(output2, (std::vector {2, 3})); + + EXPECT_EQ(union_docset->advance(), 10); + std::vector output3; + union_docset->append_positions_with_offset(0, output3); + EXPECT_EQ(output3, (std::vector {6, 7})); +} + +TEST_F(SimpleUnionTest, AppendPositionsEmptyPositions) { + std::vector docsets; + 
docsets.push_back(std::make_shared(std::vector {1, 5})); + + auto union_docset = SimpleUnion::create(std::move(docsets)); + + EXPECT_EQ(union_docset->doc(), 1); + + std::vector output; + union_docset->append_positions_with_offset(0, output); + + EXPECT_TRUE(output.empty()); +} + +} // namespace doris \ No newline at end of file