From 0c9bf52905a5c6d68baa0135734dcc08b5a3f8c8 Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Tue, 14 May 2024 12:26:36 +0400 Subject: [PATCH 1/5] advanced search correctly searches content for search__content, we must search: content, pages.body, and provisions.body --- peachjam_search/views.py | 131 ++++++++++++++++++++++++++++----------- 1 file changed, 94 insertions(+), 37 deletions(-) diff --git a/peachjam_search/views.py b/peachjam_search/views.py index a7461e465..8baabaf2a 100644 --- a/peachjam_search/views.py +++ b/peachjam_search/views.py @@ -51,8 +51,35 @@ class MainSearchBackend(BaseSearchFilterBackend): It is a combination of MUST (AND) queries and SHOULD (OR) queries. """ + pages_inner_hits = { + "_source": ["pages.page_num"], + "highlight": { + "fields": {"pages.body": {}}, + "pre_tags": [""], + "post_tags": [""], + "fragment_size": 80, + "number_of_fragments": 2, + }, + } + + provisions_inner_hits = { + "_source": [ + "provisions.title", + "provisions.id", + "provisions.parent_titles", + "provisions.parent_ids", + ], + "highlight": { + "fields": {"provisions.body": {}}, + "pre_tags": [""], + "post_tags": [""], + "fragment_size": 80, + "number_of_fragments": 2, + }, + } + def get_field(self, view, field): - options = view.search_fields[field] or {} + options = view.search_fields.get(field, {}) or {} if "boost" in options: return f'{field}^{options["boost"]}' return field @@ -60,8 +87,10 @@ def get_field(self, view, field): def filter_queryset(self, request, queryset, view): """Build the actual search queries.""" must_queries = [] - must_queries.extend(self.build_per_field_queries(request, view)) must_queries.extend(self.build_rank_feature_queries(request, view)) + must_queries.extend(self.build_per_field_queries(request, view)) + must_queries.extend(self.build_content_advanced_queries(request, view)) + # TODO: handle "search" field for advanced queries, which must use AND semantics within each field should_queries = [] should_queries.extend(self.build_basic_queries(request, view)) @@ -88,20 +117,24 @@ def build_rank_feature_queries(self, request, view): def build_per_field_queries(self, request, view): """Supports searching across multiple fields. Specify zero or more query parameters such as search__title=foo""" - query_params = {} + queries = [] + for field in view.search_fields.keys(): + if field == "content": + # advanced search on the "content" field (which must include pages and provisions too), is handled + # by the + continue query = request.query_params.get(self.search_param + "__" + field) if query: - query_params[field] = query + queries.append( + SimpleQueryString( + query=query, + fields=[self.get_field(view, field)], + **view.advanced_simple_query_string_options, + ) + ) - return [ - SimpleQueryString( - query=search_term, - fields=[self.get_field(view, field)], - **view.simple_query_string_options, - ) - for field, search_term in query_params.items() - ] + return queries def build_basic_queries(self, request, view): """This implements a simple_query_string query across multiple fields, using AND logic for the terms @@ -129,6 +162,49 @@ def build_content_phrase_queries(self, request, view): return [] return [MatchPhrase(content={"query": search_term, "slop": 2})] + def build_content_advanced_queries(self, request, view): + """Adds advanced search queries for search__content, which searches across content, pages.body and + provisions.body.""" + query = request.query_params.get(self.search_param + "__content") + if query: + return [ + Q( + "bool", + minimum_should_match=1, + should=[ + # content + SimpleQueryString( + query=query, + fields=[self.get_field(view, "pages.body")], + **view.advanced_simple_query_string_options, + ), + # pages.body + Q( + "nested", + path="pages", + inner_hits=self.pages_inner_hits, + query=SimpleQueryString( + query=query, + fields=[self.get_field(view, "pages.body")], + **view.advanced_simple_query_string_options, + ), + ), + # provisions.body + Q( + "nested", + path="provisions", + inner_hits=self.provisions_inner_hits, + query=SimpleQueryString( + query=query, + fields=[self.get_field(view, "provisions.body")], + **view.advanced_simple_query_string_options, + ), + ), + ], + ) + ] + return [] + def build_nested_page_queries(self, request, view): """Does a nested page search, and includes highlights.""" search_term = " ".join(self.get_search_query_params(request)) @@ -139,6 +215,7 @@ def build_nested_page_queries(self, request, view): Q( "nested", path="pages", + inner_hits=self.pages_inner_hits, query=Q( "bool", must=[ @@ -153,16 +230,6 @@ def build_nested_page_queries(self, request, view): MatchPhrase(pages__body={"query": search_term, "slop": 2}), ], ), - inner_hits={ - "_source": ["pages.page_num"], - "highlight": { - "fields": {"pages.body": {}}, - "pre_tags": [""], - "post_tags": [""], - "fragment_size": 80, - "number_of_fragments": 2, - }, - }, ) ] @@ -176,6 +243,7 @@ def build_nested_provision_queries(self, request, view): Q( "nested", path="provisions", + inner_hits=self.provisions_inner_hits, query=Q( "bool", should=[ @@ -193,21 +261,6 @@ def build_nested_provision_queries(self, request, view): ), ], ), - inner_hits={ - "_source": [ - "provisions.title", - "provisions.id", - "provisions.parent_titles", - "provisions.parent_ids", - ], - "highlight": { - "fields": {"provisions.body": {}}, - "pre_tags": [""], - "post_tags": [""], - "fragment_size": 80, - "number_of_fragments": 2, - }, - }, ) ] @@ -260,6 +313,10 @@ class DocumentSearchViewSet(BaseDocumentViewSet): "default_operator": "OR", "minimum_should_match": "70%", } + # how to treat queries for advanced search: AND + advanced_simple_query_string_options = { + "default_operator": "AND", + } filter_fields = { "authors": "authors", From 623ca78a00c4c7dd40a5bc725cf4b4e4fde6185b Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Tue, 14 May 2024 12:29:59 +0400 Subject: [PATCH 2/5] simplify query term fetching --- peachjam_search/views.py | 41 ++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/peachjam_search/views.py b/peachjam_search/views.py index 8baabaf2a..a14730485 100644 --- a/peachjam_search/views.py +++ b/peachjam_search/views.py @@ -78,6 +78,8 @@ class MainSearchBackend(BaseSearchFilterBackend): }, } + query = None + def get_field(self, view, field): options = view.search_fields.get(field, {}) or {} if "boost" in options: @@ -86,6 +88,9 @@ def get_field(self, view, field): def filter_queryset(self, request, queryset, view): """Build the actual search queries.""" + # the basic query for a simple search + self.query = " ".join(self.get_search_query_params(request)) + must_queries = [] must_queries.extend(self.build_rank_feature_queries(request, view)) must_queries.extend(self.build_per_field_queries(request, view)) @@ -139,17 +144,18 @@ def build_per_field_queries(self, request, view): def build_basic_queries(self, request, view): """This implements a simple_query_string query across multiple fields, using AND logic for the terms in a field, but effectively OR (should) logic between the fields.""" + if not self.query: + return [] + query_fields = [ self.get_field(view, field) for field, options in view.search_fields.items() ] - query_terms = self.get_search_query_params(request) queries = [ SimpleQueryString( - query=search_term, + query=self.query, fields=[field], **view.simple_query_string_options, ) - for search_term in query_terms[:1] for field in query_fields ] @@ -157,10 +163,9 @@ def build_basic_queries(self, request, view): def build_content_phrase_queries(self, request, view): """Adds a best-effort phrase match query on the content field.""" - search_term = " ".join(self.get_search_query_params(request)) - if not search_term: + if not self.query: return [] - return [MatchPhrase(content={"query": search_term, "slop": 2})] + return [MatchPhrase(content={"query": self.query, "slop": 2})] def build_content_advanced_queries(self, request, view): """Adds advanced search queries for search__content, which searches across content, pages.body and @@ -175,7 +180,7 @@ def build_content_advanced_queries(self, request, view): # content SimpleQueryString( query=query, - fields=[self.get_field(view, "pages.body")], + fields=[self.get_field(view, "content")], **view.advanced_simple_query_string_options, ), # pages.body @@ -185,7 +190,8 @@ def build_content_advanced_queries(self, request, view): inner_hits=self.pages_inner_hits, query=SimpleQueryString( query=query, - fields=[self.get_field(view, "pages.body")], + fields=["pages.body"], + quote_field_suffix=".exact", **view.advanced_simple_query_string_options, ), ), @@ -196,7 +202,8 @@ def build_content_advanced_queries(self, request, view): inner_hits=self.provisions_inner_hits, query=SimpleQueryString( query=query, - fields=[self.get_field(view, "provisions.body")], + fields=["provisions.body"], + quote_field_suffix=".exact", **view.advanced_simple_query_string_options, ), ), @@ -207,8 +214,7 @@ def build_content_advanced_queries(self, request, view): def build_nested_page_queries(self, request, view): """Does a nested page search, and includes highlights.""" - search_term = " ".join(self.get_search_query_params(request)) - if not search_term: + if not self.query: return [] return [ @@ -220,14 +226,14 @@ def build_nested_page_queries(self, request, view): "bool", must=[ SimpleQueryString( - query=search_term, + query=self.query, fields=["pages.body"], quote_field_suffix=".exact", **view.simple_query_string_options, ) ], should=[ - MatchPhrase(pages__body={"query": search_term, "slop": 2}), + MatchPhrase(pages__body={"query": self.query, "slop": 2}), ], ), ) @@ -235,8 +241,7 @@ def build_nested_page_queries(self, request, view): def build_nested_provision_queries(self, request, view): """Does a nested provision search, and includes highlights.""" - search_term = " ".join(self.get_search_query_params(request)) - if not search_term: + if not self.query: return [] return [ @@ -247,15 +252,15 @@ def build_nested_provision_queries(self, request, view): query=Q( "bool", should=[ - MatchPhrase(provisions__body={"query": search_term, "slop": 2}), + MatchPhrase(provisions__body={"query": self.query, "slop": 2}), SimpleQueryString( - query=search_term, + query=self.query, fields=["provisions.body"], quote_field_suffix=".exact", **view.simple_query_string_options, ), SimpleQueryString( - query=search_term, + query=self.query, fields=["provisions.title^4", "provisions.parent_titles^2"], **view.simple_query_string_options, ), From 99d589f2e3c90d0a46cc18060da79beec5ce93b5 Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Tue, 14 May 2024 13:06:32 +0400 Subject: [PATCH 3/5] generate per-field queries in one go --- .../js/components/FindDocuments/index.vue | 73 ++++++++++--------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/peachjam/js/components/FindDocuments/index.vue b/peachjam/js/components/FindDocuments/index.vue index bd19d765a..8c26d6407 100644 --- a/peachjam/js/components/FindDocuments/index.vue +++ b/peachjam/js/components/FindDocuments/index.vue @@ -609,6 +609,12 @@ export default { params.append('facet', facet.name); }); + this.generateAdvancedSearchParams(params); + + return params; + }, + + generateAdvancedSearchParams (params) { // advanced search fields, if any if (this.advancedSearchDateCriteria.date_from && this.advancedSearchDateCriteria.date_to) { const dateFrom = this.advancedSearchDateCriteria.date_from; @@ -620,9 +626,40 @@ export default { params.append('date__lte', this.advancedSearchDateCriteria.date_to); } - this.advancedSearchCriteria.forEach(criterion => this.addAdvancedSearchParams(criterion, params)); + // group criteria by fields and process each field separately + const fields = new Map(); + for (const criterion of this.advancedSearchCriteria) { + if (criterion.text) { + for (const field of criterion.fields.length ? criterion.fields : ['']) { + if (!fields.has(field)) fields.set(field, []); + fields.get(field).push(criterion); + } + } + } - return params; + for (const [field, criteria] of fields) { + params.set(field === '' ? 'search': `search__${field}`, this.generateAdvancedSearchQuery(criteria)); + } + }, + + generateAdvancedSearchQuery (criteria) { + let q = ''; + + for (const criterion of criteria) { + const text = criterion.exact ? `"${criterion.text}"` : criterion.text; + + if (criterion.condition === 'AND') { + q = q + ' & '; + } else if (criterion.condition === 'OR') { + q = q + ' | '; + } else if (criterion.condition === 'NOT') { + q = q + ' -'; + } + + q = q + `(${text})`; + } + + return q.trim(); }, async search (pushState = true) { @@ -734,38 +771,6 @@ export default { }; }, - addAdvancedSearchParams (criterion, params) { - if (!criterion.text) return; - - let q = ''; - const text = criterion.exact ? `"${criterion.text}"` : criterion.text; - let splitValue = text.match(/\w+|"[^"]+"/g); - - if (criterion.condition === 'AND') { - const tokens = []; - - splitValue.forEach((value) => { - if (value.startsWith('"')) { - tokens.push(value); - } else { - tokens.push('"' + value + '"'); - } - }); - - splitValue = tokens.join(' '); - } else if (criterion.condition === 'OR') { - splitValue = `(|${splitValue.join('|')})`; - } else if (criterion.condition === 'NOT') { - splitValue = splitValue.map((value) => `-${value}`).join(' '); - } else splitValue = splitValue.join(' '); - - q = q + ' ' + splitValue.trim(); - - const fields = criterion.fields.length ? criterion.fields.map(f => `search__${f}`) : ['search']; - for (const field of fields) { - params.set(field, (params.get(field)?.trim() || '') + ' ' + q.trim()); - } - } } }; From ae91e2522f3c666a393d67a4a40bde637988c3f2 Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Tue, 14 May 2024 13:25:21 +0400 Subject: [PATCH 4/5] use search__all to differentiate advanced from simple search otherwise, we can't tell when to do a broad search vs an advanced search if we don't have explicit search fields --- .../FindDocuments/AdvancedSearch.vue | 2 +- .../FindDocuments/AdvancedSearchFields.vue | 34 ++++-- .../js/components/FindDocuments/index.vue | 8 +- peachjam_search/views.py | 110 +++++++++++++----- 4 files changed, 108 insertions(+), 46 deletions(-) diff --git a/peachjam/js/components/FindDocuments/AdvancedSearch.vue b/peachjam/js/components/FindDocuments/AdvancedSearch.vue index f2ae4bf71..bab8d9e2b 100644 --- a/peachjam/js/components/FindDocuments/AdvancedSearch.vue +++ b/peachjam/js/components/FindDocuments/AdvancedSearch.vue @@ -147,7 +147,7 @@ export default { // ensure there's an empty entry at the end if (data[data.length - 1].text) { - data.push({ text: '', fields: [], exact: false, condition: 'AND' }); + data.push({ text: '', fields: ['all'], exact: false, condition: 'AND' }); } this.$emit('update:modelValue', data); diff --git a/peachjam/js/components/FindDocuments/AdvancedSearchFields.vue b/peachjam/js/components/FindDocuments/AdvancedSearchFields.vue index 49699308b..6d97cc6ee 100644 --- a/peachjam/js/components/FindDocuments/AdvancedSearchFields.vue +++ b/peachjam/js/components/FindDocuments/AdvancedSearchFields.vue @@ -50,7 +50,8 @@ > { return { fields: [{ - field: 'ANY', + field: 'all', label: self.$t('Any field') },{ field: 'title', @@ -126,15 +127,28 @@ export default { this.$emit('on-change'); }, fieldChanged (field, checked) { - if (field === 'ANY') { - this.criterion.fields = []; - } else { + if (field === 'all') { if (checked) { - if (!this.criterion.fields.includes(field)) { - this.criterion.fields.push(field); - } - } else { - this.criterion.fields = this.criterion.fields.filter((f) => f !== field); + // only 'all' + this.criterion.fields.splice(0, this.criterion.fields.length); + this.criterion.fields.push(field); + } else if (this.criterion.fields.includes(field) && this.criterion.fields.length > 1) { + // remove it + this.criterion.fields.splice(this.criterion.fields.indexOf(field), 1); + } + } else if (checked) { + // add it + if (!this.criterion.fields.includes(field)) { + this.criterion.fields.push(field); + } + if (this.criterion.fields.includes('all')) { + this.criterion.fields.splice(this.criterion.fields.indexOf('all'), 1); + } + } else if (this.criterion.fields.includes(field)) { + // remove it + this.criterion.fields.splice(this.criterion.fields.indexOf(field), 1); + if (this.criterion.fields.length === 0) { + this.criterion.fields.push('all'); } } this.changed(); diff --git a/peachjam/js/components/FindDocuments/index.vue b/peachjam/js/components/FindDocuments/index.vue index 8c26d6407..28f6b8480 100644 --- a/peachjam/js/components/FindDocuments/index.vue +++ b/peachjam/js/components/FindDocuments/index.vue @@ -630,7 +630,7 @@ export default { const fields = new Map(); for (const criterion of this.advancedSearchCriteria) { if (criterion.text) { - for (const field of criterion.fields.length ? criterion.fields : ['']) { + for (const field of criterion.fields) { if (!fields.has(field)) fields.set(field, []); fields.get(field).push(criterion); } @@ -638,7 +638,7 @@ export default { } for (const [field, criteria] of fields) { - params.set(field === '' ? 'search': `search__${field}`, this.generateAdvancedSearchQuery(criteria)); + params.set(`search__${field}`, this.generateAdvancedSearchQuery(criteria)); } }, @@ -754,13 +754,13 @@ export default { resetAdvancedFields () { this.advancedSearchCriteria = [{ text: '', - fields: [], + fields: ['all'], condition: '', exact: false }, { text: '', - fields: [], + fields: ['all'], condition: 'AND', exact: false }]; diff --git a/peachjam_search/views.py b/peachjam_search/views.py index a14730485..5ad2a8eac 100644 --- a/peachjam_search/views.py +++ b/peachjam_search/views.py @@ -94,8 +94,6 @@ def filter_queryset(self, request, queryset, view): must_queries = [] must_queries.extend(self.build_rank_feature_queries(request, view)) must_queries.extend(self.build_per_field_queries(request, view)) - must_queries.extend(self.build_content_advanced_queries(request, view)) - # TODO: handle "search" field for advanced queries, which must use AND semantics within each field should_queries = [] should_queries.extend(self.build_basic_queries(request, view)) @@ -103,6 +101,10 @@ def filter_queryset(self, request, queryset, view): should_queries.extend(self.build_nested_page_queries(request, view)) should_queries.extend(self.build_nested_provision_queries(request, view)) + # these handle advanced search + must_queries.extend(self.build_advanced_all_queries(request, view)) + must_queries.extend(self.build_advanced_content_queries(request, view)) + return queryset.query( "bool", must=must_queries, @@ -167,50 +169,96 @@ def build_content_phrase_queries(self, request, view): return [] return [MatchPhrase(content={"query": self.query, "slop": 2})] - def build_content_advanced_queries(self, request, view): + def build_advanced_all_queries(self, request, view): + """Build queries for search__all (advanced search across all fields). Similar logic to build_basic_queries, + but all terms are required by default.""" + query = request.query_params.get(self.search_param + "__all") + if not query: + return [] + + query_fields = [ + self.get_field(view, field) for field, options in view.search_fields.items() + ] + return [ + Q( + "bool", + minimum_should_match=1, + should=[ + SimpleQueryString( + query=query, + fields=[field], + **view.advanced_simple_query_string_options, + ) + for field in query_fields + ] + + self.build_advanced_content_query(view, query), + ) + ] + + def build_advanced_content_queries(self, request, view): """Adds advanced search queries for search__content, which searches across content, pages.body and provisions.body.""" query = request.query_params.get(self.search_param + "__content") + + # don't allow search__content and search__all to clash, only one is needed to search content fields + if query and request.query_params.get(self.search_param + "__all"): + return [] + if query: return [ Q( "bool", minimum_should_match=1, + should=self.build_advanced_content_query(view, query), + ) + ] + return [] + + def build_advanced_content_query(self, view, query): + # TODO: negative queries don't work, because they must be applied to the whole content, not just a + # particular page or provision + return [ + # content + SimpleQueryString( + query=query, + fields=["content"], + **view.advanced_simple_query_string_options, + ), + # pages.body + Q( + "nested", + path="pages", + inner_hits=self.pages_inner_hits, + query=SimpleQueryString( + query=query, + fields=["pages.body"], + quote_field_suffix=".exact", + **view.advanced_simple_query_string_options, + ), + ), + # provisions.body + Q( + "nested", + path="provisions", + inner_hits=self.provisions_inner_hits, + query=Q( + "bool", should=[ - # content SimpleQueryString( query=query, - fields=[self.get_field(view, "content")], + fields=["provisions.body"], + quote_field_suffix=".exact", **view.advanced_simple_query_string_options, ), - # pages.body - Q( - "nested", - path="pages", - inner_hits=self.pages_inner_hits, - query=SimpleQueryString( - query=query, - fields=["pages.body"], - quote_field_suffix=".exact", - **view.advanced_simple_query_string_options, - ), - ), - # provisions.body - Q( - "nested", - path="provisions", - inner_hits=self.provisions_inner_hits, - query=SimpleQueryString( - query=query, - fields=["provisions.body"], - quote_field_suffix=".exact", - **view.advanced_simple_query_string_options, - ), + SimpleQueryString( + query=self.query, + fields=["provisions.title^4", "provisions.parent_titles^2"], + **view.advanced_simple_query_string_options, ), ], - ) - ] - return [] + ), + ), + ] def build_nested_page_queries(self, request, view): """Does a nested page search, and includes highlights.""" From 20d82735297af80cdb4cf497c118c6610be858b9 Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Tue, 14 May 2024 13:40:06 +0200 Subject: [PATCH 5/5] typo --- peachjam_search/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/peachjam_search/views.py b/peachjam_search/views.py index 5ad2a8eac..46148cf59 100644 --- a/peachjam_search/views.py +++ b/peachjam_search/views.py @@ -129,7 +129,7 @@ def build_per_field_queries(self, request, view): for field in view.search_fields.keys(): if field == "content": # advanced search on the "content" field (which must include pages and provisions too), is handled - # by the + # by build_advanced_content_queries continue query = request.query_params.get(self.search_param + "__" + field) if query: