From ece67efbf39eddb384348f0f5fa1b4c67a3045b1 Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Fri, 19 Apr 2024 16:09:50 +0200 Subject: [PATCH 1/7] WIP of indexing provisions for search --- peachjam/models/core_document_model.py | 3 +- peachjam/xmlutils.py | 8 ++++ peachjam_search/documents.py | 58 +++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 peachjam/xmlutils.py diff --git a/peachjam/models/core_document_model.py b/peachjam/models/core_document_model.py index 323488d59..07e005875 100644 --- a/peachjam/models/core_document_model.py +++ b/peachjam/models/core_document_model.py @@ -37,6 +37,7 @@ from peachjam.models.settings import pj_settings from peachjam.pipelines import DOC_MIMETYPES, word_pipeline from peachjam.storage import DynamicStorageFileField +from peachjam.xmlutils import parse_html_str class Label(models.Model): @@ -905,7 +906,7 @@ def update_or_create_for_document(cls, document): text = "" if document.content_html: # it's html, grab the text from the html tree - root = html.fromstring(document.content_html) + root = parse_html_str(document.content_html) text = " ".join(root.itertext()) elif hasattr(document, "source_file"): diff --git a/peachjam/xmlutils.py b/peachjam/xmlutils.py new file mode 100644 index 000000000..cde2229fa --- /dev/null +++ b/peachjam/xmlutils.py @@ -0,0 +1,8 @@ +import lxml.html + +html_parser = lxml.html.HTMLParser(encoding="utf-8") + + +def parse_html_str(html): + """Encode HTML into utf-8 bytes and parse.""" + return lxml.html.fromstring(html.encode("utf-8"), parser=html_parser) diff --git a/peachjam_search/documents.py b/peachjam_search/documents.py index ec349d885..521a60223 100644 --- a/peachjam_search/documents.py +++ b/peachjam_search/documents.py @@ -21,6 +21,7 @@ OrderOutcome, Taxonomy, ) +from peachjam.xmlutils import parse_html_str log = logging.getLogger(__name__) @@ -108,6 +109,16 @@ class SearchableDocument(Document): } ) + provisions = fields.NestedField( + properties={ + "title": fields.TextField(), + "id": fields.KeywordField(), + "parent_titles": fields.TextField(), + "parent_ids": fields.KeywordField(), + "body": fields.TextField(analyzer="standard", fields={"exact": Text()}), + } + ) + # this will be used to build prepare_xxx_xx fields for each of these translated_fields = [ ("court", "name"), @@ -218,7 +229,9 @@ def prepare_authors(self, instance): def prepare_content(self, instance): """Text content of document body for non-PDFs.""" - if instance.content_html: + if instance.content_html and ( + not instance.content_html_is_akn or not instance.toc_json + ): return instance.get_content_as_text() def prepare_ranking(self, instance): @@ -269,6 +282,49 @@ def prepare_pages(self, instance): pages.append({"page_num": i, "body": page}) return pages + def prepare_provisions(self, instance): + """Text content of provisions from AKN HTML.""" + + def prepare_provision(item, parents): + provision = None + provision_id = item["id"] or item["type"] + + # get the text of the provision + body = [] + for provision_el in root.xpath(f'//*[@id="{provision_id}"]'): + for el in provision_el: + # exclude headings so they aren't indexed twice + if el.tag not in ["h1", "h2", "h3", "h4", "h5"]: + body.append(" ".join(el.itertext())) + break + if body: + provision = { + "title": item["title"], + "id": provision_id, + "parent_titles": [ + p["title"] for p in parents if p["title"] and p["id"] + ], + "parent_ids": [p["id"] for p in parents if p["title"] and p["id"]], + "body": " ".join(body), + } + 
provisions.append(provision) + + # recurse into children + if not item["basic_unit"]: + if provision: + parents = parents + [provision] + for child in item["children"] or []: + prepare_provision(child, parents) + + if instance.content_html and instance.content_html_is_akn and instance.toc_json: + # index each provision separately + provisions = [] + root = parse_html_str(instance.content_html) + for item in instance.toc_json: + prepare_provision(item, []) + + return provisions + def prepare_taxonomies(self, instance): """Taxonomy topics are stored as slugs of all the items in the tree down to that topic. This is easier than storing and querying hierarchical taxonomy entries.""" From 13c74216ece816bc44a4dffae9edbc780e9b4d64 Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Fri, 19 Apr 2024 17:14:22 +0200 Subject: [PATCH 2/7] show provisions in search results --- .../components/FindDocuments/SearchResult.vue | 22 ++++++++- .../FindDocuments/SearchResultProvision.vue | 27 +++++++++++ peachjam_search/documents.py | 4 +- peachjam_search/serializers.py | 15 ++++++ peachjam_search/views.py | 48 ++++++++++++++++++- 5 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 peachjam/js/components/FindDocuments/SearchResultProvision.vue diff --git a/peachjam/js/components/FindDocuments/SearchResult.vue b/peachjam/js/components/FindDocuments/SearchResult.vue index 760878434..c90f937d3 100644 --- a/peachjam/js/components/FindDocuments/SearchResult.vue +++ b/peachjam/js/components/FindDocuments/SearchResult.vue @@ -53,6 +53,15 @@ +
import JsonTable from './JsonTable.vue'; +import SearchResultProvision from './SearchResultProvision.vue'; export default { name: 'SearchResult', components: { - JsonTable + JsonTable, + SearchResultProvision }, props: { item: { @@ -125,6 +136,15 @@ export default { return Array.isArray(item.authors) ? ', '.join(item.authors) : item.authors; } return ''; + }, + provisionParents (provision) { + // zip item.parent_titles and item.parent_ids + return provision.parent_titles.map((title, index) => { + return { + title: title, + id: provision.parent_ids[index] + }; + }); } } }; diff --git a/peachjam/js/components/FindDocuments/SearchResultProvision.vue b/peachjam/js/components/FindDocuments/SearchResultProvision.vue new file mode 100644 index 000000000..752916161 --- /dev/null +++ b/peachjam/js/components/FindDocuments/SearchResultProvision.vue @@ -0,0 +1,27 @@ + + + diff --git a/peachjam_search/documents.py b/peachjam_search/documents.py index 521a60223..333b39820 100644 --- a/peachjam_search/documents.py +++ b/peachjam_search/documents.py @@ -105,7 +105,7 @@ class SearchableDocument(Document): pages = fields.NestedField( properties={ "page_num": fields.IntegerField(), - "body": fields.TextField(analyzer="standard", fields={"exact": Text()}), + "body": fields.TextField(fields={"exact": Text()}), } ) @@ -115,7 +115,7 @@ class SearchableDocument(Document): "id": fields.KeywordField(), "parent_titles": fields.TextField(), "parent_ids": fields.KeywordField(), - "body": fields.TextField(analyzer="standard", fields={"exact": Text()}), + "body": fields.TextField(fields={"exact": Text()}), } ) diff --git a/peachjam_search/serializers.py b/peachjam_search/serializers.py index 7ae2ba0d6..b76d2a5fe 100644 --- a/peachjam_search/serializers.py +++ b/peachjam_search/serializers.py @@ -8,6 +8,7 @@ class SearchableDocumentSerializer(DocumentSerializer): id = CharField(source="meta.id") highlight = SerializerMethodField() pages = SerializerMethodField() + provisions = SerializerMethodField() court = SerializerMethodField() nature = SerializerMethodField() order_outcome = SerializerMethodField() @@ -66,6 +67,20 @@ def get_pages(self, obj): pages.append(info) return pages + def get_provisions(self, obj): + """Serialize nested provision hits and highlights.""" + provisions = [] + if hasattr(obj.meta, "inner_hits"): + for provision in obj.meta.inner_hits.provisions.hits.hits: + info = provision._source.to_dict() + info["highlight"] = ( + provision.highlight.to_dict() + if hasattr(provision, "highlight") + else {} + ) + provisions.append(info) + return provisions + def get_court(self, obj): return obj["court" + self.language_suffix] diff --git a/peachjam_search/views.py b/peachjam_search/views.py index 12ee4cb2e..cb7db0b7c 100644 --- a/peachjam_search/views.py +++ b/peachjam_search/views.py @@ -61,6 +61,7 @@ def filter_queryset(self, request, queryset, view): should_queries.extend(self.build_basic_queries(request, view)) should_queries.extend(self.build_content_phrase_queries(request, view)) should_queries.extend(self.build_nested_page_queries(request, view)) + should_queries.extend(self.build_nested_provision_queries(request, view)) return queryset.query( "bool", @@ -159,6 +160,51 @@ def build_nested_page_queries(self, request, view): ) ] + def build_nested_provision_queries(self, request, view): + """Does a nested provision search, and includes highlights.""" + search_term = " ".join(self.get_search_query_params(request)) + if not search_term: + return [] + + return [ + Q( + "nested", + path="provisions", + 
query=Q( + "bool", + should=[ + MatchPhrase(provisions__body={"query": search_term, "slop": 2}), + SimpleQueryString( + query=search_term, + fields=["provisions.body"], + quote_field_suffix=".exact", + **view.simple_query_string_options, + ), + SimpleQueryString( + query=search_term, + fields=["provisions.title^4", "provisions.parent_titles^2"], + **view.simple_query_string_options, + ), + ], + ), + inner_hits={ + "_source": [ + "provisions.title", + "provisions.id", + "provisions.parent_titles", + "provisions.parent_ids", + ], + "highlight": { + "fields": {"provisions.body": {}}, + "pre_tags": [""], + "post_tags": [""], + "fragment_size": 80, + "number_of_fragments": 2, + }, + }, + ) + ] + class SearchView(TemplateView): template_name = "peachjam_search/search.html" @@ -304,7 +350,7 @@ class DocumentSearchViewSet(BaseDocumentViewSet): } # TODO perhaps better to explicitly include specific fields - source = {"excludes": ["pages", "content", "flynote", "case_summary"]} + source = {"excludes": ["pages", "content", "flynote", "case_summary", "provisions"]} def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From ec7c7bd4cc103a1d3f76f9c115c15df0a5ada2e3 Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Fri, 19 Apr 2024 19:38:19 +0200 Subject: [PATCH 3/7] cards; padding; bug fixes --- .../components/FindDocuments/SearchResult.vue | 149 +++++++++--------- .../FindDocuments/SearchResultProvision.vue | 14 +- 2 files changed, 85 insertions(+), 78 deletions(-) diff --git a/peachjam/js/components/FindDocuments/SearchResult.vue b/peachjam/js/components/FindDocuments/SearchResult.vue index c90f937d3..2dc96022f 100644 --- a/peachjam/js/components/FindDocuments/SearchResult.vue +++ b/peachjam/js/components/FindDocuments/SearchResult.vue @@ -1,77 +1,82 @@
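
For context, a minimal standalone sketch of the provision-extraction approach that prepare_provisions() in patch 1 applies per toc_json item. This is not part of the patch series: the HTML fragment, the toc entry, and its field values below are illustrative assumptions, not data from the peachjam repository.

# Illustrative sketch only: simplified AKN-style HTML and a simplified toc entry.
import lxml.html

content_html = """
<div>
  <section id="sec_1">
    <h3>1. Definitions</h3>
    <p>In this Act, "record" means any recorded information.</p>
  </section>
</div>
"""

# Stand-in for one toc_json item; real items come from the document's table of contents.
toc_item = {
    "id": "sec_1",
    "type": "section",
    "title": "1. Definitions",
    "basic_unit": True,
    "children": [],
}

root = lxml.html.fromstring(content_html)
provisions = []

provision_id = toc_item["id"] or toc_item["type"]
body = []
for provision_el in root.xpath(f'//*[@id="{provision_id}"]'):
    for el in provision_el:
        # skip headings so the provision title isn't indexed twice
        if el.tag not in ("h1", "h2", "h3", "h4", "h5"):
            body.append(" ".join(el.itertext()))
    break

if body:
    provisions.append(
        {
            "title": toc_item["title"],
            "id": provision_id,
            "parent_titles": [],  # ancestor titles, for provisions nested below others
            "parent_ids": [],     # ancestor ids, matching parent_titles
            "body": " ".join(body),
        }
    )

print(provisions[0]["body"])
# In this Act, "record" means any recorded information.

Each such entry becomes one nested document under the "provisions" field of the search index, which is what the nested query and inner_hits block added in build_nested_provision_queries() search and highlight.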