Skip to content

Commit

Permalink
Merge pull request #1801 from laws-africa/search-index-toc
Browse files Browse the repository at this point in the history
Search legislation TOC
  • Loading branch information
longhotsummer authored Apr 29, 2024
2 parents 334a4f7 + d90fc9e commit 2a3a3cf
Show file tree
Hide file tree
Showing 9 changed files with 295 additions and 69 deletions.
153 changes: 89 additions & 64 deletions peachjam/js/components/FindDocuments/SearchResult.vue
Original file line number Diff line number Diff line change
@@ -1,80 +1,96 @@
<template>
<li class="mb-4 hit">
<a
class="h5 text-primary"
target="_blank"
rel="noreferrer"
:href="item.expression_frbr_uri"
v-html="item.highlight.title || item.title"
></a>
<div>
<span v-if="showJurisdiction || item.locality" class="me-3">
<span v-if="showJurisdiction" v-html="getFlag(item)" class="me-1" />
<span v-if="showJurisdiction">
{{ item.jurisdiction }}
<span v-if="item.locality">· </span>
</span>
<span v-if="item.locality">{{ item.locality }}</span>
</span>
<span class="me-3">{{ item.date }}</span>
<span class="me-3">{{ item.doc_type }}</span>
<a
v-if="debug"
class="me-3"
href="#"
@click.prevent="$emit('explain')"
>{{ item._score }}</a>
<span
v-if="item.court"
class="me-3"
>{{ item.court }}</span>
<span
v-if="item.authors"
class="me-3"
>{{ authors(item) }}</span>
</div>
<div v-if="item.citation && item.citation !== item.title">
<i>{{ item.citation }}</i>
</div>
<div>
{{ item.matter_type }}
</div>
<div v-if="labels">
<span v-for="label in labels" :key="label.code" :class="[ `badge rounded-pill bg-${label.level}` ]">{{ label.name }}</span>
</div>
<div v-if="item.pages.length" class="ms-3">
<div
v-for="(page, index) in item.pages"
:key="index"
>
<span>
<a :href="`${item.expression_frbr_uri}#page-${page.page_num}`">Page {{ page.page_num }}</a>:
</span>
<span v-if="page.highlight['pages.body']" v-html="page.highlight['pages.body'].join(' ... ')" />
</div>
</div>
<div v-else class="ms-3">
<span
class="snippet"
v-html="highlights(item)"
/>
</div>
<div v-if="debug && item.explanation" class="ms-3 mt-2">
<h5>Explanation</h5>
<div class="explanation border p-2">
<json-table :data="item.explanation" />
<div class="card">
<div class="card-body">
<h5 class="card-title">
<a
class="h5 text-primary"
target="_blank"
rel="noreferrer"
:href="item.expression_frbr_uri"
v-html="item.highlight.title || item.title"
/>
</h5>
<div class="mb-1">
<div v-if="item.citation && item.citation !== item.title">
<i>{{ item.citation }}</i>
</div>
<div>
<span v-if="showJurisdiction || item.locality" class="me-3">
<span v-if="showJurisdiction" v-html="getFlag(item)" class="me-1" />
<span v-if="showJurisdiction">
{{ item.jurisdiction }}
<span v-if="item.locality">· </span>
</span>
<span v-if="item.locality">{{ item.locality }}</span>
</span>
<span class="me-3">{{ item.doc_type }}</span>
<span class="me-3">{{ item.date }}</span>
<span
v-if="item.court"
class="me-3"
>{{ item.court }}</span>
<span
v-if="item.authors"
class="me-3"
>{{ authors(item) }}</span>
<span v-for="label in labels" :key="label.code" :class="`me-3 badge rounded-pill bg-${label.level}`">{{ label.name }}</span>
<a
v-if="debug"
class="me-3"
href="#"
@click.prevent="$emit('explain')"
>{{ item._score }}</a>
</div>
<div>
{{ item.matter_type }}
</div>
</div>
<div v-if="item.pages.length">
<div
v-for="page in item.pages"
:key="page.page_num"
class="mb-1"
>
<a :href="`${item.expression_frbr_uri}#page-${page.page_num}`">Page {{ page.page_num }}</a>:
<span v-if="page.highlight['pages.body']" v-html="page.highlight['pages.body'].join(' ... ')" />
</div>
</div>
<div v-if="item.provisions.length">
<SearchResultProvision
v-for="provision in item.provisions"
:key="provision.id"
:item="provision"
:parents="provisionParents(provision)"
:expression-frbr-uri="item.expression_frbr_uri"
/>
</div>
<div v-else class="ms-3">
<span
class="snippet"
v-html="highlights(item)"
/>
</div>
<div v-if="debug && item.explanation" class="ms-3 mt-2">
<h5>Explanation</h5>
<div class="explanation border p-2">
<json-table :data="item.explanation" />
</div>
</div>
</div>
</div>
</li>
</template>

<script>
import JsonTable from './JsonTable.vue';
import SearchResultProvision from './SearchResultProvision.vue';
export default {
name: 'SearchResult',
components: {
JsonTable
JsonTable,
SearchResultProvision
},
props: {
item: {
Expand Down Expand Up @@ -125,6 +141,15 @@ export default {
return Array.isArray(item.authors) ? ', '.join(item.authors) : item.authors;
}
return '';
},
provisionParents (provision) {
// zip item.parent_titles and item.parent_ids
return provision.parent_titles.map((title, index) => {
return {
title: title,
id: provision.parent_ids[index]
};
});
}
}
};
Expand Down
29 changes: 29 additions & 0 deletions peachjam/js/components/FindDocuments/SearchResultProvision.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<template>
<div class="mb-1">
<div v-if="parents.length">
<a :href="`${expressionFrbrUri}#${parents[0].id}`">{{ parents[0].title }}</a>
<div class="ms-3">
<SearchResultProvision
:item="item"
:parents="parents.slice(1)"
:expression-frbr-uri="expressionFrbrUri"
/>
</div>
</div>
<div v-else>
<a :href="`${expressionFrbrUri}#${item.id}`">{{ item.title }}</a>
<div class="ms-3" v-if="item.highlight['provisions.body']" v-html="item.highlight['provisions.body'].join(' ... ')" />
</div>
</div>
</template>

<script>
export default {
  // The name lets the component's own template refer to <SearchResultProvision>
  // recursively, once per remaining parent level.
  name: 'SearchResultProvision',
  props: {
    // The provision to render. The default factory must wrap the object literal
    // in parentheses: `() => {}` is an arrow function with an empty block body
    // and returns undefined, not an empty object.
    item: { type: Object, default: () => ({}) },
    // Base URI that provision and parent ids are appended to as `#fragment`.
    expressionFrbrUri: { type: String, default: '' },
    // Remaining ancestors ({ title, id }) still to be rendered above the
    // provision; the template consumes the first one and passes on the rest.
    parents: { type: Array, default: () => [] }
  }
};
</script>
3 changes: 2 additions & 1 deletion peachjam/models/core_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from peachjam.models.settings import pj_settings
from peachjam.pipelines import DOC_MIMETYPES, word_pipeline
from peachjam.storage import DynamicStorageFileField
from peachjam.xmlutils import parse_html_str


class Label(models.Model):
Expand Down Expand Up @@ -905,7 +906,7 @@ def update_or_create_for_document(cls, document):
text = ""
if document.content_html:
# it's html, grab the text from the html tree
root = html.fromstring(document.content_html)
root = parse_html_str(document.content_html)
text = " ".join(root.itertext())

elif hasattr(document, "source_file"):
Expand Down
2 changes: 1 addition & 1 deletion peachjam/static/js/app-prod.js

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions peachjam/xmlutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import lxml.html

# Shared HTML parser created with a fixed UTF-8 input encoding; parse_html_str
# passes it to lxml.html.fromstring together with UTF-8 encoded bytes.
html_parser = lxml.html.HTMLParser(encoding="utf-8")


def parse_html_str(html):
    """Parse an HTML string and return the resulting lxml element.

    The string is first encoded to UTF-8 bytes and then parsed with the
    module-level ``html_parser``.
    """
    encoded = html.encode("utf-8")
    return lxml.html.fromstring(encoded, parser=html_parser)
64 changes: 62 additions & 2 deletions peachjam_search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
OrderOutcome,
Taxonomy,
)
from peachjam.xmlutils import parse_html_str

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -104,7 +105,19 @@ class SearchableDocument(Document):
# One nested entry per page: the page number and that page's text.
pages = fields.NestedField(
    properties={
        "page_num": fields.IntegerField(),
        # "body" must be listed exactly once: a repeated key in a dict literal
        # is silently overridden by its last occurrence, so only the definition
        # without an explicit analyzer was ever in effect.
        "body": fields.TextField(fields={"exact": Text()}),
    }
)

# One nested entry per provision; prepare_provisions builds a dict with exactly
# these keys for every provision that has body text.
provisions = fields.NestedField(
properties={
"title": fields.TextField(),
"id": fields.KeywordField(),
"num": fields.KeywordField(),
"type": fields.KeywordField(),
# titles and ids of the enclosing provisions, stored as two parallel lists
"parent_titles": fields.TextField(),
"parent_ids": fields.KeywordField(),
"body": fields.TextField(fields={"exact": Text()}),
}
)

Expand Down Expand Up @@ -218,7 +231,9 @@ def prepare_authors(self, instance):

def prepare_content(self, instance):
    """Text content of document body for non-PDFs.

    Returns the result of ``instance.get_content_as_text()`` when the document
    has HTML content, except for AKN HTML that has a table of contents.
    Returns ``None`` in every other case.
    """
    if not instance.content_html:
        return None

    # AKN HTML with a TOC is skipped here; this is the exact complement of the
    # condition under which prepare_provisions produces per-provision text.
    if instance.content_html_is_akn and instance.toc_json:
        return None

    return instance.get_content_as_text()

def prepare_ranking(self, instance):
Expand Down Expand Up @@ -269,6 +284,51 @@ def prepare_pages(self, instance):
pages.append({"page_num": i, "body": page})
return pages

def prepare_provisions(self, instance):
    """Text content of provisions from AKN HTML.

    Walks ``instance.toc_json`` recursively. For every TOC entry whose element
    in the parsed HTML has text outside of headings, a dict is built with the
    keys ``title``, ``id``, ``num``, ``type``, ``parent_titles``,
    ``parent_ids`` and ``body``.

    Returns the list of those dicts, or ``None`` when the document is not AKN
    HTML with a table of contents.
    """
    if not (
        instance.content_html and instance.content_html_is_akn and instance.toc_json
    ):
        return None

    # headings are excluded from the body so they aren't indexed twice
    heading_tags = ("h1", "h2", "h3", "h4", "h5")
    # bound before the helper is defined, so the closure never sees them unset
    root = parse_html_str(instance.content_html)
    provisions = []

    def prepare_provision(item, parents):
        provision = None
        provision_id = item["id"] or item["type"]

        # Look the element up with an XPath variable rather than interpolating
        # the id into the expression, so ids containing quotes cannot break
        # (or alter) the query.
        matches = root.xpath("//*[@id=$provision_id]", provision_id=provision_id)

        # get the text of the provision; only the first element carrying this
        # id is used
        body = []
        if matches:
            for el in matches[0]:
                if el.tag not in heading_tags:
                    body.append(" ".join(el.itertext()))

        if body:
            # only parents that have both a title and an id are recorded, so
            # the two parent lists stay aligned
            named_parents = [p for p in parents if p["title"] and p["id"]]
            provision = {
                "title": item["title"],
                "id": provision_id,
                "num": (item["num"] or "").rstrip("."),
                "type": item["type"],
                "parent_titles": [p["title"] for p in named_parents],
                "parent_ids": [p["id"] for p in named_parents],
                "body": " ".join(body),
            }
            provisions.append(provision)

        # recurse into children
        if not item["basic_unit"]:
            if provision:
                # new list: siblings must not see this provision as a parent
                parents = parents + [provision]
            for child in item["children"] or []:
                prepare_provision(child, parents)

    # index each provision separately
    for item in instance.toc_json:
        prepare_provision(item, [])

    return provisions

def prepare_taxonomies(self, instance):
"""Taxonomy topics are stored as slugs of all the items in the tree down to that topic. This is easier than
storing and querying hierarchical taxonomy entries."""
Expand Down
31 changes: 31 additions & 0 deletions peachjam_search/migrations/0001_es_mapping_add_provisions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Generated by Django 3.2.25 on 2024-04-26 17:17

import os

from django.db import migrations


def forwards(apps, schema_editor):
    """Add the ``provisions`` field mapping to every registered index.

    Does nothing unless ``settings.ELASTICSEARCH_DSL_AUTOSYNC`` is truthy and
    the ``ELASTICSEARCH_HOST`` environment variable is set to a non-empty
    value. Indexes without a mapping are skipped.
    """
    from django.conf import settings
    from django_elasticsearch_dsl.registries import registry

    search_enabled = settings.ELASTICSEARCH_DSL_AUTOSYNC and os.environ.get(
        "ELASTICSEARCH_HOST"
    )
    if not search_enabled:
        return

    for index in registry.get_indices():
        if not index._mapping:
            continue

        print(f"Adding provisions mapping for {index._name}")
        es_indices = index.connection.indices
        es_indices.put_mapping(
            index=index._name,
            body={
                "properties": {"provisions": index._mapping["provisions"].to_dict()}
            },
        )


class Migration(migrations.Migration):
    """Run ``forwards`` after peachjam migration 0125; reversing is a no-op."""

    dependencies = [("peachjam", "0125_judgment_auto_assign_title")]

    operations = [
        migrations.RunPython(forwards, reverse_code=migrations.RunPython.noop),
    ]
26 changes: 26 additions & 0 deletions peachjam_search/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class SearchableDocumentSerializer(DocumentSerializer):
id = CharField(source="meta.id")
highlight = SerializerMethodField()
pages = SerializerMethodField()
provisions = SerializerMethodField()
court = SerializerMethodField()
nature = SerializerMethodField()
order_outcome = SerializerMethodField()
Expand Down Expand Up @@ -66,6 +67,31 @@ def get_pages(self, obj):
pages.append(info)
return pages

def get_provisions(self, obj):
    """Serialize nested provision hits and highlights.

    Returns one dict per provision inner hit (its source fields plus a
    ``highlight`` dict, empty when the hit has no highlight). Returns an empty
    list when ``obj.meta`` carries no provision inner hits.
    """
    meta = obj.meta
    has_hits = hasattr(meta, "inner_hits") and hasattr(meta.inner_hits, "provisions")
    if not has_hits:
        return []

    results = []
    # ids already covered, either as a hit or as a parent of an earlier hit, so
    # that, for example, Chapter 7 isn't repeated after Chapter 7, Section 32
    covered_ids = set()
    for hit in meta.inner_hits.provisions.hits.hits:
        entry = hit._source.to_dict()
        if entry["id"] in covered_ids:
            continue

        covered_ids.add(entry["id"])
        covered_ids.update(entry["parent_ids"])

        if hasattr(hit, "highlight"):
            entry["highlight"] = hit.highlight.to_dict()
        else:
            entry["highlight"] = {}
        results.append(entry)

    return results

def get_court(self, obj):
return obj["court" + self.language_suffix]

Expand Down
Loading

0 comments on commit 2a3a3cf

Please sign in to comment.