Skip to content

Commit

Permalink
Merge pull request #1604 from laws-africa/citator
Browse files Browse the repository at this point in the history
Updates to support improved text citation extraction and the new Citator API
  • Loading branch information
longhotsummer authored Nov 2, 2023
2 parents 835883b + e19946f commit fb0f0df
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 5 deletions.
19 changes: 18 additions & 1 deletion peachjam/analysis/citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,30 @@ def extract_text_matches(self, frbr_uri, text):
)
text = text[: self.max_text_size]

# For text documents, we need to provide the existing citations for context. For html, existing citations
# are already marked up in the HTML.
citations = [
c.to_citator_api()
for c in CitationLink.objects.filter(
document__expression_frbr_uri=frbr_uri.expression_uri()
)
]

resp = self.call_citator(
{
"frbr_uri": frbr_uri.expression_uri(),
"format": "text",
"body": text,
"citations": citations,
}
)

# only keep new citations
existing = {(c["start"], c["end"]) for c in citations}
citations = [
c for c in resp["citations"] if (c["start"], c["end"]) not in existing
]

# store the extracted citations
self.citations = [
ExtractedCitation(
Expand All @@ -127,7 +144,7 @@ def extract_text_matches(self, frbr_uri, text):
c["prefix"],
c["suffix"],
)
for c in resp["citations"]
for c in citations
]

def call_citator(self, body):
Expand Down
15 changes: 15 additions & 0 deletions peachjam/models/citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,21 @@ class Meta:
verbose_name = _("citation link")
verbose_name_plural = _("citation links")

def to_citator_api(self):
    """Build the dict representation of this link that the Citator API expects.

    The result carries ``href`` (this link's URL), ``text``, a numeric
    ``target_id`` and the ``start``/``end`` offsets taken from the first
    ``TextPositionSelector`` entry in ``target_selectors``. When no such
    selector is present, both offsets are reported as ``-1``.
    """
    # Find the first selector that carries text positions, if any.
    position = None
    for candidate in self.target_selectors:
        if candidate["type"] == "TextPositionSelector":
            position = candidate
            break

    payload = {"href": self.url, "text": self.text}

    # Drop the "page-" prefix and keep only the number, shifted down by one.
    page_num = self.target_id.split("-", 1)[1]
    payload["target_id"] = int(page_num) - 1

    if position:
        payload["start"] = position["start"]
        payload["end"] = position["end"]
    else:
        payload["start"] = -1
        payload["end"] = -1
    return payload

def __str__(self):
    """Return a readable label made of the linked document's type and title."""
    doc = self.document
    return f"Citation link for {doc.doc_type} - {doc.title}"

Expand Down
4 changes: 3 additions & 1 deletion peachjam/models/core_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,9 @@ def get_cited_work_frbr_uris(self):
else:
for citation_link in CitationLink.objects.filter(document_id=self.pk):
try:
work_frbr_uris.add(FrbrUri.parse(citation_link.url).work_uri())
uri = FrbrUri.parse(citation_link.url)
uri.portion = None
work_frbr_uris.add(uri.work_uri())
except ValueError:
# ignore malformed FRBR URIs
pass
Expand Down
20 changes: 17 additions & 3 deletions peachjam/views/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,17 @@ def dispatch(self, request, *args, **kwargs):
except ValueError:
raise Http404()

obj, exact = CoreDocument.objects.best_for_frbr_uri(frbr_uri, get_language())
# ensure portion is ignored when looking for a document
portion = parsed_frbr_uri.portion
parsed_frbr_uri.portion = None
uri_to_search = (
parsed_frbr_uri.expression_uri()
if parsed_frbr_uri.expression_date
else parsed_frbr_uri.work_uri()
)
obj, exact = CoreDocument.objects.best_for_frbr_uri(
uri_to_search, get_language()
)

if not obj:
resolver = RedirectResolver(settings.PEACHJAM["APP_NAME"])
Expand All @@ -135,8 +145,12 @@ def dispatch(self, request, *args, **kwargs):
return redirect(url)
raise Http404()

if not exact:
return redirect(obj.get_absolute_url())
if not exact or portion:
url = obj.get_absolute_url()
# this translates from /akn/.../~sec_2 to /akn/.../#sec_2
if portion:
url = url + "#" + portion
return redirect(url)

view_class = registry.views.get(obj.doc_type)
if view_class:
Expand Down

0 comments on commit fb0f0df

Please sign in to comment.