Merge pull request #1604 from laws-africa/citator

Updates to support improved text citation extraction and the new Citator API
laws-africa · Nov 2, 2023 · fb0f0df · fb0f0df
2 parents 835883b + e19946f
commit fb0f0df
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 5 deletions.
diff --git a/peachjam/analysis/citations.py b/peachjam/analysis/citations.py
@@ -109,13 +109,30 @@ def extract_text_matches(self, frbr_uri, text):
             )
             text = text[: self.max_text_size]
 
+        # For text documents, we need to provide the existing citations for context. For HTML, existing citations
+        # are already marked up in the HTML.
+        citations = [
+            c.to_citator_api()
+            for c in CitationLink.objects.filter(
+                document__expression_frbr_uri=frbr_uri.expression_uri()
+            )
+        ]
+
         resp = self.call_citator(
             {
                 "frbr_uri": frbr_uri.expression_uri(),
                 "format": "text",
                 "body": text,
+                "citations": citations,
             }
         )
+
+        # only keep new citations: drop any whose (start, end) matches an existing citation we sent
+        existing = {(c["start"], c["end"]) for c in citations}
+        citations = [
+            c for c in resp["citations"] if (c["start"], c["end"]) not in existing
+        ]
+
         # store the extracted citations
         self.citations = [
             ExtractedCitation(
@@ -127,7 +144,7 @@ def extract_text_matches(self, frbr_uri, text):
                 c["prefix"],
                 c["suffix"],
             )
-            for c in resp["citations"]
+            for c in citations
         ]
 
     def call_citator(self, body):

diff --git a/peachjam/models/citations.py b/peachjam/models/citations.py
@@ -26,6 +26,21 @@ class Meta:
         verbose_name = _("citation link")
         verbose_name_plural = _("citation links")
 
+    def to_citator_api(self):
+        """Transform into a format suitable for the Citator API."""
+        selector = next(
+            (t for t in self.target_selectors if t["type"] == "TextPositionSelector"),
+            None,
+        )
+        return {
+            "href": self.url,
+            "text": self.text,
+            # strip the "page-" prefix, keep just the number, and subtract one
+            "target_id": int(self.target_id.split("-", 1)[1]) - 1,
+            "start": selector["start"] if selector else -1,
+            "end": selector["end"] if selector else -1,
+        }
+
     def __str__(self):
         return f"Citation link for {self.document.doc_type} - {self.document.title}"
 

diff --git a/peachjam/models/core_document_model.py b/peachjam/models/core_document_model.py
@@ -659,7 +659,9 @@ def get_cited_work_frbr_uris(self):
         else:
             for citation_link in CitationLink.objects.filter(document_id=self.pk):
                 try:
-                    work_frbr_uris.add(FrbrUri.parse(citation_link.url).work_uri())
+                    uri = FrbrUri.parse(citation_link.url)
+                    uri.portion = None
+                    work_frbr_uris.add(uri.work_uri())
                 except ValueError:
                     # ignore malformed FRBR URIs
                     pass

diff --git a/peachjam/views/documents.py b/peachjam/views/documents.py
@@ -125,7 +125,17 @@ def dispatch(self, request, *args, **kwargs):
         except ValueError:
             raise Http404()
 
-        obj, exact = CoreDocument.objects.best_for_frbr_uri(frbr_uri, get_language())
+        # ensure portion is ignored when looking for a document
+        portion = parsed_frbr_uri.portion
+        parsed_frbr_uri.portion = None
+        uri_to_search = (
+            parsed_frbr_uri.expression_uri()
+            if parsed_frbr_uri.expression_date
+            else parsed_frbr_uri.work_uri()
+        )
+        obj, exact = CoreDocument.objects.best_for_frbr_uri(
+            uri_to_search, get_language()
+        )
 
         if not obj:
             resolver = RedirectResolver(settings.PEACHJAM["APP_NAME"])
@@ -135,8 +145,12 @@ def dispatch(self, request, *args, **kwargs):
                 return redirect(url)
             raise Http404()
 
-        if not exact:
-            return redirect(obj.get_absolute_url())
+        if not exact or portion:
+            url = obj.get_absolute_url()
+            # this translates from /akn/.../~sec_2 to /akn/.../#sec_2
+            if portion:
+                url = url + "#" + portion
+            return redirect(url)
 
         view_class = registry.views.get(obj.doc_type)
         if view_class: