Skip to content

Commit

Permalink
Merge pull request #1744 from laws-africa/gazette-improvements
Browse files Browse the repository at this point in the history
Gazette improvements
  • Loading branch information
longhotsummer authored Mar 1, 2024
2 parents 673b233 + f58d2b9 commit ff6710b
Show file tree
Hide file tree
Showing 21 changed files with 375 additions and 121 deletions.
3 changes: 2 additions & 1 deletion peachjam/adapters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# flake8: noqa
from .adapters import *
from .base import *
from .gazettes import *
from .indigo import *
34 changes: 34 additions & 0 deletions peachjam/adapters/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
class Adapter:
    """Base class for ingestor adapters.

    An adapter is constructed with its settings and is asked to report which
    documents changed (check_for_updates) and then to update or delete them one
    at a time (update_document / delete_document). Subclasses must override
    those three methods.
    """

    def __init__(self, settings):
        # adapter-specific configuration; subclasses read their own keys from it
        self.settings = settings
        # Descriptions of the relationship predicates, keyed by slug. Built per
        # instance so that mutating one adapter's predicates cannot leak into
        # another adapter.
        self.predicates = {
            "amended-by": {
                "name": "amended by",
                "verb": "is amended by",
                "reverse_verb": "amends",
            },
            "repealed-by": {
                "name": "repealed by",
                "verb": "is repealed by",
                "reverse_verb": "repeals",
            },
            "commenced-by": {
                "name": "commenced by",
                "verb": "is commenced by",
                "reverse_verb": "commences",
            },
        }

    def check_for_updates(self, last_refreshed):
        """Checks for documents updated since last_refreshed (which may be None).

        Returns a tuple of two lists of opaque document identifiers. The first
        list holds the documents which must be updated; each entry is passed to
        update_document. The second list is presumably the documents which must
        be deleted (passed to delete_document) -- confirm against the code that
        consumes this return value.
        """
        raise NotImplementedError()

    def update_document(self, document_id):
        """Update the document identified by some opaque id, returned by check_for_updates."""
        raise NotImplementedError()

    def delete_document(self, expression_frbr_uri):
        """Delete the local document identified by expression_frbr_uri."""
        raise NotImplementedError()

    @classmethod
    def name(cls):
        """Return the adapter's name, which is the name of its class."""
        return cls.__name__
134 changes: 133 additions & 1 deletion peachjam/adapters/gazettes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import logging
from datetime import date

import requests
from cobalt.uri import FrbrUri
from countries_plus.models import Country
from languages_plus.models import Language

Expand All @@ -10,10 +13,11 @@
Gazette,
Locality,
SourceFile,
get_country_and_locality,
)
from peachjam.plugins import plugins

from .adapters import Adapter
from .base import Adapter

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -154,3 +158,131 @@ def delete_document(self, expression_frbr_uri):
).first()
if not ga_gazette and local_gazette:
local_gazette.delete()


@plugins.register("ingestor-adapter")
class GazetteAPIAdapter(Adapter):
    """Ingestor adapter that pulls gazette details from a remote gazettes API.

    Settings read by this adapter:

    * ``token`` (required): sent as ``Authorization: Token <token>`` with every request.
    * ``api_url`` (required): base URL of the API; ``/gazettes/archived.json`` is appended to it.
    * ``jurisdiction`` (optional): space-separated list of jurisdiction codes. A code ending
      in ``-*`` (eg. ``za-*``) asks for that country and all jurisdictions under it. Empty or
      missing means all jurisdictions.
    """

    def __init__(self, settings):
        super().__init__(settings)
        self.jurisdiction = self.settings.get("jurisdiction")
        # requests.session() is a deprecated alias; Session() is the supported constructor
        self.client = requests.Session()
        self.client.headers.update(
            {
                "Authorization": f"Token {self.settings['token']}",
            }
        )
        self.api_url = self.settings["api_url"]

    def check_for_updates(self, last_refreshed):
        """Checks for gazettes updated since last_refreshed (which may be None).

        Returns a tuple ``(urls, [])``: ``urls`` is the list of API urls of the gazettes which
        must be updated, each suitable for update_document. The second element is always an
        empty list (presumably the documents to delete -- this adapter never reports any).
        """
        docs = self.get_updated_docs(last_refreshed)
        urls = [d["url"] for d in docs]
        return urls, []

    def get_updated_docs(self, last_refreshed):
        """Return the API's result entries for every gazette updated since last_refreshed.

        One paginated listing is walked per configured jurisdiction, and the entries of all
        pages are collected into a single list.
        """
        results = []
        # self.jurisdiction can be a space-separated list of jurisdiction codes or an empty string for all jurisdictions
        for juri in (self.jurisdiction or "").split() or [None]:
            log.info(
                f"Checking for new gazettes from Gazettes.Africa since {last_refreshed} for jurisdiction {juri}"
            )

            params = {}
            if last_refreshed:
                params["updated_at__gte"] = last_refreshed.isoformat()

            if juri:
                if juri.endswith("-*"):
                    # handle jurisdiction wildcards, eg za-*
                    # instead of asking for a jurisdiction code, we must ask for a specific
                    # country and all jurisdictions under it
                    params["country"] = juri.split("-")[0]
                    params["locality__isnull"] = False
                else:
                    params["jurisdiction"] = juri

            url = f"{self.api_url}/gazettes/archived.json"
            # follow the "next" links until the API reports no further page
            # NOTE(review): params are sent again when following "next"; if that link already
            # carries the query string they are duplicated -- confirm this is harmless.
            while url:
                res = self.client_get(url, params=params).json()
                results.extend(res["results"])
                url = res["next"]

        return results

    def update_document(self, url):
        """Fetch one gazette from the API and create or update the local Gazette and SourceFile.

        url is a document identifier as returned by check_for_updates. If the API answers 404
        the gazette is skipped and nothing is changed locally.

        Raises:
            requests.HTTPError: for any error response other than 404.
            Exception: if the saved gazette's expression FRBR URI differs from the API's.
        """
        log.info(f"Updating gazette ... {url}")
        if url.endswith("/"):
            url = url[:-1]

        try:
            document = self.client_get(f"{url}.json").json()
        except requests.HTTPError as error:
            if error.response is not None and error.response.status_code == 404:
                # the gazette no longer exists upstream; record the skip rather than hide it
                log.warning(f"Gazette not found (404), skipping: {url}")
                return
            # bare raise re-raises the active exception with its traceback untouched
            raise

        frbr_uri = FrbrUri.parse(document["expression_frbr_uri"])
        country, locality = get_country_and_locality(document["jurisdiction"])
        language = Language.objects.get(pk=document["language"])

        data = {
            "jurisdiction": country,
            "locality": locality,
            "frbr_uri_doctype": frbr_uri.doctype,
            "frbr_uri_subtype": frbr_uri.subtype,
            "frbr_uri_actor": frbr_uri.actor,
            "frbr_uri_number": frbr_uri.number,
            "frbr_uri_date": frbr_uri.date,
            "nature": None,  # see https://github.com/laws-africa/gazettemachine/issues/172
            "language": language,
            "date": date.fromisoformat(document["date"]),
            "title": document["name"],
            "publication": document["publication"],
            "sub_publication": document["sub_publication"],
            "supplement": document["supplement"],
            "supplement_number": document["supplement_number"],
            "part": document["part"],
            "key": document["key"],
            "created_at": document["created_at"],
            "updated_at": document["updated_at"],
        }
        gazette, new = Gazette.objects.update_or_create(
            expression_frbr_uri=document["expression_frbr_uri"],
            defaults=data,
        )

        # NOTE(review): this check runs after the gazette has been saved; presumably the model
        # derives expression_frbr_uri from the frbr_uri_* fields on save -- confirm.
        if frbr_uri.expression_uri() != gazette.expression_frbr_uri:
            raise Exception(
                f"FRBR URIs do not match: {frbr_uri.expression_uri()} != {gazette.expression_frbr_uri}"
            )

        log.info(f"New document: {new}")

        # only the first "/" of the s3 location is turned into ":"
        s3_file = "s3:" + document["s3_location"].replace("/", ":", 1)
        sf, created = SourceFile.objects.update_or_create(
            document=gazette,
            defaults={
                "file": s3_file,
                "source_url": document["download_url"],
                "mimetype": "application/pdf",
                "filename": document["key"] + ".pdf",
                "size": document["size"],
            },
        )
        # force the dynamic file field to be set correctly
        SourceFile.objects.filter(pk=sf.pk).update(file=s3_file)

        log.info("Done.")

    def delete_document(self, expression_frbr_uri):
        """Not implemented yet: does nothing."""
        # TODO: implement; check_for_updates in this class never reports deletions
        pass

    def client_get(self, url, **kwargs):
        """GET url with the authenticated session and return the response.

        Extra keyword arguments (eg. params) are passed through to the session's get().
        Raises requests.HTTPError for an error status, via raise_for_status().
        """
        log.debug(f"GET {url} kwargs={kwargs}")
        r = self.client.get(url, **kwargs)
        r.raise_for_status()
        return r
38 changes: 2 additions & 36 deletions peachjam/adapters/adapters.py → peachjam/adapters/indigo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from django.utils.text import slugify
from languages_plus.models import Language

from peachjam.adapters.base import Adapter
from peachjam.models import (
AlternativeName,
Author,
Expand All @@ -35,42 +36,6 @@
logger = logging.getLogger(__name__)


class Adapter:
    """Base class for ingestor adapters.

    An adapter is constructed with its settings and is asked to report which
    documents changed (check_for_updates) and then to update or delete them one
    at a time (update_document / delete_document). Subclasses must override
    those three methods.
    """

    def __init__(self, settings):
        # adapter-specific configuration; subclasses read their own keys from it
        self.settings = settings
        # Descriptions of the relationship predicates, keyed by slug. Built per
        # instance so that mutating one adapter's predicates cannot leak into
        # another adapter.
        self.predicates = {
            "amended-by": {
                "name": "amended by",
                "verb": "is amended by",
                "reverse_verb": "amends",
            },
            "repealed-by": {
                "name": "repealed by",
                "verb": "is repealed by",
                "reverse_verb": "repeals",
            },
            "commenced-by": {
                "name": "commenced by",
                "verb": "is commenced by",
                "reverse_verb": "commences",
            },
        }

    def check_for_updates(self, last_refreshed):
        """Checks for documents updated since last_refreshed (which may be None).

        Returns a tuple of two lists of opaque document identifiers. The first
        list holds the documents which must be updated; each entry is passed to
        update_document. The second list is presumably the documents which must
        be deleted (passed to delete_document) -- confirm against the code that
        consumes this return value.
        """
        raise NotImplementedError()

    def update_document(self, document_id):
        """Update the document identified by some opaque id, returned by check_for_updates."""
        raise NotImplementedError()

    def delete_document(self, expression_frbr_uri):
        """Delete the local document identified by expression_frbr_uri."""
        raise NotImplementedError()

    @classmethod
    def name(cls):
        """Return the adapter's name, which is the name of its class."""
        return cls.__name__


@plugins.register("ingestor-adapter")
class IndigoAdapter(Adapter):
"""Adapter that pulls data from the Indigo API.
Expand Down Expand Up @@ -491,6 +456,7 @@ def create_relationship(self, slug, subject_work, object_work):
logger.info(f"{self.predicates[slug]['name']} relationship created")

def client_get(self, url):
    """GET url with this adapter's HTTP client and return the response.

    The request is logged at debug level before it is sent. An error status
    raises through the response's raise_for_status().
    """
    logger.debug(f"GET {url}")
    response = self.client.get(url)
    response.raise_for_status()
    return response
Expand Down
11 changes: 2 additions & 9 deletions peachjam/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,7 @@
from treebeard.admin import TreeAdmin
from treebeard.forms import MoveNodeForm, movenodeform_factory

from peachjam.forms import (
AttachedFilesForm,
IngestorForm,
NewDocumentFormMixin,
SourceFileForm,
)
from peachjam.forms import AttachedFilesForm, NewDocumentFormMixin, SourceFileForm
from peachjam.models import (
AlternativeName,
Article,
Expand Down Expand Up @@ -829,8 +824,6 @@ class IngestorSettingInline(admin.TabularInline):
@admin.register(Ingestor)
class IngestorAdmin(admin.ModelAdmin):
inlines = [IngestorSettingInline]
readonly_fields = ("last_refreshed_at",)
form = IngestorForm
actions = ["refresh_all_content"]
fields = ("adapter", "name", "last_refreshed_at", "enabled")
list_display = ("name", "last_refreshed_at", "enabled")
Expand All @@ -845,7 +838,7 @@ def refresh_all_content(self, request, queryset):
self.message_user(request, _("Refreshing content in the background."))

refresh_all_content.short_description = gettext_lazy(
"Refresh content selected ingestors"
"Refresh content for selected ingestors"
)


Expand Down
4 changes: 2 additions & 2 deletions peachjam/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ def ready(self):

from peachjam.tasks import rank_works, run_ingestors

run_ingestors(repeat=60 * 60 * 24)
rank_works(repeat=Task.WEEKLY)
run_ingestors(schedule=Task.DAILY, repeat=Task.DAILY)
rank_works(schedule=Task.WEEKLY, repeat=Task.WEEKLY)
18 changes: 1 addition & 17 deletions peachjam/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,7 @@
from django.utils.text import slugify
from django.utils.translation import gettext as _

from peachjam.models import (
AttachedFiles,
CoreDocument,
Ingestor,
SourceFile,
pj_settings,
)
from peachjam.models import AttachedFiles, CoreDocument, SourceFile, pj_settings
from peachjam.plugins import plugins
from peachjam.storage import clean_filename

Expand All @@ -31,16 +25,6 @@ def adapter_choices():
return [(key, p.name) for key, p in plugins.registry["ingestor-adapter"].items()]


class IngestorForm(forms.ModelForm):
    """Model form for Ingestor that offers the registered adapters as a choice list."""

    # choices is the adapter_choices callable itself (not its result); it yields
    # (key, name) pairs for the plugins registered under "ingestor-adapter"
    adapter = forms.ChoiceField(choices=adapter_choices)
    # optional on the form, and not listed in Meta.fields below
    last_refreshed_at = forms.DateTimeField(required=False)
    name = forms.CharField(max_length=255)

    class Meta:
        model = Ingestor
        fields = ("adapter", "name")


class NewDocumentFormMixin:
"""Mixin for the admin view when creating a new document that adds a new field to allow the user to upload a
file from which key data can be extracted.
Expand Down
53 changes: 53 additions & 0 deletions peachjam/migrations/0120_gazette_details.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Generated by Django 3.2.20 on 2024-02-23 12:00

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the part, publication, sub_publication, supplement, supplement_number
    and key fields to the gazette model.
    """

    # must run after the 0119_matomo migration of the same app
    dependencies = [
        ("peachjam", "0119_matomo"),
    ]

    # every new field is either nullable or has a default
    operations = [
        migrations.AddField(
            model_name="gazette",
            name="part",
            field=models.CharField(
                blank=True, max_length=10, null=True, verbose_name="part"
            ),
        ),
        migrations.AddField(
            model_name="gazette",
            name="publication",
            field=models.CharField(
                blank=True, max_length=100, null=True, verbose_name="publication"
            ),
        ),
        migrations.AddField(
            model_name="gazette",
            name="sub_publication",
            field=models.CharField(
                blank=True, max_length=100, null=True, verbose_name="sub publication"
            ),
        ),
        # the only non-nullable addition; defaults to False
        migrations.AddField(
            model_name="gazette",
            name="supplement",
            field=models.BooleanField(default=False, verbose_name="supplement"),
        ),
        migrations.AddField(
            model_name="gazette",
            name="supplement_number",
            field=models.IntegerField(
                blank=True, null=True, verbose_name="supplement number"
            ),
        ),
        # the only new field with a database index
        migrations.AddField(
            model_name="gazette",
            name="key",
            field=models.CharField(
                max_length=512, null=True, blank=True, db_index=True, verbose_name="key"
            ),
        ),
    ]
Loading

0 comments on commit ff6710b

Please sign in to comment.