adapter-based ingestor webhooks
longhotsummer committed Mar 1, 2024
1 parent ef6432a commit 6bd424c
Showing 6 changed files with 71 additions and 192 deletions.
7 changes: 6 additions & 1 deletion peachjam/adapters/base.py
@@ -1,5 +1,6 @@
 class Adapter:
-    def __init__(self, settings):
+    def __init__(self, ingestor, settings):
+        self.ingestor = ingestor
         self.settings = settings
         self.predicates = {
             "amended-by": {
@@ -29,6 +30,10 @@ def update_document(self, document_id):
         """Update the document identified by some opaque id, returned by check_for_updates."""
         raise NotImplementedError()
 
+    def handle_webhook(self, data):
+        """Handle webhook from a remote server."""
+        pass
+
     @classmethod
     def name(cls):
         return cls.__name__
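
Note: the base class change above gives every adapter a reference to its Ingestor and a default no-op handle_webhook. The concrete implementations appear in the gazettes and indigo adapters below; for orientation, here is a minimal sketch (not part of this commit, names are illustrative) of a subclass using the new constructor signature and webhook hook, with module paths as used elsewhere in the diff:

# Sketch only -- HypotheticalAdapter is illustrative, not part of this commit.
from peachjam.adapters.base import Adapter
from peachjam.plugins import plugins


@plugins.register("ingestor-adapter")
class HypotheticalAdapter(Adapter):
    def check_for_updates(self, last_refreshed):
        # return (updated ids, deleted ids); empty here for brevity
        return [], []

    def update_document(self, document_id):
        pass

    def delete_document(self, document_id):
        pass

    def handle_webhook(self, data):
        # self.ingestor is available because Ingestor.get_adapter() now passes
        # the ingestor into the adapter constructor (see ingestors.py below).
        from peachjam.tasks import update_document

        if data.get("action") == "updated" and data.get("data", {}).get("url"):
            update_document(self.ingestor.pk, data["data"]["url"])
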
191 changes: 32 additions & 159 deletions peachjam/adapters/gazettes.py
@@ -3,167 +3,20 @@
 
 import requests
 from cobalt.uri import FrbrUri
-from countries_plus.models import Country
-from languages_plus.models import Language
 
-from peachjam.models import (
-    CoreDocument,
-    DocumentContent,
-    DocumentNature,
-    Gazette,
-    Locality,
-    SourceFile,
-    get_country_and_locality,
-)
+from peachjam.models import Gazette, SourceFile, get_country_and_locality
 from peachjam.plugins import plugins
 
 from .base import Adapter
 
-log = logging.getLogger(__name__)
-
-
-@plugins.register("ingestor-adapter")
-class GazetteAdapter(Adapter):
-    def __init__(self, settings):
-        super().__init__(settings)
-        self.jurisdiction = self.settings["jurisdiction"]
-
-    def check_for_updates(self, last_refreshed):
-        log.info(
-            f"Checking for new gazettes from Gazettes.Africa since {last_refreshed}"
-        )
-
-        # start building queryset for updated gazettes
-        updated_qs = (
-            CoreDocument.objects.filter(doc_type="gazette")
-            .non_polymorphic()
-            .using("gazettes_africa")
-        )
-        # start building queryset for deleted gazettes
-        deleted_qs = Gazette.objects
-
-        if "-" not in self.jurisdiction:
-            # locality code not specified hence None e.g "ZA"
-            updated_qs = updated_qs.filter(
-                jurisdiction=self.jurisdiction, locality=None
-            )
-            deleted_qs = deleted_qs.filter(
-                jurisdiction=self.jurisdiction, locality=None
-            )
-        else:
-            updated_qs = updated_qs.filter(jurisdiction=self.jurisdiction.split("-")[0])
-            deleted_qs = deleted_qs.filter(jurisdiction=self.jurisdiction.split("-")[0])
-
-            # locality code present e.g. "ZA-gp"
-            if self.jurisdiction.split("-")[1] != "*":
-                locality_code = self.jurisdiction.split("-")[1]
-                updated_qs = updated_qs.filter(locality__code=locality_code)
-                deleted_qs = deleted_qs.filter(locality__code=locality_code)
-            else:
-                # fetch all localities for this jurisdiction e.g. "ZA-*"
-                updated_qs = updated_qs.exclude(locality=None)
-                deleted_qs = deleted_qs.exclude(locality=None)
-
-        updated_qs = updated_qs.values_list("expression_frbr_uri", flat=True)
-
-        deleted_qs = deleted_qs.exclude(
-            expression_frbr_uri__in=list(updated_qs)
-        ).values_list("expression_frbr_uri", flat=True)
-
-        if last_refreshed:
-            updated_qs = updated_qs.filter(updated_at__gt=last_refreshed)
-
-        return updated_qs, deleted_qs
-
-    def update_document(self, expression_frbr_uri):
-        log.info(f"Updating new gazette {expression_frbr_uri}")
-
-        ga_gazette = (
-            CoreDocument.objects.filter(expression_frbr_uri=expression_frbr_uri)
-            .using("gazettes_africa")
-            .non_polymorphic()
-            .first()
-        )
-
-        if ga_gazette:
-            data = {
-                "title": ga_gazette.title,
-                "date": ga_gazette.date,
-                "source_url": ga_gazette.source_url,
-                "citation": ga_gazette.citation,
-                "content_html_is_akn": ga_gazette.content_html_is_akn,
-                "language": Language.objects.get(pk=ga_gazette.language.pk),
-                "jurisdiction": Country.objects.get(pk=ga_gazette.jurisdiction.pk),
-                "work_frbr_uri": ga_gazette.work_frbr_uri,
-                "frbr_uri_subtype": ga_gazette.frbr_uri_subtype,
-                "frbr_uri_actor": ga_gazette.frbr_uri_actor,
-                "frbr_uri_date": ga_gazette.frbr_uri_date,
-                "frbr_uri_number": ga_gazette.frbr_uri_number,
-                "expression_frbr_uri": ga_gazette.expression_frbr_uri,
-            }
-
-            if ga_gazette.locality:
-                data["locality"] = Locality.objects.get(code=ga_gazette.locality.code)
-
-            if ga_gazette.nature:
-                document_nature_name = " ".join(
-                    [name for name in ga_gazette.nature.name.split("-")]
-                ).capitalize()
-                data["nature"] = DocumentNature.objects.get_or_create(
-                    code=ga_gazette.nature.code,
-                    defaults={"name": document_nature_name},
-                )[0]
-
-            updated_gazette, new = Gazette.objects.update_or_create(
-                expression_frbr_uri=expression_frbr_uri, defaults={**data}
-            )
-
-            ga_source_file = (
-                SourceFile.objects.filter(document=ga_gazette)
-                .values("file", "filename", "size", "mimetype")
-                .using("gazettes_africa")
-            )
-
-            file_path = ga_source_file[0].pop("file")
-
-            if ga_source_file:
-                source_url = f"https://gazettes.africa{expression_frbr_uri}/source"
-                updated_source_file, _ = SourceFile.objects.update_or_create(
-                    document=updated_gazette,
-                    defaults={"file": f"{file_path}", "source_url": source_url},
-                )
-                # update the source file to include the bucket name
-                updated_source_file.file.set_raw_value(file_path)
-
-            if hasattr(ga_gazette, "document_content"):
-                ga_content_text = ga_gazette.document_content.content_text
-                DocumentContent.objects.update_or_create(
-                    document=updated_gazette, content_text=ga_content_text
-                )
-            updated_gazette.extract_citations()
-
-            log.info("Update Done.")
-
-    def delete_document(self, expression_frbr_uri):
-        ga_gazette = (
-            CoreDocument.objects.filter(
-                doc_type="gazette", expression_frbr_uri=expression_frbr_uri
-            )
-            .non_polymorphic()
-            .using("gazettes_africa")
-            .first()
-        )
-        local_gazette = Gazette.objects.filter(
-            expression_frbr_uri=expression_frbr_uri
-        ).first()
-        if not ga_gazette and local_gazette:
-            local_gazette.delete()
+logger = logging.getLogger(__name__)
 
 
 @plugins.register("ingestor-adapter")
 class GazetteAPIAdapter(Adapter):
-    def __init__(self, settings):
-        super().__init__(settings)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.jurisdiction = self.settings.get("jurisdiction")
         self.client = requests.session()
         self.client.headers.update(
@@ -185,7 +38,7 @@ def get_updated_docs(self, last_refreshed):
         results = []
         # self.jurisdiction can be a space-separated list of jurisdiction codes or an empty string for all jurisdictions
         for juri in (self.jurisdiction or "").split() or [None]:
-            log.info(
+            logger.info(
                 f"Checking for new gazettes from Gazettes.Africa since {last_refreshed} for jurisdiction {juri}"
             )
 
@@ -212,7 +65,7 @@ def get_updated_docs(self, last_refreshed):
         return results
 
     def update_document(self, url):
-        log.info(f"Updating gazette ... {url}")
+        logger.info(f"Updating gazette ... {url}")
        if url.endswith("/"):
            url = url[:-1]
 
@@ -259,7 +112,7 @@ def update_document(self, url):
                 f"FRBR URIs do not match: {frbr_uri.expression_uri()} != {gazette.expression_frbr_uri}"
             )
 
-        log.info(f"New document: {new}")
+        logger.info(f"New document: {new}")
 
         s3_file = "s3:" + document["s3_location"].replace("/", ":", 1)
         sf, created = SourceFile.objects.update_or_create(
@@ -275,14 +128,34 @@ def update_document(self, url):
         # force the dynamic file field to be set correctly
         SourceFile.objects.filter(pk=sf.pk).update(file=s3_file)
 
-        log.info("Done.")
+        logger.info("Done.")
+
+    def delete_document(self, frbr_uri):
+        url = f"{self.api_url}{frbr_uri}"
+
+        try:
+            self.client_get(url)
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 404:
+                document = Gazette.objects.filter(expression_frbr_uri=frbr_uri).first()
+                if document:
+                    document.delete()
+            else:
+                raise e
+
+    def handle_webhook(self, data):
+        from peachjam.tasks import delete_document, update_document
+
+        logger.info(f"Handling webhook {data}")
+
+        if data.get("action") == "updated" and data.get("data", {}).get("url"):
+            update_document(self.ingestor.pk, data["data"]["url"])
 
-    def delete_document(self, expression_frbr_uri):
-        # TODO:
-        pass
+        if data.get("action") == "deleted" and data.get("data", {}).get("frbr_uri"):
+            delete_document(self.ingestor.pk, data["data"]["frbr_uri"])
 
     def client_get(self, url, **kwargs):
-        log.debug(f"GET {url} kwargs={kwargs}")
+        logger.debug(f"GET {url} kwargs={kwargs}")
         r = self.client.get(url, **kwargs)
         r.raise_for_status()
         return r
17 changes: 15 additions & 2 deletions peachjam/adapters/indigo.py
@@ -47,8 +47,8 @@ class IndigoAdapter(Adapter):
     * places: space-separate list of place codes, such as: bw za-*
     """
 
-    def __init__(self, settings):
-        super().__init__(settings)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.client = requests.session()
         self.client.headers.update(
             {
@@ -534,3 +534,16 @@ def fetch_and_create_aliases(self, document, created_document):
             AlternativeName.objects.create(document=created_document, title=alias)
 
         logger.info(f"Fetching of aliases for {created_document} is complete!")
+
+    def handle_webhook(self, data):
+        from peachjam.tasks import delete_document, update_document
+
+        logger.info(f"Handling webhook {data}")
+
+        if data.get("action") == "updated" and data.get("data", {}).get("url"):
+            update_document(self.ingestor.pk, data["data"]["url"])
+
+        if data.get("action") == "deleted" and data.get("data", {}).get(
+            "expression_frbr_uri"
+        ):
+            delete_document(self.ingestor.pk, data["data"]["expression_frbr_uri"])
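
Note: the two adapters expect slightly different delete payloads -- the Indigo adapter reads data["data"]["expression_frbr_uri"], while the gazettes API adapter reads data["data"]["frbr_uri"]. Purely illustrative payloads follow (field names are taken from the handlers above; the values are invented):

# Illustrative webhook payloads only; the values below are made up.
indigo_update = {
    "action": "updated",
    "data": {"url": "https://indigo.example.org/api/v2/akn/za/act/2024/1/eng"},
}
indigo_delete = {
    "action": "deleted",
    "data": {"expression_frbr_uri": "/akn/za/act/2024/1/eng@2024-01-01"},
}
gazette_delete = {
    "action": "deleted",
    "data": {"frbr_uri": "/akn/za/officialGazette/2024-01-01/1/eng@2024-01-01"},
}
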
6 changes: 5 additions & 1 deletion peachjam/models/ingestors.py
@@ -65,8 +65,12 @@ def delete_document(self, expression_frbr_uri):
         adapter = self.get_adapter()
         adapter.delete_document(expression_frbr_uri)
 
+    def handle_webhook(self, data):
+        adapter = self.get_adapter()
+        adapter.handle_webhook(data)
+
     def get_adapter(self):
         klass = plugins.registry["ingestor-adapter"][self.adapter]
         ingestor_settings = IngestorSetting.objects.filter(ingestor=self)
         settings = {s.name: s.value for s in ingestor_settings}
-        return klass(settings)
+        return klass(self, settings)
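
Note: with get_adapter() now passing the ingestor into the adapter, delivering a webhook through the model is a single call. A minimal usage sketch, assuming an Ingestor row is already configured with one of the registered adapters:

# Sketch: push a webhook payload through an existing Ingestor.
from peachjam.models import Ingestor

ingestor = Ingestor.objects.first()  # assumes at least one ingestor is configured
if ingestor and ingestor.enabled:
    ingestor.handle_webhook(
        {"action": "updated", "data": {"url": "https://example.org/api/documents/123"}}
    )
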
11 changes: 0 additions & 11 deletions peachjam_api/serializers.py
@@ -121,17 +121,6 @@ class Meta:
         )
 
 
-class IngestorWebHookSerializer(serializers.Serializer):
-    action = serializers.CharField()
-    data = WebhookDataSerializer()
-
-    class Meta:
-        fields = (
-            "action",
-            "data",
-        )
-
-
 class LabelSerializer(serializers.ModelSerializer):
     class Meta:
         model = Label
31 changes: 13 additions & 18 deletions peachjam_api/views.py
@@ -1,18 +1,24 @@
 from django import forms
 from django.db.models import Q
+from django.shortcuts import get_object_or_404
 from django.urls import reverse
 from django.utils.html import format_html
 from django.utils.translation import gettext_lazy as _
 from rest_framework import authentication, viewsets
 from rest_framework.response import Response
 from rest_framework.views import APIView
 
-from peachjam.models import CaseNumber, CitationLink, Judgment, Relationship, Work
-from peachjam.tasks import delete_document, update_document
+from peachjam.models import (
+    CaseNumber,
+    CitationLink,
+    Ingestor,
+    Judgment,
+    Relationship,
+    Work,
+)
 from peachjam_api.permissions import CoreDocumentPermission
 from peachjam_api.serializers import (
     CitationLinkSerializer,
-    IngestorWebHookSerializer,
     RelationshipSerializer,
     WorkSerializer,
 )
@@ -40,23 +46,12 @@ class CitationLinkViewSet(viewsets.ModelViewSet):
 class IngestorWebhookView(APIView):
     authentication_classes = [authentication.TokenAuthentication]
     permission_classes = [CoreDocumentPermission]
-    serializer_class = IngestorWebHookSerializer
 
     def post(self, request, ingestor_id):
-        body = self.request.data
-
-        serializer = self.serializer_class(data=body)
-        if serializer.is_valid():
-            if serializer.data["action"] == "updated":
-                update_document(ingestor_id, serializer.data["data"]["url"])
-
-            elif serializer.data["action"] == "deleted":
-                delete_document(
-                    ingestor_id, serializer.data["data"]["expression_frbr_uri"]
-                )
-
-            return Response({"data": serializer.data, "ingestor_id": ingestor_id})
-        return Response(serializer.errors, status=400)
+        ingestor = get_object_or_404(Ingestor, pk=ingestor_id)
+        if ingestor.enabled:
+            ingestor.handle_webhook(request.data)
+        return Response({}, status=200)
 
 
 class DuplicateForm(forms.Form):
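
Note: the view now simply hands request.data to the ingestor, so the remote service's call reduces to an authenticated POST. A sketch of such a call follows; the URL path, host, ingestor id and token are placeholders, since the URL configuration is not part of this diff, while the payload shape and token authentication follow the adapters and view above:

# Sketch only: the webhook path below is a placeholder, not taken from this diff.
import requests

PEACHJAM_URL = "https://peachjam.example.org"  # placeholder host
INGESTOR_ID = 1  # placeholder primary key of the target Ingestor
API_TOKEN = "replace-me"  # DRF token for a user with the required permission

response = requests.post(
    f"{PEACHJAM_URL}/api/ingestors/{INGESTOR_ID}/webhook",  # hypothetical route
    json={
        "action": "deleted",
        "data": {"expression_frbr_uri": "/akn/za/act/2024/1/eng@2024-01-01"},
    },
    headers={"Authorization": f"Token {API_TOKEN}"},
    timeout=30,
)
response.raise_for_status()  # the view returns an empty JSON object with status 200
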
