refactor:PublicGateway and add construction_legislation
Refactor the PublicGateway class by moving the date-normalization and SQL condition-building functions outside the class. Add a new construction_legislation.py file to hold the construction legislation data, and delete the now-unnecessary Excel file.
hareshkainthdbt committed Nov 12, 2024
1 parent f65d771 commit 0092a02
Showing 9 changed files with 2,126 additions and 130 deletions.
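A minimal before/after sketch of the calling-convention change described in the commit message (the helper and method names are taken from the public_gateway.py diff below; the bodies are elided and the call site is illustrative):

```python
# Before: the helpers were instance methods, reachable only through self.
class PublicGateway:
    def _normalize_date(self, date_str): ...
    def _build_like_conditions(self, field, and_terms, or_terms): ...

    def get_all(self, config=None):
        date_issued = self._normalize_date("2024")


# After: module-level functions, importable and testable without an instance.
def _normalize_date(date_str): ...
def _build_like_conditions(field, and_terms, or_terms): ...


class PublicGateway:
    def build_cache(self, config=None):
        date_issued = _normalize_date("2024")
```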
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]
per-file-ignores =
construction_legislation.py: E501
43 changes: 40 additions & 3 deletions orp/config/urls.py
@@ -3,7 +3,11 @@
import orp_search.views as orp_search_views

from orp_search.models import DataResponseModel
from rest_framework import routers, serializers, viewsets
from orp_search.utils.documents import clear_all_documents
from orp_search.utils.search import search
from rest_framework import routers, serializers, status, viewsets
from rest_framework.decorators import action
from rest_framework.response import Response

from django.conf import settings
from django.contrib import admin
@@ -52,12 +56,45 @@ class Meta:

class DataResponseViewSet(viewsets.ModelViewSet):
serializer_class = DataResponseSerializer
queryset = DataResponseModel.objects.all()

def list(self, request, *args, **kwargs):
# Assuming `search` is a function that
# processes the request and returns data
context = {
"service_name": settings.SERVICE_NAME_SEARCH,
}
response_data = search(context, request)

# Return the response
return Response(response_data, status=status.HTTP_200_OK)


class RebuildCacheViewSet(viewsets.ViewSet):
@action(detail=False, methods=["post"], url_path="cache")
def rebuild_cache(self, request, *args, **kwargs):
from orp_search.legislation import Legislation

# from orp_search.public_gateway import PublicGateway

try:
clear_all_documents()
Legislation().build_cache()
# PublicGateway().build_cache()
except Exception as e:
return Response(
data={"message": f"error clearing documents: {e}"},
status=status.HTTP_500_INTERNAL_SERVER_ERROR,
)

return Response(
data={"message": "rebuilt cache"}, status=status.HTTP_200_OK
)


# Routers provide an easy way of automatically determining the URL conf.
router = routers.DefaultRouter()
router.register(r"results", DataResponseViewSet)
router.register(r"dataresults", DataResponseViewSet, basename="dataresponse")
router.register(r"rebuild", RebuildCacheViewSet, basename="cache")

urlpatterns = [
path("", include(router.urls)),
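A rough usage sketch for the two routes registered above, assuming the app is served at the site root on a local development server (the host, port and timeouts here are illustrative, not part of the change):

```python
import requests

BASE = "http://localhost:8000"  # assumption: local dev server, router mounted at ""

# GET /results/ invokes DataResponseViewSet.list(), which delegates to search()
results = requests.get(f"{BASE}/results/", timeout=10)
print(results.status_code, results.json())

# POST /rebuild/cache/ clears stored documents and rebuilds the legislation cache
rebuilt = requests.post(f"{BASE}/rebuild/cache/", timeout=120)
print(rebuilt.status_code, rebuilt.json())
```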
1,945 changes: 1,945 additions & 0 deletions orp/orp_search/construction_legislation.py

Large diffs are not rendered by default.
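The generated module itself is not rendered above. Based only on how it is consumed in legislation.py below (a construction_legislation_dataframe() function whose result is iterated over a "URI to Extract XML Data" column), a plausible minimal shape is the following; the row values and the extra column name are placeholders, not the real data:

```python
import pandas as pd

# Hypothetical excerpt: the real module inlines the rows that previously
# lived in construction_legislation.xlsx.
_LEGISLATION_ROWS = [
    {
        "Title": "Example Construction Act 2000",  # placeholder value
        "URI to Extract XML Data": (
            "https://www.legislation.gov.uk/ukpga/2000/1/data.xml"  # placeholder value
        ),
    },
]


def construction_legislation_dataframe() -> pd.DataFrame:
    """Return the construction legislation dataset as a pandas DataFrame."""
    return pd.DataFrame(_LEGISLATION_ROWS)
```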

Binary file removed orp/orp_search/construction_legislation.xlsx
Binary file not shown.
150 changes: 77 additions & 73 deletions orp/orp_search/legislation.py
@@ -4,9 +4,13 @@

from datetime import datetime

import pandas as pd
import requests # type: ignore

from numpy.f2py.auxfuncs import throw_error
from orp_search.config import SearchDocumentConfig
from orp_search.construction_legislation import ( # noqa: E501
construction_legislation_dataframe,
)
from orp_search.utils.documents import insert_or_update_document

logger = logging.getLogger(__name__)
@@ -17,17 +21,21 @@ def _encode_url(url):
return encoded_bytes.decode("utf-8")


def _get_url_data(url, config=None):
def _get_url_data(config, url):
try:
response = requests.get( # nosec BXXX
url, timeout=10 if not config.timeout else config.timeout
)
response = requests.get(url, timeout=config.timeout) # nosec BXXX
if response.status_code == 200:
return response.text

# If the status code is not 200, log the error
logger.error(
f"error fetching legislation data "
f"[{response.status_code}]: {response.reason}"
)
return None
except requests.exceptions.RequestException as e:
logger.error(f"error fetching legislation data: {e}")
return None
return e


class Legislation:
Expand All @@ -41,63 +49,76 @@ def __init__(self):
"ukm": "http://www.legislation.gov.uk/namespaces/metadata",
}

def parse_dataset_and_store(self):
# Read construction_legislation.xlsx into panda
dataset = pd.read_excel("construction_legislation.xlsx")
def build_cache(self):
logger.info("building legislation cache...")
dataset = construction_legislation_dataframe()

# For each row, get the URL from the column named
# 'URI to Extract XML Data'
# and store the XML data in a list
xml_data = []
for index, row in dataset.iterrows():
url = row["URI to Extract XML Data"]
data = _get_url_data(url)
if data:
xml_data.append(data)

# For each xml_data parse the XML data but extracting the
# following fields and store the data in a dictionary and
# the key should be identifier
for data in xml_data:
root = ET.fromstring(data) # nosec BXXX
identifier = root.find(
".//dc:identifier", self._namespaces
).text # nosec BXXX
title = root.find(
".//dc:title", self._namespaces
).text # nosec BXXX
description = root.find(
".//dc:description", self._namespaces
).text # nosec BXXX
format = root.find(
".//dc:format", self._namespaces
).text # nosec BXXX
language = root.find(
".//dc:language", self._namespaces
).text # nosec BXXX
publisher = root.find(
".//dc:publisher", self._namespaces
).text # nosec BXXX
modified = root.find(
".//dc:modified", self._namespaces
).text # nosec BXXX
valid = root.find(
".//dct:valid", self._namespaces
).text # nosec BXXX

document_json = self._to_json(
description,
format,
identifier,
language,
modified,
publisher,
title,
valid,
logger.info(
f"fetching data from page {index + 1} / "
f"{len(dataset)}: {url}..."
)

# Insert or update the document
insert_or_update_document(document_json)
try:
config = SearchDocumentConfig(search_query="", timeout=10)
data = _get_url_data(config, url)

if data is None:
logger.error(
f"error fetching data from {url}. no data returned"
)
raise Exception(
f"error fetching data from {url}. no data returned"
)

if data:
logger.info(f"parsing data from {url}...")
root = ET.fromstring(data) # nosec BXXX
identifier = root.find(
".//dc:identifier", self._namespaces
).text # nosec BXXX
title = root.find(
".//dc:title", self._namespaces
).text # nosec BXXX
description = root.find(
".//dc:description", self._namespaces
).text # nosec BXXX
format = root.find(
".//dc:format", self._namespaces
).text # nosec BXXX
language = root.find(
".//dc:language", self._namespaces
).text # nosec BXXX
publisher = root.find(
".//dc:publisher", self._namespaces
).text # nosec BXXX
modified = root.find(
".//dc:modified", self._namespaces
).text # nosec BXXX
valid = root.find(
".//dct:valid", self._namespaces
).text # nosec BXXX

document_json = self._to_json(
description,
format,
identifier,
language,
modified,
publisher,
title,
valid,
)

# Insert or update the document
insert_or_update_document(document_json)
except Exception as e:
logger.error(f"error fetching data from {url}: {e}")
throw_error(f"error fetching data from {url}: {e}")

def _to_json(
self,
Expand All @@ -111,6 +132,7 @@ def _to_json(
valid,
):
return {
"query": {"search_terms": []},
"id": _encode_url(identifier),
"title": title,
"identifier": identifier,
@@ -128,23 +150,5 @@
"%Y-%m-%d"
),
"type": "legislation",
"coverage": "gb",
"audience": None,
"subject": None,
"license": None,
"regulatory_topics": None,
"status": None,
"date_uploaded_to_orp": None,
"has_format": None,
"is_format_of": None,
"has_version": None,
"is_version_of": None,
"references": None,
"is_referenced_by": None,
"has_part": None,
"is_part_of": None,
"is_replaced_by": None,
"replaces": None,
"related_legislation": None,
"score": 0,
}
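A self-contained sketch of the namespace-aware metadata lookup that build_cache performs above, assuming the standard Dublin Core namespace URIs for the dc/dct prefixes (the XML fragment is illustrative, not real legislation.gov.uk output):

```python
import xml.etree.ElementTree as ET

namespaces = {
    "dc": "http://purl.org/dc/elements/1.1/",  # assumed standard DC elements URI
    "dct": "http://purl.org/dc/terms/",        # assumed standard DC terms URI
}

# Illustrative fragment in the rough shape of the metadata being parsed.
xml = """
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
          xmlns:dct="http://purl.org/dc/terms/">
  <dc:identifier>http://www.legislation.gov.uk/ukpga/2000/1</dc:identifier>
  <dc:title>Example Construction Act 2000</dc:title>
  <dct:valid>2024-01-01</dct:valid>
</metadata>
"""

root = ET.fromstring(xml)
identifier = root.find(".//dc:identifier", namespaces).text
title = root.find(".//dc:title", namespaces).text
valid = root.find(".//dct:valid", namespaces).text
print(identifier, title, valid)
```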
2 changes: 1 addition & 1 deletion orp/orp_search/models.py
@@ -7,7 +7,7 @@


class DataResponseModel(models.Model):
query = models.JSONField()
query = models.JSONField() # TODO: remove this field
title = models.CharField(max_length=_default_char_size)
identifier = models.URLField(unique=True)
publisher = models.CharField(
95 changes: 45 additions & 50 deletions orp/orp_search/public_gateway.py
@@ -10,68 +10,65 @@
logger = logging.getLogger(__name__)


class PublicGateway:
def __init__(self):
"""
Initializes the API client with the base URL for the Trade Data API.
def _normalize_date(date_str):
if date_str is None:
return None

Attributes:
base_url (str): The base URL of the Trade Data API.
"""
self.base_url = "https://data.api.trade.gov.uk"

def _normalize_date(self, date_str):
if date_str is None:
return None

# If the date is in YYYY format, add "-01-01"
if len(date_str) == 4:
return f"{date_str}-01-01"
# If the date is in YYYY-MM format, add "-01"
elif len(date_str) == 7:
return f"{date_str}-01"
# Otherwise, assume the date is already in YYYY-MM-DD format
return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")

def _build_like_conditions(self, field, and_terms, or_terms):
"""
# If the date is in YYYY format, add "-01-01"
if len(date_str) == 4:
return f"{date_str}-01-01"
# If the date is in YYYY-MM format, add "-01"
elif len(date_str) == 7:
return f"{date_str}-01"
# Otherwise, assume the date is already in YYYY-MM-DD format
return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")

Generates SQL LIKE conditions.

Args:
field (str): The database field to apply the LIKE condition to.
terms (list of str): A list of terms to include in the LIKE
condition.
def _build_like_conditions(field, and_terms, or_terms):
"""
Returns:
str: A string containing the LIKE conditions combined with 'OR'.
"""
# Put each term into the list
terms = and_terms
Generates SQL LIKE conditions.
# If there are OR terms, then put an OR condition between them
if or_terms:
terms.append("(" + " OR ".join(or_terms) + ")")
Args:
field (str): The database field to apply the LIKE condition to.
terms (list of str): A list of terms to include in the LIKE
condition.
return " OR ".join(
[f"{field} LIKE LOWER('%{term}%')" for term in terms]
)
Returns:
str: A string containing the LIKE conditions combined with 'OR'.
"""
# Put each term into the list
terms = and_terms

def get_all(self, config=None):
logger.info("fetching all data from orpd...")
# If there are OR terms, then put an OR condition between them
if or_terms:
terms.append("(" + " OR ".join(or_terms) + ")")

return " OR ".join([f"{field} LIKE LOWER('%{term}%')" for term in terms])

# Base URL for the API
url = (

class PublicGateway:
def __init__(self):
"""
Initializes the API client with the base URL for the Trade Data API.
Attributes:
base_url (str): The base URL of the Trade Data API.
"""
self._base_url = (
"https://data.api.trade.gov.uk/v1/datasets/orp-regulations"
"/versions/v1.0.0/data"
)

def build_cache(self, config=None):
logger.info("fetching all data from orpd...")

# URL encode the query for the API request
params = {"format": "json"}

# Make the GET request
response = requests.get(
url,
self._base_url,
params=params,
timeout=10 if not config.timeout else config.timeout, # nosec BXXX
)
@@ -90,11 +87,9 @@ def get_all(self, config=None):
"language": row["language"],
"format": row["format"],
"description": row["description"],
"date_issued": self._normalize_date(row["date_issued"]),
"date_modified": self._normalize_date(
row["date_modified"]
),
"date_valid": self._normalize_date(row["date_valid"]),
"date_issued": _normalize_date(row["date_issued"]),
"date_modified": _normalize_date(row["date_modified"]),
"date_valid": _normalize_date(row["date_valid"]),
"audience": row["audience"],
"coverage": row["coverage"],
"subject": row["subject"],
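A quick worked example of the extracted _normalize_date helper, following the padding rules shown above (the import path is assumed from the repository layout in this diff):

```python
from orp_search.public_gateway import _normalize_date  # assumed module path

assert _normalize_date("2023") == "2023-01-01"        # year only -> 1 January
assert _normalize_date("2023-05") == "2023-05-01"     # year-month -> first of month
assert _normalize_date("2023-05-17") == "2023-05-17"  # full date -> validated, unchanged
assert _normalize_date(None) is None                  # missing value passes through
```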