refactor:PublicGateway and add construction_legislation
Refactor the PublicGateway class by moving the date-normalization and SQL condition-building functions outside the class. Add a new construction_legislation.py file to hold the construction legislation data, and delete the now-unnecessary Excel file.
hareshkainthdbt committed Nov 12, 2024
1 parent f65d771 commit 0092a02
Showing 9 changed files with 2,126 additions and 130 deletions.
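A minimal before/after sketch of the calling-convention change described in the commit message (the helper and method names are taken from the public_gateway.py diff below; the bodies are elided and the call site is illustrative):

```python
# Before: the helpers were instance methods, reachable only through self.
class PublicGateway:
    def _normalize_date(self, date_str): ...
    def _build_like_conditions(self, field, and_terms, or_terms): ...

    def get_all(self, config=None):
        date_issued = self._normalize_date("2024")


# After: module-level functions, importable and testable without an instance.
def _normalize_date(date_str): ...
def _build_like_conditions(field, and_terms, or_terms): ...


class PublicGateway:
    def build_cache(self, config=None):
        date_issued = _normalize_date("2024")
```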
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]
per-file-ignores =
construction_legislation.py: E501
43 changes: 40 additions & 3 deletions orp/config/urls.py
@@ -3,7 +3,11 @@
import orp_search.views as orp_search_views

from orp_search.models import DataResponseModel
from rest_framework import routers, serializers, viewsets
from orp_search.utils.documents import clear_all_documents
from orp_search.utils.search import search
from rest_framework import routers, serializers, status, viewsets
from rest_framework.decorators import action
from rest_framework.response import Response

from django.conf import settings
from django.contrib import admin
@@ -52,12 +56,45 @@ class Meta:

class DataResponseViewSet(viewsets.ModelViewSet):
serializer_class = DataResponseSerializer
queryset = DataResponseModel.objects.all()

def list(self, request, *args, **kwargs):
# Assuming `search` is a function that
# processes the request and returns data
context = {
"service_name": settings.SERVICE_NAME_SEARCH,
}
response_data = search(context, request)

# Return the response
return Response(response_data, status=status.HTTP_200_OK)


class RebuildCacheViewSet(viewsets.ViewSet):
@action(detail=False, methods=["post"], url_path="cache")
def rebuild_cache(self, request, *args, **kwargs):
from orp_search.legislation import Legislation

# from orp_search.public_gateway import PublicGateway

try:
clear_all_documents()
Legislation().build_cache()
# PublicGateway().build_cache()
except Exception as e:
return Response(
data={"message": f"error clearing documents: {e}"},
status=status.HTTP_500_INTERNAL_SERVER_ERROR,
)

return Response(
data={"message": "rebuilt cache"}, status=status.HTTP_200_OK
)


# Routers provide an easy way of automatically determining the URL conf.
router = routers.DefaultRouter()
router.register(r"results", DataResponseViewSet)
router.register(r"dataresults", DataResponseViewSet, basename="dataresponse")
router.register(r"rebuild", RebuildCacheViewSet, basename="cache")

urlpatterns = [
path("", include(router.urls)),
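A rough usage sketch for the two routes registered above, assuming the app is served at the site root on a local development server (the host, port and timeouts here are illustrative, not part of the change):

```python
import requests

BASE = "http://localhost:8000"  # assumption: local dev server, router mounted at ""

# GET /results/ invokes DataResponseViewSet.list(), which delegates to search()
results = requests.get(f"{BASE}/results/", timeout=10)
print(results.status_code, results.json())

# POST /rebuild/cache/ clears stored documents and rebuilds the legislation cache
rebuilt = requests.post(f"{BASE}/rebuild/cache/", timeout=120)
print(rebuilt.status_code, rebuilt.json())
```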
1,945 changes: 1,945 additions & 0 deletions orp/orp_search/construction_legislation.py

Large diffs are not rendered by default.
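The generated module itself is not rendered above. Based only on how it is consumed in legislation.py below (a construction_legislation_dataframe() function whose result is iterated over a "URI to Extract XML Data" column), a plausible minimal shape is the following; the row values and the extra column name are placeholders, not the real data:

```python
import pandas as pd

# Hypothetical excerpt: the real module inlines the rows that previously
# lived in construction_legislation.xlsx.
_LEGISLATION_ROWS = [
    {
        "Title": "Example Construction Act 2000",  # placeholder value
        "URI to Extract XML Data": (
            "https://www.legislation.gov.uk/ukpga/2000/1/data.xml"  # placeholder value
        ),
    },
]


def construction_legislation_dataframe() -> pd.DataFrame:
    """Return the construction legislation dataset as a pandas DataFrame."""
    return pd.DataFrame(_LEGISLATION_ROWS)
```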

Binary file removed orp/orp_search/construction_legislation.xlsx
Binary file not shown.
150 changes: 77 additions & 73 deletions orp/orp_search/legislation.py
@@ -4,9 +4,13 @@

from datetime import datetime

import pandas as pd
import requests # type: ignore

from numpy.f2py.auxfuncs import throw_error
from orp_search.config import SearchDocumentConfig
from orp_search.construction_legislation import ( # noqa: E501
construction_legislation_dataframe,
)
from orp_search.utils.documents import insert_or_update_document

logger = logging.getLogger(__name__)
@@ -17,17 +21,21 @@ def _encode_url(url):
return encoded_bytes.decode("utf-8")


def _get_url_data(url, config=None):
def _get_url_data(config, url):
try:
response = requests.get( # nosec BXXX
url, timeout=10 if not config.timeout else config.timeout
)
response = requests.get(url, timeout=config.timeout) # nosec BXXX
if response.status_code == 200:
return response.text

# If the status code is not 200, log the error
logger.error(
f"error fetching legislation data "
f"[{response.status_code}]: {response.reason}"
)
return None
except requests.exceptions.RequestException as e:
logger.error(f"error fetching legislation data: {e}")
return None
return e


class Legislation:
Expand All @@ -41,63 +49,76 @@ def __init__(self):
"ukm": "http://www.legislation.gov.uk/namespaces/metadata",
}

def parse_dataset_and_store(self):
# Read construction_legislation.xlsx into panda
dataset = pd.read_excel("construction_legislation.xlsx")
def build_cache(self):
logger.info("building legislation cache...")
dataset = construction_legislation_dataframe()

# For each row, get the URL from the column named
# 'URI to Extract XML Data'
# and store the XML data in a list
xml_data = []
for index, row in dataset.iterrows():
url = row["URI to Extract XML Data"]
data = _get_url_data(url)
if data:
xml_data.append(data)

# For each xml_data parse the XML data but extracting the
# following fields and store the data in a dictionary and
# the key should be identifier
for data in xml_data:
root = ET.fromstring(data) # nosec BXXX
identifier = root.find(
".//dc:identifier", self._namespaces
).text # nosec BXXX
title = root.find(
".//dc:title", self._namespaces
).text # nosec BXXX
description = root.find(
".//dc:description", self._namespaces
).text # nosec BXXX
format = root.find(
".//dc:format", self._namespaces
).text # nosec BXXX
language = root.find(
".//dc:language", self._namespaces
).text # nosec BXXX
publisher = root.find(
".//dc:publisher", self._namespaces
).text # nosec BXXX
modified = root.find(
".//dc:modified", self._namespaces
).text # nosec BXXX
valid = root.find(
".//dct:valid", self._namespaces
).text # nosec BXXX

document_json = self._to_json(
description,
format,
identifier,
language,
modified,
publisher,
title,
valid,
logger.info(
f"fetching data from page {index + 1} / "
f"{len(dataset)}: {url}..."
)

# Insert or update the document
insert_or_update_document(document_json)
try:
config = SearchDocumentConfig(search_query="", timeout=10)
data = _get_url_data(config, url)

if data is None:
logger.error(
f"error fetching data from {url}. no data returned"
)
raise Exception(
f"error fetching data from {url}. no data returned"
)

if data:
logger.info(f"parsing data from {url}...")
root = ET.fromstring(data) # nosec BXXX
identifier = root.find(
".//dc:identifier", self._namespaces
).text # nosec BXXX
title = root.find(
".//dc:title", self._namespaces
).text # nosec BXXX
description = root.find(
".//dc:description", self._namespaces
).text # nosec BXXX
format = root.find(
".//dc:format", self._namespaces
).text # nosec BXXX
language = root.find(
".//dc:language", self._namespaces
).text # nosec BXXX
publisher = root.find(
".//dc:publisher", self._namespaces
).text # nosec BXXX
modified = root.find(
".//dc:modified", self._namespaces
).text # nosec BXXX
valid = root.find(
".//dct:valid", self._namespaces
).text # nosec BXXX

document_json = self._to_json(
description,
format,
identifier,
language,
modified,
publisher,
title,
valid,
)

# Insert or update the document
insert_or_update_document(document_json)
except Exception as e:
logger.error(f"error fetching data from {url}: {e}")
throw_error(f"error fetching data from {url}: {e}")

def _to_json(
self,
Expand All @@ -111,6 +132,7 @@ def _to_json(
valid,
):
return {
"query": {"search_terms": []},
"id": _encode_url(identifier),
"title": title,
"identifier": identifier,
@@ -128,23 +150,5 @@
"%Y-%m-%d"
),
"type": "legislation",
"coverage": "gb",
"audience": None,
"subject": None,
"license": None,
"regulatory_topics": None,
"status": None,
"date_uploaded_to_orp": None,
"has_format": None,
"is_format_of": None,
"has_version": None,
"is_version_of": None,
"references": None,
"is_referenced_by": None,
"has_part": None,
"is_part_of": None,
"is_replaced_by": None,
"replaces": None,
"related_legislation": None,
"score": 0,
}
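A self-contained sketch of the namespace-aware metadata lookup that build_cache performs above, assuming the standard Dublin Core namespace URIs for the dc/dct prefixes (the XML fragment is illustrative, not real legislation.gov.uk output):

```python
import xml.etree.ElementTree as ET

namespaces = {
    "dc": "http://purl.org/dc/elements/1.1/",  # assumed standard DC elements URI
    "dct": "http://purl.org/dc/terms/",        # assumed standard DC terms URI
}

# Illustrative fragment in the rough shape of the metadata being parsed.
xml = """
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
          xmlns:dct="http://purl.org/dc/terms/">
  <dc:identifier>http://www.legislation.gov.uk/ukpga/2000/1</dc:identifier>
  <dc:title>Example Construction Act 2000</dc:title>
  <dct:valid>2024-01-01</dct:valid>
</metadata>
"""

root = ET.fromstring(xml)
identifier = root.find(".//dc:identifier", namespaces).text
title = root.find(".//dc:title", namespaces).text
valid = root.find(".//dct:valid", namespaces).text
print(identifier, title, valid)
```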
2 changes: 1 addition & 1 deletion orp/orp_search/models.py
@@ -7,7 +7,7 @@


class DataResponseModel(models.Model):
query = models.JSONField()
query = models.JSONField() # TODO: remove this field
title = models.CharField(max_length=_default_char_size)
identifier = models.URLField(unique=True)
publisher = models.CharField(
95 changes: 45 additions & 50 deletions orp/orp_search/public_gateway.py
@@ -10,68 +10,65 @@
logger = logging.getLogger(__name__)


class PublicGateway:
def __init__(self):
"""
Initializes the API client with the base URL for the Trade Data API.
def _normalize_date(date_str):
if date_str is None:
return None

Attributes:
base_url (str): The base URL of the Trade Data API.
"""
self.base_url = "https://data.api.trade.gov.uk"

def _normalize_date(self, date_str):
if date_str is None:
return None

# If the date is in YYYY format, add "-01-01"
if len(date_str) == 4:
return f"{date_str}-01-01"
# If the date is in YYYY-MM format, add "-01"
elif len(date_str) == 7:
return f"{date_str}-01"
# Otherwise, assume the date is already in YYYY-MM-DD format
return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")

def _build_like_conditions(self, field, and_terms, or_terms):
"""
# If the date is in YYYY format, add "-01-01"
if len(date_str) == 4:
return f"{date_str}-01-01"
# If the date is in YYYY-MM format, add "-01"
elif len(date_str) == 7:
return f"{date_str}-01"
# Otherwise, assume the date is already in YYYY-MM-DD format
return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")

Generates SQL LIKE conditions.

Args:
field (str): The database field to apply the LIKE condition to.
terms (list of str): A list of terms to include in the LIKE
condition.
def _build_like_conditions(field, and_terms, or_terms):
"""
Returns:
str: A string containing the LIKE conditions combined with 'OR'.
"""
# Put each term into the list
terms = and_terms
Generates SQL LIKE conditions.
# If there are OR terms, then put an OR condition between them
if or_terms:
terms.append("(" + " OR ".join(or_terms) + ")")
Args:
field (str): The database field to apply the LIKE condition to.
terms (list of str): A list of terms to include in the LIKE
condition.
return " OR ".join(
[f"{field} LIKE LOWER('%{term}%')" for term in terms]
)
Returns:
str: A string containing the LIKE conditions combined with 'OR'.
"""
# Put each term into the list
terms = and_terms

def get_all(self, config=None):
logger.info("fetching all data from orpd...")
# If there are OR terms, then put an OR condition between them
if or_terms:
terms.append("(" + " OR ".join(or_terms) + ")")

return " OR ".join([f"{field} LIKE LOWER('%{term}%')" for term in terms])

# Base URL for the API
url = (

class PublicGateway:
def __init__(self):
"""
Initializes the API client with the base URL for the Trade Data API.
Attributes:
base_url (str): The base URL of the Trade Data API.
"""
self._base_url = (
"https://data.api.trade.gov.uk/v1/datasets/orp-regulations"
"/versions/v1.0.0/data"
)

def build_cache(self, config=None):
logger.info("fetching all data from orpd...")

# URL encode the query for the API request
params = {"format": "json"}

# Make the GET request
response = requests.get(
url,
self._base_url,
params=params,
timeout=10 if not config.timeout else config.timeout, # nosec BXXX
)
@@ -90,11 +87,9 @@ def get_all(self, config=None):
"language": row["language"],
"format": row["format"],
"description": row["description"],
"date_issued": self._normalize_date(row["date_issued"]),
"date_modified": self._normalize_date(
row["date_modified"]
),
"date_valid": self._normalize_date(row["date_valid"]),
"date_issued": _normalize_date(row["date_issued"]),
"date_modified": _normalize_date(row["date_modified"]),
"date_valid": _normalize_date(row["date_valid"]),
"audience": row["audience"],
"coverage": row["coverage"],
"subject": row["subject"],
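A quick worked example of the extracted _normalize_date helper, following the padding rules shown above (the import path is assumed from the repository layout in this diff):

```python
from orp_search.public_gateway import _normalize_date  # assumed module path

assert _normalize_date("2023") == "2023-01-01"        # year only -> 1 January
assert _normalize_date("2023-05") == "2023-05-01"     # year-month -> first of month
assert _normalize_date("2023-05-17") == "2023-05-17"  # full date -> validated, unchanged
assert _normalize_date(None) is None                  # missing value passes through
```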