Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(orpd-23):cleanup #54

Merged
merged 2 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions orp/orp_search/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@


class SearchConfig(AppConfig):
    """
    Django application configuration for the ORP Search app.

    Attributes:
        name (str): The full Python path to the application.
        verbose_name (str): A human-readable name for the application.
        default_auto_field (str): Dotted path of the field class used for
            automatically created primary keys.
    """

    name = "orp_search"
    verbose_name = "ORP application functionality"
    default_auto_field = "django.db.models.BigAutoField"
74 changes: 60 additions & 14 deletions orp/orp_search/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,41 @@ def __init__(
id=None,
):
"""
Initializes a new instance of the class.
Initializes the SearchRequest object with the given parameters.

:param searchTerms: A comma-separated string of search terms.
:param documentTypes: Optional. A list of document types
to filter the search.
:param timeout: Optional. The timeout in seconds for the search
request.
Args:
search_query (str): The search query string.
document_types (Optional[List[str]]):
A list of document types to filter by. Defaults to None.
timeout (Optional[int]):
The timeout value for the request in seconds. Defaults to None.
limit (int):
The maximum number of search results to return. Defaults to 10.
offset (int):
The starting position of the search results. Defaults to 1.
publisher_names (Optional[List[str]]):
A list of publisher names to filter by. Defaults to None.
sort_by (Optional[str]):
The field by which to sort the search results. Defaults to
None.
id (Optional[str]):
An optional identifier for the search request. Defaults to
None.

Attributes:
search_query (str): The search query string.
document_types (Optional[List[str]]):
A list of document types to filter by.
timeout (Optional[int]):
The timeout value for the request in seconds.
limit (int): The maximum number of search results to return.
offset (int): The starting position of the search results.
publisher_names (Optional[List[str]]):
A list of publisher names to filter by.
sort_by (Optional[str]):
The field by which to sort the search results.
id (Optional[str]):
An optional identifier for the search request.
"""
self.search_query = search_query
self.document_types = (
Expand All @@ -46,17 +74,20 @@ def __init__(

def validate(self):
"""
Validates the constraints defined for offset, limit,
and sort_by attributes.

Validates the presence of search terms.
Returns:
bool
True if all constraints are satisfied, False otherwise.

Checks if the 'searchTerms' attribute exists and is non-empty. Logs
an error message and returns False if 'searchTerms' is missing or
empty.
Notes:
- The offset must be a non-negative integer.
- The limit must be a non-negative integer.
- The sort_by attribute, if specified, must be either
'recent' or 'relevance'.

Returns
-------
bool
True if 'searchTerms' is present and non-empty, False otherwise.
Errors are logged if any of the constraints are violated.
"""
if self.offset < 0:
logger.error("offset must be a positive integer")
Expand All @@ -73,6 +104,21 @@ def validate(self):
return True

def print_to_log(self):
"""

Logs the current state of various search parameters.

Logs the following attributes:
- search_query: The search query string.
- document_types: The list of document types being searched.
- timeout: The timeout value for the search query.
- limit: The maximum number of results to return.
- offset: The starting point from which results are returned.
- publisher_names: The list of publisher names to filter the search.
- sort_by: The criteria for sorting the search results.
- id: The unique identifier for the search query.

"""
logger.info(f"search_query: {self.search_query}")
logger.info(f"document_types: {self.document_types}")
logger.info(f"timeout: {self.timeout}")
Expand Down
6 changes: 6 additions & 0 deletions orp/orp_search/construction_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1936,6 +1936,12 @@


def construction_legislation_dataframe():
"""
Reads CSV data from a predefined string, converts the data into a Pandas DataFrame, and returns the resulting DataFrame.

Returns:
pandas.DataFrame: The dataframe containing the CSV data.
"""
# Use StringIO to simulate reading from a file
csv_data = StringIO(_csv_text)

Expand Down
88 changes: 82 additions & 6 deletions orp/orp_search/legislation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import base64
import logging
import re
import xml.etree.ElementTree as ET # nosec BXXX
Expand All @@ -20,12 +19,23 @@
logger = logging.getLogger(__name__)


def _encode_url(url):
    """
    Encode a URL string as URL-safe base64 text.

    The string is encoded to UTF-8 bytes, base64-encoded with the URL-safe
    alphabet ('-' and '_' instead of '+' and '/'), and decoded back to a
    string. Trailing '=' padding is kept.

    Parameters:
        url (str): The URL (or any text) to encode.

    Returns:
        str: The URL-safe base64 representation of ``url``.
    """
    encoded_bytes = base64.urlsafe_b64encode(url.encode("utf-8"))
    return encoded_bytes.decode("utf-8")


def _get_url_data(config, url):
"""
Fetch data from a given URL and return the response text if successful,
otherwise log the error.

Parameters:
- config: Configuration object that includes the request timeout.
- url: String representing the URL to request.

Returns:
- Response text if the status code is 200.
- None if the response status code is not 200, or if there is an exception
during the request.

Logs:
- Error messages for request failures and non-200 response codes.
"""
try:
response = requests.get(url, timeout=config.timeout) # nosec BXXX
if response.status_code == 200:
Expand All @@ -43,11 +53,36 @@ def _get_url_data(config, url):


def _get_text_from_element(element: Optional[ET.Element]) -> Optional[str]:
    """
    Return the text content of an XML element, or None if there is none.

    Intended for results of lookups such as ``Element.find``, which return
    None when nothing matches.

    Parameters:
        element (Optional[ET.Element]):
            The XML element from which to extract the text, or None.

    Returns:
        Optional[str]:
            ``element.text`` when an element is given (this may itself be
            None for an element without text), otherwise None.
    """
    # Compare against None explicitly: an Element without child nodes is
    # falsy, so a plain truthiness test would discard valid leaf elements.
    return element.text if element is not None else None


class Legislation:
def __init__(self):
"""
Initializes the class instance and defines the XML namespaces.

Attributes:
_namespaces (dict):
A dictionary containing XML namespaces with their
corresponding URLs. These namespaces are used to
refer to elements in XML documents adhering to
different XML schemas.
"""
# Define the XML namespaces
self._namespaces = {
"leg": "http://www.legislation.gov.uk/namespaces/legislation",
Expand All @@ -58,6 +93,31 @@ def __init__(self):
}

def build_cache(self, config: SearchDocumentConfig):
"""
Builds a cache of legislation documents by retrieving XML data from
URLs specified in a DataFrame.

Parameters:
config (SearchDocumentConfig): Configuration object for searching
documents.

Raises:
Exception: If there's an error fetching data from the URL or no data
is returned.

Functionality:
1. Logs the start of the caching process.
2. Loads legislation data into a DataFrame.
3. Iterates over each row in the DataFrame to fetch XML data from
specified URLs.
4. Extracts and parses XML data, logging relevant informational
and error messages.
5. Extracts specific fields (identifier, title, description, etc.)
from the parsed XML data.
6. Converts the extracted data to JSON format.
7. Inserts or updates the document in the cache.
8. Logs errors and re-raises them if data retrieval fails.
"""
logger.info("building legislation cache...")
dataset = construction_legislation_dataframe()

Expand Down Expand Up @@ -138,6 +198,22 @@ def _to_json(
title,
valid,
):
"""
Converts given parameters into a JSON-like dictionary format.

Arguments:
description (str): Description of the item.
format (str): Format of the item.
identifier (str): Unique identifier for the item.
language (str): Language in which the item is available.
modified (str): The date when the item was last modified.
publisher (str): The publisher of the item.
title (str): The title of the item.
valid (str): The date until which the item is considered valid.

Returns:
dict: A dictionary containing the item details in a structured format.
"""
return {
"id": generate_short_uuid(),
"title": title,
Expand Down
63 changes: 46 additions & 17 deletions orp/orp_search/models.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,57 @@
import logging

from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.db import models

logger = logging.getLogger(__name__)


class DataResponseModel(models.Model):
"""
DataResponseModel

A Django model representing various metadata fields related to data
responses.

Attributes:
title: Title of the data response.
identifier: Unique identifier for the data response.
publisher: Entity that published the data response.
publisher_id: Unique ID of the publisher.
language: Language in which the data response is published.
format: Format of the data response.
description: Brief description of the data response.
date_issued: Date when the data response was issued.
date_modified: Date when the data response was last modified.
date_valid: Validity date of the data response as text.
sort_date: Date used for sorting the data responses.
audience: Intended audience for the data response.
coverage: Coverage details of the data response.
subject: Subject matter of the data response.
type: Type of the data response.
license: Licensing information of the data response.
regulatory_topics: Topics covered by the data response.
status: Current status of the data response.
date_uploaded_to_orp: Date when the data response was uploaded to ORP.
has_format: Format details that the data response has.
is_format_of:
Indicates if the data response is a format of another resource.
has_version: Version details that the data response has.
is_version_of:
Indicates if the data response is a version of another resource.
references: References cited in the data response.
is_referenced_by:
Indicates if the data response is referenced by another resource.
has_part: Part details that the data response has.
is_part_of:
Indicates if the data response is a part of another resource.
is_replaced_by:
Indicates if the data response is replaced by another resource.
replaces: Indicates if the data response replaces another resource.
related_legislation: Related legislation details for the data response.
id: Primary key of the data response.
score: Score associated with the data response, default is 0.
"""

title = models.TextField(null=True, blank=True)
identifier = models.TextField(null=True, blank=True)
publisher = models.TextField(null=True, blank=True)
Expand Down Expand Up @@ -40,18 +84,3 @@ class DataResponseModel(models.Model):
related_legislation = models.TextField(null=True, blank=True)
id = models.TextField(primary_key=True)
score = models.IntegerField(null=True, blank=True, default=0)

def __str__(self):
return self.title

def clean(self):
"""
Validate the id field to check if it's a URL or not.
"""
url_validator = URLValidator()
try:
url_validator(self.id)
except ValidationError:
# It's not a URL, which is acceptable as it's a
# CharField that supports both
pass
19 changes: 12 additions & 7 deletions orp/orp_search/utils/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,35 +9,34 @@


def clear_all_documents():
logger.info("clearing all documents from table...")
logger.debug("clearing all documents from table...")
try:
DataResponseModel.objects.all().delete()
logger.info("documents cleared from table")
logger.debug("documents cleared from table")
except Exception as e:
logger.error(f"error clearing documents: {e}")
throw_error(f"error clearing documents: {e}")


def insert_or_update_document(document_json):
try:
logger.info("creating document...")
logger.debug("creating document...")
logger.debug(f"document: {document_json}")
# Try to create a new document
document = DataResponseModel(**document_json)
document.full_clean()
document.save()
except Exception as e:
logger.error(f"error creating document: {document_json}")
logger.error(f"error: {e}")
logger.info("document already exists, updating...")
logger.debug("document already exists, updating...")

# If a duplicate key error occurs, update the existing document
try:
document = DataResponseModel.objects.get(pk=document_json["id"])
for key, value in document_json.items():
setattr(document, key, value)
document.save()
logger.info(f"document updated: {document}")
logger.debug(f"document updated: {document}")
except Exception as e:
logger.error(f"error updating document: {document_json}")
logger.error(f"error: {e}")
Expand Down Expand Up @@ -86,8 +85,14 @@ def _extract_terms(search_query):


def generate_short_uuid():
# Generate a UUID
"""
Generates a short, URL-safe UUID.

Returns:
str: A URL-safe base64 encoded UUID truncated to 22 characters.
"""
uid = uuid.uuid4()

# Encode it to base64
uid_b64 = base64.urlsafe_b64encode(uid.bytes).rstrip(b"=").decode("ascii")
return uid_b64[
Expand Down
Loading