Skip to content

Commit

Permalink
Merge pull request #54 from uktrade/feature/orpd-123-cleanup
Browse files Browse the repository at this point in the history
feat(orpd-123): cleanup
  • Loading branch information
hareshkainthdbt authored Nov 25, 2024
2 parents aebb411 + 123f4e4 commit 62004a3
Show file tree
Hide file tree
Showing 9 changed files with 304 additions and 59 deletions.
11 changes: 11 additions & 0 deletions orp/orp_search/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@


class SearchConfig(AppConfig):
    """Django app configuration for the ORP search application.

    Registers the ``orp_search`` app under a human-readable name and
    selects ``BigAutoField`` as the type for auto-created primary keys.
    """

    default_auto_field = "django.db.models.BigAutoField"
    name = "orp_search"
    verbose_name = "ORP application functionality"
74 changes: 60 additions & 14 deletions orp/orp_search/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,41 @@ def __init__(
id=None,
):
"""
Initializes a new instance of the class.
Initializes the SearchRequest object with the given parameters.
:param searchTerms: A comma-separated string of search terms.
:param documentTypes: Optional. A list of document types
to filter the search.
:param timeout: Optional. The timeout in seconds for the search
request.
Args:
search_query (str): The search query string.
document_types (Optional[List[str]]):
A list of document types to filter by. Defaults to None.
timeout (Optional[int]):
The timeout value for the request in seconds. Defaults to None.
limit (int):
The maximum number of search results to return. Defaults to 10.
offset (int):
The starting position of the search results. Defaults to 1.
publisher_names (Optional[List[str]]):
A list of publisher names to filter by. Defaults to None.
sort_by (Optional[str]):
The field by which to sort the search results. Defaults to
None.
id (Optional[str]):
An optional identifier for the search request. Defaults to
None.
Attributes:
search_query (str): The search query string.
document_types (Optional[List[str]]):
A list of document types to filter by.
timeout (Optional[int]):
The timeout value for the request in seconds.
limit (int): The maximum number of search results to return.
offset (int): The starting position of the search results.
publisher_names (Optional[List[str]]):
A list of publisher names to filter by.
sort_by (Optional[str]):
The field by which to sort the search results.
id (Optional[str]):
An optional identifier for the search request.
"""
self.search_query = search_query
self.document_types = (
Expand All @@ -46,17 +74,20 @@ def __init__(

def validate(self):
"""
Validates the constraints defined for offset, limit,
and sort_by attributes.
Validates the presence of search terms.
Returns:
bool
True if all constraints are satisfied, False otherwise.
Checks if the 'searchTerms' attribute exists and is non-empty. Logs
an error message and returns False if 'searchTerms' is missing or
empty.
Notes:
- The offset must be a non-negative integer.
- The limit must be a non-negative integer.
- The sort_by attribute, if specified, must be either
'recent' or 'relevance'.
Returns
-------
bool
True if 'searchTerms' is present and non-empty, False otherwise.
Errors are logged if any of the constraints are violated.
"""
if self.offset < 0:
logger.error("offset must be a positive integer")
Expand All @@ -73,6 +104,21 @@ def validate(self):
return True

def print_to_log(self):
"""
Logs the current state of various search parameters.
Logs the following attributes:
- search_query: The search query string.
- document_types: The list of document types being searched.
- timeout: The timeout value for the search query.
- limit: The maximum number of results to return.
- offset: The starting point from which results are returned.
- publisher_names: The list of publisher names to filter the search.
- sort_by: The criteria for sorting the search results.
- id: The unique identifier for the search query.
"""
logger.info(f"search_query: {self.search_query}")
logger.info(f"document_types: {self.document_types}")
logger.info(f"timeout: {self.timeout}")
Expand Down
6 changes: 6 additions & 0 deletions orp/orp_search/construction_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1936,6 +1936,12 @@


def construction_legislation_dataframe():
"""
Reads CSV data from a predefined string, converts the data into a Pandas DataFrame, and returns the resulting DataFrame.
Returns:
pandas.DataFrame: The dataframe containing the CSV data.
"""
# Use StringIO to simulate reading from a file
csv_data = StringIO(_csv_text)

Expand Down
88 changes: 82 additions & 6 deletions orp/orp_search/legislation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import base64
import logging
import re
import xml.etree.ElementTree as ET # nosec BXXX
Expand All @@ -20,12 +19,23 @@
logger = logging.getLogger(__name__)


def _encode_url(url):
encoded_bytes = base64.urlsafe_b64encode(url.encode("utf-8"))
return encoded_bytes.decode("utf-8")


def _get_url_data(config, url):
"""
Fetch data from a given URL and return the response text if successful,
otherwise log the error.
Parameters:
- config: Configuration object that includes the request timeout.
- url: String representing the URL to request.
Returns:
- Response text if the status code is 200.
- None if the response status code is not 200, or if there is an exception
during the request.
Logs:
- Error messages for request failures and non-200 response codes.
"""
try:
response = requests.get(url, timeout=config.timeout) # nosec BXXX
if response.status_code == 200:
Expand All @@ -43,11 +53,36 @@ def _get_url_data(config, url):


def _get_text_from_element(element: Optional[ET.Element]) -> Optional[str]:
"""
Extracts and returns the text content from an XML element if it exists.
This function checks if the provided XML element is not None.
If the element is available, it returns the text content of that element.
If the element is None, it returns None.
Parameters:
element (Optional[ET.Element]):
The XML element from which to extract the text.
Returns:
Optional[str]:
The text content of the element if it exists, otherwise None.
"""
return element.text if element is not None else None


class Legislation:
def __init__(self):
"""
Initializes the class instance and defines the XML namespaces.
Attributes:
_namespaces (dict):
A dictionary containing XML namespaces with their
corresponding URLs. These namespaces are used to
refer to elements in XML documents adhering to
different XML schemas.
"""
# Define the XML namespaces
self._namespaces = {
"leg": "http://www.legislation.gov.uk/namespaces/legislation",
Expand All @@ -58,6 +93,31 @@ def __init__(self):
}

def build_cache(self, config: SearchDocumentConfig):
"""
Builds a cache of legislation documents by retrieving XML data from
URLs specified in a DataFrame.
Parameters:
config (SearchDocumentConfig): Configuration object for searching
documents.
Raises:
Exception: If there's an error fetching data from the URL or no data
is returned.
Functionality:
1. Logs the start of the caching process.
2. Loads legislation data into a DataFrame.
3. Iterates over each row in the DataFrame to fetch XML data from
specified URLs.
4. Extracts and parses XML data, logging relevant informational
and error messages.
5. Extracts specific fields (identifier, title, description, etc.)
from the parsed XML data.
6. Converts the extracted data to JSON format.
7. Inserts or updates the document in the cache.
8. Logs errors and re-raises them if data retrieval fails.
"""
logger.info("building legislation cache...")
dataset = construction_legislation_dataframe()

Expand Down Expand Up @@ -138,6 +198,22 @@ def _to_json(
title,
valid,
):
"""
Converts given parameters into a JSON-like dictionary format.
Arguments:
description (str): Description of the item.
format (str): Format of the item.
identifier (str): Unique identifier for the item.
language (str): Language in which the item is available.
modified (str): The date when the item was last modified.
publisher (str): The publisher of the item.
title (str): The title of the item.
valid (str): The date until which the item is considered valid.
Returns:
dict: A dictionary containing the item details in a structured format.
"""
return {
"id": generate_short_uuid(),
"title": title,
Expand Down
63 changes: 46 additions & 17 deletions orp/orp_search/models.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,57 @@
import logging

from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.db import models

logger = logging.getLogger(__name__)


class DataResponseModel(models.Model):
"""
DataResponseModel
A Django model representing various metadata fields related to data
responses.
Attributes:
title: Title of the data response.
identifier: Unique identifier for the data response.
publisher: Entity that published the data response.
publisher_id: Unique ID of the publisher.
language: Language in which the data response is published.
format: Format of the data response.
description: Brief description of the data response.
date_issued: Date when the data response was issued.
date_modified: Date when the data response was last modified.
date_valid: Validity date of the data response as text.
sort_date: Date used for sorting the data responses.
audience: Intended audience for the data response.
coverage: Coverage details of the data response.
subject: Subject matter of the data response.
type: Type of the data response.
license: Licensing information of the data response.
regulatory_topics: Topics covered by the data response.
status: Current status of the data response.
date_uploaded_to_orp: Date when the data response was uploaded to ORP.
has_format: Format details that the data response has.
is_format_of:
Indicates if the data response is a format of another resource.
has_version: Version details that the data response has.
is_version_of:
Indicates if the data response is a version of another resource.
references: References cited in the data response.
is_referenced_by:
Indicates if the data response is referenced by another resource.
has_part: Part details that the data response has.
is_part_of:
Indicates if the data response is a part of another resource.
is_replaced_by:
Indicates if the data response is replaced by another resource.
replaces: Indicates if the data response replaces another resource.
related_legislation: Related legislation details for the data response.
id: Primary key of the data response.
score: Score associated with the data response, default is 0.
"""

title = models.TextField(null=True, blank=True)
identifier = models.TextField(null=True, blank=True)
publisher = models.TextField(null=True, blank=True)
Expand Down Expand Up @@ -40,18 +84,3 @@ class DataResponseModel(models.Model):
related_legislation = models.TextField(null=True, blank=True)
id = models.TextField(primary_key=True)
score = models.IntegerField(null=True, blank=True, default=0)

def __str__(self):
    # Human-readable representation: the document's title.
    # NOTE(review): `title` is a nullable TextField, so this may return
    # None — confirm callers/templates tolerate that.
    return self.title

def clean(self):
    """Check whether the ``id`` field looks like a URL.

    Non-URL ids are acceptable (the field stores both forms), so a
    failed URL validation is deliberately ignored rather than raised.
    """
    try:
        URLValidator()(self.id)
    except ValidationError:
        # Not a URL — that's fine; ids may be plain strings as well.
        pass
19 changes: 12 additions & 7 deletions orp/orp_search/utils/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,35 +9,34 @@


def clear_all_documents():
logger.info("clearing all documents from table...")
logger.debug("clearing all documents from table...")
try:
DataResponseModel.objects.all().delete()
logger.info("documents cleared from table")
logger.debug("documents cleared from table")
except Exception as e:
logger.error(f"error clearing documents: {e}")
throw_error(f"error clearing documents: {e}")


def insert_or_update_document(document_json):
try:
logger.info("creating document...")
logger.debug("creating document...")
logger.debug(f"document: {document_json}")
# Try to create a new document
document = DataResponseModel(**document_json)
document.full_clean()
document.save()
except Exception as e:
logger.error(f"error creating document: {document_json}")
logger.error(f"error: {e}")
logger.info("document already exists, updating...")
logger.debug("document already exists, updating...")

# If a duplicate key error occurs, update the existing document
try:
document = DataResponseModel.objects.get(pk=document_json["id"])
for key, value in document_json.items():
setattr(document, key, value)
document.save()
logger.info(f"document updated: {document}")
logger.debug(f"document updated: {document}")
except Exception as e:
logger.error(f"error updating document: {document_json}")
logger.error(f"error: {e}")
Expand Down Expand Up @@ -86,8 +85,14 @@ def _extract_terms(search_query):


def generate_short_uuid():
# Generate a UUID
"""
Generates a short, URL-safe UUID.
Returns:
str: A URL-safe base64 encoded UUID truncated to 22 characters.
"""
uid = uuid.uuid4()

# Encode it to base64
uid_b64 = base64.urlsafe_b64encode(uid.bytes).rstrip(b"=").decode("ascii")
return uid_b64[
Expand Down
Loading

0 comments on commit 62004a3

Please sign in to comment.