From 0c037076fe689548e0c06d9cb84dc766829aa4ed Mon Sep 17 00:00:00 2001
From: Haresh Kainth
Date: Mon, 25 Nov 2024 22:37:09 +0000
Subject: [PATCH 1/2] chore: add detailed docstrings to various modules and
 functions

This commit introduces comprehensive docstrings across multiple files in
the project to improve code readability and maintainability. These
additions clarify the purpose, parameters, returns, and additional notes
for existing functions, methods, and class definitions. The change also
removes the _encode_url helper from legislation.py and the __str__ and
clean methods from DataResponseModel. Enhanced documentation will aid
future developers in understanding the codebase more efficiently.
---
 orp/orp_search/apps.py                     | 11 +++
 orp/orp_search/config.py                   | 74 ++++++++++++++----
 orp/orp_search/construction_legislation.py |  6 ++
 orp/orp_search/legislation.py              | 88 ++++++++++++++++++++--
 orp/orp_search/models.py                   | 63 +++++++++++-----
 orp/orp_search/utils/documents.py          |  7 +-
 orp/orp_search/utils/paginate.py           | 46 +++++++++++
 orp/orp_search/views.py                    | 31 +++++++-
 8 files changed, 285 insertions(+), 41 deletions(-)

diff --git a/orp/orp_search/apps.py b/orp/orp_search/apps.py
index d8e653f..728cd76 100644
--- a/orp/orp_search/apps.py
+++ b/orp/orp_search/apps.py
@@ -2,6 +2,17 @@


 class SearchConfig(AppConfig):
+    """
+    Configuration class for the ORP Search application.
+
+    Attributes:
+        name (str): The full Python path to the application.
+        verbose_name (str): A human-readable name for the application.
+        default_auto_field (str): Specifies the type of auto-created
+            primary key field to use.
+
+    """
+
     name = "orp_search"
     verbose_name = "ORP application functionality"
     default_auto_field = "django.db.models.BigAutoField"
diff --git a/orp/orp_search/config.py b/orp/orp_search/config.py
index 1c03ec1..f08849a 100644
--- a/orp/orp_search/config.py
+++ b/orp/orp_search/config.py
@@ -16,13 +16,41 @@ def __init__(
         id=None,
     ):
         """
-        Initializes a new instance of the class.
+        Initializes a SearchDocumentConfig with the given parameters.

-        :param searchTerms: A comma-separated string of search terms.
-        :param documentTypes: Optional. A list of document types
-        to filter the search.
-        :param timeout: Optional. The timeout in seconds for the search
-        request.
+        Args:
+            search_query (str): The search query string.
+            document_types (Optional[List[str]]):
+                A list of document types to filter by. Defaults to None.
+            timeout (Optional[int]):
+                The timeout value for the request in seconds. Defaults to None.
+            limit (int):
+                The maximum number of search results to return. Defaults to 10.
+            offset (int):
+                The starting position of the search results. Defaults to 1.
+            publisher_names (Optional[List[str]]):
+                A list of publisher names to filter by. Defaults to None.
+            sort_by (Optional[str]):
+                The field by which to sort the search results. Defaults to
+                None.
+            id (Optional[str]):
+                An optional identifier for the search request. Defaults to
+                None.
+
+        Attributes:
+            search_query (str): The search query string.
+            document_types (Optional[List[str]]):
+                A list of document types to filter by.
+            timeout (Optional[int]):
+                The timeout value for the request in seconds.
+            limit (int): The maximum number of search results to return.
+            offset (int): The starting position of the search results.
+            publisher_names (Optional[List[str]]):
+                A list of publisher names to filter by.
+            sort_by (Optional[str]):
+                The field by which to sort the search results.
+            id (Optional[str]):
+                An optional identifier for the search request.
""" self.search_query = search_query self.document_types = ( @@ -46,17 +74,20 @@ def __init__( def validate(self): """ + Validates the constraints defined for offset, limit, + and sort_by attributes. - Validates the presence of search terms. + Returns: + bool + True if all constraints are satisfied, False otherwise. - Checks if the 'searchTerms' attribute exists and is non-empty. Logs - an error message and returns False if 'searchTerms' is missing or - empty. + Notes: + - The offset must be a non-negative integer. + - The limit must be a non-negative integer. + - The sort_by attribute, if specified, must be either + 'recent' or 'relevance'. - Returns - ------- - bool - True if 'searchTerms' is present and non-empty, False otherwise. + Errors are logged if any of the constraints are violated. """ if self.offset < 0: logger.error("offset must be a positive integer") @@ -73,6 +104,21 @@ def validate(self): return True def print_to_log(self): + """ + + Logs the current state of various search parameters. + + Logs the following attributes: + - search_query: The search query string. + - document_types: The list of document types being searched. + - timeout: The timeout value for the search query. + - limit: The maximum number of results to return. + - offset: The starting point from which results are returned. + - publisher_names: The list of publisher names to filter the search. + - sort_by: The criteria for sorting the search results. + - id: The unique identifier for the search query. + + """ logger.info(f"search_query: {self.search_query}") logger.info(f"document_types: {self.document_types}") logger.info(f"timeout: {self.timeout}") diff --git a/orp/orp_search/construction_legislation.py b/orp/orp_search/construction_legislation.py index 6a2ba63..010ac71 100644 --- a/orp/orp_search/construction_legislation.py +++ b/orp/orp_search/construction_legislation.py @@ -1936,6 +1936,12 @@ def construction_legislation_dataframe(): + """ + Reads CSV data from a predefined string, converts the data into a Pandas DataFrame, and returns the resulting DataFrame. + + Returns: + pandas.DataFrame: The dataframe containing the CSV data. + """ # Use StringIO to simulate reading from a file csv_data = StringIO(_csv_text) diff --git a/orp/orp_search/legislation.py b/orp/orp_search/legislation.py index 789d403..b0a930a 100644 --- a/orp/orp_search/legislation.py +++ b/orp/orp_search/legislation.py @@ -1,4 +1,3 @@ -import base64 import logging import re import xml.etree.ElementTree as ET # nosec BXXX @@ -20,12 +19,23 @@ logger = logging.getLogger(__name__) -def _encode_url(url): - encoded_bytes = base64.urlsafe_b64encode(url.encode("utf-8")) - return encoded_bytes.decode("utf-8") - - def _get_url_data(config, url): + """ + Fetch data from a given URL and return the response text if successful, + otherwise log the error. + + Parameters: + - config: Configuration object that includes the request timeout. + - url: String representing the URL to request. + + Returns: + - Response text if the status code is 200. + - None if the response status code is not 200, or if there is an exception + during the request. + + Logs: + - Error messages for request failures and non-200 response codes. + """ try: response = requests.get(url, timeout=config.timeout) # nosec BXXX if response.status_code == 200: @@ -43,11 +53,36 @@ def _get_url_data(config, url): def _get_text_from_element(element: Optional[ET.Element]) -> Optional[str]: + """ + Extracts and returns the text content from an XML element if it exists. 
+
+    This function checks if the provided XML element is not None.
+    If the element is available, it returns the text content of that element.
+    If the element is None, it returns None.
+
+    Parameters:
+        element (Optional[ET.Element]):
+            The XML element from which to extract the text.
+
+    Returns:
+        Optional[str]:
+            The text content of the element if it exists, otherwise None.
+    """
     return element.text if element is not None else None


 class Legislation:
     def __init__(self):
+        """
+        Initializes the class instance and defines the XML namespaces.
+
+        Attributes:
+            _namespaces (dict):
+                A dictionary containing XML namespaces with their
+                corresponding URLs. These namespaces are used to
+                refer to elements in XML documents adhering to
+                different XML schemas.
+        """
         # Define the XML namespaces
         self._namespaces = {
             "leg": "http://www.legislation.gov.uk/namespaces/legislation",
@@ -58,6 +93,31 @@
         }

     def build_cache(self, config: SearchDocumentConfig):
+        """
+        Builds a cache of legislation documents by retrieving XML data from
+        URLs specified in a DataFrame.
+
+        Parameters:
+            config (SearchDocumentConfig): Configuration object for searching
+                documents.
+
+        Raises:
+            Exception: If there's an error fetching data from the URL or no
+                data is returned.
+
+        Functionality:
+        1. Logs the start of the caching process.
+        2. Loads legislation data into a DataFrame.
+        3. Iterates over each row in the DataFrame to fetch XML data from
+           specified URLs.
+        4. Extracts and parses XML data, logging relevant informational
+           and error messages.
+        5. Extracts specific fields (identifier, title, description, etc.)
+           from the parsed XML data.
+        6. Converts the extracted data to JSON format.
+        7. Inserts or updates the document in the cache.
+        8. Logs errors and re-raises them if data retrieval fails.
+        """
         logger.info("building legislation cache...")
         dataset = construction_legislation_dataframe()

@@ -138,6 +198,22 @@ def _to_json(
         title,
         valid,
     ):
+        """
+        Converts given parameters into a JSON-like dictionary format.
+
+        Args:
+            description (str): Description of the item.
+            format (str): Format of the item.
+            identifier (str): Unique identifier for the item.
+            language (str): Language in which the item is available.
+            modified (str): The date when the item was last modified.
+            publisher (str): The publisher of the item.
+            title (str): The title of the item.
+            valid (str): The date until which the item is considered valid.
+
+        Returns:
+            dict: A dictionary containing the item details in a structured format.
+        """
         return {
             "id": generate_short_uuid(),
             "title": title,
diff --git a/orp/orp_search/models.py b/orp/orp_search/models.py
index 46b7551..11be85f 100644
--- a/orp/orp_search/models.py
+++ b/orp/orp_search/models.py
@@ -1,13 +1,57 @@
 import logging

-from django.core.exceptions import ValidationError
-from django.core.validators import URLValidator
 from django.db import models

 logger = logging.getLogger(__name__)


 class DataResponseModel(models.Model):
+    """
+    DataResponseModel
+
+    A Django model representing various metadata fields related to data
+    responses.
+
+    Attributes:
+        title: Title of the data response.
+        identifier: Unique identifier for the data response.
+        publisher: Entity that published the data response.
+        publisher_id: Unique ID of the publisher.
+        language: Language in which the data response is published.
+        format: Format of the data response.
+        description: Brief description of the data response.
+        date_issued: Date when the data response was issued.
+        date_modified: Date when the data response was last modified.
+        date_valid: Validity date of the data response as text.
+        sort_date: Date used for sorting the data responses.
+        audience: Intended audience for the data response.
+        coverage: Coverage details of the data response.
+        subject: Subject matter of the data response.
+        type: Type of the data response.
+        license: Licensing information of the data response.
+        regulatory_topics: Topics covered by the data response.
+        status: Current status of the data response.
+        date_uploaded_to_orp: Date when the data response was uploaded to ORP.
+        has_format: Format details that the data response has.
+        is_format_of:
+            Indicates if the data response is a format of another resource.
+        has_version: Version details that the data response has.
+        is_version_of:
+            Indicates if the data response is a version of another resource.
+        references: References cited in the data response.
+        is_referenced_by:
+            Indicates if the data response is referenced by another resource.
+        has_part: Part details that the data response has.
+        is_part_of:
+            Indicates if the data response is a part of another resource.
+        is_replaced_by:
+            Indicates if the data response is replaced by another resource.
+        replaces: Indicates if the data response replaces another resource.
+        related_legislation: Related legislation details for the data response.
+        id: Primary key of the data response.
+        score: Score associated with the data response, default is 0.
+    """
+
     title = models.TextField(null=True, blank=True)
     identifier = models.TextField(null=True, blank=True)
     publisher = models.TextField(null=True, blank=True)
@@ -40,18 +84,3 @@
     related_legislation = models.TextField(null=True, blank=True)
     id = models.TextField(primary_key=True)
     score = models.IntegerField(null=True, blank=True, default=0)
-
-    def __str__(self):
-        return self.title
-
-    def clean(self):
-        """
-        Validate the id field to check if it's a URL or not.
-        """
-        url_validator = URLValidator()
-        try:
-            url_validator(self.id)
-        except ValidationError:
-            # It's not a URL, which is acceptable as it's a
-            # CharField that supports both
-            pass
diff --git a/orp/orp_search/utils/documents.py b/orp/orp_search/utils/documents.py
index b3d7dcc..18e47ff 100644
--- a/orp/orp_search/utils/documents.py
+++ b/orp/orp_search/utils/documents.py
@@ -86,7 +86,12 @@ def _extract_terms(search_query):


 def generate_short_uuid():
-    # Generate a UUID
+    """
+    Generates a short, URL-safe UUID.
+
+    Returns:
+        str: A URL-safe base64 encoded UUID truncated to 22 characters.
+    """
     uid = uuid.uuid4()
     # Encode it to base64
     uid_b64 = base64.urlsafe_b64encode(uid.bytes).rstrip(b"=").decode("ascii")
     return uid_b64[
diff --git a/orp/orp_search/utils/paginate.py b/orp/orp_search/utils/paginate.py
index 310bf96..d45ab91 100644
--- a/orp/orp_search/utils/paginate.py
+++ b/orp/orp_search/utils/paginate.py
@@ -12,6 +12,52 @@ def paginate(
     context: dict, config: SearchDocumentConfig, results: QuerySet
 ) -> dict:
+    """
+    Paginates the given query set and updates the context with
+    pagination details.
+
+    Parameters:
+    - context (dict):
+        The context dictionary to be updated with pagination details.
+    - config (SearchDocumentConfig):
+        Configuration object containing limit and offset for pagination.
+    - results (QuerySet): The query set of documents to be paginated.
+
+    Returns:
+    - dict:
+        The updated context dictionary containing pagination information and
+        paginated documents.
+
+    Logs the time taken for the pagination process in different stages:
+    1. Time taken to paginate the documents.
+    2. Time taken to process regulatory topics for each document.
+    3. Time taken to update the context with pagination details.
+
+    Handles pagination exceptions:
+    - If the page is not an integer, defaults to the first page.
+    - If the page is empty, defaults to the last page.
+
+    Converts the paginated documents into a list of JSON objects with keys:
+    - "id"
+    - "title"
+    - "publisher"
+    - "description"
+    - "type"
+    - "date_modified"
+    - "date_valid"
+    - "regulatory_topics"
+
+    Updates the context with:
+    - Paginator object.
+    - Paginated documents in JSON format.
+    - Total number of results in the current page.
+    - Boolean to indicate if pagination is needed.
+    - Total number of results.
+    - Total number of pages.
+    - Current page number.
+    - Start index of the results in the current page.
+    - End index of the results in the current page.
+    """
     start_time = time.time()

     logger.info("paginating documents...")
diff --git a/orp/orp_search/views.py b/orp/orp_search/views.py
index 2ce4507..9e78be6 100644
--- a/orp/orp_search/views.py
+++ b/orp/orp_search/views.py
@@ -18,7 +18,8 @@

 @require_http_methods(["GET"])
 def document(request: HttpRequest, id) -> HttpResponse:
-    """Document details view.
+    """
+    Document details view.

     Handles the GET request to fetch details based on the provided id.
     """
@@ -45,6 +46,28 @@

 @require_http_methods(["GET"])
 def download_search_csv(request: HttpRequest) -> HttpResponse:
+    """
+    Handles the download of search results as a CSV file.
+
+    This view function is restricted to the GET HTTP method.
+    It accepts several query parameters to configure the
+    search:
+
+    - `search`: A string to search within the documents.
+    - `document_type`:
+        A list of document types to filter the search results.
+    - `publisher`: A list of publishers to filter the search results.
+    - `sort`: A field name to sort the search results.
+
+    The function constructs a `SearchDocumentConfig` object using the
+    received query parameters and performs a search using this
+    configuration. `DataResponseModel` objects from the search results
+    are retrieved and compiled into a list of dictionaries, which is
+    then converted into a DataFrame for demonstration purposes.
+    Finally, the DataFrame is written into a CSV file and returned as
+    an HTTP response with the appropriate content type and file
+    attachment headers.
+    """
     search_query = request.GET.get("search", "")
     document_types = request.GET.getlist("document_type", "")
     publishers = request.GET.getlist("publisher", None)
@@ -95,7 +118,8 @@

 @require_http_methods(["GET"])
 def search_django(request: HttpRequest):
-    """Search view.
+    """
+    Search view.

     Renders the Django based search page.
     """
@@ -109,7 +133,8 @@

 @require_http_methods(["GET"])
 def search_react(request: HttpRequest) -> HttpResponse:
-    """Search view.
+    """
+    Search view.

     Renders the React based search page.
     """

From 123f4e4d5fe610ecf4037af30b62ce074573fc32 Mon Sep 17 00:00:00 2001
From: Haresh Kainth
Date: Mon, 25 Nov 2024 22:41:19 +0000
Subject: [PATCH 2/2] update: logging levels from info to debug

Changed all logger.info statements to logger.debug for less verbose
logging in non-critical areas. This reduces log clutter and focuses
info-level logging on more significant events.
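
For example, a per-request trace in utils/search.py that previously
logged at info level now reads:

    logger.debug("received search request: %s", request)

Deployments that still need these traces can re-enable them by setting
the relevant logger (for example "orp_search") to DEBUG in Django's
LOGGING settings.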
---
 orp/orp_search/utils/documents.py | 12 ++++++------
 orp/orp_search/utils/paginate.py  |  9 +++++----
 orp/orp_search/utils/search.py    | 16 ++++++++--------
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/orp/orp_search/utils/documents.py b/orp/orp_search/utils/documents.py
index 18e47ff..c76eb8d 100644
--- a/orp/orp_search/utils/documents.py
+++ b/orp/orp_search/utils/documents.py
@@ -9,10 +9,10 @@


 def clear_all_documents():
-    logger.info("clearing all documents from table...")
+    logger.debug("clearing all documents from table...")
     try:
         DataResponseModel.objects.all().delete()
-        logger.info("documents cleared from table")
+        logger.debug("documents cleared from table")
     except Exception as e:
         logger.error(f"error clearing documents: {e}")
         throw_error(f"error clearing documents: {e}")
@@ -20,16 +20,15 @@

 def insert_or_update_document(document_json):
     try:
-        logger.info("creating document...")
+        logger.debug("creating document...")
         logger.debug(f"document: {document_json}")
-        # Try to create a new document
         document = DataResponseModel(**document_json)
         document.full_clean()
         document.save()
     except Exception as e:
         logger.error(f"error creating document: {document_json}")
         logger.error(f"error: {e}")
-        logger.info("document already exists, updating...")
+        logger.debug("document already exists, updating...")

         # If a duplicate key error occurs, update the existing document
         try:
@@ -37,7 +36,7 @@
             for key, value in document_json.items():
                 setattr(document, key, value)
             document.save()
-            logger.info(f"document updated: {document}")
+            logger.debug(f"document updated: {document}")
         except Exception as e:
             logger.error(f"error updating document: {document_json}")
             logger.error(f"error: {e}")
@@ -93,6 +92,7 @@
         str: A URL-safe base64 encoded UUID truncated to 22 characters.
""" uid = uuid.uuid4() + # Encode it to base64 uid_b64 = base64.urlsafe_b64encode(uid.bytes).rstrip(b"=").decode("ascii") return uid_b64[ diff --git a/orp/orp_search/utils/paginate.py b/orp/orp_search/utils/paginate.py index d45ab91..55031d6 100644 --- a/orp/orp_search/utils/paginate.py +++ b/orp/orp_search/utils/paginate.py @@ -60,7 +60,7 @@ def paginate( """ start_time = time.time() - logger.info("paginating documents...") + logger.debug("paginating documents...") paginator = Paginator(results, config.limit) try: paginated_documents = paginator.page(config.offset) @@ -70,7 +70,7 @@ def paginate( paginated_documents = paginator.page(paginator.num_pages) end_time = time.time() - logger.info( + logger.debug( f"time taken to paginate (before description +/ regulatory topics):" f" {round(end_time - start_time, 2)} seconds" ) @@ -88,7 +88,7 @@ def paginate( ).split("\n") end_time = time.time() - logger.info( + logger.debug( f"time taken to paginate " f"(after description +/ regulatory topics): " f"{round(end_time - start_time, 2)} seconds" @@ -121,7 +121,8 @@ def paginate( context["start_index"] = paginated_documents.start_index() context["end_index"] = paginated_documents.end_index() end_time = time.time() - logger.info( + + logger.debug( f"time taken to paginate (after adding to context): " f"{round(end_time - start_time, 2)} seconds" ) diff --git a/orp/orp_search/utils/search.py b/orp/orp_search/utils/search.py index 9db61d6..4013c85 100644 --- a/orp/orp_search/utils/search.py +++ b/orp/orp_search/utils/search.py @@ -72,11 +72,11 @@ def search_database( # Sanatize the query string query_str = sanitize_input(config.search_query) - logger.info(f"sanitized search query: {query_str}") + logger.debug(f"sanitized search query: {query_str}") # Generate query object query_objs = _create_search_query(query_str) - logger.info(f"search query objects: {query_objs}") + logger.debug(f"search query objects: {query_objs}") # Search across specific fields vector = SearchVector("title", "description", "regulatory_topics") @@ -142,7 +142,7 @@ def search_database( def search(context: dict, request: HttpRequest) -> dict: - logger.info("received search request: %s", request) + logger.debug("received search request: %s", request) start_time = time.time() search_query = request.GET.get("query", "") @@ -175,13 +175,13 @@ def search(context: dict, request: HttpRequest) -> dict: context = paginate(context, config, results) pag_end_time = time.time() - logger.info( + logger.debug( f"time taken to paginate (called from views.py): " f"{round(pag_end_time - pag_start_time, 2)} seconds" ) end_time = time.time() - logger.info( + logger.debug( f"time taken to search and produce response: " f"{round(end_time - start_time, 2)} seconds" ) @@ -195,7 +195,7 @@ class Trim(Func): def get_publisher_names(): - logger.info("getting publisher names...") + logger.debug("getting publisher names...") publishers_list = [] try: @@ -213,7 +213,7 @@ def get_publisher_names(): except Exception as e: logger.error(f"error getting publisher names: {e}") - logger.info("returning empty list of publishers") + logger.debug("returning empty list of publishers") - logger.info(f"publishers found: {publishers_list}") + logger.debug(f"publishers found: {publishers_list}") return publishers_list