diff --git a/orp/orp_search/apps.py b/orp/orp_search/apps.py index d8e653f..728cd76 100644 --- a/orp/orp_search/apps.py +++ b/orp/orp_search/apps.py @@ -2,6 +2,17 @@ class SearchConfig(AppConfig): + """ + Configuration class for the ORP Search application. + + Attributes: + name (str): The full Python path to the application. + verbose_name (str): A human-readable name for the application. + default_auto_field (str): Specifies the type of auto-created + primary key field to use. + + """ + name = "orp_search" verbose_name = "ORP application functionality" default_auto_field = "django.db.models.BigAutoField" diff --git a/orp/orp_search/config.py b/orp/orp_search/config.py index 1c03ec1..f08849a 100644 --- a/orp/orp_search/config.py +++ b/orp/orp_search/config.py @@ -16,13 +16,41 @@ def __init__( id=None, ): """ - Initializes a new instance of the class. + Initializes the SearchRequest object with the given parameters. - :param searchTerms: A comma-separated string of search terms. - :param documentTypes: Optional. A list of document types - to filter the search. - :param timeout: Optional. The timeout in seconds for the search - request. + Args: + search_query (str): The search query string. + document_types (Optional[List[str]]): + A list of document types to filter by. Defaults to None. + timeout (Optional[int]): + The timeout value for the request in seconds. Defaults to None. + limit (int): + The maximum number of search results to return. Defaults to 10. + offset (int): + The starting position of the search results. Defaults to 1. + publisher_names (Optional[List[str]]): + A list of publisher names to filter by. Defaults to None. + sort_by (Optional[str]): + The field by which to sort the search results. Defaults to + None. + id (Optional[str]): + An optional identifier for the search request. Defaults to + None. + + Attributes: + search_query (str): The search query string. + document_types (Optional[List[str]]): + A list of document types to filter by. 
+ timeout (Optional[int]): + The timeout value for the request in seconds. + limit (int): The maximum number of search results to return. + offset (int): The starting position of the search results. + publisher_names (Optional[List[str]]): + A list of publisher names to filter by. + sort_by (Optional[str]): + The field by which to sort the search results. + id (Optional[str]): + An optional identifier for the search request. """ self.search_query = search_query self.document_types = ( @@ -46,17 +74,20 @@ def __init__( def validate(self): """ + Validates the constraints defined for offset, limit, + and sort_by attributes. - Validates the presence of search terms. + Returns: + bool + True if all constraints are satisfied, False otherwise. - Checks if the 'searchTerms' attribute exists and is non-empty. Logs - an error message and returns False if 'searchTerms' is missing or - empty. + Notes: + - The offset must be a non-negative integer. + - The limit must be a non-negative integer. + - The sort_by attribute, if specified, must be either + 'recent' or 'relevance'. - Returns - ------- - bool - True if 'searchTerms' is present and non-empty, False otherwise. + Errors are logged if any of the constraints are violated. """ if self.offset < 0: logger.error("offset must be a positive integer") @@ -73,6 +104,21 @@ def validate(self): return True def print_to_log(self): + """ + + Logs the current state of various search parameters. + + Logs the following attributes: + - search_query: The search query string. + - document_types: The list of document types being searched. + - timeout: The timeout value for the search query. + - limit: The maximum number of results to return. + - offset: The starting point from which results are returned. + - publisher_names: The list of publisher names to filter the search. + - sort_by: The criteria for sorting the search results. + - id: The unique identifier for the search query. 
+ + """ logger.info(f"search_query: {self.search_query}") logger.info(f"document_types: {self.document_types}") logger.info(f"timeout: {self.timeout}") diff --git a/orp/orp_search/construction_legislation.py b/orp/orp_search/construction_legislation.py index 6a2ba63..010ac71 100644 --- a/orp/orp_search/construction_legislation.py +++ b/orp/orp_search/construction_legislation.py @@ -1936,6 +1936,12 @@ def construction_legislation_dataframe(): + """ + Reads CSV data from a predefined string, converts the data into a Pandas DataFrame, and returns the resulting DataFrame. + + Returns: + pandas.DataFrame: The dataframe containing the CSV data. + """ # Use StringIO to simulate reading from a file csv_data = StringIO(_csv_text) diff --git a/orp/orp_search/legislation.py b/orp/orp_search/legislation.py index 789d403..b0a930a 100644 --- a/orp/orp_search/legislation.py +++ b/orp/orp_search/legislation.py @@ -1,4 +1,3 @@ -import base64 import logging import re import xml.etree.ElementTree as ET # nosec BXXX @@ -20,12 +19,23 @@ logger = logging.getLogger(__name__) -def _encode_url(url): - encoded_bytes = base64.urlsafe_b64encode(url.encode("utf-8")) - return encoded_bytes.decode("utf-8") - - def _get_url_data(config, url): + """ + Fetch data from a given URL and return the response text if successful, + otherwise log the error. + + Parameters: + - config: Configuration object that includes the request timeout. + - url: String representing the URL to request. + + Returns: + - Response text if the status code is 200. + - None if the response status code is not 200, or if there is an exception + during the request. + + Logs: + - Error messages for request failures and non-200 response codes. 
+ """ try: response = requests.get(url, timeout=config.timeout) # nosec BXXX if response.status_code == 200: @@ -43,11 +53,36 @@ def _get_url_data(config, url): def _get_text_from_element(element: Optional[ET.Element]) -> Optional[str]: + """ + Extracts and returns the text content from an XML element if it exists. + + This function checks if the provided XML element is not None. + If the element is available, it returns the text content of that element. + If the element is None, it returns None. + + Parameters: + element (Optional[ET.Element]): + The XML element from which to extract the text. + + Returns: + Optional[str]: + The text content of the element if it exists, otherwise None. + """ return element.text if element is not None else None class Legislation: def __init__(self): + """ + Initializes the class instance and defines the XML namespaces. + + Attributes: + _namespaces (dict): + A dictionary containing XML namespaces with their + corresponding URLs. These namespaces are used to + refer to elements in XML documents adhering to + different XML schemas. + """ # Define the XML namespaces self._namespaces = { "leg": "http://www.legislation.gov.uk/namespaces/legislation", @@ -58,6 +93,31 @@ def __init__(self): } def build_cache(self, config: SearchDocumentConfig): + """ + Builds a cache of legislation documents by retrieving XML data from + URLs specified in a DataFrame. + + Parameters: + config (SearchDocumentConfig): Configuration object for searching + documents. + + Raises: + Exception: If there's an error fetching data from the URL or no data + is returned. + + Functionality: + 1. Logs the start of the caching process. + 2. Loads legislation data into a DataFrame. + 3. Iterates over each row in the DataFrame to fetch XML data from + specified URLs. + 4. Extracts and parses XML data, logging relevant informational + and error messages. + 5. Extracts specific fields (identifier, title, description, etc.) + from the parsed XML data. + 6. 
Converts the extracted data to JSON format. + 7. Inserts or updates the document in the cache. + 8. Logs errors and re-raises them if data retrieval fails. + """ logger.info("building legislation cache...") dataset = construction_legislation_dataframe() @@ -138,6 +198,22 @@ def _to_json( title, valid, ): + """ + Converts given parameters into a JSON-like dictionary format. + + Arguments: + description (str): Description of the item. + format (str): Format of the item. + identifier (str): Unique identifier for the item. + language (str): Language in which the item is available. + modified (str): The date when the item was last modified. + publisher (str): The publisher of the item. + title (str): The title of the item. + valid (str): The date until which the item is considered valid. + + Returns: + dict: A dictionary containing the item details in a structured format. + """ return { "id": generate_short_uuid(), "title": title, diff --git a/orp/orp_search/models.py b/orp/orp_search/models.py index 46b7551..11be85f 100644 --- a/orp/orp_search/models.py +++ b/orp/orp_search/models.py @@ -1,13 +1,57 @@ import logging -from django.core.exceptions import ValidationError -from django.core.validators import URLValidator from django.db import models logger = logging.getLogger(__name__) class DataResponseModel(models.Model): + """ + DataResponseModel + + A Django model representing various metadata fields related to data + responses. + + Attributes: + title: Title of the data response. + identifier: Unique identifier for the data response. + publisher: Entity that published the data response. + publisher_id: Unique ID of the publisher. + language: Language in which the data response is published. + format: Format of the data response. + description: Brief description of the data response. + date_issued: Date when the data response was issued. + date_modified: Date when the data response was last modified. + date_valid: Validity date of the data response as text. 
+ sort_date: Date used for sorting the data responses. + audience: Intended audience for the data response. + coverage: Coverage details of the data response. + subject: Subject matter of the data response. + type: Type of the data response. + license: Licensing information of the data response. + regulatory_topics: Topics covered by the data response. + status: Current status of the data response. + date_uploaded_to_orp: Date when the data response was uploaded to ORP. + has_format: Format details that the data response has. + is_format_of: + Indicates if the data response is a format of another resource. + has_version: Version details that the data response has. + is_version_of: + Indicates if the data response is a version of another resource. + references: References cited in the data response. + is_referenced_by: + Indicates if the data response is referenced by another resource. + has_part: Part details that the data response has. + is_part_of: + Indicates if the data response is a part of another resource. + is_replaced_by: + Indicates if the data response is replaced by another resource. + replaces: Indicates if the data response replaces another resource. + related_legislation: Related legislation details for the data response. + id: Primary key of the data response. + score: Score associated with the data response, default is 0. + """ + title = models.TextField(null=True, blank=True) identifier = models.TextField(null=True, blank=True) publisher = models.TextField(null=True, blank=True) @@ -40,18 +84,3 @@ class DataResponseModel(models.Model): related_legislation = models.TextField(null=True, blank=True) id = models.TextField(primary_key=True) score = models.IntegerField(null=True, blank=True, default=0) - - def __str__(self): - return self.title - - def clean(self): - """ - Validate the id field to check if it's a URL or not. 
- """ - url_validator = URLValidator() - try: - url_validator(self.id) - except ValidationError: - # It's not a URL, which is acceptable as it's a - # CharField that supports both - pass diff --git a/orp/orp_search/utils/documents.py b/orp/orp_search/utils/documents.py index b3d7dcc..c76eb8d 100644 --- a/orp/orp_search/utils/documents.py +++ b/orp/orp_search/utils/documents.py @@ -9,10 +9,10 @@ def clear_all_documents(): - logger.info("clearing all documents from table...") + logger.debug("clearing all documents from table...") try: DataResponseModel.objects.all().delete() - logger.info("documents cleared from table") + logger.debug("documents cleared from table") except Exception as e: logger.error(f"error clearing documents: {e}") throw_error(f"error clearing documents: {e}") @@ -20,16 +20,15 @@ def clear_all_documents(): def insert_or_update_document(document_json): try: - logger.info("creating document...") + logger.debug("creating document...") logger.debug(f"document: {document_json}") - # Try to create a new document document = DataResponseModel(**document_json) document.full_clean() document.save() except Exception as e: logger.error(f"error creating document: {document_json}") logger.error(f"error: {e}") - logger.info("document already exists, updating...") + logger.debug("document already exists, updating...") # If a duplicate key error occurs, update the existing document try: @@ -37,7 +36,7 @@ def insert_or_update_document(document_json): for key, value in document_json.items(): setattr(document, key, value) document.save() - logger.info(f"document updated: {document}") + logger.debug(f"document updated: {document}") except Exception as e: logger.error(f"error updating document: {document_json}") logger.error(f"error: {e}") @@ -86,8 +85,14 @@ def _extract_terms(search_query): def generate_short_uuid(): - # Generate a UUID + """ + Generates a short, URL-safe UUID. + + Returns: + str: A URL-safe base64 encoded UUID truncated to 22 characters. 
+ """ uid = uuid.uuid4() + # Encode it to base64 uid_b64 = base64.urlsafe_b64encode(uid.bytes).rstrip(b"=").decode("ascii") return uid_b64[ diff --git a/orp/orp_search/utils/paginate.py b/orp/orp_search/utils/paginate.py index 310bf96..55031d6 100644 --- a/orp/orp_search/utils/paginate.py +++ b/orp/orp_search/utils/paginate.py @@ -12,9 +12,55 @@ def paginate( context: dict, config: SearchDocumentConfig, results: QuerySet ) -> dict: + """ + Paginates the given query set and updates the context with + pagination details. + + Parameters: + - context (dict): + The context dictionary to be updated with pagination details. + - config (SearchDocumentConfig): + Configuration object containing limit and offset for pagination. + - results (QuerySet): The query set of documents to be paginated. + + Returns: + - dict: + The updated context dictionary containing pagination information and + paginated documents. + + Logs the time taken for the pagination process in different stages: + 1. Time taken to paginate the documents. + 2. Time taken to process regulatory topics for each document. + 3. Time taken to update the context with pagination details. + + Handles pagination exceptions: + - If the page is not an integer, defaults to the first page. + - If the page is empty, defaults to the last page. + + Converts the paginated documents into a list of JSON objects with keys: + - "id" + - "title" + - "publisher" + - "description" + - "type" + - "date_modified" + - "date_valid" + - "regulatory_topics" + + Updates the context with: + - Paginator object. + - Paginated documents in JSON format. + - Total number of results in the current page. + - Boolean to indicate if pagination is needed. + - Total number of results. + - Total number of pages. + - Current page number. + - Start index of the results in the current page. + - End index of the results in the current page. 
+ """ start_time = time.time() - logger.info("paginating documents...") + logger.debug("paginating documents...") paginator = Paginator(results, config.limit) try: paginated_documents = paginator.page(config.offset) @@ -24,7 +70,7 @@ def paginate( paginated_documents = paginator.page(paginator.num_pages) end_time = time.time() - logger.info( + logger.debug( f"time taken to paginate (before description +/ regulatory topics):" f" {round(end_time - start_time, 2)} seconds" ) @@ -42,7 +88,7 @@ def paginate( ).split("\n") end_time = time.time() - logger.info( + logger.debug( f"time taken to paginate " f"(after description +/ regulatory topics): " f"{round(end_time - start_time, 2)} seconds" @@ -75,7 +121,8 @@ def paginate( context["start_index"] = paginated_documents.start_index() context["end_index"] = paginated_documents.end_index() end_time = time.time() - logger.info( + + logger.debug( f"time taken to paginate (after adding to context): " f"{round(end_time - start_time, 2)} seconds" ) diff --git a/orp/orp_search/utils/search.py b/orp/orp_search/utils/search.py index 9db61d6..4013c85 100644 --- a/orp/orp_search/utils/search.py +++ b/orp/orp_search/utils/search.py @@ -72,11 +72,11 @@ def search_database( # Sanatize the query string query_str = sanitize_input(config.search_query) - logger.info(f"sanitized search query: {query_str}") + logger.debug(f"sanitized search query: {query_str}") # Generate query object query_objs = _create_search_query(query_str) - logger.info(f"search query objects: {query_objs}") + logger.debug(f"search query objects: {query_objs}") # Search across specific fields vector = SearchVector("title", "description", "regulatory_topics") @@ -142,7 +142,7 @@ def search_database( def search(context: dict, request: HttpRequest) -> dict: - logger.info("received search request: %s", request) + logger.debug("received search request: %s", request) start_time = time.time() search_query = request.GET.get("query", "") @@ -175,13 +175,13 @@ def search(context: 
dict, request: HttpRequest) -> dict: context = paginate(context, config, results) pag_end_time = time.time() - logger.info( + logger.debug( f"time taken to paginate (called from views.py): " f"{round(pag_end_time - pag_start_time, 2)} seconds" ) end_time = time.time() - logger.info( + logger.debug( f"time taken to search and produce response: " f"{round(end_time - start_time, 2)} seconds" ) @@ -195,7 +195,7 @@ class Trim(Func): def get_publisher_names(): - logger.info("getting publisher names...") + logger.debug("getting publisher names...") publishers_list = [] try: @@ -213,7 +213,7 @@ def get_publisher_names(): except Exception as e: logger.error(f"error getting publisher names: {e}") - logger.info("returning empty list of publishers") + logger.debug("returning empty list of publishers") - logger.info(f"publishers found: {publishers_list}") + logger.debug(f"publishers found: {publishers_list}") return publishers_list diff --git a/orp/orp_search/views.py b/orp/orp_search/views.py index 2ce4507..9e78be6 100644 --- a/orp/orp_search/views.py +++ b/orp/orp_search/views.py @@ -18,7 +18,8 @@ @require_http_methods(["GET"]) def document(request: HttpRequest, id) -> HttpResponse: - """Document details view. + """ + Document details view. Handles the GET request to fetch details based on the provided id. """ @@ -45,6 +46,28 @@ def document(request: HttpRequest, id) -> HttpResponse: @require_http_methods(["GET"]) def download_search_csv(request: HttpRequest) -> HttpResponse: + """ + Handles the download of search results as a CSV file. + + This view function is restricted to the GET HTTP method. + It accepts several query + + parameters to configure the search: + - `search`: A string to search within the documents. + - `document_type`: + A list of document types to filter the search results. + - `publisher`: A list of publishers to filter the search results. + - `sort`: A field name to sort the search results. 
+ + The function constructs a `SearchDocumentConfig` object using the + received query parameters and performs a search using this + configuration. `DataResponseModel` objects from the search results + are retrieved and compiled into a list of dictionaries, which is + then converted into a DataFrame for demonstration purposes. + Finally, the DataFrame is written into a CSV file and returned as + an HTTP response with the appropriate content type and file + attachment headers. + """ search_query = request.GET.get("search", "") document_types = request.GET.getlist("document_type", "") publishers = request.GET.getlist("publisher", None) @@ -95,7 +118,8 @@ def download_search_csv(request: HttpRequest) -> HttpResponse: @require_http_methods(["GET"]) def search_django(request: HttpRequest): - """Search view. + """ + Search view. Renders the Django based search page. """ @@ -109,7 +133,8 @@ def search_django(request: HttpRequest): @require_http_methods(["GET"]) def search_react(request: HttpRequest) -> HttpResponse: - """Search view. + """ + Search view. Renders the React based search page. """