From 0674624d21e6fe314bfaab6cd87e0f7cf16d764f Mon Sep 17 00:00:00 2001 From: Haresh Kainth Date: Wed, 30 Oct 2024 20:58:03 +0000 Subject: [PATCH 1/2] feat+refactor:add search term parsing and refactor utility functions Integrated a search term parsing function into the configuration code. Refactored pagination, date parsing, and score calculation into separate utility modules to enhance code maintainability and readability. --- orp/orp_search/config.py | 7 +++ orp/orp_search/utils/__init__.py | 0 orp/orp_search/utils/paginate.py | 44 ++++++++++++++ orp/orp_search/utils/results.py | 39 ++++++++++++ orp/orp_search/utils/terms.py | 100 +++++++++++++++++++++++++++++++ orp/orp_search/views.py | 87 ++------------------------- 6 files changed, 196 insertions(+), 81 deletions(-) create mode 100644 orp/orp_search/utils/__init__.py create mode 100644 orp/orp_search/utils/paginate.py create mode 100644 orp/orp_search/utils/results.py create mode 100644 orp/orp_search/utils/terms.py diff --git a/orp/orp_search/config.py b/orp/orp_search/config.py index b8bef40..96a6694 100644 --- a/orp/orp_search/config.py +++ b/orp/orp_search/config.py @@ -1,5 +1,7 @@ import logging +from orp_search.utils.terms import parse_search_terms + logger = logging.getLogger(__name__) @@ -35,6 +37,11 @@ def __init__( self.sort_by = sort_by self.id = id + # Parse search terms + search_terms_and, search_terms_or = parse_search_terms(search_terms) + self.search_terms_and = search_terms_and + self.search_terms_or = search_terms_or + def validate(self): """ diff --git a/orp/orp_search/utils/__init__.py b/orp/orp_search/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/orp/orp_search/utils/paginate.py b/orp/orp_search/utils/paginate.py new file mode 100644 index 0000000..3c21c5f --- /dev/null +++ b/orp/orp_search/utils/paginate.py @@ -0,0 +1,44 @@ +from orp_search.config import SearchDocumentConfig + +from django.core.paginator import EmptyPage, PageNotAnInteger, Paginator + + +def paginate(context, config: SearchDocumentConfig, search_results): + paginator = Paginator(search_results, config.limit) + try: + paginated_documents = paginator.page(config.offset) + except PageNotAnInteger: + paginated_documents = paginator.page(1) + except EmptyPage: + paginated_documents = paginator.page(paginator.num_pages) + + # Iterate over each document in paginated_documents + if paginated_documents: + for paginated_document in paginated_documents: + if "description" in paginated_document: + description = paginated_document["description"] + + # If description is not an empty string + if description: + # Truncate description to 100 characters + paginated_document["description"] = ( + description[:100] + "..." + if len(description) > 100 + else description + ) + if "regulatory_topics" in paginated_document: + paginated_document["regulatory_topics"] = str( + paginated_document["regulatory_topics"] + ).split("\n") + + context["paginator"] = paginator + context["results"] = paginated_documents + context["results_count"] = len(paginated_documents) + context["is_paginated"] = paginator.num_pages > 1 + context["results_total_count"] = paginator.count + context["results_page_total"] = paginator.num_pages + context["current_page"] = config.offset + context["start_index"] = paginated_documents.start_index() + context["end_index"] = paginated_documents.end_index() + + return context diff --git a/orp/orp_search/utils/results.py b/orp/orp_search/utils/results.py new file mode 100644 index 0000000..87d8704 --- /dev/null +++ b/orp/orp_search/utils/results.py @@ -0,0 +1,39 @@ +from datetime import datetime, timezone + +import dateutil.parser # type: ignore + + +def parse_date(date_value): + if isinstance(date_value, datetime): + if date_value.tzinfo is None: + # If the datetime is offset-naive, make it offset-aware in UTC + return date_value.replace(tzinfo=timezone.utc) + return date_value + if isinstance(date_value, str): + try: + dt = dateutil.parser.parse(date_value) + if dt.tzinfo is None: + # If parsed datetime is offset-naive, + # make it offset-aware in UTC + return dt.replace(tzinfo=timezone.utc) + return dt + except ValueError: + return None + return None # Return None for invalid date types + + +def calculate_score(search_result, search_terms): + """ + Calculate the score of a search result based on the number of + search terms found in the title and description. + + :param search_result: A dictionary containing the search result. + :param search_terms: A list of search terms to look for in the + search result. + :return: The score based on the number of search terms found. + """ + title = search_result.get("title", "") or "" + description = search_result.get("description", "") or "" + combined_content = title.lower() + " " + description.lower() + score = sum(combined_content.count(term.lower()) for term in search_terms) + return score diff --git a/orp/orp_search/utils/terms.py b/orp/orp_search/utils/terms.py new file mode 100644 index 0000000..6c632f9 --- /dev/null +++ b/orp/orp_search/utils/terms.py @@ -0,0 +1,100 @@ +import re + + +def sanitize_input(search): + """ + Sanitize the input to remove potential threats like SQL injection + characters. + This function removes or escapes characters that are commonly used + in SQL injection attacks. + """ + # Define a regular expression pattern to match unwanted characters or + # patterns + # This removes SQL keywords, single quotes, double quotes, semicolons, + # and escape sequences + sanitized_search = re.sub( + r"(--|\b(SELECT|INSERT|DELETE|UPDATE|DROP|ALTER|EXEC|UNION" + r"|CREATE)\b|'|\"|;)", + "", + search, + flags=re.IGNORECASE, + ) + return sanitized_search.strip() + + +def parse_search_terms(search): + # Sanitize input before processing + search = sanitize_input(search) + + # Initialize lists to hold terms + search_terms_and = [] + search_terms_or = [] + + # Check if input only contains "AND", "OR", "+", or whitespace + if re.fullmatch(r"(AND|OR|\+|\s)+", search): + return search_terms_and, search_terms_or + + # Split the search string into tokens based on spaces and keywords + tokens = re.split(r"(\s+|\bAND\b|\bOR\b|\+)", search) + + # Temporary variables for managing terms within quotes + current_and_term = [] + current_or_term = [] + + # Flag to determine if we are inside quotes + in_quotes = False + current_connector = None # Track AND/OR status outside of quotes + + for token in tokens: + token = token.strip() + + if not token: + continue + + # Check if token is the start/end of a quoted phrase + if token.startswith('"') and token.endswith('"'): + # Complete quoted term in one token + quoted_term = token.strip('"') + if current_connector == "AND" or current_connector is None: + search_terms_and.append(quoted_term) + elif current_connector == "OR": + search_terms_or.append(quoted_term) + continue + elif token.startswith('"'): + in_quotes = True + current_and_term = [] + current_or_term = [] + current_and_term.append(token.strip('"')) + continue + elif token.endswith('"'): + if in_quotes: + if current_connector == "AND" or current_connector is None: + current_and_term.append(token.strip('"')) + search_terms_and.append(" ".join(current_and_term)) + elif current_connector == "OR": + current_or_term.append(token.strip('"')) + search_terms_or.append(" ".join(current_or_term)) + in_quotes = False + continue + + # Handle token within quotes + if in_quotes: + if current_connector == "AND" or current_connector is None: + current_and_term.append(token) + elif current_connector == "OR": + current_or_term.append(token) + continue + + # Treat both + and AND as equivalent for "AND" logic + if token.upper() == "AND" or token == "+": # nosec BXXX + current_connector = "AND" + elif token.upper() == "OR": + current_connector = "OR" + else: + # Handle individual terms outside quotes + if current_connector == "AND" or current_connector is None: + search_terms_and.append(token) + elif current_connector == "OR": + search_terms_or.append(token) + + return search_terms_and, search_terms_or diff --git a/orp/orp_search/views.py b/orp/orp_search/views.py index 720272b..486bb12 100644 --- a/orp/orp_search/views.py +++ b/orp/orp_search/views.py @@ -2,16 +2,14 @@ import csv import logging -from datetime import datetime, timezone - -import dateutil.parser # type: ignore import pandas as pd from orp_search.legislation import Legislation from orp_search.public_gateway import PublicGateway, SearchDocumentConfig +from orp_search.utils.paginate import paginate +from orp_search.utils.results import calculate_score, parse_date from django.conf import settings -from django.core.paginator import EmptyPage, PageNotAnInteger, Paginator from django.http import HttpRequest, HttpResponse from django.shortcuts import redirect, render from django.views.decorators.http import require_http_methods @@ -161,42 +159,6 @@ def download_search_csv(request: HttpRequest) -> HttpResponse: return response -def _parse_date(date_value): - if isinstance(date_value, datetime): - if date_value.tzinfo is None: - # If the datetime is offset-naive, make it offset-aware in UTC - return date_value.replace(tzinfo=timezone.utc) - return date_value - if isinstance(date_value, str): - try: - dt = dateutil.parser.parse(date_value) - if dt.tzinfo is None: - # If parsed datetime is offset-naive, - # make it offset-aware in UTC - return dt.replace(tzinfo=timezone.utc) - return dt - except ValueError: - return None - return None # Return None for invalid date types - - -def _calculate_score(search_result, search_terms): - """ - Calculate the score of a search result based on the number of - search terms found in the title and description. - - :param search_result: A dictionary containing the search result. - :param search_terms: A list of search terms to look for in the - search result. - :return: The score based on the number of search terms found. - """ - title = search_result.get("title", "") or "" - description = search_result.get("description", "") or "" - combined_content = title.lower() + " " + description.lower() - score = sum(combined_content.count(term.lower()) for term in search_terms) - return score - - @require_http_methods(["GET"]) def search(request: HttpRequest) -> HttpResponse: """Search view. @@ -253,7 +215,7 @@ def search(request: HttpRequest) -> HttpResponse: # Get the search results from the Data API using PublicGateway class config = SearchDocumentConfig( - str(search_query).lower(), + search_query, document_types, dummy=True, limit=limit, @@ -287,14 +249,14 @@ def search(request: HttpRequest) -> HttpResponse: if sort_by == "recent": search_results = sorted( search_results, - key=lambda x: _parse_date(x["date_modified"]), + key=lambda x: parse_date(x["date_modified"]), reverse=True, ) elif sort_by == "relevance": # Add the 'score' to each search result for result in search_results: logger.info("result to pass to calculate score: %s", result) - result["score"] = _calculate_score(result, config.search_terms) + result["score"] = calculate_score(result, config.search_terms) search_results = sorted( search_results, @@ -302,42 +264,5 @@ def search(request: HttpRequest) -> HttpResponse: reverse=True, ) - # Paginate results - paginator = Paginator(search_results, config.limit) - try: - paginated_documents = paginator.page(config.offset) - except PageNotAnInteger: - paginated_documents = paginator.page(1) - except EmptyPage: - paginated_documents = paginator.page(paginator.num_pages) - - # Iterate over each document in paginated_documents - if paginated_documents: - for paginated_document in paginated_documents: - if "description" in paginated_document: - description = paginated_document["description"] - - # If description is not an empty string - if description: - # Truncate description to 100 characters - paginated_document["description"] = ( - description[:100] + "..." - if len(description) > 100 - else description - ) - if "regulatory_topics" in paginated_document: - paginated_document["regulatory_topics"] = str( - paginated_document["regulatory_topics"] - ).split("\n") - - context["paginator"] = paginator - context["results"] = paginated_documents - context["results_count"] = len(paginated_documents) - context["is_paginated"] = paginator.num_pages > 1 - context["results_total_count"] = paginator.count - context["results_page_total"] = paginator.num_pages - context["current_page"] = config.offset - context["start_index"] = paginated_documents.start_index() - context["end_index"] = paginated_documents.end_index() - + context = paginate(context, config, search_results) return render(request, template_name="orp.html", context=context) From 4dc1722fbe4a9346520df8ab41f61482e448a7d1 Mon Sep 17 00:00:00 2001 From: Haresh Kainth Date: Wed, 30 Oct 2024 21:29:59 +0000 Subject: [PATCH 2/2] chore:enhance search functionality with combined search terms Added a new utility function `combine_search_terms` to merge `AND` and `OR` search terms into a single expression. Updated `SearchDocumentConfig` to use this combined expression and modified `search` method in `legislation.py` to use the new combined search terms. This improves query precision and flexibility. --- orp/orp_search/config.py | 5 ++++- orp/orp_search/legislation.py | 10 ++++++---- orp/orp_search/utils/terms.py | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/orp/orp_search/config.py b/orp/orp_search/config.py index 96a6694..d3c8e91 100644 --- a/orp/orp_search/config.py +++ b/orp/orp_search/config.py @@ -1,6 +1,6 @@ import logging -from orp_search.utils.terms import parse_search_terms +from orp_search.utils.terms import combine_search_terms, parse_search_terms logger = logging.getLogger(__name__) @@ -41,6 +41,9 @@ def __init__( search_terms_and, search_terms_or = parse_search_terms(search_terms) self.search_terms_and = search_terms_and self.search_terms_or = search_terms_or + self.final_search_expression = combine_search_terms( + search_terms_and, search_terms_or + ) def validate(self): """ diff --git a/orp/orp_search/legislation.py b/orp/orp_search/legislation.py index 3f65ce7..50f65f2 100644 --- a/orp/orp_search/legislation.py +++ b/orp/orp_search/legislation.py @@ -21,14 +21,16 @@ def __init__(self): def search(self, config: SearchDocumentConfig): logger.info("searching legislation...") + logger.info( + f"final_search_expression terms: {config.final_search_expression}" + ) + # List of search terms - title_search_terms = config.search_terms - search_terms = ",".join(title_search_terms) headers = {"Accept": "application/atom+xml"} params = { "lang": "en", - "title": search_terms, - "text": search_terms, + "title": config.final_search_expression, + "text": config.final_search_expression, "results-count": 20, } diff --git a/orp/orp_search/utils/terms.py b/orp/orp_search/utils/terms.py index 6c632f9..0866d2f 100644 --- a/orp/orp_search/utils/terms.py +++ b/orp/orp_search/utils/terms.py @@ -98,3 +98,20 @@ def parse_search_terms(search): search_terms_or.append(token) return search_terms_and, search_terms_or + + +def combine_search_terms(search_terms_and, search_terms_or): + # Join terms in `search_terms_and` with " AND " + combined_and = " AND ".join(search_terms_and) if search_terms_and else "" + + # Join terms in `search_terms_or` with " OR " + combined_or = " OR ".join(search_terms_or) if search_terms_or else "" + + # Combine both parts, adding parentheses around each if both are present + if combined_and and combined_or: + combined_query = f"{combined_and} OR {combined_or}" + else: + # Use whichever part is non-empty + combined_query = combined_and or combined_or + + return combined_query