From 6bf5d0b8cd127ac188aa48ec16b03a57e07154f6 Mon Sep 17 00:00:00 2001
From: Haresh Kainth <haresh.kainth@businessandtrade.gov.uk>
Date: Wed, 30 Oct 2024 00:50:56 +0000
Subject: [PATCH 1/3] chore: reduce result count and fix pagination handling

Reduced the legislation API results-count from 100 to 20 for better performance. Fixed pagination handling by setting the page parameter for each remaining page request and by extending the result list with each page's entries instead of appending them as a nested list. Removed the redundant early return for empty search terms in views.py to streamline the search process.
---
 orp/orp_search/legislation.py | 9 ++++++---
 orp/orp_search/views.py       | 6 ------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/orp/orp_search/legislation.py b/orp/orp_search/legislation.py
index 521c880..3f65ce7 100644
--- a/orp/orp_search/legislation.py
+++ b/orp/orp_search/legislation.py
@@ -29,7 +29,7 @@ def search(self, config: SearchDocumentConfig):
             "lang": "en",
             "title": search_terms,
             "text": search_terms,
-            "results-count": 100,
+            "results-count": 20,
         }
 
         # Register namespaces
@@ -137,12 +137,15 @@ def _extract_entries(root):
         all_entries += _extract_entries(root)
 
         morePages = int(page_data["morePages"])
-        logger.info(f"legislation more pages: {morePages}")
         if morePages > 1:
+            logger.info(f"legislation more pages: {morePages}")
+
             # Get remaining pages
             for page in range(2, morePages + 1):
+                params["page"] = page
                 root, _ = _do_request()
-                all_entries.append(_extract_entries(root))
+                results = _extract_entries(root)
+                all_entries += results
 
         logger.info(f"legislation total results: {len(all_entries)}")
         return all_entries
diff --git a/orp/orp_search/views.py b/orp/orp_search/views.py
index 8cc1a62..fc1881b 100644
--- a/orp/orp_search/views.py
+++ b/orp/orp_search/views.py
@@ -238,12 +238,6 @@ def search(request: HttpRequest) -> HttpResponse:
         search_results = public_gateway.search(config)
 
     # Legislation search
-    # If config.search_terms is empty then we don't need to
-    # search for legislation
-    if not config.search_terms or "" in config.search_terms:
-        logger.info("no search terms provided")
-        return render(request, template_name="orp.html", context=context)
-
     if not config.document_types or "legislation" in config.document_types:
         logger.info("searching for legislation: %s", config.search_terms)
         legislation = Legislation()

From b813df82355d00c9f5c8ce17fa3293904a587d18 Mon Sep 17 00:00:00 2001
From: Haresh Kainth <haresh.kainth@businessandtrade.gov.uk>
Date: Wed, 30 Oct 2024 01:10:01 +0000
Subject: [PATCH 2/3] feat: add date parsing utility and improve sorting

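Introduced a utility function to parse dates, ensuring that datetime objects and strings are handled properly and that offset-naive values are made offset-aware in UTC. Improved search result sorting by ordering on date_modified (most recent first) when sorting by "recent", ensuring consistent ordering of results. Removed the result-flattening loop from the search view.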
---
 orp/orp_search/views.py | 46 ++++++++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/orp/orp_search/views.py b/orp/orp_search/views.py
index fc1881b..62b6366 100644
--- a/orp/orp_search/views.py
+++ b/orp/orp_search/views.py
@@ -2,6 +2,9 @@
 import csv
 import logging
 
+from datetime import datetime, timezone
+
+import dateutil.parser  # type: ignore
 import pandas as pd
 
 from orp_search.legislation import Legislation
@@ -158,6 +161,25 @@ def download_search_csv(request: HttpRequest) -> HttpResponse:
     return response
 
 
+def _parse_date(date_value):
+    if isinstance(date_value, datetime):
+        if date_value.tzinfo is None:
+            # If the datetime is offset-naive, make it offset-aware in UTC
+            return date_value.replace(tzinfo=timezone.utc)
+        return date_value
+    if isinstance(date_value, str):
+        try:
+            dt = dateutil.parser.parse(date_value)
+            if dt.tzinfo is None:
+                # If parsed datetime is offset-naive,
+                # make it offset-aware in UTC
+                return dt.replace(tzinfo=timezone.utc)
+            return dt
+        except ValueError:
+            return None
+    return None  # Return None for invalid date types
+
+
 @require_http_methods(["GET"])
 def search(request: HttpRequest) -> HttpResponse:
     """Search view.
@@ -243,20 +265,20 @@ def search(request: HttpRequest) -> HttpResponse:
         legislation = Legislation()
         search_results += legislation.search(config)
 
-    search_results_normalised = []
-
-    for result in search_results:
-        # If result is type of [] then extract each item and append to
-        # search_dict_results otherwise just append the result to
-        # search_dict_results
-        if isinstance(result, list):
-            for item in result:
-                search_results_normalised.append(item)
-        else:
-            search_results_normalised.append(result)
-
     # Sort results by date_modified (recent) or relevance
     # (calculate score and sort by score)
+    if sort_by == "recent":
+        search_results = sorted(
+            search_results,
+            key=lambda x: _parse_date(x["date_modified"]),
+            reverse=True,
+        )
+    # elif sort_by == "relevance":
+    #     search_results = sorted(
+    #         search_results_normalised,
+    #         key=lambda x: x["score"],
+    #         reverse=True,
+    #     )
 
     # Paginate results
     paginator = Paginator(search_results, config.limit)

From baa578bab31d3f2f1d07d7a79dc21e7f7b761cd8 Mon Sep 17 00:00:00 2001
From: Haresh Kainth <haresh.kainth@businessandtrade.gov.uk>
Date: Wed, 30 Oct 2024 01:20:44 +0000
Subject: [PATCH 3/3] fix: add relevance-based sorting to search results

Introduced a scoring function that calculates relevance by counting occurrences of the search terms in each result's title and description. Implemented logic to sort search results by relevance, highest score first, using the computed scores.
---
 orp/orp_search/views.py | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/orp/orp_search/views.py b/orp/orp_search/views.py
index 62b6366..720272b 100644
--- a/orp/orp_search/views.py
+++ b/orp/orp_search/views.py
@@ -180,6 +180,23 @@ def _parse_date(date_value):
     return None  # Return None for invalid date types
 
 
+def _calculate_score(search_result, search_terms):
+    """
+    Calculate the score of a search result based on the number of
+    search terms found in the title and description.
+
+    :param search_result: A dictionary containing the search result.
+    :param search_terms: A list of search terms to look for in the
+                         search result.
+    :return: The score based on the number of search terms found.
+    """
+    title = search_result.get("title", "") or ""
+    description = search_result.get("description", "") or ""
+    combined_content = title.lower() + " " + description.lower()
+    score = sum(combined_content.count(term.lower()) for term in search_terms)
+    return score
+
+
 @require_http_methods(["GET"])
 def search(request: HttpRequest) -> HttpResponse:
     """Search view.
@@ -273,12 +290,17 @@ def search(request: HttpRequest) -> HttpResponse:
             key=lambda x: _parse_date(x["date_modified"]),
             reverse=True,
         )
-    # elif sort_by == "relevance":
-    #     search_results = sorted(
-    #         search_results_normalised,
-    #         key=lambda x: x["score"],
-    #         reverse=True,
-    #     )
+    elif sort_by == "relevance":
+        # Add the 'score' to each search result
+        for result in search_results:
+            logger.info("result to pass to calculate score: %s", result)
+            result["score"] = _calculate_score(result, config.search_terms)
+
+        search_results = sorted(
+            search_results,
+            key=lambda x: x["score"],
+            reverse=True,
+        )
 
     # Paginate results
     paginator = Paginator(search_results, config.limit)