
Commit

Merge branch 'main' of github.com:uktrade/orp into spike/add-react-frontend
gdbarnes committed Oct 30, 2024
2 parents 5abac81 + 159b199 commit c1a0c23
Showing 2 changed files with 62 additions and 21 deletions.
9 changes: 6 additions & 3 deletions orp/orp_search/legislation.py
@@ -29,7 +29,7 @@ def search(self, config: SearchDocumentConfig):
             "lang": "en",
             "title": search_terms,
             "text": search_terms,
-            "results-count": 100,
+            "results-count": 20,
         }
 
         # Register namespaces
@@ -137,12 +137,15 @@ def _extract_entries(root):
         all_entries += _extract_entries(root)
 
         morePages = int(page_data["morePages"])
-        logger.info(f"legislation more pages: {morePages}")
         if morePages > 1:
+            logger.info(f"legislation more pages: {morePages}")
+
             # Get remaining pages
             for page in range(2, morePages + 1):
                 params["page"] = page
                 root, _ = _do_request()
-                all_entries.append(_extract_entries(root))
+                results = _extract_entries(root)
+                all_entries += results
 
+        logger.info(f"legislation total results: {len(all_entries)}")
         return all_entries
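
Note on the pagination fix above: _extract_entries returns a list of entries, so the old all_entries.append(...) nested each page's results as a single list element, while the new += extends all_entries with the individual entries. A minimal sketch of the difference, using hypothetical entries:

    entries = [{"id": 1}]
    page = [{"id": 2}, {"id": 3}]  # the kind of list _extract_entries might return

    entries.append(page)   # [{'id': 1}, [{'id': 2}, {'id': 3}]] -- nested list
    entries = [{"id": 1}]
    entries += page        # [{'id': 1}, {'id': 2}, {'id': 3}]   -- flat list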
74 changes: 56 additions & 18 deletions orp/orp_search/views.py
@@ -2,6 +2,9 @@
 import csv
 import logging
 
+from datetime import datetime, timezone
+
+import dateutil.parser  # type: ignore
 import pandas as pd
 
 from orp_search.legislation import Legislation
@@ -158,6 +161,42 @@ def download_search_csv(request: HttpRequest) -> HttpResponse:
     return response
 
 
+def _parse_date(date_value):
+    if isinstance(date_value, datetime):
+        if date_value.tzinfo is None:
+            # If the datetime is offset-naive, make it offset-aware in UTC
+            return date_value.replace(tzinfo=timezone.utc)
+        return date_value
+    if isinstance(date_value, str):
+        try:
+            dt = dateutil.parser.parse(date_value)
+            if dt.tzinfo is None:
+                # If parsed datetime is offset-naive,
+                # make it offset-aware in UTC
+                return dt.replace(tzinfo=timezone.utc)
+            return dt
+        except ValueError:
+            return None
+    return None  # Return None for invalid date types
+
+
+def _calculate_score(search_result, search_terms):
+    """
+    Calculate the score of a search result based on the number of
+    search terms found in the title and description.
+
+    :param search_result: A dictionary containing the search result.
+    :param search_terms: A list of search terms to look for in the
+        search result.
+    :return: The score based on the number of search terms found.
+    """
+    title = search_result.get("title", "") or ""
+    description = search_result.get("description", "") or ""
+    combined_content = title.lower() + " " + description.lower()
+    score = sum(combined_content.count(term.lower()) for term in search_terms)
+    return score
+
+
 @require_http_methods(["GET"])
 def search(request: HttpRequest) -> HttpResponse:
     """Search view.
@@ -238,31 +277,30 @@ def search(request: HttpRequest) -> HttpResponse:
     search_results = public_gateway.search(config)
 
     # Legislation search
+    # If config.search_terms is empty then we don't need to
+    # search for legislation
+    if not config.search_terms or "" in config.search_terms:
+        logger.info("no search terms provided")
+        return render(request, template_name="orp.html", context=context)
+
     if not config.document_types or "legislation" in config.document_types:
+        logger.info("searching for legislation: %s", config.search_terms)
         legislation = Legislation()
         search_results += legislation.search(config)
 
-    search_results_normalised = []
-
-    for result in search_results:
-        # If result is type of [] then extract each item and append to
-        # search_dict_results otherwise just append the result to
-        # search_dict_results
-        if isinstance(result, list):
-            for item in result:
-                search_results_normalised.append(item)
-        else:
-            search_results_normalised.append(result)
-
+    # Sort results by date_modified (recent) or relevance
+    # (calculate score and sort by score)
+    if sort_by == "recent":
+        search_results = sorted(
+            search_results,
+            key=lambda x: _parse_date(x["date_modified"]),
+            reverse=True,
+        )
+    elif sort_by == "relevance":
+        # Add the 'score' to each search result
+        for result in search_results:
+            logger.info("result to pass to calculate score: %s", result)
+            result["score"] = _calculate_score(result, config.search_terms)
+
+        search_results = sorted(
+            search_results,
+            key=lambda x: x["score"],
+            reverse=True,
+        )
 
     # Paginate results
     paginator = Paginator(search_results, config.limit)
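
One caveat on the "recent" sort above: _parse_date can return None, and sorted() raises TypeError when a None key is compared with a datetime; x["date_modified"] also raises KeyError if the field is missing. A defensive variant of the key, a sketch rather than part of this commit, could fall back to the epoch:

    from datetime import datetime, timezone

    EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

    search_results = sorted(
        search_results,
        key=lambda x: _parse_date(x.get("date_modified")) or EPOCH,
        reverse=True,
    )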
