Skip to content

Commit

Permalink
feat: add sorting by relevance
Browse files Browse the repository at this point in the history
Enhanced search functionality by adding a new sorting option based on relevance. The relevance score is calculated using the presence of search terms in the document title and description. Updated the configuration validation, HTML template, and logging to support this new sorting method.
  • Loading branch information
hareshkainthdbt committed Oct 14, 2024
1 parent 565675f commit 77801a6
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 5 deletions.
5 changes: 5 additions & 0 deletions orp/orp_search/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,9 @@ def validate(self):
if self.limit < 0:
logger.error("limit must be a positive integer")
return False

if self.sort_by:
if self.sort_by not in ["recently", "relevance"]:
logger.error("sort_by must be 'recently' or 'relevance'")
return False
return True
37 changes: 33 additions & 4 deletions orp/orp_search/public_gateway.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging

import pandas as pd
import requests # type: ignore

from jinja2 import Template
Expand Down Expand Up @@ -130,17 +131,45 @@ def search(self, config: SearchDocumentConfig):

sorted_df = None

if config.sort_by == "recently_updated":
if config.sort_by == "recently":
# Sort the DataFrame by 'date_modified' in descending order
sorted_df = filtered_df.sort_values(
by="date_modified", ascending=False
# Ensure 'date_issued' is in datetime format
filtered_df["date_issued"] = pd.to_datetime(
filtered_df["date_issued"], format="%d/%m/%Y"
)

elif config.sort_by == "recently_published":
# Sort the DataFrame by 'date_issued' in descending order
sorted_df = filtered_df.sort_values(
by="date_issued", ascending=False
)
elif config.sort_by == "relevance":
# Calculate relevance score
# (based on the number of keywords found)
def calculate_relevance(row, search_terms):
    """Return the relevance score of one result row.

    The score is the number of search terms found in the row's title
    plus the number found in its description. Matching ignores case
    and spaces: both the text and each term are lower-cased and have
    their spaces removed before the substring test.

    Args:
        row: Mapping-like object indexable by "title" and
            "description" (a DataFrame row when used with
            ``DataFrame.apply(..., axis=1)``).
        search_terms: Iterable of term strings; ``None`` or an empty
            iterable yields a score of 0.

    Returns:
        int: title hits + description hits.
    """
    # Normalise the terms once instead of once per text being scored.
    normalised_terms = [
        term.replace(" ", "").lower() for term in (search_terms or [])
    ]

    def score_text(text, terms):
        # A missing field (e.g. None or NaN) is not a string and has
        # nothing to match against; score it 0 rather than raising.
        if not isinstance(text, str):
            return 0
        text_processed = text.replace(" ", "").lower()
        return sum(1 for term in terms if term in text_processed)

    title_score = score_text(row["title"], normalised_terms)
    description_score = score_text(row["description"], normalised_terms)
    return title_score + description_score

filtered_df["relevance_score"] = filtered_df.apply(
calculate_relevance,
axis=1,
search_terms=config.search_terms,
)

# Sort the DataFrame by 'relevance_score' in descending order
sorted_df = filtered_df.sort_values(
by="relevance_score", ascending=False
)

if sorted_df is not None:
results = sorted_df.to_dict(orient="records")
Expand Down
2 changes: 1 addition & 1 deletion orp/orp_search/templates/orp.html
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ <h2 class="govuk-fieldset__heading">
</label>
<select class="govuk-select" id="sort" name="sort">
<option value="date">Recently updated</option>
<option value="sort">???</option>
<option value="sort">Relevance</option>
</select>
</div>
</div>
Expand Down
7 changes: 7 additions & 0 deletions orp/orp_search/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ def search(request: HttpRequest) -> HttpResponse:
else:
logger.info("search query: %s", search_query)

sort_by = request.GET.get("sort", None)
if sort_by:
logger.info("sort by: %s", sort_by)

logger.info("document types: %s", document_types)
logger.info("page: %s", offset)

Expand All @@ -110,6 +114,9 @@ def search(request: HttpRequest) -> HttpResponse:
if publisher:
config.publisher_terms = publisher

if sort_by:
config.sort_by = sort_by

# Check if the response is cached
public_gateway = PublicGateway()
search_results = public_gateway.search(config)
Expand Down

0 comments on commit 77801a6

Please sign in to comment.