Merge pull request #48 from uktrade/feature/orpd-54-search-terms
feat(orpd-54): significant backend improvements
hareshkainthdbt authored Nov 20, 2024
2 parents 81ccab7 + e7ec9c5 commit 2a0488d
Showing 8 changed files with 103 additions and 73 deletions.
9 changes: 7 additions & 2 deletions orp/orp_search/config.py
@@ -25,14 +25,19 @@ def __init__(
request.
"""
self.search_query = search_query
self.document_types = document_types
self.document_types = [doc_type.lower() for doc_type in document_types]
self.timeout = None if timeout is None else int(timeout)
self.limit = limit
self.offset = offset
self.publisher_names = publisher_names
self.publisher_names = [
pub_name.lower() for pub_name in publisher_names
]
self.sort_by = sort_by
self.id = id

logger.info(f"document_types from request: {self.document_types}")
logger.info(f"publisher_names from request: {self.publisher_names}")

def validate(self):
"""
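
SearchDocumentConfig now lower-cases both filter lists at construction time, so the case-insensitive lookups further down the pipeline see consistent values. A tiny, self-contained sketch of just that normalisation (the real constructor takes more arguments than shown here):

```python
# Sketch of the normalisation the constructor now applies; values are placeholders.
document_types = ["Guidance", "LEGISLATION"]
publisher_names = ["Example Publisher"]

normalised_types = [doc_type.lower() for doc_type in document_types]
normalised_publishers = [pub_name.lower() for pub_name in publisher_names]

print(normalised_types)       # ['guidance', 'legislation']
print(normalised_publishers)  # ['example publisher']
```
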
10 changes: 7 additions & 3 deletions orp/orp_search/legislation.py
@@ -11,7 +11,10 @@
construction_legislation_dataframe,
)
from orp_search.utils.date import convert_date_string_to_obj
from orp_search.utils.documents import insert_or_update_document
from orp_search.utils.documents import ( # noqa: E501
generate_short_uuid,
insert_or_update_document,
)

logger = logging.getLogger(__name__)

@@ -135,7 +138,7 @@ def _to_json(
valid,
):
return {
"id": _encode_url(identifier),
"id": generate_short_uuid(),
"title": title,
"identifier": identifier,
"publisher": publisher,
@@ -144,7 +147,8 @@
"description": description if description is not None else "",
"date_issued": convert_date_string_to_obj(modified),
"date_modified": convert_date_string_to_obj(modified),
"date_valid": convert_date_string_to_obj(valid),
"date_valid": valid,
"sort_date": convert_date_string_to_obj(valid),
"type": "Legislation",
"score": 0,
}
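
The _to_json change swaps the base64-encoded URL for a generated short UUID as the record id, keeps date_valid as the raw source value, and moves the parsed date into the new sort_date key. An illustrative record shape with placeholder values, showing only the fields visible in the diff (not real data):

```python
# Illustrative only -- placeholder values for the dict _to_json now returns.
example_record = {
    "id": "8rFVQ9kLTHqQ2m1xYzAbCw",  # from generate_short_uuid(), no longer an encoded URL
    "title": "Example legislation title",
    "identifier": "https://example.legislation.gov.uk/id/123",
    "publisher": "Example publisher",
    "description": "",
    "date_issued": None,             # convert_date_string_to_obj(modified)
    "date_modified": None,           # convert_date_string_to_obj(modified)
    "date_valid": "2010-10-01",      # raw value, no longer parsed here
    "sort_date": None,               # convert_date_string_to_obj(valid)
    "type": "Legislation",
    "score": 0,
}
```
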
7 changes: 4 additions & 3 deletions orp/orp_search/migrations/0001_initial.py
@@ -1,4 +1,4 @@
# Generated by Django 4.2.16 on 2024-11-13 22:39
# Generated by Django 4.2.16 on 2024-11-19 17:29

from django.db import migrations, models

@@ -14,14 +14,15 @@ class Migration(migrations.Migration):
name="DataResponseModel",
fields=[
("title", models.TextField(blank=True, null=True)),
("identifier", models.URLField(unique=True)),
("identifier", models.TextField(blank=True, null=True)),
("publisher", models.TextField(blank=True, null=True)),
("language", models.TextField(blank=True, null=True)),
("format", models.TextField(blank=True, null=True)),
("description", models.TextField(blank=True, null=True)),
("date_issued", models.DateField(blank=True, null=True)),
("date_modified", models.DateField(blank=True, null=True)),
("date_valid", models.DateField(blank=True, null=True)),
("date_valid", models.TextField(blank=True, null=True)),
("sort_date", models.DateField(blank=True, null=True)),
("audience", models.TextField(blank=True, null=True)),
("coverage", models.TextField(blank=True, null=True)),
("subject", models.TextField(blank=True, null=True)),
3 changes: 2 additions & 1 deletion orp/orp_search/models.py
@@ -16,7 +16,8 @@ class DataResponseModel(models.Model):
description = models.TextField(null=True, blank=True)
date_issued = models.DateField(null=True, blank=True)
date_modified = models.DateField(null=True, blank=True)
date_valid = models.DateField(null=True, blank=True)
date_valid = models.TextField(null=True, blank=True)
sort_date = models.DateField(null=True, blank=True)
audience = models.TextField(null=True, blank=True)
coverage = models.TextField(null=True, blank=True)
subject = models.TextField(null=True, blank=True)
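
With date_valid relaxed to free text, chronological ordering moves to the new sort_date DateField. A minimal usage sketch, assuming a configured Django project with this app installed and the migration above applied:

```python
# Minimal sketch -- assumes Django settings are configured and migrations applied.
from orp_search.models import DataResponseModel

# Order by the new DateField; date_valid itself may now hold arbitrary text.
recent_first = DataResponseModel.objects.order_by("-sort_date")
for doc in recent_first[:5]:
    print(doc.sort_date, repr(doc.date_valid), doc.title)
```
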
7 changes: 5 additions & 2 deletions orp/orp_search/public_gateway.py
@@ -4,7 +4,10 @@
import requests # type: ignore

from orp_search.utils.date import convert_date_string_to_obj
from orp_search.utils.documents import insert_or_update_document
from orp_search.utils.documents import ( # noqa: E501
generate_short_uuid,
insert_or_update_document,
)

logger = logging.getLogger(__name__)

@@ -96,7 +99,7 @@ def build_cache(self, config):
row["date_valid"] = convert_date_string_to_obj(
row.get("date_valid")
)
row["id"] = row.get("identifier")
row["id"] = (generate_short_uuid(),)

insert_or_update_document(row)
inserted_document_count += 1
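
One detail worth noting in build_cache: the trailing comma in row["id"] = (generate_short_uuid(),) makes the cached id a one-element tuple, whereas legislation.py stores the bare string. A tiny sketch of the difference (the literal stands in for the generated id):

```python
value = "8rFVQ9kLTHqQ2m1xYzAbCw"  # stand-in for generate_short_uuid()

as_committed = (value,)  # trailing comma -> one-element tuple
as_plain_id = value      # what legislation.py stores under "id"

print(type(as_committed).__name__)  # tuple
print(type(as_plain_id).__name__)   # str
```
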
13 changes: 13 additions & 0 deletions orp/orp_search/utils/documents.py
@@ -1,4 +1,6 @@
import base64
import re
import uuid

from numpy.f2py.auxfuncs import throw_error
from orp_search.models import DataResponseModel, logger
@@ -81,3 +83,14 @@ def _extract_terms(search_query):
combined_content.count(term.lower()) for term in search_query
)
document.save()


def generate_short_uuid():
# Generate a UUID
uid = uuid.uuid4()
# Encode it to base64
uid_b64 = base64.urlsafe_b64encode(uid.bytes).rstrip(b"=").decode("ascii")
return uid_b64[
:22
] # Shorten as needed, typically more than 22 characters are
# unnecessary and remain unique.
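
generate_short_uuid base64url-encodes a random UUID and strips the padding; 16 UUID bytes encode to exactly 22 characters, so the [:22] slice keeps the whole value. A quick self-contained check of the output shape (the function body mirrors the diff above):

```python
import base64
import uuid


def generate_short_uuid():
    uid = uuid.uuid4()
    uid_b64 = base64.urlsafe_b64encode(uid.bytes).rstrip(b"=").decode("ascii")
    return uid_b64[:22]


token = generate_short_uuid()
print(token)       # e.g. 'Vt3mYq0nSkSxY9d1bqFZ2w' (random each run)
print(len(token))  # 22 -- 16 bytes -> 24 base64 chars, minus the '==' padding
```
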
75 changes: 54 additions & 21 deletions orp/orp_search/utils/search.py
@@ -9,7 +9,7 @@
from orp_search.utils.terms import sanitize_input

from django.contrib.postgres.search import SearchQuery, SearchVector
from django.db.models import QuerySet
from django.db.models import Q, QuerySet
from django.http import HttpRequest

logger = logging.getLogger(__name__)
@@ -56,9 +56,20 @@ def _create_search_query(search_string):
return preprocess_query


def _search_database(
def search_database(
config: SearchDocumentConfig,
) -> QuerySet[DataResponseModel]:
"""
Search the database for documents based on the search query
:param config: The search configuration object
:return: A QuerySet of DataResponseModel objects
"""

# If an id is provided, return the document with that id
if config.id:
return DataResponseModel.objects.filter(id=config.id)

# Sanatize the query string
query_str = sanitize_input(config.search_query)
logger.info(f"sanitized search query: {query_str}")
@@ -70,38 +81,54 @@ def _search_database(
# Search across specific fields
vector = SearchVector("title", "description", "regulatory_topics")

# Filter results based on document types if provided
queryset = DataResponseModel.objects.annotate(search=vector).filter(
search=query_objs,
# **(
# {"type__in": config.document_types}
# if config.document_types
# else {}
# ),
)
logger.info(f"search results: {queryset}")
if query_objs:
queryset = DataResponseModel.objects.annotate(search=vector).filter(
search=query_objs,
)
else:
queryset = DataResponseModel.objects.annotate(search=vector)

# Filter by document types
if config.document_types:
query = Q()

# Loop through the document types and add a Q object for each one
for doc_type in config.document_types:
query |= Q(type__icontains=doc_type)
queryset = queryset.filter(query)

# Filter by publisher
if config.publisher_names:
query = Q()

# Loop through the document types and add a Q object for each one
for publisher in config.publisher_names:
query |= Q(publisher__icontains=publisher)
queryset = queryset.filter(query)

# Sort results based on the sort_by parameter (default)
if config.sort_by is None or config.sort_by == "recent":
return queryset.order_by("-date_modified")
return queryset.order_by("-sort_date")

if config.sort_by is not None and config.sort_by == "relevance":
# Calculate the score for each document
calculate_score(config, queryset)
return queryset.order_by("score")

return queryset


def search(context: dict, request: HttpRequest) -> dict:
logger.info("received search request: %s", request)
start_time = time.time()

search_query = request.GET.get("query", "")
document_types = request.GET.get("document_type", "").lower().split(",")
document_types = request.GET.getlist("document_type", [])
offset = request.GET.get("page", "1")
offset = int(offset) if offset.isdigit() else 1
limit = request.GET.get("limit", "10")
limit = int(limit) if limit.isdigit() else 10
publisher = request.GET.getlist("publisher", None)
publishers = request.GET.getlist("publisher", [])
sort_by = request.GET.get("sort", None)

# Get the search results from the Data API using PublicGateway class
Expand All @@ -110,15 +137,15 @@ def search(context: dict, request: HttpRequest) -> dict:
document_types,
limit=limit,
offset=offset,
publisher_names=publisher,
publisher_names=publishers,
sort_by=sort_by,
)

# Display the search query in the log
config.print_to_log()

# Search across specific fields
results = _search_database(config)
results = search_database(config)

# convert search_results into json
pag_start_time = time.time()
@@ -140,10 +167,16 @@


def get_publisher_names():
publishers = DataResponseModel.objects.values("publisher").distinct()

logger.info("getting publisher names...")
publishers_list = []
for publisher in publishers:
publishers_list.append(publisher.publisher)

try:
publishers_list = DataResponseModel.objects.values_list(
"publisher", flat=True
).distinct()
except Exception as e:
logger.error(f"error getting publisher names: {e}")
logger.info("returning empty list of publishers")

logger.info(f"publishers found: {publishers_list}")
return publishers_list
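
The rewritten search_database builds its type and publisher filters by OR-ing one case-insensitive clause per value, then orders by the new sort_date by default. A minimal sketch of that filter construction, assuming a configured Django project (filter values are placeholders):

```python
# Minimal sketch -- assumes Django is configured and DataResponseModel has data.
from django.db.models import Q

from orp_search.models import DataResponseModel

document_types = ["guidance", "legislation"]  # placeholders, already lower-cased
publisher_names = ["example publisher"]       # placeholder, already lower-cased

type_q = Q()
for doc_type in document_types:
    type_q |= Q(type__icontains=doc_type)     # OR one clause per document type

publisher_q = Q()
for publisher in publisher_names:
    publisher_q |= Q(publisher__icontains=publisher)

queryset = (
    DataResponseModel.objects.filter(type_q)
    .filter(publisher_q)
    .order_by("-sort_date")                   # the default "recent" ordering after this change
)
print(queryset.query)                         # inspect the generated SQL
```
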
52 changes: 11 additions & 41 deletions orp/orp_search/views.py
@@ -1,17 +1,16 @@
import base64
import csv
import logging

import pandas as pd

from orp_search.config import SearchDocumentConfig
from orp_search.models import DataResponseModel
from orp_search.public_gateway import PublicGateway
from orp_search.utils.search import search
from orp_search.utils.search import search, search_database

from django.conf import settings
from django.core.serializers import serialize
from django.http import HttpRequest, HttpResponse
from django.shortcuts import redirect, render
from django.shortcuts import render
from django.views.decorators.http import require_http_methods

logger = logging.getLogger(__name__)
@@ -27,50 +26,21 @@ def document(request: HttpRequest, id) -> HttpResponse:
"service_name": settings.SERVICE_NAME_SEARCH,
}

def _decode_url(encoded_url):
decoded_bytes = base64.urlsafe_b64decode(encoded_url.encode("utf-8"))
return decoded_bytes.decode("utf-8")

# Extract the id parameter from the request
document_id = id

# Decode id to see if it's a url ?
try:
decoded_url = _decode_url(document_id)
return redirect(decoded_url)
except Exception:
logger.info("document id is not a url")

logger.info("document id: %s", document_id)
if not document_id:
context["error"] = "no document id provided"
if not id:
context["error"] = "id parameter is required"
return render(request, template_name="document.html", context=context)

# Create a SearchDocumentConfig instance and set the id parameter
config = SearchDocumentConfig(search_query="", id=document_id)
# Create a search configuration object with the provided id
config = SearchDocumentConfig(search_query="", id=id)

# Use the PublicGateway class to fetch the details
public_gateway = PublicGateway()
try:
search_result = public_gateway.search(config)
# logger.info("search result: %s", search_result)

if "regulatory_topics" in search_result:
search_result["regulatory_topics"] = str(
search_result["regulatory_topics"]
).split("\n")

if "related_legislation" in search_result:
search_result["related_legislation"] = str(
search_result["related_legislation"]
).split("\n")

context["result"] = search_result
return render(request, template_name="document.html", context=context)
queryset = search_database(config)
context["result"] = serialize("json", queryset)
except Exception as e:
logger.error("error fetching details: %s", e)
context["error"] = f"error fetching details: {e}"
return render(request, template_name="document.html", context=context)

return render(request, template_name="document.html", context=context)


@require_http_methods(["GET"])
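
The document view now resolves a record through search_database rather than calling the Data API, and serialises the queryset straight into the template context. A condensed sketch of that lookup path, assuming a configured Django project ("some-short-id" is a placeholder):

```python
# Condensed sketch of the new lookup path in the document view.
from django.core.serializers import serialize

from orp_search.config import SearchDocumentConfig
from orp_search.utils.search import search_database

config = SearchDocumentConfig(search_query="", id="some-short-id")  # placeholder id
queryset = search_database(config)            # short-circuits on config.id
context_result = serialize("json", queryset)  # what the view stores in context["result"]
print(context_result)
```
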
