Skip to content

Commit

Permalink
Merge branch 'main' of github.com:uktrade/orp into feature/ORPD-101-document-type-filter
Browse files Browse the repository at this point in the history
  • Loading branch information
gdbarnes committed Nov 20, 2024
2 parents cf3e78d + cdae88a commit eb2f646
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 35 deletions.
10 changes: 7 additions & 3 deletions orp/config/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import orp_search.views as orp_search_views

from orp_search.config import SearchDocumentConfig
from orp_search.models import DataResponseModel, logger
from orp_search.models import DataResponseModel
from orp_search.utils.documents import clear_all_documents
from orp_search.utils.search import get_publisher_names, search
from rest_framework import routers, serializers, status, viewsets
Expand Down Expand Up @@ -128,10 +128,14 @@ class PublishersViewSet(viewsets.ViewSet):
def publishers(self, request, *args, **kwargs):
try:
publishers = get_publisher_names()
logger.info(f"publishers: {publishers}")

results = [
{"name": item["publisher"], "key": item["publisher_id"]}
for item in publishers
]

return Response(
data={"results": publishers},
data={"results": results},
status=status.HTTP_200_OK,
)
except Exception as e:
Expand Down
8 changes: 8 additions & 0 deletions orp/orp_search/legislation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import base64
import logging
import re
import xml.etree.ElementTree as ET # nosec BXXX

from typing import Optional
Expand Down Expand Up @@ -142,6 +143,13 @@ def _to_json(
"title": title,
"identifier": identifier,
"publisher": publisher,
"publisher_id": (
None
if publisher is None
else re.sub(
r"[^a-zA-Z0-9]", "", publisher.replace(" ", "").lower()
)
),
"language": language if language is not None else "eng",
"format": format if format is not None else "",
"description": description if description is not None else "",
Expand Down
3 changes: 2 additions & 1 deletion orp/orp_search/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 4.2.16 on 2024-11-19 17:29
# Generated by Django 4.2.16 on 2024-11-20 15:25

from django.db import migrations, models

Expand All @@ -16,6 +16,7 @@ class Migration(migrations.Migration):
("title", models.TextField(blank=True, null=True)),
("identifier", models.TextField(blank=True, null=True)),
("publisher", models.TextField(blank=True, null=True)),
("publisher_id", models.TextField(blank=True, null=True)),
("language", models.TextField(blank=True, null=True)),
("format", models.TextField(blank=True, null=True)),
("description", models.TextField(blank=True, null=True)),
Expand Down
1 change: 1 addition & 0 deletions orp/orp_search/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class DataResponseModel(models.Model):
title = models.TextField(null=True, blank=True)
identifier = models.TextField(null=True, blank=True)
publisher = models.TextField(null=True, blank=True)
publisher_id = models.TextField(null=True, blank=True)
language = models.TextField(null=True, blank=True)
format = models.TextField(null=True, blank=True)
description = models.TextField(null=True, blank=True)
Expand Down
25 changes: 11 additions & 14 deletions orp/orp_search/public_gateway.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import re

import requests # type: ignore

Expand All @@ -12,20 +13,6 @@
logger = logging.getLogger(__name__)


# def _normalize_date(date_str):
# if date_str is None:
# return None
#
# # If the date is in YYYY format, add "-01-01"
# if len(date_str) == 4:
# return f"{date_str}-01-01"
# # If the date is in YYYY-MM format, add "-01"
# elif len(date_str) == 7:
# return f"{date_str}-01"
# # Otherwise, assume the date is already in YYYY-MM-DD format
# return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")


def _build_like_conditions(field, and_terms, or_terms):
"""
Expand Down Expand Up @@ -101,6 +88,16 @@ def build_cache(self, config):
)
row["id"] = (generate_short_uuid(),)

row["publisher_id"] = (
None
if row["publisher"] is None
else re.sub(
r"[^a-zA-Z0-9]",
"",
row["publisher"].replace(" ", "").lower(),
)
)

insert_or_update_document(row)
inserted_document_count += 1
return response.status_code, inserted_document_count
Expand Down
12 changes: 0 additions & 12 deletions orp/orp_search/utils/paginate.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,6 @@ def paginate(
start_time = time.time()

for paginated_document in paginated_documents:
# if hasattr(paginated_document, "description"):
# description = paginated_document.description
# if description:
# paginated_document.description = (
# (
# description[:100] + "..."
# if len(description) > 100
# else description
# )
# .lstrip(".")
# .capitalize()
# )
if hasattr(paginated_document, "regulatory_topics"):
regulatory_topics = paginated_document.regulatory_topics
if regulatory_topics:
Expand Down
35 changes: 30 additions & 5 deletions orp/orp_search/utils/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,33 @@ def search_database(
# Search across specific fields
vector = SearchVector("title", "description", "regulatory_topics")

queryset = DataResponseModel.objects.all()

if query_objs:
queryset = DataResponseModel.objects.annotate(search=vector).filter(
search=query_objs,
# Treat the query for partial and full-text search
query_chunks = query_str.split()
search_vector = SearchVector(
"title", "description", "regulatory_topics"
)
queryset = queryset.annotate(search=search_vector)

# Creating a combined SearchQuery object from chunks
search_queries = [
SearchQuery(chunk, search_type="plain") for chunk in query_chunks
]
combined_query = search_queries[0]
for sq in search_queries[1:]:
combined_query |= sq

partial_matches = Q()
for chunk in query_chunks:
partial_matches |= (
Q(title__icontains=chunk)
| Q(description__icontains=chunk)
| Q(regulatory_topics__icontains=chunk)
)

queryset = queryset.filter(partial_matches | Q(search=combined_query))
else:
queryset = DataResponseModel.objects.annotate(search=vector)

Expand All @@ -103,7 +126,7 @@ def search_database(

# Loop through the document types and add a Q object for each one
for publisher in config.publisher_names:
query |= Q(publisher__icontains=publisher)
query |= Q(publisher_id__icontains=publisher)
queryset = queryset.filter(query)

# Sort results based on the sort_by parameter (default)
Expand Down Expand Up @@ -171,9 +194,11 @@ def get_publisher_names():
publishers_list = []

try:
publishers_list = DataResponseModel.objects.values_list(
"publisher", flat=True
publishers_list = DataResponseModel.objects.values(
"publisher",
"publisher_id",
).distinct()

except Exception as e:
logger.error(f"error getting publisher names: {e}")
logger.info("returning empty list of publishers")
Expand Down

0 comments on commit eb2f646

Please sign in to comment.