chore: add legislation search functionality and refactor logging
Introduced a new Legislation class to handle legislation-specific searches and integrated it into the search view. Additionally, commented out excessive logging in the public gateway and added the `xmltodict` dependency.
hareshkainthdbt committed Oct 16, 2024
1 parent 78e0ae2 commit 0f600a8
Showing 3 changed files with 223 additions and 10 deletions.
176 changes: 176 additions & 0 deletions orp/orp_search/legislation.py
@@ -0,0 +1,176 @@
import base64
import logging

from io import StringIO

import pandas as pd
import requests # type: ignore

from orp_search.config import SearchDocumentConfig

from django.core.paginator import EmptyPage, PageNotAnInteger, Paginator

logger = logging.getLogger(__name__)


def _extract_td_value(html_content, target_text):
# Step 1: Locate the <h2> element and the subsequent <table> element
h2_text = '<h2 class="title">Count Results</h2>'
table_class = '<table class="results results-single query-builder"'
start_index = html_content.find(h2_text)
if start_index == -1:
raise ValueError("specified <h2> text not found in the HTML content")

start_index = html_content.find(table_class, start_index)
if start_index == -1:
raise ValueError(
"specified <table> class not found in the HTML content"
)

# Step 2: Locate the <th> tag with the target text
th_start = html_content.find(f"<th>{target_text}</th>", start_index)
if th_start == -1:
raise ValueError(
f"<th>{target_text}</th> not found in the HTML content"
)

# Step 3: Find the <td> tag immediately following the located <th> tag
td_start = html_content.find("<td>", th_start)
if td_start == -1:
raise ValueError("no <td> tag found after the specified <th> tag")

td_end = html_content.find("</td>", td_start)
if td_end == -1:
raise ValueError(
"No closing </td> tag found after the specified <th> tag"
)

# Step 4: Extract and return the content within the <td> tag
td_value = html_content[ # noqa: E203
td_start + len("<td>") : td_end # noqa: E203
].strip() # noqa: E203
return td_value


def _perform_request(url, params, timeout=10):
logger.info(f"url for request: {url}")
logger.info(f"params for request: {params}")
response = requests.get(url, params=params, timeout=timeout)
return response.text if response.status_code == 200 else None


def _encode_url(url):
encoded_bytes = base64.urlsafe_b64encode(url.encode("utf-8"))
return encoded_bytes.decode("utf-8")


class Legislation:
def __init__(self):
self.search_url = (
"https://research.legislation.gov.uk/query-builder/search/data.csv"
)
self.count_url = (
"https://research.legislation.gov.uk/query-builder/count"
)

def search(self, config: SearchDocumentConfig):
# List of search terms
title_search_terms = config.search_terms
search_terms = ",".join(title_search_terms)
params = {
"amendments": "include",
"query": search_terms,
"count": "100",
}

# Get search results
data_csv = _perform_request(self.search_url, params, config.timeout)

# Convert the response (string) to a file-like object
data_io = StringIO(data_csv)

# Read the CSV string into a DataFrame
df = pd.read_csv(data_io)

results = []
        # Convert each CSV row into the data-API result format and collect into a list
for index, item in df.iterrows():
results.append(
{
"id": _encode_url(item["id"]),
"title": item["title"],
"document_type": "legislation",
"publisher_id": item["type"],
"publisher": "UK Legislation",
"type": "Legislation",
"date_modified": item["valid"],
}
)

logger.info(f"legislation total results: {len(results)}")
return results

def finalise_results(
self, config: SearchDocumentConfig, results, context
) -> dict:
title_search_terms = config.search_terms
search_terms = ",".join(title_search_terms)
params = {
"amendments": "include",
"query": search_terms,
# 'counting': 'documents',
}

# Get count of total results
count_data_html_page = _perform_request(
self.count_url, params, config.timeout
)
total_document_count = _extract_td_value(
count_data_html_page, "documents"
)

paginated_documents = []
exists = False

# Check if paginator exists in context
if "paginator" not in context:
logger.info("paginator not in context for legislation")
context["paginator"] = {}
paginator = Paginator(results, config.limit)
try:
paginated_documents = paginator.page(config.offset)
except PageNotAnInteger:
paginated_documents = paginator.page(1)
except EmptyPage:
paginated_documents = paginator.page(paginator.num_pages)
else:
logger.info("paginator exists in context for legislation")
exists = True
paginator = context["paginator"]

# If paginator exists then add all results to paginator
if exists:
all_items = paginator.object_list

# Convert to a list if necessary
all_non_legislation_items = list(all_items)

# Combine with legislation results
all_items = all_non_legislation_items + results

paginator = Paginator(all_items, config.limit)
try:
paginated_documents = paginator.page(config.offset)
except PageNotAnInteger:
paginated_documents = paginator.page(1)
except EmptyPage:
paginated_documents = paginator.page(paginator.num_pages)

context["paginator"] = paginator
context["is_paginated"] = paginator.num_pages > 1
context["results"] = paginated_documents
context["results_count"] = len(paginated_documents)
context["results_total_count"] = total_document_count
context["results_page_total"] = paginator.num_pages
context["current_page"] = config.offset
return context
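
A minimal standalone usage sketch of the new class (illustrative only, not part of the changed files). It assumes the SearchDocumentConfig constructor used in views.py below, that the config exposes search_terms, timeout, limit and offset as referenced above, and that both requests to legislation.gov.uk succeed — _perform_request returns None on a non-200 response, which the CSV parsing above does not guard against.

# Illustrative sketch only, not part of this commit.
from orp_search.config import SearchDocumentConfig
from orp_search.legislation import Legislation

# Constructor arguments mirror the call in views.py; timeout is assumed to
# default sensibly on SearchDocumentConfig.
config = SearchDocumentConfig(
    "data protection",
    ["legislation"],
    dummy=True,
    limit=10,
    offset=1,
)

legislation = Legislation()
results = legislation.search(config)  # list of dicts in the data-API shape
context = legislation.finalise_results(config, results, context={})
print(context["results_total_count"], "total documents")
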
4 changes: 2 additions & 2 deletions orp/orp_search/public_gateway.py
@@ -156,7 +156,7 @@ def search(self, config: SearchDocumentConfig):

if config.sort_by is None:
results = filtered_df.to_dict(orient="records")
logger.info("filtered data: %s", results)
# logger.info("filtered data: %s", results)
return results

sorted_df = None
@@ -206,7 +206,7 @@ def score_text(text, terms):
else:
results = []

logger.info("filtered data: %s", results)
# logger.info("filtered data: %s", results)
return results

# Base URL for the API
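The commit silences the verbose result dumps above by commenting them out. A hedged alternative, shown only to illustrate the design choice rather than what the commit does, is to demote the dump to DEBUG so it can be switched back on through logging configuration:

# Illustrative alternative to commenting the call out: emit the potentially
# large dump only when DEBUG logging is enabled for this logger.
if logger.isEnabledFor(logging.DEBUG):
    logger.debug("filtered data: %s", results)
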
53 changes: 45 additions & 8 deletions orp/orp_search/views.py
@@ -1,13 +1,15 @@
import base64
import csv
import logging

import pandas as pd

from orp_search.legislation import Legislation
from orp_search.public_gateway import PublicGateway, SearchDocumentConfig

from django.conf import settings
from django.http import HttpRequest, HttpResponse
from django.shortcuts import render
from django.shortcuts import redirect, render
from django.views.decorators.http import require_http_methods

from core.forms import RegulationSearchForm
@@ -21,13 +23,24 @@ def document(request: HttpRequest, id) -> HttpResponse:
Handles the GET request to fetch details based on the provided id.
"""

context = {
"service_name": settings.SERVICE_NAME_SEARCH,
}

def _decode_url(encoded_url):
decoded_bytes = base64.urlsafe_b64decode(encoded_url.encode("utf-8"))
return decoded_bytes.decode("utf-8")

# Extract the id parameter from the request
document_id = id

    # Try decoding the id in case it is a base64-encoded URL
try:
decoded_url = _decode_url(document_id)
return redirect(decoded_url)
except Exception:
logger.info("document id is not a url")

logger.info("document id: %s", document_id)
if not document_id:
context["error"] = "no document id provided"
@@ -155,7 +168,11 @@ def search(request: HttpRequest) -> HttpResponse:

# Get the search results from the Data API using PublicGateway class
config = SearchDocumentConfig(
search_query, document_types, dummy=True, limit=limit, offset=offset
str(search_query).lower(),
document_types,
dummy=True,
limit=limit,
offset=offset,
)

if publisher:
@@ -164,11 +181,31 @@
if sort_by:
config.sort_by = sort_by

public_gateway = PublicGateway()
search_results = public_gateway.search(config)
if (
not config.document_types
or "standard" in config.document_types
or "guidance" in config.document_types
):
public_gateway = PublicGateway()
search_results = public_gateway.search(config)
context = public_gateway.finalise_results(
config, search_results, context
)

# Legislation search
# If config.search_terms is empty then we don't need to
# search for legislation
if not config.search_terms or "" in config.search_terms:
logger.info("no search terms provided")
return render(request, template_name="orp.html", context=context)

context = public_gateway.finalise_results(config, search_results, context)
if not config.document_types or "legislation" in config.document_types:
logger.info("searching for legislation: %s", config.search_terms)
legislation = Legislation()
legislation_results = legislation.search(config)
logger.info(f"legislation results: {legislation_results}")
context = legislation.finalise_results(
config, legislation_results, context
)

logger.info("search results: %s", context["results"])
# logger.info("search results count: %s", context["results_count"])
return render(request, template_name="orp.html", context=context)
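
The document view above now attempts to base64-decode the incoming id and redirects when the id was produced by _encode_url in legislation.py. A minimal sketch of that round trip (standalone, mirroring the two helpers rather than importing them; the URL is just an example value):

import base64

def _encode_url(url: str) -> str:
    # As in legislation.py: URL-safe base64 keeps the value usable as a path segment.
    return base64.urlsafe_b64encode(url.encode("utf-8")).decode("utf-8")

def _decode_url(encoded_url: str) -> str:
    # As in the document view.
    return base64.urlsafe_b64decode(encoded_url.encode("utf-8")).decode("utf-8")

token = _encode_url("https://www.legislation.gov.uk/ukpga/2010/15")  # example URL
assert _decode_url(token) == "https://www.legislation.gov.uk/ukpga/2010/15"

For an ordinary, non-encoded document id the decode usually fails (bad padding or undecodable bytes), which is why the view wraps the attempt in try/except and falls through to the normal lookup.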
