feat: add dummy data handling and pandas dependency
Introduced a dummy flag in SearchDocumentConfig and updated public_gateway to return filtered data from a bundled CSV when the flag is set. Added pandas as a dependency for the CSV handling. Removed the obsolete migration and search files.
hareshkainthdbt committed Oct 10, 2024
1 parent 7ea3c46 commit 928a199
Showing 9 changed files with 7,487 additions and 141 deletions.
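
The new flag is opted into wherever the config is built (views.py below passes dummy=True). A minimal usage sketch, assuming the construction-data.csv added in this commit is reachable at the relative path the gateway expects; the search terms are illustrative only:

from orp_search.config import SearchDocumentConfig
from orp_search.public_gateway import PublicGateway

# Build a config with the new dummy flag so the gateway reads from the bundled CSV
# instead of calling the Data API.
config = SearchDocumentConfig("fire safety, scaffolding", dummy=True)
results = PublicGateway().search(config)  # list of record dicts filtered from the CSV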
5 changes: 4 additions & 1 deletion orp/orp_search/config.py
@@ -4,7 +4,9 @@


class SearchDocumentConfig:
def __init__(self, search_terms: str, document_types=None, timeout=None):
def __init__(
self, search_terms: str, document_types=None, timeout=None, dummy=False
):
"""
Initializes a new instance of the class.
@@ -17,6 +19,7 @@ def __init__(self, search_terms: str, document_types=None, timeout=None):
self.search_terms = [term.strip() for term in search_terms.split(",")]
self.document_types = document_types
self.timeout = None if timeout is None else int(timeout)
self.dummy = dummy

def validate(self):
"""
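
Pieced together from the hunks above, the constructor after this change reads roughly as follows (a sketch; the validate() body is collapsed in this view and omitted here):

class SearchDocumentConfig:
    def __init__(
        self, search_terms: str, document_types=None, timeout=None, dummy=False
    ):
        # Comma-separated search terms are split and stripped into a list
        self.search_terms = [term.strip() for term in search_terms.split(",")]
        self.document_types = document_types
        self.timeout = None if timeout is None else int(timeout)
        # New flag: when True, the gateway serves filtered rows from the bundled CSV
        self.dummy = dummy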
7,270 changes: 7,270 additions & 0 deletions orp/orp_search/construction-data.csv

Large diffs are not rendered by default.

31 changes: 0 additions & 31 deletions orp/orp_search/migrations/0001_initial.py

This file was deleted.

147 changes: 131 additions & 16 deletions orp/orp_search/models.py
@@ -1,19 +1,50 @@
import json
import logging
import uuid

from datetime import timedelta

from orp_search.public_gateway import SearchDocumentConfig
from orp_search.config import SearchDocumentConfig

from django.db import models
from django.utils import timezone

logger = logging.getLogger(__name__)


class PublicGatewayCache(models.Model):
title = models.CharField(max_length=255)
identifier = models.CharField(max_length=255)
publisher = models.CharField(max_length=255)
language = models.CharField(max_length=255)
format = models.CharField(max_length=255)
description = models.TextField()
date_issued = models.CharField(max_length=255)
date_modified = models.DateTimeField()
date_valid = models.DateTimeField()
audience = models.CharField(max_length=255)
coverage = models.CharField(max_length=255)
subject = models.CharField(max_length=255)
type = models.CharField(max_length=255)
license = models.CharField(max_length=255)
regulatory_topics = models.CharField(max_length=255)
status = models.CharField(max_length=255)
date_uploaded_to_orp = models.CharField(max_length=255)
has_format = models.CharField(max_length=255)
is_format_of = models.CharField(max_length=255)
has_version = models.CharField(max_length=255)
is_version_of = models.CharField(max_length=255)
references = models.CharField(max_length=255)
is_referenced_by = models.CharField(max_length=255)
has_part = models.CharField(max_length=255)
is_part_of = models.CharField(max_length=255)
is_replaced_by = models.CharField(max_length=255)
replaces = models.CharField(max_length=255)
related_legislation = models.CharField(max_length=255)

search_terms = models.CharField(max_length=255)
document_types = models.JSONField()
response = models.TextField()
document_types = models.CharField(max_length=255)
created_at = models.DateTimeField(auto_now_add=True) # Timestamp for TTL

TTL = timedelta(days=1) # Time-To-Live duration for cache entries

@staticmethod
@@ -28,27 +59,111 @@ def get_cached_response(cls, config):
# Look up the cached response for the given config
key = cls._config_to_key(config)
try:
cache_entry = cls.objects.get(
cache_entries = cls.objects.filter(
search_terms=key[0], document_types=key[1]
)
if cls.is_expired(cache_entry):
# If expired, delete it and return None
cache_entry.delete()
if not cache_entries:
return None
return cache_entry.response
if any(cls.is_expired(entry) for entry in cache_entries):
# If any entry is expired, delete all related entries and
# return None
cache_entries.delete()
return None
return [
{
"title": entry.title,
"identifier": entry.identifier,
"publisher": entry.publisher,
"language": entry.language,
"format": entry.format,
"description": entry.description,
"date_issued": entry.date_issued,
"date_modified": entry.date_modified,
"date_valid": entry.date_valid,
"audience": entry.audience,
"coverage": entry.coverage,
"subject": entry.subject,
"type": entry.type,
"license": entry.license,
"regulatory_topics": entry.regulatory_topics,
"status": entry.status,
"date_uploaded_to_orp": entry.date_uploaded_to_orp,
"has_format": entry.has_format,
"is_format_of": entry.is_format_of,
"has_version": entry.has_version,
"is_version_of": entry.is_version_of,
"references": entry.references,
"is_referenced_by": entry.is_referenced_by,
"has_part": entry.has_part,
"is_part_of": entry.is_part_of,
"is_replaced_by": entry.is_replaced_by,
"replaces": entry.replaces,
"related_legislation": entry.related_legislation,
"created_at": entry.created_at,
}
for entry in cache_entries
]
except cls.DoesNotExist:
return None

@classmethod
def cache_response(cls, config, response):
# Store the response in the cache
logger.info(f"caching service received response: {response}")

# Cache the response for the given config
if isinstance(response, list) and response:
records = response[0].get("rows", [])
else:
records = []

if not records:
logger.info("no records to cache")
return

logger.info("attempting to cache records: %s", records)

# Store each record in the response in the cache
key = cls._config_to_key(config)
cache_entry, created = cls.objects.update_or_create(
search_terms=key[0],
document_types=key[1],
defaults={"response": response, "created_at": timezone.now()},
)
return cache_entry
for record in records:
logger.info("caching record: %s", record)
record_id = record.get(
"id", str(uuid.uuid4())
) # Use a unique identifier for each record
cls.objects.update_or_create(
search_terms=key[0],
document_types=key[1],
record_id=record_id,
defaults={
"title": record.get("title"),
"identifier": record.get("identifier"),
"publisher": record.get("publisher"),
"language": record.get("language"),
"format": record.get("format"),
"description": record.get("description"),
"date_issued": record.get("date_issued"),
"date_modified": record.get("date_modified"),
"date_valid": record.get("date_valid"),
"audience": record.get("audience"),
"coverage": record.get("coverage"),
"subject": record.get("subject"),
"type": record.get("type"),
"license": record.get("license"),
"regulatory_topics": record.get("regulatory_topics"),
"status": record.get("status"),
"date_uploaded_to_orp": record.get("date_uploaded_to_orp"),
"has_format": record.get("has_format"),
"is_format_of": record.get("is_format_of"),
"has_version": record.get("has_version"),
"is_version_of": record.get("is_version_of"),
"references": record.get("references"),
"is_referenced_by": record.get("is_referenced_by"),
"has_part": record.get("has_part"),
"is_part_of": record.get("is_part_of"),
"is_replaced_by": record.get("is_replaced_by"),
"replaces": record.get("replaces"),
"related_legislation": record.get("related_legislation"),
},
)

@classmethod
def is_expired(cls, cache_entry):
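
The body of is_expired is collapsed in this view. A plausible minimal sketch, assuming (as the field comment suggests) that an entry is stale once its created_at timestamp plus the class-level TTL has passed:

from django.utils import timezone

# Sketch of a method on PublicGatewayCache; the real body is not shown in this diff.
@classmethod
def is_expired(cls, cache_entry):
    # Assumed behaviour: compare the entry's age against the one-day TTL
    return timezone.now() > cache_entry.created_at + cls.TTL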
36 changes: 31 additions & 5 deletions orp/orp_search/public_gateway.py
@@ -1,5 +1,6 @@
import logging

import pandas as pd
import requests # type: ignore

from jinja2 import Template
@@ -34,18 +35,43 @@ def _build_like_conditions(self, field, terms):
return " OR ".join([f"{field} LIKE '%{term}%'" for term in terms])

def search(self, config: SearchDocumentConfig):
logger.info("searching for market barriers")
# List of search terms
title_search_terms = config.search_terms
summary_search_terms = config.search_terms

# If the dummy flag is set, return dummy data. Ideally, this will be
# removed from the final implementation
if config.dummy:
df = pd.read_csv("orp/orp_search/construction-data.csv")
server_terms_pattern = "|".join(title_search_terms)
document_types_pattern = "|".join(summary_search_terms)
logger.info("server_terms_pattern: %s", server_terms_pattern)
logger.info("document_types_pattern: %s", document_types_pattern)

# Filter the DataFrame based on the search terms
filtered_df = df[
(
df["title"].str.contains(
server_terms_pattern, case=False, na=False
)
)
& (
df["description"].str.contains(
document_types_pattern, case=False, na=False
)
)
]
results = filtered_df.to_dict(orient="records")
logger.info("filtered data: %s", results)
return results

# Base URL for the API
# TODO: need to use aws parameter store to store the base url
url = (
"https://data.api.trade.gov.uk/v1/datasets/market-barriers"
"/versions/v1.0.10/data"
)

# List of search terms
title_search_terms = config.search_terms
summary_search_terms = config.search_terms

# Build the WHERE clause
# TODO: need to use aws parameter store to store the field names
title_conditions = self._build_like_conditions(
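
The dummy branch above joins the search terms into a regex alternation ("term1|term2") and keeps only rows whose title and description both match, case-insensitively. A self-contained sketch of that filtering pattern (CSV path and column names as in the diff; the example terms are hypothetical):

import pandas as pd

def filter_dummy_data(csv_path: str, search_terms: list[str]):
    # e.g. ["fire", "safety"] -> "fire|safety", used as a case-insensitive regex
    pattern = "|".join(search_terms)
    df = pd.read_csv(csv_path)
    mask = df["title"].str.contains(pattern, case=False, na=False) & df[
        "description"
    ].str.contains(pattern, case=False, na=False)
    # Return plain dicts, matching what the gateway hands back to the view
    return df[mask].to_dict(orient="records")

# results = filter_dummy_data("orp/orp_search/construction-data.csv", ["fire", "safety"])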
14 changes: 0 additions & 14 deletions orp/orp_search/search.py

This file was deleted.

49 changes: 6 additions & 43 deletions orp/orp_search/views.py
@@ -1,6 +1,4 @@
import json
import logging
import re

from orp_search.models import PublicGatewayCache
from orp_search.public_gateway import PublicGateway, SearchDocumentConfig
@@ -15,36 +13,6 @@
logger = logging.getLogger(__name__)


def clean_json_response(response):
# Ensure the response is a string
if isinstance(response, dict):
response = json.dumps(response)

# Clean the escape characters and fix JSON format
cleaned_response = response.replace('\\"', '"').replace("\\r\\n", "\n")

# Remove invalid control characters
# Regex to match and remove control characters except '\n' or '\t'
cleaned_response = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", cleaned_response)

# Split concatenated JSON objects by looking for "} {"
json_objects = re.split(r"}\s*{", cleaned_response)

# Add missing braces to objects
json_objects = [
obj if obj.strip().startswith("{") else "{" + obj
for obj in json_objects
]
json_objects = [
obj if obj.strip().endswith("}") else obj + "}" for obj in json_objects
]

# Parse each JSON object
parsed_objects = [json.loads(obj) for obj in json_objects]

return parsed_objects


@require_http_methods(["GET"])
def search(request: HttpRequest) -> HttpResponse:
"""Search view.
@@ -86,31 +54,26 @@ def search(request: HttpRequest) -> HttpResponse:
if not search_query:
return render(request, template_name="orp.html", context=context)

search_query_json = json.dumps(search_query)
document_types_json = json.dumps(document_types)

logger.info("Search query (json): %s", search_query_json)
logger.info("Document types (json): %s", document_types_json)
logger.info("Search query: %s", search_query)
logger.info("Document types: %s", document_types)

# Get the search results from the Data API using PublicGateway class
config = SearchDocumentConfig(search_query, document_types)
config = SearchDocumentConfig(search_query, document_types, dummy=True)

# Check if the response is cached
logger.info("checking for cached response")
cached_response = PublicGatewayCache.get_cached_response(config)
if cached_response:
logger.info("using cached response")
search_results = json.loads(cached_response)
search_results = cached_response
else:
logger.info("fetching new response")
public_gateway = PublicGateway()
search_results = public_gateway.search(config)
search_results = clean_json_response(search_results)

# Cache the response
logger.info("caching response")
PublicGatewayCache.cache_response(config, json.dumps(search_results))
PublicGatewayCache.cache_response(config, search_results)

logger.info("search results json: %s", search_results)
context["search_results"] = search_results
context["results"] = search_results
return render(request, template_name="orp.html", context=context)
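
The view now follows a straightforward cache-aside flow: return cached records when get_cached_response finds unexpired entries, otherwise hit the gateway and cache what comes back. Condensed into one helper (names as in this diff; the helper itself is illustrative):

def get_search_results(config):
    # Cache hit: get_cached_response returns a list of record dicts, or None
    cached = PublicGatewayCache.get_cached_response(config)
    if cached:
        return cached

    # Cache miss: query the gateway, then store the records for the next request
    results = PublicGateway().search(config)
    PublicGatewayCache.cache_response(config, results)
    return results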
