Skip to content

Commit

Permalink
feat: add search functionality and caching
Browse files Browse the repository at this point in the history
Introduced new classes for search config and documents. Implemented search and caching in views, and added new dependencies. Updated HTML and README for improvements.
  • Loading branch information
hareshkainthdbt committed Oct 8, 2024
1 parent df978e5 commit 7ea3c46
Show file tree
Hide file tree
Showing 13 changed files with 541 additions and 10 deletions.
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Create the initial database:
$ make database

> The `make database` command will create a `PostgreSQL` database. If you have
> an existing database and want to start from scratch, use `make drop-databse`
> an existing database and want to start from scratch, use `make drop-database`
> to delete an existing database first.
Prepare the application for first use:
Expand Down Expand Up @@ -98,3 +98,22 @@ With your Poetry shell active:
> This will ensure that your code passes quality checks before you commit it.
> Code quality checks are also performed when pushing your code to origin
> but pre-commit hooks catch issues early and will improve Developer Experience.

### Update database tables

> To update local database tables, you need to set the `DATABASE_URL` environment variable. You can set it in the terminal or in the `.env` file.
<!-- pragma: allowlist secret --> $ export DATABASE_URL=postgres://postgres:postgres@localhost:5432/orp

> If you want to migrate all apps then navigate to /orp/orp and use the following command:
$ poetry run python manage.py migrate

> If you want to migrate a single app then navigate to /orp/orp and use the following command:
$ poetry run python manage.py migrate <app_name>



poetry add boto3 awswrangler
Empty file added __init__.py
Empty file.
17 changes: 17 additions & 0 deletions orp/core/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,20 @@ class RegulationSearchForm(forms.Form):
}
),
)

document_type = forms.MultipleChoiceField(
required=False,
choices=[
("employment-tribunal", "Legislation"),
("MOD", "Guidance"),
("DfT", "Statutory guidance"),
],
widget=forms.CheckboxSelectMultiple(
attrs={
"class": "govuk-checkboxes__input",
"data-module": "govuk-checkboxes",
}
),
label="Select document types",
help_text="You can select multiple document types.",
)
2 changes: 1 addition & 1 deletion orp/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def main():
"""Run administrative tasks."""
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.local")
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
Expand Down
38 changes: 38 additions & 0 deletions orp/orp_search/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import logging

logger = logging.getLogger(__name__)


class SearchDocumentConfig:
    """Holds the parameters for a public-gateway search request."""

    def __init__(self, search_terms: str, document_types=None, timeout=None):
        """
        Initializes a new instance of the class.

        :param search_terms: A comma-separated string of search terms.
            Blank terms (e.g. from "a,,b" or a trailing comma) are
            discarded so they cannot produce match-everything conditions.
        :param document_types: Optional. A list of document types
            to filter the search.
        :param timeout: Optional. The timeout in seconds for the search
            request; coerced to int so string values are accepted.
        """
        # Split, trim, and drop empty terms: "a, ,b," -> ["a", "b"].
        self.search_terms = [
            term.strip()
            for term in search_terms.split(",")
            if term.strip()
        ]
        self.document_types = document_types
        self.timeout = None if timeout is None else int(timeout)

    def validate(self):
        """
        Validates the presence of search terms.

        Because __init__ discards blank terms, an empty or all-whitespace
        input string yields an empty list here, and is correctly rejected
        (previously "" produced [""] and validated as True).

        Returns
        -------
        bool
            True if at least one search term is present, False otherwise.
        """
        if not self.search_terms:
            # logging is imported at module level in this file.
            logging.getLogger(__name__).error("search terms are required")
            return False
        return True
13 changes: 13 additions & 0 deletions orp/orp_search/management/commands/lean_expired_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from django.core.management.base import BaseCommand

from ...models import PublicGatewayCache


class Command(BaseCommand):
    """Management command that deletes expired PublicGatewayCache rows."""

    # NOTE(review): the file is named lean_expired_cache.py — presumably a
    # typo for clean_expired_cache. The filename IS the command name, so
    # renaming it changes how the command is invoked; confirm before fixing.
    help = "clean up expired cache entries"

    def handle(self, *args, **kwargs):
        # Delegate the actual deletion to the model, then report success.
        PublicGatewayCache.clean_up_expired_entries()
        self.stdout.write(
            self.style.SUCCESS("successfully cleaned up expired cache entries")
        )
31 changes: 31 additions & 0 deletions orp/orp_search/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Generated by Django 4.2.15 on 2024-10-02 14:53

from django.db import migrations, models


class Migration(migrations.Migration):
    # Initial migration for the orp_search app: creates the table backing
    # orp_search.models.PublicGatewayCache.

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="PublicGatewayCache",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Cache key components: the search terms and type filter.
                ("search_terms", models.CharField(max_length=255)),
                ("document_types", models.JSONField()),
                # Raw response body returned by the public gateway.
                ("response", models.TextField()),
                # Set automatically on insert; used for TTL expiry.
                ("created_at", models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
Empty file.
61 changes: 61 additions & 0 deletions orp/orp_search/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import json

from datetime import timedelta

from orp_search.public_gateway import SearchDocumentConfig

from django.db import models
from django.utils import timezone


class PublicGatewayCache(models.Model):
    """Database-backed cache of public-gateway search responses with a TTL."""

    search_terms = models.CharField(max_length=255)
    document_types = models.JSONField()
    response = models.TextField()
    created_at = models.DateTimeField(auto_now_add=True)  # Timestamp for TTL

    TTL = timedelta(days=1)  # Time-To-Live duration for cache entries

    @staticmethod
    def _config_to_key(config: SearchDocumentConfig):
        """
        Derive the (search_terms, document_types) lookup key for a config.

        NOTE(review): config.search_terms is a list; storing it in a
        CharField relies on its str() representation being stable. Confirm
        this is intended, or join the terms into a canonical string.
        """
        return config.search_terms, json.dumps(
            config.document_types, sort_keys=True
        )

    @classmethod
    def get_cached_response(cls, config):
        """
        Return the cached response for config, or None if absent/expired.

        An expired entry is deleted as a side effect of the lookup
        (lazy eviction on read).
        """
        key = cls._config_to_key(config)
        # Keep the try body minimal: only the lookup can raise DoesNotExist.
        try:
            cache_entry = cls.objects.get(
                search_terms=key[0], document_types=key[1]
            )
        except cls.DoesNotExist:
            return None
        if cls.is_expired(cache_entry):
            cache_entry.delete()
            return None
        return cache_entry.response

    @classmethod
    def cache_response(cls, config, response):
        """Store (or refresh) the response for config and return the row."""
        key = cls._config_to_key(config)
        # Refreshing created_at on update restarts the TTL for this key.
        # (The "created" flag from update_or_create is not needed here.)
        cache_entry, _ = cls.objects.update_or_create(
            search_terms=key[0],
            document_types=key[1],
            defaults={"response": response, "created_at": timezone.now()},
        )
        return cache_entry

    @classmethod
    def is_expired(cls, cache_entry):
        """Return True if cache_entry is older than the TTL."""
        return timezone.now() > cache_entry.created_at + cls.TTL

    @classmethod
    def clean_up_expired_entries(cls):
        """Bulk-delete all rows whose TTL has elapsed."""
        cls.objects.filter(created_at__lt=timezone.now() - cls.TTL).delete()
93 changes: 93 additions & 0 deletions orp/orp_search/public_gateway.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import logging

import requests # type: ignore

from jinja2 import Template
from orp_search.config import SearchDocumentConfig

logger = logging.getLogger(__name__)


class PublicGateway:
    def __init__(self):
        """
        Initializes the API client with the base URL for the Trade Data API.

        Attributes:
            base_url (str): The base URL of the Trade Data API.
        """
        self.base_url = "https://data.api.trade.gov.uk"

    def _build_like_conditions(self, field, terms):
        """
        Generates SQL LIKE conditions.

        Single quotes in each term are doubled ('') so a user-supplied
        term cannot break out of the quoted literal and inject SQL —
        S3 Select has no parameterized queries, so escaping is the only
        available defence.

        Args:
            field (str): The database field to apply the LIKE condition to.
            terms (list of str): A list of terms to include in the LIKE
            condition.

        Returns:
            str: A string containing the LIKE conditions combined with 'OR'.
        """
        escaped = [term.replace("'", "''") for term in terms]
        return " OR ".join(f"{field} LIKE '%{t}%'" for t in escaped)

    def search(self, config: "SearchDocumentConfig"):
        """
        Search the market-barriers dataset for the configured terms.

        Builds an S3-Select query matching config.search_terms against
        both the title and summary fields, performs the GET request, and
        returns the raw response body.

        :param config: Search parameters (terms and optional timeout).
        :return: The response text on HTTP 200, otherwise None.
        """
        logger.info("searching for market barriers")
        # Derive the dataset URL from base_url instead of re-hardcoding it.
        # TODO: need to use aws parameter store to store the base url
        url = (
            f"{self.base_url}/v1/datasets/market-barriers"
            "/versions/v1.0.10/data"
        )

        # The same search terms are matched against title and summary.
        # TODO: need to use aws parameter store to store the field names
        title_conditions = self._build_like_conditions(
            "b.title", config.search_terms
        )
        summary_conditions = self._build_like_conditions(
            "b.summary", config.search_terms
        )

        # SQL query to filter based on title and summary containing search
        # terms
        # TODO: we are using example data here, this needs to be updated with
        # the actual table and field names
        query_template = """
        SELECT *
        FROM S3Object[*].barriers[*] b
        WHERE ({{ title_conditions }}) AND ({{ summary_conditions }})
        """

        template = Template(query_template)
        query = template.render(
            title_conditions=title_conditions,
            summary_conditions=summary_conditions,
        )

        # URL encode the query for the API request
        params = {"format": "json", "query-s3-select": query}

        # Log the query with parameters (lazy %-style args).
        logger.info("request will contain the following query: %s", query)
        logger.info(
            "request will contain the following parameters: %s", params
        )

        # Make the GET request
        response = requests.get(url, params=params, timeout=config.timeout)

        # Check if the request was successful
        if response.status_code == 200:
            data = response.text
            logger.info("data fetched successfully: %s", data)
            return data
        else:
            logger.error("data fetch failed: %s", response.text)
            return None
Loading

0 comments on commit 7ea3c46

Please sign in to comment.