From 25809496097a684774f3dbc3dab90f2c09ae47cd Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 14 May 2024 16:56:28 -0700 Subject: [PATCH 1/3] required for registry's installation --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0a118d05..f204e9cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ markdown==3.6 boto3==1.34.84 Pillow==10.3.0 python-magic==0.4.27 -retrying==1.3.4 \ No newline at end of file +retrying==1.3.4 +elasticsearch==7.13.4 # 8.13.0 is current From 9d5e60326acda6622218392f46c6d7f055ddb5b7 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 14 May 2024 17:02:07 -0700 Subject: [PATCH 2/3] Add caching to the es_cache_retry module --- exhibits/es_cache_retry.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/exhibits/es_cache_retry.py b/exhibits/es_cache_retry.py index 8302bfc1..6855f5fa 100644 --- a/exhibits/es_cache_retry.py +++ b/exhibits/es_cache_retry.py @@ -14,6 +14,8 @@ from elasticsearch.exceptions import ConnectionError as ESConnectionError from elasticsearch.exceptions import RequestError as ESRequestError +from exhibits.utils import kwargs_md5 + urllib3.disable_warnings() if hasattr(settings, 'XRAY_RECORDER'): @@ -31,6 +33,11 @@ def es_search(body) -> ESResults: + cache_key = f"es_search_{kwargs_md5(**body)}" + cached_results = cache.get(cache_key) + if cached_results: + return cached_results + try: results = elastic_client.search( index=settings.ES_ALIAS, body=json.dumps(body)) @@ -83,6 +90,8 @@ def es_search(body) -> ESResults: results['hits']['total']['value'], facet_counts) + cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds + return results def get_thumbnail_key(metadata): @@ -101,8 +110,9 @@ def get_media_key(metadata): key_parts = uri_path.split('/')[2:] return '/'.join(key_parts) -def es_search_nocache(**kwargs): - return es_search(kwargs) + +# def es_search_nocache(**kwargs): +# return es_search(kwargs) def es_get(item_id: str) -> Optional[ESItem]: @@ -110,6 +120,11 @@ def es_get(item_id: str) -> Optional[ESItem]: if not item_id: return None + cache_key = f"es_get_{item_id}" + cached_results = cache.get(cache_key) + if cached_results: + return cached_results + # cannot use Elasticsearch.get() for multi-index alias body = {'query': {'match': {'_id': item_id}}} try: @@ -150,6 +165,7 @@ def es_get(item_id: str) -> Optional[ESItem]: item['children'] = updated_children results = ESItem(found, item, item_search) + cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds return results From 9d57b9b30e999c602eb4bf73678bcca5c42c6f2f Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 15 May 2024 12:25:35 -0700 Subject: [PATCH 3/3] Add retries w/ 3sec max; update es_cache_retry to match public_interface --- exhibits/es_cache_retry.py | 107 +++++++++++++++++-------------------- exhibits/utils.py | 23 ++++++++ 2 files changed, 71 insertions(+), 59 deletions(-) diff --git a/exhibits/es_cache_retry.py b/exhibits/es_cache_retry.py index 6855f5fa..e8915910 100644 --- a/exhibits/es_cache_retry.py +++ b/exhibits/es_cache_retry.py @@ -1,20 +1,22 @@ """ logic for cache / retry for es (opensearch) and JSON from registry """ -from django.core.cache import cache -from django.conf import settings +import json +import urllib3 from collections import namedtuple -from urllib.parse import urlparse -import urllib3 -import json from typing import Dict, List, Tuple, Optional +from urllib.parse 
import urlparse + from aws_xray_sdk.core import patch +from django.core.cache import cache +from django.conf import settings from elasticsearch import Elasticsearch from elasticsearch.exceptions import ConnectionError as ESConnectionError from elasticsearch.exceptions import RequestError as ESRequestError +from retrying import retry -from exhibits.utils import kwargs_md5 +from exhibits.utils import kwargs_md5, UCLDC_SCHEMA_TERM_FIELDS urllib3.disable_warnings() @@ -32,6 +34,37 @@ 'ESItem', 'found, item, resp') +def shim_record(metadata): + # TODO replace type_ss with type globally + metadata['type_ss'] = metadata.get('type') + metadata['collection_ids'] = metadata.get('collection_url') + metadata['repository_ids'] = metadata.get('repository_url') + + thumbnail_key = get_thumbnail_key(metadata) + if thumbnail_key: + metadata['reference_image_md5'] = thumbnail_key + + media_key = get_media_key(metadata) + if media_key: + metadata['media']['media_key'] = media_key + + if metadata.get('children'): + children = metadata.pop('children') + updated_children = [] + for child in children: + thumbnail_key = get_thumbnail_key(child) + if thumbnail_key: + child['thumbnail_key'] = thumbnail_key + media_key = get_media_key(child) + if media_key: + child['media']['media_key'] = media_key + updated_children.append(child) + metadata['children'] = updated_children + + return metadata + + +@retry(stop_max_delay=3000) # milliseconds def es_search(body) -> ESResults: cache_key = f"es_search_{kwargs_md5(**body)}" cached_results = cache.get(cache_key) @@ -59,30 +92,7 @@ def es_search(body) -> ESResults: facet_counts = {} for result in results['hits']['hits']: - metadata = result.pop('_source') - # TODO replace type_ss with type globally - metadata['type_ss'] = metadata.get('type') - thumbnail_key = get_thumbnail_key(metadata) - if thumbnail_key: - metadata['reference_image_md5'] = thumbnail_key - - media_key = get_media_key(metadata) - if media_key: - metadata['media']['media_key'] = media_key - - if metadata.get('children'): - children = metadata.pop('children') - updated_children = [] - for child in children: - thumbnail_key = get_thumbnail_key(child) - if thumbnail_key: - child['thumbnail_key'] = thumbnail_key - media_key = get_media_key(child) - if media_key: - child['media']['media_key'] = media_key - updated_children.append(child) - metadata['children'] = updated_children - + metadata = shim_record(result.pop('_source')) result.update(metadata) results = ESResults( @@ -115,6 +125,7 @@ def get_media_key(metadata): # return es_search(kwargs) +@retry(stop_max_delay=3000) # milliseconds def es_get(item_id: str) -> Optional[ESItem]: # cannot search Elasticsearch with empty string if not item_id: @@ -140,29 +151,7 @@ def es_get(item_id: str) -> Optional[ESItem]: found = item_search['hits']['total']['value'] if not found: return None - item = item_search['hits']['hits'][0]['_source'] - - item['collection_ids'] = item.get('collection_url') - item['repository_ids'] = item.get('repository_url') - thumbnail_key = get_thumbnail_key(item) - if thumbnail_key: - item['reference_image_md5'] = thumbnail_key - media_key = get_media_key(item) - if media_key: - item['media']['media_key'] = media_key - - if item.get('children'): - children = item.pop('children') - updated_children = [] - for child in children: - thumbnail_key = get_thumbnail_key(child) - if thumbnail_key: - child['thumbnail_key'] = thumbnail_key - media_key = get_media_key(child) - if media_key: - child['media']['media_key'] = media_key - 
updated_children.append(child) - item['children'] = updated_children + item = shim_record(item_search['hits']['hits'][0]['_source']) results = ESItem(found, item, item_search) cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds @@ -180,9 +169,9 @@ def es_mlt(item_id): "query": { "more_like_this": { "fields": [ - "title.keyword", + "title.raw", "collection_data", - "subject.keyword", + "subject.raw", ], "like": [ {"_id": item_id} @@ -255,14 +244,12 @@ def query_encode(query_string: str = None, es_params['query'] = es_filters[0] if facets: - # exceptions = ['collection_url', 'repository_url', 'campus_url'] - exceptions = [] aggs = {} for facet in facets: - if facet in exceptions or facet[-8:] == '.keyword': + if facet in UCLDC_SCHEMA_TERM_FIELDS or facet[-4:] == '.raw': field = facet else: - field = f'{facet}.keyword' + field = f'{facet}.raw' aggs[facet] = { "terms": { @@ -297,6 +284,8 @@ def query_encode(query_string: str = None, es_params.update({'size': rows}) if start: es_params.update({'from': start}) + + es_params.update({'track_total_hits': True}) return es_params diff --git a/exhibits/utils.py b/exhibits/utils.py index 73ef9cb8..93e8dd48 100644 --- a/exhibits/utils.py +++ b/exhibits/utils.py @@ -11,6 +11,29 @@ from functools import wraps from django.views.decorators.cache import cache_page +# index schema fields that are of type=keyword so we can get +# facets directly without needing an _ss or .raw suffix +UCLDC_SCHEMA_TERM_FIELDS = [ + 'calisphere-id', + 'id', + 'campus_name', + 'campus_data', + 'campus_url', + 'campus_id', + 'collection_name', + 'collection_data', + 'collection_url', + 'collection_id', + 'sort_collection_data', + 'repository_name', + 'repository_data', + 'repository_url', + 'repository_id', + 'rights_uri', + 'url_item', + 'fetcher_type', + 'mapper_type' +] # create a hash for a cache key def kwargs_md5(**kwargs):
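
Taken together, these three patches pin the elasticsearch client, wrap es_search() and es_get() in a cache-aside lookup keyed on an MD5 of the request body, and bound transient failures with retrying's 3-second budget (stop_max_delay is in milliseconds). Below is a minimal standalone sketch of that combined pattern, not code from this repository: hash_kwargs, cached_search, and the placeholder result are illustrative stand-ins for exhibits.utils.kwargs_md5, the shared elastic_client.search() call, and settings.DJANGO_CACHE_TIMEOUT, and the sketch assumes a configured Django cache backend.

import hashlib
import json

from django.core.cache import cache
from retrying import retry


def hash_kwargs(**kwargs) -> str:
    # Illustrative stand-in for exhibits.utils.kwargs_md5: identical keyword
    # arguments hash to the same hex digest, so they share one cache key.
    return hashlib.md5(
        json.dumps(kwargs, sort_keys=True).encode("utf-8")).hexdigest()


@retry(stop_max_delay=3000)  # retry transient failures for at most ~3 seconds
def cached_search(body: dict, timeout_seconds: int = 300):
    cache_key = f"es_search_{hash_kwargs(**body)}"
    cached = cache.get(cache_key)
    if cached is not None:
        return cached  # cache hit: skip the backend call entirely

    # Placeholder for elastic_client.search(...); any exception raised here
    # is retried by the decorator until the 3-second budget is exhausted.
    results = {"hits": {"total": {"value": 0}, "hits": []}}

    cache.set(cache_key, results, timeout_seconds)  # Django cache TTL, seconds
    return results

Because the @retry decorator wraps the whole function, a cache hit returns immediately and never consumes the retry budget; only requests that actually reach the backend and fail are retried.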