From 25809496097a684774f3dbc3dab90f2c09ae47cd Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 14 May 2024 16:56:28 -0700 Subject: [PATCH 1/3] required for registry's installation --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0a118d05..f204e9cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ markdown==3.6 boto3==1.34.84 Pillow==10.3.0 python-magic==0.4.27 -retrying==1.3.4 \ No newline at end of file +retrying==1.3.4 +elasticsearch==7.13.4 # 8.13.0 is current From 9d5e60326acda6622218392f46c6d7f055ddb5b7 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 14 May 2024 17:02:07 -0700 Subject: [PATCH 2/3] Add caching to the es_cache_retry module --- exhibits/es_cache_retry.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/exhibits/es_cache_retry.py b/exhibits/es_cache_retry.py index 8302bfc1..6855f5fa 100644 --- a/exhibits/es_cache_retry.py +++ b/exhibits/es_cache_retry.py @@ -14,6 +14,8 @@ from elasticsearch.exceptions import ConnectionError as ESConnectionError from elasticsearch.exceptions import RequestError as ESRequestError +from exhibits.utils import kwargs_md5 + urllib3.disable_warnings() if hasattr(settings, 'XRAY_RECORDER'): @@ -31,6 +33,11 @@ def es_search(body) -> ESResults: + cache_key = f"es_search_{kwargs_md5(**body)}" + cached_results = cache.get(cache_key) + if cached_results: + return cached_results + try: results = elastic_client.search( index=settings.ES_ALIAS, body=json.dumps(body)) @@ -83,6 +90,8 @@ def es_search(body) -> ESResults: results['hits']['total']['value'], facet_counts) + cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds + return results def get_thumbnail_key(metadata): @@ -101,8 +110,9 @@ def get_media_key(metadata): key_parts = uri_path.split('/')[2:] return '/'.join(key_parts) -def es_search_nocache(**kwargs): - return es_search(kwargs) + +# def es_search_nocache(**kwargs): +# return es_search(kwargs) def es_get(item_id: str) -> Optional[ESItem]: @@ -110,6 +120,11 @@ def es_get(item_id: str) -> Optional[ESItem]: if not item_id: return None + cache_key = f"es_get_{item_id}" + cached_results = cache.get(cache_key) + if cached_results: + return cached_results + # cannot use Elasticsearch.get() for multi-index alias body = {'query': {'match': {'_id': item_id}}} try: @@ -150,6 +165,7 @@ def es_get(item_id: str) -> Optional[ESItem]: item['children'] = updated_children results = ESItem(found, item, item_search) + cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds return results From 9d57b9b30e999c602eb4bf73678bcca5c42c6f2f Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 15 May 2024 12:25:35 -0700 Subject: [PATCH 3/3] Add retries w/ 3sec max; update es_cache_retry to match public_interface --- exhibits/es_cache_retry.py | 107 +++++++++++++++++-------------------- exhibits/utils.py | 23 ++++++++ 2 files changed, 71 insertions(+), 59 deletions(-) diff --git a/exhibits/es_cache_retry.py b/exhibits/es_cache_retry.py index 6855f5fa..e8915910 100644 --- a/exhibits/es_cache_retry.py +++ b/exhibits/es_cache_retry.py @@ -1,20 +1,22 @@ """ logic for cache / retry for es (opensearch) and JSON from registry """ -from django.core.cache import cache -from django.conf import settings +import json +import urllib3 from collections import namedtuple -from urllib.parse import urlparse -import urllib3 -import json from typing import Dict, List, Tuple, Optional +from urllib.parse 
import urlparse + from aws_xray_sdk.core import patch +from django.core.cache import cache +from django.conf import settings from elasticsearch import Elasticsearch from elasticsearch.exceptions import ConnectionError as ESConnectionError from elasticsearch.exceptions import RequestError as ESRequestError +from retrying import retry -from exhibits.utils import kwargs_md5 +from exhibits.utils import kwargs_md5, UCLDC_SCHEMA_TERM_FIELDS urllib3.disable_warnings() @@ -32,6 +34,37 @@ 'ESItem', 'found, item, resp') +def shim_record(metadata): + # TODO replace type_ss with type globally + metadata['type_ss'] = metadata.get('type') + metadata['collection_ids'] = metadata.get('collection_url') + metadata['repository_ids'] = metadata.get('repository_url') + + thumbnail_key = get_thumbnail_key(metadata) + if thumbnail_key: + metadata['reference_image_md5'] = thumbnail_key + + media_key = get_media_key(metadata) + if media_key: + metadata['media']['media_key'] = media_key + + if metadata.get('children'): + children = metadata.pop('children') + updated_children = [] + for child in children: + thumbnail_key = get_thumbnail_key(child) + if thumbnail_key: + child['thumbnail_key'] = thumbnail_key + media_key = get_media_key(child) + if media_key: + child['media']['media_key'] = media_key + updated_children.append(child) + metadata['children'] = updated_children + + return metadata + + +@retry(stop_max_delay=3000) # milliseconds def es_search(body) -> ESResults: cache_key = f"es_search_{kwargs_md5(**body)}" cached_results = cache.get(cache_key) @@ -59,30 +92,7 @@ def es_search(body) -> ESResults: facet_counts = {} for result in results['hits']['hits']: - metadata = result.pop('_source') - # TODO replace type_ss with type globally - metadata['type_ss'] = metadata.get('type') - thumbnail_key = get_thumbnail_key(metadata) - if thumbnail_key: - metadata['reference_image_md5'] = thumbnail_key - - media_key = get_media_key(metadata) - if media_key: - metadata['media']['media_key'] = media_key - - if metadata.get('children'): - children = metadata.pop('children') - updated_children = [] - for child in children: - thumbnail_key = get_thumbnail_key(child) - if thumbnail_key: - child['thumbnail_key'] = thumbnail_key - media_key = get_media_key(child) - if media_key: - child['media']['media_key'] = media_key - updated_children.append(child) - metadata['children'] = updated_children - + metadata = shim_record(result.pop('_source')) result.update(metadata) results = ESResults( @@ -115,6 +125,7 @@ def get_media_key(metadata): # return es_search(kwargs) +@retry(stop_max_delay=3000) # milliseconds def es_get(item_id: str) -> Optional[ESItem]: # cannot search Elasticsearch with empty string if not item_id: @@ -140,29 +151,7 @@ def es_get(item_id: str) -> Optional[ESItem]: found = item_search['hits']['total']['value'] if not found: return None - item = item_search['hits']['hits'][0]['_source'] - - item['collection_ids'] = item.get('collection_url') - item['repository_ids'] = item.get('repository_url') - thumbnail_key = get_thumbnail_key(item) - if thumbnail_key: - item['reference_image_md5'] = thumbnail_key - media_key = get_media_key(item) - if media_key: - item['media']['media_key'] = media_key - - if item.get('children'): - children = item.pop('children') - updated_children = [] - for child in children: - thumbnail_key = get_thumbnail_key(child) - if thumbnail_key: - child['thumbnail_key'] = thumbnail_key - media_key = get_media_key(child) - if media_key: - child['media']['media_key'] = media_key - 
updated_children.append(child) - item['children'] = updated_children + item = shim_record(item_search['hits']['hits'][0]['_source']) results = ESItem(found, item, item_search) cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds @@ -180,9 +169,9 @@ def es_mlt(item_id): "query": { "more_like_this": { "fields": [ - "title.keyword", + "title.raw", "collection_data", - "subject.keyword", + "subject.raw", ], "like": [ {"_id": item_id} @@ -255,14 +244,12 @@ def query_encode(query_string: str = None, es_params['query'] = es_filters[0] if facets: - # exceptions = ['collection_url', 'repository_url', 'campus_url'] - exceptions = [] aggs = {} for facet in facets: - if facet in exceptions or facet[-8:] == '.keyword': + if facet in UCLDC_SCHEMA_TERM_FIELDS or facet[-4:] == '.raw': field = facet else: - field = f'{facet}.keyword' + field = f'{facet}.raw' aggs[facet] = { "terms": { @@ -297,6 +284,8 @@ def query_encode(query_string: str = None, es_params.update({'size': rows}) if start: es_params.update({'from': start}) + + es_params.update({'track_total_hits': True}) return es_params diff --git a/exhibits/utils.py b/exhibits/utils.py index 73ef9cb8..93e8dd48 100644 --- a/exhibits/utils.py +++ b/exhibits/utils.py @@ -11,6 +11,29 @@ from functools import wraps from django.views.decorators.cache import cache_page +# index schema fields that are of type=keyword so we can get +# facets directly without needing an _ss or .raw suffix +UCLDC_SCHEMA_TERM_FIELDS = [ + 'calisphere-id', + 'id', + 'campus_name', + 'campus_data', + 'campus_url', + 'campus_id', + 'collection_name', + 'collection_data', + 'collection_url', + 'collection_id', + 'sort_collection_data', + 'repository_name', + 'repository_data', + 'repository_url', + 'repository_id', + 'rights_uri', + 'url_item', + 'fetcher_type', + 'mapper_type' +] # create a hash for a cache key def kwargs_md5(**kwargs):
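
Taken together, these three patches pin the elasticsearch client, wrap es_search() and es_get() in a cache-aside lookup keyed on an MD5 of the request body, and bound transient failures with retrying's 3-second budget (stop_max_delay is in milliseconds). Below is a minimal standalone sketch of that combined pattern, not code from this repository: hash_kwargs, cached_search, and the placeholder result are illustrative stand-ins for exhibits.utils.kwargs_md5, the shared elastic_client.search() call, and settings.DJANGO_CACHE_TIMEOUT, and the sketch assumes a configured Django cache backend.

import hashlib
import json

from django.core.cache import cache
from retrying import retry


def hash_kwargs(**kwargs) -> str:
    # Illustrative stand-in for exhibits.utils.kwargs_md5: identical keyword
    # arguments hash to the same hex digest, so they share one cache key.
    return hashlib.md5(
        json.dumps(kwargs, sort_keys=True).encode("utf-8")).hexdigest()


@retry(stop_max_delay=3000)  # retry transient failures for at most ~3 seconds
def cached_search(body: dict, timeout_seconds: int = 300):
    cache_key = f"es_search_{hash_kwargs(**body)}"
    cached = cache.get(cache_key)
    if cached is not None:
        return cached  # cache hit: skip the backend call entirely

    # Placeholder for elastic_client.search(...); any exception raised here
    # is retried by the decorator until the 3-second budget is exhausted.
    results = {"hits": {"total": {"value": 0}, "hits": []}}

    cache.set(cache_key, results, timeout_seconds)  # Django cache TTL, seconds
    return results

Because the @retry decorator wraps the whole function, a cache hit returns immediately and never consumes the retry budget; only requests that actually reach the backend and fail are retried.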