Skip to content

Commit

Permalink
Merge pull request #42 from ucldc/stage
Browse files Browse the repository at this point in the history
  • Loading branch information
amywieliczka authored May 23, 2024
2 parents 1c203db + 718978d commit c49470d
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 61 deletions.
125 changes: 65 additions & 60 deletions exhibits/es_cache_retry.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
""" logic for cache / retry for es (opensearch) and JSON from registry
"""

from django.core.cache import cache
from django.conf import settings
import json
import urllib3

from collections import namedtuple
from urllib.parse import urlparse
import urllib3
import json
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse

from aws_xray_sdk.core import patch
from django.core.cache import cache
from django.conf import settings
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConnectionError as ESConnectionError
from elasticsearch.exceptions import RequestError as ESRequestError
from retrying import retry

from exhibits.utils import kwargs_md5, UCLDC_SCHEMA_TERM_FIELDS

urllib3.disable_warnings()

Expand All @@ -30,7 +34,43 @@
'ESItem', 'found, item, resp')


def shim_record(metadata):
    """Add legacy alias keys and derived storage keys to an index record.

    The record is modified in place and the same dict is returned.

    - ``type_ss``, ``collection_ids`` and ``repository_ids`` are set to the
      values of ``type``, ``collection_url`` and ``repository_url``
      (``None`` when the source key is absent).
    - ``reference_image_md5`` is set when ``get_thumbnail_key`` returns a
      truthy value for the record.
    - ``media['media_key']`` is set when ``get_media_key`` returns a truthy
      value for the record.
    - Each entry of ``children`` (when present and non-empty) gets
      ``thumbnail_key`` and ``media['media_key']`` by the same rules.
    """
    # TODO replace type_ss with type globally
    legacy_aliases = (
        ('type_ss', 'type'),
        ('collection_ids', 'collection_url'),
        ('repository_ids', 'repository_url'),
    )
    for alias, source_field in legacy_aliases:
        metadata[alias] = metadata.get(source_field)

    record_thumbnail = get_thumbnail_key(metadata)
    if record_thumbnail:
        metadata['reference_image_md5'] = record_thumbnail

    record_media = get_media_key(metadata)
    if record_media:
        # NOTE(review): presumably get_media_key only returns a key when
        # metadata['media'] exists -- confirm against that helper.
        metadata['media']['media_key'] = record_media

    # Nothing more to do for records without (non-empty) children.
    if not metadata.get('children'):
        return metadata

    # Children use 'thumbnail_key' where the parent record uses
    # 'reference_image_md5'.
    shimmed_children = []
    for child in metadata.pop('children'):
        child_thumbnail = get_thumbnail_key(child)
        if child_thumbnail:
            child['thumbnail_key'] = child_thumbnail
        child_media = get_media_key(child)
        if child_media:
            child['media']['media_key'] = child_media
        shimmed_children.append(child)
    metadata['children'] = shimmed_children

    return metadata


@retry(stop_max_delay=3000) # milliseconds
def es_search(body) -> ESResults:
cache_key = f"es_search_{kwargs_md5(**body)}"
cached_results = cache.get(cache_key)
if cached_results:
return cached_results

try:
results = elastic_client.search(
index=settings.ES_ALIAS, body=json.dumps(body))
Expand All @@ -52,37 +92,16 @@ def es_search(body) -> ESResults:
facet_counts = {}

for result in results['hits']['hits']:
metadata = result.pop('_source')
# TODO replace type_ss with type globally
metadata['type_ss'] = metadata.get('type')
thumbnail_key = get_thumbnail_key(metadata)
if thumbnail_key:
metadata['reference_image_md5'] = thumbnail_key

media_key = get_media_key(metadata)
if media_key:
metadata['media']['media_key'] = media_key

if metadata.get('children'):
children = metadata.pop('children')
updated_children = []
for child in children:
thumbnail_key = get_thumbnail_key(child)
if thumbnail_key:
child['thumbnail_key'] = thumbnail_key
media_key = get_media_key(child)
if media_key:
child['media']['media_key'] = media_key
updated_children.append(child)
metadata['children'] = updated_children

metadata = shim_record(result.pop('_source'))
result.update(metadata)

results = ESResults(
results['hits']['hits'],
results['hits']['total']['value'],
facet_counts)

cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds

return results

def get_thumbnail_key(metadata):
Expand All @@ -101,15 +120,22 @@ def get_media_key(metadata):
key_parts = uri_path.split('/')[2:]
return '/'.join(key_parts)

def es_search_nocache(**kwargs):
return es_search(kwargs)

# def es_search_nocache(**kwargs):
# return es_search(kwargs)


@retry(stop_max_delay=3000) # milliseconds
def es_get(item_id: str) -> Optional[ESItem]:
# cannot search Elasticsearch with empty string
if not item_id:
return None

cache_key = f"es_get_{item_id}"
cached_results = cache.get(cache_key)
if cached_results:
return cached_results

# cannot use Elasticsearch.get() for multi-index alias
body = {'query': {'match': {'_id': item_id}}}
try:
Expand All @@ -125,31 +151,10 @@ def es_get(item_id: str) -> Optional[ESItem]:
found = item_search['hits']['total']['value']
if not found:
return None
item = item_search['hits']['hits'][0]['_source']

item['collection_ids'] = item.get('collection_url')
item['repository_ids'] = item.get('repository_url')
thumbnail_key = get_thumbnail_key(item)
if thumbnail_key:
item['reference_image_md5'] = thumbnail_key
media_key = get_media_key(item)
if media_key:
item['media']['media_key'] = media_key

if item.get('children'):
children = item.pop('children')
updated_children = []
for child in children:
thumbnail_key = get_thumbnail_key(child)
if thumbnail_key:
child['thumbnail_key'] = thumbnail_key
media_key = get_media_key(child)
if media_key:
child['media']['media_key'] = media_key
updated_children.append(child)
item['children'] = updated_children
item = shim_record(item_search['hits']['hits'][0]['_source'])

results = ESItem(found, item, item_search)
cache.set(cache_key, results, settings.DJANGO_CACHE_TIMEOUT) # seconds
return results


Expand All @@ -164,9 +169,9 @@ def es_mlt(item_id):
"query": {
"more_like_this": {
"fields": [
"title.keyword",
"title.raw",
"collection_data",
"subject.keyword",
"subject.raw",
],
"like": [
{"_id": item_id}
Expand Down Expand Up @@ -239,14 +244,12 @@ def query_encode(query_string: str = None,
es_params['query'] = es_filters[0]

if facets:
# exceptions = ['collection_url', 'repository_url', 'campus_url']
exceptions = []
aggs = {}
for facet in facets:
if facet in exceptions or facet[-8:] == '.keyword':
if facet in UCLDC_SCHEMA_TERM_FIELDS or facet[-4:] == '.raw':
field = facet
else:
field = f'{facet}.keyword'
field = f'{facet}.raw'

aggs[facet] = {
"terms": {
Expand Down Expand Up @@ -281,6 +284,8 @@ def query_encode(query_string: str = None,
es_params.update({'size': rows})
if start:
es_params.update({'from': start})

es_params.update({'track_total_hits': True})
return es_params


Expand Down
23 changes: 23 additions & 0 deletions exhibits/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,29 @@
from functools import wraps
from django.views.decorators.cache import cache_page

# Index schema fields that are of type=keyword. Facets on these fields can be
# requested by their bare name, without needing an _ss or .raw suffix.
UCLDC_SCHEMA_TERM_FIELDS = [
    # record identifiers
    'calisphere-id',
    'id',
    # campus
    'campus_name',
    'campus_data',
    'campus_url',
    'campus_id',
    # collection
    'collection_name',
    'collection_data',
    'collection_url',
    'collection_id',
    'sort_collection_data',
    # repository
    'repository_name',
    'repository_data',
    'repository_url',
    'repository_id',
    # rights, links and harvest provenance
    'rights_uri',
    'url_item',
    'fetcher_type',
    'mapper_type',
]

# create a hash for a cache key
def kwargs_md5(**kwargs):
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ markdown==3.6
boto3==1.34.84
Pillow==10.3.0
python-magic==0.4.27
retrying==1.3.4
retrying==1.3.4
elasticsearch==7.13.4 # 8.13.0 is current

0 comments on commit c49470d

Please sign in to comment.