From 14e2780c6c234e44b8ee1c4c49e6aa6d378ec40a Mon Sep 17 00:00:00 2001
From: MaertHaekkinen
Date: Wed, 23 Oct 2024 17:57:02 +0300
Subject: [PATCH] Add command to fix corrupted MUIS imports
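
The image files of some earlier MUIS imports are missing or corrupted
on disk. Add a management command that re-downloads the image file of
each given photo from the MUIS OAI-PMH service, and tighten the
importer itself: ElementTree lookups are guarded against missing nodes,
records without a supported image format are skipped with a log line,
and non-200 download responses are no longer written to disk.

Usage sketch (assuming the standard manage.py entry point; the ids are
placeholder Ajapaik Photo ids):

    python manage.py fix_missing_images_for_muis_imports 1234 5678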
---
 .../fix_missing_images_for_muis_imports.py    |  80 ++++++++++++
 .../commands/import_photos_from_muis.py       |  74 +++++-----
 ajapaik/ajapaik/muis_utils.py                 | 122 +++++++--------
 ajapaik/ajapaik/urls_opendata.py              |   8 +-
 4 files changed, 189 insertions(+), 95 deletions(-)
 create mode 100644 ajapaik/ajapaik/management/commands/fix_missing_images_for_muis_imports.py

diff --git a/ajapaik/ajapaik/management/commands/fix_missing_images_for_muis_imports.py b/ajapaik/ajapaik/management/commands/fix_missing_images_for_muis_imports.py
new file mode 100644
index 000000000..251316d09
--- /dev/null
+++ b/ajapaik/ajapaik/management/commands/fix_missing_images_for_muis_imports.py
@@ -0,0 +1,80 @@
+import logging
+import traceback
+import urllib.request
+import xml.etree.ElementTree as ET
+
+import requests
+from django.core.management.base import BaseCommand
+
+from ajapaik.ajapaik.models import Photo, ApplicationException
+
+
+class Command(BaseCommand):
+    help = 'Re-download image files for MUIS imports whose local copies are missing or corrupted'
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            'photo_ids', nargs='+', type=int,
+            help='Ids of the photos whose images should be re-imported from MUIS'
+        )
+
+    def handle(self, *args, **options):
+        logger = logging.getLogger(__name__)
+
+        photo_ids = options['photo_ids']
+        if not photo_ids:
+            logger.info("Please add photo ids that you want to update")
+            return
+
+        photos = Photo.objects.filter(id__in=photo_ids)
+        muis_url = 'https://www.muis.ee/OAIService/OAIService'
+
+        for photo in photos:
+            list_identifiers_url = f'{muis_url}?verb=GetRecord&identifier={photo.external_id}&metadataPrefix=lido'
+            url_response = urllib.request.urlopen(list_identifiers_url)
+
+            ns = {'d': 'http://www.openarchives.org/OAI/2.0/', 'lido': 'http://www.lido-schema.org'}
+            record = 'd:metadata/lido:lidoWrap/lido:lido/'
+            resource_wrap = f'{record}lido:administrativeMetadata/lido:resourceWrap/'
+
+            read_url = url_response.read()
+            parser = ET.XMLParser(encoding="utf-8")
+            tree = ET.fromstring(read_url, parser=parser)
+            rec = tree.find('d:GetRecord/d:record', ns)
+
+            try:
+                rp_lr = 'resourceRepresentation/lido:linkResource'
+                link_resource_record = rec.find(f'{resource_wrap}lido:resourceSet/lido:{rp_lr}', ns)
+
+                # ElementTree elements are falsy when they have no children, so compare against None
+                if link_resource_record is None:
+                    logger.info(f"Skipping {photo.external_id}, as there is no image resource")
+                    continue
+
+                image_url = link_resource_record.text
+                image_extension = link_resource_record.attrib.get('{' + ns['lido'] + '}formatResource')
+
+                if image_extension is None:
+                    logger.info(f"Skipping {photo.external_id}, as there is no image extension specified")
+                    continue
+
+                image_extension = image_extension.lower()
+
+                if not image_url or image_extension not in ['gif', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'webp']:
+                    logger.info(f"Skipping {photo.external_id}, as the image format is not supported")
+                    continue
+
+                response = requests.get(image_url)
+                if response.status_code != 200:
+                    logger.info(f"Skipping {photo.external_id}, as we did not get a valid response when downloading")
+                    continue
+
+                # Overwrite the broken file that the Photo record already references
+                with open(photo.image.path, 'wb') as handler:
+                    handler.write(response.content)
+
+                photo.set_calculated_fields()
+            except Exception as e:
+                logger.exception(e)
+                exception = ApplicationException(exception=traceback.format_exc(), photo=photo)
+                exception.save()
diff --git a/ajapaik/ajapaik/management/commands/import_photos_from_muis.py b/ajapaik/ajapaik/management/commands/import_photos_from_muis.py
index b2d03f279..6ef8de6e4 100644
--- a/ajapaik/ajapaik/management/commands/import_photos_from_muis.py
+++ b/ajapaik/ajapaik/management/commands/import_photos_from_muis.py
@@ -49,8 +49,9 @@ def handle(self, *args, **options):
         for s in sets:
             if s.find('d:setSpec', ns).text == museum_name:
                 source_description = s.find('d:setName', ns).text
-                source = Source(name=museum_name, description=source_description)
-                source.save()
+                Source.objects.create(name=museum_name, description=source_description)
+                break
+        source = Source.objects.filter(name=museum_name).first()
 
         dates = []
 
@@ -95,22 +96,21 @@ def handle(self, *args, **options):
             try:
                 locations = []
                 person_album_ids = []
-                external_id = rec.find(f'{header}d:identifier', ns).text \
-                    if rec.find(f'{header}d:identifier', ns) is not None \
-                    else None
+                identifier_record = rec.find(f'{header}d:identifier', ns)
+                external_id = identifier_record.text if identifier_record is not None else None
 
                 existing_photo = Photo.objects.filter(external_id=external_id).first()
-                if existing_photo is not None:
+
+                if existing_photo:
                     continue
 
                 rp_lr = 'resourceRepresentation/lido:linkResource'
-                image_url = rec.find(f'{resource_wrap}lido:resourceSet/lido:{rp_lr}', ns).text \
-                    if rec.find(f'{resource_wrap}lido:resourceSet/lido:{rp_lr}', ns) is not None \
-                    else None
+                link_resource_record = rec.find(f'{resource_wrap}lido:resourceSet/lido:{rp_lr}', ns)
+                image_url = link_resource_record.text if link_resource_record is not None else None
 
-                image_extension = (rec.find(f'{resource_wrap}lido:resourceSet/lido:{rp_lr}',
-                                            ns).attrib['{' + ns['lido'] + '}formatResource']).lower() \
-                    if rec.find(f'{resource_wrap}lido:resourceSet/lido:{rp_lr}', ns) is not None \
-                    else None
+                if link_resource_record is not None:
+                    image_extension = (link_resource_record.attrib['{' + ns['lido'] + '}formatResource']).lower()
+                else:
+                    image_extension = None
                 source_url_find = rec.find(f'{record_wrap}lido:recordInfoSet/lido:recordInfoLink', ns)
                 source_url = source_url_find.text \
@@ -118,16 +118,23 @@ def handle(self, *args, **options):
                     else None
 
                 identifier_find = rec.find(f'{repository_wrap}lido:repositorySet/lido:workID', ns)
                 identifier = identifier_find.text if identifier_find is not None else None
 
                 if identifier and import_blacklist_service.is_blacklisted(identifier):
                     logger.info(f'Skipping {identifier} as it is blacklisted.')
                     continue
 
-                if image_url is None or image_extension not in ['gif', 'jpg', 'jpeg', 'png', 'tif', 'webp']:
+                if not image_url or image_extension not in ['gif', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'webp']:
+                    logger.info(f"Skipping {identifier}, as the image format is not supported")
                     continue
 
-                img_data = requests.get(image_url).content
+                response = requests.get(image_url)
+
+                if response.status_code != 200:
+                    logger.info(f"Skipping {identifier}, as we did not get a valid response when downloading")
+                    continue
+
+                img_data = response.content
                 image_id = external_id.split(':')[-1]
                 file_name = f'{set_name}_{image_id}.{image_extension}'
                 file_name = file_name.replace(':', '_')
@@ -166,15 +173,14 @@ def handle(self, *args, **options):
             date_earliest_has_suffix = False
             date_latest_has_suffix = False
             events = rec.findall(f'{event_wrap}lido:eventSet/lido:event', ns)
-            if events is not None and len(events) > 0:
-                locations, \
-                    creation_date_earliest, \
-                    creation_date_latest, \
-                    date_prefix_earliest, \
-                    date_prefix_latest, \
-                    date_earliest_has_suffix, \
-                    date_latest_has_suffix, \
-                    = extract_dating_from_event(
+            if events:
+                (locations,
+                 creation_date_earliest,
+                 creation_date_latest,
+                 date_prefix_earliest,
+                 date_prefix_latest,
+                 date_earliest_has_suffix,
+                 date_latest_has_suffix) = extract_dating_from_event(
                     events,
                     locations,
                     creation_date_earliest,
@@ -182,7 +188,7 @@ def handle(self, *args, **options):
                     photo.latest_dating is not None or dating is not None,
                     ns
                 )
-            if dating is not None:
+            if dating:
                 creation_date_earliest, date_prefix_earliest, date_earliest_has_suffix = \
                     get_muis_date_and_prefix(dating, False)
                 creation_date_latest, date_prefix_latest, date_latest_has_suffix = \
@@ -190,10 +196,11 @@ def handle(self, *args, **options):
 
             actors = rec.findall(f'{actor_wrap}lido:actorInRole', ns)
             person_album_ids, author = add_person_albums(actors, person_album_ids, ns)
-            if author is not None:
+            if author:
                 photo.author = author
+
             photo.add_to_source_album()
-            if locations != []:
+            if locations:
                 photo = add_geotag_from_address_to_photo(photo, locations)
 
             photo = add_dating_to_photo(
@@ -207,6 +214,7 @@ def handle(self, *args, **options):
                 date_latest_has_suffix,
             )
             photo.light_save()
+
             for album in albums:
                 if not album.cover_photo:
                     album.cover_photo = photo
@@ -215,13 +223,13 @@ def handle(self, *args, **options):
 
             person_albums = Album.objects.filter(id__in=person_album_ids)
             if person_albums is not None:
-                for album in person_albums:
-                    if not album.cover_photo:
-                        album.cover_photo = photo
-                    ap = AlbumPhoto(photo=photo, album=album, type=AlbumPhoto.FACE_TAGGED)
+                for person_album in person_albums:
+                    if not person_album.cover_photo:
+                        person_album.cover_photo = photo
+                    ap = AlbumPhoto(photo=photo, album=person_album, type=AlbumPhoto.FACE_TAGGED)
                     ap.save()
+                    all_person_album_ids_set.add(person_album.id)
 
-                    all_person_album_ids_set.add(album.id)
                 photo.set_calculated_fields()
             except Exception as e:
                 logger.exception(e)
@@ -237,6 +245,6 @@ def handle(self, *args, **options):
 
         all_person_album_ids = list(all_person_album_ids_set)
         all_person_albums = Album.objects.filter(id__in=all_person_album_ids)
-        if all_person_albums.exists():
+        if all_person_albums:
             for person_album in all_person_albums:
                 person_album.light_save()
diff --git a/ajapaik/ajapaik/muis_utils.py b/ajapaik/ajapaik/muis_utils.py
index 34db1c0bc..229b26f02 100644
--- a/ajapaik/ajapaik/muis_utils.py
+++ b/ajapaik/ajapaik/muis_utils.py
@@ -1,36 +1,36 @@
+import datetime
 import itertools
-import requests
 import json
-import roman
 
+import requests
+import roman
 from django.conf import settings
-import datetime
 
 from ajapaik.ajapaik.models import Album, GeoTag, GoogleMapsReverseGeocode, Location, Photo, LocationPhoto
 
 century_suffixes = [
-        'saj x',
-        ' saj x',
-        '.saj x',
-        ' .saj x',
-        '.saj. x',
-        '. saj.x',
-        '. saj. x',
-        ' .saj. x',
-        'saj.x',
-        'saj. x',
-        ' saj.x',
-        ' saj. x',
-        'sajandi x',
-        ' sajandi x',
-        '.sajandi x',
-        ' .sajandi x'
-    ]
+    'saj x',
+    ' saj x',
+    '.saj x',
+    ' .saj x',
+    '.saj. x',
+    '. saj.x',
+    '. saj. x',
+    ' .saj. x',
+    'saj.x',
+    'saj. x',
+    ' saj.x',
+    ' saj. x',
+    'sajandi x',
+    ' sajandi x',
+    '.sajandi x',
+    ' .sajandi x'
+]
 
 start_of_century_suffixes = [x.replace('x', 'algus') for x in century_suffixes]
 end_of_century_suffixes = [x.replace('x', 'lõpp') for x in century_suffixes]
 
-starts_of_century = [b+a for a in start_of_century_suffixes for b in map(str, list(range(1, 21)))]
-ends_of_century = [b+a for a in end_of_century_suffixes for b in map(str, list(range(1, 21)))]
+starts_of_century = [b + a for a in start_of_century_suffixes for b in map(str, list(range(1, 21)))]
+ends_of_century = [b + a for a in end_of_century_suffixes for b in map(str, list(range(1, 21)))]
 
 
@@ -67,16 +67,16 @@ def unstructured_date_to_structured_date(date, all_date_prefixes, is_later_date):
         date = f'umbes.{date}'
     date = date.strip().strip('.').strip().strip('.')
     irregular_decade_suffixes = [
-        '-ndad aastad', "'ndad", '-ndad a', '.dad', '. aastad', ' aastad', 'ndad', '-dad', 'a-d'
-        ]
+        '-ndad aastad', "'ndad", '-ndad a', '.dad', '. aastad', ' aastad', 'ndad', '-dad', 'a-d'
+    ]
     for suffix in irregular_decade_suffixes:
         if date.endswith(suffix):
             date = date.replace(suffix, '.aastad')
             break
 
     irregular_approximate_date_prefixes = [
-        'u. ', 'u.', 'u ', 'ca. ', 'ca.', 'ca ', 'ca', 'arvatavasti. ', 'arvatavasti.', 'arvatavasti'
-        ]
+        'u. ', 'u.', 'u ', 'ca. ', 'ca.', 'ca ', 'ca', 'arvatavasti. ', 'arvatavasti.', 'arvatavasti'
+    ]
     for prefix in irregular_approximate_date_prefixes:
         if date.startswith(prefix):
             date = date.replace(prefix, 'umbes.', 1)
@@ -238,10 +238,10 @@ def set_text_fields_from_muis(photo, dating, rec, object_description_wraps, ns):
         if description_type in muis_description_field_pairs:
             if description_type == 'sisu kirjeldus':
                 photo = reset_modeltranslated_field(
-                        photo,
-                        muis_description_field_pairs[description_type],
-                        description_text
-                    )
+                    photo,
+                    muis_description_field_pairs[description_type],
+                    description_text
+                )
                 photo.description_original_language = None
             elif description_type == 'dateering':
                 dating = description_text
@@ -265,13 +265,13 @@ def reset_modeltranslated_field(photo, attribute_name, attribute_value):
 
 
 def extract_dating_from_event(
-        events,
-        location,
-        creation_date_earliest,
-        creation_date_latest,
-        skip_dating,
-        ns
-    ):
+    events,
+    location,
+    creation_date_earliest,
+    creation_date_latest,
+    skip_dating,
+    ns
+):
     duplicate_event_type = 'kopeerimine (valmistamine)'
     creation_event_types = ['valmistamine', '', 'pildistamine', 'sõjandus ja kaitse', 'sõjad']
     date_prefix_earliest = None
@@ -295,14 +295,14 @@ def extract_dating_from_event(
             if earliest_date is not None and earliest_date.text is not None:
                 creation_date_earliest, date_prefix_earliest, earliest_had_decade_suffix, \
                     = get_muis_date_and_prefix(
-                    earliest_date.text, False
-                )
+                        earliest_date.text, False
+                    )
 
             if latest_date is not None and latest_date.text is not None:
                 creation_date_latest, date_prefix_latest, latest_had_decade_suffix, \
                     = get_muis_date_and_prefix(
-                    latest_date.text, True
-                )
+                        latest_date.text, True
+                    )
 
             places = event.findall('lido:eventPlace/lido:place', ns)
             if places is not None:
@@ -327,7 +327,7 @@ def extract_dating_from_event(
         if new_location != []:
             location.append(new_location)
     return location, creation_date_earliest, creation_date_latest, date_prefix_earliest, date_prefix_latest, \
-           earliest_had_decade_suffix, latest_had_decade_suffix
+        earliest_had_decade_suffix, latest_had_decade_suffix
 
 
 def add_person_albums(actors, person_album_ids, ns):
@@ -423,6 +423,7 @@ def add_geotag_from_address_to_photo(photo, locations):
     for location in locations:
         search_string = ''
         parent_location_object = None
+
        for sublocation in location:
             location_objects = \
                 Location.objects.filter(name=sublocation[0], location_type=sublocation[1])
@@ -450,26 +451,31 @@ def add_geotag_from_address_to_photo(photo, locations):
                 lon = location_object.google_reverse_geocode.lon
                 address = location_object.google_reverse_geocode.response.get('results')[0].get('formatted_address')
             else:
+                # $$$ at the start and end of the text marks unstructured data (coordinates, instructions about the location, etc.)
+                search_string = search_string.strip("$ ")
                 google_geocode_url = f'https://maps.googleapis.com/maps/api/geocode/json?' \
-                    f'address={search_string}' \
-                    f'&key={settings.UNRESTRICTED_GOOGLE_MAPS_API_KEY}'
+                                     f'address={search_string}' \
+                                     f'&key={settings.UNRESTRICTED_GOOGLE_MAPS_API_KEY}'
                 google_response_json = requests.get(google_geocode_url).text
                 google_response_parsed = json.loads(google_response_json)
                 status = google_response_parsed.get('status', None)
-                lat_lng = None
-                if status == 'OK':
-                    # Google was able to answer some geolocation for this description
-                    address = google_response_parsed.get('results')[0].get('formatted_address')
-                    lat_lng = google_response_parsed.get('results')[0].get('geometry').get('location')
-                if lat_lng is None:
-                    return photo
-
-                lat = lat_lng['lat']
-                lon = lat_lng['lng']
-                google_maps_reverse_geocode = GoogleMapsReverseGeocode(lat=lat, lon=lon, response=google_response_parsed)
-                google_maps_reverse_geocode.save()
-                location_object.google_reverse_geocode = google_maps_reverse_geocode
-                location_object.save()
+
+                if status != 'OK':
+                    return photo
+
+                # Google was able to return a geolocation for this description
+                address = google_response_parsed.get('results')[0].get('formatted_address')
+                lat_lng = google_response_parsed.get('results')[0].get('geometry').get('location')
+
+                if not lat_lng:
+                    return photo
+
+                lat = lat_lng['lat']
+                lon = lat_lng['lng']
+                google_maps_reverse_geocode = GoogleMapsReverseGeocode(lat=lat, lon=lon, response=google_response_parsed)
+                google_maps_reverse_geocode.save()
+                location_object.google_reverse_geocode = google_maps_reverse_geocode
+                location_object.save()
 
         if photo.lat is None:
             photo.lat = lat
diff --git a/ajapaik/ajapaik/urls_opendata.py b/ajapaik/ajapaik/urls_opendata.py
index 41fc84f54..5b4bfc684 100644
--- a/ajapaik/ajapaik/urls_opendata.py
+++ b/ajapaik/ajapaik/urls_opendata.py
@@ -1,4 +1,4 @@
-from django.urls import re_path
+from django.urls import path, re_path
 from django.views.generic import TemplateView
 from rest_framework.routers import DefaultRouter
 from rest_framework.urlpatterns import format_suffix_patterns
@@ -11,8 +11,8 @@
 urlpatterns = router.urls
 
 urlpatterns += format_suffix_patterns([
-    re_path(r'^photos/(?P<pk>[0-9]+)/geotags/$',
-            PhotoGeoTagViewSet.as_view({'get': 'retrieve'}),
-            name='opendata-photo-geotags'),
+    path('photos/<int:pk>/geotags/',
+         PhotoGeoTagViewSet.as_view({'get': 'retrieve'}),
+         name='opendata-photo-geotags'),
     re_path(r'^robots\.txt$', TemplateView.as_view(template_name='robots.txt', content_type='text/plain')),
 ])