-
Notifications
You must be signed in to change notification settings - Fork 213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Retrieve Auckland Museum Image Data #3258
Changes from 10 commits
d253f47
7f46257
24a9d31
7fd1303
b6b5447
4566bda
3682cbe
072978d
65c6913
43fa686
cceab62
1fe2ce0
dacc97e
3278cbc
9ed6045
e0994ed
7719832
cda5b2d
3ecf716
3e23a5c
ce12051
8282360
fdce81b
bf5e2bb
1549601
41edd9e
f5a6bd8
0425fcb
692c89d
082dafa
44daa71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,143 @@ | ||||||
""" | ||||||
Content Provider: AucklandMuseum | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
ETL Process: Use the API to identify all CC licensed media. | ||||||
|
||||||
Output: TSV file containing the media and the | ||||||
respective meta-data. | ||||||
|
||||||
Notes: https://api.aucklandmuseum.com/ | ||||||
|
||||||
Resource: https://api.aucklandmuseum.com/ | ||||||
https://github.com/AucklandMuseum/API/wiki/Tutorial | ||||||
stacimc marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
""" | ||||||
import logging | ||||||
|
||||||
from common.constants import IMAGE | ||||||
from common.licenses import get_license_info | ||||||
from common.loader import provider_details as prov | ||||||
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester | ||||||
|
||||||
|
||||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prefix of a record's public landing page on the museum website; a
# record-specific suffix is appended to it to form `foreign_landing_url`.
LANDING_URL = (
    "https://www.aucklandmuseum.com"
    "/collections-research/collections/record/am_"
)
|
||||||
|
||||||
class AucklandMuseumDataIngester(ProviderDataIngester):
    """Ingest image records from the Auckland Museum collections search API.

    Records are fetched page by page from ``endpoint`` using ``from``/``size``
    paging. Every record is given the license described by
    ``DEFAULT_LICENSE_INFO`` (built from ``license_url``).
    """

    providers = {
        "image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER,
    }
    endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search"
    license_url = "https://creativecommons.org/licenses/by/4.0/"
    delay = 4
    # Offset (`from` query parameter) used for the very first request.
    from_start = 0
    # Number of records requested per page (`size` query parameter) and the
    # amount by which the offset advances between pages.
    page_size = 100
    # Ingestion halts once the next page's offset would reach this value.
    total_amount_of_data = 10000
    DEFAULT_LICENSE_INFO = get_license_info(license_url=license_url)
    # Offset of the most recently built request; kept separate from
    # `from_start` so the configured starting offset is never overwritten.
    _current_offset = from_start

    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
        """Build the query parameters for the next page.

        Args:
            prev_query_params: Parameters of the previous request, or a falsy
                value (``None``) when no request has been made yet.

        Returns:
            The default parameters on the first call; afterwards the previous
            parameters with ``from`` advanced by ``page_size``.
        """
        if not prev_query_params:
            # `_exists_:primaryRepresentation` keeps only records that carry an
            # image URL, `copyright:CC` keeps Creative Commons records, and "+"
            # is the query-string syntax used to require the term.
            next_params = {
                "q": "_exists_:primaryRepresentation+copyright:CC",
                "size": str(self.page_size),
                "from": self.from_start,
            }
        else:
            next_params = {
                **prev_query_params,
                "from": prev_query_params["from"] + self.page_size,
            }

        # Remember where this page starts so `get_should_continue` can tell
        # when the cap has been reached.
        self._current_offset = next_params["from"]
        return next_params

    def get_batch_data(self, response_json):
        """Return the list of raw records in an API response.

        Returns ``None`` when the response is empty or has no ``hits`` object,
        instead of raising on the missing key.
        """
        if not response_json:
            return None

        return (response_json.get("hits") or {}).get("hits")

    def get_should_continue(self, response_json):
        """Return ``False`` once the next page would start at or past the cap.

        NOTE(review): assumes `get_next_query_params` has been called for the
        request that produced `response_json` — confirm against the base
        class's ingestion loop.
        """
        next_offset = self._current_offset + self.page_size
        if next_offset >= self.total_amount_of_data:
            logger.info(
                "The final amount of data has been processed. Halting ingestion."
            )
            return False

        return True

    def get_media_type(self, record: dict):
        """Return the media type of a record; this provider only has images."""
        return IMAGE

    def get_record_data(self, data: dict) -> dict | list[dict] | None:
        """Convert one raw search hit into a record dictionary.

        Args:
            data: A single hit, expected to hold ``_id`` and ``_source``.

        Returns:
            The record dictionary, or ``None`` when the hit lacks an ``_id``,
            a ``_source`` or a ``primaryRepresentation`` image URL.
        """
        record_id = data.get("_id")
        information = data.get("_source")
        if not record_id or not information:
            return None

        url = information.get("primaryRepresentation")
        if not url:
            # Without an image URL there is nothing to ingest, and the derived
            # thumbnail URL and the file-size lookup would be meaningless.
            return None

        # e.g. ".../id/naturalsciences/object/691102" becomes
        # "naturalsciences-object-691102" for the landing page and "691102"
        # for the identifier.
        url_parameter = record_id.split("id/")[-1].replace("/", "-")
        foreign_landing_url = f"{LANDING_URL}{url_parameter}"
        foreign_identifier = record_id.split("/")[-1]

        # NOTE(review): confirm the size of this rendering is acceptable for
        # use as a thumbnail.
        thumbnail_url = f"{url}?rendering=thumbnail.jpg"
        license_info = self.DEFAULT_LICENSE_INFO
        filesize = self._get_file_info(url)

        contributors = information.get("dc_contributor")
        creator = contributors[0] if contributors else ""

        # "Primary Title" may be missing or an empty list.
        primary_titles = (information.get("appellation") or {}).get("Primary Title")
        title = primary_titles[0] if primary_titles else None

        meta_data = self._get_meta_data(information)

        return {
            "foreign_landing_url": foreign_landing_url,
            "foreign_identifier": foreign_identifier,
            "url": url,
            "license_info": license_info,
            "thumbnail_url": thumbnail_url,
            "filesize": filesize,
            "creator": creator,
            "title": title,
            "meta_data": meta_data,
        }

    def _get_meta_data(self, object_json: dict) -> dict | None:
        """Collect the optional metadata fields of a record's ``_source``.

        Entries whose value is ``None`` are dropped. ``geopos`` falls back to
        an empty string, so it is always present in the result.
        """
        geopos_values = object_json.get("geopos")
        department_values = object_json.get("department")
        metadata = {
            "type": object_json.get("type"),
            "geopos": geopos_values[0] if geopos_values else "",
            "department": department_values[0] if department_values else None,
        }

        return {k: v for k, v in metadata.items() if v is not None}

    def _get_file_info(self, url) -> int | None:
        """Get the image size in bytes.

        Issues a HEAD request for ``url`` and reads ``Content-Length``.
        Returns ``None`` when there is no response or the header is absent
        or zero.
        """
        resp = self.delayed_requester.head(url)
        if not resp:
            return None

        filesize = int(resp.headers.get("Content-Length", 0))
        return filesize if filesize != 0 else None
|
||||||
|
||||||
def main():
    """Run the ingestion from the command line.

    Allows the ingester to be run for debugging purposes without Airflow
    running.
    """
    AucklandMuseumDataIngester().ingest_records()


if __name__ == "__main__":
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
{ | ||
"_index": "collectionsonline-2022-05-04-1", | ||
"_type": "_doc", | ||
"_id": "http://api.aucklandmuseum.com/id/naturalsciences/object/691102", | ||
"_score": 2.0630994, | ||
"_source": { | ||
"copyright": ["© Auckland Museum CC BY"], | ||
"notes": [], | ||
"references": [ | ||
{ | ||
"person": { | ||
"secondary_maker": [], | ||
"primary_maker": [], | ||
"classified": ["http://api.aucklandmuseum.com/id/person/28441"], | ||
"collected": ["http://api.aucklandmuseum.com/id/person/25299"], | ||
"_all": [ | ||
"http://api.aucklandmuseum.com/id/person/28441", | ||
"http://api.aucklandmuseum.com/id/person/25299" | ||
], | ||
"referred": [] | ||
} | ||
}, | ||
{ "object": { "childOf": [], "_all": [], "referred": [] } } | ||
], | ||
"documentType": [], | ||
"geoSubject": [], | ||
"language": [], | ||
"type": "ecrm:E20_Biological_Object", | ||
"content": [], | ||
"localityDescription": ["[Western Samoa, Savai'i] Hinter [behind] Safai"], | ||
"acquisitionStatement": [], | ||
"recordScore": 40, | ||
"responsibility": [], | ||
"dc_contributor": ["R. O. Gardner"], | ||
"isTaonga": false, | ||
"place": { | ||
"found": { "_all": [] }, | ||
"made": { "_all": [] }, | ||
"associated": { "_all": [] }, | ||
"captured": { "_all": [] }, | ||
"published": { "_all": [] }, | ||
"acquired": { "_all": ["Samoa"] }, | ||
"_all": ["Samoa"] | ||
}, | ||
"appellation": { | ||
"Common Name": [], | ||
"Classification Display Value": ["Cypholophus macrocephalus mollis"], | ||
"Primary Title": [ | ||
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." | ||
], | ||
"Other Title": [], | ||
"_all_suggest": { | ||
"input": [ | ||
"mollis", | ||
"Cypholophus macrocephalus mollis", | ||
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." | ||
], | ||
"contexts": { "type_context": "ecrm:E20_Biological_Object" } | ||
}, | ||
"Maori Name": [], | ||
"Classification Value": ["mollis"], | ||
"_all": [ | ||
"mollis", | ||
"Cypholophus macrocephalus mollis", | ||
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." | ||
] | ||
}, | ||
"keyword": [], | ||
"department": ["botany"], | ||
"dc_identifier": ["AK28252"], | ||
"process": [], | ||
"period": { | ||
"made": [{ "exact": "2010-07-08T00:00:00.000Z" }], | ||
"associated": [], | ||
"published": [], | ||
"accession": [ | ||
{ | ||
"end": "2010-07-08T00:00:00.000Z", | ||
"text": "08 Jul 2010", | ||
"begin": "2010-07-08T00:00:00.000Z" | ||
} | ||
], | ||
"acquired": [ | ||
{ | ||
"end": "1905-06-22T00:00:00.000Z", | ||
"text": "22 Jun 1905", | ||
"begin": "1905-06-22T00:00:00.000Z" | ||
} | ||
], | ||
"time_period": [], | ||
"_all": [ | ||
{ | ||
"end": "2010-07-08T00:00:00.000Z", | ||
"text": "08 Jul 2010", | ||
"begin": "2010-07-08T00:00:00.000Z" | ||
}, | ||
{ | ||
"end": "1905-06-22T00:00:00.000Z", | ||
"text": "22 Jun 1905", | ||
"begin": "1905-06-22T00:00:00.000Z" | ||
}, | ||
{ "exact": "2010-07-08T00:00:00.000Z" } | ||
] | ||
}, | ||
"subjectStatus": [], | ||
"geopos": [], | ||
"primaryRepresentation": "http://api.aucklandmuseum.com/id/media/v/214749", | ||
"dc_date": ["Jul 1989"], | ||
"typeStatus": [], | ||
"collection": [], | ||
"classification": [ | ||
{ | ||
"object": [ | ||
{ | ||
"Kingdom": "Linnaean", | ||
"Genus": "Linnaean", | ||
"Linnaean System": "Linnaean", | ||
"Family": "Linnaean", | ||
"Species": "Linnaean", | ||
"_all": [ | ||
"Mollis", | ||
"Macrocephalus", | ||
"Cypholophus", | ||
"Urticaceae", | ||
"Plantae", | ||
"Linnaean", | ||
"Mollis", | ||
"Macrocephalus", | ||
"Cypholophus", | ||
"Urticaceae", | ||
"Plantae", | ||
"Linnaean", | ||
"Mollis", | ||
"Macrocephalus", | ||
"Cypholophus", | ||
"Urticaceae", | ||
"Plantae", | ||
"Linnaean", | ||
"Mollis", | ||
"Macrocephalus", | ||
"Cypholophus", | ||
"Urticaceae", | ||
"Plantae", | ||
"Linnaean", | ||
"Mollis", | ||
"Macrocephalus", | ||
"Cypholophus", | ||
"Urticaceae", | ||
"Plantae", | ||
"Linnaean", | ||
"Mollis", | ||
"Macrocephalus", | ||
"Cypholophus", | ||
"Urticaceae", | ||
"Plantae", | ||
"Linnaean" | ||
], | ||
"Var.": "Linnaean" | ||
} | ||
] | ||
}, | ||
{ "material": [] }, | ||
{ "place": [] } | ||
], | ||
"subjectCategory": [], | ||
"lastModifiedOn": ["2022-06-23T09:44:41.824Z"], | ||
"tags": { "official": [], "user": [], "_all": [] }, | ||
"dc_place": ["Samoa"], | ||
"kindOfSpecimen": ["1F- Foreign dry"], | ||
"unit": [], | ||
"culturalOrigin": [], | ||
"isSensitive": false, | ||
"series": [], | ||
"dc_title": [ | ||
"mollis", | ||
"Cypholophus macrocephalus mollis", | ||
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." | ||
], | ||
"location": [], | ||
"isInLibrary": false, | ||
"support": [] | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately it's not realistic to have diacritical marks (macron) on these slugs, but we can still have both. I left out "war memorial" from the colonial name because it's less important and the institution itself doesn't always include it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is good to hear. The changes are made.