Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retrieve Auckland Museum Image Data #3258

Merged
merged 31 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
d253f47
create auckland museum and implement get_next_query_params
ngken0995 Oct 23, 2023
7f46257
implement get_should_continue and get_batch_data
ngken0995 Oct 24, 2023
24a9d31
implement _get_meta_data and _get_file_info
ngken0995 Oct 24, 2023
7fd1303
add test sample
ngken0995 Oct 24, 2023
b6b5447
add new single_item
ngken0995 Oct 24, 2023
4566bda
add test_get_record_data
ngken0995 Oct 24, 2023
3682cbe
implement test_get_record_data
ngken0995 Oct 24, 2023
072978d
if else condition for creator exist
ngken0995 Oct 25, 2023
65c6913
add foreign_identifier and foregin_landing_url
ngken0995 Oct 26, 2023
43fa686
add AucklandMuseumDataIngester to provider_workflow
ngken0995 Oct 26, 2023
cceab62
variables have default values
ngken0995 Oct 27, 2023
1fe2ce0
increment batch_start with batch_limit
ngken0995 Oct 27, 2023
dacc97e
increase batch limit and comment filesize
ngken0995 Nov 2, 2023
3278cbc
remove filesize
ngken0995 Nov 2, 2023
9ed6045
revise test
ngken0995 Nov 2, 2023
e0994ed
implement POST method in requester and have a body for POST request
ngken0995 Dec 12, 2023
7719832
edit test
ngken0995 Dec 12, 2023
cda5b2d
include te reo name in provider_details
ngken0995 Dec 12, 2023
3ecf716
use self.date
ngken0995 Dec 22, 2023
3e23a5c
edit date format
ngken0995 Dec 22, 2023
ce12051
remove if/else condition for date
ngken0995 Dec 22, 2023
8282360
Merge branch 'main' into 1771-auckland-museum
ngken0995 Dec 22, 2023
fdce81b
lint format
ngken0995 Dec 22, 2023
bf5e2bb
add date to test
ngken0995 Dec 22, 2023
1549601
Update catalog/dags/providers/provider_api_scripts/auckland_museum.py
ngken0995 Dec 29, 2023
41edd9e
Update catalog/dags/providers/provider_api_scripts/auckland_museum.py
ngken0995 Dec 29, 2023
f5a6bd8
Update catalog/dags/providers/provider_api_scripts/auckland_museum.py
ngken0995 Dec 29, 2023
0425fcb
Update catalog/dags/providers/provider_api_scripts/auckland_museum.py
ngken0995 Dec 29, 2023
692c89d
Update catalog/dags/providers/provider_api_scripts/auckland_museum.py
ngken0995 Dec 29, 2023
082dafa
Update catalog/dags/providers/provider_api_scripts/auckland_museum.py
ngken0995 Dec 29, 2023
44daa71
using static method
ngken0995 Jan 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions catalog/dags/common/loader/provider_details.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


# Default provider names
AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately it's not realistic to have diacritical marks (macron) on these slugs, but we can still have both. I left out "war memorial" from the colonial name because it's less important and the institution itself doesn't always include it.

Suggested change
AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum"
AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum_tamakipaengahira"

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is good to hear. The changes are made.

BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum"
CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum"
EUROPEANA_DEFAULT_PROVIDER = "europeana"
Expand Down
143 changes: 143 additions & 0 deletions catalog/dags/providers/provider_api_scripts/auckland_museum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""
Content Provider: AucklandMuseum
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Content Provider: AucklandMuseum
Content Provider: Auckland War Memorial Museum Tāmaki Paenga Hira


ETL Process: Use the API to identify all CC licensed media.

Output: TSV file containing the media and the
respective meta-data.

Notes: https://api.aucklandmuseum.com/

Resource: https://api.aucklandmuseum.com/
https://github.com/AucklandMuseum/API/wiki/Tutorial
stacimc marked this conversation as resolved.
Show resolved Hide resolved
"""
import logging

from common.constants import IMAGE
from common.licenses import get_license_info
from common.loader import provider_details as prov
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester


logger = logging.getLogger(__name__)

# Base URL of a record's landing page on the museum website; get_record_data
# appends the record-specific suffix it derives from the record's `_id`.
LANDING_URL = (
"https://www.aucklandmuseum.com/collections-research/collections/record/am_"
)


class AucklandMuseumDataIngester(ProviderDataIngester):
providers = {
"image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER,
}
endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search"
license_url = "https://creativecommons.org/licenses/by/4.0/"
delay = 4
from_start = 0
stacimc marked this conversation as resolved.
Show resolved Hide resolved
total_amount_of_data = 10000
DEFAULT_LICENSE_INFO = get_license_info(license_url=license_url)

def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
# On the first request, `prev_query_params` will be `None`. We can detect this
# and return our default params.
if not prev_query_params:
# Return default query params on the first request
# `primaryRepresentation` holds the image URL of each record.
# "+" is query-string syntax meaning the term must be present.
# `copyright:CC` selects records whose copyright statement is Creative Commons Attribution 4.0.
return {
"q": "_exists_:primaryRepresentation+copyright:CC",
"size": "100",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is the default batch_limit from the parent class, we can use self.batch_limit here (and in the increment in the else statement).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The maximum number of records the API will return is 10,000 (see `hits` -> `total` -> `value` in the API response). `size` is the number of records returned per GET request, and `from` is the offset into the total result set. We can keep incrementing `from` until it reaches 10,000, and the `get_should_continue` function should detect when it reaches that limit.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant that by default self.batch_limit is 100, and that you can just say:

    "size": self.batch_limit,

Rather than hard-coding it separately here.

Copy link
Collaborator Author

@ngken0995 ngken0995 Oct 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will update "size" with batch_limit. Please take a look at the comment below about batch_limit default value.

"from": self.from_start,
}
else:
# Increment `from` by 100.
return {
**prev_query_params,
"from": prev_query_params["from"] + 100,
}

def get_batch_data(self, response_json):
    """Return the list of records contained in a raw API response.

    The records are read from the nested ``hits -> hits`` entry of the
    response body.

    Args:
        response_json: Decoded JSON body returned by the endpoint, or a
            falsy value when there is no usable response.

    Returns:
        The value stored under ``hits -> hits``, or ``None`` when the
        response is empty or carries no ``hits`` entry.
    """
    if not response_json:
        return None

    # Tolerate a missing or null outer "hits" entry instead of raising
    # AttributeError on `None.get(...)`.
    outer_hits = response_json.get("hits") or {}
    return outer_hits.get("hits")

def get_should_continue(self, response_json):
    """Decide whether another batch should be requested.

    Returns ``True`` while ``self.from_start`` is below
    ``self.total_amount_of_data``; otherwise logs a message and returns
    ``False``. ``response_json`` is not inspected.
    """
    # NOTE(review): `from_start` is only read here; confirm it is advanced
    # somewhere, otherwise this limit is never reached.
    has_remaining_data = self.from_start < self.total_amount_of_data
    if has_remaining_data:
        return True

    logger.info("The final amount of data has been processed. Halting ingestion.")
    return False

def get_media_type(self, record: dict):
    """Return the media type for ``record``; every record is treated as an image."""
    return IMAGE

def get_record_data(self, data: dict) -> dict | list[dict] | None:
url_parameter = data.get("_id").split("id/")[-1].replace("/", "-")
ngken0995 marked this conversation as resolved.
Show resolved Hide resolved
foreign_landing_url = f"{LANDING_URL}{url_parameter}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should return None early if license, foreign_identifier, or foreign_landing_url are None (example from another provider).


foreign_identifier = data.get("_id").split("/")[-1]
ngken0995 marked this conversation as resolved.
Show resolved Hide resolved

information = data.get("_source")
stacimc marked this conversation as resolved.
Show resolved Hide resolved

url = information.get("primaryRepresentation")

thumbnail_url = f"{url}?rendering=thumbnail.jpg"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thumbnail their API provides is tiny, fixed width of 70px. @obulat would know best -- should we use this or just default to None here and use our own thumbnail service? They also have a slightly bigger preview rendering with a fixed width of 100px.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have previously discussed the thumbnail sizes, and decided against using thumbnails smaller than 600px: #675 (comment)

ngken0995 marked this conversation as resolved.
Show resolved Hide resolved
license_info = self.DEFAULT_LICENSE_INFO
filesize = self._get_file_info(url)

creator = (
information.get("dc_contributor")[0]
stacimc marked this conversation as resolved.
Show resolved Hide resolved
if information.get("dc_contributor")
else ""
)

title = information.get("appellation").get("Primary Title")[0]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
title = information.get("appellation").get("Primary Title")[0]
title = information.get("appellation", {}).get("Primary Title")[0]

When testing this locally, I also got an IndexError here, so we may need to check that this is not an empty list.

meta_data = self._get_meta_data(information)
data.get("tags")
stacimc marked this conversation as resolved.
Show resolved Hide resolved

return {
"foreign_landing_url": foreign_landing_url,
"foreign_identifier": foreign_identifier,
"url": url,
"license_info": license_info,
"thumbnail_url": thumbnail_url,
ngken0995 marked this conversation as resolved.
Show resolved Hide resolved
"filesize": filesize,
"creator": creator,
"title": title,
"meta_data": meta_data,
}

def _get_meta_data(self, object_json: dict) -> dict | None:
ngken0995 marked this conversation as resolved.
Show resolved Hide resolved
geopos = object_json.get("geopos")[0] if object_json.get("geopos") else ""
metadata = {
"type": object_json.get("type"),
"geopos": geopos,
"department": object_json.get("department")[0],
}

metadata = {k: v for k, v in metadata.items() if v is not None}
return metadata

def _get_file_info(self, url) -> int | None:
"""Get the image size in bytes."""
resp = self.delayed_requester.head(url)
if resp:
filesize = int(resp.headers.get("Content-Length", 0))
return filesize if filesize != 0 else None


def main():
    """Run ingestion directly from the CLI, without Airflow, for debugging."""
    AucklandMuseumDataIngester().ingest_records()


# Run the ingester only when this file is executed as a script.
if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions catalog/dags/providers/provider_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from airflow.models import Variable
from typing_extensions import NotRequired, TypedDict

from providers.provider_api_scripts.auckland_museum import AucklandMuseumDataIngester
from providers.provider_api_scripts.brooklyn_museum import BrooklynMuseumDataIngester
from providers.provider_api_scripts.cleveland_museum import ClevelandDataIngester
from providers.provider_api_scripts.europeana import EuropeanaDataIngester
Expand Down Expand Up @@ -192,6 +193,10 @@ def __post_init__(self):


PROVIDER_WORKFLOWS = [
ProviderWorkflow(
start_date=datetime(2023, 11, 1),
stacimc marked this conversation as resolved.
Show resolved Hide resolved
ingester_class=AucklandMuseumDataIngester,
stacimc marked this conversation as resolved.
Show resolved Hide resolved
),
ProviderWorkflow(
start_date=datetime(2020, 1, 1),
ingester_class=BrooklynMuseumDataIngester,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
{
"_index": "collectionsonline-2022-05-04-1",
"_type": "_doc",
"_id": "http://api.aucklandmuseum.com/id/naturalsciences/object/691102",
"_score": 2.0630994,
"_source": {
"copyright": ["© Auckland Museum CC BY"],
"notes": [],
"references": [
{
"person": {
"secondary_maker": [],
"primary_maker": [],
"classified": ["http://api.aucklandmuseum.com/id/person/28441"],
"collected": ["http://api.aucklandmuseum.com/id/person/25299"],
"_all": [
"http://api.aucklandmuseum.com/id/person/28441",
"http://api.aucklandmuseum.com/id/person/25299"
],
"referred": []
}
},
{ "object": { "childOf": [], "_all": [], "referred": [] } }
],
"documentType": [],
"geoSubject": [],
"language": [],
"type": "ecrm:E20_Biological_Object",
"content": [],
"localityDescription": ["[Western Samoa, Savai'i] Hinter [behind] Safai"],
"acquisitionStatement": [],
"recordScore": 40,
"responsibility": [],
"dc_contributor": ["R. O. Gardner"],
"isTaonga": false,
"place": {
"found": { "_all": [] },
"made": { "_all": [] },
"associated": { "_all": [] },
"captured": { "_all": [] },
"published": { "_all": [] },
"acquired": { "_all": ["Samoa"] },
"_all": ["Samoa"]
},
"appellation": {
"Common Name": [],
"Classification Display Value": ["Cypholophus macrocephalus mollis"],
"Primary Title": [
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
],
"Other Title": [],
"_all_suggest": {
"input": [
"mollis",
"Cypholophus macrocephalus mollis",
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
],
"contexts": { "type_context": "ecrm:E20_Biological_Object" }
},
"Maori Name": [],
"Classification Value": ["mollis"],
"_all": [
"mollis",
"Cypholophus macrocephalus mollis",
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
]
},
"keyword": [],
"department": ["botany"],
"dc_identifier": ["AK28252"],
"process": [],
"period": {
"made": [{ "exact": "2010-07-08T00:00:00.000Z" }],
"associated": [],
"published": [],
"accession": [
{
"end": "2010-07-08T00:00:00.000Z",
"text": "08 Jul 2010",
"begin": "2010-07-08T00:00:00.000Z"
}
],
"acquired": [
{
"end": "1905-06-22T00:00:00.000Z",
"text": "22 Jun 1905",
"begin": "1905-06-22T00:00:00.000Z"
}
],
"time_period": [],
"_all": [
{
"end": "2010-07-08T00:00:00.000Z",
"text": "08 Jul 2010",
"begin": "2010-07-08T00:00:00.000Z"
},
{
"end": "1905-06-22T00:00:00.000Z",
"text": "22 Jun 1905",
"begin": "1905-06-22T00:00:00.000Z"
},
{ "exact": "2010-07-08T00:00:00.000Z" }
]
},
"subjectStatus": [],
"geopos": [],
"primaryRepresentation": "http://api.aucklandmuseum.com/id/media/v/214749",
"dc_date": ["Jul 1989"],
"typeStatus": [],
"collection": [],
"classification": [
{
"object": [
{
"Kingdom": "Linnaean",
"Genus": "Linnaean",
"Linnaean System": "Linnaean",
"Family": "Linnaean",
"Species": "Linnaean",
"_all": [
"Mollis",
"Macrocephalus",
"Cypholophus",
"Urticaceae",
"Plantae",
"Linnaean",
"Mollis",
"Macrocephalus",
"Cypholophus",
"Urticaceae",
"Plantae",
"Linnaean",
"Mollis",
"Macrocephalus",
"Cypholophus",
"Urticaceae",
"Plantae",
"Linnaean",
"Mollis",
"Macrocephalus",
"Cypholophus",
"Urticaceae",
"Plantae",
"Linnaean",
"Mollis",
"Macrocephalus",
"Cypholophus",
"Urticaceae",
"Plantae",
"Linnaean",
"Mollis",
"Macrocephalus",
"Cypholophus",
"Urticaceae",
"Plantae",
"Linnaean"
],
"Var.": "Linnaean"
}
]
},
{ "material": [] },
{ "place": [] }
],
"subjectCategory": [],
"lastModifiedOn": ["2022-06-23T09:44:41.824Z"],
"tags": { "official": [], "user": [], "_all": [] },
"dc_place": ["Samoa"],
"kindOfSpecimen": ["1F- Foreign dry"],
"unit": [],
"culturalOrigin": [],
"isSensitive": false,
"series": [],
"dc_title": [
"mollis",
"Cypholophus macrocephalus mollis",
"Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
],
"location": [],
"isInLibrary": false,
"support": []
}
}
Loading