WordPress · ngken0995 · Jan 12, 2024 · Oct 23, 2023 · Oct 24, 2023 · Oct 24, 2023
@@ -12,6 +12,7 @@
 
 
 # Default provider names
+AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum_tamakipaengahira"
 BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum"
 CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum"
 EUROPEANA_DEFAULT_PROVIDER = "europeana"

@@ -0,0 +1,143 @@
+"""
+Content Provider:       Auckland War Memorial Museum Tāmaki Paenga Hira
+
+ETL Process:            Use the API to identify all CC licensed media.
+
+Output:                 TSV file containing the media and the
+                        respective meta-data.
+
+Notes:                  https://api.aucklandmuseum.com/
+
+Resource:               https://api.aucklandmuseum.com/
+                        https://github.com/AucklandMuseum/API/wiki/Tutorial
+"""
+import logging
+
+from common.constants import IMAGE
+from common.licenses import get_license_info
+from common.loader import provider_details as prov
+from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Prefix of every record's landing URL; `get_record_data` appends the
+# record-specific suffix derived from the record's `_id`.
+LANDING_URL = (
+    "https://www.aucklandmuseum.com/collections-research/collections/record/am_"
+)
+
+
+class AucklandMuseumDataIngester(ProviderDataIngester):
+    """
+    Ingest CC BY 4.0 licensed images from the Auckland Museum collections API.
+
+    Records are requested page by page through the `from`/`size` query
+    parameters. Ingestion halts once the next page would start at or past
+    `total_amount_of_data`.
+    """
+
+    providers = {
+        "image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER,
+    }
+    endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search"
+    license_url = "https://creativecommons.org/licenses/by/4.0/"
+    delay = 4
+    # Offset of the first record to request.
+    from_start = 0
+    # Number of records requested per page.
+    batch_size = 100
+    # Offset limit at which paging stops.
+    # NOTE(review): presumably mirrors the search backend's result window
+    # (from + size may not exceed it) - confirm against the API.
+    total_amount_of_data = 10000
+    DEFAULT_LICENSE_INFO = get_license_info(license_url=license_url)
+
+    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
+        """
+        Build the query params for the next request.
+
+        On the first request `prev_query_params` is falsy and the default
+        params are returned; afterwards `from` is advanced by one page. The
+        offset is also remembered on the instance so `get_should_continue`
+        can tell when the limit has been reached.
+        """
+        if not prev_query_params:
+            # `_exists_:primaryRepresentation` keeps records that have an image url
+            # "+" is the query string syntax for "must be present"
+            # `copyright:CC` selects Creative Commons Attribution 4.0 records
+            self._current_from = self.from_start
+            return {
+                "q": "_exists_:primaryRepresentation+copyright:CC",
+                "size": str(self.batch_size),
+                "from": self._current_from,
+            }
+
+        self._current_from = prev_query_params["from"] + self.batch_size
+        return {**prev_query_params, "from": self._current_from}
+
+    def get_batch_data(self, response_json):
+        """
+        Return the list of records stored under `hits.hits` in the response.
+
+        Returns None when the response is empty or has no `hits` object.
+        """
+        if not response_json:
+            return None
+        return (response_json.get("hits") or {}).get("hits")
+
+    def get_should_continue(self, response_json):
+        """
+        Return False once the next page would start at or past the limit.
+
+        Assumes this is called after the request built by
+        `get_next_query_params` has been made - confirm against the base class.
+        """
+        current_from = getattr(self, "_current_from", self.from_start)
+        if current_from + self.batch_size >= self.total_amount_of_data:
+            logger.info(
+                "The final amount of data has been processed. Halting ingestion."
+            )
+            return False
+
+        return True
+
+    def get_media_type(self, record: dict):
+        """Every record handled by this ingester is an image."""
+        return IMAGE
+
+    def get_record_data(self, data: dict) -> dict | list[dict] | None:
+        """
+        Convert one search hit into the fields of an image record.
+
+        Returns None when the hit has no `_id` or no image url, because
+        neither the identifiers nor the media url can be built without them.
+        """
+        record_id = data.get("_id")
+        information = data.get("_source") or {}
+        url = information.get("primaryRepresentation")
+        if not record_id or not url:
+            return None
+
+        # e.g. ".../id/naturalsciences/object/691102" becomes
+        # "naturalsciences-object-691102"
+        url_parameter = record_id.split("id/")[-1].replace("/", "-")
+        foreign_landing_url = f"{LANDING_URL}{url_parameter}"
+        foreign_identifier = record_id.split("/")[-1]
+
+        thumbnail_url = f"{url}?rendering=thumbnail.jpg"
+        filesize = self._get_file_info(url)
+
+        contributors = information.get("dc_contributor")
+        creator = contributors[0] if contributors else ""
+
+        # A record without a primary title gets no title instead of raising.
+        titles = (information.get("appellation") or {}).get("Primary Title")
+        title = titles[0] if titles else None
+
+        return {
+            "foreign_landing_url": foreign_landing_url,
+            "foreign_identifier": foreign_identifier,
+            "url": url,
+            "license_info": self.DEFAULT_LICENSE_INFO,
+            "thumbnail_url": thumbnail_url,
+            "filesize": filesize,
+            "creator": creator,
+            "title": title,
+            "meta_data": self._get_meta_data(information),
+        }
+
+    def _get_meta_data(self, object_json: dict) -> dict | None:
+        """Collect type, geopos and department, leaving out values that are None."""
+        geopos_values = object_json.get("geopos")
+        departments = object_json.get("department")
+        metadata = {
+            "type": object_json.get("type"),
+            # A missing geopos stays "" rather than None, so this key is kept.
+            "geopos": geopos_values[0] if geopos_values else "",
+            # A missing or empty department becomes None and is dropped below.
+            "department": departments[0] if departments else None,
+        }
+
+        return {k: v for k, v in metadata.items() if v is not None}
+
+    def _get_file_info(self, url) -> int | None:
+        """
+        Get the image size in bytes from a HEAD request.
+
+        Returns None when the request yields no usable response or the
+        `Content-Length` header is absent, zero or not a number.
+        """
+        resp = self.delayed_requester.head(url)
+        if not resp:
+            return None
+        try:
+            filesize = int(resp.headers.get("Content-Length", 0))
+        except (TypeError, ValueError):
+            return None
+        return filesize or None
+
+
+def main():
+    """Run ingestion from the CLI, without Airflow running, for debugging."""
+    AucklandMuseumDataIngester().ingest_records()
+
+
+# Start ingestion only when this module is executed directly as a script.
+if __name__ == "__main__":
+    main()
@@ -8,6 +8,7 @@
 from airflow.models import Variable
 from typing_extensions import NotRequired, TypedDict
 
+from providers.provider_api_scripts.auckland_museum import AucklandMuseumDataIngester
 from providers.provider_api_scripts.brooklyn_museum import BrooklynMuseumDataIngester
 from providers.provider_api_scripts.cleveland_museum import ClevelandDataIngester
 from providers.provider_api_scripts.europeana import EuropeanaDataIngester
@@ -192,6 +193,10 @@ def __post_init__(self):
 
 
 PROVIDER_WORKFLOWS = [
+    ProviderWorkflow(
+        start_date=datetime(2023, 11, 1),
+        ingester_class=AucklandMuseumDataIngester,
+    ),
     ProviderWorkflow(
         start_date=datetime(2020, 1, 1),
         ingester_class=BrooklynMuseumDataIngester,

@@ -0,0 +1,183 @@
+{
+  "_index": "collectionsonline-2022-05-04-1",
+  "_type": "_doc",
+  "_id": "http://api.aucklandmuseum.com/id/naturalsciences/object/691102",
+  "_score": 2.0630994,
+  "_source": {
+    "copyright": ["© Auckland Museum CC BY"],
+    "notes": [],
+    "references": [
+      {
+        "person": {
+          "secondary_maker": [],
+          "primary_maker": [],
+          "classified": ["http://api.aucklandmuseum.com/id/person/28441"],
+          "collected": ["http://api.aucklandmuseum.com/id/person/25299"],
+          "_all": [
+            "http://api.aucklandmuseum.com/id/person/28441",
+            "http://api.aucklandmuseum.com/id/person/25299"
+          ],
+          "referred": []
+        }
+      },
+      { "object": { "childOf": [], "_all": [], "referred": [] } }
+    ],
+    "documentType": [],
+    "geoSubject": [],
+    "language": [],
+    "type": "ecrm:E20_Biological_Object",
+    "content": [],
+    "localityDescription": ["[Western Samoa, Savai'i]   Hinter [behind] Safai"],
+    "acquisitionStatement": [],
+    "recordScore": 40,
+    "responsibility": [],
+    "dc_contributor": ["R. O. Gardner"],
+    "isTaonga": false,
+    "place": {
+      "found": { "_all": [] },
+      "made": { "_all": [] },
+      "associated": { "_all": [] },
+      "captured": { "_all": [] },
+      "published": { "_all": [] },
+      "acquired": { "_all": ["Samoa"] },
+      "_all": ["Samoa"]
+    },
+    "appellation": {
+      "Common Name": [],
+      "Classification Display Value": ["Cypholophus macrocephalus mollis"],
+      "Primary Title": [
+        "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
+      ],
+      "Other Title": [],
+      "_all_suggest": {
+        "input": [
+          "mollis",
+          "Cypholophus macrocephalus mollis",
+          "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
+        ],
+        "contexts": { "type_context": "ecrm:E20_Biological_Object" }
+      },
+      "Maori Name": [],
+      "Classification Value": ["mollis"],
+      "_all": [
+        "mollis",
+        "Cypholophus macrocephalus mollis",
+        "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
+      ]
+    },
+    "keyword": [],
+    "department": ["botany"],
+    "dc_identifier": ["AK28252"],
+    "process": [],
+    "period": {
+      "made": [{ "exact": "2010-07-08T00:00:00.000Z" }],
+      "associated": [],
+      "published": [],
+      "accession": [
+        {
+          "end": "2010-07-08T00:00:00.000Z",
+          "text": "08 Jul 2010",
+          "begin": "2010-07-08T00:00:00.000Z"
+        }
+      ],
+      "acquired": [
+        {
+          "end": "1905-06-22T00:00:00.000Z",
+          "text": "22 Jun 1905",
+          "begin": "1905-06-22T00:00:00.000Z"
+        }
+      ],
+      "time_period": [],
+      "_all": [
+        {
+          "end": "2010-07-08T00:00:00.000Z",
+          "text": "08 Jul 2010",
+          "begin": "2010-07-08T00:00:00.000Z"
+        },
+        {
+          "end": "1905-06-22T00:00:00.000Z",
+          "text": "22 Jun 1905",
+          "begin": "1905-06-22T00:00:00.000Z"
+        },
+        { "exact": "2010-07-08T00:00:00.000Z" }
+      ]
+    },
+    "subjectStatus": [],
+    "geopos": [],
+    "primaryRepresentation": "http://api.aucklandmuseum.com/id/media/v/214749",
+    "dc_date": ["Jul 1989"],
+    "typeStatus": [],
+    "collection": [],
+    "classification": [
+      {
+        "object": [
+          {
+            "Kingdom": "Linnaean",
+            "Genus": "Linnaean",
+            "Linnaean System": "Linnaean",
+            "Family": "Linnaean",
+            "Species": "Linnaean",
+            "_all": [
+              "Mollis",
+              "Macrocephalus",
+              "Cypholophus",
+              "Urticaceae",
+              "Plantae",
+              "Linnaean",
+              "Mollis",
+              "Macrocephalus",
+              "Cypholophus",
+              "Urticaceae",
+              "Plantae",
+              "Linnaean",
+              "Mollis",
+              "Macrocephalus",
+              "Cypholophus",
+              "Urticaceae",
+              "Plantae",
+              "Linnaean",
+              "Mollis",
+              "Macrocephalus",
+              "Cypholophus",
+              "Urticaceae",
+              "Plantae",
+              "Linnaean",
+              "Mollis",
+              "Macrocephalus",
+              "Cypholophus",
+              "Urticaceae",
+              "Plantae",
+              "Linnaean",
+              "Mollis",
+              "Macrocephalus",
+              "Cypholophus",
+              "Urticaceae",
+              "Plantae",
+              "Linnaean"
+            ],
+            "Var.": "Linnaean"
+          }
+        ]
+      },
+      { "material": [] },
+      { "place": [] }
+    ],
+    "subjectCategory": [],
+    "lastModifiedOn": ["2022-06-23T09:44:41.824Z"],
+    "tags": { "official": [], "user": [], "_all": [] },
+    "dc_place": ["Samoa"],
+    "kindOfSpecimen": ["1F- Foreign dry"],
+    "unit": [],
+    "culturalOrigin": [],
+    "isSensitive": false,
+    "series": [],
+    "dc_title": [
+      "mollis",
+      "Cypholophus macrocephalus mollis",
+      "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd."
+    ],
+    "location": [],
+    "isInLibrary": false,
+    "support": []
+  }
+}