From d253f4773fde344c0047127f009a35872c0680f1 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Mon, 23 Oct 2023 17:04:08 -0400 Subject: [PATCH 01/30] create auckland museum and implement get_next_query_params --- .../dags/common/loader/provider_details.py | 1 + .../provider_api_scripts/auckland_museum.py | 155 ++++++++++++++++++ .../test_auckland_museum.py | 67 ++++++++ 3 files changed, 223 insertions(+) create mode 100644 catalog/dags/providers/provider_api_scripts/auckland_museum.py create mode 100644 catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py diff --git a/catalog/dags/common/loader/provider_details.py b/catalog/dags/common/loader/provider_details.py index a07108ffc17..8a2a20cbfb9 100644 --- a/catalog/dags/common/loader/provider_details.py +++ b/catalog/dags/common/loader/provider_details.py @@ -12,6 +12,7 @@ # Default provider names +AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum" BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum" CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum" EUROPEANA_DEFAULT_PROVIDER = "europeana" diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py new file mode 100644 index 00000000000..61f2b8a5588 --- /dev/null +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -0,0 +1,155 @@ +""" +TODO: This doc string will be used to generate documentation for the DAG in +DAGs.md. Update it to include any relevant information that you'd like to +be documented. + +Content Provider: AucklandMuseum + +ETL Process: Use the API to identify all CC licensed media. + +Output: TSV file containing the media and the + respective meta-data. + +Notes: https://api.aucklandmuseum.com/ +""" +import logging + +from common.licenses import get_license_info +from common.loader import provider_details as prov +from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester + + +logger = logging.getLogger(__name__) + + +class AucklandMuseumDataIngester(ProviderDataIngester): + """ + This is a template for a ProviderDataIngester. + + Methods are shown with example implementations. Adjust them to suit your API. + """ + + # TODO: Add the provider constants to `common.loader.provider_details.py` + providers = { + "image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER, + } + endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search" + + delay = 4 + from_start = 0 + + def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: + # On the first request, `prev_query_params` will be `None`. We can detect this + # and return our default params. + if not prev_query_params: + # Return default query params on the first request + # primaryRepresentation contain a image url for each data + # "+" is a query string syntax for must be present + # copyright:CC state Creative Commons Attribution 4.0 + return { + "q": "_exists_:primaryRepresentation+copyright:CC", + "size": "100", + "from": self.from_start, + } + else: + # Increment `from` by 100. + return { + **prev_query_params, + "from": prev_query_params["from"] + 100, + } + + def get_batch_data(self, response_json): + # Takes the raw API response from calling `get` on the endpoint, and returns + # the list of records to process. + # TODO: Update based on your API. + if response_json: + return response_json.get("results") + return None + + def get_media_type(self, record: dict): + # For a given record json, return the media type it represents. + # TODO: Update based on your API. TIP: May be hard-coded if the provider only + # returns records of one type, eg `return constants.IMAGE` + return record["media_type"] + + def get_record_data(self, data: dict) -> dict | list[dict] | None: + # Parse out the necessary info from the record data into a dictionary. + # TODO: Update based on your API. + # TODO: Important! Refer to the most up-to-date documentation about the + # available fields in `openverse_catalog/docs/data_models.md` + + # REQUIRED FIELDS: + # - foreign_identifier + # - foreign_landing_url + # - license_info + # - url + # + # If a required field is missing, return early to prevent unnecessary + # processing. + if not (foreign_identifier := data.get("foreign_id")): + return None + + if not (foreign_landing_url := data.get("foreign_landing_url")): + return None + + if not (url := data.get("url")): + return None + + # Use the `get_license_info` utility to get license information from a URL. + license_url = data.get("license") + license_info = get_license_info(license_url) + if license_info is None: + return None + + # OPTIONAL FIELDS + # Obtain as many optional fields as possible. + thumbnail_url = data.get("thumbnail") + filesize = data.get("filesize") + filetype = data.get("filetype") + creator = data.get("creator") + creator_url = data.get("creator_url") + title = data.get("title") + meta_data = data.get("meta_data") + raw_tags = data.get("tags") + watermarked = data.get("watermarked") + + # MEDIA TYPE-SPECIFIC FIELDS + # Each Media type may also have its own optional fields. See documentation. + # TODO: Populate media type-specific fields. + # If your provider supports more than one media type, you'll need to first + # determine the media type of the record being processed. + # + # Example: + # media_type = self.get_media_type(data) + # media_type_specific_fields = self.get_media_specific_fields(media_type, data) + # + # If only one media type is supported, simply extract the fields here. + + return { + "foreign_landing_url": foreign_landing_url, + "url": url, + "license_info": license_info, + # Optional fields + "foreign_identifier": foreign_identifier, + "thumbnail_url": thumbnail_url, + "filesize": filesize, + "filetype": filetype, + "creator": creator, + "creator_url": creator_url, + "title": title, + "meta_data": meta_data, + "raw_tags": raw_tags, + "watermarked": watermarked, + # TODO: Remember to add any media-type specific fields here + } + + +def main(): + # Allows running ingestion from the CLI without Airflow running for debugging + # purposes. + ingester = AucklandMuseumDataIngester() + ingester.ingest_records() + + +if __name__ == "__main__": + main() diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py new file mode 100644 index 00000000000..d78ab6ad523 --- /dev/null +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -0,0 +1,67 @@ +""" +TODO: Add additional tests for any methods you added in your subclass. +Try to test edge cases (missing keys, different data types returned, Nones, etc). +You may also need to update the given test names to be more specific. + +Run your tests locally with `just test -k auckland_museum` +""" + +import json +from pathlib import Path + +from providers.provider_api_scripts.auckland_museum import AucklandMuseumDataIngester + + +# TODO: API responses used for testing can be added to this directory +RESOURCES = Path(__file__).parent / "resources/auckland_museum" + +# Set up test class +ingester = AucklandMuseumDataIngester() + + +def test_get_next_query_params_default_response(): + actual_result = ingester.get_next_query_params(None) + expected_result = { + # TODO: Fill out expected default query params + } + assert actual_result == expected_result + + +def test_get_next_query_params_updates_parameters(): + previous_query_params = { + # TODO: Fill out a realistic set of previous query params + } + actual_result = ingester.get_next_query_params(previous_query_params) + + expected_result = { + # TODO: Fill out what the next set of query params should be, + # incrementing offsets or page numbers if necessary + } + assert actual_result == expected_result + + +def test_get_media_type(): + # TODO: Test the correct media type is returned for each possible media type. + pass + + +def test_get_record_data(): + # High level test for `get_record_data`. One way to test this is to create a + # `tests/resources/AucklandMuseum/single_item.json` file containing a sample json + # representation of a record from the API under test, call `get_record_data` with + # the json, and directly compare to expected output. + # + # Make sure to add additional tests for records of each media type supported by + # your provider. + + # Sample code for loading in the sample json + with open(RESOURCES / "single_item.json") as f: + resource_json = json.load(f) + + actual_data = ingester.get_record_data(resource_json) + + expected_data = { + # TODO: Fill out the expected data which will be saved to the Catalog + } + + assert actual_data == expected_data From 7f46257a469589e24ac142a14987d0378b43b3e1 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 24 Oct 2023 09:34:28 -0400 Subject: [PATCH 02/30] implement get_should_continue and get_batch_data --- .../provider_api_scripts/auckland_museum.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 61f2b8a5588..bbe9e1c20e1 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -14,6 +14,7 @@ """ import logging +from common.constants import IMAGE from common.licenses import get_license_info from common.loader import provider_details as prov from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester @@ -27,9 +28,11 @@ class AucklandMuseumDataIngester(ProviderDataIngester): This is a template for a ProviderDataIngester. Methods are shown with example implementations. Adjust them to suit your API. + Resource: + - https://api.aucklandmuseum.com/ + - https://github.com/AucklandMuseum/API/wiki/Tutorial """ - # TODO: Add the provider constants to `common.loader.provider_details.py` providers = { "image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER, } @@ -37,6 +40,7 @@ class AucklandMuseumDataIngester(ProviderDataIngester): delay = 4 from_start = 0 + total_amount_of_data = 10000 def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: # On the first request, `prev_query_params` will be `None`. We can detect this @@ -61,16 +65,22 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic def get_batch_data(self, response_json): # Takes the raw API response from calling `get` on the endpoint, and returns # the list of records to process. - # TODO: Update based on your API. if response_json: - return response_json.get("results") + return response_json.get("hits").get("hits") return None + def get_should_continue(self, response_json): + # Do not continue if we have exceeded the total amount of data + if self.from_start >= self.total_amount_of_data: + logger.info( + "The final amount of data has been processed. Halting ingestion." + ) + return False + + return True + def get_media_type(self, record: dict): - # For a given record json, return the media type it represents. - # TODO: Update based on your API. TIP: May be hard-coded if the provider only - # returns records of one type, eg `return constants.IMAGE` - return record["media_type"] + return IMAGE def get_record_data(self, data: dict) -> dict | list[dict] | None: # Parse out the necessary info from the record data into a dictionary. From 24a9d312c58625650b7cc4eb735a10228560a021 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 24 Oct 2023 11:00:54 -0400 Subject: [PATCH 03/30] implement _get_meta_data and _get_file_info --- .../provider_api_scripts/auckland_museum.py | 95 +++++++------------ 1 file changed, 35 insertions(+), 60 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index bbe9e1c20e1..6804c6d7f09 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -37,10 +37,11 @@ class AucklandMuseumDataIngester(ProviderDataIngester): "image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER, } endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search" - + license_url = "https://creativecommons.org/licenses/by/4.0/" delay = 4 from_start = 0 total_amount_of_data = 10000 + DEFAULT_LICENSE_INFO = get_license_info(license_url=license_url) def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: # On the first request, `prev_query_params` will be `None`. We can detect this @@ -83,76 +84,50 @@ def get_media_type(self, record: dict): return IMAGE def get_record_data(self, data: dict) -> dict | list[dict] | None: - # Parse out the necessary info from the record data into a dictionary. - # TODO: Update based on your API. - # TODO: Important! Refer to the most up-to-date documentation about the - # available fields in `openverse_catalog/docs/data_models.md` - - # REQUIRED FIELDS: - # - foreign_identifier - # - foreign_landing_url - # - license_info - # - url - # - # If a required field is missing, return early to prevent unnecessary - # processing. - if not (foreign_identifier := data.get("foreign_id")): - return None - - if not (foreign_landing_url := data.get("foreign_landing_url")): - return None - - if not (url := data.get("url")): - return None - - # Use the `get_license_info` utility to get license information from a URL. - license_url = data.get("license") - license_info = get_license_info(license_url) - if license_info is None: - return None - - # OPTIONAL FIELDS - # Obtain as many optional fields as possible. - thumbnail_url = data.get("thumbnail") - filesize = data.get("filesize") - filetype = data.get("filetype") - creator = data.get("creator") - creator_url = data.get("creator_url") - title = data.get("title") - meta_data = data.get("meta_data") - raw_tags = data.get("tags") - watermarked = data.get("watermarked") - - # MEDIA TYPE-SPECIFIC FIELDS - # Each Media type may also have its own optional fields. See documentation. - # TODO: Populate media type-specific fields. - # If your provider supports more than one media type, you'll need to first - # determine the media type of the record being processed. - # - # Example: - # media_type = self.get_media_type(data) - # media_type_specific_fields = self.get_media_specific_fields(media_type, data) - # - # If only one media type is supported, simply extract the fields here. + information = data.get("_source") + + url = information.get("primaryRepresentation") + thumbnail_url = f"{url}?rendering=thumbnail.jpg" + license_info = self.DEFAULT_LICENSE_INFO + filesize = self._get_file_info(url) + + if information.get("dc_contributor")[0]: + creator = information.get("dc_contributor")[0] + else: + creator = "" + + creator = information.get("dc_contributor")[0] + title = information.get("appellation").get("Primary Title")[0] + meta_data = self._get_meta_data(information) + data.get("tags") return { - "foreign_landing_url": foreign_landing_url, "url": url, "license_info": license_info, - # Optional fields - "foreign_identifier": foreign_identifier, "thumbnail_url": thumbnail_url, "filesize": filesize, - "filetype": filetype, "creator": creator, - "creator_url": creator_url, "title": title, "meta_data": meta_data, - "raw_tags": raw_tags, - "watermarked": watermarked, - # TODO: Remember to add any media-type specific fields here } + def _get_meta_data(self, object_json: dict) -> dict | None: + metadata = { + "type": object_json.get("type"), + "geopos": object_json.get("geopos")[0], + "department": object_json.get("department")[0], + } + + metadata = {k: v for k, v in metadata.items() if v is not None} + return metadata + + def _get_file_info(self, url) -> int | None: + """Get the image size in bytes.""" + resp = self.delayed_requester.head(url) + if resp: + filesize = int(resp.headers.get("Content-Length", 0)) + return filesize if filesize != 0 else None + def main(): # Allows running ingestion from the CLI without Airflow running for debugging From 7fd1303024eb92b89b4c930fb43fd3bfe4ae8156 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 24 Oct 2023 14:19:51 -0400 Subject: [PATCH 04/30] add test sample --- .../resources/aucklandmuseum/sample_data.json | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json b/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json new file mode 100644 index 00000000000..3183a37b6f5 --- /dev/null +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json @@ -0,0 +1,168 @@ +[ + { + "_index": "collectionsonline-2022-05-04-1", + "_type": "_doc", + "_id": "http://api.aucklandmuseum.com/id/naturalsciences/object/923361", + "_score": 2.0630994, + "_source": { + "copyright": ["© Auckland Museum CC BY"], + "notes": [], + "references": [ + { + "person": { + "secondary_maker": [], + "primary_maker": [], + "classified": ["http://api.aucklandmuseum.com/id/person/18160"], + "collected": ["http://api.aucklandmuseum.com/id/person/18160"], + "_all": ["http://api.aucklandmuseum.com/id/person/18160"], + "referred": [] + } + }, + { "object": { "childOf": [], "_all": [], "referred": [] } } + ], + "documentType": [], + "geoSubject": [], + "language": [], + "type": "ecrm:E20_Biological_Object", + "content": [], + "localityDescription": ["New Caledonia, near Touaourou"], + "acquisitionStatement": [], + "recordScore": 40, + "responsibility": [], + "dc_contributor": ["John Taylor"], + "isTaonga": false, + "place": { + "found": { "_all": [] }, + "made": { "_all": [] }, + "associated": { "_all": [] }, + "captured": { "_all": [] }, + "published": { "_all": [] }, + "acquired": { "_all": ["New Caledonia"] }, + "_all": ["New Caledonia"] + }, + "appellation": { + "Common Name": [], + "Classification Display Value": ["Blechnum obtusatum"], + "Primary Title": ["Blechnum obtusatum (Labill.) Mett."], + "Other Title": [], + "_all_suggest": { + "input": [ + "obtusatum", + "Blechnum obtusatum", + "Blechnum obtusatum (Labill.) Mett." + ], + "contexts": { "type_context": "ecrm:E20_Biological_Object" } + }, + "Maori Name": [], + "Classification Value": ["obtusatum"], + "_all": [ + "obtusatum", + "Blechnum obtusatum", + "Blechnum obtusatum (Labill.) Mett." + ] + }, + "keyword": [], + "department": ["Botany"], + "dc_identifier": ["AK379487"], + "process": [], + "period": { + "made": [{ "exact": "2020-02-05T00:00:00.000Z" }], + "associated": [], + "published": [], + "accession": [ + { + "end": "2020-02-05T00:00:00.000Z", + "text": "05 Feb 2020", + "begin": "2020-02-05T00:00:00.000Z" + } + ], + "acquired": [ + { + "end": "1973-08-19T00:00:00.000Z", + "text": "19 Aug 1973", + "begin": "1973-08-19T00:00:00.000Z" + } + ], + "time_period": [], + "_all": [ + { + "end": "1973-08-19T00:00:00.000Z", + "text": "19 Aug 1973", + "begin": "1973-08-19T00:00:00.000Z" + }, + { "exact": "2020-02-05T00:00:00.000Z" }, + { + "end": "2020-02-05T00:00:00.000Z", + "text": "05 Feb 2020", + "begin": "2020-02-05T00:00:00.000Z" + } + ] + }, + "subjectStatus": [], + "geopos": ["-22.2409166666667, 167.009111111111"], + "primaryRepresentation": "http://api.aucklandmuseum.com/id/media/v/801545", + "dc_date": ["1973"], + "typeStatus": [], + "collection": [], + "classification": [ + { + "object": [ + { + "Kingdom": "Linnaean", + "Genus": "Linnaean", + "Linnaean System": "Linnaean", + "Family": "Linnaean", + "Species": "Linnaean", + "_all": [ + "Obtusatum", + "Blechnum", + "Blechnaceae", + "Plantae", + "Linnaean", + "Obtusatum", + "Blechnum", + "Blechnaceae", + "Plantae", + "Linnaean", + "Obtusatum", + "Blechnum", + "Blechnaceae", + "Plantae", + "Linnaean", + "Obtusatum", + "Blechnum", + "Blechnaceae", + "Plantae", + "Linnaean", + "Obtusatum", + "Blechnum", + "Blechnaceae", + "Plantae", + "Linnaean" + ] + } + ] + }, + { "material": [] }, + { "place": [] } + ], + "subjectCategory": [], + "lastModifiedOn": ["2022-04-28T09:52:10.256Z"], + "tags": { "official": [], "user": [], "_all": [] }, + "dc_place": ["New Caledonia"], + "kindOfSpecimen": ["1F- Foreign dry"], + "unit": [], + "culturalOrigin": [], + "isSensitive": false, + "series": [], + "dc_title": [ + "obtusatum", + "Blechnum obtusatum", + "Blechnum obtusatum (Labill.) Mett." + ], + "location": [], + "isInLibrary": false, + "support": [] + } + } +] From b6b5447927824488b29f7b348d803d79b71c1b8a Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 24 Oct 2023 14:45:41 -0400 Subject: [PATCH 05/30] add new single_item --- .../resources/aucklandmuseum/sample_data.json | 168 ---------------- .../resources/aucklandmuseum/single_item.json | 183 ++++++++++++++++++ 2 files changed, 183 insertions(+), 168 deletions(-) delete mode 100644 catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json create mode 100644 catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/single_item.json diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json b/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json deleted file mode 100644 index 3183a37b6f5..00000000000 --- a/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/sample_data.json +++ /dev/null @@ -1,168 +0,0 @@ -[ - { - "_index": "collectionsonline-2022-05-04-1", - "_type": "_doc", - "_id": "http://api.aucklandmuseum.com/id/naturalsciences/object/923361", - "_score": 2.0630994, - "_source": { - "copyright": ["© Auckland Museum CC BY"], - "notes": [], - "references": [ - { - "person": { - "secondary_maker": [], - "primary_maker": [], - "classified": ["http://api.aucklandmuseum.com/id/person/18160"], - "collected": ["http://api.aucklandmuseum.com/id/person/18160"], - "_all": ["http://api.aucklandmuseum.com/id/person/18160"], - "referred": [] - } - }, - { "object": { "childOf": [], "_all": [], "referred": [] } } - ], - "documentType": [], - "geoSubject": [], - "language": [], - "type": "ecrm:E20_Biological_Object", - "content": [], - "localityDescription": ["New Caledonia, near Touaourou"], - "acquisitionStatement": [], - "recordScore": 40, - "responsibility": [], - "dc_contributor": ["John Taylor"], - "isTaonga": false, - "place": { - "found": { "_all": [] }, - "made": { "_all": [] }, - "associated": { "_all": [] }, - "captured": { "_all": [] }, - "published": { "_all": [] }, - "acquired": { "_all": ["New Caledonia"] }, - "_all": ["New Caledonia"] - }, - "appellation": { - "Common Name": [], - "Classification Display Value": ["Blechnum obtusatum"], - "Primary Title": ["Blechnum obtusatum (Labill.) Mett."], - "Other Title": [], - "_all_suggest": { - "input": [ - "obtusatum", - "Blechnum obtusatum", - "Blechnum obtusatum (Labill.) Mett." - ], - "contexts": { "type_context": "ecrm:E20_Biological_Object" } - }, - "Maori Name": [], - "Classification Value": ["obtusatum"], - "_all": [ - "obtusatum", - "Blechnum obtusatum", - "Blechnum obtusatum (Labill.) Mett." - ] - }, - "keyword": [], - "department": ["Botany"], - "dc_identifier": ["AK379487"], - "process": [], - "period": { - "made": [{ "exact": "2020-02-05T00:00:00.000Z" }], - "associated": [], - "published": [], - "accession": [ - { - "end": "2020-02-05T00:00:00.000Z", - "text": "05 Feb 2020", - "begin": "2020-02-05T00:00:00.000Z" - } - ], - "acquired": [ - { - "end": "1973-08-19T00:00:00.000Z", - "text": "19 Aug 1973", - "begin": "1973-08-19T00:00:00.000Z" - } - ], - "time_period": [], - "_all": [ - { - "end": "1973-08-19T00:00:00.000Z", - "text": "19 Aug 1973", - "begin": "1973-08-19T00:00:00.000Z" - }, - { "exact": "2020-02-05T00:00:00.000Z" }, - { - "end": "2020-02-05T00:00:00.000Z", - "text": "05 Feb 2020", - "begin": "2020-02-05T00:00:00.000Z" - } - ] - }, - "subjectStatus": [], - "geopos": ["-22.2409166666667, 167.009111111111"], - "primaryRepresentation": "http://api.aucklandmuseum.com/id/media/v/801545", - "dc_date": ["1973"], - "typeStatus": [], - "collection": [], - "classification": [ - { - "object": [ - { - "Kingdom": "Linnaean", - "Genus": "Linnaean", - "Linnaean System": "Linnaean", - "Family": "Linnaean", - "Species": "Linnaean", - "_all": [ - "Obtusatum", - "Blechnum", - "Blechnaceae", - "Plantae", - "Linnaean", - "Obtusatum", - "Blechnum", - "Blechnaceae", - "Plantae", - "Linnaean", - "Obtusatum", - "Blechnum", - "Blechnaceae", - "Plantae", - "Linnaean", - "Obtusatum", - "Blechnum", - "Blechnaceae", - "Plantae", - "Linnaean", - "Obtusatum", - "Blechnum", - "Blechnaceae", - "Plantae", - "Linnaean" - ] - } - ] - }, - { "material": [] }, - { "place": [] } - ], - "subjectCategory": [], - "lastModifiedOn": ["2022-04-28T09:52:10.256Z"], - "tags": { "official": [], "user": [], "_all": [] }, - "dc_place": ["New Caledonia"], - "kindOfSpecimen": ["1F- Foreign dry"], - "unit": [], - "culturalOrigin": [], - "isSensitive": false, - "series": [], - "dc_title": [ - "obtusatum", - "Blechnum obtusatum", - "Blechnum obtusatum (Labill.) Mett." - ], - "location": [], - "isInLibrary": false, - "support": [] - } - } -] diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/single_item.json b/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/single_item.json new file mode 100644 index 00000000000..ac482f907ed --- /dev/null +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/aucklandmuseum/single_item.json @@ -0,0 +1,183 @@ +{ + "_index": "collectionsonline-2022-05-04-1", + "_type": "_doc", + "_id": "http://api.aucklandmuseum.com/id/naturalsciences/object/691102", + "_score": 2.0630994, + "_source": { + "copyright": ["© Auckland Museum CC BY"], + "notes": [], + "references": [ + { + "person": { + "secondary_maker": [], + "primary_maker": [], + "classified": ["http://api.aucklandmuseum.com/id/person/28441"], + "collected": ["http://api.aucklandmuseum.com/id/person/25299"], + "_all": [ + "http://api.aucklandmuseum.com/id/person/28441", + "http://api.aucklandmuseum.com/id/person/25299" + ], + "referred": [] + } + }, + { "object": { "childOf": [], "_all": [], "referred": [] } } + ], + "documentType": [], + "geoSubject": [], + "language": [], + "type": "ecrm:E20_Biological_Object", + "content": [], + "localityDescription": ["[Western Samoa, Savai'i] Hinter [behind] Safai"], + "acquisitionStatement": [], + "recordScore": 40, + "responsibility": [], + "dc_contributor": ["R. O. Gardner"], + "isTaonga": false, + "place": { + "found": { "_all": [] }, + "made": { "_all": [] }, + "associated": { "_all": [] }, + "captured": { "_all": [] }, + "published": { "_all": [] }, + "acquired": { "_all": ["Samoa"] }, + "_all": ["Samoa"] + }, + "appellation": { + "Common Name": [], + "Classification Display Value": ["Cypholophus macrocephalus mollis"], + "Primary Title": [ + "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." + ], + "Other Title": [], + "_all_suggest": { + "input": [ + "mollis", + "Cypholophus macrocephalus mollis", + "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." + ], + "contexts": { "type_context": "ecrm:E20_Biological_Object" } + }, + "Maori Name": [], + "Classification Value": ["mollis"], + "_all": [ + "mollis", + "Cypholophus macrocephalus mollis", + "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." + ] + }, + "keyword": [], + "department": ["botany"], + "dc_identifier": ["AK28252"], + "process": [], + "period": { + "made": [{ "exact": "2010-07-08T00:00:00.000Z" }], + "associated": [], + "published": [], + "accession": [ + { + "end": "2010-07-08T00:00:00.000Z", + "text": "08 Jul 2010", + "begin": "2010-07-08T00:00:00.000Z" + } + ], + "acquired": [ + { + "end": "1905-06-22T00:00:00.000Z", + "text": "22 Jun 1905", + "begin": "1905-06-22T00:00:00.000Z" + } + ], + "time_period": [], + "_all": [ + { + "end": "2010-07-08T00:00:00.000Z", + "text": "08 Jul 2010", + "begin": "2010-07-08T00:00:00.000Z" + }, + { + "end": "1905-06-22T00:00:00.000Z", + "text": "22 Jun 1905", + "begin": "1905-06-22T00:00:00.000Z" + }, + { "exact": "2010-07-08T00:00:00.000Z" } + ] + }, + "subjectStatus": [], + "geopos": [], + "primaryRepresentation": "http://api.aucklandmuseum.com/id/media/v/214749", + "dc_date": ["Jul 1989"], + "typeStatus": [], + "collection": [], + "classification": [ + { + "object": [ + { + "Kingdom": "Linnaean", + "Genus": "Linnaean", + "Linnaean System": "Linnaean", + "Family": "Linnaean", + "Species": "Linnaean", + "_all": [ + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean" + ], + "Var.": "Linnaean" + } + ] + }, + { "material": [] }, + { "place": [] } + ], + "subjectCategory": [], + "lastModifiedOn": ["2022-06-23T09:44:41.824Z"], + "tags": { "official": [], "user": [], "_all": [] }, + "dc_place": ["Samoa"], + "kindOfSpecimen": ["1F- Foreign dry"], + "unit": [], + "culturalOrigin": [], + "isSensitive": false, + "series": [], + "dc_title": [ + "mollis", + "Cypholophus macrocephalus mollis", + "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." + ], + "location": [], + "isInLibrary": false, + "support": [] + } +} From 4566bdad90ea3269e621c2810b942c4dfa506029 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 24 Oct 2023 15:01:39 -0400 Subject: [PATCH 06/30] add test_get_record_data --- .../test_auckland_museum.py | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index d78ab6ad523..bcfd7e48629 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -5,46 +5,51 @@ Run your tests locally with `just test -k auckland_museum` """ - import json from pathlib import Path +from catalog.tests.dags.providers.provider_api_scripts.resources.json_load import ( + make_resource_json_func, +) +from common.licenses import get_license_info from providers.provider_api_scripts.auckland_museum import AucklandMuseumDataIngester # TODO: API responses used for testing can be added to this directory RESOURCES = Path(__file__).parent / "resources/auckland_museum" +CC_BY_4_0 = get_license_info("https://creativecommons.org/licenses/by/4.0/") # Set up test class ingester = AucklandMuseumDataIngester() +_get_resource_json = make_resource_json_func("aucklandmuseum") def test_get_next_query_params_default_response(): - actual_result = ingester.get_next_query_params(None) - expected_result = { - # TODO: Fill out expected default query params + actual_param = ingester.get_next_query_params(None) + expected_param = { + "q": "_exists_:primaryRepresentation+copyright:CC", + "size": "100", + "from": ingester.from_start, } - assert actual_result == expected_result + assert actual_param == expected_param def test_get_next_query_params_updates_parameters(): previous_query_params = { - # TODO: Fill out a realistic set of previous query params + "q": "_exists_:primaryRepresentation+copyright:CC", + "size": "100", + "from": ingester.from_start, } actual_result = ingester.get_next_query_params(previous_query_params) expected_result = { - # TODO: Fill out what the next set of query params should be, - # incrementing offsets or page numbers if necessary + "q": "_exists_:primaryRepresentation+copyright:CC", + "size": "100", + "from": ingester.from_start + 100, } assert actual_result == expected_result -def test_get_media_type(): - # TODO: Test the correct media type is returned for each possible media type. - pass - - def test_get_record_data(): # High level test for `get_record_data`. One way to test this is to create a # `tests/resources/AucklandMuseum/single_item.json` file containing a sample json @@ -55,13 +60,24 @@ def test_get_record_data(): # your provider. # Sample code for loading in the sample json + with open(RESOURCES / "single_item.json") as f: resource_json = json.load(f) actual_data = ingester.get_record_data(resource_json) - + meta_data = { + "type": "ecrm:E20_Biological_Object", + "geopos": "", + "department": "botany", + } expected_data = { - # TODO: Fill out the expected data which will be saved to the Catalog + "url": "http://api.aucklandmuseum.com/id/media/v/214749", + "license_info": CC_BY_4_0, + "thumbnail_url": "http://api.aucklandmuseum.com/id/media/v/214749?rendering=thumbnail.jpg", + "filesize": "2484439", + "creator": "R. O. Gardner", + "title": "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd.", + "meta_data": meta_data, } assert actual_data == expected_data From 3682cbe9214eddfc12a620bb399415a3283f14d0 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 24 Oct 2023 15:44:30 -0400 Subject: [PATCH 07/30] implement test_get_record_data --- .../provider_api_scripts/auckland_museum.py | 3 ++- .../test_auckland_museum.py | 26 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 6804c6d7f09..53a85af387d 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -112,9 +112,10 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: } def _get_meta_data(self, object_json: dict) -> dict | None: + geopos = object_json.get("geopos")[0] if object_json.get("geopos") else "" metadata = { "type": object_json.get("type"), - "geopos": object_json.get("geopos")[0], + "geopos": geopos, "department": object_json.get("department")[0], } diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index bcfd7e48629..a57b33b05b9 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -5,8 +5,11 @@ Run your tests locally with `just test -k auckland_museum` """ -import json + from pathlib import Path +from unittest.mock import patch + +import pytest from catalog.tests.dags.providers.provider_api_scripts.resources.json_load import ( make_resource_json_func, @@ -16,13 +19,22 @@ # TODO: API responses used for testing can be added to this directory -RESOURCES = Path(__file__).parent / "resources/auckland_museum" +RESOURCES = Path(__file__).parent / "resources/aucklandmuseum" CC_BY_4_0 = get_license_info("https://creativecommons.org/licenses/by/4.0/") # Set up test class ingester = AucklandMuseumDataIngester() _get_resource_json = make_resource_json_func("aucklandmuseum") +AUDIO_FILE_SIZE = 2484439 + + +@pytest.fixture +def file_size_patch(): + with patch.object(ingester, "_get_file_info") as get_file_info_mock: + get_file_info_mock.return_value = AUDIO_FILE_SIZE + yield + def test_get_next_query_params_default_response(): actual_param = ingester.get_next_query_params(None) @@ -50,7 +62,7 @@ def test_get_next_query_params_updates_parameters(): assert actual_result == expected_result -def test_get_record_data(): +def test_get_record_data(file_size_patch): # High level test for `get_record_data`. One way to test this is to create a # `tests/resources/AucklandMuseum/single_item.json` file containing a sample json # representation of a record from the API under test, call `get_record_data` with @@ -61,10 +73,8 @@ def test_get_record_data(): # Sample code for loading in the sample json - with open(RESOURCES / "single_item.json") as f: - resource_json = json.load(f) - - actual_data = ingester.get_record_data(resource_json) + single_item = _get_resource_json("single_item.json") + actual_data = ingester.get_record_data(single_item) meta_data = { "type": "ecrm:E20_Biological_Object", "geopos": "", @@ -74,7 +84,7 @@ def test_get_record_data(): "url": "http://api.aucklandmuseum.com/id/media/v/214749", "license_info": CC_BY_4_0, "thumbnail_url": "http://api.aucklandmuseum.com/id/media/v/214749?rendering=thumbnail.jpg", - "filesize": "2484439", + "filesize": AUDIO_FILE_SIZE, "creator": "R. O. Gardner", "title": "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd.", "meta_data": meta_data, From 072978d5c7a82a5fe290ca07658c6ea04a8360f7 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Wed, 25 Oct 2023 15:29:17 -0400 Subject: [PATCH 08/30] if else condition for creator exist --- .../provider_api_scripts/auckland_museum.py | 26 ++++++------------- .../test_auckland_museum.py | 4 +-- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 53a85af387d..94a6fd4943c 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -1,8 +1,4 @@ """ -TODO: This doc string will be used to generate documentation for the DAG in -DAGs.md. Update it to include any relevant information that you'd like to -be documented. - Content Provider: AucklandMuseum ETL Process: Use the API to identify all CC licensed media. @@ -11,6 +7,9 @@ respective meta-data. Notes: https://api.aucklandmuseum.com/ + +Resource: https://api.aucklandmuseum.com/ + https://github.com/AucklandMuseum/API/wiki/Tutorial """ import logging @@ -24,15 +23,6 @@ class AucklandMuseumDataIngester(ProviderDataIngester): - """ - This is a template for a ProviderDataIngester. - - Methods are shown with example implementations. Adjust them to suit your API. - Resource: - - https://api.aucklandmuseum.com/ - - https://github.com/AucklandMuseum/API/wiki/Tutorial - """ - providers = { "image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER, } @@ -91,12 +81,12 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: license_info = self.DEFAULT_LICENSE_INFO filesize = self._get_file_info(url) - if information.get("dc_contributor")[0]: - creator = information.get("dc_contributor")[0] - else: - creator = "" + creator = ( + information.get("dc_contributor")[0] + if information.get("dc_contributor") + else "" + ) - creator = information.get("dc_contributor")[0] title = information.get("appellation").get("Primary Title")[0] meta_data = self._get_meta_data(information) data.get("tags") diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index a57b33b05b9..a344a1e337e 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -1,5 +1,5 @@ """ -TODO: Add additional tests for any methods you added in your subclass. + Try to test edge cases (missing keys, different data types returned, Nones, etc). You may also need to update the given test names to be more specific. @@ -18,11 +18,9 @@ from providers.provider_api_scripts.auckland_museum import AucklandMuseumDataIngester -# TODO: API responses used for testing can be added to this directory RESOURCES = Path(__file__).parent / "resources/aucklandmuseum" CC_BY_4_0 = get_license_info("https://creativecommons.org/licenses/by/4.0/") -# Set up test class ingester = AucklandMuseumDataIngester() _get_resource_json = make_resource_json_func("aucklandmuseum") From 65c6913eb6dd44c314d9d2cba26f6dd67b7a0545 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Thu, 26 Oct 2023 11:48:15 -0400 Subject: [PATCH 09/30] add foreign_identifier and foregin_landing_url --- .../provider_api_scripts/auckland_museum.py | 12 ++++++++++++ .../provider_api_scripts/test_auckland_museum.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 94a6fd4943c..121271d5d7b 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -21,6 +21,10 @@ logger = logging.getLogger(__name__) +LANDING_URL = ( + "https://www.aucklandmuseum.com/collections-research/collections/record/am_" +) + class AucklandMuseumDataIngester(ProviderDataIngester): providers = { @@ -74,9 +78,15 @@ def get_media_type(self, record: dict): return IMAGE def get_record_data(self, data: dict) -> dict | list[dict] | None: + url_parameter = data.get("_id").split("id/")[-1].replace("/", "-") + foreign_landing_url = f"{LANDING_URL}{url_parameter}" + + foreign_identifier = data.get("_id").split("/")[-1] + information = data.get("_source") url = information.get("primaryRepresentation") + thumbnail_url = f"{url}?rendering=thumbnail.jpg" license_info = self.DEFAULT_LICENSE_INFO filesize = self._get_file_info(url) @@ -92,6 +102,8 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: data.get("tags") return { + "foreign_landing_url": foreign_landing_url, + "foreign_identifier": foreign_identifier, "url": url, "license_info": license_info, "thumbnail_url": thumbnail_url, diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index a344a1e337e..392d5e8c887 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -79,6 +79,8 @@ def test_get_record_data(file_size_patch): "department": "botany", } expected_data = { + "foreign_landing_url": "https://www.aucklandmuseum.com/collections-research/collections/record/am_naturalsciences-object-691102", + "foreign_identifier": "691102", "url": "http://api.aucklandmuseum.com/id/media/v/214749", "license_info": CC_BY_4_0, "thumbnail_url": "http://api.aucklandmuseum.com/id/media/v/214749?rendering=thumbnail.jpg", From 43fa686e52bcf767ff3f0d0ec749cd995c84f80b Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Thu, 26 Oct 2023 11:59:46 -0400 Subject: [PATCH 10/30] add AucklandMuseumDataIngester to provider_workflow --- catalog/dags/providers/provider_workflows.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py index 81f8c1e55e0..04972b72e23 100644 --- a/catalog/dags/providers/provider_workflows.py +++ b/catalog/dags/providers/provider_workflows.py @@ -8,6 +8,7 @@ from airflow.models import Variable from typing_extensions import NotRequired, TypedDict +from providers.provider_api_scripts.auckland_museum import AucklandMuseumDataIngester from providers.provider_api_scripts.brooklyn_museum import BrooklynMuseumDataIngester from providers.provider_api_scripts.cleveland_museum import ClevelandDataIngester from providers.provider_api_scripts.europeana import EuropeanaDataIngester @@ -192,6 +193,10 @@ def __post_init__(self): PROVIDER_WORKFLOWS = [ + ProviderWorkflow( + start_date=datetime(2023, 11, 1), + ingester_class=AucklandMuseumDataIngester, + ), ProviderWorkflow( start_date=datetime(2020, 1, 1), ingester_class=BrooklynMuseumDataIngester, From cceab62936c1a985baef066db33a15a743dc7793 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Fri, 27 Oct 2023 09:03:37 -0400 Subject: [PATCH 11/30] variables have default values --- .../provider_api_scripts/auckland_museum.py | 48 ++++++++++++++----- catalog/dags/providers/provider_workflows.py | 2 +- .../test_auckland_museum.py | 8 ---- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 121271d5d7b..5690f3537ad 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -10,6 +10,11 @@ Resource: https://api.aucklandmuseum.com/ https://github.com/AucklandMuseum/API/wiki/Tutorial + +Resource | Requests per second | Requests per day +-- | -- | -- +/search, /id | 10 | 1000 +/id/media | 10 | 1000 """ import logging @@ -32,11 +37,15 @@ class AucklandMuseumDataIngester(ProviderDataIngester): } endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search" license_url = "https://creativecommons.org/licenses/by/4.0/" - delay = 4 - from_start = 0 total_amount_of_data = 10000 + DEFAULT_LICENSE_INFO = get_license_info(license_url=license_url) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.delay = 4 + self.batch_start = 0 + def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: # On the first request, `prev_query_params` will be `None`. We can detect this # and return our default params. @@ -48,10 +57,11 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic return { "q": "_exists_:primaryRepresentation+copyright:CC", "size": "100", - "from": self.from_start, + "from": self.batch_start, } else: # Increment `from` by 100. + self.batch_start += 100 return { **prev_query_params, "from": prev_query_params["from"] + 100, @@ -61,7 +71,7 @@ def get_batch_data(self, response_json): # Takes the raw API response from calling `get` on the endpoint, and returns # the list of records to process. if response_json: - return response_json.get("hits").get("hits") + return response_json.get("hits", {}).get("hits") return None def get_should_continue(self, response_json): @@ -78,14 +88,21 @@ def get_media_type(self, record: dict): return IMAGE def get_record_data(self, data: dict) -> dict | list[dict] | None: + # check if _id is empty then foreign_landing_url and + # foreign_identifier doesn't exist + + if not data.get("_id"): + return None + url_parameter = data.get("_id").split("id/")[-1].replace("/", "-") foreign_landing_url = f"{LANDING_URL}{url_parameter}" foreign_identifier = data.get("_id").split("/")[-1] - information = data.get("_source") + information = data.get("_source", {}) - url = information.get("primaryRepresentation") + if not (url := information.get("primaryRepresentation")): + return None thumbnail_url = f"{url}?rendering=thumbnail.jpg" license_info = self.DEFAULT_LICENSE_INFO @@ -93,13 +110,16 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: creator = ( information.get("dc_contributor")[0] - if information.get("dc_contributor") + if information.get("dc_contributor", []) else "" ) - title = information.get("appellation").get("Primary Title")[0] + title = ( + information.get("appellation").get("Primary Title")[0] + if information.get("appellation", []) + else "" + ) meta_data = self._get_meta_data(information) - data.get("tags") return { "foreign_landing_url": foreign_landing_url, @@ -114,11 +134,17 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: } def _get_meta_data(self, object_json: dict) -> dict | None: - geopos = object_json.get("geopos")[0] if object_json.get("geopos") else "" + geopos = object_json.get("geopos")[0] if object_json.get("geopos", []) else "" + department = ( + object_json.get("department")[0] + if object_json.get("department", []) + else "" + ) + metadata = { "type": object_json.get("type"), "geopos": geopos, - "department": object_json.get("department")[0], + "department": department, } metadata = {k: v for k, v in metadata.items() if v is not None} diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py index 04972b72e23..2300d6517b0 100644 --- a/catalog/dags/providers/provider_workflows.py +++ b/catalog/dags/providers/provider_workflows.py @@ -194,7 +194,7 @@ def __post_init__(self): PROVIDER_WORKFLOWS = [ ProviderWorkflow( - start_date=datetime(2023, 11, 1), + start_date=datetime(2023, 10, 1), ingester_class=AucklandMuseumDataIngester, ), ProviderWorkflow( diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index 392d5e8c887..02c7ba84a2f 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -1,11 +1,3 @@ -""" - -Try to test edge cases (missing keys, different data types returned, Nones, etc). -You may also need to update the given test names to be more specific. - -Run your tests locally with `just test -k auckland_museum` -""" - from pathlib import Path from unittest.mock import patch From 1fe2ce00912882419d8e3922bc227e0abd99316e Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Fri, 27 Oct 2023 09:09:02 -0400 Subject: [PATCH 12/30] increment batch_start with batch_limit --- .../provider_api_scripts/auckland_museum.py | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 5690f3537ad..6f815761f07 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -38,34 +38,24 @@ class AucklandMuseumDataIngester(ProviderDataIngester): endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search" license_url = "https://creativecommons.org/licenses/by/4.0/" total_amount_of_data = 10000 - DEFAULT_LICENSE_INFO = get_license_info(license_url=license_url) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.delay = 4 self.batch_start = 0 + self.batch_limit = 100 def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: - # On the first request, `prev_query_params` will be `None`. We can detect this - # and return our default params. - if not prev_query_params: - # Return default query params on the first request - # primaryRepresentation contain a image url for each data - # "+" is a query string syntax for must be present - # copyright:CC state Creative Commons Attribution 4.0 - return { - "q": "_exists_:primaryRepresentation+copyright:CC", - "size": "100", - "from": self.batch_start, - } - else: - # Increment `from` by 100. - self.batch_start += 100 - return { - **prev_query_params, - "from": prev_query_params["from"] + 100, - } + # Return default query params on the first request + # primaryRepresentation contain a image url for each data + # "+" is a query string syntax for must be present + # copyright:CC state Creative Commons Attribution 4.0 + return { + "q": "_exists_:primaryRepresentation+copyright:CC", + "size": "100", + "from": self.batch_start, + } def get_batch_data(self, response_json): # Takes the raw API response from calling `get` on the endpoint, and returns @@ -76,7 +66,8 @@ def get_batch_data(self, response_json): def get_should_continue(self, response_json): # Do not continue if we have exceeded the total amount of data - if self.from_start >= self.total_amount_of_data: + self.batch_start += self.batch_limit + if self.batch_start >= self.total_amount_of_data: logger.info( "The final amount of data has been processed. Halting ingestion." ) From dacc97ec6db7cbbfc381130ca39af0a7364760a4 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Thu, 2 Nov 2023 16:00:43 -0400 Subject: [PATCH 13/30] increase batch limit and comment filesize --- .../providers/provider_api_scripts/auckland_museum.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 6f815761f07..9bbca19f17f 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -44,7 +44,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.delay = 4 self.batch_start = 0 - self.batch_limit = 100 + self.batch_limit = 2000 def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: # Return default query params on the first request @@ -53,7 +53,7 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic # copyright:CC state Creative Commons Attribution 4.0 return { "q": "_exists_:primaryRepresentation+copyright:CC", - "size": "100", + "size": "2000", "from": self.batch_start, } @@ -97,7 +97,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: thumbnail_url = f"{url}?rendering=thumbnail.jpg" license_info = self.DEFAULT_LICENSE_INFO - filesize = self._get_file_info(url) + # filesize = self._get_file_info(url) creator = ( information.get("dc_contributor")[0] @@ -108,6 +108,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: title = ( information.get("appellation").get("Primary Title")[0] if information.get("appellation", []) + and information.get("appellation").get("Primary Title", []) else "" ) meta_data = self._get_meta_data(information) @@ -118,7 +119,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: "url": url, "license_info": license_info, "thumbnail_url": thumbnail_url, - "filesize": filesize, + # "filesize": filesize, "creator": creator, "title": title, "meta_data": meta_data, From 3278cbc939e1e0553ea31448254034cd784af6da Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Thu, 2 Nov 2023 16:01:53 -0400 Subject: [PATCH 14/30] remove filesize --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 9bbca19f17f..58f8401f882 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -97,7 +97,6 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: thumbnail_url = f"{url}?rendering=thumbnail.jpg" license_info = self.DEFAULT_LICENSE_INFO - # filesize = self._get_file_info(url) creator = ( information.get("dc_contributor")[0] @@ -119,7 +118,6 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: "url": url, "license_info": license_info, "thumbnail_url": thumbnail_url, - # "filesize": filesize, "creator": creator, "title": title, "meta_data": meta_data, From 9ed60456f547e16b7c6c327ac213ff0b02d56702 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Thu, 2 Nov 2023 16:16:27 -0400 Subject: [PATCH 15/30] revise test --- .../test_auckland_museum.py | 35 ++----------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index 02c7ba84a2f..257131e60c1 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -1,7 +1,4 @@ from pathlib import Path -from unittest.mock import patch - -import pytest from catalog.tests.dags.providers.provider_api_scripts.resources.json_load import ( make_resource_json_func, @@ -16,43 +13,18 @@ ingester = AucklandMuseumDataIngester() _get_resource_json = make_resource_json_func("aucklandmuseum") -AUDIO_FILE_SIZE = 2484439 - - -@pytest.fixture -def file_size_patch(): - with patch.object(ingester, "_get_file_info") as get_file_info_mock: - get_file_info_mock.return_value = AUDIO_FILE_SIZE - yield - def test_get_next_query_params_default_response(): actual_param = ingester.get_next_query_params(None) expected_param = { "q": "_exists_:primaryRepresentation+copyright:CC", - "size": "100", - "from": ingester.from_start, + "size": "2000", + "from": ingester.batch_start, } assert actual_param == expected_param -def test_get_next_query_params_updates_parameters(): - previous_query_params = { - "q": "_exists_:primaryRepresentation+copyright:CC", - "size": "100", - "from": ingester.from_start, - } - actual_result = ingester.get_next_query_params(previous_query_params) - - expected_result = { - "q": "_exists_:primaryRepresentation+copyright:CC", - "size": "100", - "from": ingester.from_start + 100, - } - assert actual_result == expected_result - - -def test_get_record_data(file_size_patch): +def test_get_record_data(): # High level test for `get_record_data`. One way to test this is to create a # `tests/resources/AucklandMuseum/single_item.json` file containing a sample json # representation of a record from the API under test, call `get_record_data` with @@ -76,7 +48,6 @@ def test_get_record_data(file_size_patch): "url": "http://api.aucklandmuseum.com/id/media/v/214749", "license_info": CC_BY_4_0, "thumbnail_url": "http://api.aucklandmuseum.com/id/media/v/214749?rendering=thumbnail.jpg", - "filesize": AUDIO_FILE_SIZE, "creator": "R. O. Gardner", "title": "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd.", "meta_data": meta_data, From e0994eddcdd1cd8e9f07e4bfc69c83b289b57f63 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 12 Dec 2023 14:10:05 -0500 Subject: [PATCH 16/30] implement POST method in requester and have a body for POST request --- catalog/dags/common/requester.py | 24 +++++++-- .../provider_api_scripts/auckland_museum.py | 49 ++++++++++++++++++- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/catalog/dags/common/requester.py b/catalog/dags/common/requester.py index 6ce81aff176..8677dab2b30 100644 --- a/catalog/dags/common/requester.py +++ b/catalog/dags/common/requester.py @@ -122,20 +122,38 @@ def head(self, url, **kwargs): """ return self._make_request(self.session.head, url, **kwargs) + def post(self, url, params=None, **kwargs): + """ + Make a POST request, and return the response object if it exists. + + Required Arguments: + + url: URL to make the request as a string. + params: Dictionary of query string params. + **kwargs: Optional arguments that will be passed to `requests.get`. + """ + return self._make_request(self.session.post, url, params=params, **kwargs) + def _delay_processing(self): wait = self._DELAY - (time.time() - self._last_request) if wait >= 0: logging.debug(f"Waiting {wait} second(s)") time.sleep(wait) - def get_response_json(self, endpoint, retries=0, query_params=None, **kwargs): + def get_response_json( + self, endpoint, retries=0, query_params=None, requestMethod="get", **kwargs + ): response_json = None - + response = None if retries < 0: logger.error("No retries remaining. Failure.") raise RetriesExceeded("Retries exceeded") - response = self.get(endpoint, params=query_params, **kwargs) + if requestMethod == "get": + response = self.get(endpoint, params=query_params, **kwargs) + elif requestMethod == "post": + response = self.post(endpoint, params=query_params, **kwargs) + if response is not None and response.status_code == 200: try: response_json = response.json() diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 58f8401f882..e7233a3666c 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -17,6 +17,7 @@ /id/media | 10 | 1000 """ import logging +from datetime import datetime, timedelta from common.constants import IMAGE from common.licenses import get_license_info @@ -24,6 +25,13 @@ from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester +def convert_date_format(date_obj) -> str: + date = str(date_obj) + date = date.replace(" ", "T") + date = date + "Z" + return date + + logger = logging.getLogger(__name__) LANDING_URL = ( @@ -45,6 +53,27 @@ def __init__(self, *args, **kwargs): self.delay = 4 self.batch_start = 0 self.batch_limit = 2000 + self.headers = {"Content-Type": "application/json"} + self.date_from = convert_date_format(datetime.now() - timedelta(days=1)) + self.date_to = convert_date_format(datetime.now()) + self.data = { + "query": { + "bool": { + "must": [ + {"wildcard": {"copyright": {"value": "Auckland"}}}, + {"exists": {"field": "primaryRepresentation"}}, + { + "range": { + "lastModifiedOn": { + "from": self.date_from, + "to": self.date_to, + } + } + }, + ] + } + } + } def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: # Return default query params on the first request @@ -52,7 +81,6 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic # "+" is a query string syntax for must be present # copyright:CC state Creative Commons Attribution 4.0 return { - "q": "_exists_:primaryRepresentation+copyright:CC", "size": "2000", "from": self.batch_start, } @@ -147,6 +175,25 @@ def _get_file_info(self, url) -> int | None: filesize = int(resp.headers.get("Content-Length", 0)) return filesize if filesize != 0 else None + def get_response_json( + self, query_params: dict, endpoint: str | None = None, **kwargs + ): + """ + Make the actual API requests needed to ingest a batch. + + This can be overridden in order to support APIs that require multiple requests, + for example. + """ + return self.delayed_requester.get_response_json( + endpoint or self.endpoint, + self.retries, + query_params, + headers=self.headers, + requestMethod="post", + json=self.data, + **kwargs, + ) + def main(): # Allows running ingestion from the CLI without Airflow running for debugging From 771983280026d0645b140d4995f7cbe358b21d9b Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 12 Dec 2023 14:15:50 -0500 Subject: [PATCH 17/30] edit test --- .../dags/providers/provider_api_scripts/test_auckland_museum.py | 1 - 1 file changed, 1 deletion(-) diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index 257131e60c1..94941e4f735 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -17,7 +17,6 @@ def test_get_next_query_params_default_response(): actual_param = ingester.get_next_query_params(None) expected_param = { - "q": "_exists_:primaryRepresentation+copyright:CC", "size": "2000", "from": ingester.batch_start, } From cda5b2d9ad7657aaad49ef20c8d33e06840de1a0 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 12 Dec 2023 14:49:26 -0500 Subject: [PATCH 18/30] include te reo name in provider_details --- catalog/dags/common/loader/provider_details.py | 2 +- .../provider_api_scripts/auckland_museum.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/catalog/dags/common/loader/provider_details.py b/catalog/dags/common/loader/provider_details.py index 8a2a20cbfb9..b758560cac3 100644 --- a/catalog/dags/common/loader/provider_details.py +++ b/catalog/dags/common/loader/provider_details.py @@ -12,7 +12,7 @@ # Default provider names -AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum" +AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum_tamakipaengahira" BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum" CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum" EUROPEANA_DEFAULT_PROVIDER = "europeana" diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index e7233a3666c..1b27f2c4225 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -1,5 +1,5 @@ """ -Content Provider: AucklandMuseum +Content Provider: Auckland War Memorial Museum Tāmaki Paenga Hira ETL Process: Use the API to identify all CC licensed media. @@ -129,14 +129,14 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: creator = ( information.get("dc_contributor")[0] if information.get("dc_contributor", []) - else "" + else None ) + appellation = information.get("appellation", {}) title = ( - information.get("appellation").get("Primary Title")[0] - if information.get("appellation", []) - and information.get("appellation").get("Primary Title", []) - else "" + appellation.get("Primary Title")[0] + if appellation.get("Primary Title") + else None ) meta_data = self._get_meta_data(information) @@ -156,7 +156,7 @@ def _get_meta_data(self, object_json: dict) -> dict | None: department = ( object_json.get("department")[0] if object_json.get("department", []) - else "" + else None ) metadata = { From 3ecf71602fe3302187b90813175955508b6bd326 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Fri, 22 Dec 2023 08:07:41 -0500 Subject: [PATCH 19/30] use self.date --- .../provider_api_scripts/auckland_museum.py | 12 ++++++++++-- catalog/dags/providers/provider_workflows.py | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 1b27f2c4225..95600b9c89b 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -54,8 +54,16 @@ def __init__(self, *args, **kwargs): self.batch_start = 0 self.batch_limit = 2000 self.headers = {"Content-Type": "application/json"} - self.date_from = convert_date_format(datetime.now() - timedelta(days=1)) - self.date_to = convert_date_format(datetime.now()) + if self.date: + date_from = datetime.strptime(self.date, "%Y-%m-%d").date() + self.date_from = str(date_from) + self.date_to = str(date_from + timedelta(days=1)) + logger.info( + f"Start timestamp: {self.date_from}, end timestamp: {self.date_to}" + ) + else: + self.date_from = convert_date_format(datetime.now()) + self.date_to = convert_date_format(datetime.now() + timedelta(days=1)) self.data = { "query": { "bool": { diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py index 2300d6517b0..d0e82b1e8f3 100644 --- a/catalog/dags/providers/provider_workflows.py +++ b/catalog/dags/providers/provider_workflows.py @@ -196,6 +196,8 @@ def __post_init__(self): ProviderWorkflow( start_date=datetime(2023, 10, 1), ingester_class=AucklandMuseumDataIngester, + schedule_string="@daily", + dated=True, ), ProviderWorkflow( start_date=datetime(2020, 1, 1), From 3e23a5c9ee91559e6f4c57c09a24c64879c8d0a3 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Fri, 22 Dec 2023 15:34:44 -0500 Subject: [PATCH 20/30] edit date format --- .../dags/providers/provider_api_scripts/auckland_museum.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 95600b9c89b..baedc3aaedd 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -55,9 +55,9 @@ def __init__(self, *args, **kwargs): self.batch_limit = 2000 self.headers = {"Content-Type": "application/json"} if self.date: - date_from = datetime.strptime(self.date, "%Y-%m-%d").date() - self.date_from = str(date_from) - self.date_to = str(date_from + timedelta(days=1)) + date_from = datetime.strptime(self.date, "%Y-%m-%d") + self.date_from = date_from.isoformat() + self.date_to = (date_from + timedelta(days=1)).isoformat() logger.info( f"Start timestamp: {self.date_from}, end timestamp: {self.date_to}" ) From ce120519a95ab2e3893d4a9131c26195e13a2e20 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Fri, 22 Dec 2023 15:36:57 -0500 Subject: [PATCH 21/30] remove if/else condition for date --- .../provider_api_scripts/auckland_museum.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index baedc3aaedd..1ec7a455e50 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -25,13 +25,6 @@ from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester -def convert_date_format(date_obj) -> str: - date = str(date_obj) - date = date.replace(" ", "T") - date = date + "Z" - return date - - logger = logging.getLogger(__name__) LANDING_URL = ( @@ -54,16 +47,10 @@ def __init__(self, *args, **kwargs): self.batch_start = 0 self.batch_limit = 2000 self.headers = {"Content-Type": "application/json"} - if self.date: - date_from = datetime.strptime(self.date, "%Y-%m-%d") - self.date_from = date_from.isoformat() - self.date_to = (date_from + timedelta(days=1)).isoformat() - logger.info( - f"Start timestamp: {self.date_from}, end timestamp: {self.date_to}" - ) - else: - self.date_from = convert_date_format(datetime.now()) - self.date_to = convert_date_format(datetime.now() + timedelta(days=1)) + date_from = datetime.strptime(self.date, "%Y-%m-%d") + self.date_from = date_from.isoformat() + self.date_to = (date_from + timedelta(days=1)).isoformat() + logger.info(f"Start timestamp: {self.date_from}, end timestamp: {self.date_to}") self.data = { "query": { "bool": { From fdce81b8dd7de01bdf2a804c5fb393ae4845a38f Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Fri, 22 Dec 2023 15:53:02 -0500 Subject: [PATCH 22/30] lint format --- catalog/dags/common/requester.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/catalog/dags/common/requester.py b/catalog/dags/common/requester.py index d10348b8bdc..c417de28738 100644 --- a/catalog/dags/common/requester.py +++ b/catalog/dags/common/requester.py @@ -146,7 +146,9 @@ def _get_json(self, response) -> dict | list | None: except JSONDecodeError as e: logger.warning(f"Could not get response_json.\n{e}") - def get_response_json(self, endpoint, retries=0, query_params=None, requestMethod="get", **kwargs): + def get_response_json( + self, endpoint, retries=0, query_params=None, requestMethod="get", **kwargs + ): response_json = None response = None if retries < 0: From bf5e2bb419412176b46cc17d0c6b23fa173fabac Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Fri, 22 Dec 2023 16:07:08 -0500 Subject: [PATCH 23/30] add date to test --- .../dags/providers/provider_api_scripts/test_auckland_museum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index 94941e4f735..de5c6e86d2e 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -10,7 +10,7 @@ RESOURCES = Path(__file__).parent / "resources/aucklandmuseum" CC_BY_4_0 = get_license_info("https://creativecommons.org/licenses/by/4.0/") -ingester = AucklandMuseumDataIngester() +ingester = AucklandMuseumDataIngester(date="2018-01-15") _get_resource_json = make_resource_json_func("aucklandmuseum") From 1549601aa709c3e47f4b3b99d0cc03d290123010 Mon Sep 17 00:00:00 2001 From: Kenneth Ng <59739226+ngken0995@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:44:13 -0500 Subject: [PATCH 24/30] Update catalog/dags/providers/provider_api_scripts/auckland_museum.py Co-authored-by: Olga Bulat --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 1ec7a455e50..5437e1c9b25 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -105,7 +105,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: # check if _id is empty then foreign_landing_url and # foreign_identifier doesn't exist - if not data.get("_id"): + if not (identifier := data.get("_id")): return None url_parameter = data.get("_id").split("id/")[-1].replace("/", "-") From 41edd9ec3cc8a8ae826d34b2301b6f72bcb44a56 Mon Sep 17 00:00:00 2001 From: Kenneth Ng <59739226+ngken0995@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:44:33 -0500 Subject: [PATCH 25/30] Update catalog/dags/providers/provider_api_scripts/auckland_museum.py Co-authored-by: Olga Bulat --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 5437e1c9b25..7daf25337fc 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -108,7 +108,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: if not (identifier := data.get("_id")): return None - url_parameter = data.get("_id").split("id/")[-1].replace("/", "-") + url_parameter = identifier.split("id/")[-1].replace("/", "-") foreign_landing_url = f"{LANDING_URL}{url_parameter}" foreign_identifier = data.get("_id").split("/")[-1] From f5a6bd86467046ede5115e9e87662530910fd24b Mon Sep 17 00:00:00 2001 From: Kenneth Ng <59739226+ngken0995@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:44:45 -0500 Subject: [PATCH 26/30] Update catalog/dags/providers/provider_api_scripts/auckland_museum.py Co-authored-by: Olga Bulat --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 7daf25337fc..a51576097e9 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -111,7 +111,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: url_parameter = identifier.split("id/")[-1].replace("/", "-") foreign_landing_url = f"{LANDING_URL}{url_parameter}" - foreign_identifier = data.get("_id").split("/")[-1] + foreign_identifier = identifier.split("/")[-1] information = data.get("_source", {}) From 0425fcb807884c9b8e5a64ee2adac14580ba29ac Mon Sep 17 00:00:00 2001 From: Kenneth Ng <59739226+ngken0995@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:45:48 -0500 Subject: [PATCH 27/30] Update catalog/dags/providers/provider_api_scripts/auckland_museum.py Co-authored-by: Olga Bulat --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 1 - 1 file changed, 1 deletion(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index a51576097e9..44167f72334 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -118,7 +118,6 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: if not (url := information.get("primaryRepresentation")): return None - thumbnail_url = f"{url}?rendering=thumbnail.jpg" license_info = self.DEFAULT_LICENSE_INFO creator = ( From 692c89d8232de58cba3c38ac77d034e1e4198113 Mon Sep 17 00:00:00 2001 From: Kenneth Ng <59739226+ngken0995@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:46:07 -0500 Subject: [PATCH 28/30] Update catalog/dags/providers/provider_api_scripts/auckland_museum.py Co-authored-by: Olga Bulat --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 1 - 1 file changed, 1 deletion(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 44167f72334..dff8d13a097 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -139,7 +139,6 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: "foreign_identifier": foreign_identifier, "url": url, "license_info": license_info, - "thumbnail_url": thumbnail_url, "creator": creator, "title": title, "meta_data": meta_data, From 082dafa89507f50f1f4a10ac697397b34dd49bb1 Mon Sep 17 00:00:00 2001 From: Kenneth Ng <59739226+ngken0995@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:55:29 -0500 Subject: [PATCH 29/30] Update catalog/dags/providers/provider_api_scripts/auckland_museum.py Co-authored-by: Olga Bulat --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 1 + 1 file changed, 1 insertion(+) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index dff8d13a097..769a69168da 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -144,6 +144,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: "meta_data": meta_data, } + @staticmethod def _get_meta_data(self, object_json: dict) -> dict | None: geopos = object_json.get("geopos")[0] if object_json.get("geopos", []) else "" department = ( From 44daa714abf5bec8e0fb4984ac0b616310b43697 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Wed, 3 Jan 2024 10:18:43 -0500 Subject: [PATCH 30/30] using static method --- catalog/dags/providers/provider_api_scripts/auckland_museum.py | 2 +- .../dags/providers/provider_api_scripts/test_auckland_museum.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/auckland_museum.py b/catalog/dags/providers/provider_api_scripts/auckland_museum.py index 769a69168da..87035537ae4 100644 --- a/catalog/dags/providers/provider_api_scripts/auckland_museum.py +++ b/catalog/dags/providers/provider_api_scripts/auckland_museum.py @@ -145,7 +145,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: } @staticmethod - def _get_meta_data(self, object_json: dict) -> dict | None: + def _get_meta_data(object_json: dict) -> dict | None: geopos = object_json.get("geopos")[0] if object_json.get("geopos", []) else "" department = ( object_json.get("department")[0] diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py index de5c6e86d2e..cae12187521 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_auckland_museum.py @@ -46,7 +46,6 @@ def test_get_record_data(): "foreign_identifier": "691102", "url": "http://api.aucklandmuseum.com/id/media/v/214749", "license_info": CC_BY_4_0, - "thumbnail_url": "http://api.aucklandmuseum.com/id/media/v/214749?rendering=thumbnail.jpg", "creator": "R. O. Gardner", "title": "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd.", "meta_data": meta_data,