This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Update PhyloPic DAG to use API v2 #1060

Merged
merged 17 commits on Mar 30, 2023
6 changes: 3 additions & 3 deletions DAGs.md
@@ -78,7 +78,7 @@ The following are DAGs grouped by their primary tag:
| `museum_victoria_workflow` | `@monthly` | `False` | image |
| [`nappy_workflow`](#nappy_workflow) | `@monthly` | `False` | image |
| `nypl_workflow` | `@monthly` | `False` | image |
- | [`phylopic_workflow`](#phylopic_workflow) | `@daily` | `True` | image |
+ | [`phylopic_workflow`](#phylopic_workflow) | `@weekly` | `False` | image |
| [`rawpixel_workflow`](#rawpixel_workflow) | `@monthly` | `False` | image |
| [`science_museum_workflow`](#science_museum_workflow) | `@monthly` | `False` | image |
| [`smithsonian_workflow`](#smithsonian_workflow) | `@weekly` | `False` | image |
@@ -461,7 +461,7 @@ ETL Process: Use the API to identify all CC licensed images.

Output: TSV file containing the images and their respective meta-data.

- Notes: http://phylopic.org/api/ No rate limit specified.
+ Notes: http://api-docs.phylopic.org/v2/ No rate limit specified.

## `phylopic_workflow`

@@ -471,7 +471,7 @@ ETL Process: Use the API to identify all CC licensed images.

Output: TSV file containing the images and their respective meta-data.

- Notes: http://phylopic.org/api/ No rate limit specified.
+ Notes: http://api-docs.phylopic.org/v2/ No rate limit specified.

## `pr_review_reminders`

240 changes: 75 additions & 165 deletions openverse_catalog/dags/providers/provider_api_scripts/phylopic.py
@@ -6,13 +6,11 @@
Output: TSV file containing the images and their respective meta-data.

- Notes: http://phylopic.org/api/
+ Notes: http://api-docs.phylopic.org/v2/
No rate limit specified.
"""

- import argparse
  import logging
- from datetime import date, timedelta

from common import constants
from common.licenses import get_license_info
@@ -25,200 +23,112 @@

class PhylopicDataIngester(ProviderDataIngester):
delay = 5
-     host = "http://phylopic.org"
-     # Use "base_endpoint" since class's "endpoint" parameter gets defined as a property
-     base_endpoint = f"{host}/api/a/image"
+     host = "https://www.phylopic.org"
+     endpoint = "https://api.phylopic.org/images"
providers = {constants.IMAGE: prov.PHYLOPIC_DEFAULT_PROVIDER}
batch_limit = 25

-     @property
-     def endpoint(self) -> str:
-         """
-         Due to the way this DAG is run (via a dated range), **only one request is ever
-         issued** to retrieve all updated IDs. As such, it will only
-         return one endpoint.
-         """
-         list_endpoint = f"{self.base_endpoint}/list"
-         # Process for a given date
-         end_date = (date.fromisoformat(self.date) + timedelta(days=1)).isoformat()
-         # Get a list of objects uploaded/updated within a date range
-         # http://phylopic.org/api/#method-image-time-range
-         endpoint = f"{list_endpoint}/modified/{self.date}/{end_date}"
-         logger.info(f"Constructed endpoint: {endpoint}")
-         return endpoint
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.current_page = 1
+         self.total_pages = 0
+         self.build_param = 0

+     def ingest_records(self):
+         self._get_initial_query_params()
+         super().ingest_records()
+ 
+     def _get_initial_query_params(self) -> None:
+         """Get the required `build` param from the API and set the total pages."""
+         resp = self.get_response_json(query_params={})
+         if not resp:
+             raise Exception("No response from Phylopic API.")
Contributor:
Maybe this might be a more specific exception?

Suggested change:
-             raise Exception("No response from Phylopic API.")
+             raise ValueError("No response from Phylopic API.")

+         self.build_param = resp.get("build")
+         self.total_pages = resp.get("totalPages")
+         logger.info(
+             f"Total items to fetch: {resp.get('totalItems')}. "
+             f"Total pages: {self.total_pages}."
+         )

      def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
-         """Noop since the range is determined by the endpoint property."""
-         return {}
+         if prev_query_params is not None:
+             self.current_page += 1
 
-     def get_should_continue(self, response_json):
-         """
-         Override for upstream "return True". This DAG will only ever make 1 query so
-         they should not continue to loop.
-         """
-         return False
+         return {
+             "build": self.build_param,
+             "page": self.current_page - 1,  # PhyloPic pages are 0-indexed.
+             "embed_items": "true",
+         }

-     @staticmethod
-     def _get_response_data(response_json) -> dict | list | None:
-         """Intermediate method for pulling out results from a Phylopic API request."""
-         if response_json and response_json.get("success") is True:
-             return response_json.get("result")
+     def get_should_continue(self, response_json):
+         logger.debug(f"Processing page {self.current_page} of {self.total_pages}.")
+         return self.current_page < self.total_pages
Contributor:
Suggested change:
-         return self.current_page < self.total_pages
+         return self.current_page <= self.total_pages

This DAG ran really well until the last page, when the last requested page returned 404:

[2023-03-27, 15:45:24 +03] {requester.py:78} WARNING - Unable to request URL: https://api.phylopic.org/images?build=196&page=146&embed_items=true  Status code: 404
[2023-03-27, 15:45:24 +03] {requester.py:148} WARNING - Bad response_json:  None
[2023-03-27, 15:45:24 +03] {requester.py:149} WARNING - Retrying:
_get_response_json(
    https://api.phylopic.org/images,
    {'build': 196, 'page': 146, 'embed_items': 'true'},
    retries=-1)
[2023-03-27, 15:45:24 +03] {requester.py:134} ERROR - No retries remaining.  Failure.
[2023-03-27, 15:45:24 +03] {media.py:230} INFO - Writing 74 lines from buffer to disk.
[2023-03-27, 15:45:24 +03] {provider_data_ingester.py:501} INFO - Committed 6974 records
[2023-03-27, 15:45:24 +03] {taskinstance.py:1768} ERROR - Task failed with exception
providers.provider_api_scripts.provider_data_ingester.IngestionError: Retries exceeded
query_params: {"build": 196, "page": 146, "embed_items": "true"}

The initial_request.json (tests/dags/providers/provider_api_scripts/resources/phylopic/initial_request.json) seems to suggest that we should use <= or even == here, because even though the totalPages is 145, the lastPage is 144:

  "lastPage": {
    "href": "/images?build=194&page=144"
  },
  "totalPages": 145

Member Author:
Gotcha. I noticed it's trying to request page 146, which is actually page 147 because pages start at 0, from a total of 146. This requires changing the initial page to 1; we then pass current_page - 1 as the query parameter and the < condition is maintained, so we don't try to get a page that doesn't exist and the DAG doesn't fail.
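The off-by-one discussed above can be modeled standalone. This is an illustrative sketch of the final pagination scheme (1-indexed counter, 0-indexed `page` parameter), not code from the PR:

```python
def pages_requested(total_pages: int) -> list[int]:
    """Model the final scheme: track a 1-indexed counter but send a
    0-indexed `page` query parameter (current_page - 1)."""
    current_page = 1
    requested = [current_page - 1]  # the initial request asks for page 0
    while current_page < total_pages:  # the `<` guard kept in the PR
        current_page += 1
        requested.append(current_page - 1)
    return requested

# With totalPages = 145 the valid 0-indexed pages are 0..144, and the
# scheme never requests the nonexistent page 145.
print(pages_requested(145)[-1])  # 144
```

Starting the counter at 1 and subtracting 1 in the query keeps the `<` comparison while matching the API's `lastPage` of 144.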


      def get_batch_data(self, response_json):
-         """
-         Process the returned IDs.
- 
-         The Phylopic API returns only lists of IDs in the initial request. We must take
-         this request and iterate through all the IDs to get the metadata for each one.
-         """
-         data = self._get_response_data(response_json)
- 
-         if not data:
-             logger.warning("No content available!")
-             return None
- 
-         return data
+         return response_json.get("_embedded", {}).get("items", [])
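For illustration, a minimal sketch of how the new `get_batch_data` reads the v2 response. The sample payload below is invented; only the field names (`_embedded`, `items`, `build`, `totalPages`) are taken from the diff and log output:

```python
sample_response = {
    "build": 196,
    "totalPages": 145,
    "_embedded": {"items": [{"uuid": "abc"}, {"uuid": "def"}]},
}

# Same extraction as the new get_batch_data: default to an empty list
# when "_embedded" or "items" is missing.
items = sample_response.get("_embedded", {}).get("items", [])
print([item["uuid"] for item in items])  # ['abc', 'def']
```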

-     @staticmethod
-     def _image_url(uid: str) -> str:
-         return f"{PhylopicDataIngester.host}/image/{uid}"
+     def _get_creator(self, data: dict) -> tuple[str | None, str | None]:
+         creator_name = data.get("title")
+         href = data.get("href")
+         creator_url = self.host + href if href else None
+         return creator_name, creator_url

      @staticmethod
-     def _get_image_info(
-         result: dict, uid: str
-     ) -> tuple[str | None, int | None, int | None]:
-         img_url = None
-         width = None
-         height = None
- 
-         image_info = result.get("pngFiles")
-         if image_info:
-             images = list(
-                 filter(lambda x: (int(str(x.get("width", "0"))) >= 257), image_info)
-             )
-             if images:
-                 image = sorted(images, key=lambda x: x["width"], reverse=True)[0]
-                 img_url = image.get("url")
-                 if not img_url:
-                     logging.warning(
-                         "Image not detected in url: "
-                         f"{PhylopicDataIngester._image_url(uid)}"
-                     )
-                 else:
-                     img_url = f"{PhylopicDataIngester.host}{img_url}"
-                     width = image.get("width")
-                     height = image.get("height")
- 
-         return img_url, width, height
+     def _get_image_sizes(data: dict) -> tuple[int | None, int | None]:
+         width, height = None, None
+         sizes = data.get("sourceFile", {}).get("sizes")
+         if sizes and "x" in sizes:
+             width, height = sizes.split("x")
+             # SVG sizes include decimal points so we get an approximation.
+             width, height = int(float(width)), int(float(height))
+         return width, height

-     @staticmethod
-     def _get_taxa_details(result: dict) -> tuple[list[str] | None, str]:
-         taxa = result.get("taxa", [])
-         taxa_list = None
-         title = ""
-         if taxa:
-             taxa = [
-                 _.get("canonicalName")
-                 for _ in taxa
-                 if _.get("canonicalName") is not None
-             ]
-             taxa_list = [_.get("string", "") for _ in taxa]
- 
-         if taxa_list:
-             title = taxa_list[0]
- 
-         return taxa_list, title
+     def get_record_data(self, data: dict) -> dict | list[dict] | None:
+         """
+         Get the data for a single image record.
 
-     @staticmethod
-     def _get_creator_details(result: dict) -> tuple[str | None, str | None, str | None]:
-         credit_line = None
-         pub_date = None
-         creator = None
-         submitter = result.get("submitter", {})
-         first_name = submitter.get("firstName")
-         last_name = submitter.get("lastName")
-         if first_name and last_name:
-             creator = f"{first_name} {last_name}".strip()
- 
-         if credit := result.get("credit"):
-             credit_line = credit.strip()
-         pub_date = result.get("submitted").strip()
- 
-         return creator, credit_line, pub_date
+         TODO: Adapt `url` and `creator_url` to avoid redirects.
+         """

-     def get_record_data(self, data: dict) -> dict | list[dict] | None:
-         uid = data.get("uid")
+         uid = data.get("uuid")
          if not uid:
              return
+         logger.debug(f"Processing UUID: {uid}")
-         params = {
-             "options": " ".join(
-                 [
-                     "credit",
-                     "licenseURL",
-                     "pngFiles",
-                     "submitted",
-                     "submitter",
-                     "taxa",
-                     "canonicalName",
-                     "string",
-                     "firstName",
-                     "lastName",
-                 ]
-             )
-         }
-         endpoint = f"{self.base_endpoint}/{uid}"
-         response_json = self.get_response_json(params, endpoint)
-         result = self._get_response_data(response_json)
-         if not result:
-             return None

-         meta_data = {}
-         uid = result.get("uid")
-         license_url = result.get("licenseURL")
- 
-         img_url, width, height = self._get_image_info(result, uid)
- 
-         if img_url is None:
+         data = data.get("_links", {})
+         license_url = data.get("license", {}).get("href")
+         img_url = data.get("sourceFile", {}).get("href")
+         foreign_url = data.get("self", {}).get("href")
+         if not license_url or not img_url or not foreign_url:
              return None
 
-         meta_data["taxa"], title = self._get_taxa_details(result)
- 
-         foreign_url = self._image_url(uid)
+         foreign_url = self.host + foreign_url

-         (
-             creator,
-             meta_data["credit_line"],
-             meta_data["pub_date"],
-         ) = self._get_creator_details(result)
+         title = data.get("self", {}).get("title")
+         creator, creator_url = self._get_creator(data.get("contributor", {}))
+         width, height = self._get_image_sizes(data)
 
          return {
-             "license_info": get_license_info(license_url=license_url),
              "foreign_identifier": uid,
              "foreign_landing_url": foreign_url,
              "image_url": img_url,
+             "license_info": get_license_info(license_url=license_url),
+             "title": title,
+             "creator": creator,
+             "creator_url": creator_url,
              "width": width,
              "height": height,
-             "creator": creator,
-             "title": title,
-             "meta_data": meta_data,
+             # TODO: Evaluate whether to include upstream thumbnails.
+             # Sizes available: 192x192, 128x128, 64x64.
Contributor:
I tried the other images we get in sample_record.json: while the SVGs are all crisp at any resolution, the raster image of an acceptable size (https://images.phylopic.org/images/5b1e88b5-159d-495d-b8cb-04f9e28d2f02/raster/512x480.png) looks much worse.

I wonder if we should handle SVGs differently: they can be basically any size, their width and height don't mean much beyond the aspect ratio (which of course is very important for the frontend rendering), and their thumbnails will probably be larger in size and worse in quality than the original image. Not for this PR specifically, of course.

Member Author:
Yeah, the raster images are an option but look really blurry; not sure it's worth including those for thumbnails.

+             # "thumbnail": thumbnail,
+             # TODO: Evaluate whether to include nodes' titles as tags.
+             # "tags": tags,
          }


- def main(date: str = None):
+ def main():
      logger.info("Begin: Phylopic provider script")
-     ingester = PhylopicDataIngester(date=date)
+     ingester = PhylopicDataIngester()
      ingester.ingest_records()
 
 
  if __name__ == "__main__":
-     parser = argparse.ArgumentParser(description="PhyloPic API Job", add_help=True)
-     parser.add_argument(
-         "--date",
-         default=None,
-         help="Identify all images updated on a particular date (YYYY-MM-DD).",
-     )
- 
-     args = parser.parse_args()
- 
-     main(args.date)
+     main()
3 changes: 1 addition & 2 deletions openverse_catalog/dags/providers/provider_workflows.py
@@ -257,8 +257,7 @@ def __post_init__(self):
ProviderWorkflow(
ingester_class=PhylopicDataIngester,
start_date=datetime(2011, 2, 7),
-         schedule_string="@daily",
-         dated=True,
+         schedule_string="@weekly",
pull_timeout=timedelta(hours=12),
),
ProviderWorkflow(

This file was deleted.

This file was deleted.
