-
Notifications
You must be signed in to change notification settings - Fork 54
Update PhyloPic DAG to use API v2 #1060
Changes from all commits
68afd3d
46d503e
8c7a2b3
36fdb1d
e19989c
eb886ed
59aa069
4f998c1
714b65b
8e5d1c6
22fc64e
9d10205
2873b50
9a0cad9
6989c4d
8a04833
6adde6d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -6,13 +6,11 @@ | |||||
Output: TSV file containing the image, | ||||||
their respective meta-data. | ||||||
|
||||||
Notes: http://phylopic.org/api/ | ||||||
Notes: http://api-docs.phylopic.org/v2/ | ||||||
No rate limit specified. | ||||||
""" | ||||||
|
||||||
import argparse | ||||||
import logging | ||||||
from datetime import date, timedelta | ||||||
|
||||||
from common import constants | ||||||
from common.licenses import get_license_info | ||||||
|
@@ -25,200 +23,112 @@ | |||||
|
||||||
class PhylopicDataIngester(ProviderDataIngester): | ||||||
delay = 5 | ||||||
host = "http://phylopic.org" | ||||||
# Use "base_endpoint" since class's "endpoint" parameter gets defined as a property | ||||||
base_endpoint = f"{host}/api/a/image" | ||||||
host = "https://www.phylopic.org" | ||||||
endpoint = "https://api.phylopic.org/images" | ||||||
providers = {constants.IMAGE: prov.PHYLOPIC_DEFAULT_PROVIDER} | ||||||
batch_limit = 25 | ||||||
|
||||||
@property | ||||||
def endpoint(self) -> str: | ||||||
""" | ||||||
Due to the way this DAG is run (via a dated range), **only one request is ever | ||||||
issued** to retrieve all updated IDs. As such, it will only | ||||||
return one endpoint. | ||||||
""" | ||||||
list_endpoint = f"{self.base_endpoint}/list" | ||||||
# Process for a given date | ||||||
end_date = (date.fromisoformat(self.date) + timedelta(days=1)).isoformat() | ||||||
# Get a list of objects uploaded/updated within a date range | ||||||
# http://phylopic.org/api/#method-image-time-range | ||||||
endpoint = f"{list_endpoint}/modified/{self.date}/{end_date}" | ||||||
logger.info(f"Constructed endpoint: {endpoint}") | ||||||
return endpoint | ||||||
def __init__(self, *args, **kwargs): | ||||||
super().__init__(*args, **kwargs) | ||||||
self.current_page = 1 | ||||||
self.total_pages = 0 | ||||||
self.build_param = 0 | ||||||
|
||||||
def ingest_records(self): | ||||||
self._get_initial_query_params() | ||||||
super().ingest_records() | ||||||
|
||||||
def _get_initial_query_params(self) -> None: | ||||||
"""Get the required `build` param from the API and set the total pages.""" | ||||||
resp = self.get_response_json(query_params={}) | ||||||
if not resp: | ||||||
raise Exception("No response from Phylopic API.") | ||||||
self.build_param = resp.get("build") | ||||||
self.total_pages = resp.get("totalPages") | ||||||
logger.info( | ||||||
f"Total items to fetch: {resp.get('totalItems')}. " | ||||||
f"Total pages: {self.total_pages}." | ||||||
) | ||||||
|
||||||
def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: | ||||||
"""Noop since the range is determined by the endpoint property.""" | ||||||
return {} | ||||||
if prev_query_params is not None: | ||||||
self.current_page += 1 | ||||||
|
||||||
def get_should_continue(self, response_json): | ||||||
""" | ||||||
Override for upstream "return True". This DAG will only ever make 1 query so | ||||||
they should not continue to loop. | ||||||
""" | ||||||
return False | ||||||
return { | ||||||
"build": self.build_param, | ||||||
"page": self.current_page - 1, # PhyloPic pages are 0-indexed. | ||||||
"embed_items": "true", | ||||||
} | ||||||
|
||||||
@staticmethod | ||||||
def _get_response_data(response_json) -> dict | list | None: | ||||||
"""Intermediate method for pulling out results from a Phylopic API request.""" | ||||||
if response_json and response_json.get("success") is True: | ||||||
return response_json.get("result") | ||||||
def get_should_continue(self, response_json): | ||||||
logger.debug(f"Processing page {self.current_page} of {self.total_pages}.") | ||||||
return self.current_page < self.total_pages | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
This DAG ran really well until the last page, when the last requested page returned 404:
The
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gotcha. I noticed it's trying to request the page 146, which is actually the page 147 because it starts at |
||||||
|
||||||
def get_batch_data(self, response_json): | ||||||
""" | ||||||
Process the returned IDs. | ||||||
|
||||||
The Phylopic API returns only lists of IDs in the initial request. We must take | ||||||
this request and iterate through all the IDs to get the metadata for each one. | ||||||
""" | ||||||
data = self._get_response_data(response_json) | ||||||
|
||||||
if not data: | ||||||
logger.warning("No content available!") | ||||||
return None | ||||||
|
||||||
return data | ||||||
return response_json.get("_embedded", {}).get("items", []) | ||||||
|
||||||
@staticmethod | ||||||
def _image_url(uid: str) -> str: | ||||||
return f"{PhylopicDataIngester.host}/image/{uid}" | ||||||
def _get_creator(self, data: dict) -> tuple[str | None, str | None]: | ||||||
creator_name = data.get("title") | ||||||
href = data.get("href") | ||||||
creator_url = self.host + href if href else None | ||||||
return creator_name, creator_url | ||||||
|
||||||
@staticmethod | ||||||
def _get_image_info( | ||||||
result: dict, uid: str | ||||||
) -> tuple[str | None, int | None, int | None]: | ||||||
img_url = None | ||||||
width = None | ||||||
height = None | ||||||
|
||||||
image_info = result.get("pngFiles") | ||||||
if image_info: | ||||||
images = list( | ||||||
filter(lambda x: (int(str(x.get("width", "0"))) >= 257), image_info) | ||||||
) | ||||||
if images: | ||||||
image = sorted(images, key=lambda x: x["width"], reverse=True)[0] | ||||||
img_url = image.get("url") | ||||||
if not img_url: | ||||||
logging.warning( | ||||||
"Image not detected in url: " | ||||||
f"{PhylopicDataIngester._image_url(uid)}" | ||||||
) | ||||||
else: | ||||||
img_url = f"{PhylopicDataIngester.host}{img_url}" | ||||||
width = image.get("width") | ||||||
height = image.get("height") | ||||||
|
||||||
return img_url, width, height | ||||||
def _get_image_sizes(data: dict) -> tuple[int | None, int | None]: | ||||||
width, height = None, None | ||||||
sizes = data.get("sourceFile", {}).get("sizes") | ||||||
if sizes and "x" in sizes: | ||||||
width, height = sizes.split("x") | ||||||
# SVG sizes include decimal points so we get an approximation. | ||||||
width, height = int(float(width)), int(float(height)) | ||||||
return width, height | ||||||
|
||||||
@staticmethod | ||||||
def _get_taxa_details(result: dict) -> tuple[list[str] | None, str]: | ||||||
taxa = result.get("taxa", []) | ||||||
taxa_list = None | ||||||
title = "" | ||||||
if taxa: | ||||||
taxa = [ | ||||||
_.get("canonicalName") | ||||||
for _ in taxa | ||||||
if _.get("canonicalName") is not None | ||||||
] | ||||||
taxa_list = [_.get("string", "") for _ in taxa] | ||||||
|
||||||
if taxa_list: | ||||||
title = taxa_list[0] | ||||||
|
||||||
return taxa_list, title | ||||||
def get_record_data(self, data: dict) -> dict | list[dict] | None: | ||||||
""" | ||||||
Get the data for a single image record. | ||||||
|
||||||
@staticmethod | ||||||
def _get_creator_details(result: dict) -> tuple[str | None, str | None, str | None]: | ||||||
credit_line = None | ||||||
pub_date = None | ||||||
creator = None | ||||||
submitter = result.get("submitter", {}) | ||||||
first_name = submitter.get("firstName") | ||||||
last_name = submitter.get("lastName") | ||||||
if first_name and last_name: | ||||||
creator = f"{first_name} {last_name}".strip() | ||||||
|
||||||
if credit := result.get("credit"): | ||||||
credit_line = credit.strip() | ||||||
pub_date = result.get("submitted").strip() | ||||||
|
||||||
return creator, credit_line, pub_date | ||||||
TODO: Adapt `url` and `creator_url` to avoid redirects. | ||||||
""" | ||||||
|
||||||
def get_record_data(self, data: dict) -> dict | list[dict] | None: | ||||||
uid = data.get("uid") | ||||||
uid = data.get("uuid") | ||||||
if not uid: | ||||||
return | ||||||
logger.debug(f"Processing UUID: {uid}") | ||||||
params = { | ||||||
"options": " ".join( | ||||||
[ | ||||||
"credit", | ||||||
"licenseURL", | ||||||
"pngFiles", | ||||||
"submitted", | ||||||
"submitter", | ||||||
"taxa", | ||||||
"canonicalName", | ||||||
"string", | ||||||
"firstName", | ||||||
"lastName", | ||||||
] | ||||||
) | ||||||
} | ||||||
endpoint = f"{self.base_endpoint}/{uid}" | ||||||
response_json = self.get_response_json(params, endpoint) | ||||||
result = self._get_response_data(response_json) | ||||||
if not result: | ||||||
return None | ||||||
|
||||||
meta_data = {} | ||||||
uid = result.get("uid") | ||||||
license_url = result.get("licenseURL") | ||||||
|
||||||
img_url, width, height = self._get_image_info(result, uid) | ||||||
|
||||||
if img_url is None: | ||||||
data = data.get("_links", {}) | ||||||
license_url = data.get("license", {}).get("href") | ||||||
img_url = data.get("sourceFile", {}).get("href") | ||||||
foreign_url = data.get("self", {}).get("href") | ||||||
if not license_url or not img_url or not foreign_url: | ||||||
AetherUnbound marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
return None | ||||||
|
||||||
meta_data["taxa"], title = self._get_taxa_details(result) | ||||||
|
||||||
foreign_url = self._image_url(uid) | ||||||
foreign_url = self.host + foreign_url | ||||||
|
||||||
( | ||||||
creator, | ||||||
meta_data["credit_line"], | ||||||
meta_data["pub_date"], | ||||||
) = self._get_creator_details(result) | ||||||
title = data.get("self", {}).get("title") | ||||||
creator, creator_url = self._get_creator(data.get("contributor", {})) | ||||||
width, height = self._get_image_sizes(data) | ||||||
|
||||||
return { | ||||||
"license_info": get_license_info(license_url=license_url), | ||||||
"foreign_identifier": uid, | ||||||
"foreign_landing_url": foreign_url, | ||||||
"image_url": img_url, | ||||||
"license_info": get_license_info(license_url=license_url), | ||||||
"title": title, | ||||||
"creator": creator, | ||||||
"creator_url": creator_url, | ||||||
"width": width, | ||||||
"height": height, | ||||||
"creator": creator, | ||||||
"title": title, | ||||||
"meta_data": meta_data, | ||||||
# TODO: Evaluate whether to include upstream thumbnails. | ||||||
# Sizes available: 192x192, 128x128, 64x64. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried the other images we get in the I wonder if we should handle SVGs differently: they can be basically any size, their width and height don't mean much beyond the aspect ratio (which of course is very important for the frontend rendering), and their thumbnails will probably be larger in size and worse in quality than the original image. Not for this PR specifically, of course. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, the raster images are an option but look really blurry, not sure if it's worth including those for thumbnails. |
||||||
# "thumbnail": thumbnail, | ||||||
# TODO: Evaluate whether to include nodes' titles as tags. | ||||||
# "tags": tags, | ||||||
krysal marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
} | ||||||
|
||||||
|
||||||
def main(date: str = None): | ||||||
def main(): | ||||||
logger.info("Begin: Phylopic provider script") | ||||||
ingester = PhylopicDataIngester(date=date) | ||||||
ingester = PhylopicDataIngester() | ||||||
ingester.ingest_records() | ||||||
|
||||||
|
||||||
if __name__ == "__main__": | ||||||
parser = argparse.ArgumentParser(description="PhyloPic API Job", add_help=True) | ||||||
parser.add_argument( | ||||||
"--date", | ||||||
default=None, | ||||||
help="Identify all images updated on a particular date (YYYY-MM-DD).", | ||||||
) | ||||||
|
||||||
args = parser.parse_args() | ||||||
|
||||||
main(args.date) | ||||||
main() |
This file was deleted.
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe this might be a more specific exception?