Skip to content

Commit

Permalink
Merge pull request #1336 from depositar/ckan
Browse files Browse the repository at this point in the history
[MRG] Add CKAN content provider
  • Loading branch information
yuvipanda authored Jun 28, 2024
2 parents 09f3d53 + a390013 commit 9f15678
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 1 deletion.
4 changes: 3 additions & 1 deletion docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ where ``<source-repository>`` is:

* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
* a Zenodo DOI (``10.5281/zenodo.1211089``),
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``),
* a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or
* a path to a local directory (``a/local/directory``)

of the source repository you want to build.
Expand Down Expand Up @@ -136,3 +137,4 @@ Command line API

.. _Pytudes: https://github.com/norvig/pytudes
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
.. _CKAN: https://ckan.org
1 change: 1 addition & 0 deletions repo2docker/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def _default_log_level(self):
contentproviders.Dataverse,
contentproviders.Hydroshare,
contentproviders.Swhid,
contentproviders.CKAN,
contentproviders.Mercurial,
contentproviders.Git,
],
Expand Down
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .base import Local
from .ckan import CKAN
from .dataverse import Dataverse
from .figshare import Figshare
from .git import Git
Expand Down
132 changes: 132 additions & 0 deletions repo2docker/contentproviders/ckan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from datetime import datetime, timedelta, timezone
from os import path
from urllib.parse import parse_qs, urlencode, urlparse

from requests import Session

from .. import __version__
from .base import ContentProvider


class CKAN(ContentProvider):
    """Provide contents of a remote CKAN dataset.

    ``detect`` recognises URLs whose path contains ``/dataset/<id>`` on a
    host that answers the CKAN ``status_show`` action with HTTP 200, and
    ``fetch`` downloads every resource listed for that dataset (or for one
    specific activity of it) into the output directory.
    """

    def __init__(self):
        super().__init__()
        # One shared session so every request carries the same user agent.
        self.session = Session()
        self.session.headers.update(
            {
                "user-agent": f"repo2docker {__version__}",
            }
        )

    def _fetch_version(self, api_url):
        """Fetch dataset modified date and convert to epoch.

        Borrowed from the Hydroshare provider.

        Reads ``self.dataset_id`` and queries ``package_show`` below
        ``api_url``. Returns the ``metadata_modified`` timestamp as a string
        of whole seconds since the epoch, interpreting the timestamp as UTC.

        Raises whatever ``raise_for_status`` raises for a non-success HTTP
        answer, and ``ValueError`` if the timestamp matches neither supported
        format.
        """
        package_show_url = f"{api_url}package_show?id={self.dataset_id}"
        resp = self.urlopen(package_show_url)
        # Fail with the HTTP error rather than a confusing KeyError below.
        resp.raise_for_status()
        date = resp.json()["result"]["metadata_modified"]
        try:
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            # Timestamps without fractional seconds (e.g. what
            # datetime.isoformat() emits when microseconds are zero).
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def _request(self, url, **kwargs):
        """Issue a GET for ``url`` through the shared session."""
        return self.session.get(url, **kwargs)

    urlopen = _request

    def detect(self, source, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a CKAN dataset.

        Returns a spec dict with the keys ``dataset_id``, ``activity_id``,
        ``api_url`` and ``version``, or ``None`` when ``source`` is not a URL
        with a ``/dataset/<id>`` path or the host does not answer
        ``status_show`` with HTTP 200. ``ref`` and ``extra_args`` are unused.

        On success ``self.dataset_id`` and ``self.version`` are set as a side
        effect; ``content_id`` relies on both.
        """
        parsed_url = urlparse(source)
        if not parsed_url.netloc:
            return None

        if "/dataset/" not in parsed_url.path:
            # Not actually a dataset
            return None

        # CKAN may be under a URL prefix, and we should accommodate that.
        # Split on the first marker only, so a later "/dataset/" in the path
        # cannot break the unpacking.
        url_prefix, _, dataset_url = parsed_url.path.partition("/dataset/")

        dataset_url_parts = dataset_url.split("/")
        dataset_id = dataset_url_parts[0]
        if not dataset_id:
            # ".../dataset/" without an id cannot identify a dataset
            return None
        self.dataset_id = dataset_id

        # Drop params, query and fragment: endpoint names are appended to
        # this string, so anything after the path would corrupt the URL.
        api_url = parsed_url._replace(
            path=f"{url_prefix}/api/3/action/", params="", query="", fragment=""
        ).geturl()

        status_show_url = f"{api_url}status_show"
        resp = self.urlopen(status_show_url)
        if resp.status_code != 200:
            return None

        # Activity ID may be present either as a query parameter, activity_id
        # or as part of the URL, under `/history/<activity-id>`. If `/history/`
        # is present, that takes precedence over `activity_id`
        activity_id = None
        # Skip the first segment: it is the dataset id, which may itself be
        # the literal string "history".
        trailing_parts = dataset_url_parts[1:]
        if "history" in trailing_parts:
            id_position = trailing_parts.index("history") + 1
            if id_position < len(trailing_parts):
                # An empty segment (URL ending in "/history/") is no id.
                activity_id = trailing_parts[id_position] or None
        if activity_id is None:
            query_values = parse_qs(parsed_url.query).get("activity_id")
            if query_values:
                activity_id = query_values[0]

        self.version = self._fetch_version(api_url)
        return {
            "dataset_id": self.dataset_id,
            "activity_id": activity_id,
            "api_url": api_url,
            "version": self.version,
        }

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch a CKAN dataset.

        ``spec`` is the dict returned by ``detect``. Every resource with a
        non-empty URL is downloaded into ``output_dir``, named after the last
        path segment of its URL (or after the resource id when that segment
        is empty). ``yield_output`` is accepted but not used.

        This is a generator yielding progress messages; nothing is
        downloaded until it is iterated. HTTP failures surface through
        ``raise_for_status``.
        """
        dataset_id = spec["dataset_id"]
        activity_id = spec["activity_id"]

        yield f"Fetching CKAN dataset {dataset_id}.\n"

        # handle the activities
        if activity_id:
            fetch_url = f"{spec['api_url']}activity_data_show?" + urlencode(
                {"id": activity_id, "object_type": "package"}
            )
        else:
            fetch_url = f"{spec['api_url']}package_show?" + urlencode(
                {"id": dataset_id}
            )

        resp = self.urlopen(
            fetch_url,
            headers={"accept": "application/json"},
        )
        # Surface API errors here instead of as a KeyError on "result".
        resp.raise_for_status()

        dataset = resp.json()

        yield "Fetching CKAN resources.\n"

        resources = dataset["result"]["resources"]

        for resource in resources:
            file_url = resource["url"]
            if file_url == "":
                continue
            fname = file_url.rsplit("/", maxsplit=1)[-1]
            if fname == "":
                fname = resource["id"]

            yield f"Requesting {file_url}\n"
            # Streamed responses hold their connection until closed, so use
            # the response as a context manager.
            with self._request(file_url, stream=True) as file_resp:
                file_resp.raise_for_status()

                dst_fname = path.join(output_dir, fname)
                with open(dst_fname, "wb") as dst:
                    yield f"Fetching {fname}\n"
                    for chunk in file_resp.iter_content(chunk_size=None):
                        dst.write(chunk)

    @property
    def content_id(self):
        """A unique ID to represent the version of the content.

        Only meaningful after a successful ``detect``, which sets the
        ``dataset_id`` and ``version`` attributes read here.
        """
        return f"{self.dataset_id}.v{self.version}"
79 changes: 79 additions & 0 deletions tests/unit/contentproviders/test_ckan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
from contextlib import contextmanager
from tempfile import NamedTemporaryFile, TemporaryDirectory

from repo2docker.contentproviders import CKAN


def test_detect_ckan(requests_mock):
    """detect() builds the same spec for plain, query and /history/ URLs."""
    requests_mock.get("http://demo.ckan.org/api/3/action/status_show", status_code=200)
    requests_mock.get(
        "http://demo.ckan.org/api/3/action/package_show?id=1234",
        json={"result": {"metadata_modified": "2024-02-27T14:15:54.573058"}},
    )

    def spec_for(activity_id):
        # Expected spec; only the activity id differs between the cases.
        return {
            "dataset_id": "1234",
            "activity_id": activity_id,
            "api_url": "http://demo.ckan.org/api/3/action/",
            "version": "1709043354",
        }

    cases = (
        ("http://demo.ckan.org/dataset/1234", None),
        ("http://demo.ckan.org/dataset/1234?activity_id=5678", "5678"),
        ("http://demo.ckan.org/dataset/1234/history/5678", "5678"),
    )
    for url, activity_id in cases:
        # A fresh provider per URL, as detect() stores state on the instance.
        assert CKAN().detect(url) == spec_for(activity_id)


def test_detect_not_ckan():
    """Sources that are not CKAN datasets must not trigger the provider."""
    not_ckan_sources = (
        "/some/path/here",
        "https://example.com/path/here",
        "https://data.gov.tw/dataset/6564",
    )
    for source in not_ckan_sources:
        assert CKAN().detect(source) is None


@contextmanager
def ckan_file():
    """Yield the path of a temporary file containing ``b"some content"``.

    The file is removed when the context exits.
    """
    with NamedTemporaryFile() as file:
        file.write(b"some content")
        # Push the buffered bytes to disk; otherwise a reader that opens the
        # path separately may see an empty file.
        file.flush()
        yield file.name


def test_ckan_fetch(requests_mock):
    """fetch() downloads the resource both with and without an activity id."""
    with ckan_file() as ckan_path:
        resource_url = f"file://{ckan_path}"
        mock_response = {"result": {"resources": [{"url": resource_url}]}}
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
        )
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/activity_data_show?id=5678",
            json=mock_response,
        )
        # Read through a context manager so the handle is closed promptly.
        with open(ckan_path, "rb") as source_file:
            requests_mock.get(resource_url, content=source_file.read())

        ckan = CKAN()
        spec = {"dataset_id": "1234", "api_url": "http://demo.ckan.org/api/3/action/"}

        # fetch() names the download after the last URL path segment.
        expected = {os.path.basename(ckan_path)}

        for activity_id in (None, "5678"):
            spec["activity_id"] = activity_id
            with TemporaryDirectory() as d:
                # fetch() is a generator: it must be drained for the
                # download to actually happen.
                for _ in ckan.fetch(spec, d):
                    pass
                assert expected == set(os.listdir(d))

0 comments on commit 9f15678

Please sign in to comment.