From 391b9bc5bafe09658e24b028510a7c00cec521c3 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Tue, 20 Feb 2024 06:37:21 +0000 Subject: [PATCH 01/10] Add CKAN content provider --- docs/source/usage.rst | 4 +- repo2docker/app.py | 1 + repo2docker/contentproviders/__init__.py | 1 + repo2docker/contentproviders/ckan.py | 101 +++++++++++++++++++++++ tests/unit/contentproviders/test_ckan.py | 58 +++++++++++++ 5 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 repo2docker/contentproviders/ckan.py create mode 100644 tests/unit/contentproviders/test_ckan.py diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 8795a2228..e89eb8ff6 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -39,7 +39,8 @@ where ```` is: * a URL of a Git repository (``https://github.com/binder-examples/requirements``), * a Zenodo DOI (``10.5281/zenodo.1211089``), - * a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or + * a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), + * a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or * a path to a local directory (``a/local/directory``) of the source repository you want to build. @@ -136,3 +137,4 @@ Command line API .. _Pytudes: https://github.com/norvig/pytudes .. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html +.. _CKAN: https://ckan.org diff --git a/repo2docker/app.py b/repo2docker/app.py index c7f3ab819..41e831e25 100755 --- a/repo2docker/app.py +++ b/repo2docker/app.py @@ -152,6 +152,7 @@ def _default_log_level(self): contentproviders.Dataverse, contentproviders.Hydroshare, contentproviders.Swhid, + contentproviders.CKAN, contentproviders.Mercurial, contentproviders.Git, ], diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py index 5c40476df..a0a2e019b 100755 --- a/repo2docker/contentproviders/__init__.py +++ b/repo2docker/contentproviders/__init__.py @@ -1,4 +1,5 @@ from .base import Local +from .ckan import CKAN from .dataverse import Dataverse from .figshare import Figshare from .git import Git diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py new file mode 100644 index 000000000..2a1de4486 --- /dev/null +++ b/repo2docker/contentproviders/ckan.py @@ -0,0 +1,101 @@ +import re +from datetime import datetime, timedelta, timezone +from os import path +from urllib.parse import urlparse + +from requests import Session + +from .. import __version__ +from .base import ContentProvider + + +class CKAN(ContentProvider): + """Provide contents of a remote CKAN dataset.""" + + def __init__(self): + super().__init__() + self.session = Session() + self.session.headers.update( + { + "user-agent": f"repo2docker {__version__}", + } + ) + + def _fetch_version(self, api_url): + """Fetch dataset modified date and convert to epoch. + Borrowed from the Hydroshare provider. + """ + package_show_url = f"{api_url}package_show?id={self.dataset_id}" + resp = self.urlopen(package_show_url).json() + date = resp["result"]["metadata_modified"] + parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") + epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() + # truncate the timestamp + return str(int(epoch)) + + def _request(self, url, **kwargs): + return self.session.get(url, **kwargs) + + urlopen = _request + url_regex = r"/dataset/[a-z0-9_\\-]*$" + + def detect(self, source, ref=None, extra_args=None): + """Trigger this provider for things that resolve to a CKAN dataset.""" + parsed_url = urlparse(source) + if not parsed_url.netloc: + return None + + api_url = parsed_url._replace( + path=re.sub(self.url_regex, "/api/3/action/", parsed_url.path) + ).geturl() + + status_show_url = f"{api_url}status_show" + resp = self.urlopen(status_show_url) + if resp.status_code == 200: + self.dataset_id = parsed_url.path.rsplit("/", maxsplit=1)[1] + self.version = self._fetch_version(api_url) + return { + "dataset_id": self.dataset_id, + "api_url": api_url, + "version": self.version, + } + else: + return None + + def fetch(self, spec, output_dir, yield_output=False): + """Fetch a CKAN dataset.""" + dataset_id = spec["dataset_id"] + + yield f"Fetching CKAN dataset {dataset_id}.\n" + package_show_url = f"{spec['api_url']}package_show?id={dataset_id}" + resp = self.urlopen( + package_show_url, + headers={"accept": "application/json"}, + ) + + dataset = resp.json() + + yield "Fetching CKAN resources.\n" + + resources = dataset["result"]["resources"] + + for resource in resources: + file_url = resource["url"] + fname = file_url.rsplit("/", maxsplit=1)[-1] + if fname == "": + fname = resource["id"] + + yield f"Requesting {file_url}\n" + resp = self._request(file_url, stream=True) + resp.raise_for_status() + + dst_fname = path.join(output_dir, fname) + with open(dst_fname, "wb") as dst: + yield f"Fetching {fname}\n" + for chunk in resp.iter_content(chunk_size=None): + dst.write(chunk) + + @property + def content_id(self): + """A unique ID to represent the version of the content.""" + return f"{self.dataset_id}.v{self.version}" diff --git a/tests/unit/contentproviders/test_ckan.py b/tests/unit/contentproviders/test_ckan.py new file mode 100644 index 000000000..ec65162b0 --- /dev/null +++ b/tests/unit/contentproviders/test_ckan.py @@ -0,0 +1,58 @@ +import os +from contextlib import contextmanager +from tempfile import NamedTemporaryFile, TemporaryDirectory + +import pytest + +from repo2docker.contentproviders import CKAN + +test_ckan = CKAN() +test_hosts = [ + ( + [ + "http://demo.ckan.org/dataset/sample-dataset-1", + ], + { + "dataset_id": "sample-dataset-1", + "api_url": "http://demo.ckan.org/api/3/action/", + "version": "1707387710", + }, + ) +] + + +@pytest.mark.parametrize("test_input, expected", test_hosts) +def test_detect_ckan(test_input, expected): + assert CKAN().detect(test_input[0]) == expected + + # Don't trigger the CKAN content provider + assert CKAN().detect("/some/path/here") is None + assert CKAN().detect("https://example.com/path/here") is None + assert CKAN().detect("https://data.gov.tw/dataset/6564") is None + + +@contextmanager +def ckan_file(): + with NamedTemporaryFile() as file: + file.write(b"some content") + yield file.name + + +def test_ckan_fetch(requests_mock): + with ckan_file() as ckan_path: + mock_response = {"result": {"resources": [{"url": f"file://{ckan_path}"}]}} + requests_mock.get( + "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response + ) + requests_mock.get(f"file://{ckan_path}", content=open(ckan_path, "rb").read()) + with TemporaryDirectory() as d: + ckan = CKAN() + spec = { + "dataset_id": "1234", + "api_url": "http://demo.ckan.org/api/3/action/", + } + output = [] + for l in ckan.fetch(spec, d): + output.append(l) + expected = {ckan_path.rsplit("/", maxsplit=1)[1]} + assert expected == set(os.listdir(d)) From 472c5cdc29da11d29c892f1492db19a0101ca692 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Mon, 25 Mar 2024 02:37:31 +0000 Subject: [PATCH 02/10] Make the splitting logic more straightforward --- repo2docker/contentproviders/ckan.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index 2a1de4486..46ea7c750 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -1,4 +1,3 @@ -import re from datetime import datetime, timedelta, timezone from os import path from urllib.parse import urlparse @@ -37,7 +36,6 @@ def _request(self, url, **kwargs): return self.session.get(url, **kwargs) urlopen = _request - url_regex = r"/dataset/[a-z0-9_\\-]*$" def detect(self, source, ref=None, extra_args=None): """Trigger this provider for things that resolve to a CKAN dataset.""" @@ -45,14 +43,18 @@ def detect(self, source, ref=None, extra_args=None): if not parsed_url.netloc: return None - api_url = parsed_url._replace( - path=re.sub(self.url_regex, "/api/3/action/", parsed_url.path) - ).geturl() + url_parts = parsed_url.path.split("/") + if url_parts[-2] == "dataset": + self.dataset_id = url_parts[-1] + else: + return None + + api_url_path = "/api/3/action/" + api_url = parsed_url._replace(path=api_url_path).geturl() status_show_url = f"{api_url}status_show" resp = self.urlopen(status_show_url) if resp.status_code == 200: - self.dataset_id = parsed_url.path.rsplit("/", maxsplit=1)[1] self.version = self._fetch_version(api_url) return { "dataset_id": self.dataset_id, From 7712751c8f68f43a5f7756d010975dfa557ec6e9 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Mon, 25 Mar 2024 08:47:57 +0000 Subject: [PATCH 03/10] Make tests without parameterized information have their own test --- tests/unit/contentproviders/test_ckan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/contentproviders/test_ckan.py b/tests/unit/contentproviders/test_ckan.py index ec65162b0..cb1f4a2d8 100644 --- a/tests/unit/contentproviders/test_ckan.py +++ b/tests/unit/contentproviders/test_ckan.py @@ -25,6 +25,8 @@ def test_detect_ckan(test_input, expected): assert CKAN().detect(test_input[0]) == expected + +def test_detect_not_ckan(): # Don't trigger the CKAN content provider assert CKAN().detect("/some/path/here") is None assert CKAN().detect("https://example.com/path/here") is None From 1039d8dcb9932e6365247dd923df2c489797bafb Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Mon, 25 Mar 2024 08:58:34 +0000 Subject: [PATCH 04/10] Mock the response for test_detect_ckan --- tests/unit/contentproviders/test_ckan.py | 29 ++++++++++-------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/tests/unit/contentproviders/test_ckan.py b/tests/unit/contentproviders/test_ckan.py index cb1f4a2d8..2fa09073c 100644 --- a/tests/unit/contentproviders/test_ckan.py +++ b/tests/unit/contentproviders/test_ckan.py @@ -2,28 +2,23 @@ from contextlib import contextmanager from tempfile import NamedTemporaryFile, TemporaryDirectory -import pytest - from repo2docker.contentproviders import CKAN -test_ckan = CKAN() -test_hosts = [ - ( - [ - "http://demo.ckan.org/dataset/sample-dataset-1", - ], - { - "dataset_id": "sample-dataset-1", - "api_url": "http://demo.ckan.org/api/3/action/", - "version": "1707387710", - }, + +def test_detect_ckan(requests_mock): + mock_response = {"result": {"metadata_modified": "2024-02-27T14:15:54.573058"}} + requests_mock.get("http://demo.ckan.org/api/3/action/status_show", status_code=200) + requests_mock.get( + "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response ) -] + expected = { + "dataset_id": "1234", + "api_url": "http://demo.ckan.org/api/3/action/", + "version": "1709043354", + } -@pytest.mark.parametrize("test_input, expected", test_hosts) -def test_detect_ckan(test_input, expected): - assert CKAN().detect(test_input[0]) == expected + assert CKAN().detect("http://demo.ckan.org/dataset/1234") == expected def test_detect_not_ckan(): From 0808018da7db92b62acb3e4a026c8ae2def9b17c Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Mon, 25 Mar 2024 12:11:38 +0000 Subject: [PATCH 05/10] Handle the URL with subdirectories --- repo2docker/contentproviders/ckan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index 46ea7c750..f00218ffa 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -50,7 +50,9 @@ def detect(self, source, ref=None, extra_args=None): return None api_url_path = "/api/3/action/" - api_url = parsed_url._replace(path=api_url_path).geturl() + api_url = parsed_url._replace( + path="/".join(url_parts[:-2]) + api_url_path + ).geturl() status_show_url = f"{api_url}status_show" resp = self.urlopen(status_show_url) From 7faa53c83b8710f8fdfdd27becfb193f5c2f1bb5 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Thu, 18 Apr 2024 09:12:54 +0000 Subject: [PATCH 06/10] Handle the activities --- repo2docker/contentproviders/ckan.py | 34 +++++++++++++++++----- tests/unit/contentproviders/test_ckan.py | 36 ++++++++++++++++++++---- 2 files changed, 57 insertions(+), 13 deletions(-) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index f00218ffa..4888738a8 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta, timezone from os import path -from urllib.parse import urlparse +from urllib.parse import parse_qs, urlparse from requests import Session @@ -43,23 +43,33 @@ def detect(self, source, ref=None, extra_args=None): if not parsed_url.netloc: return None - url_parts = parsed_url.path.split("/") - if url_parts[-2] == "dataset": - self.dataset_id = url_parts[-1] + url_parts_1 = parsed_url.path.split("/history/") + url_parts_2 = url_parts_1[0].split("/") + if url_parts_2[-2] == "dataset": + self.dataset_id = url_parts_2[-1] else: return None api_url_path = "/api/3/action/" api_url = parsed_url._replace( - path="/".join(url_parts[:-2]) + api_url_path + path="/".join(url_parts_2[:-2]) + api_url_path, query="" ).geturl() status_show_url = f"{api_url}status_show" resp = self.urlopen(status_show_url) if resp.status_code == 200: + + # handle the activites + activity_id = None + if parse_qs(parsed_url.query).get("activity_id") is not None: + activity_id = parse_qs(parsed_url.query).get("activity_id")[0] + if len(url_parts_1) == 2: + activity_id = url_parts_1[-1] + self.version = self._fetch_version(api_url) return { "dataset_id": self.dataset_id, + "activity_id": activity_id, "api_url": api_url, "version": self.version, } @@ -69,11 +79,21 @@ def detect(self, source, ref=None, extra_args=None): def fetch(self, spec, output_dir, yield_output=False): """Fetch a CKAN dataset.""" dataset_id = spec["dataset_id"] + activity_id = spec["activity_id"] yield f"Fetching CKAN dataset {dataset_id}.\n" - package_show_url = f"{spec['api_url']}package_show?id={dataset_id}" + + # handle the activites + if activity_id: + fetch_url = ( + f"{spec['api_url']}activity_data_show?" + f"id={activity_id}&object_type=package" + ) + else: + fetch_url = f"{spec['api_url']}package_show?id={dataset_id}" + resp = self.urlopen( - package_show_url, + fetch_url, headers={"accept": "application/json"}, ) diff --git a/tests/unit/contentproviders/test_ckan.py b/tests/unit/contentproviders/test_ckan.py index 2fa09073c..75a022692 100644 --- a/tests/unit/contentproviders/test_ckan.py +++ b/tests/unit/contentproviders/test_ckan.py @@ -14,11 +14,23 @@ def test_detect_ckan(requests_mock): expected = { "dataset_id": "1234", + "activity_id": None, "api_url": "http://demo.ckan.org/api/3/action/", "version": "1709043354", } + expected_activity = expected.copy() + expected_activity["activity_id"] = "5678" + assert CKAN().detect("http://demo.ckan.org/dataset/1234") == expected + assert ( + CKAN().detect("http://demo.ckan.org/dataset/1234?activity_id=5678") + == expected_activity + ) + assert ( + CKAN().detect("http://demo.ckan.org/dataset/1234/history/5678") + == expected_activity + ) def test_detect_not_ckan(): @@ -41,15 +53,27 @@ def test_ckan_fetch(requests_mock): requests_mock.get( "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response ) + requests_mock.get( + "http://demo.ckan.org/api/3/action/activity_data_show?id=5678", + json=mock_response, + ) requests_mock.get(f"file://{ckan_path}", content=open(ckan_path, "rb").read()) + + ckan = CKAN() + spec = {"dataset_id": "1234", "api_url": "http://demo.ckan.org/api/3/action/"} + + expected = {ckan_path.rsplit("/", maxsplit=1)[1]} + + with TemporaryDirectory() as d: + spec["activity_id"] = None + output = [] + for l in ckan.fetch(spec, d): + output.append(l) + assert expected == set(os.listdir(d)) + with TemporaryDirectory() as d: - ckan = CKAN() - spec = { - "dataset_id": "1234", - "api_url": "http://demo.ckan.org/api/3/action/", - } + spec["activity_id"] = "5678" output = [] for l in ckan.fetch(spec, d): output.append(l) - expected = {ckan_path.rsplit("/", maxsplit=1)[1]} assert expected == set(os.listdir(d)) From c948d780137f8335588a8f9d5a756b16ed9c8874 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Fri, 19 Apr 2024 01:00:33 +0000 Subject: [PATCH 07/10] Skip the resource with an empty link --- repo2docker/contentproviders/ckan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index 4888738a8..9131b0172 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -105,6 +105,8 @@ def fetch(self, spec, output_dir, yield_output=False): for resource in resources: file_url = resource["url"] + if file_url == "": + continue fname = file_url.rsplit("/", maxsplit=1)[-1] if fname == "": fname = resource["id"] From 930bf67a83858e8a099120ea0185c2026c273971 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 10 Jun 2024 15:49:40 -0700 Subject: [PATCH 08/10] Use urlencode to construct query strings --- repo2docker/contentproviders/ckan.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index 9131b0172..c1e31a540 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta, timezone from os import path -from urllib.parse import parse_qs, urlparse +from urllib.parse import parse_qs, urlparse, urlencode from requests import Session @@ -86,11 +86,10 @@ def fetch(self, spec, output_dir, yield_output=False): # handle the activites if activity_id: fetch_url = ( - f"{spec['api_url']}activity_data_show?" - f"id={activity_id}&object_type=package" + f"{spec['api_url']}activity_data_show?" + urlencode({"id": activity_id, "object_type": "package"}) ) else: - fetch_url = f"{spec['api_url']}package_show?id={dataset_id}" + fetch_url = f"{spec['api_url']}package_show?" + urlencode({"id": dataset_id}) resp = self.urlopen( fetch_url, From 327c6d3dcfbb29ae4a77b259f323beed769558bd Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 10 Jun 2024 17:13:22 -0700 Subject: [PATCH 09/10] Cleanup URL parsing mechanisms --- repo2docker/contentproviders/ckan.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index c1e31a540..9c74afdf2 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -43,28 +43,32 @@ def detect(self, source, ref=None, extra_args=None): if not parsed_url.netloc: return None - url_parts_1 = parsed_url.path.split("/history/") - url_parts_2 = url_parts_1[0].split("/") - if url_parts_2[-2] == "dataset": - self.dataset_id = url_parts_2[-1] - else: + if "/dataset/" not in parsed_url.path: + # Not actually a dataset return None - api_url_path = "/api/3/action/" + # CKAN may be under a URL prefix, and we should accomodate that + url_prefix, dataset_url = parsed_url.path.split("/dataset/") + + dataset_url_parts = dataset_url.split("/") + self.dataset_id = dataset_url_parts[0] + api_url = parsed_url._replace( - path="/".join(url_parts_2[:-2]) + api_url_path, query="" + path=f"{url_prefix}/api/3/action/", query="" ).geturl() status_show_url = f"{api_url}status_show" resp = self.urlopen(status_show_url) if resp.status_code == 200: - # handle the activites + # Activity ID may be present either as a query parameter, activity_id + # or as part of the URL, under `/history/`. If `/history/` + # is present, that takes precedence over `activity_id` activity_id = None - if parse_qs(parsed_url.query).get("activity_id") is not None: + if "history" in dataset_url_parts: + activity_id = dataset_url_parts[dataset_url_parts.index("history") + 1] + elif parse_qs(parsed_url.query).get("activity_id") is not None: activity_id = parse_qs(parsed_url.query).get("activity_id")[0] - if len(url_parts_1) == 2: - activity_id = url_parts_1[-1] self.version = self._fetch_version(api_url) return { From a390013368c0b0a3a00b66a5555dce2f8d988277 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Fri, 14 Jun 2024 11:56:52 +0000 Subject: [PATCH 10/10] Black and isort --- repo2docker/contentproviders/ckan.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index 9c74afdf2..2618fd06e 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta, timezone from os import path -from urllib.parse import parse_qs, urlparse, urlencode +from urllib.parse import parse_qs, urlencode, urlparse from requests import Session @@ -89,11 +89,13 @@ def fetch(self, spec, output_dir, yield_output=False): # handle the activites if activity_id: - fetch_url = ( - f"{spec['api_url']}activity_data_show?" + urlencode({"id": activity_id, "object_type": "package"}) + fetch_url = f"{spec['api_url']}activity_data_show?" + urlencode( + {"id": activity_id, "object_type": "package"} ) else: - fetch_url = f"{spec['api_url']}package_show?" + urlencode({"id": dataset_id}) + fetch_url = f"{spec['api_url']}package_show?" + urlencode( + {"id": dataset_id} + ) resp = self.urlopen( fetch_url,