diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2661650a..81cef792 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -42,7 +42,7 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: # Need to fetch more than the last commit so that setuptools-scm can # create the correct version string. If the number of commits since @@ -58,9 +58,9 @@ jobs: run: git fetch origin 'refs/tags/*:refs/tags/*' - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.x" - name: Collect requirements run: | @@ -86,7 +86,7 @@ jobs: echo "::set-output name=dir::$(pip cache dir)" - name: Setup caching for pip packages - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('requirements-full.txt') }} @@ -113,7 +113,7 @@ jobs: # Store the docs as a build artifact so we can deploy it later - name: Upload HTML documentation as an artifact - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: docs-${{ github.sha }} path: doc/_build/html @@ -127,11 +127,11 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 # Fetch the built docs from the "build" job - name: Download HTML documentation artifact - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: docs-${{ github.sha }} path: doc/_build/html diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 30fca3d6..11fb69d9 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -29,7 +29,7 @@ jobs: steps: # Checks-out your repository under $GITHUB_WORKSPACE - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: # Need to fetch more than the last commit so that setuptools_scm can # create the correct 
version string. If the number of commits since @@ -45,9 +45,9 @@ jobs: run: git fetch origin 'refs/tags/*:refs/tags/*' - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.x" - name: Install requirements run: | @@ -79,7 +79,7 @@ jobs: - name: Upload archives as artifacts # Only if not a pull request if: success() && github.event_name != 'pull_request' - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: pypi-${{ github.sha }} path: dist @@ -94,7 +94,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: # The GitHub token is preserved by default but this job doesn't need # to be able to push to GitHub. @@ -102,7 +102,7 @@ jobs: # Fetch the built archives from the "build" job - name: Download built archives artifact - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: pypi-${{ github.sha }} path: dist diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index c4c549a7..036b7d39 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -20,12 +20,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: persist-credentials: false - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.10" @@ -42,12 +42,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: persist-credentials: false - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d4102d77..7e51a8af 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,7 +20,7 @@ on: schedule: # Run every Monday at 12:00 UTC # * is a special character in YAML so you 
have to quote this string - - cron: '00 12 * * 1' + - cron: "00 12 * * 1" # Use bash by default in all jobs defaults: @@ -28,7 +28,6 @@ defaults: shell: bash jobs: - ############################################################################# # Run tests and upload to codecov test: @@ -45,7 +44,7 @@ jobs: - windows python: - "3.7" - - "3.10" + - "3.11" dependencies: - latest - optional @@ -68,7 +67,7 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: # Need to fetch more than the last commit so that setuptools-scm can # create the correct version string. If the number of commits since @@ -84,7 +83,7 @@ jobs: run: git fetch origin 'refs/tags/*:refs/tags/*' - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -119,7 +118,7 @@ jobs: echo "::set-output name=dir::$(pip cache dir)" - name: Setup caching for pip packages - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('requirements-full.txt') }} @@ -148,9 +147,9 @@ jobs: run: coverage xml - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v3 with: - file: ./coverage.xml + files: ./coverage.xml env_vars: OS,PYTHON,DEPENDENCIES # Don't mark the job as failed if the upload fails for some reason. 
# It does sometimes but shouldn't be the reason for running diff --git a/AUTHORS.md b/AUTHORS.md index 52661064..ea21de72 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -13,6 +13,7 @@ order by last name) and are considered "The Pooch Developers": * [Dominic Kempf](https://github.com/dokempf) - Scientific Software Center, Heidelberg University, Germany (ORCID: [0000-0002-6140-2332](https://www.orcid.org/0000-0002-6140-2332)) * [Kacper Kowalik](https://github.com/Xarthisius) - National Center for Supercomputing Applications, University of Illinois at Urbana-Champaign, USA (ORCID: [0000-0003-1709-3744](https://www.orcid.org/0000-0003-1709-3744)) * [John Leeman](https://github.com/jrleeman) +* [Björn Ludwig](https://github.com/BjoernLudwigPTB) - Physikalisch-Technische Bundesanstalt, Germany (ORCID: [0000-0002-5910-9137](https://www.orcid.org/0000-0002-5910-9137)) * [Daniel McCloy](https://github.com/drammock) - University of Washington, USA (ORCID: [0000-0002-7572-3241](https://orcid.org/0000-0002-7572-3241)) * [Rémi Rampin](https://github.com/remram44) - New York University, USA (ORCID: [0000-0002-0524-2282](https://www.orcid.org/0000-0002-0524-2282)) * [Clément Robert](https://github.com/neutrinoceros) - Institut de Planétologie et d'Astrophysique de Grenoble, France (ORCID: [0000-0001-8629-7068](https://orcid.org/0000-0001-8629-7068)) diff --git a/CITATION.rst b/CITATION.rst index 5fb0255f..a1126316 100644 --- a/CITATION.rst +++ b/CITATION.rst @@ -14,5 +14,20 @@ If you used Pooch in your research, please consider citing our paper: This is an open-access publication. The paper and the associated software review can be freely accessed at: https://doi.org/10.21105/joss.01943 -If you need a Bibtex entry for the paper, grab it here: -https://www.doi2bib.org/bib/10.21105/joss.01943 +Here is a Bibtex entry to make things easier if you’re using Latex: + +.. 
code:: bibtex + + @article{uieda2020, + title = {{Pooch}: {A} friend to fetch your data files}, + author = {Leonardo Uieda and Santiago Soler and R{\'{e}}mi Rampin and Hugo van Kemenade and Matthew Turk and Daniel Shapero and Anderson Banihirwe and John Leeman}, + year = {2020}, + doi = {10.21105/joss.01943}, + url = {https://doi.org/10.21105/joss.01943}, + month = jan, + publisher = {The Open Journal}, + volume = {5}, + number = {45}, + pages = {1943}, + journal = {Journal of Open Source Software} + } diff --git a/README.md b/README.md index bb4191e0..806990a4 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,7 @@ def fetch_gravity_data(): * [climlab](https://github.com/climlab/climlab) * [napari](https://github.com/napari/napari) * [mne-python](https://github.com/mne-tools/mne-python) +* [GemGIS](https://github.com/cgre-aachen/gemgis) *If you're using Pooch, send us a pull request adding your project to the list.* diff --git a/doc/changes.rst b/doc/changes.rst index 7ea1638f..cd7d7eae 100644 --- a/doc/changes.rst +++ b/doc/changes.rst @@ -3,6 +3,64 @@ Changelog ========= +Version 1.7.0 +------------- + +*Released on: 2023/02/27* + +doi:`10.5281/zenodo.7678844 `__ + +Bug fixes: + +* Make archive extraction always take members into account (`#316 `__) +* Figshare downloaders fetch the correct version, instead of always the latest one. 
(`#343 `__) + +New features: + +* Allow spaces in filenames in registry files (`#315 `__) +* Refactor ``Pooch.is_available`` to use downloaders (`#322 `__) +* Add support for downloading files from Dataverse DOIs (`#318 `__) +* Add a new ``Pooch.load_registry_from_doi`` method that populates the Pooch registry using DOI-based data repositories (`#325 `__) +* Support URLs for Zenodo repositories created through the GitHub integration service, which include slashes in the filename of the main zip files (`#340 `__) +* Automatically add a trailing slash to ``base_url`` on ``pooch.create`` (`#344 `__) + +Maintenance: + +* Drop support for Python 3.6 (`#299 `__) +* Port from deprecated ``appdirs`` to ``platformdirs`` (`#339 `__) +* Update version of Codecov's Action to v3 (`#345 `__) + +Documentation: + +* Update sphinx, theme, and sphinx-panels (`#300 `__) +* Add CITATION.cff for the JOSS article (`#308 `__) +* Use Markdown for the README (`#311 `__) +* Improve docstring of ``known_hash`` in ``retrieve`` function (`#333 `__) +* Replace link to Pooch's citation with a BibTeX code snippet (`#335 `__) + +Projects that started using Pooch: + +* Open AR-Sandbox (`#305 `__) +* ``climlab`` (`#312 `__) +* SciPy (`#320 `__) +* ``napari`` (`#321 `__) +* ``mne-python`` (`#323 `__) + +This release contains contributions from: + +* Alex Fikl +* Anirudh Dagar +* Björn Ludwig +* Brian Rose +* Dominic Kempf +* Florian Wellmann +* Gabriel Fu +* Kyle I S Harrington +* Leonardo Uieda +* myd7349 +* Rowan Cockett +* Santiago Soler + Version 1.6.0 ------------- diff --git a/doc/install.rst b/doc/install.rst index 8ecb7b6c..fb00e1a8 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -53,7 +53,7 @@ manually. Required: -* `appdirs `__ +* `platformdirs `__ * `packaging `__ * `requests `__ diff --git a/doc/protocols.rst b/doc/protocols.rst index e0ca4318..94fe4564 100644 --- a/doc/protocols.rst +++ b/doc/protocols.rst @@ -103,3 +103,24 @@ figshare dataset: ``doi:10.6084/m9.figshare.c.4362224.v1``.
Attempting to download files from a figshare collection will raise an error. See `issue #274 `__ details. + +Since repositories of this type store information about the files contained in +them, we can avoid having to manually type the registry with the file names and +their hashes. +Instead, we can use the :meth:`pooch.Pooch.load_registry_from_doi` method to +automatically populate the registry: + +.. code-block:: python + + POOCH = pooch.create( + path=pooch.os_cache("plumbus"), + # Use the figshare DOI + base_url="doi:10.6084/m9.figshare.14763051.v1/", + registry=None, + ) + + # Automatically populate the registry + POOCH.load_registry_from_doi() + + # Fetch one of the files in the repository + fname = POOCH.fetch("tiny-data.txt") diff --git a/doc/versions.rst b/doc/versions.rst index cab21e8d..1b0a3450 100644 --- a/doc/versions.rst +++ b/doc/versions.rst @@ -7,6 +7,7 @@ Use the links below to access documentation for specific versions * `Latest release `__ * `Development `__ (reflects the current development branch on GitHub) +* `v1.7.0 `__ * `v1.6.0 `__ * `v1.5.2 `__ * `v1.5.1 `__ diff --git a/environment.yml b/environment.yml index a1800a9a..29dfafb9 100644 --- a/environment.yml +++ b/environment.yml @@ -3,12 +3,12 @@ channels: - conda-forge - defaults dependencies: - - python==3.10 + - python==3.11 - pip # Run - requests - packaging - - appdirs + - platformdirs # Build - build # Test diff --git a/pooch/core.py b/pooch/core.py index 677aa574..6c5d46f4 100644 --- a/pooch/core.py +++ b/pooch/core.py @@ -25,7 +25,7 @@ os_cache, unique_file_name, ) -from .downloaders import choose_downloader +from .downloaders import DOIDownloader, choose_downloader, doi_to_repository def retrieve( @@ -74,7 +74,7 @@ def retrieve( url : str The URL to the file that is to be downloaded. Ideally, the URL should end in a file name. - known_hash : str + known_hash : str or None A known hash (checksum) of the file.
Will be used to verify the download or check if an existing file needs to be updated. By default, will assume it's a SHA256 hash. To specify a different hashing method, @@ -84,7 +84,7 @@ def retrieve( existing file needs to be updated. fname : str or None The name that will be used to save the file. Should NOT include the - full the path, just the file name (it will be appended to *path*). If + full path, just the file name (it will be appended to *path*). If None, will create a unique file name using a combination of the last part of the URL (assuming it's the file name) and the MD5 hash of the URL. For example, ``81whdo2d2e928yd1wi22-data-file.csv``. This ensures @@ -293,8 +293,8 @@ def create( Base URL for the remote data source. All requests will be made relative to this URL. The string should have a ``{version}`` formatting mark in it. We will call ``.format(version=version)`` on this string. If the - URL is a directory path, it must end in a ``'/'`` because we will not - include it. + URL does not end in a ``'/'``, a trailing ``'/'`` will be added + automatically. version : str or None The version string for your project. Should be PEP440 compatible. If None is given, will not attempt to format *base_url* and no subfolder @@ -421,6 +421,8 @@ def create( path = cache_location(path, env, version) if isinstance(allow_updates, str): allow_updates = os.environ.get(allow_updates, "true").lower() != "false" + # add trailing "/" + base_url = base_url.rstrip("/") + "/" pup = Pooch( path=path, base_url=base_url, @@ -668,6 +670,36 @@ def load_registry(self, fname): self.urls[file_name] = file_url self.registry[file_name] = file_checksum.lower() + def load_registry_from_doi(self): + """ + Populate the registry using the data repository API + + Fill the registry with all the files available in the data repository, + along with their hashes. It will make a request to the data repository + API to retrieve this information. No file is downloaded during this + process. + + .. 
important:: + + This method is intended to be used only when the ``base_url`` is + a DOI. + """ + + # Ensure that this is indeed a DOI-based pooch + downloader = choose_downloader(self.base_url) + if not isinstance(downloader, DOIDownloader): + raise ValueError( + f"Invalid base_url '{self.base_url}': " + + "Pooch.load_registry_from_doi is only implemented for DOIs" + ) + + # Create a repository instance + doi = self.base_url.replace("doi:", "") + repository = doi_to_repository(doi) + + # Call registry population for this repository + return repository.populate_registry(self) + def is_available(self, fname, downloader=None): """ Check availability of a remote file without downloading it. diff --git a/pooch/downloaders.py b/pooch/downloaders.py index fa452ab4..1cae6b1e 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -11,6 +11,8 @@ import sys import ftplib +import warnings + from .utils import parse_url try: @@ -293,7 +295,6 @@ def __init__( progressbar=False, chunk_size=1024, ): - self.port = port self.username = username self.password = password @@ -515,7 +516,7 @@ class DOIDownloader: # pylint: disable=too-few-public-methods * `figshare `__ * `Zenodo `__ - * `DataVerse `__ instances + * `Dataverse `__ instances .. attention:: @@ -595,36 +596,14 @@ def __call__(self, url, output_file, pooch): """ - repositories = [ - FigshareRepository, - ZenodoRepository, - DataverseRepository, - ] - - # Extract the DOI and the repository information parsed_url = parse_url(url) - doi = parsed_url["netloc"] - archive_url = doi_to_url(doi) - - # Try the converters one by one until one of them returned a URL - data_repository = None - for repo in repositories: - if data_repository is None: - data_repository = repo.initialize( - archive_url=archive_url, - doi=doi, - ) - - if data_repository is None: - repository = parse_url(archive_url)["netloc"] - raise ValueError( - f"Invalid data repository '{repository}'. 
" - "To request or contribute support for this repository, " - "please open an issue at https://github.com/fatiando/pooch/issues" - ) + data_repository = doi_to_repository(parsed_url["netloc"]) # Resolve the URL - file_name = parsed_url["path"].split("/")[-1] + file_name = parsed_url["path"] + # remove the leading slash in the path + if file_name[0] == "/": + file_name = file_name[1:] download_url = data_repository.download_url(file_name) # Instantiate the downloader object @@ -662,6 +641,59 @@ def doi_to_url(doi): return url +def doi_to_repository(doi): + """ + Instantiate a data repository instance from a given DOI. + + This function implements the chain of responsibility dispatch + to the correct data repository class. + + Parameters + ---------- + doi : str + The DOI of the archive. + + Returns + ------- + data_repository : DataRepository + The data repository object + """ + + # This should go away in a separate issue: DOI handling should + # not rely on the (non-)existence of trailing slashes. The issue + # is documented in https://github.com/fatiando/pooch/issues/324 + if doi[-1] == "/": + doi = doi[:-1] + + repositories = [ + FigshareRepository, + ZenodoRepository, + DataverseRepository, + ] + + # Extract the DOI and the repository information + archive_url = doi_to_url(doi) + + # Try the converters one by one until one of them returned a URL + data_repository = None + for repo in repositories: + if data_repository is None: + data_repository = repo.initialize( + archive_url=archive_url, + doi=doi, + ) + + if data_repository is None: + repository = parse_url(archive_url)["netloc"] + raise ValueError( + f"Invalid data repository '{repository}'. 
" + "To request or contribute support for this repository, " + "please open an issue at https://github.com/fatiando/pooch/issues" + ) + + return data_repository + + class DataRepository: # pylint: disable=too-few-public-methods, missing-class-docstring @classmethod def initialize(cls, doi, archive_url): # pylint: disable=unused-argument @@ -702,11 +734,24 @@ def download_url(self, file_name): raise NotImplementedError # pragma: no cover + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. + """ + + raise NotImplementedError # pragma: no cover + class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring def __init__(self, doi, archive_url): self.archive_url = archive_url self.doi = doi + self._api_response = None @classmethod def initialize(cls, doi, archive_url): @@ -734,6 +779,20 @@ def initialize(cls, doi, archive_url): return cls(doi, archive_url) + @property + def api_response(self): + """Cached API response from Zenodo""" + if self._api_response is None: + # Lazy import requests to speed up import time + import requests # pylint: disable=C0415 + + article_id = self.archive_url.split("/")[-1] + self._api_response = requests.get( + f"https://zenodo.org/api/records/{article_id}" + ).json() + + return self._api_response + def download_url(self, file_name): """ Use the repository API to get the download URL for a file given @@ -749,13 +808,7 @@ def download_url(self, file_name): download_url : str The HTTP URL that can be used to download the file. 
""" - # Lazy import requests to speed up import time - import requests # pylint: disable=C0415 - - article_id = self.archive_url.split("/")[-1] - # With the ID, we can get a list of files and their download links - article = requests.get(f"https://zenodo.org/api/records/{article_id}").json() - files = {item["key"]: item for item in article["files"]} + files = {item["key"]: item for item in self.api_response["files"]} if file_name not in files: raise ValueError( f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." @@ -763,11 +816,25 @@ def download_url(self, file_name): download_url = files[file_name]["links"]["self"] return download_url + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. + """ + + for filedata in self.api_response["files"]: + pooch.registry[filedata["key"]] = filedata["checksum"] + class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring def __init__(self, doi, archive_url): self.archive_url = archive_url self.doi = doi + self._api_response = None @classmethod def initialize(cls, doi, archive_url): @@ -795,6 +862,64 @@ def initialize(cls, doi, archive_url): return cls(doi, archive_url) + def _parse_version_from_doi(self): + """ + Parse version from the doi + + Return None if version is not available in the doi. 
+ """ + # Get suffix of the doi + _, suffix = self.doi.split("/") + # Split the suffix by dots and keep the last part + last_part = suffix.split(".")[-1] + # Parse the version from the last part + if last_part[0] != "v": + return None + version = int(last_part[1:]) + return version + + @property + def api_response(self): + """Cached API response from Figshare""" + if self._api_response is None: + # Lazy import requests to speed up import time + import requests # pylint: disable=C0415 + + # Use the figshare API to find the article ID from the DOI + article = requests.get( + f"https://api.figshare.com/v2/articles?doi={self.doi}" + ).json()[0] + article_id = article["id"] + # Parse desired version from the doi + version = self._parse_version_from_doi() + # With the ID and version, we can get a list of files and their + # download links + if version is None: + # Figshare returns the latest version available when no version + # is specified through the DOI. + warnings.warn( + f"The Figshare DOI '{self.doi}' doesn't specify which version of " + "the repository should be used. " + "Figshare will point to the latest version available.", + UserWarning, + ) + # Define API url using only the article id + # (figshare will resolve the latest version) + api_url = f"https://api.figshare.com/v2/articles/{article_id}" + else: + # Define API url using article id and the desired version + # Get list of files using article id and the version + api_url = ( + "https://api.figshare.com/v2/articles/" + f"{article_id}/versions/{version}" + ) + # Make the request and return the files in the figshare repository + response = requests.get(api_url) + response.raise_for_status() + self._api_response = response.json()["files"] + + return self._api_response + def download_url(self, file_name): """ Use the repository API to get the download URL for a file given @@ -810,20 +935,7 @@ def download_url(self, file_name): download_url : str The HTTP URL that can be used to download the file. 
""" - # Lazy import requests to speed up import time - import requests # pylint: disable=C0415 - - # Use the figshare API to find the article ID from the DOI - article = requests.get( - f"https://api.figshare.com/v2/articles?doi={self.doi}" - ).json()[0] - article_id = article["id"] - # With the ID, we can get a list of files and their download links - response = requests.get( - f"https://api.figshare.com/v2/articles/{article_id}/files" - ) - response.raise_for_status() - files = {item["name"]: item for item in response.json()} + files = {item["name"]: item for item in self.api_response} if file_name not in files: raise ValueError( f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." @@ -831,11 +943,25 @@ def download_url(self, file_name): download_url = files[file_name]["download_url"] return download_url + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. 
+ """ + + for filedata in self.api_response: + pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}" + class DataverseRepository(DataRepository): # pylint: disable=missing-class-docstring def __init__(self, doi, archive_url): self.archive_url = archive_url self.doi = doi + self._api_response = None @classmethod def initialize(cls, doi, archive_url): @@ -855,21 +981,52 @@ def initialize(cls, doi, archive_url): archive_url : str The resolved URL for the DOI """ + # Access the DOI as if this was a DataVerse instance + response = cls._get_api_response(doi, archive_url) + + # If we failed, this is probably not a DataVerse instance + if 400 <= response.status_code < 600: + return None + + # Initialize the repository and overwrite the api response + repository = cls(doi, archive_url) + repository.api_response = response + return repository + + @classmethod + def _get_api_response(cls, doi, archive_url): + """ + Perform the actual API request + + This has been separated into a separate ``classmethod``, as it can be + used prior and after the initialization. 
+ """ # Lazy import requests to speed up import time import requests # pylint: disable=C0415 - # Access the DOI as if this was a DataVerse instance parsed = parse_url(archive_url) response = requests.get( f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/" f":persistentId?persistentId=doi:{doi}" ) + return response - # If we failed, this is probably not a DataVerse instance - if 400 <= response.status_code < 600: - return None + @property + def api_response(self): + """Cached API response from a DataVerse instance""" - return cls(doi, archive_url) + if self._api_response is None: + self._api_response = self._get_api_response( + self.doi, self.archive_url + ) # pragma: no cover + + return self._api_response + + @api_response.setter + def api_response(self, response): + """Update the cached API response""" + + self._api_response = response def download_url(self, file_name): """ @@ -886,18 +1043,10 @@ def download_url(self, file_name): download_url : str The HTTP URL that can be used to download the file. """ - # Lazy import requests to speed up import time - import requests # pylint: disable=C0415 - - # Access the DOI as if this was a DataVerse instance parsed = parse_url(self.archive_url) - response = requests.get( - f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/" - f":persistentId?persistentId=doi:{self.doi}" - ) # Iterate over the given files until we find one of the requested name - for filedata in response.json()["data"]["latestVersion"]["files"]: + for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: if file_name == filedata["dataFile"]["filename"]: return ( f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" @@ -907,3 +1056,18 @@ def download_url(self, file_name): raise ValueError( f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." 
) + + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. + """ + + for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: + pooch.registry[ + filedata["dataFile"]["filename"] + ] = f"md5:{filedata['dataFile']['md5']}" diff --git a/pooch/tests/test_core.py b/pooch/tests/test_core.py index addf0fa3..0a46aea5 100644 --- a/pooch/tests/test_core.py +++ b/pooch/tests/test_core.py @@ -28,6 +28,8 @@ data_over_ftp, pooch_test_figshare_url, pooch_test_zenodo_url, + pooch_test_zenodo_with_slash_url, + pooch_test_dataverse_url, pooch_test_registry, check_tiny_data, check_large_data, @@ -40,6 +42,8 @@ BASEURL = pooch_test_url() FIGSHAREURL = pooch_test_figshare_url() ZENODOURL = pooch_test_zenodo_url() +ZENODOURL_W_SLASH = pooch_test_zenodo_with_slash_url() +DATAVERSEURL = pooch_test_dataverse_url() REGISTRY_CORRUPTED = { # The same data file but I changed the hash manually to a wrong one "tiny-data.txt": "098h0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d" @@ -135,7 +139,9 @@ def test_pooch_local(data_dir_mirror): @pytest.mark.network @pytest.mark.parametrize( - "url", [BASEURL, FIGSHAREURL, ZENODOURL], ids=["https", "figshare", "zenodo"] + "url", + [BASEURL, FIGSHAREURL, ZENODOURL, DATAVERSEURL], + ids=["https", "figshare", "zenodo", "dataverse"], ) def test_pooch_custom_url(url): "Have pooch download the file from URL that is not base_url" @@ -159,7 +165,9 @@ def test_pooch_custom_url(url): @pytest.mark.network @pytest.mark.parametrize( - "url", [BASEURL, FIGSHAREURL, ZENODOURL], ids=["https", "figshare", "zenodo"] + "url", + [BASEURL, FIGSHAREURL, ZENODOURL, DATAVERSEURL], + ids=["https", "figshare", "zenodo", "dataverse"], ) def test_pooch_download(url): "Setup a pooch that has no local data and needs to download" @@ -374,6 +382,15 @@ def test_pooch_update_disallowed_environment(): 
os.environ.pop(variable_name) +def test_pooch_create_base_url_no_trailing_slash(): + """ + Test if pooch.create appends a trailing slash to the base url if missing + """ + base_url = "https://mybase.url" + pup = create(base_url=base_url, registry=None, path=DATA_DIR) + assert pup.base_url == base_url + "/" + + @pytest.mark.network def test_pooch_corrupted(data_dir_mirror): "Raise an exception if the file hash doesn't match the registry" @@ -608,3 +625,56 @@ def test_stream_download(fname): stream_download(url, destination, known_hash, downloader, pooch=None) assert destination.exists() check_tiny_data(str(destination)) + + +@pytest.mark.parametrize( + "url", + [FIGSHAREURL, ZENODOURL, DATAVERSEURL], + ids=["figshare", "zenodo", "dataverse"], +) +def test_load_registry_from_doi(url): + """Check that the registry is correctly populated from the API""" + with TemporaryDirectory() as local_store: + path = os.path.abspath(local_store) + pup = Pooch(path=path, base_url=url) + pup.load_registry_from_doi() + + # Check the existence of all files in the registry + assert len(pup.registry) == 2 + assert "tiny-data.txt" in pup.registry + assert "store.zip" in pup.registry + + # Ensure that all files have correct checksums by fetching them + for filename in pup.registry: + pup.fetch(filename) + + +def test_load_registry_from_doi_zenodo_with_slash(): + """ + Check that the registry is correctly populated from the Zenodo API when + the filename contains a slash + """ + url = ZENODOURL_W_SLASH + with TemporaryDirectory() as local_store: + path = os.path.abspath(local_store) + pup = Pooch(path=path, base_url=url) + pup.load_registry_from_doi() + + # Check the existence of all files in the registry + assert len(pup.registry) == 1 + assert "santisoler/pooch-test-data-v1.zip" in pup.registry + + # Ensure that all files have correct checksums by fetching them + for filename in pup.registry: + pup.fetch(filename) + + +def test_wrong_load_registry_from_doi(): + """Check that non-DOI URLs 
produce an error""" + + pup = Pooch(path="", base_url=BASEURL) + + with pytest.raises(ValueError) as exc: + pup.load_registry_from_doi() + + assert "only implemented for DOIs" in str(exc.value) diff --git a/pooch/tests/test_downloaders.py b/pooch/tests/test_downloaders.py index 04c916f1..ec85f91a 100644 --- a/pooch/tests/test_downloaders.py +++ b/pooch/tests/test_downloaders.py @@ -34,6 +34,7 @@ DataverseRepository, doi_to_url, ) +from ..processors import Unzip from .utils import ( pooch_test_url, check_large_data, @@ -41,6 +42,7 @@ data_over_ftp, pooch_test_figshare_url, pooch_test_zenodo_url, + pooch_test_zenodo_with_slash_url, pooch_test_dataverse_url, ) @@ -48,6 +50,7 @@ BASEURL = pooch_test_url() FIGSHAREURL = pooch_test_figshare_url() ZENODOURL = pooch_test_zenodo_url() +ZENODOURL_W_SLASH = pooch_test_zenodo_with_slash_url() DATAVERSEURL = pooch_test_dataverse_url() @@ -132,6 +135,75 @@ def test_doi_downloader(url): check_tiny_data(outfile) +@pytest.mark.network +def test_zenodo_downloader_with_slash_in_fname(): + """ + Test the Zenodo downloader when the path contains a forward slash + + Related to issue #336 + """ + # Use the test data we have on the repository + with TemporaryDirectory() as local_store: + base_url = ZENODOURL_W_SLASH + "santisoler/pooch-test-data-v1.zip" + downloader = DOIDownloader() + outfile = os.path.join(local_store, "test-data.zip") + downloader(base_url, outfile, None) + # unpack the downloaded zip file so we can check the integrity of + # tiny-data.txt + fnames = Unzip()(outfile, action="download", pooch=None) + (fname,) = [f for f in fnames if "tiny-data.txt" in f] + check_tiny_data(fname) + + +@pytest.mark.network +def test_figshare_unspecified_version(): + """ + Test if passing a Figshare url without a version warns about it, but still + downloads it. 
+ """ + url = FIGSHAREURL + # Remove the last bits of the doi, where the version is specified and + url = url[: url.rindex(".")] + "/" + # Create expected warning message + doi = url[4:-1] + warning_msg = f"The Figshare DOI '{doi}' doesn't specify which version of " + with TemporaryDirectory() as local_store: + downloader = DOIDownloader() + outfile = os.path.join(local_store, "tiny-data.txt") + with pytest.warns(UserWarning, match=warning_msg): + downloader(url + "tiny-data.txt", outfile, None) + + +@pytest.mark.network +@pytest.mark.parametrize( + "version, missing, present", + [ + ( + 1, + "LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz", + "cropped-before.tar.gz", + ), + ( + 2, + "cropped-before.tar.gz", + "LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz", + ), + ], +) +def test_figshare_data_repository_versions(version, missing, present): + """ + Test if setting the version in Figshare DOI works as expected + """ + # Use a Figshare repo as example (we won't download files from it since + # they are too big) + doi = f"10.6084/m9.figshare.21665630.v{version}" + url = f"https://doi.org/{doi}/" + figshare = FigshareRepository(doi, url) + filenames = [item["name"] for item in figshare.api_response] + assert present in filenames + assert missing not in filenames + + @pytest.mark.network def test_ftp_downloader(ftpserver): "Test ftp downloader" diff --git a/pooch/tests/test_utils.py b/pooch/tests/test_utils.py index fadfbfe3..b364fc97 100644 --- a/pooch/tests/test_utils.py +++ b/pooch/tests/test_utils.py @@ -141,8 +141,16 @@ def mocktempfile(**kwargs): # pylint: disable=unused-argument "path": "/dike.json", }, ), + ( + r"doi:10.5281/zenodo.7632643/santisoler/pooch-test-data-v1.zip", + { + "protocol": "doi", + "netloc": "10.5281/zenodo.7632643", + "path": "/santisoler/pooch-test-data-v1.zip", + }, + ), ], - ids=["http", "ftp", "doi"], + ids=["http", "ftp", "doi", "zenodo-doi-with-slash"], ) def test_parse_url(url, output): "Parse URL into 3 
components" diff --git a/pooch/tests/utils.py b/pooch/tests/utils.py index fe719f3b..5a6ce699 100644 --- a/pooch/tests/utils.py +++ b/pooch/tests/utils.py @@ -98,6 +98,24 @@ def pooch_test_zenodo_url(): return url +def pooch_test_zenodo_with_slash_url(): + """ + Get base URL for test data in Zenodo, where the file name contains a slash + + The URL contains the DOI for the Zenodo dataset that has a slash in the + filename (created with the GitHub-Zenodo integration service), using the + appropriate version for this version of Pooch. + + Returns + ------- + url + The URL for pooch's test data. + + """ + url = "doi:10.5281/zenodo.7632643/" + return url + + def pooch_test_dataverse_url(): """ Get the base URL for the test data stored on a DataVerse instance. diff --git a/pooch/utils.py b/pooch/utils.py index bc2b4aee..13670f12 100644 --- a/pooch/utils.py +++ b/pooch/utils.py @@ -16,7 +16,7 @@ from contextlib import contextmanager import warnings -import appdirs +import platformdirs from packaging.version import Version @@ -74,10 +74,10 @@ def os_cache(project): r""" Default cache location based on the operating system. - The folder locations are defined by the ``appdirs`` package + The folder locations are defined by the ``platformdirs`` package using the ``user_cache_dir`` function. Usually, the locations will be following (see the - `appdirs documentation `__): + `platformdirs documentation `__): * Mac: ``~/Library/Caches/`` * Unix: ``~/.cache/`` or the value of the ``XDG_CACHE_HOME`` @@ -96,7 +96,7 @@ def os_cache(project): not expanded. """ - return Path(appdirs.user_cache_dir(project)) + return Path(platformdirs.user_cache_dir(project)) def check_version(version, fallback="master"): @@ -159,6 +159,11 @@ def parse_url(url): The DOI is a special case. The protocol will be "doi", the netloc will be the DOI, and the path is what comes after the last "/". 
+ The only exception are Zenodo dois: the protocol will be "doi", the netloc + will be composed by the "prefix/suffix" and the path is what comes after + the second "/". This allows to support special cases of Zenodo dois where + the path contains forward slashes "/", created by the GitHub-Zenodo + integration service. Parameters ---------- @@ -179,8 +184,12 @@ def parse_url(url): if url.startswith("doi:"): protocol = "doi" parts = url[4:].split("/") - netloc = "/".join(parts[:-1]) - path = "/" + parts[-1] + if "zenodo" in parts[1].lower(): + netloc = "/".join(parts[:2]) + path = "/" + "/".join(parts[2:]) + else: + netloc = "/".join(parts[:-1]) + path = "/" + parts[-1] else: parsed_url = urlsplit(url) protocol = parsed_url.scheme or "file" diff --git a/setup.cfg b/setup.cfg index b9fea6ae..aacdc661 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,6 +27,7 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 url = https://github.com/fatiando/pooch project_urls = Documentation = https://www.fatiando.org/pooch @@ -41,7 +42,7 @@ packages = find: python_requires = >=3.7 setup_requires = install_requires = - appdirs>=1.3.0 + platformdirs>=2.5.0 packaging>=20.0 requests>=2.19.0