diff --git a/latest b/latest
index 4e2cea3b..3f0882c5 120000
--- a/latest
+++ b/latest
@@ -1 +1 @@
-v1.8.0
\ No newline at end of file
+v1.8.1
\ No newline at end of file
diff --git a/v1.8.1/.buildinfo b/v1.8.1/.buildinfo
new file mode 100644
index 00000000..428edd83
--- /dev/null
+++ b/v1.8.1/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 8f23f83272046b59dd86c0692981ccde
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/v1.8.1/_modules/index.html b/v1.8.1/_modules/index.html
new file mode 100644
index 00000000..587cbc92
--- /dev/null
+++ b/v1.8.1/_modules/index.html
@@ -0,0 +1,447 @@
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+# pylint: disable=missing-docstring,import-outside-toplevel,import-self
+#
+# Import functions/classes to make the API
+from .core import Pooch, create, retrieve
+from .utils import os_cache, check_version, get_logger
+from .hashes import file_hash, make_registry
+from .downloaders import (
+ HTTPDownloader,
+ FTPDownloader,
+ SFTPDownloader,
+ DOIDownloader,
+)
+from .processors import Unzip, Untar, Decompress
+
+# This file is generated automatically by setuptools_scm
+from . import _version
+
+
+# Add a "v" to the version number
+__version__ = f"v{_version.version}"
+
+
+
+def test(doctest=True, verbose=True, coverage=False):
+ """
+ Run the test suite.
+
+ Uses `py.test <http://pytest.org/>`__ to discover and run the tests.
+
+ Parameters
+ ----------
+
+ doctest : bool
+ If ``True``, will run the doctests as well (code examples that start
+ with a ``>>>`` in the docs).
+ verbose : bool
+ If ``True``, will print extra information during the test run.
+ coverage : bool
+ If ``True``, will run test coverage analysis on the code as well.
+ Requires ``pytest-cov``.
+
+ Raises
+ ------
+
+ AssertionError
+ If pytest returns a non-zero error code indicating that some tests have
+ failed.
+
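+    Examples
+    --------
+    A minimal sketch of a typical call (not run here because it would start
+    a full pytest session):
+
+    >>> import pooch
+    >>> pooch.test(doctest=False, coverage=False)  # doctest: +SKIP
+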
+ """
+ import pytest
+
+ package = __name__
+ args = []
+ if verbose:
+ args.append("-vv")
+ if coverage:
+ args.append(f"--cov={package}")
+ args.append("--cov-report=term-missing")
+ if doctest:
+ args.append("--doctest-modules")
+ args.append("--pyargs")
+ args.append(package)
+ status = pytest.main(args)
+ assert status == 0, "Some tests have failed."
+
+
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+"""
+The main Pooch class and a factory function for it.
+"""
+import os
+import time
+import contextlib
+from pathlib import Path
+import shlex
+import shutil
+
+
+from .hashes import hash_matches, file_hash
+from .utils import (
+ check_version,
+ get_logger,
+ make_local_storage,
+ cache_location,
+ temporary_file,
+ os_cache,
+ unique_file_name,
+)
+from .downloaders import DOIDownloader, choose_downloader, doi_to_repository
+
+
+
+def retrieve(
+ url,
+ known_hash,
+ fname=None,
+ path=None,
+ processor=None,
+ downloader=None,
+ progressbar=False,
+):
+ """
+ Download and cache a single file locally.
+
+ Uses HTTP or FTP by default, depending on the protocol in the given *url*.
+ Other download methods can be controlled through the *downloader* argument
+ (see below).
+
+ The file will be downloaded to a temporary location first and its hash will
+ be compared to the given *known_hash*. This is done to ensure that the
+ download happened correctly and securely. If the hash doesn't match, the
+ file will be deleted and an exception will be raised.
+
+ If the file already exists locally, its hash will be compared to
+ *known_hash*. If they are not the same, this is interpreted as the file
+ needing to be updated and it will be downloaded again.
+
+ You can bypass these checks by passing ``known_hash=None``. If this is
+ done, the SHA256 hash of the downloaded file will be logged to the screen.
+ It is highly recommended that you copy and paste this hash as *known_hash*
+ so that future downloads are guaranteed to be the exact same file. This is
+ crucial for reproducible computations.
+
+ If the file exists in the given *path* with the given *fname* and the hash
+ matches, it will not be downloaded and the absolute path to the file will
+ be returned.
+
+ .. note::
+
+ This function is meant for downloading single files. If you need to
+ manage the download and caching of several files, with versioning, use
+ :func:`pooch.create` and :class:`pooch.Pooch` instead.
+
+ Parameters
+ ----------
+ url : str
+ The URL to the file that is to be downloaded. Ideally, the URL should
+ end in a file name.
+ known_hash : str or None
+ A known hash (checksum) of the file. Will be used to verify the
+ download or check if an existing file needs to be updated. By default,
+ will assume it's a SHA256 hash. To specify a different hashing method,
+ prepend the hash with ``algorithm:``, for example
+ ``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``. If
+ None, will NOT check the hash of the downloaded file or check if an
+ existing file needs to be updated.
+ fname : str or None
+ The name that will be used to save the file. Should NOT include the
+ full path, just the file name (it will be appended to *path*). If
+ None, will create a unique file name using a combination of the last
+ part of the URL (assuming it's the file name) and the MD5 hash of the
+ URL. For example, ``81whdo2d2e928yd1wi22-data-file.csv``. This ensures
+ that files from different URLs never overwrite each other, even if they
+ have the same name.
+ path : str or PathLike or None
+ The location of the cache folder on disk. This is where the file will
+ be saved. If None, will save to a ``pooch`` folder in the default cache
+ location for your operating system (see :func:`pooch.os_cache`).
+ processor : None or callable
+ If not None, then a function (or callable object) that will be called
+ before returning the full path and after the file has been downloaded
+ (if required). See :ref:`processors` for details.
+ downloader : None or callable
+ If not None, then a function (or callable object) that will be called
+ to download a given URL to a provided local file name. See
+ :ref:`downloaders` for details.
+ progressbar : bool or an arbitrary progress bar object
+ If True, will print a progress bar of the download to standard error
+ (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+ installed. Alternatively, an arbitrary progress bar object can be
+ passed. See :ref:`custom-progressbar` for details.
+
+ Returns
+ -------
+ full_path : str
+ The absolute path (including the file name) of the file in the local
+ storage.
+
+ Examples
+ --------
+
+ Download one of the data files from the Pooch repository on GitHub:
+
+ >>> import os
+ >>> from pooch import __version__, check_version, retrieve
+ >>> # Make a URL for the version of pooch we have installed
+ >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
+ >>> url = url.format(check_version(__version__, fallback="main"))
+ >>> # Download the file and save it locally. Will check the MD5 checksum of
+ >>> # the downloaded file against the given value to make sure it's the
+ >>> # right file. You can use other hashes by specifying different
+ >>> # algorithm names (sha256, sha1, etc).
+ >>> fname = retrieve(
+ ... url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
+ ... )
+ >>> with open(fname) as f:
+ ... print(f.read().strip())
+ # A tiny data file for test purposes only
+ 1 2 3 4 5 6
+ >>> # Running again won't trigger a download and only return the path to
+ >>> # the existing file.
+ >>> fname2 = retrieve(
+ ... url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
+ ... )
+ >>> print(fname2 == fname)
+ True
+ >>> os.remove(fname)
+
+ Files that are compressed with gzip, xz/lzma, or bzip2 can be automatically
+    decompressed by using the :class:`pooch.Decompress` processor:
+
+ >>> from pooch import Decompress
+ >>> # URLs to a gzip compressed version of the data file.
+ >>> url = ("https://github.com/fatiando/pooch/raw/{}/"
+ ... + "pooch/tests/data/tiny-data.txt.gz")
+ >>> url = url.format(check_version(__version__, fallback="main"))
+ >>> # By default, you would have to decompress the file yourself
+ >>> fname = retrieve(
+ ... url,
+ ... known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
+ ... )
+ >>> print(os.path.splitext(fname)[1])
+ .gz
+ >>> # Use the processor to decompress after download automatically and
+ >>> # return the path to the decompressed file instead.
+ >>> fname2 = retrieve(
+ ... url,
+ ... known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
+ ... processor=Decompress(),
+ ... )
+ >>> print(fname2 == fname)
+ False
+ >>> with open(fname2) as f:
+ ... print(f.read().strip())
+ # A tiny data file for test purposes only
+ 1 2 3 4 5 6
+ >>> os.remove(fname)
+ >>> os.remove(fname2)
+
+ When downloading archives (zip or tar), it can be useful to unpack them
+ after download to avoid having to do that yourself. Use the processors
+ :class:`pooch.Unzip` or :class:`pooch.Untar` to do this automatically:
+
+ >>> from pooch import Unzip
+ >>> # URLs to a zip archive with a single data file.
+ >>> url = ("https://github.com/fatiando/pooch/raw/{}/"
+ ... + "pooch/tests/data/tiny-data.zip")
+ >>> url = url.format(check_version(__version__, fallback="main"))
+ >>> # By default, you would get the path to the archive
+ >>> fname = retrieve(
+ ... url,
+ ... known_hash="md5:e9592cb46cf3514a1079051f8a148148",
+ ... )
+ >>> print(os.path.splitext(fname)[1])
+ .zip
+ >>> os.remove(fname)
+ >>> # Using the processor, the archive will be unzipped and a list with the
+ >>> # path to every file will be returned instead of a single path.
+ >>> fnames = retrieve(
+ ... url,
+ ... known_hash="md5:e9592cb46cf3514a1079051f8a148148",
+ ... processor=Unzip(),
+ ... )
+ >>> # There was only a single file in our archive.
+ >>> print(len(fnames))
+ 1
+ >>> with open(fnames[0]) as f:
+ ... print(f.read().strip())
+ # A tiny data file for test purposes only
+ 1 2 3 4 5 6
+ >>> for f in fnames:
+ ... os.remove(f)
+
+
+ """
+ if path is None:
+ path = os_cache("pooch")
+ if fname is None:
+ fname = unique_file_name(url)
+ # Make the path absolute.
+ path = cache_location(path, env=None, version=None)
+
+ full_path = path.resolve() / fname
+ action, verb = download_action(full_path, known_hash)
+
+ if action in ("download", "update"):
+ # We need to write data, so create the local data directory if it
+ # doesn't already exist.
+ make_local_storage(path)
+
+ get_logger().info(
+ "%s data from '%s' to file '%s'.",
+ verb,
+ url,
+ str(full_path),
+ )
+
+ if downloader is None:
+ downloader = choose_downloader(url, progressbar=progressbar)
+
+ stream_download(url, full_path, known_hash, downloader, pooch=None)
+
+ if known_hash is None:
+ get_logger().info(
+ "SHA256 hash of downloaded file: %s\n"
+ "Use this value as the 'known_hash' argument of 'pooch.retrieve'"
+ " to ensure that the file hasn't changed if it is downloaded again"
+ " in the future.",
+ file_hash(str(full_path)),
+ )
+
+ if processor is not None:
+ return processor(str(full_path), action, None)
+
+ return str(full_path)
+
+
+
+
+def create(
+ path,
+ base_url,
+ version=None,
+ version_dev="master",
+ env=None,
+ registry=None,
+ urls=None,
+ retry_if_failed=0,
+ allow_updates=True,
+):
+ """
+ Create a :class:`~pooch.Pooch` with sensible defaults to fetch data files.
+
+ If a version string is given, the Pooch will be versioned, meaning that the
+ local storage folder and the base URL depend on the project version. This
+ is necessary if your users have multiple versions of your library installed
+ (using virtual environments) and you updated the data files between
+    versions. Otherwise, every time a user switched environments, a
+    re-download of the data would be triggered. The version string will be
+    appended to the local
+ storage path (for example, ``~/.mypooch/cache/v0.1``) and inserted into the
+ base URL (for example,
+ ``https://github.com/fatiando/pooch/raw/v0.1/data``). If the version string
+ contains ``+XX.XXXXX``, it will be interpreted as a development version.
+
+ Does **not** create the local data storage folder. The folder will only be
+ created the first time a download is attempted with
+ :meth:`pooch.Pooch.fetch`. This makes it safe to use this function at the
+ module level (so it's executed on ``import`` and the resulting
+ :class:`~pooch.Pooch` is a global variable).
+
+ Parameters
+ ----------
+ path : str, PathLike, list or tuple
+ The path to the local data storage folder. If this is a list or tuple,
+ we'll join the parts with the appropriate separator. The *version* will
+ be appended to the end of this path. Use :func:`pooch.os_cache` for a
+ sensible default.
+ base_url : str
+ Base URL for the remote data source. All requests will be made relative
+ to this URL. The string should have a ``{version}`` formatting mark in
+ it. We will call ``.format(version=version)`` on this string. If the
+ URL does not end in a ``'/'``, a trailing ``'/'`` will be added
+ automatically.
+ version : str or None
+ The version string for your project. Should be PEP440 compatible. If
+ None is given, will not attempt to format *base_url* and no subfolder
+ will be appended to *path*.
+ version_dev : str
+ The name used for the development version of a project. If your data is
+        hosted on GitHub (and *base_url* is a GitHub raw link), then
+ ``"master"`` is a good choice (default). Ignored if *version* is None.
+ env : str or None
+ An environment variable that can be used to overwrite *path*. This
+ allows users to control where they want the data to be stored. We'll
+ append *version* to the end of this value as well.
+ registry : dict or None
+ A record of the files that are managed by this Pooch. Keys should be
+ the file names and the values should be their hashes. Only files
+ in the registry can be fetched from the local storage. Files in
+ subdirectories of *path* **must use Unix-style separators** (``'/'``)
+ even on Windows.
+ urls : dict or None
+ Custom URLs for downloading individual files in the registry. A
+ dictionary with the file names as keys and the custom URLs as values.
+ Not all files in *registry* need an entry in *urls*. If a file has an
+ entry in *urls*, the *base_url* will be ignored when downloading it in
+ favor of ``urls[fname]``.
+ retry_if_failed : int
+ Retry a file download the specified number of times if it fails because
+ of a bad connection or a hash mismatch. By default, downloads are only
+ attempted once (``retry_if_failed=0``). Initially, will wait for 1s
+ between retries and then increase the wait time by 1s with each retry
+ until a maximum of 10s.
+ allow_updates : bool or str
+ Whether existing files in local storage that have a hash mismatch with
+ the registry are allowed to update from the remote URL. If a string is
+ passed, we will assume it's the name of an environment variable that
+ will be checked for the true/false value. If ``False``, any mismatch
+ with hashes in the registry will result in an error. Defaults to
+ ``True``.
+
+ Returns
+ -------
+ pooch : :class:`~pooch.Pooch`
+ The :class:`~pooch.Pooch` initialized with the given arguments.
+
+ Examples
+ --------
+
+ Create a :class:`~pooch.Pooch` for a release (v0.1):
+
+ >>> pup = create(path="myproject",
+ ... base_url="http://some.link.com/{version}/",
+ ... version="v0.1",
+ ... registry={"data.txt": "9081wo2eb2gc0u..."})
+ >>> print(pup.path.parts) # The path is a pathlib.Path
+ ('myproject', 'v0.1')
+ >>> # The local folder is only created when a dataset is first downloaded
+ >>> print(pup.path.exists())
+ False
+ >>> print(pup.base_url)
+ http://some.link.com/v0.1/
+ >>> print(pup.registry)
+ {'data.txt': '9081wo2eb2gc0u...'}
+ >>> print(pup.registry_files)
+ ['data.txt']
+
+ If this is a development version (12 commits ahead of v0.1), then the
+ ``version_dev`` will be used (defaults to ``"master"``):
+
+ >>> pup = create(path="myproject",
+ ... base_url="http://some.link.com/{version}/",
+ ... version="v0.1+12.do9iwd")
+ >>> print(pup.path.parts)
+ ('myproject', 'master')
+ >>> print(pup.base_url)
+ http://some.link.com/master/
+
+ Versioning is optional (but highly encouraged):
+
+ >>> pup = create(path="myproject",
+ ... base_url="http://some.link.com/",
+ ... registry={"data.txt": "9081wo2eb2gc0u..."})
+ >>> print(pup.path.parts) # The path is a pathlib.Path
+ ('myproject',)
+ >>> print(pup.base_url)
+ http://some.link.com/
+
+ To place the storage folder at a subdirectory, pass in a list and we'll
+ join the path for you using the appropriate separator for your operating
+ system:
+
+ >>> pup = create(path=["myproject", "cache", "data"],
+ ... base_url="http://some.link.com/{version}/",
+ ... version="v0.1")
+ >>> print(pup.path.parts)
+ ('myproject', 'cache', 'data', 'v0.1')
+
+ The user can overwrite the storage path by setting an environment variable:
+
+ >>> # The variable is not set so we'll use *path*
+ >>> pup = create(path=["myproject", "not_from_env"],
+ ... base_url="http://some.link.com/{version}/",
+ ... version="v0.1",
+ ... env="MYPROJECT_DATA_DIR")
+ >>> print(pup.path.parts)
+ ('myproject', 'not_from_env', 'v0.1')
+ >>> # Set the environment variable and try again
+ >>> import os
+ >>> os.environ["MYPROJECT_DATA_DIR"] = os.path.join("myproject", "env")
+ >>> pup = create(path=["myproject", "not_env"],
+ ... base_url="http://some.link.com/{version}/",
+ ... version="v0.1",
+ ... env="MYPROJECT_DATA_DIR")
+ >>> print(pup.path.parts)
+ ('myproject', 'env', 'v0.1')
+
+ """
+ if version is not None:
+ version = check_version(version, fallback=version_dev)
+ base_url = base_url.format(version=version)
+ # Don't create the cache folder here! This function is usually called in
+ # the module context (at import time), so touching the file system is not
+ # recommended. It could cause crashes when multiple processes/threads try
+ # to import at the same time (which would try to create the folder several
+ # times at once).
+ path = cache_location(path, env, version)
+ if isinstance(allow_updates, str):
+ allow_updates = os.environ.get(allow_updates, "true").lower() != "false"
+ # add trailing "/"
+ base_url = base_url.rstrip("/") + "/"
+ pup = Pooch(
+ path=path,
+ base_url=base_url,
+ registry=registry,
+ urls=urls,
+ retry_if_failed=retry_if_failed,
+ allow_updates=allow_updates,
+ )
+ return pup
+
+
+
+
+class Pooch:
+ """
+ Manager for a local data storage that can fetch from a remote source.
+
+ Avoid creating ``Pooch`` instances directly. Use :func:`pooch.create`
+ instead.
+
+ Parameters
+ ----------
+ path : str
+ The path to the local data storage folder. The path must exist in the
+ file system.
+ base_url : str
+ Base URL for the remote data source. All requests will be made relative
+ to this URL.
+ registry : dict or None
+ A record of the files that are managed by this good boy. Keys should be
+ the file names and the values should be their hashes. Only files
+ in the registry can be fetched from the local storage. Files in
+ subdirectories of *path* **must use Unix-style separators** (``'/'``)
+ even on Windows.
+ urls : dict or None
+ Custom URLs for downloading individual files in the registry. A
+ dictionary with the file names as keys and the custom URLs as values.
+ Not all files in *registry* need an entry in *urls*. If a file has an
+ entry in *urls*, the *base_url* will be ignored when downloading it in
+ favor of ``urls[fname]``.
+ retry_if_failed : int
+ Retry a file download the specified number of times if it fails because
+ of a bad connection or a hash mismatch. By default, downloads are only
+ attempted once (``retry_if_failed=0``). Initially, will wait for 1s
+ between retries and then increase the wait time by 1s with each retry
+ until a maximum of 10s.
+ allow_updates : bool
+ Whether existing files in local storage that have a hash mismatch with
+ the registry are allowed to update from the remote URL. If ``False``,
+ any mismatch with hashes in the registry will result in an error.
+ Defaults to ``True``.
+
+ """
+
+ def __init__(
+ self,
+ path,
+ base_url,
+ registry=None,
+ urls=None,
+ retry_if_failed=0,
+ allow_updates=True,
+ ):
+ self.path = path
+ self.base_url = base_url
+ if registry is None:
+ registry = {}
+ self.registry = registry
+ if urls is None:
+ urls = {}
+ self.urls = dict(urls)
+ self.retry_if_failed = retry_if_failed
+ self.allow_updates = allow_updates
+
+ @property
+ def abspath(self):
+ "Absolute path to the local storage"
+ return Path(os.path.abspath(os.path.expanduser(str(self.path))))
+
+ @property
+ def registry_files(self):
+ "List of file names on the registry"
+ return list(self.registry)
+
+
+ def fetch(self, fname, processor=None, downloader=None, progressbar=False):
+ """
+ Get the absolute path to a file in the local storage.
+
+ If it's not in the local storage, it will be downloaded. If the hash of
+ the file in local storage doesn't match the one in the registry, will
+ download a new copy of the file. This is considered a sign that the
+ file was updated in the remote storage. If the hash of the downloaded
+ file still doesn't match the one in the registry, will raise an
+ exception to warn of possible file corruption.
+
+ Post-processing actions sometimes need to be taken on downloaded files
+ (unzipping, conversion to a more efficient format, etc). If these
+ actions are time or memory consuming, it would be best to do this only
+ once right after the file is downloaded. Use the *processor* argument
+ to specify a function that is executed after the download to perform
+ these actions. See :ref:`processors` for details.
+
+ Custom file downloaders can be provided through the *downloader*
+ argument. By default, Pooch will determine the download protocol from
+ the URL in the registry. If the server for a given file requires
+        authentication (username and password), use a downloader that supports
+ these features. Downloaders can also be used to print custom messages
+ (like a progress bar), etc. See :ref:`downloaders` for details.
+
+ Parameters
+ ----------
+ fname : str
+ The file name (relative to the *base_url* of the remote data
+ storage) to fetch from the local storage.
+ processor : None or callable
+ If not None, then a function (or callable object) that will be
+ called before returning the full path and after the file has been
+ downloaded. See :ref:`processors` for details.
+ downloader : None or callable
+ If not None, then a function (or callable object) that will be
+ called to download a given URL to a provided local file name. See
+ :ref:`downloaders` for details.
+ progressbar : bool or an arbitrary progress bar object
+ If True, will print a progress bar of the download to standard
+ error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
+ be installed. Alternatively, an arbitrary progress bar object can
+ be passed. See :ref:`custom-progressbar` for details.
+
+ Returns
+ -------
+ full_path : str
+ The absolute path (including the file name) of the file in the
+ local storage.
+
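+        Examples
+        --------
+        A minimal sketch (the URL and registry hash are hypothetical, so the
+        download itself is not run here):
+
+        >>> pup = create(path="myproject",
+        ...              base_url="http://some.link.com/",
+        ...              registry={"data.txt": "9081wo2eb2gc0u..."})
+        >>> fname = pup.fetch("data.txt")  # doctest: +SKIP
+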
+ """
+ self._assert_file_in_registry(fname)
+
+ url = self.get_url(fname)
+ full_path = self.abspath / fname
+ known_hash = self.registry[fname]
+ action, verb = download_action(full_path, known_hash)
+
+ if action == "update" and not self.allow_updates:
+ raise ValueError(
+ f"{fname} needs to update {full_path} but updates are disallowed."
+ )
+
+ if action in ("download", "update"):
+ # We need to write data, so create the local data directory if it
+ # doesn't already exist.
+ make_local_storage(str(self.abspath))
+
+ get_logger().info(
+ "%s file '%s' from '%s' to '%s'.",
+ verb,
+ fname,
+ url,
+ str(self.abspath),
+ )
+
+ if downloader is None:
+ downloader = choose_downloader(url, progressbar=progressbar)
+
+ stream_download(
+ url,
+ full_path,
+ known_hash,
+ downloader,
+ pooch=self,
+ retry_if_failed=self.retry_if_failed,
+ )
+
+ if processor is not None:
+ return processor(str(full_path), action, self)
+
+ return str(full_path)
+
+
+ def _assert_file_in_registry(self, fname):
+ """
+ Check if a file is in the registry and raise :class:`ValueError` if
+ it's not.
+ """
+ if fname not in self.registry:
+ raise ValueError(f"File '{fname}' is not in the registry.")
+
+
+ def get_url(self, fname):
+ """
+ Get the full URL to download a file in the registry.
+
+ Parameters
+ ----------
+ fname : str
+ The file name (relative to the *base_url* of the remote data
+ storage) to fetch from the local storage.
+
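+        Returns
+        -------
+        url : str
+        The URL used to download *fname*: either the custom URL from
+        *urls* or *base_url* plus the file name.
+
+        Examples
+        --------
+        A minimal sketch (the URL and registry hash are hypothetical):
+
+        >>> pup = Pooch(path="myproject", base_url="http://some.link.com/",
+        ...             registry={"data.txt": "9081wo2eb2gc0u..."})
+        >>> print(pup.get_url("data.txt"))
+        http://some.link.com/data.txt
+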
+ """
+ self._assert_file_in_registry(fname)
+ return self.urls.get(fname, "".join([self.base_url, fname]))
+
+
+
+ def load_registry(self, fname):
+ """
+ Load entries from a file and add them to the registry.
+
+ Use this if you are managing many files.
+
+        Each line of the file should have a file name and its hash separated
+        by a space. The hash can specify the checksum algorithm using the
+        ``alg:hash`` format; if no algorithm is provided, SHA256 is used by
+        default.
+ Only one file per line is allowed. Custom download URLs for individual
+ files can be specified as a third element on the line. Line comments
+ can be added and must be prepended with ``#``.
+
+ Parameters
+ ----------
+ fname : str | fileobj
+ Path (or open file object) to the registry file.
+
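+        Examples
+        --------
+        A sketch of a registry file (the hashes are taken from the examples
+        elsewhere in this module; the custom URL is hypothetical)::
+
+            # Lines starting with '#' are comments
+            tiny-data.txt md5:70e2afd3fd7e336ae478b1e740a5f08e
+            data/other.csv md5:8812ba10b6c7778014fdae81b03f9def https://example.org/other.csv
+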
+ """
+ with contextlib.ExitStack() as stack:
+ if hasattr(fname, "read"):
+ # It's a file object
+ fin = fname
+ else:
+ # It's a file path
+ fin = stack.enter_context(open(fname, encoding="utf-8"))
+
+ for linenum, line in enumerate(fin):
+ if isinstance(line, bytes):
+ line = line.decode("utf-8")
+
+ line = line.strip()
+ # skip line comments
+ if line.startswith("#"):
+ continue
+
+ elements = shlex.split(line)
+                if len(elements) not in (0, 2, 3):
+ raise OSError(
+ f"Invalid entry in Pooch registry file '{fname}': "
+ f"expected 2 or 3 elements in line {linenum + 1} but got "
+ f"{len(elements)}. Offending entry: '{line}'"
+ )
+ if elements:
+ file_name = elements[0]
+ file_checksum = elements[1]
+ if len(elements) == 3:
+ file_url = elements[2]
+ self.urls[file_name] = file_url
+ self.registry[file_name] = file_checksum.lower()
+
+
+
+ def load_registry_from_doi(self):
+ """
+ Populate the registry using the data repository API
+
+ Fill the registry with all the files available in the data repository,
+ along with their hashes. It will make a request to the data repository
+ API to retrieve this information. No file is downloaded during this
+ process.
+
+ .. important::
+
+ This method is intended to be used only when the ``base_url`` is
+ a DOI.
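+
+        A hedged sketch (the DOI is the Zenodo archive used in the downloader
+        examples; requires network access, so it is not run here)::
+
+            pup = create(path="myproject",
+                         base_url="doi:10.5281/zenodo.4924875/",
+                         registry=None)
+            pup.load_registry_from_doi()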
+ """
+
+ # Ensure that this is indeed a DOI-based pooch
+ downloader = choose_downloader(self.base_url)
+ if not isinstance(downloader, DOIDownloader):
+ raise ValueError(
+ f"Invalid base_url '{self.base_url}': "
+ + "Pooch.load_registry_from_doi is only implemented for DOIs"
+ )
+
+ # Create a repository instance
+ doi = self.base_url.replace("doi:", "")
+ repository = doi_to_repository(doi)
+
+ # Call registry population for this repository
+ return repository.populate_registry(self)
+
+
+
+ def is_available(self, fname, downloader=None):
+ """
+ Check availability of a remote file without downloading it.
+
+ Use this method when working with large files to check if they are
+ available for download.
+
+ Parameters
+ ----------
+ fname : str
+ The file name (relative to the *base_url* of the remote data
+ storage).
+ downloader : None or callable
+ If not None, then a function (or callable object) that will be
+ called to check the availability of the file on the server. See
+ :ref:`downloaders` for details.
+
+ Returns
+ -------
+ status : bool
+ True if the file is available for download. False otherwise.
+
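+        Examples
+        --------
+        A hedged sketch (uses the GitHub raw URL and hash from the
+        ``retrieve`` examples; requires network access, so it is not run
+        here)::
+
+            pup = create(path="myproject",
+                         base_url="https://github.com/fatiando/pooch/raw/main/data/",
+                         registry={"tiny-data.txt": "md5:70e2afd3fd7e336ae478b1e740a5f08e"})
+            status = pup.is_available("tiny-data.txt")
+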
+ """
+ self._assert_file_in_registry(fname)
+ url = self.get_url(fname)
+ if downloader is None:
+ downloader = choose_downloader(url)
+ try:
+ available = downloader(url, None, self, check_only=True)
+ except TypeError as error:
+ error_msg = (
+ f"Downloader '{str(downloader)}' does not support availability checks."
+ )
+ raise NotImplementedError(error_msg) from error
+ return available
+
+
+
+
+def download_action(path, known_hash):
+ """
+ Determine the action that is needed to get the file on disk.
+
+ Parameters
+ ----------
+ path : PathLike
+ The path to the file on disk.
+ known_hash : str
+ A known hash (checksum) of the file. Will be used to verify the
+ download or check if an existing file needs to be updated. By default,
+ will assume it's a SHA256 hash. To specify a different hashing method,
+ prepend the hash with ``algorithm:``, for example
+ ``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``.
+
+ Returns
+ -------
+ action, verb : str
+ The action that must be taken and the English verb (infinitive form of
+        *action*) used in the log:
+
+        * ``'download'``: File does not exist locally and must be downloaded.
+        * ``'update'``: File exists locally but needs to be updated.
+        * ``'fetch'``: File exists locally and only its path needs to be
+          returned.
+
+
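+    Examples
+    --------
+    A file that doesn't exist locally must be downloaded, regardless of the
+    hash:
+
+    >>> from pathlib import Path
+    >>> download_action(Path("file-that-does-not-exist.txt"), None)
+    ('download', 'Downloading')
+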
+ """
+ if not path.exists():
+ action = "download"
+ verb = "Downloading"
+ elif not hash_matches(str(path), known_hash):
+ action = "update"
+ verb = "Updating"
+ else:
+ action = "fetch"
+ verb = "Fetching"
+ return action, verb
+
+
+def stream_download(url, fname, known_hash, downloader, pooch=None, retry_if_failed=0):
+ """
+ Stream the file and check that its hash matches the known one.
+
+ The file is first downloaded to a temporary file name in the cache folder.
+ It will be moved to the desired file name only if the hash matches the
+ known hash. Otherwise, the temporary file is deleted.
+
+ If the download fails for either a bad connection or a hash mismatch, we
+ will retry the download the specified number of times in case the failure
+ was due to a network error.
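+
+    A hedged sketch of calling this directly (it is normally invoked through
+    :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`)::
+
+        from pathlib import Path
+        downloader = choose_downloader(url, progressbar=False)
+        stream_download(url, Path("data.txt"), known_hash=None,
+                        downloader=downloader)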
+ """
+ # Lazy import requests to speed up import time
+ import requests.exceptions # pylint: disable=C0415
+
+ # Ensure the parent directory exists in case the file is in a subdirectory.
+ # Otherwise, move will cause an error.
+ if not fname.parent.exists():
+ os.makedirs(str(fname.parent))
+ download_attempts = 1 + retry_if_failed
+ max_wait = 10
+ for i in range(download_attempts):
+ try:
+ # Stream the file to a temporary so that we can safely check its
+ # hash before overwriting the original.
+ with temporary_file(path=str(fname.parent)) as tmp:
+ downloader(url, tmp, pooch)
+ hash_matches(tmp, known_hash, strict=True, source=str(fname.name))
+ shutil.move(tmp, str(fname))
+ break
+ except (ValueError, requests.exceptions.RequestException):
+ if i == download_attempts - 1:
+ raise
+ retries_left = download_attempts - (i + 1)
+ get_logger().info(
+ "Failed to download '%s'. "
+ "Will attempt the download again %d more time%s.",
+ str(fname.name),
+ retries_left,
+ "s" if retries_left > 1 else "",
+ )
+ time.sleep(min(i + 1, max_wait))
+
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+"""
+The classes that actually handle the downloads.
+"""
+import os
+import sys
+import ftplib
+
+import warnings
+
+from .utils import parse_url
+
+try:
+ from tqdm import tqdm
+except ImportError:
+ tqdm = None
+
+try:
+ import paramiko
+except ImportError:
+ paramiko = None
+
+
+def choose_downloader(url, progressbar=False):
+ """
+ Choose the appropriate downloader for the given URL based on the protocol.
+
+ Parameters
+ ----------
+ url : str
+ A URL (including protocol).
+ progressbar : bool or an arbitrary progress bar object
+ If True, will print a progress bar of the download to standard error
+ (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+ installed. Alternatively, an arbitrary progress bar object can be
+ passed. See :ref:`custom-progressbar` for details.
+
+ Returns
+ -------
+    downloader
+        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
+        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`.
+
+ Examples
+ --------
+
+ >>> downloader = choose_downloader("http://something.com")
+ >>> print(downloader.__class__.__name__)
+ HTTPDownloader
+ >>> downloader = choose_downloader("https://something.com")
+ >>> print(downloader.__class__.__name__)
+ HTTPDownloader
+ >>> downloader = choose_downloader("ftp://something.com")
+ >>> print(downloader.__class__.__name__)
+ FTPDownloader
+ >>> downloader = choose_downloader("doi:DOI/filename.csv")
+ >>> print(downloader.__class__.__name__)
+ DOIDownloader
+
+ """
+ known_downloaders = {
+ "ftp": FTPDownloader,
+ "https": HTTPDownloader,
+ "http": HTTPDownloader,
+ "sftp": SFTPDownloader,
+ "doi": DOIDownloader,
+ }
+
+ parsed_url = parse_url(url)
+ if parsed_url["protocol"] not in known_downloaders:
+ raise ValueError(
+ f"Unrecognized URL protocol '{parsed_url['protocol']}' in '{url}'. "
+ f"Must be one of {known_downloaders.keys()}."
+ )
+ downloader = known_downloaders[parsed_url["protocol"]](progressbar=progressbar)
+ return downloader
+
+
+
+class HTTPDownloader: # pylint: disable=too-few-public-methods
+ """
+ Download manager for fetching files over HTTP/HTTPS.
+
+ When called, downloads the given file URL into the specified local file.
+ Uses the :mod:`requests` library to manage downloads.
+
+ Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
+ the download of files (for example, to use authentication or print a
+ progress bar).
+
+ Parameters
+ ----------
+ progressbar : bool or an arbitrary progress bar object
+ If True, will print a progress bar of the download to standard error
+ (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+ installed. Alternatively, an arbitrary progress bar object can be
+ passed. See :ref:`custom-progressbar` for details.
+ chunk_size : int
+ Files are streamed *chunk_size* bytes at a time instead of loading
+        everything into memory at once. Usually doesn't need to be changed.
+ **kwargs
+ All keyword arguments given when creating an instance of this class
+ will be passed to :func:`requests.get`.
+
+ Examples
+ --------
+
+ Download one of the data files from the Pooch repository:
+
+ >>> import os
+ >>> from pooch import __version__, check_version
+ >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
+ >>> url = url.format(check_version(__version__, fallback="main"))
+ >>> downloader = HTTPDownloader()
+ >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
+ >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+ >>> os.path.exists("tiny-data.txt")
+ True
+ >>> with open("tiny-data.txt") as f:
+ ... print(f.read().strip())
+ # A tiny data file for test purposes only
+ 1 2 3 4 5 6
+ >>> os.remove("tiny-data.txt")
+
+ Authentication can be handled by passing a user name and password to
+ :func:`requests.get`. All arguments provided when creating an instance of
+ the class are forwarded to :func:`requests.get`. We'll use
+ ``auth=(username, password)`` to use basic HTTPS authentication. The
+    https://httpbin.org website allows us to make a fake login request using
+ whatever username and password we provide to it:
+
+ >>> user = "doggo"
+ >>> password = "goodboy"
+ >>> # httpbin will ask for the user and password we provide in the URL
+ >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
+ >>> # Trying without the login credentials causes an error
+ >>> downloader = HTTPDownloader()
+ >>> try:
+ ... downloader(url=url, output_file="tiny-data.txt", pooch=None)
+ ... except Exception:
+ ... print("There was an error!")
+ There was an error!
+ >>> # Pass in the credentials to HTTPDownloader
+ >>> downloader = HTTPDownloader(auth=(user, password))
+ >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+ >>> with open("tiny-data.txt") as f:
+ ... for line in f:
+ ... print(line.rstrip())
+ {
+ "authenticated": true,
+ "user": "doggo"
+ }
+ >>> os.remove("tiny-data.txt")
+
+ """
+
+ def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
+ self.kwargs = kwargs
+ self.progressbar = progressbar
+ self.chunk_size = chunk_size
+ if self.progressbar is True and tqdm is None:
+ raise ValueError("Missing package 'tqdm' required for progress bars.")
+
+
+ def __call__(
+ self, url, output_file, pooch, check_only=False
+ ): # pylint: disable=R0914
+ """
+ Download the given URL over HTTP to the given output file.
+
+ Uses :func:`requests.get`.
+
+ Parameters
+ ----------
+ url : str
+ The URL to the file you want to download.
+ output_file : str or file-like object
+ Path (and file name) to which the file will be downloaded.
+ pooch : :class:`~pooch.Pooch`
+ The instance of :class:`~pooch.Pooch` that is calling this method.
+ check_only : bool
+ If True, will only check if a file exists on the server and
+ **without downloading the file**. Will return ``True`` if the file
+ exists and ``False`` otherwise.
+
+ Returns
+ -------
+ availability : bool or None
+ If ``check_only==True``, returns a boolean indicating if the file
+ is available on the server. Otherwise, returns ``None``.
+
+ """
+ # Lazy import requests to speed up import time
+ import requests # pylint: disable=C0415
+
+ if check_only:
+ timeout = self.kwargs.get("timeout", 5)
+ response = requests.head(url, timeout=timeout, allow_redirects=True)
+ available = bool(response.status_code == 200)
+ return available
+
+ kwargs = self.kwargs.copy()
+ timeout = kwargs.pop("timeout", 5)
+ kwargs.setdefault("stream", True)
+ ispath = not hasattr(output_file, "write")
+ if ispath:
+ # pylint: disable=consider-using-with
+ output_file = open(output_file, "w+b")
+ # pylint: enable=consider-using-with
+ try:
+ response = requests.get(url, timeout=timeout, **kwargs)
+ response.raise_for_status()
+ content = response.iter_content(chunk_size=self.chunk_size)
+ total = int(response.headers.get("content-length", 0))
+ if self.progressbar is True:
+ # Need to use ascii characters on Windows because there isn't
+ # always full unicode support
+ # (see https://github.com/tqdm/tqdm/issues/454)
+ use_ascii = bool(sys.platform == "win32")
+ progress = tqdm(
+ total=total,
+ ncols=79,
+ ascii=use_ascii,
+ unit="B",
+ unit_scale=True,
+ leave=True,
+ )
+ elif self.progressbar:
+ progress = self.progressbar
+ progress.total = total
+ for chunk in content:
+ if chunk:
+ output_file.write(chunk)
+ output_file.flush()
+ if self.progressbar:
+ # Use the chunk size here because chunk may be much
+ # larger if the data are decompressed by requests after
+ # reading (happens with text files).
+ progress.update(self.chunk_size)
+ # Make sure the progress bar gets filled even if the actual number
+            # of chunks is smaller than expected. This happens when streaming
+ # text files that are compressed by the server when sending (gzip).
+ # Binary files don't experience this.
+ if self.progressbar:
+ progress.reset()
+ progress.update(total)
+ progress.close()
+ finally:
+ if ispath:
+ output_file.close()
+ return None
+
+
+
+
+
+class FTPDownloader: # pylint: disable=too-few-public-methods
+ """
+ Download manager for fetching files over FTP.
+
+ When called, downloads the given file URL into the specified local file.
+ Uses the :mod:`ftplib` module to manage downloads.
+
+ Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
+ the download of files (for example, to use authentication or print a
+ progress bar).
+
+ Parameters
+ ----------
+ port : int
+ Port used for the FTP connection.
+ username : str
+ User name used to login to the server. Only needed if the server
+ requires authentication (i.e., no anonymous FTP).
+ password : str
+ Password used to login to the server. Only needed if the server
+ requires authentication (i.e., no anonymous FTP). Use the empty string
+ to indicate no password is required.
+ account : str
+ Some servers also require an "account" name for authentication.
+ timeout : int
+ Timeout in seconds for ftp socket operations, use None to mean no
+ timeout.
+ progressbar : bool
+ If True, will print a progress bar of the download to standard error
+ (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+ installed. **Custom progress bars are not yet supported.**
+ chunk_size : int
+ Files are streamed *chunk_size* bytes at a time instead of loading
+        everything into memory at once. Usually doesn't need to be changed.
+
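+    Examples
+    --------
+    A hedged sketch (the server and path are hypothetical, so it is not
+    run here)::
+
+        downloader = FTPDownloader(username="anonymous", password="")
+        downloader(url="ftp://ftp.example.org/data/file.txt",
+                   output_file="file.txt", pooch=None)
+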
+ """
+
+ def __init__(
+ self,
+ port=21,
+ username="anonymous",
+ password="",
+ account="",
+ timeout=None,
+ progressbar=False,
+ chunk_size=1024,
+ ):
+ self.port = port
+ self.username = username
+ self.password = password
+ self.account = account
+ self.timeout = timeout
+ self.progressbar = progressbar
+ self.chunk_size = chunk_size
+ if self.progressbar is True and tqdm is None:
+ raise ValueError("Missing package 'tqdm' required for progress bars.")
+
+
+ def __call__(self, url, output_file, pooch, check_only=False):
+ """
+ Download the given URL over FTP to the given output file.
+
+ Parameters
+ ----------
+ url : str
+ The URL to the file you want to download.
+ output_file : str or file-like object
+ Path (and file name) to which the file will be downloaded.
+ pooch : :class:`~pooch.Pooch`
+ The instance of :class:`~pooch.Pooch` that is calling this method.
+ check_only : bool
+ If True, will only check if a file exists on the server and
+ **without downloading the file**. Will return ``True`` if the file
+ exists and ``False`` otherwise.
+
+ Returns
+ -------
+ availability : bool or None
+ If ``check_only==True``, returns a boolean indicating if the file
+ is available on the server. Otherwise, returns ``None``.
+
+ """
+ parsed_url = parse_url(url)
+ ftp = ftplib.FTP(timeout=self.timeout)
+ ftp.connect(host=parsed_url["netloc"], port=self.port)
+
+ if check_only:
+ directory, file_name = os.path.split(parsed_url["path"])
+ try:
+ ftp.login(user=self.username, passwd=self.password, acct=self.account)
+ available = file_name in ftp.nlst(directory)
+ finally:
+ ftp.close()
+ return available
+
+ ispath = not hasattr(output_file, "write")
+ if ispath:
+ # pylint: disable=consider-using-with
+ output_file = open(output_file, "w+b")
+ # pylint: enable=consider-using-with
+ try:
+ ftp.login(user=self.username, passwd=self.password, acct=self.account)
+ command = f"RETR {parsed_url['path']}"
+ if self.progressbar:
+ # Make sure the file is set to binary mode, otherwise we can't
+ # get the file size. See: https://stackoverflow.com/a/22093848
+ ftp.voidcmd("TYPE I")
+ use_ascii = bool(sys.platform == "win32")
+ progress = tqdm(
+ total=int(ftp.size(parsed_url["path"])),
+ ncols=79,
+ ascii=use_ascii,
+ unit="B",
+ unit_scale=True,
+ leave=True,
+ )
+ with progress:
+
+ def callback(data):
+ "Update the progress bar and write to output"
+ progress.update(len(data))
+ output_file.write(data)
+
+ ftp.retrbinary(command, callback, blocksize=self.chunk_size)
+ else:
+ ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
+ finally:
+ ftp.quit()
+ if ispath:
+ output_file.close()
+ return None
+
+
+
+
+
+class SFTPDownloader: # pylint: disable=too-few-public-methods
+ """
+ Download manager for fetching files over SFTP.
+
+ When called, downloads the given file URL into the specified local file.
+ Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
+ installed.
+
+ Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
+ the download of files (for example, to use authentication or print a
+ progress bar).
+
+ Parameters
+ ----------
+ port : int
+ Port used for the SFTP connection.
+ username : str
+ User name used to login to the server. Only needed if the server
+ requires authentication (i.e., no anonymous SFTP).
+ password : str
+ Password used to login to the server. Only needed if the server
+ requires authentication (i.e., no anonymous SFTP). Use the empty
+ string to indicate no password is required.
+ timeout : int
+ Timeout in seconds for sftp socket operations, use None to mean no
+ timeout.
+ progressbar : bool or an arbitrary progress bar object
+ If True, will print a progress bar of the download to standard
+ error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
+ be installed.
+
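+    Examples
+    --------
+    A hedged sketch (the server and credentials are hypothetical, so it is
+    not run here)::
+
+        downloader = SFTPDownloader(username="user", password="password")
+        downloader(url="sftp://sftp.example.org/data/file.txt",
+                   output_file="file.txt", pooch=None)
+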
+ """
+
+ def __init__(
+ self,
+ port=22,
+ username="anonymous",
+ password="",
+ account="",
+ timeout=None,
+ progressbar=False,
+ ):
+ self.port = port
+ self.username = username
+ self.password = password
+ self.account = account
+ self.timeout = timeout
+ self.progressbar = progressbar
+ # Collect errors and raise only once so that both missing packages are
+ # captured. Otherwise, the user is only warned of one of them at a
+ # time (and we can't test properly when they are both missing).
+ errors = []
+ if self.progressbar and tqdm is None:
+ errors.append("Missing package 'tqdm' required for progress bars.")
+ if paramiko is None:
+ errors.append("Missing package 'paramiko' required for SFTP downloads.")
+ if errors:
+ raise ValueError(" ".join(errors))
+
+
+ def __call__(self, url, output_file, pooch):
+ """
+ Download the given URL over SFTP to the given output file.
+
+ The output file must be given as a string (file name/path) and not an
+ open file object! Otherwise, paramiko cannot save to that file.
+
+ Parameters
+ ----------
+ url : str
+ The URL to the file you want to download.
+ output_file : str
+ Path (and file name) to which the file will be downloaded. **Cannot
+ be a file object**.
+ pooch : :class:`~pooch.Pooch`
+ The instance of :class:`~pooch.Pooch` that is calling this method.
+ """
+ parsed_url = parse_url(url)
+ connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
+ sftp = None
+ try:
+ connection.connect(username=self.username, password=self.password)
+ sftp = paramiko.SFTPClient.from_transport(connection)
+            # settimeout is a method of the channel, so it must be called
+            # rather than assigned to
+            sftp.get_channel().settimeout(self.timeout)
+ if self.progressbar:
+ size = int(sftp.stat(parsed_url["path"]).st_size)
+ use_ascii = bool(sys.platform == "win32")
+ progress = tqdm(
+ total=size,
+ ncols=79,
+ ascii=use_ascii,
+ unit="B",
+ unit_scale=True,
+ leave=True,
+ )
+ if self.progressbar:
+ with progress:
+
+ def callback(current, total):
+ "Update the progress bar and write to output"
+ progress.total = int(total)
+ progress.update(int(current - progress.n))
+
+ sftp.get(parsed_url["path"], output_file, callback=callback)
+ else:
+ sftp.get(parsed_url["path"], output_file)
+ finally:
+ connection.close()
+ if sftp is not None:
+ sftp.close()
+
+
+
+
+
+class DOIDownloader: # pylint: disable=too-few-public-methods
+ """
+ Download manager for fetching files from Digital Object Identifiers (DOIs).
+
+ Open-access data repositories often issue Digital Object Identifiers (DOIs)
+ for data which provide a stable link and citation point. The trick is
+ finding out the download URL for a file given the DOI.
+
+ When called, this downloader uses the repository's public API to find out
+ the download URL from the DOI and file name. It then uses
+ :class:`pooch.HTTPDownloader` to download the URL into the specified local
+ file. Allowing "URL"s to be specified with the DOI instead of the actual
+ HTTP download link. Uses the :mod:`requests` library to manage downloads
+ and interact with the APIs.
+
+ The **format of the "URL"** is: ``doi:{DOI}/{file name}``.
+
+ Notice that there are no ``//`` like in HTTP/FTP and you must specify a
+ file name after the DOI (separated by a ``/``).
+
+ Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
+ download files given the DOI instead of an HTTP link.
+
+ Supported repositories:
+
+ * `figshare <https://www.figshare.com>`__
+ * `Zenodo <https://www.zenodo.org>`__
+ * `Dataverse <https://dataverse.org/>`__ instances
+
+ .. attention::
+
+ DOIs from other repositories **will not work** since we need to access
+ their particular APIs to find the download links. We welcome
+ suggestions and contributions adding new repositories.
+
+ Parameters
+ ----------
+ progressbar : bool or an arbitrary progress bar object
+ If True, will print a progress bar of the download to standard error
+ (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+ installed. Alternatively, an arbitrary progress bar object can be
+ passed. See :ref:`custom-progressbar` for details.
+ chunk_size : int
+ Files are streamed *chunk_size* bytes at a time instead of loading
+        everything into memory at once. Usually doesn't need to be changed.
+ **kwargs
+ All keyword arguments given when creating an instance of this class
+ will be passed to :func:`requests.get`.
+
+ Examples
+ --------
+
+ Download one of the data files from the figshare archive of Pooch test
+ data:
+
+ >>> import os
+ >>> downloader = DOIDownloader()
+ >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
+ >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
+ >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+ >>> os.path.exists("tiny-data.txt")
+ True
+ >>> with open("tiny-data.txt") as f:
+ ... print(f.read().strip())
+ # A tiny data file for test purposes only
+ 1 2 3 4 5 6
+ >>> os.remove("tiny-data.txt")
+
+ Same thing but for our Zenodo archive:
+
+ >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
+ >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+ >>> os.path.exists("tiny-data.txt")
+ True
+ >>> with open("tiny-data.txt") as f:
+ ... print(f.read().strip())
+ # A tiny data file for test purposes only
+ 1 2 3 4 5 6
+ >>> os.remove("tiny-data.txt")
+
+ """
+
+ def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
+ self.kwargs = kwargs
+ self.progressbar = progressbar
+ self.chunk_size = chunk_size
+
+
+ def __call__(self, url, output_file, pooch):
+ """
+ Download the given DOI URL over HTTP to the given output file.
+
+ Uses the repository's API to determine the actual HTTP download URL
+ from the given DOI.
+
+ Uses :func:`requests.get`.
+
+ Parameters
+ ----------
+ url : str
+ The URL to the file you want to download.
+ output_file : str or file-like object
+ Path (and file name) to which the file will be downloaded.
+ pooch : :class:`~pooch.Pooch`
+ The instance of :class:`~pooch.Pooch` that is calling this method.
+
+ """
+
+ parsed_url = parse_url(url)
+ data_repository = doi_to_repository(parsed_url["netloc"])
+
+ # Resolve the URL
+ file_name = parsed_url["path"]
+ # remove the leading slash in the path
+ if file_name[0] == "/":
+ file_name = file_name[1:]
+ download_url = data_repository.download_url(file_name)
+
+ # Instantiate the downloader object
+ downloader = HTTPDownloader(
+ progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
+ )
+ downloader(download_url, output_file, pooch)
+
+
+
+
+def doi_to_url(doi):
+ """
+ Follow a DOI link to resolve the URL of the archive.
+
+ Parameters
+ ----------
+ doi : str
+ The DOI of the archive.
+
+ Returns
+ -------
+ url : str
+ The URL of the archive in the data repository.
+
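+    Examples
+    --------
+    A sketch using the Zenodo DOI from the ``DOIDownloader`` examples above
+    (requires network access, so it is not run here)::
+
+        url = doi_to_url("10.5281/zenodo.4924875")
+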
+ """
+ # Lazy import requests to speed up import time
+ import requests # pylint: disable=C0415
+
+ # Use doi.org to resolve the DOI to the repository website.
+ response = requests.get(f"https://doi.org/{doi}", timeout=5)
+ url = response.url
+ if 400 <= response.status_code < 600:
+ raise ValueError(
+ f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
+ )
+ return url
+
+
+def doi_to_repository(doi):
+ """
+ Instantiate a data repository instance from a given DOI.
+
+ This function implements the chain of responsibility dispatch
+ to the correct data repository class.
+
+ Parameters
+ ----------
+ doi : str
+ The DOI of the archive.
+
+ Returns
+ -------
+ data_repository : DataRepository
+ The data repository object
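+
+    A sketch (requires network access to resolve the DOI, so it is not run
+    here; the file name comes from the downloader examples above)::
+
+        repository = doi_to_repository("10.5281/zenodo.4924875")
+        url = repository.download_url("tiny-data.txt")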
+ """
+
+ # This should go away in a separate issue: DOI handling should
+ # not rely on the (non-)existence of trailing slashes. The issue
+ # is documented in https://github.com/fatiando/pooch/issues/324
+ if doi[-1] == "/":
+ doi = doi[:-1]
+
+ repositories = [
+ FigshareRepository,
+ ZenodoRepository,
+ DataverseRepository,
+ ]
+
+ # Extract the DOI and the repository information
+ archive_url = doi_to_url(doi)
+
+ # Try the converters one by one until one of them returned a URL
+ data_repository = None
+ for repo in repositories:
+ if data_repository is None:
+ data_repository = repo.initialize(
+ archive_url=archive_url,
+ doi=doi,
+ )
+
+ if data_repository is None:
+ repository = parse_url(archive_url)["netloc"]
+ raise ValueError(
+ f"Invalid data repository '{repository}'. "
+ "To request or contribute support for this repository, "
+ "please open an issue at https://github.com/fatiando/pooch/issues"
+ )
+
+ return data_repository
+
+
+class DataRepository: # pylint: disable=too-few-public-methods, missing-class-docstring
+ @classmethod
+ def initialize(cls, doi, archive_url): # pylint: disable=unused-argument
+ """
+ Initialize the data repository if the given URL points to a
+ corresponding repository.
+
+ Initializes a data repository object. This is done as part of
+ a chain of responsibility. If the class cannot handle the given
+ repository URL, it returns `None`. Otherwise a `DataRepository`
+ instance is returned.
+
+ Parameters
+ ----------
+ doi : str
+ The DOI that identifies the repository
+ archive_url : str
+ The resolved URL for the DOI
+ """
+
+ return None # pragma: no cover
+
+ def download_url(self, file_name):
+ """
+ Use the repository API to get the download URL for a file given
+ the archive URL.
+
+ Parameters
+ ----------
+ file_name : str
+ The name of the file in the archive that will be downloaded.
+
+ Returns
+ -------
+ download_url : str
+ The HTTP URL that can be used to download the file.
+ """
+
+ raise NotImplementedError # pragma: no cover
+
+ def populate_registry(self, pooch):
+ """
+ Populate the registry using the data repository's API
+
+ Parameters
+ ----------
+ pooch : Pooch
+ The pooch instance that the registry will be added to.
+ """
+
+ raise NotImplementedError # pragma: no cover
+
+
+class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring
+ base_api_url = "https://zenodo.org/api/records"
+
+ def __init__(self, doi, archive_url):
+ self.archive_url = archive_url
+ self.doi = doi
+ self._api_response = None
+ self._api_version = None
+
+ @classmethod
+ def initialize(cls, doi, archive_url):
+ """
+ Initialize the data repository if the given URL points to a
+ corresponding repository.
+
+ Initializes a data repository object. This is done as part of
+ a chain of responsibility. If the class cannot handle the given
+ repository URL, it returns `None`. Otherwise a `DataRepository`
+ instance is returned.
+
+ Parameters
+ ----------
+ doi : str
+ The DOI that identifies the repository
+ archive_url : str
+ The resolved URL for the DOI
+ """
+
+ # Check whether this is a Zenodo URL
+ parsed_archive_url = parse_url(archive_url)
+ if parsed_archive_url["netloc"] != "zenodo.org":
+ return None
+
+ return cls(doi, archive_url)
+
+ @property
+ def api_response(self):
+ """Cached API response from Zenodo"""
+ if self._api_response is None:
+ # Lazy import requests to speed up import time
+ import requests # pylint: disable=C0415
+
+ article_id = self.archive_url.split("/")[-1]
+ self._api_response = requests.get(
+ f"{self.base_api_url}/{article_id}",
+ timeout=5,
+ ).json()
+
+ return self._api_response
+
+ @property
+ def api_version(self):
+ """
+ Version of the Zenodo API we are interacting with
+
+        The versions can be either:
+
+ - ``"legacy"``: corresponds to the Zenodo API that was supported until
+ 2023-10-12 (before the migration to InvenioRDM).
+ - ``"new"``: corresponds to the new API that went online on 2023-10-13
+ after the migration to InvenioRDM.
+
+ The ``"new"`` API breaks backward compatibility with the ``"legacy"``
+ one and could probably be replaced by an updated version that restores
+ the behaviour of the ``"legacy"`` one.
+
+ Returns
+ -------
+ str
+ """
+ if self._api_version is None:
+ if all("key" in file for file in self.api_response["files"]):
+ self._api_version = "legacy"
+ elif all("filename" in file for file in self.api_response["files"]):
+ self._api_version = "new"
+ else:
+ raise ValueError(
+ "Couldn't determine the version of the Zenodo API for "
+ f"{self.archive_url} (doi:{self.doi})."
+ )
+ return self._api_version
+
+ def download_url(self, file_name):
+ """
+ Use the repository API to get the download URL for a file given
+ the archive URL.
+
+ Parameters
+ ----------
+ file_name : str
+ The name of the file in the archive that will be downloaded.
+
+ Returns
+ -------
+ download_url : str
+ The HTTP URL that can be used to download the file.
+
+ Notes
+ -----
+ After Zenodo migrated to InvenioRDM in October 2023, their API changed.
+ The file links that appear in the API response lead to 404 errors (as
+ of 2023-10-17). The files are instead available at the following URL:
+ ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.
+
+ This method supports both the legacy and the new API.
+ """
+ # Create list of files in the repository
+ if self.api_version == "legacy":
+ files = {item["key"]: item for item in self.api_response["files"]}
+ else:
+ files = [item["filename"] for item in self.api_response["files"]]
+ # Check if file exists in the repository
+ if file_name not in files:
+ raise ValueError(
+ f"File '{file_name}' not found in data archive "
+ f"{self.archive_url} (doi:{self.doi})."
+ )
+ # Build download url
+ if self.api_version == "legacy":
+ download_url = files[file_name]["links"]["self"]
+ else:
+ article_id = self.api_response["id"]
+ download_url = (
+ f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
+ )
+ return download_url
+
+ def populate_registry(self, pooch):
+ """
+ Populate the registry using the data repository's API
+
+ Parameters
+ ----------
+ pooch : Pooch
+ The pooch instance that the registry will be added to.
+
+ Notes
+ -----
+ After Zenodo migrated to InvenioRDM in October 2023, their API changed.
+ The checksum for each file listed in the API response is now an MD5 sum.
+
+ This method supports both the legacy and the new API.
+ """
+ for filedata in self.api_response["files"]:
+ checksum = filedata["checksum"]
+ if self.api_version == "legacy":
+ key = "key"
+ else:
+ key = "filename"
+ checksum = f"md5:{checksum}"
+ pooch.registry[filedata[key]] = checksum
+
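+# Usage sketch for the Zenodo machinery above: fetch a single file from an
+# archive through its DOI. The DOI and file name are hypothetical, and
+# known_hash=None skips hash checking for the sake of the example.
+def _example_zenodo_fetch():  # illustration only, not part of the API
+    from pooch import retrieve
+
+    return retrieve(
+        url="doi:10.5281/zenodo.1234567/tiny-data.txt",  # hypothetical DOI
+        known_hash=None,
+    )
+
+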
+
+class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring
+ def __init__(self, doi, archive_url):
+ self.archive_url = archive_url
+ self.doi = doi
+ self._api_response = None
+
+ @classmethod
+ def initialize(cls, doi, archive_url):
+ """
+ Initialize the data repository if the given URL points to a
+ corresponding repository.
+
+ Initializes a data repository object. This is done as part of
+ a chain of responsibility. If the class cannot handle the given
+ repository URL, it returns `None`. Otherwise a `DataRepository`
+ instance is returned.
+
+ Parameters
+ ----------
+ doi : str
+ The DOI that identifies the repository
+ archive_url : str
+ The resolved URL for the DOI
+ """
+
+ # Check whether this is a Figshare URL
+ parsed_archive_url = parse_url(archive_url)
+ if parsed_archive_url["netloc"] != "figshare.com":
+ return None
+
+ return cls(doi, archive_url)
+
+ def _parse_version_from_doi(self):
+ """
+ Parse version from the doi
+
+ Return None if version is not available in the doi.
+ """
+ # Get suffix of the doi
+ _, suffix = self.doi.split("/")
+ # Split the suffix by dots and keep the last part
+ last_part = suffix.split(".")[-1]
+ # Parse the version from the last part
+ if last_part[0] != "v":
+ return None
+ version = int(last_part[1:])
+ return version
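+
+    # For example, the DOI "10.6084/m9.figshare.923450.v1" yields version 1,
+    # while a DOI without a trailing ".v<n>" part yields None.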
+
+ @property
+ def api_response(self):
+ """Cached API response from Figshare"""
+ if self._api_response is None:
+ # Lazy import requests to speed up import time
+ import requests # pylint: disable=C0415
+
+ # Use the figshare API to find the article ID from the DOI
+ article = requests.get(
+ f"https://api.figshare.com/v2/articles?doi={self.doi}",
+ timeout=5,
+ ).json()[0]
+ article_id = article["id"]
+ # Parse desired version from the doi
+ version = self._parse_version_from_doi()
+ # With the ID and version, we can get a list of files and their
+ # download links
+ if version is None:
+ # Figshare returns the latest version available when no version
+ # is specified through the DOI.
+ warnings.warn(
+ f"The Figshare DOI '{self.doi}' doesn't specify which version of "
+ "the repository should be used. "
+ "Figshare will point to the latest version available.",
+ UserWarning,
+ )
+ # Define API url using only the article id
+ # (figshare will resolve the latest version)
+ api_url = f"https://api.figshare.com/v2/articles/{article_id}"
+ else:
+ # Define API url using article id and the desired version
+ # Get list of files using article id and the version
+ api_url = (
+ "https://api.figshare.com/v2/articles/"
+ f"{article_id}/versions/{version}"
+ )
+ # Make the request and return the files in the figshare repository
+ response = requests.get(api_url, timeout=5)
+ response.raise_for_status()
+ self._api_response = response.json()["files"]
+
+ return self._api_response
+
+ def download_url(self, file_name):
+ """
+ Use the repository API to get the download URL for a file given
+ the archive URL.
+
+ Parameters
+ ----------
+ file_name : str
+ The name of the file in the archive that will be downloaded.
+
+ Returns
+ -------
+ download_url : str
+ The HTTP URL that can be used to download the file.
+ """
+ files = {item["name"]: item for item in self.api_response}
+ if file_name not in files:
+ raise ValueError(
+ f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
+ )
+ download_url = files[file_name]["download_url"]
+ return download_url
+
+ def populate_registry(self, pooch):
+ """
+ Populate the registry using the data repository's API
+
+ Parameters
+ ----------
+ pooch : Pooch
+ The pooch instance that the registry will be added to.
+ """
+
+ for filedata in self.api_response:
+ pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
+
+
+class DataverseRepository(DataRepository): # pylint: disable=missing-class-docstring
+ def __init__(self, doi, archive_url):
+ self.archive_url = archive_url
+ self.doi = doi
+ self._api_response = None
+
+ @classmethod
+ def initialize(cls, doi, archive_url):
+ """
+ Initialize the data repository if the given URL points to a
+ corresponding repository.
+
+ Initializes a data repository object. This is done as part of
+ a chain of responsibility. If the class cannot handle the given
+ repository URL, it returns `None`. Otherwise a `DataRepository`
+ instance is returned.
+
+ Parameters
+ ----------
+ doi : str
+ The DOI that identifies the repository
+ archive_url : str
+ The resolved URL for the DOI
+ """
+ # Access the DOI as if this was a DataVerse instance
+ response = cls._get_api_response(doi, archive_url)
+
+ # If we failed, this is probably not a DataVerse instance
+ if 400 <= response.status_code < 600:
+ return None
+
+ # Initialize the repository and overwrite the api response
+ repository = cls(doi, archive_url)
+ repository.api_response = response
+ return repository
+
+ @classmethod
+ def _get_api_response(cls, doi, archive_url):
+ """
+ Perform the actual API request
+
+ This has been separated into its own ``classmethod``, as it can be
+ used both before and after initialization.
+ """
+ # Lazy import requests to speed up import time
+ import requests # pylint: disable=C0415
+
+ parsed = parse_url(archive_url)
+ response = requests.get(
+ f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/"
+ f":persistentId?persistentId=doi:{doi}",
+ timeout=5,
+ )
+ return response
+
+ @property
+ def api_response(self):
+ """Cached API response from a DataVerse instance"""
+
+ if self._api_response is None:
+ self._api_response = self._get_api_response(
+ self.doi, self.archive_url
+ ) # pragma: no cover
+
+ return self._api_response
+
+ @api_response.setter
+ def api_response(self, response):
+ """Update the cached API response"""
+
+ self._api_response = response
+
+ def download_url(self, file_name):
+ """
+ Use the repository API to get the download URL for a file given
+ the archive URL.
+
+ Parameters
+ ----------
+ file_name : str
+ The name of the file in the archive that will be downloaded.
+
+ Returns
+ -------
+ download_url : str
+ The HTTP URL that can be used to download the file.
+ """
+ parsed = parse_url(self.archive_url)
+ response = self.api_response.json()
+ files = {
+ file["dataFile"]["filename"]: file["dataFile"]
+ for file in response["data"]["latestVersion"]["files"]
+ }
+ if file_name not in files:
+ raise ValueError(
+ f"File '{file_name}' not found in data archive "
+ f"{self.archive_url} (doi:{self.doi})."
+ )
+ # Generate download_url using the file id
+ download_url = (
+ f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/"
+ f"{files[file_name]['id']}"
+ )
+ return download_url
+
+ def populate_registry(self, pooch):
+ """
+ Populate the registry using the data repository's API
+
+ Parameters
+ ----------
+ pooch : Pooch
+ The pooch instance that the registry will be added to.
+ """
+
+ for filedata in self.api_response.json()["data"]["latestVersion"]["files"]:
+ pooch.registry[filedata["dataFile"]["filename"]] = (
+ f"md5:{filedata['dataFile']['md5']}"
+ )
+
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+"""
+Calculating and checking file hashes.
+"""
+import hashlib
+import functools
+from pathlib import Path
+
+# From the docs: https://docs.python.org/3/library/hashlib.html#hashlib.new
+# The named constructors are much faster than new() and should be
+# preferred.
+# Need to fall back on new() for some algorithms.
+ALGORITHMS_AVAILABLE = {
+ alg: getattr(hashlib, alg, functools.partial(hashlib.new, alg))
+ for alg in hashlib.algorithms_available
+}
+
+try:
+ import xxhash
+
+ # xxhash doesn't have a list of available algorithms yet.
+ # https://github.com/ifduyue/python-xxhash/issues/48
+ ALGORITHMS_AVAILABLE.update(
+ {
+ alg: getattr(xxhash, alg, None)
+ for alg in ["xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64"]
+ }
+ )
+ # The xxh3 algorithms are only available for xxhash>=2.0. Unavailable ones
+ # resolve to None above and are removed to keep backwards compatibility.
+ ALGORITHMS_AVAILABLE = {
+ alg: func for alg, func in ALGORITHMS_AVAILABLE.items() if func is not None
+ }
+except ImportError:
+ pass
+
+
+
+[docs]
+def file_hash(fname, alg="sha256"):
+ """
+ Calculate the hash of a given file.
+
+ Useful for checking if a file has changed or been corrupted.
+
+ Parameters
+ ----------
+ fname : str
+ The name of the file.
+ alg : str
+ The name of the hashing algorithm.
+
+ Returns
+ -------
+ hash : str
+ The hash of the file.
+
+ Examples
+ --------
+
+ >>> fname = "test-file-for-hash.txt"
+ >>> with open(fname, "w") as f:
+ ... __ = f.write("content of the file")
+ >>> print(file_hash(fname))
+ 0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
+ >>> import os
+ >>> os.remove(fname)
+
+ """
+ if alg not in ALGORITHMS_AVAILABLE:
+ raise ValueError(
+ f"Algorithm '{alg}' not available to the pooch library. "
+ "Only the following algorithms are available "
+ f"{list(ALGORITHMS_AVAILABLE.keys())}."
+ )
+ # Calculate the hash in chunks to avoid overloading the memory
+ chunksize = 65536
+ hasher = ALGORITHMS_AVAILABLE[alg]()
+ with open(fname, "rb") as fin:
+ buff = fin.read(chunksize)
+ while buff:
+ hasher.update(buff)
+ buff = fin.read(chunksize)
+ return hasher.hexdigest()
+
+
+
+def hash_algorithm(hash_string):
+ """
+ Parse the name of the hash method from the hash string.
+
+ The hash string should have the following form ``algorithm:hash``, where
+ algorithm can be the name of any algorithm known to :mod:`hashlib`.
+
+ If the algorithm is omitted or the hash string is None, will default to
+ ``"sha256"``.
+
+ Parameters
+ ----------
+ hash_string : str
+ The hash string with optional algorithm prepended.
+
+ Returns
+ -------
+ hash_algorithm : str
+ The name of the algorithm.
+
+ Examples
+ --------
+
+ >>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
+ sha256
+ >>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
+ md5
+ >>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
+ sha256
+ >>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
+ sha256
+ >>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
+ xxh3_64
+ >>> print(hash_algorithm(None))
+ sha256
+
+ """
+ default = "sha256"
+ if hash_string is None:
+ algorithm = default
+ elif ":" not in hash_string:
+ algorithm = default
+ else:
+ algorithm = hash_string.split(":")[0]
+ return algorithm.lower()
+
+
+def hash_matches(fname, known_hash, strict=False, source=None):
+ """
+ Check if the hash of a file matches a known hash.
+
+ If the *known_hash* is None, will always return True.
+
+ Converts hashes to lowercase before comparison to avoid system-specific
+ mismatches between hashes in the registry and computed hashes.
+
+ Parameters
+ ----------
+ fname : str or PathLike
+ The path to the file.
+ known_hash : str
+ The known hash. Optionally, prepend ``alg:`` to the hash to specify the
+ hashing algorithm. Default is SHA256.
+ strict : bool
+ If True, will raise a :class:`ValueError` if the hash does not match
+ informing the user that the file may be corrupted.
+ source : str
+ The source of the downloaded file (name or URL, for example). Will be
+ used in the error message if *strict* is True. Has no other use other
+ than reporting to the user where the file came from in case of hash
+ mismatch. If None, will default to *fname*.
+
+ Returns
+ -------
+ is_same : bool
+ True if the hash matches, False otherwise.
+
+ """
+ if known_hash is None:
+ return True
+ algorithm = hash_algorithm(known_hash)
+ new_hash = file_hash(fname, alg=algorithm)
+ matches = new_hash.lower() == known_hash.split(":")[-1].lower()
+ if strict and not matches:
+ if source is None:
+ source = str(fname)
+ raise ValueError(
+ f"{algorithm.upper()} hash of downloaded file ({source}) does not match"
+ f" the known hash: expected {known_hash} but got {new_hash}. Deleted"
+ " download for safety. The downloaded file may have been corrupted or"
+ " the known hash may be outdated."
+ )
+ return matches
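+
+
+# Usage sketch (hypothetical file name; the hash is the SHA256 of the string
+# "content of the file" from the file_hash example above):
+def _example_hash_check():  # illustration only
+    known = (
+        "sha256:"
+        "0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00"
+    )
+    # Returns True or False; with strict=True a mismatch raises ValueError
+    return hash_matches("test-file-for-hash.txt", known, strict=False)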
+
+
+
+[docs]
+def make_registry(directory, output, recursive=True):
+ """
+ Make a registry of files and hashes for the given directory.
+
+ This is helpful if you have many files in your test dataset as it keeps you
+ from needing to manually update the registry.
+
+ Parameters
+ ----------
+ directory : str
+ Directory of the test data to put in the registry. All file names in
+ the registry will be relative to this directory.
+ output : str
+ Name of the output registry file.
+ recursive : bool
+ If True, will recursively look for files in subdirectories of
+ *directory*.
+
+ """
+ directory = Path(directory)
+ if recursive:
+ pattern = "**/*"
+ else:
+ pattern = "*"
+
+ files = sorted(
+ str(path.relative_to(directory))
+ for path in directory.glob(pattern)
+ if path.is_file()
+ )
+
+ hashes = [file_hash(str(directory / fname)) for fname in files]
+
+ with open(output, "w", encoding="utf-8") as outfile:
+ for fname, fhash in zip(files, hashes):
+ # Only use Unix separators for the registry so that we don't go
+ # insane dealing with file paths.
+ outfile.write("{} {}\n".format(fname.replace("\\", "/"), fhash))
+
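+# Usage sketch (hypothetical paths): hash every file under "data/" and write
+# "name hash" lines that can later be loaded as a Pooch registry.
+def _example_make_registry():  # illustration only
+    make_registry("data", "registry.txt", recursive=True)
+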
+
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+# pylint: disable=line-too-long
+"""
+Post-processing hooks
+"""
+import abc
+import os
+import bz2
+import gzip
+import lzma
+import shutil
+from zipfile import ZipFile
+from tarfile import TarFile
+
+from .utils import get_logger
+
+
+class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods
+ """
+ Abstract base class for extractions from compressed archives.
+
+ Subclasses can be used with :meth:`pooch.Pooch.fetch` and
+ :func:`pooch.retrieve` to unzip a downloaded data file into a folder in the
+ local data store. :meth:`~pooch.Pooch.fetch` will return a list with the
+ names of the extracted files instead of the archive.
+
+ Parameters
+ ----------
+ members : list or None
+ If None, will unpack all files in the archive. Otherwise, *members*
+ must be a list of file names to unpack from the archive. Only these
+ files will be unpacked.
+ extract_dir : str or None
+ If None, files will be unpacked to the default location (a folder in
+ the same location as the downloaded zip file, with a suffix added).
+ Otherwise, files will be unpacked to ``extract_dir``, which is
+ interpreted as a *relative path* (relative to the cache location
+ provided by :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`).
+
+ """
+
+ def __init__(self, members=None, extract_dir=None):
+ self.members = members
+ self.extract_dir = extract_dir
+
+ @property
+ @abc.abstractmethod
+ def suffix(self):
+ """
+ String appended to unpacked archive folder name.
+ Only used if extract_dir is None.
+ MUST BE IMPLEMENTED BY CHILD CLASSES.
+ """
+
+ @abc.abstractmethod
+ def _all_members(self, fname):
+ """
+ Return all the members in the archive.
+ MUST BE IMPLEMENTED BY CHILD CLASSES.
+ """
+
+ @abc.abstractmethod
+ def _extract_file(self, fname, extract_dir):
+ """
+ This method receives an argument for the archive to extract and the
+ destination path.
+ MUST BE IMPLEMENTED BY CHILD CLASSES.
+ """
+
+ def __call__(self, fname, action, pooch):
+ """
+ Extract all files from the given archive.
+
+ Parameters
+ ----------
+ fname : str
+ Full path of the zipped file in local storage.
+ action : str
+ Indicates what action was taken by :meth:`pooch.Pooch.fetch` or
+ :func:`pooch.retrieve`:
+
+ * ``"download"``: File didn't exist locally and was downloaded
+ * ``"update"``: Local file was outdated and was re-download
+ * ``"fetch"``: File exists and is updated so it wasn't downloaded
+
+ pooch : :class:`pooch.Pooch`
+ The instance of :class:`pooch.Pooch` that is calling this.
+
+ Returns
+ -------
+ fnames : list of str
+ A list of the full path to all files in the extracted archive.
+
+ """
+ if self.extract_dir is None:
+ self.extract_dir = fname + self.suffix
+ else:
+ archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0]
+ self.extract_dir = os.path.join(archive_dir, self.extract_dir)
+ # Get a list of everyone who is supposed to be in the unpacked folder
+ # so we can check if they are all there or if we need to extract new
+ # files.
+ if self.members is None or not self.members:
+ members = self._all_members(fname)
+ else:
+ members = self.members
+ if (
+ (action in ("update", "download"))
+ or (not os.path.exists(self.extract_dir))
+ or not all(
+ os.path.exists(os.path.join(self.extract_dir, m)) for m in members
+ )
+ ):
+ # Make sure that the folder with the extracted files exists
+ os.makedirs(self.extract_dir, exist_ok=True)
+ self._extract_file(fname, self.extract_dir)
+
+ # Get a list of all file names (including subdirectories) in our folder
+ # of unzipped files, filtered by the given members list
+ fnames = []
+ for path, _, files in os.walk(self.extract_dir):
+ for filename in files:
+ relpath = os.path.normpath(
+ os.path.join(os.path.relpath(path, self.extract_dir), filename)
+ )
+ if self.members is None or any(
+ relpath.startswith(os.path.normpath(m)) for m in self.members
+ ):
+ fnames.append(os.path.join(path, filename))
+
+ return fnames
+
+
+
+[docs]
+class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods
+ """
+ Processor that unpacks a zip archive and returns a list of all files.
+
+ Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to unzip a
+ downloaded data file into a folder in the local data store. The
+ method/function will return a list with the names of the unzipped files
+ instead of the zip archive.
+
+ The output folder is ``{fname}.unzip``.
+
+ Parameters
+ ----------
+ members : list or None
+ If None, will unpack all files in the zip archive. Otherwise, *members*
+ must be a list of file names to unpack from the archive. Only these
+ files will be unpacked.
+ extract_dir : str or None
+ If None, files will be unpacked to the default location (a folder in
+ the same location as the downloaded zip file, with the suffix
+ ``.unzip`` added). Otherwise, files will be unpacked to
+ ``extract_dir``, which is interpreted as a *relative path* (relative to
+ the cache location provided by :func:`pooch.retrieve` or
+ :meth:`pooch.Pooch.fetch`).
+
+ """
+
+ @property
+ def suffix(self):
+ """
+ String appended to unpacked archive folder name.
+ Only used if extract_dir is None.
+ """
+ return ".unzip"
+
+ def _all_members(self, fname):
+ """Return all members from a given archive."""
+ with ZipFile(fname, "r") as zip_file:
+ return zip_file.namelist()
+
+ def _extract_file(self, fname, extract_dir):
+ """
+ This method receives an argument for the archive to extract and the
+ destination path.
+ """
+ with ZipFile(fname, "r") as zip_file:
+ if self.members is None:
+ get_logger().info(
+ "Unzipping contents of '%s' to '%s'", fname, extract_dir
+ )
+ # Unpack all files from the archive into our new folder
+ zip_file.extractall(path=extract_dir)
+ else:
+ for member in self.members:
+ get_logger().info(
+ "Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
+ )
+ # If the member is a dir, we need to get the names of the
+ # elements it contains for extraction (ZipFile does not
+ # support dirs on .extract). If it's not a dir, this will
+ # only include the member itself.
+ # Based on:
+ # https://stackoverflow.com/questions/8008829/extract-only-a-single-directory-from-tar
+ subdir_members = [
+ name
+ for name in zip_file.namelist()
+ if os.path.normpath(name).startswith(os.path.normpath(member))
+ ]
+ # Extract the data file from within the archive
+ zip_file.extractall(members=subdir_members, path=extract_dir)
+
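+
+# Usage sketch: unzip a downloaded archive and get back the paths of the
+# extracted files instead of the zip itself. The URL and member name are
+# hypothetical; known_hash=None skips hash checking for the example.
+def _example_unzip_fetch():  # illustration only
+    from pooch import retrieve
+
+    return retrieve(
+        url="https://www.some-server.org/data.zip",  # hypothetical
+        known_hash=None,
+        processor=Unzip(members=["station-01.csv"]),  # hypothetical member
+    )
+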
+
+
+
+[docs]
+class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods
+ """
+ Processor that unpacks a tar archive and returns a list of all files.
+
+ Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to untar a
+ downloaded data file into a folder in the local data store. The
+ method/function will return a list with the names of the extracted files
+ instead of the archive.
+
+ The output folder is ``{fname}.untar``.
+
+
+ Parameters
+ ----------
+ members : list or None
+ If None, will unpack all files in the archive. Otherwise, *members*
+ must be a list of file names to unpack from the archive. Only these
+ files will be unpacked.
+ extract_dir : str or None
+ If None, files will be unpacked to the default location (a folder in
+ the same location as the downloaded tar file, with the suffix
+ ``.untar`` added). Otherwise, files will be unpacked to
+ ``extract_dir``, which is interpreted as a *relative path* (relative to
+ the cache location provided by :func:`pooch.retrieve` or
+ :meth:`pooch.Pooch.fetch`).
+ """
+
+ @property
+ def suffix(self):
+ """
+ String appended to unpacked archive folder name.
+ Only used if extract_dir is None.
+ """
+ return ".untar"
+
+ def _all_members(self, fname):
+ """Return all members from a given archive."""
+ with TarFile.open(fname, "r") as tar_file:
+ return [info.name for info in tar_file.getmembers()]
+
+ def _extract_file(self, fname, extract_dir):
+ """
+ This method receives an argument for the archive to extract and the
+ destination path.
+ """
+ with TarFile.open(fname, "r") as tar_file:
+ if self.members is None:
+ get_logger().info(
+ "Untarring contents of '%s' to '%s'", fname, extract_dir
+ )
+ # Unpack all files from the archive into our new folder
+ tar_file.extractall(path=extract_dir)
+ else:
+ for member in self.members:
+ get_logger().info(
+ "Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
+ )
+ # If the member is a dir, we need to get the names of the
+ # elements it contains for extraction (TarFile does not
+ # support dirs on .extract). If it's not a dir, this will
+ # only include the member itself.
+ # Based on:
+ # https://stackoverflow.com/questions/8008829/extract-only-a-single-directory-from-tar
+ # Can't use .getnames because extractall expects TarInfo
+ # objects.
+ subdir_members = [
+ info
+ for info in tar_file.getmembers()
+ if os.path.normpath(info.name).startswith(
+ os.path.normpath(member)
+ )
+ ]
+ # Extract the data file from within the archive
+ tar_file.extractall(members=subdir_members, path=extract_dir)
+
+
+
+
+[docs]
+class Decompress: # pylint: disable=too-few-public-methods
+ """
+ Processor that decompresses a file and returns the decompressed version.
+
+ Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to decompress
+ a downloaded data file so that it can be easily opened. Useful for data
+ files that take a long time to decompress (exchanging disk space for
+ speed).
+
+ Supported decompression methods are LZMA (``.xz``), bzip2 (``.bz2``), and
+ gzip (``.gz``).
+
+ File names with the standard extensions (see above) can use
+ ``method="auto"`` to automatically determine the compression method. This
+ can be overridden by setting the *method* argument.
+
+ .. note::
+
+ To unpack zip and tar archives with one or more files, use
+ :class:`pooch.Unzip` and :class:`pooch.Untar` instead.
+
+ The output file is ``{fname}.decomp`` by default but it can be changed by
+ setting the ``name`` parameter.
+
+ .. warning::
+
+ Passing in ``name`` can cause existing data to be lost! For example, if
+ a file already exists with the specified name it will be overwritten
+ with the new decompressed file content. **Use this option with
+ caution.**
+
+ Parameters
+ ----------
+ method : str
+ Name of the compression method. Can be "auto", "lzma", "xz", "bzip2",
+ or "gzip".
+ name : None or str
+ Defines the decompressed file name. The file name will be
+ ``{fname}.decomp`` if ``None`` (default) or the given name otherwise.
+ Note that the name should **not** include the full (or relative) path,
+ it should be just the file name itself.
+
+ """
+
+ modules = {"auto": None, "lzma": lzma, "xz": lzma, "gzip": gzip, "bzip2": bz2}
+ extensions = {".xz": "lzma", ".gz": "gzip", ".bz2": "bzip2"}
+
+ def __init__(self, method="auto", name=None):
+ self.method = method
+ self.name = name
+
+
+[docs]
+ def __call__(self, fname, action, pooch):
+ """
+ Decompress the given file.
+
+ The output file will be either ``{fname}.decomp`` or the given *name*
+ class attribute.
+
+ Parameters
+ ----------
+ fname : str
+ Full path of the compressed file in local storage.
+ action : str
+ Indicates what action was taken by :meth:`pooch.Pooch.fetch` or
+ :func:`pooch.retrieve`:
+
+ - ``"download"``: File didn't exist locally and was downloaded
+ - ``"update"``: Local file was outdated and was re-download
+ - ``"fetch"``: File exists and is updated so it wasn't downloaded
+
+ pooch : :class:`pooch.Pooch`
+ The instance of :class:`pooch.Pooch` that is calling this.
+
+ Returns
+ -------
+ fname : str
+ The full path to the decompressed file.
+ """
+ if self.name is None:
+ decompressed = fname + ".decomp"
+ else:
+ decompressed = os.path.join(os.path.dirname(fname), self.name)
+ if action in ("update", "download") or not os.path.exists(decompressed):
+ get_logger().info(
+ "Decompressing '%s' to '%s' using method '%s'.",
+ fname,
+ decompressed,
+ self.method,
+ )
+ module = self._compression_module(fname)
+ with open(decompressed, "w+b") as output:
+ with module.open(fname) as compressed:
+ shutil.copyfileobj(compressed, output)
+ return decompressed
+
+
+ def _compression_module(self, fname):
+ """
+ Get the Python module compatible with fname and the chosen method.
+
+ If the *method* attribute is "auto", will select a method based on the
+ extension. If no recognized extension is in the file name, will raise a
+ ValueError.
+ """
+ error_archives = "To unpack zip/tar archives, use pooch.Unzip/Untar instead."
+ if self.method not in self.modules:
+ message = (
+ f"Invalid compression method '{self.method}'. "
+ f"Must be one of '{list(self.modules.keys())}'."
+ )
+ if self.method in {"zip", "tar"}:
+ message = " ".join([message, error_archives])
+ raise ValueError(message)
+ if self.method == "auto":
+ ext = os.path.splitext(fname)[-1]
+ if ext not in self.extensions:
+ message = (
+ f"Unrecognized file extension '{ext}'. "
+ f"Must be one of '{list(self.extensions.keys())}'."
+ )
+ if ext in {".zip", ".tar"}:
+ message = " ".join([message, error_archives])
+ raise ValueError(message)
+ return self.modules[self.extensions[ext]]
+ return self.modules[self.method]
+
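+
+# Usage sketch: fetch a gzip-compressed file and keep a decompressed copy
+# next to it. The URL is hypothetical; method="auto" keys off the ".gz"
+# extension as implemented above.
+def _example_decompress_fetch():  # illustration only
+    from pooch import retrieve
+
+    return retrieve(
+        url="https://www.some-server.org/data.txt.gz",  # hypothetical
+        known_hash=None,
+        processor=Decompress(method="auto"),
+    )
+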
+
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+"""
+Misc utilities
+"""
+import logging
+import os
+import tempfile
+import hashlib
+from pathlib import Path
+from urllib.parse import urlsplit
+from contextlib import contextmanager
+import warnings
+
+import platformdirs
+from packaging.version import Version
+
+
+LOGGER = logging.Logger("pooch")
+LOGGER.addHandler(logging.StreamHandler())
+
+
+def file_hash(*args, **kwargs):
+ """
+ WARNING: Importing this function from pooch.utils is DEPRECATED.
+ Please import from the top-level namespace (`from pooch import file_hash`)
+ instead, which is fully backwards compatible with pooch >= 0.1.
+
+ Examples
+ --------
+
+ >>> fname = "test-file-for-hash.txt"
+ >>> with open(fname, "w") as f:
+ ... __ = f.write("content of the file")
+ >>> print(file_hash(fname))
+ 0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
+ >>> import os
+ >>> os.remove(fname)
+
+ """
+ # pylint: disable=import-outside-toplevel
+ from .hashes import file_hash as new_file_hash
+
+ message = """
+ Importing file_hash from pooch.utils is DEPRECATED. Please import from the
+ top-level namespace (`from pooch import file_hash`) instead, which is fully
+ backwards compatible with pooch >= 0.1.
+ """
+ warnings.warn(message, DeprecationWarning, stacklevel=2)
+ return new_file_hash(*args, **kwargs)
+
+
+
+[docs]
+def get_logger():
+ r"""
+ Get the default event logger.
+
+ The logger records events like downloading files, unzipping archives, etc.
+ Use the method :meth:`logging.Logger.setLevel` of this object to adjust the
+ verbosity level from Pooch.
+
+ Returns
+ -------
+ logger : :class:`logging.Logger`
+ The logger object for Pooch
+ """
+ return LOGGER
+
+
+
+
+[docs]
+def os_cache(project):
+ r"""
+ Default cache location based on the operating system.
+
+ The folder locations are defined by the ``platformdirs`` package
+ using the ``user_cache_dir`` function.
+ Usually, the locations will be the following (see the
+ `platformdirs documentation <https://platformdirs.readthedocs.io>`__):
+
+ * Mac: ``~/Library/Caches/<AppName>``
+ * Unix: ``~/.cache/<AppName>`` or the value of the ``XDG_CACHE_HOME``
+ environment variable, if defined.
+ * Windows: ``C:\Users\<user>\AppData\Local\<AppAuthor>\<AppName>\Cache``
+
+ Parameters
+ ----------
+ project : str
+ The project name.
+
+ Returns
+ -------
+ cache_path : :class:`pathlib.Path`
+ The default location for the data cache. User directories (``'~'``) are
+ not expanded.
+
+ """
+ return Path(platformdirs.user_cache_dir(project))
+
+
+
+
+[docs]
+def check_version(version, fallback="master"):
+ """
+ Check if a version is PEP440 compliant and there are no unreleased changes.
+
+ For example, ``version = "0.1"`` will be returned as is but ``version =
+ "0.1+10.8dl8dh9"`` will return the fallback. This is the convention used by
+ `versioneer <https://github.com/warner/python-versioneer>`__ to mark that
+ this version is 10 commits ahead of the last release.
+
+ Parameters
+ ----------
+ version : str
+ A version string.
+ fallback : str
+ What to return if the version string has unreleased changes.
+
+ Returns
+ -------
+ version : str
+ If *version* is PEP440 compliant and there are no unreleased changes,
+ then return *version*. Otherwise, return *fallback*.
+
+ Raises
+ ------
+ InvalidVersion
+ If *version* is not PEP440 compliant.
+
+ Examples
+ --------
+
+ >>> check_version("0.1")
+ '0.1'
+ >>> check_version("0.1a10")
+ '0.1a10'
+ >>> check_version("0.1+111.9hdg36")
+ 'master'
+ >>> check_version("0.1+111.9hdg36", fallback="dev")
+ 'dev'
+
+ """
+ parse = Version(version)
+ if parse.local is not None:
+ return fallback
+ return version
+
+
+
+def parse_url(url):
+ """
+ Parse a URL into 3 components:
+
+ <protocol>://<netloc>/<path>
+
+ Example URLs:
+
+ * http://127.0.0.1:8080/test.nc
+ * ftp://127.0.0.1:8080/test.nc
+ * doi:10.6084/m9.figshare.923450.v1/test.nc
+
+ The DOI is a special case. The protocol will be "doi", the netloc will be
+ the DOI, and the path is what comes after the last "/".
+ The only exception is Zenodo DOIs: the protocol will be "doi", the netloc
+ will be composed of the "prefix/suffix", and the path is what comes after
+ the second "/". This makes it possible to support Zenodo DOIs whose path
+ contains forward slashes "/", as created by the GitHub-Zenodo integration
+ service.
+
+ Parameters
+ ----------
+ url : str
+ The URL.
+
+ Returns
+ -------
+ parsed_url : dict
+ Three components of a URL (e.g.,
+ ``{'protocol':'http', 'netloc':'127.0.0.1:8080','path': '/test.nc'}``).
+
+ """
+ if url.startswith("doi://"):
+ raise ValueError(
+ f"Invalid DOI link '{url}'. You must not use '//' after 'doi:'."
+ )
+ if url.startswith("doi:"):
+ protocol = "doi"
+ parts = url[4:].split("/")
+ if "zenodo" in parts[1].lower():
+ netloc = "/".join(parts[:2])
+ path = "/" + "/".join(parts[2:])
+ else:
+ netloc = "/".join(parts[:-1])
+ path = "/" + parts[-1]
+ else:
+ parsed_url = urlsplit(url)
+ protocol = parsed_url.scheme or "file"
+ netloc = parsed_url.netloc
+ path = parsed_url.path
+ return {"protocol": protocol, "netloc": netloc, "path": path}
+
+
+def cache_location(path, env=None, version=None):
+ """
+ Location of the cache given a base path and optional configuration.
+
+ Checks for the environment variable to overwrite the path of the local
+ cache. Optionally add *version* to the path if given.
+
+ Parameters
+ ----------
+ path : str, PathLike, list or tuple
+ The path to the local data storage folder. If this is a list or tuple,
+ we'll join the parts with the appropriate separator. Use
+ :func:`pooch.os_cache` for a sensible default.
+ version : str or None
+ The version string for your project. Will be appended to given path if
+ not None.
+ env : str or None
+ An environment variable that can be used to overwrite *path*. This
+ allows users to control where they want the data to be stored. We'll
+ append *version* to the end of this value as well.
+
+ Returns
+ -------
+ local_path : PathLike
+ The path to the local directory.
+
+ """
+ if env is not None and env in os.environ and os.environ[env]:
+ path = os.environ[env]
+ if isinstance(path, (list, tuple)):
+ path = os.path.join(*path)
+ if version is not None:
+ path = os.path.join(str(path), version)
+ path = os.path.expanduser(str(path))
+ return Path(path)
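+
+
+# For example (hypothetical values): with path=("~", "mycache"), version="v1"
+# and the environment variable given by env unset, this returns
+# Path(os.path.expanduser(os.path.join("~", "mycache", "v1"))).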
+
+
+def make_local_storage(path, env=None):
+ """
+ Create the local cache directory and make sure it's writable.
+
+ Parameters
+ ----------
+ path : str or PathLike
+ The path to the local data storage folder.
+ env : str or None
+ An environment variable that can be used to overwrite *path*. Only used
+ in the error message in case the folder is not writable.
+ """
+ path = str(path)
+ # Check that the data directory is writable
+ if not os.path.exists(path):
+ action = "create"
+ else:
+ action = "write to"
+
+ try:
+ if action == "create":
+ # When running in parallel, it's possible that multiple jobs will
+ # try to create the path at the same time. Use exist_ok to avoid
+ # raising an error.
+ os.makedirs(path, exist_ok=True)
+ else:
+ with tempfile.NamedTemporaryFile(dir=path):
+ pass
+ except PermissionError as error:
+ message = [
+ str(error),
+ f"| Pooch could not {action} data cache folder '{path}'.",
+ "Will not be able to download data files.",
+ ]
+ if env is not None:
+ message.append(
+ f"Use environment variable '{env}' to specify a different location."
+ )
+ raise PermissionError(" ".join(message)) from error
+
+
+@contextmanager
+def temporary_file(path=None):
+ """
+ Create a closed and named temporary file and make sure it's cleaned up.
+
+ Using :class:`tempfile.NamedTemporaryFile` will fail on Windows if trying
+ to open the file a second time (when passing its name to Pooch function,
+ for example). This context manager creates the file, closes it, yields the
+ file path, and makes sure it's deleted in the end.
+
+ Parameters
+ ----------
+ path : str or PathLike
+ The directory in which the temporary file will be created.
+
+ Yields
+ ------
+ fname : str
+ The path to the temporary file.
+
+ """
+ tmp = tempfile.NamedTemporaryFile(delete=False, dir=path)
+ # Close the temp file so that it can be opened elsewhere
+ tmp.close()
+ try:
+ yield tmp.name
+ finally:
+ if os.path.exists(tmp.name):
+ os.remove(tmp.name)
+
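+# Usage sketch: the yielded file is closed, so it can be reopened (even on
+# Windows) and is removed when the with-block exits.
+def _example_temporary_file():  # illustration only
+    with temporary_file() as fname:
+        with open(fname, "w", encoding="utf-8") as fout:
+            fout.write("scratch data")
+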
+
+def unique_file_name(url):
+ """
+ Create a unique file name based on the given URL.
+
+ The file name will be unique to the URL by prepending the name with the MD5
+ hash (hex digest) of the URL. The name will also include the last portion
+ of the URL.
+
+ The format will be: ``{md5}-{filename}.{ext}``
+
+ The file name will be cropped so that the entire name (including the hash)
+ is less than 255 characters long (the limit on most file systems).
+
+ Parameters
+ ----------
+ url : str
+ The URL with a file name at the end.
+
+ Returns
+ -------
+ fname : str
+ The file name, unique to this URL.
+
+ Examples
+ --------
+
+ >>> print(unique_file_name("https://www.some-server.org/2020/data.txt"))
+ 02ddee027ce5ebb3d7059fb23d210604-data.txt
+ >>> print(unique_file_name("https://www.some-server.org/2019/data.txt"))
+ 9780092867b497fca6fc87d8308f1025-data.txt
+ >>> print(unique_file_name("https://www.some-server.org/2020/data.txt.gz"))
+ 181a9d52e908219c2076f55145d6a344-data.txt.gz
+
+ """
+ md5 = hashlib.md5(url.encode()).hexdigest()
+ fname = parse_url(url)["path"].split("/")[-1]
+ # Crop the start of the file name so that the entire name (hash + "-" +
+ # file name) fits in 255 characters
+ fname = fname[-(255 - len(md5) - 1) :]
+ unique_name = f"{md5}-{fname}"
+ return unique_name
+
+ A friend to fetch your data files
+
+ Just want to download a file without messing with requests and urllib?
+ Trying to add sample datasets to your Python package?
+ Pooch is here to help!
+