diff --git a/doc/conf.py b/doc/conf.py index 6c7bb4de..41e1b79c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -41,6 +41,7 @@ "python": ("https://docs.python.org/3/", None), "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None), "requests": ("https://requests.readthedocs.io/en/latest/", None), + "filelock": ("https://py-filelock.readthedocs.io/en/latest/", None), } # Autosummary pages will be generated by sphinx-autogen instead of sphinx-build diff --git a/doc/index.rst b/doc/index.rst index f2306777..6b0f11b7 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -134,6 +134,7 @@ Are you a **scientist** or researcher? Pooch can help you too! progressbars.rst unpacking.rst decompressing.rst + parallel-downloads.rst .. toctree:: :caption: Reference diff --git a/doc/parallel-downloads.rst b/doc/parallel-downloads.rst new file mode 100644 index 00000000..bfa68af7 --- /dev/null +++ b/doc/parallel-downloads.rst @@ -0,0 +1,52 @@ +.. _paralleldownloads: + +Parallel downloads +================== + +When running :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch` in parallel +processes, Pooch will trigger multiple downloads of the same file(s). Although +there is no `race condition <https://en.wikipedia.org/wiki/Race_condition>`_ +happening in this process, downloading the same file multiple times is not +desirable: it slows down the fetching process and consumes more bandwidth than +necessary. + +A solution to this problem is to create a `lock file +<https://en.wikipedia.org/wiki/File_locking#Lock_files>`_ that will allow only +one process to download the desired file, and force all the other processes to +wait until it finishes before fetching the file directly from the cache. +Lock files can be easily created through the :mod:`filelock` package. + +For example, let's create a ``download.py`` file that defines a lock file +before calling the :func:`pooch.retrieve` function. + +.. 
code:: python + + # file: download.py + import sys + import pooch + import filelock + + lock = filelock.FileLock(lock_file="foo.lock") + with lock: + file_path = pooch.retrieve( + url="https://github.com/fatiando/pooch/raw/v1.0.0/data/tiny-data.txt", + known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e", + path="my_dir", + ) + + # Perform tasks with this file using different parameters passed as arguments + parameter = sys.argv[1] # get parameter from first argument + ... # perform tasks using the file and the parameter + +We can run this script in parallel using the Bash ampersand: + +.. code:: bash + + python download.py 1 & + python download.py 2 & + python download.py 3 & + +Since we are using a lock file, only one of these processes will take care of the +download. The rest will wait for it to finish, and then fetch the file from the +cache. Then all further tasks that the ``download.py`` script performs using the +different arguments will be run in parallel as usual. diff --git a/env/requirements-docs.txt b/env/requirements-docs.txt index adc9427a..ddb2b639 100644 --- a/env/requirements-docs.txt +++ b/env/requirements-docs.txt @@ -2,3 +2,4 @@ sphinx==7.2.* sphinx-book-theme==1.1.* sphinx-design==0.5.* +filelock diff --git a/environment.yml b/environment.yml index 8c36ebbb..c2c382a6 100644 --- a/environment.yml +++ b/environment.yml @@ -21,6 +21,7 @@ dependencies: - sphinx==7.2.* - sphinx-book-theme==1.1.* - sphinx-design==0.5.* + - filelock # Style - pathspec - black>=20.8b1