Merge pull request #585 from fractal-analytics-platform/583-cache-the…

…-dowloaded-zenodo-data-in-github-actions Test new Zenodo-download procedure to speed up CI
fractal-analytics-platform · Oct 20, 2023 · 6216926 · 6216926
2 parents df339ea + ce0c9d0
commit 6216926
Show file tree

Hide file tree

Showing 5 changed files with 155 additions and 76 deletions.
diff --git a/.github/workflows/ci_pip.yml b/.github/workflows/ci_pip.yml
@@ -35,6 +35,17 @@ jobs:
       - name: Install some testing dependencies (hard-coded)
         run: python -m pip install pytest devtools jsonschema requests wget
 
+      - name: Cache Zenodo data
+        id: cache-zenodo-data
+        uses: actions/cache@v3
+        with:
+          path: tests/data/
+          key: zenodo-data
+
+      - name: Download Zenodo data
+        if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
+        run: bash tests/data/download_zenodo_data.sh
+
       - name: Test core library with pytest
         run: pytest tests --ignore tests/tasks
 
@@ -65,5 +76,16 @@ jobs:
       - name: Install some testing dependencies (hard-coded)
         run: python -m pip install pytest devtools jsonschema requests wget
 
+      - name: Cache Zenodo data
+        id: cache-zenodo-data
+        uses: actions/cache@v3
+        with:
+          path: tests/data/
+          key: zenodo-data
+
+      - name: Download Zenodo data
+        if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
+        run: bash tests/data/download_zenodo_data.sh
+
       - name: Test tasks with pytest
         run: pytest tests tests/tasks
diff --git a/.github/workflows/ci_poetry.yml b/.github/workflows/ci_poetry.yml
@@ -42,6 +42,17 @@ jobs:
       - name: Install dependencies (without extras)
         run: poetry install --with dev --without docs --no-interaction
 
+      - name: Cache Zenodo data
+        id: cache-zenodo-data
+        uses: actions/cache@v3
+        with:
+          path: tests/data/
+          key: zenodo-data
+
+      - name: Download Zenodo data
+        if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
+        run: bash tests/data/download_zenodo_data.sh
+
       - name: Test core library with pytest
         run: poetry run coverage run -m pytest tests --ignore tests/tasks
 
@@ -88,6 +99,17 @@ jobs:
       - name: Check manifest task metadata
         run: poetry run python fractal_tasks_core/dev/check_manifest.py
 
+      - name: Cache Zenodo data
+        id: cache-zenodo-data
+        uses: actions/cache@v3
+        with:
+          path: tests/data/
+          key: zenodo-data
+
+      - name: Download Zenodo data
+        if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
+        run: bash tests/data/download_zenodo_data.sh
+
       - name: Test tasks with pytest
         run: poetry run coverage run -m pytest tests/tasks
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 **Note**: Numbers like (\#123) point to closed Pull Requests on the fractal-tasks-core repository.
 
+# Unreleased
+
+* Testing:
+    * Cache Zenodo data within GitHub Actions (\#585).
+
 # 0.13.0
 
 * Tasks:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,13 +1,19 @@
 import json
+import logging
 import os
 import shutil
 import time
 from pathlib import Path
-from urllib.parse import unquote
 
+import anndata as ad
 import pytest
 import requests  # type: ignore
 import wget
+import zarr
+from devtools import debug
+
+from fractal_tasks_core.lib_regions_of_interest import reset_origin
+from fractal_tasks_core.lib_write import write_table
 
 
 @pytest.fixture(scope="session")
@@ -16,54 +22,44 @@ def testdata_path() -> Path:
     return TEST_DIR / "data/"
 
 
-@pytest.fixture(scope="function")
-def zenodo_images(testdata_path, capsys):
+@pytest.fixture(scope="session")
+def zenodo_images(testdata_path):
     """
     Inspired by
     https://github.com/dvolgyes/zenodo_get/blob/master/zenodo_get/zget.py
-
-    See https://docs.pytest.org/en/7.4.x/how-to/capture-stdout-stderr.html for
-    the use of capsys
     """
-
     t_start = time.perf_counter()
 
-    url = "10.5281/zenodo.7059515"
-    folder = str(testdata_path / (url.replace(".", "_").replace("/", "_")))
+    # Download images and metadata files
+    recordID = "7059515"
+    url = "10_5281_zenodo_7059515"
+    folder = str(testdata_path / f"10_5281_zenodo_{recordID}")
     if os.path.isdir(folder):
-        print(f"{folder} already exists, skip")
-        return folder
-    os.makedirs(folder)
-    url = "https://doi.org/" + url
-    print(f"I will download {url} files to {folder}")
-
-    r = requests.get(url)
-    recordID = r.url.split("/")[-1]
-    url = "https://zenodo.org/api/records/"
-    r = requests.get(url + recordID)
-
-    js = json.loads(r.text)
-    files = js["files"]
-    for f in files:
-        fname = f["filename"]
-        link = f"https://zenodo.org/record/{recordID}/files/{fname}"
-        print(link)
-        link = unquote(link)
-        wget.download(link, out=folder)
-        print()
+        print(f"{folder} already exists, skip download")
+    else:
+        os.makedirs(folder)
+        url = f"https://zenodo.org/api/records/{recordID}"
+        r = requests.get(url)
+        js = json.loads(r.text)
+        files = js["files"]
+        for f in files:
+            file_url = f["links"]["download"]
+            file_name = file_url.split("/")[-2]
+            wget.download(file_url, out=f"{folder}/{file_name}", bar=False)
 
     # Add an image with invalid name, that should be skipped during parsing
     with open(f"{folder}/invalid_path.png", "w") as f:
         f.write("This file has an invalid filename, which cannot be parsed.")
 
     t_end = time.perf_counter()
-    with capsys.disabled():
-        print(f"\n    Time spent in zenodo_images: {t_end-t_start:.2f} s")
+    logging.warning(
+        f"\n    Time spent in zenodo_images: {t_end-t_start:.2f} s"
+    )
 
     return folder
 
 
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def zenodo_images_multiplex(testdata_path, zenodo_images):
     folder = str(testdata_path / "fake_multiplex")
     cycle_folder_1 = str(Path(folder) / "cycle1")
@@ -78,70 +74,65 @@ def zenodo_images_multiplex(testdata_path, zenodo_images):
     return cycle_folders
 
 
-@pytest.fixture(scope="function")
-def zenodo_zarr(testdata_path, tmpdir_factory, capsys):
-    """
-    See https://docs.pytest.org/en/7.4.x/how-to/capture-stdout-stderr.html for
-    the use of capsys
-    """
+@pytest.fixture(scope="session")
+def zenodo_zarr(testdata_path, tmpdir_factory):
     t_start = time.perf_counter()
 
     doi = "10.5281/zenodo.8091756"
     rootfolder = testdata_path / (doi.replace(".", "_").replace("/", "_"))
     platenames = ["plate.zarr", "plate_mip.zarr"]
     folders = [rootfolder / plate for plate in platenames]
+    zarrnames = [
+        "20200812-CardiomyocyteDifferentiation14-Cycle1.zarr",
+        "20200812-CardiomyocyteDifferentiation14-Cycle1_mip.zarr",
+    ]
 
+    # Download dataset
     if rootfolder.exists():
-        print(f"{str(rootfolder)} already exists, skip")
-        folders = [str(f) for f in folders]
-        return folders
+        print(f"{str(rootfolder)} already exists, skip download part")
     else:
-
-        import zarr
-        import anndata as ad
-        import logging
-
-        from fractal_tasks_core.lib_regions_of_interest import reset_origin
-        from fractal_tasks_core.lib_write import write_table
-
         rootfolder.mkdir()
         tmp_path = tmpdir_factory.mktemp("zenodo_zarr")
-        zarrnames = [
-            "20200812-CardiomyocyteDifferentiation14-Cycle1.zarr",
-            "20200812-CardiomyocyteDifferentiation14-Cycle1_mip.zarr",
-        ]
-        for zarrname, folder in zip(zarrnames, folders):
+        for zarrname in zarrnames:
             zipname = f"{zarrname}.zip"
             url = f"https://zenodo.org/record/8091756/files/{zipname}"
+            debug(url)
             wget.download(url, out=str(tmp_path / zipname), bar=None)
+            time.sleep(0.5)
             shutil.unpack_archive(
-                str(tmp_path / zipname), extract_dir=rootfolder, format="zip"
+                str(tmp_path / zipname),
+                extract_dir=rootfolder,
+                format="zip",
+            )
+
+    # Based on the Zenodo OME-Zarrs, create the appropriate OME-Zarrs to be
+    # used in tests
+    for zarrname, folder in zip(zarrnames, folders):
+        if os.path.isdir(str(folder)):
+            shutil.rmtree(str(folder))
+        shutil.copytree(str(rootfolder / zarrname), str(folder))
+
+        # Update well/FOV ROI tables, by shifting their origin to 0
+        # TODO: remove this fix, by uploading new zarrs to zenodo (ref
+        # issue 526)
+        image_group_path = folder / "B/03/0"
+        group_image = zarr.open_group(str(image_group_path))
+        for table_name in ["FOV_ROI_table", "well_ROI_table"]:
+            table_path = str(image_group_path / "tables" / table_name)
+            old_table = ad.read_zarr(table_path)
+            new_table = reset_origin(old_table)
+            write_table(
+                group_image,
+                table_name,
+                new_table,
+                overwrite=True,
+                logger=logging.getLogger(),
             )
-            shutil.move(str(rootfolder / zarrname), str(folder))
-
-            # Update well/FOV ROI tables, by shifting their origin to 0
-            # TODO: remove this fix, by uploading new zarrs to zenodo (ref
-            # issue 526)
-            image_group_path = folder / "B/03/0"
-            group_image = zarr.open_group(str(image_group_path))
-            for table_name in ["FOV_ROI_table", "well_ROI_table"]:
-                table_path = str(image_group_path / "tables" / table_name)
-                old_table = ad.read_zarr(table_path)
-                new_table = reset_origin(old_table)
-                write_table(
-                    group_image,
-                    table_name,
-                    new_table,
-                    overwrite=True,
-                    logger=logging.getLogger(),
-                )
 
     folders = [str(f) for f in folders]
 
     t_end = time.perf_counter()
-    with capsys.disabled():
-        print(f"\n    Time spent in zenodo_zarr: {t_end-t_start:.2f} s")
-
+    logging.warning(f"\n    Time spent in zenodo_zarr: {t_end-t_start:.2f} s")
     return folders
 
 

diff --git a/tests/data/download_zenodo_data.sh b/tests/data/download_zenodo_data.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+LIST_RECORD_ID="7059515 8091756"
+LIST_RECORD_ID="7059515 8091756"
+
+for RECORD_ID in $LIST_RECORD_ID; do
+    echo "****************************"
+    echo "START RECORD_ID=$RECORD_ID"
+    OUTPUT_FOLDER=tests/data/10_5281_zenodo_$RECORD_ID
+    echo "OUTPUT_FOLDER: $OUTPUT_FOLDER"
+
+    if [ -d $OUTPUT_FOLDER ]; then
+        echo "OUTPUT_FOLDER already exists. Exit."
+    else
+        mkdir $OUTPUT_FOLDER
+        FILES=`curl https://zenodo.org/api/records/$RECORD_ID | jq -r ".files[].links.download"`
+        echo "curl exit code: $?"
+        echo
+        for FILE in $FILES; do
+            FILEPATH=${FILE%"/content"}
+            FILENAME=`basename $FILEPATH`
+            echo "FILE:     $FILE"
+            echo "FILEPATH: $FILEPATH"
+            echo "FILENAME: $FILENAME"
+            echo
+            wget --no-verbose $FILE --output-document=${OUTPUT_FOLDER}/${FILENAME}
+            echo
+        done
+
+        if [ $RECORD_ID == "8091756" ]; then
+            unzip tests/data/10_5281_zenodo_8091756/20200812-CardiomyocyteDifferentiation14-Cycle1.zarr.zip -d tests/data/10_5281_zenodo_8091756
+            unzip tests/data/10_5281_zenodo_8091756/20200812-CardiomyocyteDifferentiation14-Cycle1_mip.zarr.zip -d tests/data/10_5281_zenodo_8091756
+        fi
+    fi
+
+    echo "END RECORD_ID=$RECORD_ID"
+    echo "****************************"
+    echo
+done