Skip to content

Commit

Permalink
Merge pull request #585 from fractal-analytics-platform/583-cache-the…
Browse files Browse the repository at this point in the history
…-dowloaded-zenodo-data-in-github-actions

Test new Zenodo-download procedure to speed up CI
  • Loading branch information
tcompa authored Oct 20, 2023
2 parents df339ea + ce0c9d0 commit 6216926
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 76 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/ci_pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ jobs:
- name: Install some testing dependencies (hard-coded)
run: python -m pip install pytest devtools jsonschema requests wget

- name: Cache Zenodo data
id: cache-zenodo-data
uses: actions/cache@v3
with:
path: tests/data/
key: zenodo-data

- name: Download Zenodo data
if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
run: bash tests/data/download_zenodo_data.sh

- name: Test core library with pytest
run: pytest tests --ignore tests/tasks

Expand Down Expand Up @@ -65,5 +76,16 @@ jobs:
- name: Install some testing dependencies (hard-coded)
run: python -m pip install pytest devtools jsonschema requests wget

- name: Cache Zenodo data
id: cache-zenodo-data
uses: actions/cache@v3
with:
path: tests/data/
key: zenodo-data

- name: Download Zenodo data
if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
run: bash tests/data/download_zenodo_data.sh

- name: Test tasks with pytest
run: pytest tests tests/tasks
22 changes: 22 additions & 0 deletions .github/workflows/ci_poetry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,17 @@ jobs:
- name: Install dependencies (without extras)
run: poetry install --with dev --without docs --no-interaction

- name: Cache Zenodo data
id: cache-zenodo-data
uses: actions/cache@v3
with:
path: tests/data/
key: zenodo-data

- name: Download Zenodo data
if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
run: bash tests/data/download_zenodo_data.sh

- name: Test core library with pytest
run: poetry run coverage run -m pytest tests --ignore tests/tasks

Expand Down Expand Up @@ -88,6 +99,17 @@ jobs:
- name: Check manifest task metadata
run: poetry run python fractal_tasks_core/dev/check_manifest.py

- name: Cache Zenodo data
id: cache-zenodo-data
uses: actions/cache@v3
with:
path: tests/data/
key: zenodo-data

- name: Download Zenodo data
if: steps.cache-zenodo-data.outputs.cache-hit != 'true'
run: bash tests/data/download_zenodo_data.sh

- name: Test tasks with pytest
run: poetry run coverage run -m pytest tests/tasks

Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
**Note**: Numbers like (\#123) point to closed Pull Requests on the fractal-tasks-core repository.

# Unreleased

* Testing:
* Cache Zenodo data within GitHub Actions (\#585).

# 0.13.0

* Tasks:
Expand Down
143 changes: 67 additions & 76 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
import json
import logging
import os
import shutil
import time
from pathlib import Path
from urllib.parse import unquote

import anndata as ad
import pytest
import requests # type: ignore
import wget
import zarr
from devtools import debug

from fractal_tasks_core.lib_regions_of_interest import reset_origin
from fractal_tasks_core.lib_write import write_table


@pytest.fixture(scope="session")
Expand All @@ -16,54 +22,44 @@ def testdata_path() -> Path:
return TEST_DIR / "data/"


@pytest.fixture(scope="function")
def zenodo_images(testdata_path, capsys):
@pytest.fixture(scope="session")
def zenodo_images(testdata_path):
"""
Inspired by
https://github.com/dvolgyes/zenodo_get/blob/master/zenodo_get/zget.py
See https://docs.pytest.org/en/7.4.x/how-to/capture-stdout-stderr.html for
the use of capsys
"""

t_start = time.perf_counter()

url = "10.5281/zenodo.7059515"
folder = str(testdata_path / (url.replace(".", "_").replace("/", "_")))
# Download images and metadata files
recordID = "7059515"
url = "10_5281_zenodo_7059515"
folder = str(testdata_path / f"10_5281_zenodo_{recordID}")
if os.path.isdir(folder):
print(f"{folder} already exists, skip")
return folder
os.makedirs(folder)
url = "https://doi.org/" + url
print(f"I will download {url} files to {folder}")

r = requests.get(url)
recordID = r.url.split("/")[-1]
url = "https://zenodo.org/api/records/"
r = requests.get(url + recordID)

js = json.loads(r.text)
files = js["files"]
for f in files:
fname = f["filename"]
link = f"https://zenodo.org/record/{recordID}/files/{fname}"
print(link)
link = unquote(link)
wget.download(link, out=folder)
print()
print(f"{folder} already exists, skip download")
else:
os.makedirs(folder)
url = f"https://zenodo.org/api/records/{recordID}"
r = requests.get(url)
js = json.loads(r.text)
files = js["files"]
for f in files:
file_url = f["links"]["download"]
file_name = file_url.split("/")[-2]
wget.download(file_url, out=f"{folder}/{file_name}", bar=False)

# Add an image with invalid name, that should be skipped during parsing
with open(f"{folder}/invalid_path.png", "w") as f:
f.write("This file has an invalid filename, which cannot be parsed.")

t_end = time.perf_counter()
with capsys.disabled():
print(f"\n Time spent in zenodo_images: {t_end-t_start:.2f} s")
logging.warning(
f"\n Time spent in zenodo_images: {t_end-t_start:.2f} s"
)

return folder


@pytest.fixture(scope="function")
@pytest.fixture(scope="session")
def zenodo_images_multiplex(testdata_path, zenodo_images):
folder = str(testdata_path / "fake_multiplex")
cycle_folder_1 = str(Path(folder) / "cycle1")
Expand All @@ -78,70 +74,65 @@ def zenodo_images_multiplex(testdata_path, zenodo_images):
return cycle_folders


@pytest.fixture(scope="function")
def zenodo_zarr(testdata_path, tmpdir_factory, capsys):
"""
See https://docs.pytest.org/en/7.4.x/how-to/capture-stdout-stderr.html for
the use of capsys
"""
@pytest.fixture(scope="session")
def zenodo_zarr(testdata_path, tmpdir_factory):
t_start = time.perf_counter()

doi = "10.5281/zenodo.8091756"
rootfolder = testdata_path / (doi.replace(".", "_").replace("/", "_"))
platenames = ["plate.zarr", "plate_mip.zarr"]
folders = [rootfolder / plate for plate in platenames]
zarrnames = [
"20200812-CardiomyocyteDifferentiation14-Cycle1.zarr",
"20200812-CardiomyocyteDifferentiation14-Cycle1_mip.zarr",
]

# Download dataset
if rootfolder.exists():
print(f"{str(rootfolder)} already exists, skip")
folders = [str(f) for f in folders]
return folders
print(f"{str(rootfolder)} already exists, skip download part")
else:

import zarr
import anndata as ad
import logging

from fractal_tasks_core.lib_regions_of_interest import reset_origin
from fractal_tasks_core.lib_write import write_table

rootfolder.mkdir()
tmp_path = tmpdir_factory.mktemp("zenodo_zarr")
zarrnames = [
"20200812-CardiomyocyteDifferentiation14-Cycle1.zarr",
"20200812-CardiomyocyteDifferentiation14-Cycle1_mip.zarr",
]
for zarrname, folder in zip(zarrnames, folders):
for zarrname in zarrnames:
zipname = f"{zarrname}.zip"
url = f"https://zenodo.org/record/8091756/files/{zipname}"
debug(url)
wget.download(url, out=str(tmp_path / zipname), bar=None)
time.sleep(0.5)
shutil.unpack_archive(
str(tmp_path / zipname), extract_dir=rootfolder, format="zip"
str(tmp_path / zipname),
extract_dir=rootfolder,
format="zip",
)

# Based on the Zenodo OME-Zarrs, create the appropriate OME-Zarrs to be
# used in tests
for zarrname, folder in zip(zarrnames, folders):
if os.path.isdir(str(folder)):
shutil.rmtree(str(folder))
shutil.copytree(str(rootfolder / zarrname), str(folder))

# Update well/FOV ROI tables, by shifting their origin to 0
# TODO: remove this fix, by uploading new zarrs to zenodo (ref
# issue 526)
image_group_path = folder / "B/03/0"
group_image = zarr.open_group(str(image_group_path))
for table_name in ["FOV_ROI_table", "well_ROI_table"]:
table_path = str(image_group_path / "tables" / table_name)
old_table = ad.read_zarr(table_path)
new_table = reset_origin(old_table)
write_table(
group_image,
table_name,
new_table,
overwrite=True,
logger=logging.getLogger(),
)
shutil.move(str(rootfolder / zarrname), str(folder))

# Update well/FOV ROI tables, by shifting their origin to 0
# TODO: remove this fix, by uploading new zarrs to zenodo (ref
# issue 526)
image_group_path = folder / "B/03/0"
group_image = zarr.open_group(str(image_group_path))
for table_name in ["FOV_ROI_table", "well_ROI_table"]:
table_path = str(image_group_path / "tables" / table_name)
old_table = ad.read_zarr(table_path)
new_table = reset_origin(old_table)
write_table(
group_image,
table_name,
new_table,
overwrite=True,
logger=logging.getLogger(),
)

folders = [str(f) for f in folders]

t_end = time.perf_counter()
with capsys.disabled():
print(f"\n Time spent in zenodo_zarr: {t_end-t_start:.2f} s")

logging.warning(f"\n Time spent in zenodo_zarr: {t_end-t_start:.2f} s")
return folders


Expand Down
39 changes: 39 additions & 0 deletions tests/data/download_zenodo_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
#
# Download the Zenodo records used by the test suite.
#
# For each record ID, every file listed by the Zenodo API is saved into
# tests/data/10_5281_zenodo_<RECORD_ID>/. Paths are relative, so the script
# must be run from the repository root
# (e.g. `bash tests/data/download_zenodo_data.sh`).
#
# A record whose output folder already exists is skipped. Because of that,
# any failure while populating a folder removes the folder and aborts with a
# non-zero exit code: a half-filled folder would otherwise be mistaken for a
# complete download on the next run.
#
# Requires: curl, jq, wget, unzip.

# Make a pipeline fail if any of its commands fails (not only the last one),
# so that a curl error is not masked by a successful jq.
set -o pipefail

LIST_RECORD_ID="7059515 8091756"

# Report an error, remove the partially-populated output folder, and stop.
abort_record() {
    echo "ERROR: $1" >&2
    rm -rf "$OUTPUT_FOLDER"
    exit 1
}

for RECORD_ID in $LIST_RECORD_ID; do
    echo "****************************"
    echo "START RECORD_ID=$RECORD_ID"
    OUTPUT_FOLDER="tests/data/10_5281_zenodo_$RECORD_ID"
    echo "OUTPUT_FOLDER: $OUTPUT_FOLDER"

    if [ -d "$OUTPUT_FOLDER" ]; then
        echo "OUTPUT_FOLDER already exists. Skip download."
    else
        mkdir -p "$OUTPUT_FOLDER"

        # Ask the Zenodo API for the download link of each file in the record.
        # --fail turns HTTP errors into a non-zero curl exit code.
        FILES=$(curl --fail "https://zenodo.org/api/records/$RECORD_ID" | jq -r ".files[].links.download")
        STATUS=$?
        echo "curl/jq exit code: $STATUS"
        echo
        if [ "$STATUS" -ne 0 ]; then
            abort_record "could not list files of record $RECORD_ID"
        fi
        if [ -z "$FILES" ]; then
            abort_record "record $RECORD_ID lists no files"
        fi

        while IFS= read -r FILE; do
            [ -z "$FILE" ] && continue
            # Links end with "<filename>/content"; strip the suffix to
            # recover the file name.
            FILEPATH=${FILE%"/content"}
            FILENAME=$(basename "$FILEPATH")
            echo "FILE: $FILE"
            echo "FILEPATH: $FILEPATH"
            echo "FILENAME: $FILENAME"
            echo
            if ! wget --no-verbose "$FILE" --output-document="${OUTPUT_FOLDER}/${FILENAME}"; then
                abort_record "download of $FILE failed"
            fi
            echo
        done <<< "$FILES"

        # This record ships zipped OME-Zarr containers; extract them in place.
        if [ "$RECORD_ID" = "8091756" ]; then
            for ZARR_NAME in \
                "20200812-CardiomyocyteDifferentiation14-Cycle1.zarr" \
                "20200812-CardiomyocyteDifferentiation14-Cycle1_mip.zarr"; do
                if ! unzip "${OUTPUT_FOLDER}/${ZARR_NAME}.zip" -d "$OUTPUT_FOLDER"; then
                    abort_record "could not unzip ${ZARR_NAME}.zip"
                fi
            done
        fi
    fi

    echo "END RECORD_ID=$RECORD_ID"
    echo "****************************"
    echo
done

0 comments on commit 6216926

Please sign in to comment.