From 791b7f7f543d35212c481a3382b486c3d6417918 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Thu, 11 Jul 2024 17:52:42 +0000 Subject: [PATCH 1/9] PyPI publish workflow added. --- .github/workflows/cd.yml | 53 +++++++++++++++++++ .../{python-package-conda.yml => ci.yml} | 2 +- MANIFEST.in | 4 +- setup.py | 8 +-- 4 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/cd.yml rename .github/workflows/{python-package-conda.yml => ci.yml} (99%) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 00000000..85581614 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,53 @@ +name: Publish DLIO Benchmark + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. 
+ python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + + needs: + - release-build + + permissions: + id-token: write + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_DLIO_TOKEN }} diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/ci.yml similarity index 99% rename from .github/workflows/python-package-conda.yml rename to .github/workflows/ci.yml index 70be28f8..de05c225 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: Python Package using Conda +name: Build and Test DLIO Benchmark on: pull_request: diff --git a/MANIFEST.in b/MANIFEST.in index 49623d51..3ee4b4c1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ -include requirements.txt -recursive-include configs * \ No newline at end of file +prune docs +recursive-include dlio_benchmark/configs *.yaml \ No newline at end of file diff --git a/setup.py b/setup.py index 6a2478a8..fd77d9a1 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ long_description_content_type="text/markdown", url="https://github.com/argonne-lcf/dlio_benchmark", author="Huihuo Zheng, Hariharan Devarajan (Hari)", - email="zhenghh04@gmail.com, mani.hariharan@gmail.com", + author_email="zhenghh04@gmail.com, mani.hariharan@gmail.com", classifiers=[ # Optional # How mature is this project? 
Common values are # 3 - Alpha @@ -60,10 +60,10 @@ # 5 - Production/Stable "Development Status :: 5 - Production/Stable", # Indicate who your project is intended for - "Intended Audience :: HPC", + "Intended Audience :: Science/Research", "Topic :: Software Development :: Build Tools", # Pick your license as you wish - "License :: OSI Approved :: Apache 2.0 License", + "License :: OSI Approved :: Apache Software License", # Specify the Python versions you support here. In particular, ensure # that you indicate you support Python 3. These classifiers are *not* # checked by 'pip install'. See instead 'python_requires' below. @@ -71,6 +71,8 @@ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", ], keywords="deep learning, I/O, benchmark, NPZ, pytorch benchmark, tensorflow benchmark", From 05c9b61c5cc4bc29dd4cd3adef3ed62c5f914a51 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Thu, 11 Jul 2024 17:54:47 +0000 Subject: [PATCH 2/9] `requirements.txt` cleaned up. 
--- dev-requirements.txt | 62 -------------------------------------- requirements.txt | 72 ++++++++++---------------------------------- 2 files changed, 16 insertions(+), 118 deletions(-) delete mode 100644 dev-requirements.txt diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index d6c1bd55..00000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,62 +0,0 @@ -# Use cpu version of torch ---extra-index-url https://download.pytorch.org/whl/cpu ---extra-index-url https://developer.download.nvidia.com/compute/redist - -absl-py==1.3.0 -antlr4-python3-runtime==4.9.3 -astunparse==1.6.3 -cachetools==5.2.0 -certifi==2022.9.24 -charset-normalizer==2.1.1 -flatbuffers==22.10.26 -gast==0.4.0 -google-auth==2.14.1 -google-auth-oauthlib==0.4.6 -google-pasta==0.2.0 -grpcio==1.51.0 -h5py==3.7.0 -hydra-core==1.2.0 -idna==3.4 -keras==2.11.0 -libclang==14.0.6 -Markdown==3.4.1 -MarkupSafe==2.1.1 -mpi4py==3.1.4 -numpy==1.23.5 -oauthlib==3.2.2 -omegaconf==2.2.3 -opt-einsum==3.3.0 -packaging==21.3 -pandas==1.5.1 -Pillow==9.3.0 -protobuf==3.19.6 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pyparsing==3.0.9 -python-dateutil==2.8.2 -pytz==2022.6 -PyYAML==6.0 -requests==2.28.1 -requests-oauthlib==1.3.1 -rsa==4.9 -six==1.16.0 -tensorboard==2.11.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -tensorflow==2.11.0 -tensorflow-io==0.28.0 -tensorflow-estimator==2.11.0 -termcolor==2.1.1 -torch==1.13.0 -torchaudio==0.13.0 -torchvision==0.14.0 -typing_extensions==4.4.0 -urllib3==1.26.12 -Werkzeug==2.2.2 -wrapt==1.14.1 -pytest -pytest-mpi -pytest-subtests -pytest-timeout -nvidia-dali-cuda110 -psutil \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b45f0908..7177bfdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,57 +1,17 @@ -absl-py>=1.3.0 -antlr4-python3-runtime>=4.9.3 -astunparse>=1.6.3 -cachetools>=5.2.0 -certifi>=2022.9.24 -charset-normalizer>=2.1.1 +--extra-index-url https://download.pytorch.org/whl/cpu 
+--extra-index-url https://developer.download.nvidia.com/compute/redist + +PyYAML==6.0.1 +hydra-core==1.3.2 +mpi4py==3.1.6 +numpy==1.23.5 +nvidia-dali-cuda110==1.39.0 +omegaconf==2.2.3 +pandas==1.5.3 +Pillow==9.3.0 +psutil==5.9.8 pydftracer==1.0.2 -flatbuffers>=23.5.26 -gast>=0.4.0 -google-auth>=2.14.1 -google-auth-oauthlib>=0.7.0 -google-pasta>=0.2.0 -grpcio>=1.51.0 -h5py>=3.7.0 -hydra-core>=1.2.0 -idna>=3.4 -keras>=2.15.0 -libclang>=14.0.6 -Markdown>=3.4.1 -MarkupSafe>=2.1.1 -mpi4py>=3.1.4 -numpy>=1.23.5 -nvidia-dali-cuda110>=1.34.0 -oauthlib>=3.2.2 -omegaconf>=2.2.3 -opt-einsum>=3.3.0 -packaging>=21.3 -pandas>=1.5.1 -Pillow>=9.3.0 -protobuf>=4.23.4 -psutil>=5.9.8 -pyasn1>=0.4.8 -pyasn1-modules>=0.2.8 -pyparsing>=3.0.9 -python-dateutil>=2.8.2 -pytz>=2022.6 -PyYAML>=6.0 -requests>=2.28.1 -requests-oauthlib>=1.3.1 -rsa>=4.9 -six>=1.16.0 -tensorboard>=2.11.0 -tensorboard-data-server>=0.7.2 -tensorboard-plugin-wit>=1.8.1 -tensorflow>=2.11.0 -tensorflow-io>=0.28.0 -tensorflow-estimator>=2.11.0 -termcolor>=2.1.1 -# Use cpu version of torch ---extra-index-url https://download.pytorch.org/whl/cpu torch>=2.2.0 -torchaudio>=2.2.0 -torchvision>=0.17.0 -typing_extensions>=4.9.0 -urllib3>=1.26.12 -Werkzeug>=2.2.2 -wrapt>=1.14.1 -psutil>=5.9.5 +pytest==8.2.0 +torch==2.3.1 +torchaudio==2.3.1 +triton==2.3.1 From b59aa41bc9f7ea2f275a20ec220018fa62a37683 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Thu, 11 Jul 2024 18:02:52 +0000 Subject: [PATCH 3/9] Action names refactored. 
--- .github/workflows/cd.yml | 2 +- .github/workflows/ci.yml | 2 +- .github/workflows/jekyll-gh-pages.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 85581614..e834b4a2 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -1,4 +1,4 @@ -name: Publish DLIO Benchmark +name: Release on: release: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index de05c225..d3b601ff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: Build and Test DLIO Benchmark +name: Build and Test on: pull_request: diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml index 07173e34..bdb2ab26 100644 --- a/.github/workflows/jekyll-gh-pages.yml +++ b/.github/workflows/jekyll-gh-pages.yml @@ -1,5 +1,5 @@ # Sample workflow for building and deploying a Jekyll site to GitHub Pages -name: Deploy Jekyll with GitHub Pages dependencies preinstalled +name: Deploy Documentation on: # Runs on pushes targeting the default branch @@ -51,5 +51,5 @@ jobs: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v1 - with: + with: folder: _build/html/ From 63fcd11afa7414eb800f02e7f28d81b04f2442d6 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Sun, 14 Jul 2024 19:27:25 -0500 Subject: [PATCH 4/9] Missing requirements fixed. 
--- requirements.txt | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7177bfdd..3374aeb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,18 @@ --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://developer.download.nvidia.com/compute/redist -PyYAML==6.0.1 +Pillow~=9.3.0 +PyYAML~=6.0.0 hydra-core==1.3.2 -mpi4py==3.1.6 -numpy==1.23.5 -nvidia-dali-cuda110==1.39.0 -omegaconf==2.2.3 -pandas==1.5.3 -Pillow==9.3.0 -psutil==5.9.8 +mpi4py~=3.1.4 +numpy~=1.23.5 +nvidia-dali-cuda110>=1.34.0 +omegaconf~=2.2.0 +pandas~=1.5.1 +psutil~=5.9.8 pydftracer==1.0.2 -pytest==8.2.0 -torch==2.3.1 -torchaudio==2.3.1 -triton==2.3.1 +pytest +tensorflow>=2.11.0 +torch>=2.2.0 +torchaudio +torchvision From 78fa9037a1ccd2cf9951317d081fd3b2238cee87 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Sun, 14 Jul 2024 19:27:42 -0500 Subject: [PATCH 5/9] `setup.py` requirement versions refactored. --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index fd77d9a1..49e0176d 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ ] core_deps = [ "Pillow~=9.3.0", - "PyYAML~=6.0", + "PyYAML~=6.0.0", "h5py~=3.11.0", "mpi4py~=3.1.4", "numpy~=1.23.5", @@ -21,8 +21,8 @@ x86_deps = [ f"hydra-core=={HYDRA_VERSION}", "nvidia-dali-cuda110>=1.34.0", - "tensorflow>=2.11", - "torch>=2.2", + "tensorflow>=2.11.0", + "torch>=2.2.0", "torchaudio", "torchvision", ] From d3b5ac462e19f11b78b0005d7f5c36ed0e620275 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Sun, 14 Jul 2024 19:28:37 -0500 Subject: [PATCH 6/9] CI support for running via `requirements.txt`. 
--- .github/workflows/ci.yml | 422 ++++++++++++++++++++------------------- 1 file changed, 216 insertions(+), 206 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d3b601ff..57fc704c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,226 +2,236 @@ name: Build and Test on: pull_request: - branches: [ main, dev ] + branches: [main, dev] push: - + jobs: build-and-test: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04, ubuntu-22.04 ] - profiler: [ 0, 1 ] + os: [ubuntu-20.04, ubuntu-22.04] + profiler: [0, 1] gcc: [10] - python: ["3.8", "3.9", "3.10" ] - name: ${{ matrix.os }}-${{ matrix.profiler }}-${{ matrix.gcc }}-${{ matrix.python }} + python: ["3.8", "3.9", "3.10"] + venv: ["via-setup", "via-reqs"] + name: ${{ matrix.os }}-${{ matrix.gcc }}-${{ matrix.python }}-${{ matrix.profiler }}-${{ matrix.venv }} runs-on: ${{ matrix.os }} env: CC: gcc-${{ matrix.gcc }} CXX: g++-${{ matrix.gcc }} + DFTRACER_BUILD_TYPE: "Debug" DFTRACER_ENABLE: ${{ matrix.profiler }} DFTRACER_LOG_LEVEL: "DEBUG" GOTCHA_DEBUG: 3 + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 PYTHON_VER: ${{ matrix.python }} RDMAV_FORK_SAFE: "1" - DFTRACER_BUILD_TYPE: "Debug" - VENV: "/home/runner/work/venv" + VENV_PATH: "/home/runner/work/.venv/${{ matrix.venv }}" steps: - - name: clear disc - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - name: Push checkout - if: github.event_name == 'push' - uses: actions/checkout@v3 - - name: PR checkout - if: github.event_name == 'pull_request' - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python }} - - name: Cache install modules - id: cache-modules - uses: actions/cache@v3 - with: - path: ${{ env.VENV }} - key: ${{env.VENV 
}}-${{env.DFTRACER_ENABLE}}-${{ matrix.gcc }}-${{ matrix.python }}-${{ hashFiles('setup.py') }} - - name: Install System Tools - run: | - sudo apt update - sudo apt-get install $CC $CXX libc6 git - sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev - - name: Install DLIO - if: steps.cache-modules.outputs.cache-hit != 'true' - run: | - echo "Tracer ${DFTRACER_ENABLE} gcc $CC" - python -m pip install --upgrade pip - pip install virtualenv - python -m venv ${VENV} - source ${VENV}/bin/activate - pip install .[test] - - name: test_gen_data - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v - rm -rf data - - name: test_custom_storage_root_gen_data - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v - rm -rf data - - name: test_train - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k 
test_train[tfrecord-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v - rm -rf data - - name: test_custom_storage_root_train - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v - mpirun -np 2 
pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v - rm -rf data - - name: test_checkpoint_epoch - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v - rm -rf data - - name: test_checkpoint_step - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_checkpoint_step -v - - name: test_eval - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_eval -v - - name: test_multi_threads - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v - mpirun 
-np 2 pytest -k test_multi_threads[tensorflow-1] -v - mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v - rm -rf data - - name: test-pytorch-multiprocessing-context - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v - rm -rf data - - name: test-tf-loader-tfrecord - run: | - source ${VENV}/bin/activate - rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 - mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 - rm -rf data - - name: test-torch-loader-npz - run: | - source ${VENV}/bin/activate - rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 
++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - rm -rf data - - name: test-tf-loader-npz - run: | - source ${VENV}/bin/activate - rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - rm -rf data - - name: test_subset - run: | - source ${VENV}/bin/activate - rm -rf output data checkpoints - mpirun -np 2 pytest -k test_subset -v - - name: test_unet3d - run: | - source ${VENV}/bin/activate - rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True 
++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic - rm -rf data - name: test_resnet50 - run: | - source ${VENV}/bin/activate - rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic - rm -rf data - name: test_cosmoflow - run: | - source ${VENV}/bin/activate - rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=cosmoflow_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic - rm -rf data \ No newline at end of file + - name: Clear disc + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Push checkout + if: github.event_name == 'push' + uses: actions/checkout@v3 + - name: PR checkout + if: github.event_name == 'pull_request' + uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python }} + - name: Cache install modules + id: cache-modules + uses: actions/cache@v3 + with: + path: ${{ env.VENV_PATH }} + key: ${{ matrix.os }}-${{ env.VENV_PATH }}-${{ env.DFTRACER_ENABLE }}-${{ matrix.gcc }}-${{ matrix.python }}-${{ hashFiles('setup.py', 'requirements.txt') }} + - name: Install System Tools + run: | + sudo apt 
update + sudo apt-get install -y $CC $CXX libc6 git + sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev + - name: Install DLIO via setup.py + if: matrix.venv == 'via-setup' && steps.cache-modules.outputs.cache-hit != 'true' + run: | + echo "venv: ${{ matrix.venv }} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" + python -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate + pip install --upgrade pip + pip install .[test] + - name: Install DLIO via requirements.txt + if: matrix.venv == 'via-reqs' && steps.cache-modules.outputs.cache-hit != 'true' + run: | + echo "venv: ${{ matrix.venv }} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" + python -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate + pip install --upgrade pip + pip install -r requirements.txt + - name: test_gen_data + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v + rm -rf data + - name: test_custom_storage_root_gen_data + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v + rm -rf data + - name: test_train + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 
2 pytest -k test_train[png-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v + rm -rf data + - name: test_custom_storage_root_train + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k 
test_custom_storage_root_train[png-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v + rm -rf data + - name: test_checkpoint_epoch + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v + rm -rf data + - name: test_checkpoint_step + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_checkpoint_step -v + - 
name: test_eval + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_eval -v + - name: test_multi_threads + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v + mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v + mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v + rm -rf data + - name: test-pytorch-multiprocessing-context + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v + rm -rf data + - name: test-tf-loader-tfrecord + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 + mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 + rm -rf data + - name: test-torch-loader-npz + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 
++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + rm -rf data + - name: test-tf-loader-npz + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + rm -rf data + - name: test_subset + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 pytest -k test_subset -v + - name: test_unet3d + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 
+ mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 + mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic + rm -rf data + - name: test_resnet50 + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 dlio_benchmark workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic + rm -rf data + - name: test_cosmoflow + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 dlio_benchmark workload=cosmoflow_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic + rm -rf data From 973c79a3744f1fe343e20dcca13f8a54c052905d Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Sun, 14 Jul 2024 19:31:51 -0500 Subject: [PATCH 7/9] `VENV_PATH` fixed in CI script. 
--- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57fc704c..352cba25 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,7 +62,7 @@ jobs: - name: Install DLIO via setup.py if: matrix.venv == 'via-setup' && steps.cache-modules.outputs.cache-hit != 'true' run: | - echo "venv: ${matrix.venv} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" + echo "venv: ${VENV_PATH} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" python -m venv ${VENV_PATH} source ${VENV_PATH}/bin/activate pip install --upgrade pip @@ -70,7 +70,7 @@ jobs: - name: Install DLIO via requirements.txt if: matrix.venv == 'via-reqs' && steps.cache-modules.outputs.cache-hit != 'true' run: | - echo "venv: ${matrix.venv} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" + echo "venv: ${VENV_PATH} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" python -m venv ${VENV_PATH} source ${VENV_PATH}/bin/activate pip install --upgrade pip From 0ea0ed7b86b0caecd6209b35b4c1a359f25cddb0 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Mon, 15 Jul 2024 11:40:28 -0500 Subject: [PATCH 8/9] CI script conditional executable fix. 
--- .github/workflows/ci.yml | 51 ++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 352cba25..5550b601 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,7 @@ jobs: DFTRACER_BUILD_TYPE: "Debug" DFTRACER_ENABLE: ${{ matrix.profiler }} DFTRACER_LOG_LEVEL: "DEBUG" + DLIO_EXEC: ${{ matrix.venv == 'via-setup' && 'dlio_benchmark' || 'python dlio_benchmark/main.py' }} GOTCHA_DEBUG: 3 OMPI_ALLOW_RUN_AS_ROOT: 1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 @@ -48,17 +49,20 @@ jobs: uses: actions/setup-python@v3 with: python-version: ${{ matrix.python }} + - name: Add current directory to PYTHONPATH + if: matrix.venv == 'via-reqs' + run: echo "PYTHONPATH=$(pwd):$PYTHONPATH" >> $GITHUB_ENV - name: Cache install modules id: cache-modules uses: actions/cache@v3 with: path: ${{ env.VENV_PATH }} - key: ${{ env.VENV_PATH }}-${{ env.DFTRACER_ENABLE }}-${{ matrix.gcc }}-${{ matrix.python }}-${{ hashFiles('setup.py') }} - - name: Install System Tools + key: ${{ matrix.venv }}-gcc${{ matrix.gcc }}-python${{ matrix.python }}-tracer${{ env.DFTRACER_ENABLE }}-${{ hashFiles('setup.py') }} + - name: Install system dependencies run: | sudo apt update sudo apt-get install -y $CC $CXX libc6 git - sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev + sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev python3-dev - name: Install DLIO via setup.py if: matrix.venv == 'via-setup' && steps.cache-modules.outputs.cache-hit != 'true' run: | @@ -185,53 +189,54 @@ jobs: mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v rm -rf data - - name: test-tf-loader-tfrecord + - name: test_subset run: | source ${VENV_PATH}/bin/activate rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 
++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 - mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 + mpirun -np 2 pytest -k test_subset -v rm -rf data - - name: test-torch-loader-npz + - name: test-tf-loader-tfrecord run: | source ${VENV_PATH}/bin/activate rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 rm -rf data - - name: 
test-tf-loader-npz + - name: test-torch-loader-npz run: | source ${VENV_PATH}/bin/activate rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 rm -rf data - - name: test_subset + - name: test-tf-loader-npz run: | source ${VENV_PATH}/bin/activate rm -rf 
output data checkpoints - mpirun -np 2 pytest -k test_subset -v + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + rm -rf data - name: test_unet3d run: | source ${VENV_PATH}/bin/activate rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic rm -rf data - name: 
test_resnet50 run: | source ${VENV_PATH}/bin/activate rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic rm -rf data - name: test_cosmoflow run: | source ${VENV_PATH}/bin/activate rm -rf output data checkpoints - mpirun -np 2 dlio_benchmark workload=cosmoflow_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic rm -rf data From e8338f16b9815da14d17fa94f346fd16c1cf3027 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Tue, 
16 Jul 2024 17:29:36 -0500 Subject: [PATCH 9/9] Cleared redundant CI matrix options. --- .github/workflows/ci.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5550b601..8fe5ce04 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,18 +10,17 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, ubuntu-22.04] - profiler: [0, 1] + os: [ubuntu-22.04] gcc: [10] - python: ["3.8", "3.9", "3.10"] + python: ["3.9", "3.10", "3.11"] venv: ["via-setup", "via-reqs"] - name: ${{ matrix.os }}-${{ matrix.gcc }}-${{ matrix.python }}-${{ matrix.profiler }}-${{ matrix.venv }} + name: ${{ matrix.os }}-${{ matrix.gcc }}-${{ matrix.python }}-${{ matrix.venv }} runs-on: ${{ matrix.os }} env: CC: gcc-${{ matrix.gcc }} CXX: g++-${{ matrix.gcc }} DFTRACER_BUILD_TYPE: "Debug" - DFTRACER_ENABLE: ${{ matrix.profiler }} + DFTRACER_ENABLE: 1 DFTRACER_LOG_LEVEL: "DEBUG" DLIO_EXEC: ${{ matrix.venv == 'via-setup' && 'dlio_benchmark' || 'python dlio_benchmark/main.py' }} GOTCHA_DEBUG: 3 @@ -57,7 +56,7 @@ jobs: uses: actions/cache@v3 with: path: ${{ env.VENV_PATH }} - key: ${{ matrix.venv }}-gcc${{ matrix.gcc }}-python${{ matrix.python }}-tracer${{ env.DFTRACER_ENABLE }}-${{ hashFiles('setup.py') }} + key: ${{ matrix.venv }}-gcc${{ matrix.gcc }}-python${{ matrix.python }}-${{ hashFiles('requirements.txt', 'setup.py') }} - name: Install system dependencies run: | sudo apt update @@ -66,7 +65,7 @@ jobs: - name: Install DLIO via setup.py if: matrix.venv == 'via-setup' && steps.cache-modules.outputs.cache-hit != 'true' run: | - echo "venv: ${VENV_PATH} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" + echo "venv: ${VENV_PATH} - gcc: $CC" python -m venv ${VENV_PATH} source ${VENV_PATH}/bin/activate pip install --upgrade pip @@ -74,7 +73,7 @@ jobs: - name: Install DLIO via requirements.txt if: matrix.venv == 'via-reqs' && steps.cache-modules.outputs.cache-hit != 'true' run: | - echo 
"venv: ${VENV_PATH} - tracer: ${DFTRACER_ENABLE} - gcc: $CC" + echo "venv: ${VENV_PATH} - gcc: $CC" python -m venv ${VENV_PATH} source ${VENV_PATH}/bin/activate pip install --upgrade pip