Skip to content

[TUTORIAL] Add the non-persistent softmax and make it for CPU #34

[TUTORIAL] Add the non-persistent softmax and make it for CPU

[TUTORIAL] Add the non-persistent softmax and make it for CPU #34

# AUTOGENERATED by pre-commit, modify the .in file instead.
# integration-tests.yml.in is used to generate integration-tests.yml by
# expanding yaml anchors, because github actions don't support them
# (https://github.com/actions/runner/issues/1182). pre-commit will do this for
# you automatically.
name: Integration Tests

# Events that start this workflow.
on:
  workflow_dispatch:
  pull_request:
    # You can name your branch dev-foo to get CI runs.
    branches:
      - main
      - 'dev-**'
  merge_group:
    branches:
      - main
      - 'dev-**'
    types:
      - checks_requested
  push:
    branches:
      - main

# Runs are grouped per ref; an in-progress run is cancelled by a newer one on
# every ref except refs/heads/main.
concurrency:
  group: ${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

permissions: read-all

# Environment shared by every job below.
env:
  TRITON_BUILD_WITH_CLANG_LLD: 'TRUE'
  TRITON_USE_ASSERT_ENABLED_LLVM: 'TRUE'
  TRITON_DISABLE_LINE_INFO: 1

jobs:
Runner-Preparation:
  # Decides whether the integration tests should run for this event and, if so,
  # publishes the CUDA and HIP runner matrices as job outputs. When the
  # "Prepare runner matrix" step is skipped the outputs stay empty, which the
  # Integration-Tests* jobs test for in their `if:` conditions.
  runs-on: ubuntu-latest
  timeout-minutes: 30
  outputs:
    matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
    matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
  steps:
    - name: Decide pre-submit integration test enablement
      # Always enable integration tests for pre-submit pull requests.
      if: github.event_name == 'pull_request'
      run: |
        echo "enable_integration=true" >> $GITHUB_ENV
    - name: Checkout post-submit commits
      if: github.event_name == 'push'
      uses: actions/checkout@v4
      with:
        # Only fetch two commits to check the latest changed files.
        fetch-depth: 2
    - name: Detect if build deps (e.g. LLVM hash) changed
      id: detect-change
      if: github.event_name == 'push'
      # NOTE(review): third-party action referenced by a mutable tag; consider
      # pinning it to a full commit SHA.
      uses: tj-actions/changed-files@v44
      with:
        files: |
          cmake/*.txt
    - name: Detect if enough time has passed since last post-submit run
      id: detect-time
      if: github.event_name == 'push'
      env:
        # Handed over through the environment so that the token is never
        # spliced into the script text itself.
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        REPO_NAME: ${{ github.repository }}
      run: |
        # ID of integration-tests workflow
        WORKFLOW_ID="11678186"
        # Fetch the last run time of this workflow. `// empty` turns a missing
        # (null) timestamp into an empty string, and `|| true` keeps a failed
        # request or unparsable response from aborting the step.
        LAST_RUN=$(curl -s \
          -H "Authorization: token $GITHUB_TOKEN" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/$REPO_NAME/actions/workflows/$WORKFLOW_ID/runs?branch=main&status=success&per_page=1" \
          | jq -r '.workflow_runs[0].updated_at // empty' || true)
        if [ -z "$LAST_RUN" ]; then
          # No successful run could be determined: err on the side of running
          # CI instead of failing (or mis-parsing an empty date) below.
          echo "Could not determine the last successful run; will run CI."
          echo "n_hours_since_last_run=true" >> $GITHUB_ENV
          exit 0
        fi
        # Convert to timestamp
        LAST_RUN_TS=$(date -d "$LAST_RUN" +%s)
        NOW_TS=$(date +%s)
        DIFF=$(( (NOW_TS - LAST_RUN_TS) / 3600 )) # Difference in hours
        echo "Last run was $DIFF hours ago."
        if [ "$DIFF" -ge 4 ]; then
          echo "Will run CI; last build was long enough ago."
          echo "n_hours_since_last_run=true" >> $GITHUB_ENV
        else
          echo "Will not run CI; last build was too recent."
          echo "n_hours_since_last_run=false" >> $GITHUB_ENV
        fi
    # We want to run integration tests on the main branch (i.e. post-submit)
    # occasionally, because pre-submit CI caches will only read from caches
    # generated from the main branch (or the PR's branch), and we want these
    # caches to be recent.
    #
    # But we also don't want to run the tests on *every* commit, because this
    # would compete for resources with pre-commit CI (and the whole point of
    # caching is to speed up CI).
    #
    # As a compromise, run every N hours, or if a build dependency changes
    # (e.g. we update the LLVM hash).
    - name: Decide whether to run integration tests post-submit
      if: |
        github.event_name == 'push' &&
        (steps.detect-change.outputs.any_changed == 'true' ||
        env.n_hours_since_last_run == 'true')
      run: |
        echo "enable_integration=true" >> $GITHUB_ENV
    - name: Prepare runner matrix
      id: set-matrix
      if: env.enable_integration == 'true'
      # Step outputs are written to the $GITHUB_OUTPUT file; the older
      # `::set-output` workflow command is deprecated.
      run: |
        if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
          echo 'matrix-CUDA=[["self-hosted", "A100"], ["self-hosted", "H100"]]' >> "$GITHUB_OUTPUT"
          echo 'matrix-HIP=[["self-hosted", "gfx90a"]]' >> "$GITHUB_OUTPUT"
        else
          echo 'matrix-CUDA=["ubuntu-latest"]' >> "$GITHUB_OUTPUT"
          echo 'matrix-HIP=["ubuntu-latest"]' >> "$GITHUB_OUTPUT"
        fi
pre-commit:
  # Runs every pre-commit hook over the whole tree and prints the resulting
  # diff if any step of the job fails.
  name: pre-commit (code formatting)
  runs-on: ubuntu-latest
  needs: Runner-Preparation
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        cache: pip
        python-version: "3.12"
    # The hash of the hook configuration is used in the cache key below.
    - name: Compute hash of pre-commit config
      id: cache-key
      shell: bash
      run: |
        echo "pre_commit_hash=$(sha256sum .pre-commit-config.yaml)" >> $GITHUB_OUTPUT
    - name: Cache pre-commit's cache dir
      uses: actions/cache@v4
      with:
        key: ${{ runner.os }}-${{ steps.cache-key.outputs.pre_commit_hash }}
        # Note that we cannot use environment variables here given there is
        # no shell to interpret them in the paths.
        path: |
          ~/.cache/pre-commit
    - name: Check pre-commit
      run: |
        python3 -m pip install --upgrade pre-commit
        # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed
        python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true
        # If first run of yapf worked and made changes reset the tree to the original state
        git reset --hard
        python3 -m pre_commit run --all-files --verbose
    - name: Print diff of changes if pre-commit failed
      if: failure()
      run: |
        git diff
Integration-Tests:
  # Builds Triton from source and runs the lit, Python, interpreter, C++ and
  # Proton test suites on each runner of the CUDA matrix published by
  # Runner-Preparation. The job is skipped when that matrix is empty.
  needs: Runner-Preparation
  if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 30
  strategy:
    matrix:
      runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}}
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: "true"
    - name: Compute cache keys
      id: cache-key
      run: |
        echo "llvm=$(cat cmake/llvm-hash.txt | cut -c 1-8)" >> $GITHUB_OUTPUT
        echo "pybind11=$(cat cmake/pybind11-version.txt)" >> $GITHUB_OUTPUT
        echo "nvidia=$(cat cmake/nvidia-toolchain-version.txt)" >> $GITHUB_OUTPUT
        echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
      shell: bash
    - name: Cache build dependencies
      uses: actions/cache@v4
      with:
        # Note that we cannot use environment variables here given there is
        # no shell to interpret them in the paths.
        path: |
          ~/.triton/llvm
          ~/.triton/nvidia
          ~/.triton/pybind11
        key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}
    # Cache ~/.triton/cache because the vast majority of unit test time is
    # spent compiling. Triton won't (well, should not) use these cached files
    # if something internal to Triton changes, because Triton's internal
    # source code is part of the cache key.
    #
    # Similarly, cache ~/.cache/ccache to speed up compilation.
    #
    # On branch `main` we always start from an empty cache, i.e. we skip the
    # "restore" step. This is to prevent the caches from accumulating stale
    # files over time.
    - name: Restore cache of ccache and Triton compilation artifacts
      if: github.event_name != 'push'
      uses: actions/cache/restore@v4
      with:
        path: |
          ~/.triton/cache
          ~/.cache/ccache
        # Restore the most recent cache entry.
        restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-
        # We expect this cache key never to hit and for us to fall back
        # unconditionally to the restore-key, so it doesn't actually matter
        # what we put here (so long as it doesn't hit an existing key).
        key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
    - name: Inspect cache directory
      run: |
        mkdir -p ~/.triton
        ls -alh ~/.triton
    - name: Update PATH
      run: |
        echo "$HOME/.local/bin" >> $GITHUB_PATH
    - name: Install pip dependencies
      run: |
        python3 -m pip install --upgrade pip
        python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit
    - name: Install Triton
      env:
        TRITON_BUILD_WITH_CCACHE: "true"
        CUDA_HOME: "/usr/local/cuda"
      run: |
        echo "PATH is '$PATH'"
        cd python
        python3 -m pip install --no-build-isolation -vvv '.[tests]'
    - name: Run lit tests
      run: |
        cd python
        # The build directory name is discovered by matching "cmake" in it.
        LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
        if [ ! -d "${LIT_TEST_DIR}" ]; then
          echo "Could not find '${LIT_TEST_DIR}'" ; exit 1
        fi
        lit -v "${LIT_TEST_DIR}"
    - name: Run python tests on CUDA
      run: |
        cd python/test/unit
        python3 -m pytest -vvv -n 8 --ignore=hopper/test_flashattention.py --ignore=runtime --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
        python3 -m pytest -vvv -n 8 language/test_subprocess.py
        # Run runtime tests serially to avoid race condition with cache handling
        python3 -m pytest -vvv runtime/
        # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
        TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -vvv language/test_line_info.py
        # Run hopper/test_flashattention.py separately to avoid out of gpu memory
        python3 -m pytest -vs hopper/test_flashattention.py
    # Only runs on the ["self-hosted", "H100"] matrix entry.
    - name: Run interpreter tests
      if: ${{matrix.runner[0] == 'self-hosted' && matrix.runner[1] == 'H100'}}
      env:
        TRITON_INTERPRET: "1"
      run: |
        cd python/test/unit
        python3 -m pytest -vvv -n 16 -m interpreter language/test_core.py language/test_standard.py \
          language/test_random.py language/test_block_pointer.py language/test_subprocess.py \
          operators/test_flash_attention.py::test_op \
          ../../tutorials/06-fused-attention.py::test_op --device cpu
    - name: Run C++ unittests
      run: |
        cd python
        cd "build/$(ls build | grep -i cmake)"
        ctest
    - name: Run Proton tests
      run: |
        # Values under `env:` are not shell-expanded, so a "$LD_LIBRARY_PATH"
        # written there would stay literal. Prepend the CUPTI directory here,
        # where the existing value (if any) is actually resolved.
        export LD_LIBRARY_PATH="/usr/local/cuda/extras/CUPTI/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
        cd third_party/proton
        python3 -m pytest -vvv test
    # If we're on branch `main`, save the ccache Triton compilation artifacts
    # to the cache so they can be used by other (non-main) CI runs.
    #
    # (It wouldn't be a problem to save the cache on every run, because github
    # evicts cache entries LRU, but maybe this saves a bit of time in CI.)
    - name: Save ccache and Triton compilation artifacts to cache
      if: github.ref == 'refs/heads/main'
      uses: actions/cache/save@v4
      with:
        # One path per line, identical to the list in the restore step: the
        # action splits `path` on newlines, and the path list has to match
        # between save and restore for an entry to be found.
        path: |
          ~/.triton/cache
          ~/.cache/ccache
        key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
    - name: Inspect cache directories
      run: |
        mkdir -p ~/.triton
        ls -alh ~/.triton
        du -sh ~/.triton/**
        mkdir -p ~/.cache/ccache
        ls -alh ~/.cache/ccache
        du -sh ~/.cache/ccache
Integration-Tests-AMD:
  # Builds Triton from source and runs the lit, Python and C++ test suites on
  # each runner of the HIP matrix published by Runner-Preparation, inside a
  # rocm/pytorch container. The job is skipped when that matrix is empty.
  needs: Runner-Preparation
  if: needs.Runner-Preparation.outputs.matrix-HIP != ''
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 30
  strategy:
    matrix:
      runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}}
  container:
    image: rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2
    # Passes /dev/kfd and /dev/dri into the container and runs as root.
    options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: 'true'
    - name: Compute cache keys
      id: cache-key
      run: |
        echo "llvm=$(cat cmake/llvm-hash.txt | cut -c 1-8)" >> $GITHUB_OUTPUT
        echo "pybind11=$(cat cmake/pybind11-version.txt)" >> $GITHUB_OUTPUT
        echo "nvidia=$(cat cmake/nvidia-toolchain-version.txt)" >> $GITHUB_OUTPUT
        echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
      shell: bash
    - name: Cache build dependencies
      uses: actions/cache@v4
      with:
        # Note that we cannot use environment variables here given there is
        # no shell to interpret them in the paths.
        path: |
          ~/.triton/llvm
          ~/.triton/nvidia
          ~/.triton/pybind11
        key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}
    # Cache ~/.triton/cache because the vast majority of unit test time is
    # spent compiling. Triton won't (well, should not) use these cached files
    # if something internal to Triton changes, because Triton's internal
    # source code is part of the cache key.
    #
    # Similarly, cache ~/.cache/ccache to speed up compilation.
    #
    # On branch `main` we always start from an empty cache, i.e. we skip the
    # "restore" step. This is to prevent the caches from accumulating stale
    # files over time.
    - name: Restore cache of ccache and Triton compilation artifacts
      if: github.event_name != 'push'
      uses: actions/cache/restore@v4
      with:
        path: |
          ~/.triton/cache
          ~/.cache/ccache
        # Restore the most recent cache entry.
        restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-
        # We expect this cache key never to hit and for us to fall back
        # unconditionally to the restore-key, so it doesn't actually matter
        # what we put here (so long as it doesn't hit an existing key).
        key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
    - name: Inspect cache directory
      run: |
        mkdir -p ~/.triton
        ls -alh ~/.triton
    - name: Update PATH
      run: |
        echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
    - name: Install pip dependencies
      run: |
        python3 -m pip install --upgrade pip
        python3 -m pip install lit
    - name: Install Triton
      run: |
        echo "PATH is '$PATH'"
        # Remove any already-installed triton (presumably one shipped with the
        # container image -- TODO confirm) before the editable install.
        pip uninstall -y triton
        cd python
        pip install -v -e .
    - name: Run lit tests
      run: |
        cd python
        # The build directory name is discovered by matching "cmake" in it.
        LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
        if [ ! -d "${LIT_TEST_DIR}" ]; then
          echo "Could not find '${LIT_TEST_DIR}'" ; exit 1
        fi
        lit -v "${LIT_TEST_DIR}"
    - name: Run python tests on HIP
      run: |
        pytest --capture=tee-sys -rfs -vvv python/tutorials/06-fused-attention.py
        cd python/test/unit
        pytest --capture=tee-sys -rfs -vvv -n 32 language operators \
          hopper/test_mixed_io.py \
          hopper/test_gemm.py \
          hopper/test_tma_store_gemm.py \
          hopper/test_persistent_warp_specialized_fused-attention.py \
          --ignore=language/test_line_info.py
        # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
        TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -vvv -n 8 language/test_line_info.py
        # Run runtime tests serially to avoid race condition with cache handling
        python3 -m pytest -vvv runtime
    - name: Run C++ unittests
      run: |
        cd python
        cd "build/$(ls build | grep -i cmake)"
        ctest
    # If we're on branch `main`, save the ccache Triton compilation artifacts
    # to the cache so they can be used by other (non-main) CI runs.
    #
    # (It wouldn't be a problem to save the cache on every run, because github
    # evicts cache entries LRU, but maybe this saves a bit of time in CI.)
    - name: Save ccache and Triton compilation artifacts to cache
      if: github.ref == 'refs/heads/main'
      uses: actions/cache/save@v4
      with:
        # One path per line, identical to the list in the restore step: the
        # action splits `path` on newlines, and the path list has to match
        # between save and restore for an entry to be found.
        path: |
          ~/.triton/cache
          ~/.cache/ccache
        key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
    - name: Inspect cache directories
      run: |
        mkdir -p ~/.triton
        ls -alh ~/.triton
        du -sh ~/.triton/**
        mkdir -p ~/.cache/ccache
        ls -alh ~/.cache/ccache
        du -sh ~/.cache/ccache