[not for merge] test armv8 CI

triton-lang · Dec 19, 2024 · 00f231b · 00f231b
1 parent 5830c67
commit 00f231b
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 13 deletions.
diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
@@ -45,14 +45,14 @@ jobs:
           python3 -m pre_commit run --show-diff-on-failure --color=always --all-files --verbose
 
   build-test:
-    name: Build and test
-    runs-on:
-      - glados
-      - intel
-      - x86
+    name: Build and test on ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runs_on }}
     strategy:
       matrix:
         python: ['3.11']
+        config:
+          - {runner: 'Ubuntu Intel x86', runs_on: ['glados', 'intel', 'x86'], target-os: 'ubuntu', arch: 'x86'}
+          - {runner: 'MacOS-latest ARM64', runs_on: ['macos-latest'], target-os: 'macos',  arch: 'arm64'}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -65,11 +65,16 @@ jobs:
           python-version: ${{ matrix.python }}
 
       - name: Install pip and apt dependencies
+        env:
+          RUNNER_TARGET_OS: ${{ matrix.config.target-os }}
         run: |
+          echo "RUNNER_TARGET_OS: ${RUNNER_TARGET_OS}"
           python3 -m pip install --upgrade pip
           python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit pybind11
-          sudo apt-get update
-          sudo apt-get install -y zlib1g-dev g++
+          if [[ "${RUNNER_TARGET_OS}" == "ubuntu" ]]; then
+            sudo apt-get update
+            sudo apt-get install -y zlib1g-dev g++
+          fi
           pip install torch==2.1.2
 
       - name: Install Triton
@@ -79,7 +84,21 @@ jobs:
           python3 -m pip install --no-build-isolation -vvv '.[tests]'
 
       - name: Run python unit tests
+        env:
+          RUNNER_TARGET_OS: ${{ matrix.config.target-os }}
         run: |
+          if [[ "${RUNNER_TARGET_OS}" == "macos" ]]; then
+            export TRITON_DISABLE_OPENMP=1 # temporary
+            which clang
+            export CC=$(which clang)
+            clang --version
+            xcode-select -p
+            echo "TRITON_DISABLE_OPENMP=${TRITON_DISABLE_OPENMP:-0}"
+            export TRITON_SYS_PATH=/
+            export TRITON_CPU_BACKEND=1 
+            $(which python3) python/tutorials/01-vector-add.py
+          fi
+
           python -m pytest -s -n 32 --device cpu python/test/unit/language/test_core.py -m cpu
           python -m pytest -s -n 32 --device cpu \
             python/test/unit/cpu/test_math.py \

diff --git a/python/setup.py b/python/setup.py
@@ -757,7 +757,7 @@ def get_git_commit_hash(length=8):
             "pytest-forked",
             "pytest-xdist",
             "scipy>=1.7.1",
-            "llnl-hatchet",
+            # "llnl-hatchet", # TODO: Re-enable this, not available on macos-arm64
         ],
         "tutorials": [
             "matplotlib",

diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py
@@ -93,6 +93,9 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries):
         if system == "Linux" and machine in ("aarch64", "arm64"):
             # On Arm backend, some CPU (neoverse-v2) needs to be specified through -mcpu
             cc_cmd += ["-mcpu=native"]
+    print("*****")
+    print(f"cc_cmd: {cc_cmd}")
+    print("*****")
     ret = subprocess.check_call(cc_cmd)
     if ret == 0:
         return so

diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py
@@ -24,7 +24,7 @@
 import triton.language as tl
 
 GPU_BLOCK_SIZE = 1024
-CPU_BLOCK_SIZE = 4096
+CPU_BLOCK_SIZE = 16
 # Single Thread Threshold
 CPU_ST_THRESHOLD = 65536
 USE_GPU = False
@@ -224,7 +224,7 @@ def add_tiled_autotuned(x: torch.Tensor, y: torch.Tensor, output):
 @triton.testing.perf_report(
     triton.testing.Benchmark(
         x_names=['size'],  # Argument names to use as an x-axis for the plot.
-        x_vals=[2**i for i in range(12, 28, 1)],  # Different possible values for `x_name`.
+        x_vals=[2**i for i in range(12, 18, 1)],  # Different possible values for `x_name`.
         x_log=True,  # x axis is logarithmic.
         line_arg='provider',  # Argument name whose value corresponds to a different line in the plot.
         line_vals=LINE_VALS,  # Possible values for `line_arg`.
@@ -280,4 +280,4 @@ def benchmark(size, provider):
 # %%
 # We can now run the decorated function above. Pass `print_data=True` to see the performance numbers, `show_plots=True` to plot them, and/or
 # `save_path='/path/to/results/'` to save them to disk along with raw CSV data:
-benchmark.run(print_data=True, show_plots=True)
+benchmark.run(print_data=True, show_plots=False)
diff --git a/third_party/cpu/backend/driver.py b/third_party/cpu/backend/driver.py
@@ -12,6 +12,7 @@
 from triton.backends.driver import DriverBase
 from triton.backends.compiler import GPUTarget
 
+from pathlib import Path
 from triton._C.libtriton import llvm
 
 _dirname = os.getenv("TRITON_SYS_PATH", default="/usr/local")
@@ -22,10 +23,18 @@
     # resources.files() doesn't exist for Python < 3.9
     _triton_C_dir = importlib.resources.path(triton, "_C").__enter__()
 
-include_dirs = [os.path.join(_dirname, "include")]
-library_dirs = [os.path.join(_dirname, "lib"), _triton_C_dir]
+include_dirs = []
+library_dirs = [_triton_C_dir]
 libraries = ["stdc++"]
 
+# We may not have these dirs available on all platforms
+sys_include_dir = os.path.join(_dirname, "include")
+if os.path.exists(sys_include_dir):
+    include_dirs.append(sys_include_dir)
+
+sys_lib_dir = os.path.join(_dirname, "lib")
+if os.path.exists(sys_lib_dir):
+    library_dirs.append(sys_lib_dir)
 
 def compile_module_from_src(src, name):
     key = hashlib.md5(src.encode("utf-8")).hexdigest()