Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IREE Bump to Oct 1, 2024 #788

Merged
merged 4 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def generate_aie_vmfb(
f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}",
f"--iree-hal-dump-executable-files-to={config.output_dir}",
"--iree-scheduling-optimize-bindings=false",
"--iree-hal-memoization=false",
"--iree-hal-indirect-command-buffers=false",
f"--mlir-disable-threading",
"--mlir-elide-resource-strings-if-larger=10",
]
Expand Down
4 changes: 4 additions & 0 deletions build_tools/ci/run_matmul_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,8 @@ function run_matmul_test() {
--iree-amd-aie-enable-chess=${use_chess} \
--iree-amdaie-enable-packet-flow=${enable_packet_flow} \
--iree-hal-dump-executable-files-to=$PWD \
--iree-hal-memoization=false \
--iree-hal-indirect-command-buffers=false \
--mlir-elide-resource-strings-if-larger=10 \
--iree-amd-aie-show-invoked-commands"

Expand All @@ -416,6 +418,8 @@ function run_matmul_test() {
set +e

echo "**** Generating matmul .vmfb file for ${name} ****"
${IREE_COMPILE_EXE} "${matmul_ir}" \
${compilation_flags} --compile-to=vm -o "${matmul_vmfb}.vm"
${IREE_COMPILE_EXE} "${matmul_ir}" \
${compilation_flags} -o "${matmul_vmfb}"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ namespace {

/// Converts `scf.forall` into nested `scf.for` and then coalesce the `scf.for`
/// loops.
LogicalResult coreForallToFor(RewriterBase &rewriter,
AMDAIE::CoreOp coreOp) {
LogicalResult coreForallToFor(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) {
WalkResult res = coreOp->walk([&](scf::ForallOp forallOp) {
SmallVector<Operation *> forOpResults;
if (failed(scf::forallToForLoop(rewriter, forallOp, &forOpResults))) {
Expand All @@ -55,12 +54,12 @@ class AMDAIEConvertCoreForallToForPass
AMDAIEConvertCoreForallToForPass> {
public:
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<scf::SCFDialect>();
registry.insert<scf::SCFDialect, affine::AffineDialect>();
}

AMDAIEConvertCoreForallToForPass() = default;
AMDAIEConvertCoreForallToForPass(
const AMDAIEConvertCoreForallToForPass &pass){};
const AMDAIEConvertCoreForallToForPass &pass) {};
void runOnOperation() override;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,13 +343,15 @@ void AMDAIETileAndFusePass::runOnOperation() {
if (isTilingReductionDimension(consumerOp, tileSizesVal)) {
tileAndFuseOptions.setFusionControlFn(
[&](tensor::ExtractSliceOp sliceOp, OpResult originalProducer,
bool isDestinationOperand) -> std::tuple<bool, bool> {
return {false, false};
bool isDestinationOperand)
-> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
return std::nullopt;
});
} else {
tileAndFuseOptions.setFusionControlFn(
[&](tensor::ExtractSliceOp sliceOp, OpResult originalProducer,
bool isDestinationOperand) -> std::tuple<bool, bool> {
bool isDestinationOperand)
-> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
bool fusableOp =
TypeSwitch<Operation *, bool>(originalProducer.getOwner())
// List ops that shouldn't be fused.
Expand All @@ -360,7 +362,8 @@ void AMDAIETileAndFusePass::runOnOperation() {
return op->getDialect() ==
context->getLoadedDialect<linalg::LinalgDialect>();
});
return {fusableOp, false};
if (!fusableOp) return std::nullopt;
return scf::SCFTileAndFuseOptions::ControlFnResult{false};
});
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-convert-core-forall-to-for,canonicalize)" --split-input-file %s | FileCheck %s

// CHECK-LABEL: @test_single
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: amdaie.core
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] {
// CHECK-DAG: %[[REM:.+]] = arith.remsi %[[ARG0]], %[[C2]] : index
// CHECK-DAG: %[[DIV:.+]] = arith.divsi %[[ARG0]], %[[C2]] : index
// CHECK-DAG: func.call @callee(%[[DIV]], %[[REM]]) : (index, index) -> ()
// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C2]], %[[C2]]) : index, index
// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> ()
module @test_single {
func.func private @callee(%i: index, %j: index)
%c0 = arith.constant 0 : index
Expand All @@ -28,20 +27,18 @@ module @test_single {
// -----

// CHECK-LABEL: @test_multi
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: amdaie.core
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] {
// CHECK-DAG: %[[REM:.+]] = arith.remsi %[[ARG0]], %[[C2]] : index
// CHECK-DAG: %[[DIV:.+]] = arith.divsi %[[ARG0]], %[[C2]] : index
// CHECK-DAG: func.call @callee(%[[DIV]], %[[REM]]) : (index, index) -> ()
// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C2]], %[[C2]]) : index, index
// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> ()
// CHECK-DAG: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]] {
// CHECK-DAG: %[[REM1:.+]] = arith.remsi %[[ARG1]], %[[C4]] : index
// CHECK-DAG: %[[DIV1:.+]] = arith.divsi %[[ARG1]], %[[C4]] : index
// CHECK-DAG: func.call @callee(%[[DIV1]], %[[REM1]]) : (index, index) -> ()
// CHECK-DAG: %[[D:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C4]], %[[C4]]) : index, index
// CHECK-DAG: func.call @callee(%[[D]]#0, %[[D]]#1) : (index, index) -> ()
module @test_multi {
func.func private @callee(%i: index, %j: index)
%c0 = arith.constant 0 : index
Expand All @@ -63,19 +60,17 @@ module @test_multi {
// -----

// CHECK-LABEL: @test_nested
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: amdaie.core
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C16]] step %[[C1]] {
// CHECK-DAG: %[[REM0:.+]] = arith.remsi %[[ARG0]], %[[C4]] : index
// CHECK-DAG: %[[DIV0:.+]] = arith.divsi %[[ARG0]], %[[C4]] : index
// CHECK-DAG: %[[D1:.+]]:2 = affine.delinearize_index %[[ARG0]] into (%[[C4]], %[[C4]]) : index, index
// CHECK-DAG: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C4]] step %[[C1]] {
// CHECK-DAG: %[[REM1:.+]] = arith.remsi %[[ARG1]], %[[C2]] : index
// CHECK-DAG: %[[DIV1:.+]] = arith.divsi %[[ARG1]], %[[C2]] : index
// CHECK-DAG: func.call @callee(%[[DIV0]], %[[REM0]], %[[DIV1]], %[[REM1]]) : (index, index, index, index) -> ()
// CHECK-DAG: %[[D2:.+]]:2 = affine.delinearize_index %[[ARG1]] into (%[[C2]], %[[C2]]) : index, index
// CHECK-DAG: func.call @callee(%[[D1]]#0, %[[D1]]#1, %[[D2]]#0, %[[D2]]#1) : (index, index, index, index) -> ()
module @test_nested {
func.func private @callee(%i: index, %j: index, %k: index, %l: index)
%c0 = arith.constant 0 : index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ module {
// CHECK: %[[SECOND_LOOP:.*]]:2 = scf.for %[[IV0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]])
// CHECK: {
// CHECK: %[[MATMUL:.*]] = linalg.generic
// CHECK: affine.apply
// CHECK: affine.apply
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]])
// CHECK: %[[YIELD_MATMUL:.*]] = tensor.insert_slice %[[MATMUL]] into %[[ITER_ARG_1]]
// CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]]
// CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_UNPACK]]
Expand Down Expand Up @@ -159,12 +159,12 @@ module {
// CHECK: arith.addi
// CHECK: }
// CHECK: %[[YIELD_MATMUL:.*]] = tensor.insert_slice %[[MATMUL]] into %[[ITER_ARG_1]]
// CHECK: affine.apply
// CHECK: affine.apply
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV0]])
// CHECK: %[[YIELD_ELEM:.*]] = tensor.insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]]
// CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]]
// CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_ELEM]], %[[YIELD_UNPACK]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ module {
// CHECK: %[[SECOND_LOOP:.*]]:2 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]])
// CHECK: {
// CHECK: %[[MATMUL:.*]] = linalg.generic
// CHECK: affine.apply
// CHECK: affine.apply
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: }
// CHECK: }
// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] :
Expand Down Expand Up @@ -153,16 +153,16 @@ module {
// CHECK: {
// CHECK: arith.addi
// CHECK: }
// CHECK: affine.apply
// CHECK: affine.apply
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, 32, 32] [1, 1, 1, 1]
// CHECK: }
// CHECK: }
// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] :
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,9 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch(
std::vector<xrt::bo> bos;
// TODO(max): do we need multiple descriptor sets ever for AIE?
uint32_t set = 0;
iree_hal_xrt_direct_command_buffer_push_descriptor_set(
base_command_buffer, set, bindings.count, bindings.values);
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_xrt_direct_command_buffer_push_descriptor_set(
base_command_buffer, set, bindings.count, bindings.values));
for (iree_host_size_t j = 0; j < bindings.count; ++j) {
xrt::bo arg_buffer =
xrt::bo(*command_buffer->descriptor_sets[set].bindings[j],
Expand Down
12 changes: 10 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from iree.compiler import ir
from iree.compiler._mlir_libs import get_dialect_registry
from iree.compiler.api import Session, Output, Source
from iree.compiler.api import Session, Output, Source, _initializeGlobalCL
from iree.compiler.extras import types as T
from iree.runtime import VmModule
from iree.runtime import get_driver, Config, SystemContext
Expand Down Expand Up @@ -49,8 +49,16 @@ def pytest_addoption(parser):
parser.addoption("--iree-aie-debug", action="store_true")


@pytest.fixture(scope="session")
def global_cl_args(request):
_initializeGlobalCL(
"--iree-hal-memoization=false",
"--iree-hal-indirect-command-buffers=false",
)


@pytest.fixture
def iree_session(request, pytestconfig) -> Session:
def iree_session(request, pytestconfig, global_cl_args) -> Session:
s = Session()
s.context.append_dialect_registry(get_dialect_registry())
s.context.load_all_available_dialects()
Expand Down
7 changes: 2 additions & 5 deletions tests/test_matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,5 @@ def test_matmul(
arg0 = np.random.randint(-1, 3, (M, K), dtype=lhs_rhs_type)
arg1 = np.random.randint(-1, 3, (K, N), dtype=lhs_rhs_type)
with invokable_module(session, module, device) as module:
for i in range(num_repeat_runs):
results = module[matmul_name](arg0, arg1).to_host()
assert np.array_equal(
results, (arg0.astype(acc_type) @ arg1.astype(acc_type))
)
results = module[matmul_name](arg0, arg1).to_host()
assert np.array_equal(results, (arg0.astype(acc_type) @ arg1.astype(acc_type)))
2 changes: 1 addition & 1 deletion third_party/iree
Submodule iree updated 344 files
Loading