diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 5c30d8579..af51628a8 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -735,6 +735,7 @@ run_matmul_test \ --acc_type "f32" \ --m "8192" --k "2432" --n "7296" +################################################################### # ObjectFifo Matmul tests ################################################################### @@ -744,7 +745,7 @@ run_matmul_test \ --tile_pipeline "pack-peel" \ --lhs_rhs_type "i32" \ --acc_type "i32" \ - --m "128" --k "256" --n "128" + --m "32" --k "32" --n "32" run_matmul_test \ --name_prefix "small" \ @@ -752,7 +753,16 @@ run_matmul_test \ --tile_pipeline "pack-peel" \ --lhs_rhs_type "i32" \ --acc_type "i32" \ - --m "32" --k "32" --n "32" + --m "128" --k "32" --n "64" \ + --num_repeat_runs "10" + +run_matmul_test \ + --name_prefix "small" \ + --lower_to_aie_pipeline "objectFifo" \ + --tile_pipeline "pack-peel" \ + --lhs_rhs_type "i32" \ + --acc_type "i32" \ + --m "128" --k "256" --n "128" run_matmul_test \ --name_prefix "medium" \ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp index 402661116..b6de094f7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp @@ -34,6 +34,18 @@ namespace mlir::iree_compiler::AMDAIE { +/// Utility to calculate the number of iterations of a loop with provided bounds +/// and step: `ceilDiv(upperBound - lowerBound, step)`. 
/// Returns the number of iterations of a loop that runs from `lowerBound`
/// (inclusive) up to `upperBound` (exclusive) in increments of `step`, i.e.
/// `ceilDiv(upperBound - lowerBound, step)`.
///
/// An empty range (`upperBound <= lowerBound`) yields 0 rather than tripping
/// an assertion, so that callers can detect zero-iteration loops by comparing
/// the result against 0 and bail out of the rewrite.
///
/// `step` must be strictly positive; this is treated as a programmer error
/// and checked with an assertion.
int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound,
                              int64_t step) {
  assert(step > 0 && "expected positive step");
  // Guard the empty range explicitly: integer division truncates toward
  // zero, so the closed-form below would report 1 iteration for
  // `upperBound == lowerBound` whenever `step > 1`.
  if (upperBound <= lowerBound) return 0;
  const int64_t diff = upperBound - lowerBound;
  // `diff >= 1` here, so `(diff - 1) / step` is a non-negative floor division
  // and the whole expression equals `ceilDiv(diff, step)`.
  return 1 + ((diff - 1) / step);
}
+ if (nbIterations == 1) continue; + insertOffsets.push_back( getAsIndexOpFoldResult(rewriter.getContext(), 0)); - - // The step size is equal to the the number of iterations - // (ceilDiv(upperBound - lowerBound, step)) - int64_t diff = upperBound - lowerBound; - assert(diff > 0 && - "expected positive difference between upper bound and lower " - "bound"); - assert(step > 0 && "expected positive step"); - int64_t newSize = 1 + ((diff - 1) / step); insertSizes.push_back( - getAsIndexOpFoldResult(rewriter.getContext(), newSize)); - + getAsIndexOpFoldResult(rewriter.getContext(), nbIterations)); insertStrides.push_back( getAsIndexOpFoldResult(rewriter.getContext(), stride)); } } + assert(insertOffsets.size() == insertSizes.size() && + "expected same number of offsets and sizes to be inserted"); + assert(insertOffsets.size() == insertStrides.size() && + "expected same number of offsets and strides to be inserted"); + // Handle the 'no loop dependency' case. + if (insertOffsets.empty() && nbIterations != 1) { + insertOffsets.push_back(getAsIndexOpFoldResult(rewriter.getContext(), 0)); + insertSizes.push_back( + getAsIndexOpFoldResult(rewriter.getContext(), nbIterations)); + insertStrides.push_back(getAsIndexOpFoldResult(rewriter.getContext(), 0)); + } newOffsets.insert(newOffsets.begin(), insertOffsets.begin(), insertOffsets.end()); newSizes.insert(newSizes.begin(), insertSizes.begin(), insertSizes.end()); @@ -206,40 +235,38 @@ class SubsumeLoopIntoDMA SmallVector newTargetSizes = op.getTargetMixedSizes(); SmallVector newTargetStrides = op.getTargetMixedStrides(); - // Use source/target maxNbDims to check whether there are sufficient source - // and target dimensions. Otherwise, abort. 
- auto verifyNbDimsNeeded = [&](const SmallVector &dynamicOffsets, - size_t nbOffsets, - size_t maxNbDims) -> LogicalResult { - size_t counter = 0; - for (Value offset : dynamicOffsets) - if (allInductionValues.contains(offset)) counter++; - if (nbOffsets + counter > maxNbDims) return failure(); - return success(); - }; - SmallVector dynamicSourceOffsets = op.getSourceOffsets(); - SmallVector dynamicTargetOffsets = op.getTargetOffsets(); - if (failed(verifyNbDimsNeeded(dynamicSourceOffsets, newSourceOffsets.size(), - sourceMaxNbDims))) + // Verify number of dimensions needed to subsume this loop into the strided + // access pattern and fail early if there aren't enough dimensions. + size_t nbNonUnitIterations{0}; + for (auto &&[lb, ub, step] : llvm::zip(lowerBounds, upperBounds, steps)) { + const int64_t nbIterations = calculateNbIterations(lb, ub, step); + // We should not do any rewrite if we encounter a loop with no iterations. + if (nbIterations == 0) return failure(); + if (nbIterations > 1) nbNonUnitIterations++; + } + if (newSourceOffsets.size() + nbNonUnitIterations > sourceMaxNbDims) return failure(); - if (failed(verifyNbDimsNeeded(dynamicTargetOffsets, newTargetOffsets.size(), - targetMaxNbDims))) + if (newTargetOffsets.size() + nbNonUnitIterations > targetMaxNbDims) return failure(); // Add the loop iterations to the DMA access patterns. for (auto &&[lb, ub, step, iterationIvValues] : llvm::reverse( llvm::zip(lowerBounds, upperBounds, steps, inductionValues))) { // Add loop iteration to the access pattern on the source side. - if (failed(addIterationToAccessPattern( - rewriter, lb, ub, step, iterationIvValues, newSourceOffsets, - newSourceSizes, newSourceStrides))) { - return failure(); + if (!newSourceOffsets.empty()) { + if (failed(addIterationToAccessPattern( + rewriter, lb, ub, step, iterationIvValues, newSourceOffsets, + newSourceSizes, newSourceStrides))) { + return failure(); + } } // Add loop iteration to the access pattern on the target side. 
- if (failed(addIterationToAccessPattern( - rewriter, lb, ub, step, iterationIvValues, newTargetOffsets, - newTargetSizes, newTargetStrides))) { - return failure(); + if (!newTargetOffsets.empty()) { + if (failed(addIterationToAccessPattern( + rewriter, lb, ub, step, iterationIvValues, newTargetOffsets, + newTargetSizes, newTargetStrides))) { + return failure(); + } } } @@ -290,11 +317,6 @@ class SubsumeLoopIntoDMA curIvValues.insert(userApplyOp.getResult()); } } - if (!llvm::any_of(op->getOperands(), [&](Value operand) { - return curIvValues.contains(operand); - })) { - return failure(); - } SmallVector lowerBounds = {lowerBound.value()}; SmallVector upperBounds = {upperBound.value()}; @@ -342,13 +364,6 @@ class SubsumeLoopIntoDMA } inductionValues.push_back(curIvValues); } - // Return early if the strided operation doesn't use any of the - // induction variable dependent values. - if (!llvm::any_of(op->getOperands(), [&](Value operand) { - return allInductionValues.contains(operand); - })) { - return failure(); - } return rewriteWithLoopLikeOpParent(op, rewriter, sourceMaxNbDims, targetMaxNbDims, lowerBounds.value(), upperBounds.value(), steps.value(), diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir index 4ee6f640d..694ecf3f4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir @@ -4,76 +4,6 @@ // Sanity checks for cases where no modification should happen. 
//===----------------------------------------------------------------------===// -// Sanity check: ensure no modification in case of no loop depedency -// CHECK-LABEL: @npu_dma_cpy_nd_without_loop_dependency -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.controlcode -// CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) -#map = affine_map<(d0) -> (d0 * 16)> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @npu_dma_cpy_nd_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c6 = arith.constant 6 : index - amdaie.workgroup { - %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.controlcode { - scf.forall (%arg2, %arg3) in (2, 2) { - scf.for %arg4 = %c0 to %c6 step %c1 { - %1 = affine.apply #map(%arg4) - %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) - amdaie.npu.dma_wait(%2, S2MM) - } - } - amdaie.end - } - } - return - } -} - -// ----- - -// Ensure no modification in case of a dynamic offset not originating from an induction variable. 
-// CHECK-LABEL: @dynamic_non_induction_var_offset -// CHECK-SAME: %{{.+}}: !amdaie.logicalobjectfifo>, %{{.+}}: !amdaie.logicalobjectfifo>, %[[ARG:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.controlcode -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[ARG]]] [16] [1], [] [] []) -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @dynamic_non_induction_var_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: index) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c6 = arith.constant 6 : index - amdaie.workgroup { - %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.controlcode { - scf.for %arg3 = %c0 to %c6 step %c1 { - %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] []) - amdaie.npu.dma_wait(%2, S2MM) - } - amdaie.end - } - } - return - } -} - -// ----- - // Ensure no modification in case of a invalid affine expressions. // CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 16 + 3)> @@ -346,6 +276,37 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Ensure no modification in case of an scf.forall with too many dimensions, +// i.e. 3 existing dimensions and two loop iterators. 
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @forall_too_many_dims_target +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 6) +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @forall_too_many_dims_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 6) { + %1 = affine.apply #map(%arg3) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return + } +} + +// ----- + // Ensure no modification in case of multiple npu.dma_cpy_nd users with the same source in the same scope. // CHECK-LABEL: @for_with_multiple_npu_dma_cpy_nd_same_source // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -417,12 +378,161 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} } } +// ----- + //===----------------------------------------------------------------------===// -// Checks for dependencies via `affine.apply` on both source and target sides. +// Checks for loops with no dependencies, which should be subsumed. //===----------------------------------------------------------------------===// +// Subsume loop iteration into strided op without dependency. 
+// CHECK-LABEL: @for_without_loop_dependency +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C0]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @for_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg4 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg4) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return + } +} + // ----- +// Subsume loop iteration into strided op without dependency. 
+// CHECK-LABEL: @forall_without_loop_dependency +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C2]], %[[C8]], %[[C16]]] [%[[C0]], %[[C0]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @forall_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 2) { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return + } +} + +// ----- + +// Subsume loop iteration into strided op without dependency. 
+// CHECK-LABEL: @nested_without_loop_dependency +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C3]], %[[C6]], %[[C16]]] [%[[C0]], %[[C0]], %[[C0]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @nested_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 3) { + scf.for %arg4 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg4) + %2 = amdaie.npu.dma_cpy_nd %0([0] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + } + amdaie.end + } + } + return + } +} + +// ----- + +// Subsume loop into stride op in case of a dynamic offset not originating from +// an induction variable. 
+// CHECK-LABEL: @dynamic_non_induction_var_offset +// CHECK-SAME: %{{.+}}: !amdaie.logicalobjectfifo>, %{{.+}}: !amdaie.logicalobjectfifo>, %[[ARG:.+]]: index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[ARG]]] [%[[C6]], %[[C16]]] [%[[C0]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @dynamic_non_induction_var_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: index) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg3 = %c0 to %c6 step %c1 { + %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return + } +} + +// ----- + +//===----------------------------------------------------------------------===// +// Checks for dependencies via `affine.apply` on both source and target sides. +//===----------------------------------------------------------------------===// + // Check that loop subsumption happens in case of an identity affine expression. 
// CHECK-LABEL: @identity_affine_expr // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -495,14 +605,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @forall_dependency_on_target // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (16 * d0)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -513,7 +623,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { scf.forall (%arg2, %arg3) in (2, 6) { %1 = affine.apply #map(%arg3) - %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + %2 = amdaie.npu.dma_cpy_nd %0([0, %1] [8, 16] [16, 1], [] [] []) amdaie.npu.dma_wait(%2, S2MM) } amdaie.end @@ -564,14 +674,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @forall_dependency_on_source // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = 
arith.constant 2 : index // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C16]], %[[C1]]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -582,7 +692,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { scf.forall (%arg2, %arg3) in (2, 6) { %1 = affine.apply #map(%arg3) - %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1]) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [8, 16] [16, 1]) amdaie.npu.dma_wait(%2, S2MM) } amdaie.end @@ -753,16 +863,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @nested_dependencies // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: 
scf.forall // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C0]]] [%[[C6]], %[[C3]], %[[C16]], %[[C8]]] [%[[C32]], %[[C32]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C3]], %[[C8]]] [%[[C0]], %[[C32]], %[[C0]], %[[C1]]], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #map1 = affine_map<(d0) -> (d0 * 32)> @@ -779,7 +889,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %1 = affine.apply #map(%arg2) %2 = affine.apply #map1(%arg3) scf.for %arg4 = %c1 to %c6 step %c2 { - %3 = amdaie.npu.dma_cpy_nd %0([%arg4, %2] [16, 8] [16, 1], [] [] []) + %3 = amdaie.npu.dma_cpy_nd %0([%2] [8] [1], [] [] []) amdaie.npu.dma_wait(%3, S2MM) } } diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index 7651e484b..c3c9e6d93 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -10,10 +10,8 @@ // CHECK-DAG: aie.core(%[[TILE_1_2]]) // CHECK: aie.objectfifo.acquire @[[OBJ1]](Produce, 1) // CHECK: func.func @matmul_i32(%[[ARG0:.+]]: memref<32x1024xi32>, %[[ARG1:.+]]: memref<1024x64xi32>, %[[ARG2:.+]]: memref<32x64xi32>) -// CHECK-DAG: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[ARG0]] -// CHECK-DAG: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[ARG1]] +// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1] +// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 64, 32][0, 0, 64, 1] // CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][1, 1, 32, 32][0, 0, 64, 1] // CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1] #map = affine_map<(d0) -> (d0 * 32)>