Commit b163405
Merge branch 'main' into zhewen_moveAssignChannels

Yu-Zhewen authored Dec 10, 2024
2 parents 3844e23 + e623a6a

Showing 3 changed files with 44 additions and 10 deletions.

@@ -53,16 +53,27 @@ MemRefType getDistributedType(memref::AllocOp alloc,
   MemRefType type;
   for (Operation *allocUser : alloc->getUsers()) {
     if (auto subview = dyn_cast<memref::SubViewOp>(allocUser)) {
-      // Check that all offsets are either constants or thread ids. We assume
-      // that if a subview has an offset which is not a constant and not a
-      // thread id, it's not 'distributing'.
+      // Check that all offsets are either constants or depend on thread ids.
+      // We assume that if a subview has an offset which is not a constant and
+      // does not depend on a thread id, it's not 'distributing'.
       Operation::operand_range offsets = subview.getOffsets();
       int nIndVars{0};
       for (Value offset : offsets) {
         bool isConst = matchPattern(offset, m_Constant());
-        bool isIndVar = llvm::is_contained(indVars, offset);
-        nIndVars += isIndVar;
-        if (!isConst && !isIndVar) return {};
+        bool dependsOnIndVar = false;
+        if (!isConst &&
+            isa_and_present<affine::AffineApplyOp>(offset.getDefiningOp())) {
+          auto applyOp = cast<affine::AffineApplyOp>(offset.getDefiningOp());
+          for (auto operand : applyOp.getSymbolOperands()) {
+            dependsOnIndVar = llvm::is_contained(indVars, operand);
+            if (dependsOnIndVar) break;
+          }
+        } else {
+          dependsOnIndVar = llvm::is_contained(indVars, offset);
+        }
+
+        nIndVars += dependsOnIndVar;
+        if (!isConst && !dependsOnIndVar) return {};
       }

       // If there are no thread ids, this subview is not distributing.
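For illustration, the new branch accepts a subview offset that reaches a thread id indirectly through an `affine.apply`, as in this minimal MLIR sketch (value names are hypothetical; the pattern mirrors the test added below):

#map = affine_map<()[s0, s1] -> (s0 + s1)>
// %tx is a thread induction variable contained in indVars. The offset %off
// is not itself in indVars, but one of its affine.apply symbol operands is,
// so the subview still counts as 'distributing'.
%off = affine.apply #map()[%blk, %tx]
%sv = memref.subview %alloc[%off, 0] [1, 32] [1, 1]
    : memref<4x32xi32, 2> to memref<1x32xi32, strided<[32, 1], offset: ?>, 2>

Before this change, only offsets that were literally contained in indVars were recognized, so the nested block/thread case below returned an empty type.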
@@ -219,14 +219,13 @@ def AMDAIEDistributeL1Allocations :
     Pass<"iree-amdaie-distribute-l1-allocations", "ModuleOp"> {
   let summary = "Replace distributed L1 allocations with private allocations.";
   let description = [{
-    Each AIE core/tile is uniquely identified by gpu thread ids, usually
-    'y' (for AIE row) and 'x' (for AIE column).
-
     Some of the compilation pipelines in iree-amd-aie generate a single L1
     memory allocation describing the concatenation of all memory for all
     cores/tiles. Each thread then slices into a mutually exclusive rectangle
     of the allocation, along its thread dimensions, so 'privatizing' its
-    memory.
+    memory. Note that the thread dimension is specified by the
+    `GPUThreadMappingAttr` in `scf.forall` ops, indicating that distribution
+    occurs across one or more blocks of cores, with subsets designated as threads.
 
     This pass rewrites these allocations to be private to each core/tile. So
     it replaces a large allocation in L1 with a smaller allocation, smaller by
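Concretely, the rewrite this description refers to shrinks the shared allocation down to one tile's slice. A minimal before/after sketch, condensed from the test added in this commit (surrounding loops elided):

// Before: one L1 buffer concatenating all tiles' memory, sliced per thread.
%alloc = memref.alloc() : memref<4x4x32x32xi32, 2>
%sv = memref.subview %alloc[%0, %1, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1]
    : memref<4x4x32x32xi32, 2>
      to memref<1x1x32x32xi32, strided<[4096, 1024, 32, 1], offset: ?>, 2>

// After: a private per-tile buffer; the distributing subview disappears.
%alloc = memref.alloc() : memref<1x1x32x32xi32, 2>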
@@ -32,6 +32,30 @@ func.func @distribute_l1_memory_test_0() {
 
 // -----
 
+// CHECK-LABEL: distribute_l1_memory_nested_loops
+// CHECK: %[[L2ALLOC:.+]] = memref.alloc() : memref<1x1x32x32xi32, 2>
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[L2ALLOC]] : memref<1x1x32x32xi32, 2>)
+// CHECK: memref.dealloc %[[L2ALLOC]] : memref<1x1x32x32xi32, 2>
+
+#map = affine_map<()[s0, s1] -> (s0 + s1)>
+func.func @distribute_l1_memory_nested_loops() {
+  %c0_i32 = arith.constant 0 : i32
+  %alloc = memref.alloc() : memref<4x4x32x32xi32, 2>
+  scf.forall (%arg0, %arg1) = (0, 0) to (4, 4) step (2, 2) {
+    scf.forall (%arg2, %arg3) in (2, 2) {
+      %0 = affine.apply #map()[%arg0, %arg2]
+      %1 = affine.apply #map()[%arg1, %arg3]
+      %subview = memref.subview %alloc[%0, %1, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<4x4x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[4096, 1024, 32, 1], offset: ?>, 2>
+      linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[4096, 1024, 32, 1], offset: ?>, 2>)
+    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+  memref.dealloc %alloc : memref<4x4x32x32xi32, 2>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @transfer_read_test()
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x8xbf16, 2>
 // CHECK: vector.transfer_read %[[ALLOC]]
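Assuming this test file follows the usual lit conventions in the repository, the new case would be exercised by a RUN line like the following (the pass flag comes from the Pass<> definition above; the exact pipeline nesting is an assumption):

// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-l1-allocations)" %s | FileCheck %s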
