diff --git a/mlir/include/air/Util/Util.h b/mlir/include/air/Util/Util.h
index 36e845931..dad3f83ea 100644
--- a/mlir/include/air/Util/Util.h
+++ b/mlir/include/air/Util/Util.h
@@ -222,6 +222,13 @@ std::optional<int> getOffsetDimFromMemrefDim(int dimOnMemref,
                                              SmallVector<Value> strides,
                                              SmallVector<int> memrefShape);
 
+// Replace symbols of `map` by the known entries of `const_inputs`, simplify,
+// and return the single constant result (nullopt if input counts mismatch).
+std::optional<int64_t>
+evaluateConstantsInMap(AffineMap map,
+                       SmallVector<std::optional<int64_t>> const_inputs,
+                       MLIRContext *ctx);
+
 } // namespace air
 } // namespace xilinx
 
diff --git a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp
index 6414f8b02..343961a00 100644
--- a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp
+++ b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp
@@ -1697,11 +1697,15 @@ struct CanonicalizeAffineApplyOnLoopInductionVar
       if (!getStaticScfForTripCountAsInt(sfo))
         return failure();
       int tripCount = *getStaticScfForTripCountAsInt(sfo);
-      auto new_ub = evaluateConstantInMap(
-          apply.getAffineMap(), *mlir::getConstantIntValue(sfo.getUpperBound()),
+      auto new_ub = air::evaluateConstantsInMap(
+          apply.getAffineMap(),
+          SmallVector<std::optional<int64_t>>{
+              *mlir::getConstantIntValue(sfo.getUpperBound())},
           ctx);
-      auto new_lb = evaluateConstantInMap(
-          apply.getAffineMap(), *mlir::getConstantIntValue(sfo.getLowerBound()),
+      auto new_lb = air::evaluateConstantsInMap(
+          apply.getAffineMap(),
+          SmallVector<std::optional<int64_t>>{
+              *mlir::getConstantIntValue(sfo.getLowerBound())},
           ctx);
       assert(new_ub && new_lb);
       int newStepInInt = llvm::divideCeilSigned(*new_ub - *new_lb, tripCount);
@@ -1723,10 +1727,14 @@ struct CanonicalizeAffineApplyOnLoopInductionVar
       if (!afo.hasConstantBounds())
         return failure();
       int tripCount = *getStaticAffineForTripCountAsInt(afo);
-      auto new_ub = evaluateConstantInMap(apply.getAffineMap(),
-                                          afo.getConstantUpperBound(), ctx);
-      auto new_lb = evaluateConstantInMap(apply.getAffineMap(),
-                                          afo.getConstantLowerBound(), ctx);
+      auto new_ub = air::evaluateConstantsInMap(
+          apply.getAffineMap(),
+          SmallVector<std::optional<int64_t>>{afo.getConstantUpperBound()},
+          ctx);
+      auto new_lb = air::evaluateConstantsInMap(
+          apply.getAffineMap(),
+          SmallVector<std::optional<int64_t>>{afo.getConstantLowerBound()},
+          ctx);
       assert(new_ub && new_lb);
       int newStepInInt = llvm::divideCeilSigned(*new_ub - *new_lb, tripCount);
       IRMapping remap;
@@ -1743,19 +1751,6 @@ struct CanonicalizeAffineApplyOnLoopInductionVar
   }
 
 private:
-  // Evaluate the affine expression of affine map on a constant affine
-  // expression. Only works with affine maps with a single input.
-  std::optional<int64_t> evaluateConstantInMap(AffineMap map,
-                                               int64_t const_input,
-                                               MLIRContext *ctx) const {
-    std::optional<int64_t> output = std::nullopt;
-    if (map.getNumInputs() != 1)
-      return output;
-    auto c = getAffineConstantExpr(const_input, ctx);
-    auto newmap = map.replace(getAffineSymbolExpr(0, ctx), c, 0, 1);
-    output = simplifyAffineMap(newmap).getSingleConstantResult();
-    return output;
-  }
 };
 
 // Fold arith.muli op operating on loop induction variable into loop bounds.
diff --git a/mlir/lib/Transform/AIRMiscPasses.cpp b/mlir/lib/Transform/AIRMiscPasses.cpp
index 89b905415..18cf6ff61 100644
--- a/mlir/lib/Transform/AIRMiscPasses.cpp
+++ b/mlir/lib/Transform/AIRMiscPasses.cpp
@@ -883,8 +883,8 @@ int findGCD(SmallVector<int> vec) {
 // Tile air.channel put/get wrt a memref.
 Value tileChannelOpByFactor(air::ChannelInterface originalChanOp, int factor,
                             int originalMemrefSize, int dim,
-                            air::ChannelOp newChanOp, Location loc,
-                            MLIRContext *ctx) {
+                            memref::AllocOp allocOp, air::ChannelOp newChanOp,
+                            Location loc, MLIRContext *ctx) {
   OpBuilder builder(originalChanOp);
   SmallVector<Value> originalApplyOperands;
   Operation *affineApplyOp = nullptr;
@@ -912,11 +912,21 @@ Value tileChannelOpByFactor(air::ChannelInterface originalChanOp, int factor,
     auto checkpoint = builder.saveInsertionPoint();
     if (affineApplyOp)
       builder.setInsertionPoint(affineApplyOp);
+    // Generate default affine.map assuming non-overlapping data access pattern.
     AffineExpr s0 = builder.getAffineSymbolExpr(0);
     AffineExpr mul = s0 * originalMemrefSize;
     AffineExpr add =
         mul + i * llvm::divideCeilSigned(originalMemrefSize, factor);
     auto map = AffineMap::get(0, 1, add);
+    // If allocOp carries an AffineMapAttr named "affine_map" over exactly two
+    // symbols, bind its second symbol to the current `i` and use that map
+    // instead (potentially overlapping access pattern).
+    if (auto attr = allocOp->getAttrOfType<AffineMapAttr>("affine_map")) {
+      auto orig = attr.getAffineMap();
+      if (orig.getNumDims() == 0 && orig.getNumSymbols() == 2)
+        map = orig.replace(getAffineSymbolExpr(1, ctx),
+                           getAffineConstantExpr(i, ctx), 0, 1);
+    }
     auto newApplyOp =
         builder.create<affine::AffineApplyOp>(loc, map, originalApplyOperands);
     if (affineApplyOp)
@@ -956,6 +966,26 @@ Value tileChannelOpByFactor(air::ChannelInterface originalChanOp, int factor,
   return newWaitAll.getAsyncToken();
 }
 
+// Get the scf.for op whose induction variable is `val`, or is an operand of
+// the child op of the air.execute defining `val`; null op if none is found.
+scf::ForOp getScfForFromVal(Value val) {
+  if (!val)
+    return scf::ForOp();
+  if (scf::ForOp owner = scf::getForInductionVarOwner(val))
+    return owner;
+  // Not an induction variable: look through a defining air.execute, if any.
+  Operation *producer = val.getDefiningOp();
+  if (!producer || !isa<air::ExecuteOp>(producer))
+    return scf::ForOp();
+  auto wrapped = cast<air::ExecuteOp>(producer).getChildOp();
+  if (!wrapped)
+    return scf::ForOp();
+  for (Value operand : wrapped->getOperands())
+    if (scf::ForOp owner = scf::getForInductionVarOwner(operand))
+      return owner;
+  return scf::ForOp();
+}
+
 // Partition L2 memref.
 void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref(
     SmallVector<air::ChannelPutOp> &puts, SmallVector<air::ChannelGetOp> &gets,
@@ -964,6 +994,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref(
   MemRefType ty = llvm::cast<MemRefType>(memref.getType());
   auto allocOp = memref.getDefiningOp();
   auto loc = allocOp->getLoc();
+  auto ctx = allocOp->getContext();
   Operation *deallocOp = nullptr;
   for (auto user : memref.getUsers()) {
     if (auto execOp = dyn_cast<air::ExecuteOp>(user->getParentOp())) {
@@ -979,34 +1010,63 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref(
 
   std::map<int, SmallVector<air::ChannelInterface>> chanOpPartitions;
   SmallVector<int> keys;
-  for (auto op : puts) {
-    auto offsetDim = air::getOffsetDimFromMemrefDim(dim, op.getStrides(),
-                                                    air::getTensorShape(ty));
-    if (!offsetDim)
-      continue;
-    auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]);
-    if (!offset)
-      continue;
-    push_back_if_unique<int>(keys, *offset);
-    if (!chanOpPartitions.count(*offset))
-      chanOpPartitions[*offset] = SmallVector<air::ChannelInterface>{op};
-    else
-      chanOpPartitions[*offset].push_back(op);
-  }
-  for (auto op : gets) {
-    auto offsetDim = air::getOffsetDimFromMemrefDim(dim, op.getStrides(),
-                                                    air::getTensorShape(ty));
-    if (!offsetDim)
-      continue;
-    auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]);
-    if (!offset)
-      continue;
-    push_back_if_unique<int>(keys, *offset);
-    if (!chanOpPartitions.count(*offset))
-      chanOpPartitions[*offset] = SmallVector<air::ChannelInterface>{op};
-    else
-      chanOpPartitions[*offset].push_back(op);
-  }
+
+  // Bin each channel op by the offset at which it accesses `dim`: a constant
+  // offset is used as is; an offset derived from the single-use induction
+  // variable of an scf.for is evaluated at the loop's constant lower bound.
+  // Ops whose key cannot be resolved, or is negative, are skipped.
+  auto getChanOpPartitionsMap =
+      [ctx, dim,
+       ty](std::map<int, SmallVector<air::ChannelInterface>> &chanOpPartitions,
+           SmallVector<int> &keys, air::ChannelInterface op) {
+        auto offsetDim = air::getOffsetDimFromMemrefDim(
+            dim, op.getStrides(), air::getTensorShape(ty));
+        if (!offsetDim)
+          return;
+        auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]);
+        int offset_key = -1;
+        if (offset)
+          offset_key = *offset; // Const offset.
+        else { // Non-constant offset (derived from an scf.for induction var).
+          auto forOp = getScfForFromVal(op.getOffsets()[*offsetDim]);
+          if (!forOp)
+            return;
+          auto iv = forOp.getInductionVar();
+          if (!iv.hasOneUse())
+            return;
+          auto lb = getConstantIntValue(forOp.getLowerBound());
+          if (!lb)
+            return;
+          // hasOneUse() guarantees a single user.
+          Operation *oneUser = *iv.user_begin();
+          if (auto apply = dyn_cast<affine::AffineApplyOp>(oneUser)) {
+            // Constant operands keep their value; others take the lower bound.
+            SmallVector<std::optional<int64_t>> const_ints;
+            for (auto oper : apply->getOperands()) {
+              if (auto constVal = getConstantIntValue(oper))
+                const_ints.push_back(constVal);
+              else
+                const_ints.push_back(lb);
+            }
+            auto key_opt = air::evaluateConstantsInMap(apply.getAffineMap(),
+                                                       const_ints, ctx);
+            if (!key_opt)
+              return;
+            offset_key = *key_opt;
+          } else
+            offset_key = *lb;
+        }
+        if (offset_key < 0)
+          return;
+        push_back_if_unique<int>(keys, offset_key);
+        chanOpPartitions[offset_key].push_back(op);
+      };
+
+  for (auto op : puts)
+    getChanOpPartitionsMap(chanOpPartitions, keys, op);
+  for (auto op : gets)
+    getChanOpPartitionsMap(chanOpPartitions, keys, op);
+
   OpBuilder builder(allocOp);
   SmallVector<scf::ForOp> mutatedScfForOps;
   for (auto key : keys) {
@@ -1019,10 +1079,22 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref(
                                                       air::getTensorShape(ty));
       if (!offsetDim)
         continue;
-      if (op.getSizes().size() == newMemrefShape.size()) {
+      if (op.getSizes().size() != newMemrefShape.size())
+        continue;
+      auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]);
+      if (offset)
         newMemrefShape[dim] = *getConstantIntValue(op.getSizes()[*offsetDim]);
-        break;
+      else {
+        auto forOp = getScfForFromVal(op.getOffsets()[*offsetDim]);
+        if (!forOp)
+          continue;
+        auto trip_count = air::getStaticScfForTripCountAsInt(forOp);
+        if (!trip_count)
+          continue;
+        newMemrefShape[dim] =
+            *getConstantIntValue(op.getSizes()[*offsetDim]) * (*trip_count);
       }
+      break;
     }
 
     auto newMemrefType = MemRefType::get(newMemrefShape, ty.getElementType(),
@@ -1031,9 +1103,9 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref(
     Value newMemref = nullptr;
     // Create new alloc ops.
     if (isa<air::ExecuteOp>(allocOp)) {
-      auto execOp = builder.create<air::ExecuteOp>(
-          loc, air::AsyncTokenType::get(allocOp->getContext()), newMemrefType,
-          SmallVector<Value>{});
+      auto execOp =
+          builder.create<air::ExecuteOp>(loc, air::AsyncTokenType::get(ctx),
+                                         newMemrefType, SmallVector<Value>{});
       Block *async_bb = builder.createBlock(&execOp.getBody());
       builder.setInsertionPointToStart(async_bb);
       auto childMemAlloc = builder.create<memref::AllocOp>(loc, newMemrefType);
@@ -1048,7 +1120,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref(
       builder.setInsertionPoint(deallocOp);
       if (auto execDeallocOp = dyn_cast<air::ExecuteOp>(deallocOp)) {
         auto execOp = builder.create<air::ExecuteOp>(
-            loc, air::AsyncTokenType::get(deallocOp->getContext()),
+            loc, air::AsyncTokenType::get(ctx),
             execDeallocOp.getAsyncDependencies());
         Block *async_bb = builder.createBlock(&execOp.getBody());
         builder.setInsertionPointToStart(async_bb);
@@ -1073,7 +1145,24 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref(
         continue;
       int offsetOperandOffset = memrefOperandOffset + *offsetDim + 1;
       auto &offsetOpOper = op->getOpOperand(offsetOperandOffset);
-      offsetOpOper.assign(builder.create<arith::ConstantIndexOp>(loc, 0));
+
+      // Const offset. Reset offset to 0.
+      if (getConstantIntValue(op.getOffsets()[*offsetDim]))
+        offsetOpOper.assign(builder.create<arith::ConstantIndexOp>(loc, 0));
+      // Variable offset (may be a block arg). Zero the apply's const operands.
+      else {
+        auto defOp = op.getOffsets()[*offsetDim].getDefiningOp();
+        auto apply = llvm::dyn_cast_if_present<affine::AffineApplyOp>(defOp);
+        if (auto exec = llvm::dyn_cast_if_present<air::ExecuteOp>(defOp))
+          if (llvm::isa_and_present<affine::AffineApplyOp>(exec.getChildOp()))
+            apply = llvm::cast<affine::AffineApplyOp>(exec.getChildOp());
+        assert(apply && "Apply op not found. NYI.");
+        for (auto oper : apply->getOperands())
+          if (getConstantIntValue(oper))
+            apply->replaceUsesOfWith(
+                oper, builder.create<arith::ConstantIndexOp>(loc, 0));
+      }
+
       // Update strides (contiguous, row-major) after memref tiling.
       SmallVector<int> newStrides;
       // One dimensional default stride value.
@@ -1209,6 +1298,21 @@ AIRSplitL2MemrefForBufferConstraintPass::getTargetMemrefAllocs(
             allocOp->setAttr(
                 "split_dim",
                 IntegerAttr::get(IntegerType::get(ctx, 32), splitDim));
+
+          // If an affine.apply (possibly wrapped in an air.execute) produces
+          // offsets[split_dim], record its affine map on the alloc op.
+          auto offsetDefOp = chanOp.getOffsets()[splitDim].getDefiningOp();
+          if (offsetDefOp) {
+            affine::AffineApplyOp apply =
+                dyn_cast<affine::AffineApplyOp>(offsetDefOp);
+            // The air.execute's child op may be null; use a null-safe cast.
+            if (auto exec = dyn_cast<air::ExecuteOp>(offsetDefOp))
+              apply = llvm::dyn_cast_if_present<affine::AffineApplyOp>(
+                  exec.getChildOp());
+            if (apply)
+              allocOp->setAttr("affine_map",
+                               AffineMapAttr::get(apply.getAffineMap()));
+          }
         }
         // Tiling along the first (x) dimension of scf.parallel only, as one NPU
         // memtile is located at the bottom of each column.
@@ -1345,7 +1449,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::runOnOperation() {
         int offsetDim = offsetDimOpt ? *offsetDimOpt : dim;
         auto newWaitAll = tileChannelOpByFactor(
             chanUserOp, targetColTilingFactor, memrefShape[dim], offsetDim,
-            new_chan, loc, ctx);
+            allocOp, new_chan, loc, ctx);
 
         // Update async dependency.
         auto old_token =
@@ -1384,7 +1488,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::runOnOperation() {
         }
         Value newWaitAll1 = tileChannelOpByFactor(
             theOtherChanOp[0], targetColTilingFactor, memrefShape[dim],
-            dim - numLeadingSingletonDimDiff, new_chan, loc, ctx);
+            dim - numLeadingSingletonDimDiff, allocOp, new_chan, loc, ctx);
 
         // Update dependency.
         auto oldToken =
diff --git a/mlir/lib/Util/Util.cpp b/mlir/lib/Util/Util.cpp
index 245a7404b..1647a1062 100644
--- a/mlir/lib/Util/Util.cpp
+++ b/mlir/lib/Util/Util.cpp
@@ -1511,3 +1511,26 @@ air::getOffsetDimFromMemrefDim(int dimOnMemref, SmallVector<Value> strides,
   }
   return std::nullopt;
 }
+
+// Evaluate `map` after substituting the known entries of `const_inputs` (one
+// per map input, dims first); nullopt if it does not fold to one constant.
+std::optional<int64_t>
+air::evaluateConstantsInMap(AffineMap map,
+                            SmallVector<std::optional<int64_t>> const_inputs,
+                            MLIRContext *ctx) {
+  if (map.getNumInputs() != const_inputs.size())
+    return std::nullopt;
+  const unsigned nDims = map.getNumDims();
+  for (unsigned i = 0; i < const_inputs.size(); i++) {
+    if (!const_inputs[i])
+      continue; // Unknown input: leave it symbolic.
+    auto input = i < nDims ? getAffineDimExpr(i, ctx)
+                           : getAffineSymbolExpr(i - nDims, ctx);
+    map = map.replace(input, getAffineConstantExpr(*const_inputs[i], ctx),
+                      nDims, map.getNumSymbols());
+  }
+  map = simplifyAffineMap(map);
+  if (!map.isSingleConstant()) // getSingleConstantResult() asserts otherwise
+    return std::nullopt;
+  return map.getSingleConstantResult();
+}
diff --git a/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir b/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir
index 3a246dbd0..57749050e 100644
--- a/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir
+++ b/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir
@@ -1174,3 +1174,156 @@ module {
     return
   }
 }
+
+// -----
+
+// Conv2d 3x3, stride 2 (overlapping L2 memref access).
+
+// CHECK: [[$MAP0:#map[0-9]*]] = affine_map<()[s0] -> (s0 + 2)>
+// CHECK: [[$MAP1:#map[0-9]+]] = affine_map<()[s0] -> (s0 + 4)>
+// CHECK: [[$MAP2:#map[0-9]+]] = affine_map<()[s0] -> (s0 + 6)>
+
+// CHECK-LABEL: func.func @test9
+// CHECK: air.launch
+// CHECK: %[[VAL0:.*]] = affine.apply [[$MAP0]]()
+// CHECK: %[[VAL1:.*]] = affine.apply [[$MAP1]]()
+// CHECK: %[[VAL2:.*]] = affine.apply [[$MAP2]]()
+// CHECK: air.channel.put {{.*}} @channel_0[%c0, %c0]
+// CHECK: air.channel.put {{.*}} @channel_0[%c1, %c0] (%{{.*}}[%c0, %[[VAL0]]
+// CHECK: air.channel.put {{.*}} @channel_0[%c2, %c0] (%{{.*}}[%c0, %[[VAL1]]
+// CHECK: air.channel.put {{.*}} @channel_0[%c3, %c0] (%{{.*}}[%c0, %[[VAL2]]
+// CHECK: air.segment
+// CHECK: %[[TOKEN0:.*]], %[[ALLOC0:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) {
+// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1>
+// CHECK: %[[TOKEN1:.*]], %[[ALLOC1:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) {
+// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1>
+// CHECK: %[[TOKEN2:.*]], %[[ALLOC2:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) {
+// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1>
+// CHECK: %[[TOKEN3:.*]], %[[ALLOC3:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) {
+// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1>
+// CHECK: air.channel.get async{{.*}}@channel_0[%c0{{.*}}, %c0{{.*}}] (%[[ALLOC0]]
+// CHECK: air.channel.get async{{.*}}@channel_0[%c1{{.*}}, %c0{{.*}}] (%[[ALLOC1]]
+// CHECK: air.channel.get async{{.*}}@channel_0[%c2{{.*}}, %c0{{.*}}] (%[[ALLOC2]]
+// CHECK: air.channel.get async{{.*}}@channel_0[%c3{{.*}}, %c0{{.*}}] (%[[ALLOC3]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c0{{.*}}, %c0{{.*}}] (%[[ALLOC0]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c0{{.*}}] (%[[ALLOC1]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c0{{.*}}] (%[[ALLOC2]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c0{{.*}}] (%[[ALLOC3]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c0{{.*}}, %c1{{.*}}] (%[[ALLOC0]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c1{{.*}}] (%[[ALLOC1]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c1{{.*}}] (%[[ALLOC2]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c1{{.*}}] (%[[ALLOC3]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c0{{.*}}, %c2{{.*}}] (%[[ALLOC0]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c2{{.*}}] (%[[ALLOC1]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c2{{.*}}] (%[[ALLOC2]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c2{{.*}}] (%[[ALLOC3]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c0{{.*}}, %c3{{.*}}] (%[[ALLOC0]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c3{{.*}}] (%[[ALLOC1]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c3{{.*}}] (%[[ALLOC2]]
+// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c3{{.*}}] (%[[ALLOC3]]
+// CHECK: air.herd
+// CHECK: air.channel.get async{{.*}}@channel_3
+
+#map = affine_map<()[s0] -> (s0 * 8)>
+#map1 = affine_map<()[s0] -> (s0 * 32)>
+#map2 = affine_map<()[s0, s1] -> (s0 + s1 * 2)>
+#map3 = affine_map<()[s0, s1] -> (s0 + s1 * 8)>
+module {
+  air.channel @channel_3 [4, 4]
+  air.channel @channel_1 [1, 1]
+  func.func @test9(%arg0: memref<1x513x513x16xi8>, %arg1: memref<3x3x16x32xi8>, %arg2: memref<1x256x256x32xi32>) {
+    %c64 = arith.constant 64 : index
+    %c4 = arith.constant 4 : index
+    %c16 = arith.constant 16 : index
+    %0 = air.launch async (%arg3, %arg4, %arg5) in (%arg6=%c64, %arg7=%c16, %arg8=%c4) args(%arg9=%arg0) : memref<1x513x513x16xi8> attributes {id = 1 : i32} {
+      %c8208 = arith.constant 8208 : index
+      %c4210704 = arith.constant 4210704 : index
+      %c16_0 = arith.constant 16 : index
+      %c33 = arith.constant 33 : index
+      %c9 = arith.constant 9 : index
+      %c1 = arith.constant 1 : index
+      %c0 = arith.constant 0 : index
+      %async_token, %results = air.execute -> (index) {
+        %3 = affine.apply #map()[%arg3]
+        air.execute_terminator %3 : index
+      }
+      %async_token_1, %results_2 = air.execute -> (index) {
+        %3 = affine.apply #map1()[%arg4]
+        air.execute_terminator %3 : index
+      }
+      %1 = air.channel.put async [%async_token, %async_token_1]  @channel_1[] (%arg9[%c0, %results, %results_2, %c0] [%c1, %c9, %c33, %c16_0] [%c4210704, %c8208, %c16_0, %c1]) {id = 1 : i32} : (memref<1x513x513x16xi8>)
+      %2 = air.segment @segment_0 async  attributes {id = 2 : i32} {
+        %c7 = arith.constant 7 : index
+        %c4752 = arith.constant 4752 : index
+        %c528 = arith.constant 528 : index
+        %c8 = arith.constant 8 : index
+        %c3 = arith.constant 3 : index
+        %c16_3 = arith.constant 16 : index
+        %c1_4 = arith.constant 1 : index
+        %c0_5 = arith.constant 0 : index
+        %c4_6 = arith.constant 4 : index
+        %3 = air.wait_all async 
+        %4 = air.wait_all async 
+        %async_token_7, %results_8 = air.execute -> (memref<1x9x33x16xi8, 1 : i32>) {
+          %alloc = memref.alloc() : memref<1x9x33x16xi8, 1 : i32>
+          air.execute_terminator %alloc : memref<1x9x33x16xi8, 1 : i32>
+        }
+        %5 = air.channel.get async [%3, %4, %async_token_7]  @channel_1[] (%results_8[] [] []) {id = 4 : i32} : (memref<1x9x33x16xi8, 1 : i32>)
+        %6 = scf.parallel (%arg10, %arg11) = (%c0_5, %c0_5) to (%c4_6, %c4_6) step (%c1_4, %c1_4) init (%5) -> !air.async.token {
+          %8 = scf.for %arg12 = %c0_5 to %c3 step %c1_4 iter_args(%arg13 = %5) -> (!air.async.token) {
+            %9 = scf.for %arg14 = %c0_5 to %c3 step %c1_4 iter_args(%arg15 = %arg13) -> (!air.async.token) {
+              %10 = scf.for %arg16 = %c0_5 to %c16_3 step %c8 iter_args(%arg17 = %arg15) -> (!air.async.token) {
+                %async_token_10, %results_11 = air.execute [%arg17] -> (index) {
+                  %12 = affine.apply #map2()[%arg12, %arg10]
+                  air.execute_terminator %12 : index
+                }
+                %async_token_12, %results_13 = air.execute [%arg17] -> (index) {
+                  %12 = affine.apply #map3()[%arg14, %arg11]
+                  air.execute_terminator %12 : index
+                }
+                %11 = air.channel.put async [%async_token_10, %async_token_12]  @channel_3[%arg10, %arg11] (%results_8[%c0_5, %results_11, %results_13, %arg16] [%c1_4, %c1_4, %c7, %c8] [%c4752, %c528, %c16_3, %c1_4]) {id = 7 : i32} : (memref<1x9x33x16xi8, 1 : i32>)
+                scf.yield %11 : !air.async.token
+              }
+              scf.yield %10 : !air.async.token
+            }
+            scf.yield %9 : !air.async.token
+          }
+          scf.reduce(%8 : !air.async.token) {
+          ^bb0(%arg12: !air.async.token, %arg13: !air.async.token):
+            %9 = air.wait_all async [%arg12, %arg13] 
+            scf.reduce.return %9 : !air.async.token
+          }
+        }
+        %7 = air.herd @herd_0 async [%5]  tile (%arg10, %arg11) in (%arg12=%c4_6, %arg13=%c4_6) attributes {id = 3 : i32} {
+          %c0_10 = arith.constant 0 : index
+          %c16_11 = arith.constant 16 : index
+          %c8_12 = arith.constant 8 : index
+          %c3_13 = arith.constant 3 : index
+          %c1_14 = arith.constant 1 : index
+          %8 = air.wait_all async 
+          %9 = scf.for %arg14 = %c0_10 to %c3_13 step %c1_14 iter_args(%arg15 = %8) -> (!air.async.token) {
+            %10 = scf.for %arg16 = %c0_10 to %c3_13 step %c1_14 iter_args(%arg17 = %arg15) -> (!air.async.token) {
+              %11 = scf.for %arg18 = %c0_10 to %c16_11 step %c8_12 iter_args(%arg19 = %arg17) -> (!air.async.token) {
+                %async_token_15, %results_16 = air.execute -> (memref<1x1x7x8xi8, 2 : i32>) {
+                  %alloc = memref.alloc() : memref<1x1x7x8xi8, 2 : i32>
+                  air.execute_terminator %alloc : memref<1x1x7x8xi8, 2 : i32>
+                }
+                %12 = air.channel.get async [%arg19, %async_token_15]  @channel_3[%arg10, %arg11] (%results_16[] [] []) {id = 9 : i32} : (memref<1x1x7x8xi8, 2 : i32>)
+                %async_token_17 = air.execute {
+                  memref.dealloc %results_16 : memref<1x1x7x8xi8, 2 : i32>
+                }
+                scf.yield %12 : !air.async.token
+              }
+              scf.yield %11 : !air.async.token
+            }
+            scf.yield %10 : !air.async.token
+          }
+        }
+        %async_token_9 = air.execute [%5] {
+          memref.dealloc %results_8 : memref<1x9x33x16xi8, 1 : i32>
+        }
+      }
+    }
+    return
+  }
+}