diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
index 6fc8ea3f1a7d..f874b2b2914e 100644
--- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h
+++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -1439,8 +1439,7 @@ inline void storeDistributedToShared(Value src, ArrayRef<Value> inVals,
   auto srcTy = cast<RankedTensorType>(src.getType());
   auto srcShape = srcTy.getShape();
   auto rank = srcShape.size();
-  assert(rank == 2 ||
-         rank == 3 && "Unexpected rank of storeDistributedToShared");
+  assert(rank <= 3 && "Unexpected rank of storeDistributedToShared");
   auto dstTy = cast<MemDescType>(dst.getType());
   auto srcDistributedLayout = srcTy.getEncoding();
   if (auto mmaLayout = dyn_cast(srcDistributedLayout)) {
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
index 32b1aa370dae..4966a5f73710 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -218,4 +218,18 @@ def TTG_LocalLoadOp : TTG_Op<"local_load", [MemoryEffects<[MemRead
   let results = (outs TT_Tensor:$result);
 }
 
+def TTG_LocalStoreOp : TTG_Op<"local_store", [MemoryEffects<[MemWrite]>]> {
+  let summary = "Store a distributed tensor into a buffer in local memory";
+
+  let description = [{
+    Store a distributed tensor into a buffer in local memory.
+  }];
+  let arguments = (ins TT_Tensor:$src, TT_MemDescType:$result);
+
+  // Use qualified() otherwise "!tt.memdesc<X>" is printed as "<X>".
+  let assemblyFormat = [{
+    $src `,` $result attr-dict `:` type($src) `->` qualified(type($result))
+  }];
+}
+
 #endif
diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
index fc012dcb78db..7de7f9d909f7 100644
--- a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
+++ b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
@@ -214,4 +214,16 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global",
   }];
 }
 
+def TTNG_TMAStoreWait : TTNG_Op<"async_tma_store_wait"> {
+  let summary = "Wait until all the inputs are read.";
+  let arguments = (ins I32Attr:$pendings);
+  let description = [{
+    Wait until all read operations from the associated async TMA store
+    operations are complete. This is needed before the shared memory can be
+    written to again.
+  }];
+
+  let assemblyFormat = "attr-dict";
+}
+
+
 #endif
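For reference, given the assembly formats above, the two new ops print roughly as follows (shapes, element types, and layout names are illustrative sketches, not taken from this patch):

    triton_gpu.local_store %src, %buf : tensor<128x64xf16, #blocked> -> !tt.memdesc<128x64xf16, #shared>
    triton_nvidia_gpu.async_tma_store_wait {pendings = 0 : i32}

local_store writes a register-resident (distributed) tensor into an existing shared-memory buffer, while async_tma_store_wait blocks until outstanding TMA stores have finished reading from shared memory.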
diff --git a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
index 2cbd993e0bc2..296fba5c8241 100644
--- a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
@@ -15,29 +15,30 @@ using namespace mlir::triton::gpu;
 // blocked -> shared.
 // Swizzling in shared memory to avoid bank conflict. Normally used for
 // A/B operands of dots.
-void lowerDistributedToShared(LocalAllocOp op, LocalAllocOpAdaptor adaptor,
+void lowerDistributedToShared(Operation *op, Value src, Value dst,
+                              Value adaptorSrc,
                               const LLVMTypeConverter *typeConverter,
                               ConversionPatternRewriter &rewriter,
                               const TargetInfoBase &targetInfo) {
-  auto loc = op.getLoc();
-  auto srcTy = op.getSrc().getType();
-  auto dstTy = op.getType();
+  auto loc = op->getLoc();
+  auto srcTy = cast<RankedTensorType>(src.getType());
+  auto dstTy = cast<MemDescType>(dst.getType());
   auto dstShapePerCTA = triton::gpu::getShapePerCTA(dstTy);
   auto srcLayout = srcTy.getEncoding();
   auto outOrd = mlir::cast<SharedEncodingAttr>(dstTy.getEncoding()).getOrder();
   assert(srcTy.getShape().size() == 2 ||
          (srcTy.getShape().size() <= 3 && outOrd[2] == 0) &&
              "Unexpected rank of ConvertLayout(blocked->shared)");
-  Value smemBase = LLVM::getSharedMemoryBase(loc, rewriter, op.getOperation());
+  Value smemBase = LLVM::getSharedMemoryBase(loc, rewriter, op);
   auto elemTy = typeConverter->convertType(srcTy.getElementType());
   int32_t elemSize = elemTy.getIntOrFloatBitWidth();
   unsigned numElems = triton::gpu::getTotalElemsPerThread(srcTy);
   auto dstStrides =
       LLVM::getStridesFromShapeAndOrder(dstShapePerCTA, outOrd, loc, rewriter);
-  auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
-  storeDistributedToShared(op.getSrc(), inVals, dstStrides, op.getResult(),
-                           smemBase, elemTy, loc, rewriter, targetInfo);
+  auto inVals = unpackLLElements(loc, adaptorSrc, rewriter);
+  storeDistributedToShared(src, inVals, dstStrides, dst, smemBase, elemTy, loc,
+                           rewriter, targetInfo);
 }
 
 struct LocalAllocOpConversion
@@ -73,7 +74,8 @@ struct LocalAllocOpConversion
 
     // If there is an initial tensor, store it into the shared memory.
     if (op.getSrc()) {
-      lowerDistributedToShared(op, adaptor, typeConverter, rewriter,
+      lowerDistributedToShared(op, op.getSrc(), op.getResult(),
+                               adaptor.getSrc(), typeConverter, rewriter,
                                targetInfo);
     }
 
@@ -103,6 +105,32 @@ struct LocalDeallocOpConversion
   }
 };
 
+struct LocalStoreOpConversion
+    : public ConvertOpToLLVMPattern<triton::gpu::LocalStoreOp> {
+public:
+  using ConvertOpToLLVMPattern<
+      triton::gpu::LocalStoreOp>::ConvertOpToLLVMPattern;
+
+  LocalStoreOpConversion(const LLVMTypeConverter &converter,
+                         const TargetInfoBase &targetInfo,
+                         PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern<triton::gpu::LocalStoreOp>(converter, benefit),
+        targetInfo(targetInfo) {}
+
+  LogicalResult
+  matchAndRewrite(triton::gpu::LocalStoreOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    lowerDistributedToShared(op, op.getSrc(), op.getResult(), adaptor.getSrc(),
+                             getTypeConverter(), rewriter, targetInfo);
+    rewriter.eraseOp(op);
+
+    return success();
+  }
+
+private:
+  const TargetInfoBase &targetInfo;
+};
+
 } // namespace
 
 void mlir::triton::populateMemoryOpToLLVMPattern(
@@ -110,4 +138,5 @@
     RewritePatternSet &patterns, PatternBenefit benefit) {
   patterns.add<LocalAllocOpConversion>(typeConverter, targetInfo, benefit);
   patterns.add<LocalDeallocOpConversion>(typeConverter, benefit);
+  patterns.add<LocalStoreOpConversion>(typeConverter, targetInfo, benefit);
 }
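With this refactor, local_alloc with an initial tensor and the new local_store share the same shared-memory store path (lowerDistributedToShared). At the TTGIR level the two forms look roughly like this (illustrative sketch; shapes and layout names are not from this patch, and %buf1 stands for an existing mutable buffer):

    %buf0 = triton_gpu.local_alloc %init : (tensor<32x32xf32, #blocked>) -> !tt.memdesc<32x32xf32, #shared>
    triton_gpu.local_store %val, %buf1 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared>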
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index e5a818bdd70a..0ce7ecf18cba 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -2635,6 +2635,23 @@ struct CanonicalizeConvertFromAlloc
   }
 };
 
+// local_store(cvt) -> local_store
+struct CanonicalizeConvertFromLocalStore
+    : public mlir::OpRewritePattern<triton::gpu::LocalStoreOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(triton::gpu::LocalStoreOp op,
+                  PatternRewriter &rewriter) const override {
+    auto convert = op.getSrc().getDefiningOp<ConvertLayoutOp>();
+    if (!convert)
+      return failure();
+    rewriter.replaceOpWithNewOp<triton::gpu::LocalStoreOp>(op, convert.getSrc(),
+                                                           op.getResult());
+    return mlir::success();
+  }
+};
+
 struct CanonicalizeConvertFromConvert
     : public OpRewritePattern<ConvertLayoutOp> {
   using OpRewritePattern::OpRewritePattern;
@@ -2760,6 +2777,7 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add(context);
   patterns.add(context);
   patterns.add(context);
+  patterns.add<CanonicalizeConvertFromLocalStore>(context);
 }
 
 // LocalAllocOp
diff --git a/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt b/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt
index 2f4a4e9a9a52..00d4d823cee2 100644
--- a/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt
+++ b/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt
@@ -9,6 +9,7 @@ add_triton_library(TritonGPUTransforms
   Pipeliner/OuterLoopPipeline.cpp
   Pipeliner/PipelineExpander.cpp
   Pipeliner/SoftwarePipeliner.cpp
+  Pipeliner/TMAStoresPipeline.cpp
   Pipeliner/PipeliningUtility.cpp
   Prefetch.cpp
   RemoveLayoutConversions.cpp
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.h b/lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.h
index 50c025ed211e..c61e81818201 100644
--- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.h
+++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.h
@@ -21,6 +21,9 @@ bool preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages,
 bool getOuterLoopSchedule(scf::ForOp &forOp, int numStages,
                           mlir::triton::PipeliningOption &options);
 
+/// Pipeline the TMA stores in the loop.
+bool pipelineTMAStores(scf::ForOp forOp);
+
 /// This does post-processing on the pipelined loop to try to pipeline wgmma
 /// ops.
 // TODO: this should be included as part of the pipeline but currently the wgmma
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp
index 52df69a8572b..017d37963dbb 100644
--- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp
@@ -149,6 +149,18 @@ struct PipelinePass : public TritonGPUPipelineBase<PipelinePass> {
     // the inner loop.
     for (scf::ForOp outerLoop : outerLoops)
       tryAndPipelineOuterLoop(outerLoop);
+
+    // Re-collect loop ops
+    loops.clear();
+    getOperation()->walk([&](scf::ForOp forOp) {
+      // Bail out for loops with num_stage <= 1.
+      if (getNumStagesOrDefault(forOp) > 1)
+        loops.push_back(forOp);
+    });
+
+    for (scf::ForOp forOp : loops) {
+      mlir::triton::pipelineTMAStores(forOp);
+    }
   }
 };
 } // anonymous namespace
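The new canonicalization folds a layout conversion that only feeds a local_store into the store itself. Schematically (illustrative types and layouts; the convert_layout syntax is abbreviated and not taken from this patch):

    // before
    %c = triton_gpu.convert_layout %x : tensor<64x64xf16, #blocked> -> tensor<64x64xf16, #blocked1>
    triton_gpu.local_store %c, %buf : tensor<64x64xf16, #blocked1> -> !tt.memdesc<64x64xf16, #shared>
    // after
    triton_gpu.local_store %x, %buf : tensor<64x64xf16, #blocked> -> !tt.memdesc<64x64xf16, #shared>

Dropping the conversion is safe here because the lowering of local_store can write any distributed layout into the shared-memory buffer.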
diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp
new file mode 100644
index 000000000000..6318b178d39f
--- /dev/null
+++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp
@@ -0,0 +1,93 @@
+#include "Schedule.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+
+using namespace mlir;
+namespace tt = mlir::triton;
+namespace ttg = mlir::triton::gpu;
+namespace ttng = mlir::triton::nvidia_gpu;
+
+static SmallVector<tt::ExperimentalDescriptorStoreOp>
+getTMAStores(scf::ForOp forOp) {
+  SmallVector<tt::ExperimentalDescriptorStoreOp> tmaStores;
+
+  // Do not use walk, as we don't want to walk into nested loops.
+  std::function<void(Operation *)> collectTMAStores = [&](Operation *op) {
+    if (auto storeOp = dyn_cast<tt::ExperimentalDescriptorStoreOp>(op)) {
+      tmaStores.push_back(storeOp);
+    }
+    for (Region &region : op->getRegions()) {
+      for (Operation &op : region.getOps()) {
+        if (!isa<scf::ForOp>(op))
+          collectTMAStores(&op);
+      }
+    }
+  };
+  collectTMAStores(forOp);
+  return tmaStores;
+}
+
+static Value createAlloc(scf::ForOp &forOp,
+                         tt::ExperimentalDescriptorStoreOp storeOp) {
+  OpBuilder builder(forOp);
+  auto ty = cast<RankedTensorType>(storeOp.getSrc().getType());
+  auto order = ttg::getOrder(ty.getEncoding());
+  auto ctaLayout = ttg::getCTALayout(ty.getEncoding());
+  Attribute encoding =
+      ttg::SharedEncodingAttr::get(ty.getContext(), 1, 1, 1, order, ctaLayout);
+  if (ty.getRank() > 1) {
+    encoding = ttg::SharedEncodingAttr::get(
+        ty.getContext(), ty.getShape(), order, ctaLayout, ty.getElementType());
+  }
+
+  Type memdescType = tt::MemDescType::get(ty.getShape(), ty.getElementType(),
+                                          encoding, /*mutableMemory*/ true);
+  Value alloc = builder.create<ttg::LocalAllocOp>(storeOp->getLoc(),
+                                                  memdescType, Value());
+  return alloc;
+}
+
+static void createTMAAsyncCopy(scf::ForOp &forOp,
+                               tt::ExperimentalDescriptorStoreOp storeOp,
+                               Value alloc) {
+  OpBuilder builder(storeOp);
+  auto loc = storeOp.getLoc();
+  auto ty = cast<RankedTensorType>(storeOp.getSrc().getType());
+  auto order = ttg::getOrder(ty.getEncoding());
+  auto ctaLayout = ttg::getCTALayout(ty.getEncoding());
+
+  // Putting the wait before the local_store makes the store truly async. We
+  // know that we are the only user of the CopyLocalToGlobal.
+  builder.create<ttng::TMAStoreWait>(loc, 0);
+  builder.create<ttg::LocalStoreOp>(loc, storeOp.getSrc(), alloc);
+  builder.create<ttng::FenceAsyncSharedOp>(loc, false);
+  builder.create<ttng::AsyncTMACopyLocalToGlobalOp>(
+      loc, storeOp.getDescPtr(), storeOp.getIndices(), alloc);
+
+  storeOp->erase();
+}
+
+bool mlir::triton::pipelineTMAStores(scf::ForOp forOp) {
+  SmallVector<tt::ExperimentalDescriptorStoreOp> tmaStores =
+      getTMAStores(forOp);
+  if (tmaStores.empty())
+    return false;
+
+  DenseMap<tt::ExperimentalDescriptorStoreOp, Value> storeToAlloc;
+  for (tt::ExperimentalDescriptorStoreOp op : tmaStores) {
+    storeToAlloc[op] = createAlloc(forOp, op);
+  }
+
+  for (tt::ExperimentalDescriptorStoreOp op : tmaStores) {
+    createTMAAsyncCopy(forOp, op, storeToAlloc[op]);
+  }
+
+  // Deallocate shared memory buffers.
+  OpBuilder builder(forOp);
+  builder.setInsertionPointAfter(forOp);
+  builder.create<ttng::TMAStoreWait>(forOp->getLoc(), 0);
+  for (auto it : storeToAlloc) {
+    builder.create<ttg::LocalDeallocOp>(forOp->getLoc(), it.second);
+  }
+  return true;
+}
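The net effect inside a pipelined loop matches the FileCheck expectations added later in this patch: each tt.experimental_descriptor_store is rewritten into a wait / local_store / fence / async-copy sequence that targets a shared-memory buffer hoisted out of the loop. Roughly (operand names and types are illustrative; the trailing type of the async copy is omitted):

    triton_nvidia_gpu.async_tma_store_wait {pendings = 0 : i32}
    triton_gpu.local_store %val, %buf : tensor<1xf32, #blocked> -> !tt.memdesc<1xf32, #shared>
    triton_nvidia_gpu.fence_async_shared {bCluster = false}
    triton_nvidia_gpu.async_tma_copy_local_to_global %desc[%i] %buf

Because the wait comes before the local_store rather than after the copy, the copy issued in one iteration can overlap with the work performed before the next store.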
diff --git a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
index 3ee827c2a491..12fb4161a569 100644
--- a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
@@ -249,7 +249,7 @@ bool hasConvertToMMATransisitiveUse(Operation *op, Attribute encoding) {
   bool isMMAV3 =
       isa<NvidiaMmaEncodingAttr>(encoding) &&
       cast<NvidiaMmaEncodingAttr>(encoding).getVersionMajor() == 3;
-  if (isMMAV3 && isa<LocalAllocOp>(op))
+  if (isMMAV3 && (isa<LocalAllocOp>(op) || isa<LocalStoreOp>(op)))
     return true;
   auto yield = dyn_cast<scf::YieldOp>(op);
   if (!yield)
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
index 5cb55820f9ab..cc1818ca7b6e 100644
--- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -563,7 +563,7 @@ bool canFoldIntoConversion(Operation *op, Attribute targetEncoding) {
   }
   return isa<triton::gpu::ConvertLayoutOp, arith::ConstantOp, triton::MakeRangeOp,
              triton::SplatOp, triton::HistogramOp,
-             triton::gpu::LocalAllocOp>(op);
+             triton::gpu::LocalAllocOp, triton::gpu::LocalStoreOp>(op);
 }
 
 scf::ForOp replaceForOpWithNewSignature(
diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp
index 42c14e7d1482..7b2f8d3fb6e7 100644
--- a/lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp
+++ b/lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp
@@ -84,6 +84,7 @@ class TMAStoreLowering
     rewriter.create<triton::nvidia_gpu::FenceAsyncSharedOp>(loc, false);
     rewriter.create<triton::nvidia_gpu::AsyncTMACopyLocalToGlobalOp>(
         loc, op.getDescPtr(), op.getIndices(), alloc);
+    rewriter.create<triton::nvidia_gpu::TMAStoreWait>(loc, 0);
     rewriter.eraseOp(op);
     return success();
   }
diff --git a/test/Conversion/tritonnvidiagpu_to_llvm.mlir b/test/Conversion/tritonnvidiagpu_to_llvm.mlir
index 4606d84a42b2..d6385adba962 100644
--- a/test/Conversion/tritonnvidiagpu_to_llvm.mlir
+++ b/test/Conversion/tritonnvidiagpu_to_llvm.mlir
@@ -49,9 +49,20 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
   // CHECK: elect.sync
   // CHECK: "@$0 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$1, {$2, $3}], [$4];", "b,l,r,r,r" {{.*}} : (i1, !llvm.ptr<1>, i32, i32, !llvm.ptr<3>) -> !llvm.void
   // CHECK: cp.async.bulk.commit_group
-  // CHECK: cp.async.bulk.wait_group 0
   tt.func @tma_copy_local_to_global(%tma: !tt.ptr, %alloc: !tt.memdesc<128x128xf32, #shared1>, %x: i32) {
     triton_nvidia_gpu.async_tma_copy_local_to_global %tma[%x, %x] %alloc : , <128x128xf32, #shared1>
     tt.return
   }
 }
+
+// -----
+
+#shared1 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
+  // CHECK-LABEL: async_tma_store_wait
+  // CHECK: "cp.async.bulk.wait_group.read 0x0;", "" : () -> !llvm.void
+  tt.func @async_tma_store_wait() {
+    triton_nvidia_gpu.async_tma_store_wait {pendings = 0 : i32}
+    tt.return
+  }
+}
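For a standalone (non-pipelined) tt.experimental_descriptor_store, TMALowering now emits the explicit wait right after the copy, keeping the original blocking behavior even though the inline cp.async.bulk.wait_group was dropped from the PTX (see the LoadStoreOpToLLVM change below). At the TTGIR level this is roughly the following sequence (schematic; the shared-memory allocation is created earlier in the same lowering, and shapes, layouts, and operand names are illustrative):

    %buf = triton_gpu.local_alloc %val : (tensor<1xf32, #blocked>) -> !tt.memdesc<1xf32, #shared>
    triton_nvidia_gpu.fence_async_shared {bCluster = false}
    triton_nvidia_gpu.async_tma_copy_local_to_global %desc[%idx] %buf
    triton_nvidia_gpu.async_tma_store_wait {pendings = 0 : i32}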
diff --git a/test/TritonGPU/loop-pipeline-hopper.mlir b/test/TritonGPU/loop-pipeline-hopper.mlir
index af7d86a08c5e..b3dc9d883433 100644
--- a/test/TritonGPU/loop-pipeline-hopper.mlir
+++ b/test/TritonGPU/loop-pipeline-hopper.mlir
@@ -691,3 +691,22 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 :
     tt.return %17#0, %17#2 : tensor<128x64xf32, #mma>, tensor<128x16xf32, #mma1>
   }
 }
+
+// -----
+// Test pipelining of experimental_descriptor_store
+#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: tma_store_pipeline
+  tt.func public @tma_store_pipeline(%arg0: tensor<1xf32, #blocked>, %arg1: !tt.ptr, %arg2: i32, %arg3: i32) attributes {noinline = false} {
+    %c0_i32 = arith.constant 0 : i32
+    scf.for %arg4 = %c0_i32 to %arg3 step %arg2 : i32 {
+      %1 = arith.divsi %arg4, %arg2 : i32
+      // CHECK: triton_nvidia_gpu.async_tma_store_wait {pendings = 0 : i32}
+      // CHECK-NEXT: triton_gpu.local_store
+      // CHECK-NEXT: triton_nvidia_gpu.fence_async_shared
+      // CHECK-NEXT: triton_nvidia_gpu.async_tma_copy_local_to_global
+      tt.experimental_descriptor_store %arg1[%1], %arg0 : !tt.ptr, tensor<1xf32, #blocked>
+    }
+    tt.return
+  }
+}
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
index 092cda1d7b04..bc6d1ace6724 100644
--- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
+++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -1061,8 +1061,7 @@ struct AsyncTMACopyLocalToGlobalOpConversion
 
     // TODO: Separate the syncronizations operations into separate TTGIR ops to
     // be able to schedule them at the high level.
-    const std::string ptx = "cp.async.bulk.commit_group; \n\t"
-                            "cp.async.bulk.wait_group 0";
+    const std::string ptx = "cp.async.bulk.commit_group";
     PTXBuilder ptxBuilderSync;
     ptxBuilderSync.create<>(ptx)->operator()();
     ptxBuilderSync.launch(rewriter, op.getLoc(), void_ty(op.getContext()));
@@ -1121,6 +1120,27 @@ struct AsyncCommitGroupOpConversion
   }
 };
 
+struct TMAStoreWaitConversion
+    : public ConvertOpToLLVMPattern<triton::nvidia_gpu::TMAStoreWait> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(triton::nvidia_gpu::TMAStoreWait op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    PTXBuilder ptxBuilder;
+    auto &asyncWaitOp = *ptxBuilder.create<>("cp.async.bulk.wait_group.read");
+    auto num = op.getPendings();
+    asyncWaitOp(ptxBuilder.newConstantOperand(num));
+
+    auto ctx = op.getContext();
+    auto loc = op.getLoc();
+    auto voidTy = void_ty(ctx);
+    ptxBuilder.launch(rewriter, loc, voidTy);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::triton::NVIDIA::populateLoadStoreOpToLLVMPatterns(
@@ -1133,5 +1153,6 @@
   patterns.add(typeConverter, benefit);
   patterns.add(typeConverter, benefit);
   patterns.add<AsyncTMACopyGlobalToLocalOpConversion,
-               AsyncTMACopyLocalToGlobalOpConversion>(typeConverter, benefit);
+               AsyncTMACopyLocalToGlobalOpConversion, TMAStoreWaitConversion>(
+      typeConverter, benefit);
 }
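Taken together, pipelineTMAStores leaves the loop in roughly the following shape: the buffer is allocated once before the loop and released after it, with a final wait to drain outstanding stores (schematic IR with comments standing in for the bodies; names, shapes, and the printed memdesc form are illustrative):

    %buf = triton_gpu.local_alloc : () -> !tt.memdesc<1xf32, #shared, mutable>
    scf.for %i = %lb to %ub step %step : i32 {
      // per-iteration wait / local_store / fence / async copy, as sketched earlier
    }
    triton_nvidia_gpu.async_tma_store_wait {pendings = 0 : i32}
    triton_gpu.local_dealloc %buf : !tt.memdesc<1xf32, #shared, mutable>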