diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp index 761fb475de5..91b3d9c7807 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp @@ -18,6 +18,7 @@ #include "KernelRunner.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/envUtils.h" #include "trtllmGen_gemm_export/GemmInterface.h" #include "trtllmGen_gemm_export/GemmOptions.h" #include "trtllmGen_gemm_export/trtllm/gen/DtypeDecl.h" @@ -46,9 +47,10 @@ TrtllmGenGemmRunner::TrtllmGenGemmRunner(TrtllmGenGemmRunnerOptions const& optio auto const options = configs[i].mOptions; // When we include low-latency kernels we can set transposeMmaOutput via constructor - if (options.mDtypeA == mOptions.eltType && options.mDtypeC == mOptions.outputType + if (options.mDtypeA == mOptions.eltTypeA && options.mDtypeC == mOptions.outputType && options.mUseDeepSeekFp8 == mOptions.deepSeekFp8 - && options.mTransposeMmaOutput == mOptions.transposeMmaOutput) + && options.mTransposeMmaOutput == mOptions.transposeMmaOutput + && (mOptions.eltTypeB == gemm::trtllm::gen::Dtype::Void || options.mDtypeB == mOptions.eltTypeB)) { mPassingConfigIndices.push_back(i); } @@ -113,8 +115,8 @@ void TrtllmGenGemmRunner::run(int32_t m, int32_t n, int32_t k, void const* a, fl // FIXME once we start using all-reduce in the epilogue of the gemm this can be moved elsewhere gemm.runInitBeforeWorldSync(config, gemmData, static_cast(stream)); - auto const err = gemm.run( - config, workspace, gemmData, static_cast(stream), multiProcessorCount, globalTrtllmGenGemmModuleCache); + auto const err = gemm.run(config, workspace, gemmData, static_cast(stream), multiProcessorCount, + tensorrt_llm::common::getEnvEnablePDL(), globalTrtllmGenGemmModuleCache); TLLM_CHECK_WITH_INFO(err == 0, "Error occurred when running GEMM!"); } @@ -141,12 +143,30 @@ void 
TrtllmGenGemmRunner::selectGemmConfig(int32_t m, int32_t n, int32_t k) std::vector sortedIndices = mPassingConfigIndices; std::sort(sortedIndices.begin(), sortedIndices.end(), - [&configs](int32_t idx0, int32_t idx1) + [&configs, &gemmData](int32_t idx0, int32_t idx1) { auto const& optionsA = configs[idx0].mOptions; auto const& optionsB = configs[idx1].mOptions; - // Sort by tileK sizes first + // Choose the tileN that is closest to the problem N. Also if one tileN is larger and the other is smaller, + // prefer the larger one. This is the batch size dimension for low latency (transposeMmaOutput) case; + if (optionsA.mTileN != optionsB.mTileN) + { + auto const N = gemmData.mProblemDimensions.mN; + auto const tileA = optionsA.mTileN; + auto const tileB = optionsB.mTileN; + + // If one tile is larger than N and one is smaller, prefer the larger one + if ((tileA >= N) != (tileB >= N)) + { + return tileA > tileB; + } + + // Otherwise, choose the closest to N + return abs(N - tileA) < abs(N - tileB); + } + + // Sort by tileK sizes if (optionsA.mTileK != optionsB.mTileK) { return optionsA.mTileK > optionsB.mTileK; @@ -158,6 +178,13 @@ void TrtllmGenGemmRunner::selectGemmConfig(int32_t m, int32_t n, int32_t k) return optionsA.mUseUnrollLoop2xForMma; } + // Sort by tileM sizes + // This is the batch size dimension for throughput (non-transposeMmaOutput) case; + if (optionsA.mTileM != optionsB.mTileM) + { + return optionsA.mTileM > optionsB.mTileM; + } + // Then by splitK sizes if (optionsA.mNumSlicesForSplitK != optionsB.mNumSlicesForSplitK) { diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h index 9510a0ae8ab..6bddd8cf3d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h @@ -28,7 +28,8 @@ namespace kernels struct TrtllmGenGemmRunnerOptions { - gemm::trtllm::gen::Dtype eltType; + 
gemm::trtllm::gen::Dtype eltTypeA; + gemm::trtllm::gen::Dtype eltTypeB{gemm::trtllm::gen::Dtype::Void}; gemm::trtllm::gen::Dtype outputType; bool deepSeekFp8{false}; bool transposeMmaOutput{false}; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h index 0ff3334a3ed..adae51a36df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h @@ -39,6 +39,31 @@ enum class AllReduceAlgo : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class MatrixLayout +{ + // K-major layout (default). [Mn, K] + MajorK = 0, + // M-major for A and N-major for B. [K, Mn] + MajorMn, + // Layout is blocked along the K dimension as seen in the diagram below. [K / blockK, Mn, blockK] + // where blockK is fixed at 128B + // + // ├────────────── K ──────────────┤ + // ┬ ┬ ├──── K block ───┤ + // │ │ │ 0 1 2 3 ║ 32 33 34 35 │ + // │ CTA0 │ 4 5 6 7 ║ 36 37 38 39 │ + // │ │ │ 8 9 10 11 ║ 40 41 42 43 │ + // │ ┴ │ 12 13 14 15 ║ 44 45 46 47 │ + // M ┬ ├────────────────║────────────────┤ + // │ │ │ 16 17 18 19 ║ 48 49 50 51 │ + // │ CTA1 │ 20 21 22 23 ║ 52 53 54 55 │ + // │ │ │ 24 25 26 27 ║ 56 57 58 59 │ + // ┴ ┴ │ 28 29 30 31 ║ 60 61 62 63 │ + BlockMajorK +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class SplitK : uint32_t { // No split-k is needed. I.e. mNumSlicesForSplitK == 1. @@ -54,6 +79,20 @@ enum class SplitK : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class BiasType : uint32_t +{ + // No bias. + None = 0, + // One bias value per row M of the output tensor (bias shape [M], broadcast along N). + M = 1, + // One bias value per column N of the output tensor (bias shape [N], broadcast along M).
+ N = 2, + // One bias value for each element of the output tensor. + Mn = 3, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class TileScheduler { // Static scheduler (Non-persistent). @@ -80,6 +119,23 @@ SPLIT_K_FUNCTION(Dsmem) //////////////////////////////////////////////////////////////////////////////////////////////////// +// Helper functions to check the Bias type. + +#define BIAS_TYPE_FUNCTION(Mode) \ + inline bool isBiasType##Mode(BiasType type) \ + { \ + return (type == BiasType::Mode); \ + } + +BIAS_TYPE_FUNCTION(None) +BIAS_TYPE_FUNCTION(N) +BIAS_TYPE_FUNCTION(M) +BIAS_TYPE_FUNCTION(Mn) + +#undef BIAS_TYPE_FUNCTION + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gemm } // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h index 459d831e0b8..8fd3b10e830 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h @@ -63,8 +63,10 @@ struct GemmData { // The matrix A. The data type is controlled by options.mDtypeA. // - // When transposeMatrixA is false, the shape is [M, K]. - // Otherwise, the shape is [K, M]. + // When layoutA is MatrixLayout::MajorK, the shape is [M, K]. + // When LayoutA is MatrixLayout::MajorMn, the shape is [K, M]. + // When LayoutA is MatrixLayout::BlockMajorK, the shape is [K / blockK, M, blockK] where blockK + // is 128B. // The rightmost dimension is contiguous in memory. void const* mPtrA{nullptr}; @@ -100,8 +102,10 @@ struct GemmData // The matrix B. The data type is controlled by options.mDtypeB. // - // When transposeMatrixB is true, the shape is [N, K]. - // Otherwise, the shape is [K, N]. 
+ // When layoutB is MatrixLayout::MajorK, the shape is [N, K]. + // When layoutB is MatrixLayout::MajorMn, the shape is [K, N]. + // When layoutB is MatrixLayout::BlockMajorK, the shape is [K / blockK, N, blockK] where blockK + // is 128B. // The rightmost dimension is contiguous in memory. void const* mPtrB{nullptr}; @@ -142,8 +146,33 @@ struct GemmData // The shape is [N] void const* mPtrPerTokenSfB{nullptr}; - // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. + // The bias applied after the GEMM. + // The bias is applied before applying the global scaling factor. I.e. + // C' = (A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // if BiasType is N, the shape is [N]. + // The bias is broadcasted along the M dimension. + // + // if BiasType is M, the shape is [M]. + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* mPtrBias{nullptr}; + + // The output tensor scaling factor for Fp8 (not DeepSeek FP8) and NvFp4 quantization. // TensorRT-LLM API requires a scaling factor on the device. + // scaleC = dequantA * dequantB * quantC, + // where dequantA is global dequantization scaling factor of A + // if dtypeA is FP8, it transforms the range from [-448, 448] to [-amaxA, amaxA] + // if dtypeA is NvFp4, it transforms the range from [-448 * 6, 448 * 6] to [-amaxA, amaxA], + // otherwise it is 1. + // dequantB is defined similarly to dequantA. + // quantC is the quantization scaling factor of C. + // if dtypeC is FP8, it transforms the range from [-amaxC, amaxC] to [-448, 448] + // if dtypeC is NvFp4, it transforms the range from [-amaxC, amaxC] to [-448 * 6, 448 * 6], + // otherwise it is 1. // Shape is [1]. void* mPtrScaleC{nullptr}; }; @@ -230,7 +259,7 @@ class GemmInterface // Launch the cubin from the provided config. 
It calls all necessary memsets for internal buffers. // Provided config must be validated with isValidConfig before the call. int32_t run(GemmConfig const& config, void* workspace, GemmData const& options, void* cudaStream, - int32_t multiProcessorCount, + int32_t multiProcessorCount, bool usePdl = true, std::optional> moduleCache = std::nullopt) const; // Initializes the buffers before the world sync. Must be called before run. @@ -378,7 +407,7 @@ bool GemmInterface::isValidConfig(GemmConfig const& config, GemmData const& data auto options = getOptionsFromConfigAndData(config, data); // Is Blackwell? - bool isBlackwell = config.mSm == SmVersion::Sm100a; + bool isBlackwell = isSmVersionBlackwell(config.mSm); // Check options without modifications. return checkAndUpdateGemmOptions(options, isBlackwell, data.mProblemDimensions.mWorldSize, @@ -388,8 +417,11 @@ bool GemmInterface::isValidConfig(GemmConfig const& config, GemmData const& data //////////////////////////////////////////////////////////////////////////////////////////////////// int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData const& data, void* cudaStream, - int32_t multiProcessorCount, std::optional> moduleCache) const + int32_t multiProcessorCount, bool usePdl, std::optional> moduleCache) const { + // Might be used. + (void) usePdl; + (void) moduleCache; // Get options from config and data. auto options = getOptionsFromConfigAndData(config, data); @@ -417,15 +449,14 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c int numTilesN = gemm::divUp(options.mN, options.mTileN); // Create kernel params. 
- auto kernelParams = gemm::KernelParams::setKernelParams(options, data.mInputBuffers.mPtrA, + auto kernelParams = gemm::KernelParamsSetup::setKernelParams(options, data.mInputBuffers.mPtrA, data.mInputBuffers.mPtrSfA, data.mInputBuffers.mPtrPerTokenSfA, data.mInputBuffers.mPtrB, - data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mOutputBuffers.mPtrC, - data.mOutputBuffers.mPtrSfC, data.mOutputBuffers.mPtrMultiMemC, (float*) data.mInputBuffers.mPtrScaleC, - dSplitKSlices, data.mAllReduceBuffers.mPtrTileBars, data.mAllReduceBuffers.mPtrMultiMemTileBars, - data.mAllReduceBuffers.mPtrCompletionBars, data.mAllReduceBuffers.mPtrMultiMemCompletionBars, - dPtrSplitKCompletionBars, + data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mInputBuffers.mPtrBias, + data.mOutputBuffers.mPtrC, data.mOutputBuffers.mPtrSfC, data.mOutputBuffers.mPtrMultiMemC, + (float*) data.mInputBuffers.mPtrScaleC, dSplitKSlices, data.mAllReduceBuffers.mPtrTileBars, + data.mAllReduceBuffers.mPtrMultiMemTileBars, data.mAllReduceBuffers.mPtrCompletionBars, + data.mAllReduceBuffers.mPtrMultiMemCompletionBars, dPtrSplitKCompletionBars, /* dPtrNumNonExitingCtas */ nullptr, data.mProblemDimensions.mRank, data.mProblemDimensions.mWorldSize); - // The size of the grid. 
std::vector grid{numTilesM, numTilesN, options.mNumSlicesForSplitK}; @@ -443,26 +474,26 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c #ifdef TLLM_GEN_EXPORT_INTERFACE CUmodule cuModule; CUfunction cuFunction; + if (moduleCache.has_value()) { ModuleCache& moduleCacheRef = moduleCache.value().get(); - // Modules are associated with a specific context so include the ctxId in the key + // Modules are associated with a specific context, so the context is included in the key CUcontext ctx; unsigned long long ctxId; cuCtxGetCurrent(&ctx); cuCtxGetId(ctx, &ctxId); - // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a string in decimal - // representation. + // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a + // string in decimal representation. std::string const ctxName = std::string(reinterpret_cast(&ctxId), sizeof(unsigned long long) / sizeof(char)); std::string const funcName = std::string(config.mFunctionName); - // As the ctxName is a fixed number of bytes, the two strings can just be appended without risk of a collision auto const moduleKey = ctxName + funcName; auto module = moduleCacheRef.find(moduleKey); - // Check if module exists in cache. Otherwise, load it + // Use cache if module is found, otherwise load and insert into cache if (module != moduleCacheRef.end()) { cuFunction = std::get<1>(module->second); @@ -492,17 +523,18 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c // Run the kernel. 
auto result = trtllm::gen::launchKernel((void*) &kernelParams, cudaStream, config.mSharedMemSize, cuFunction, block3, grid3, cluster3, - config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA - | config.mOptions.mGridWaitForPrimaryB); - if (result != CUDA_SUCCESS) - { - return -1; - } + usePdl + && (config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA + | config.mOptions.mGridWaitForPrimaryB)); // If a module cache has not been given, unload the module to avoid leaking if (!moduleCache.has_value()) { cuModuleUnload(cuModule); } + if (result != CUDA_SUCCESS) + { + return -1; + } #else config.mCudaRunner->run((void*) &kernelParams, (void*) cudaStream, grid); #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h index 8ab241fc6cf..234f406af6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include @@ -31,23 +32,30 @@ #else #include +template +void printArgs(T arg) +{ +#ifdef TLLM_GEN_DEBUG + std::cout << arg; +#endif +} + template void printArgs(T first, Args... args) { -#ifdef TLLM_GEN_DEBUG - std::cout << first; + printArgs(first); if constexpr (sizeof...(args) > 0) { - std::cout << " "; + printArgs(", "); printArgs(args...); } -#endif } #define TLLM_CHECK_ERROR(cond, ...) \ if (!(cond)) \ { \ printArgs(__VA_ARGS__); \ + printArgs("\n"); \ return false; \ } @@ -59,6 +67,7 @@ void printArgs(T first, Args... args) if (!(cond)) \ { \ printArgs(__VA_ARGS__); \ + printArgs("\n"); \ return false; \ } @@ -66,7 +75,7 @@ void printArgs(T first, Args... args) #define TLLM_LOG_INFO(...) 
TLLM_CHECK_WARNING(false, __VA_ARGS__) -#endif +#endif // TLLM_GEN_EXPORT_INTERFACE namespace gemm { @@ -91,20 +100,23 @@ struct GemmOptions GemmOptions() = default; - GemmOptions(AllReduceAlgo allReduceAlgo, int clusterDimX, int clusterDimY, int clusterDimZ, tg::Dtype dtypeAcc, - tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool enablesEarlyExit, bool enablesDelayedEarlyExit, - bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, - bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, - bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, - KernelTraits kernelTraits, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, - int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, - int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool useShuffledMatrixA, - bool sliceK, SplitK splitK, bool transposeMatrixA, bool transposeMatrixB, bool transposeMmaOutput, int tileM, - int tileN, int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, - bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, - bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, - tg::SfLayout sfLayoutC, TileScheduler tileScheduler) + GemmOptions(AllReduceAlgo allReduceAlgo, BiasType biasType, int blockK, int clusterDimX, int clusterDimY, + int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, + int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, + bool gridTriggerSecondaryB, bool 
gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, + bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, + MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, + bool mockAllReduce, int n, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, + int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, + bool patchF2fp, std::optional sfBlockSizeA, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, + tg::SfLayout sfLayoutC, int sfReshapeFactor, bool sliceK, SplitK splitK, int tileK, int tileM, int tileN, + TileScheduler tileScheduler, bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, + bool useHoistTryWaitForCustomMmaSchedule, bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrixA, + bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int worldSize) : mAllReduceAlgo{allReduceAlgo} + , mBiasType{biasType} + , mBlockK(blockK) , mClusterDimX{clusterDimX} , mClusterDimY{clusterDimY} , mClusterDimZ{clusterDimZ} @@ -112,6 +124,8 @@ struct GemmOptions , mDtypeA{dtypeA} , mDtypeB{dtypeB} , mDtypeC{dtypeC} + , mDtypeMmaA{dtypeMmaA} + , mDtypeMmaB{dtypeMmaB} , mEnablesEarlyExit{enablesEarlyExit} , mEnablesDelayedEarlyExit{enablesDelayedEarlyExit} , mEnablesGlobalPtxKnobs{enablesGlobalPtxKnobs} @@ -128,6 +142,8 @@ struct GemmOptions , mHoistMmaTaskTryWaits{hoistMmaTaskTryWaits} , mK{k} , mKernelTraits{kernelTraits} + , mLayoutA{layoutA} + , mLayoutB{layoutB} , mM{m} , mMmaK{mmaK} , mMmaKind{mmaKind} @@ -143,34 +159,39 @@ struct GemmOptions , mNumStagesMmaAcrossWorkTile{numStagesMmaAcrossWorkTile} , mNumStagesWorkId{numStagesWorkId} , mOutputDebugTensors{outputDebugTensors} - , mUseShuffledMatrixA{useShuffledMatrixA} + , mPatchF2fp{patchF2fp} + , mSfBlockSizeA{sfBlockSizeA} + , mSfLayoutA{sfLayoutA} + , 
mSfLayoutB{sfLayoutB} + , mSfLayoutC{sfLayoutC} + , mSfReshapeFactor{sfReshapeFactor} , mSliceK{sliceK} , mSplitK{splitK} - , mTransposeMatrixA{transposeMatrixA} - , mTransposeMatrixB{transposeMatrixB} - , mTransposeMmaOutput{transposeMmaOutput} + , mTileK{tileK} , mTileM{tileM} , mTileN{tileN} - , mTileK{tileK} - , mUseUnrollLoop2xForMma{useUnrollLoop2xForMma} + , mTileScheduler{tileScheduler} + , mTransposeMmaOutput{transposeMmaOutput} , mUseCustomMmaSchedule{useCustomMmaSchedule} - , mUseHoistTryWaitForCustomMmaSchedule{useHoistTryWaitForCustomMmaSchedule} , mUseDeepSeekFp8{useDeepSeekFp8} + , mUseHoistTryWaitForCustomMmaSchedule{useHoistTryWaitForCustomMmaSchedule} , mUsePerTokenSfA{usePerTokenSfA} , mUsePerTokenSfB{usePerTokenSfB} + , mUseShuffledMatrixA{useShuffledMatrixA} , mUseTmaStore{useTmaStore} , mUseTwoTmaLoadWarps{useTwoTmaLoadWarps} , mUseTwoMmaWarps{useTwoMmaWarps} - , mSfLayoutA{sfLayoutA} - , mSfLayoutB{sfLayoutB} - , mSfLayoutC{sfLayoutC} - , mTileScheduler{tileScheduler} + , mUseUnrollLoop2xForMma{useUnrollLoop2xForMma} + , mWorldSize{worldSize} { } // The all-reduce algorithm. AllReduceAlgo mAllReduceAlgo{AllReduceAlgo::None}; - + // The type of bias. + BiasType mBiasType{BiasType::None}; + // Block size in the K dimension + int mBlockK{-1}; // Cluster size in X dim. int mClusterDimX{1}; // Cluster size in Y dim. @@ -185,6 +206,10 @@ struct GemmOptions tg::Dtype mDtypeB{tg::Dtype::Void}; // Data type of the outputs. tg::Dtype mDtypeC{tg::Dtype::Void}; + // Data type of the A matrix for the MMA, if different from the input type. + tg::Dtype mDtypeMmaA{tg::Dtype::Void}; + // Data type of the B matrix for the MMA, if different from the input type. + tg::Dtype mDtypeMmaB{tg::Dtype::Void}; // Whether to enable early exit. bool mEnablesEarlyExit{false}; // Whether to enable delayed early exit to overlap @@ -225,6 +250,10 @@ struct GemmOptions int mK{16 * 16}; // Traits of the kernel. 
KernelTraits mKernelTraits{}; + // Layout of A matrix + MatrixLayout mLayoutA{MatrixLayout::MajorK}; + // Layout of B matrix + MatrixLayout mLayoutB{MatrixLayout::MajorK}; // The M dimension of GEMM. int mM{128 * 2}; // Size of the MMA instruction in the K dimension. @@ -259,52 +288,60 @@ struct GemmOptions int mNumStagesWorkId{3}; // Whether to output debug tensors. bool mOutputDebugTensors{false}; - // Reorder rows/cols in the A matrix for the better memory accesses in the M-major epilogue. - bool mUseShuffledMatrixA{false}; + // Patch float conversions. + bool mPatchF2fp{false}; + // Block size of A. For dtypeA == E2m1 and dtypeB == E4m3. + std::optional mSfBlockSizeA{std::nullopt}; + // Scale factors layout for A. + tg::SfLayout mSfLayoutA{tg::SfLayout::R128c4}; + // Scale factors layout for B. + tg::SfLayout mSfLayoutB{tg::SfLayout::R128c4}; + // Scale factors layout for C. + tg::SfLayout mSfLayoutC{tg::SfLayout::R128c4}; + // Number of "repeats", i.e. reshaping factor, to fold hidden dimension into SfBlock dimension. + // As result, the hidden dimension of the SF tensor must be a multiple of NumRepeats * + // numEltsPerSf * 4. This reduces the problem shape space that the kernel is able to run. + // But it reduces the number of L2 requests under the hood and potentially improves perf. + // Applies to layout 8x4 only. + int mSfReshapeFactor{1}; // Slice-K implementation to use TileM dimension for TileK. bool mSliceK{false}; // The location of the exchange for split-K (it's None when split-K is disabled). SplitK mSplitK{SplitK::None}; - // Is A matrix in a transposed layout? M major if true, K major otherwise - bool mTransposeMatrixA{false}; - // Is B matrix in a transposed layout? K major if true, N major otherwise - bool mTransposeMatrixB{true}; - // Save output of MMA in M-major format. - bool mTransposeMmaOutput{false}; + // K tile dimension of GEMM. + int mTileK{16}; // M tile dimension of GEMM. int mTileM{128}; // N tile dimension of GEMM. 
int mTileN{32}; - // K tile dimension of GEMM. - int mTileK{16}; - // Whether to unroll the loop by 2x. - bool mUseUnrollLoop2xForMma{true}; + // Tile scheduler type. + TileScheduler mTileScheduler{TileScheduler::Static}; + // Save output of MMA in M-major format. + bool mTransposeMmaOutput{false}; // Use custom MMA schedule optimized for low-latency. bool mUseCustomMmaSchedule{false}; + // Use DeepSeek Fp8. + bool mUseDeepSeekFp8{false}; // The purpose of hoisting trywaits is to opportunistically peek at the availability of the next // k-block. It benefits when the next k-block is already available and thus sustaining the // momentum, but it adds latency to the first k-block for smaller k-loop. bool mUseHoistTryWaitForCustomMmaSchedule{false}; - // Use DeepSeek Fp8. - bool mUseDeepSeekFp8{false}; // Apply per-token scales from A bool mUsePerTokenSfA{false}; // Apply per-token scales from B bool mUsePerTokenSfB{false}; + // Reorder rows/cols in the A matrix for the better memory accesses in the M-major epilogue. + bool mUseShuffledMatrixA{false}; // Use TMA to store the result. bool mUseTmaStore{true}; // Use two different warps for A and B matrix load. bool mUseTwoTmaLoadWarps{false}; // Use two different warps for MMA tasks. Applicable only to DeepSeek FP8. bool mUseTwoMmaWarps{false}; - // Scale factors layout for A. - tg::SfLayout mSfLayoutA{tg::SfLayout::R128c4}; - // Scale factors layout for B. - tg::SfLayout mSfLayoutB{tg::SfLayout::R128c4}; - // Scale factors layout for C. - tg::SfLayout mSfLayoutC{tg::SfLayout::R128c4}; - // Tile scheduler type. - TileScheduler mTileScheduler{TileScheduler::Static}; + // Whether to unroll the loop by 2x. + bool mUseUnrollLoop2xForMma{true}; + // World size for all-reduce. 
+ int mWorldSize{1}; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -312,9 +349,18 @@ struct GemmOptions enum class SmVersion { Sm90a, - Sm100a + Sm100a, + Sm100f, + Sm103a }; +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Returns true when smVersion is Sm100a, Sm100f or Sm103a. Declared inline because the definition lives in a header. +inline bool isSmVersionBlackwell(SmVersion smVersion) +{ + return smVersion == SmVersion::Sm100a || smVersion == SmVersion::Sm100f || smVersion == SmVersion::Sm103a; +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // // GemmConfig @@ -332,6 +378,7 @@ struct GemmConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; + char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -373,6 +420,10 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mAllReduceAlgo=" << "gemm::AllReduceAlgo(" << static_cast(options.mAllReduceAlgo) << ")" << "," << std::endl; + ss << "mBiasType=" + << "gemm::BiasType(" << static_cast(options.mBiasType) << ")" + << "," << std::endl; + ss << "mBlockK=" << options.mBlockK << "," << std::endl; ss << "mClusterDimX=" << options.mClusterDimX << "," << std::endl; ss << "mClusterDimY=" << options.mClusterDimY << "," << std::endl; ss << "mClusterDimZ=" << options.mClusterDimZ << "," << std::endl; @@ -388,6 +439,12 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mDtypeC=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeC) << ")" << "," << std::endl; + ss << "mDtypeMmaA=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaA) << ")" + << "," << std::endl; + ss << "mDtypeMmaB=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaB) << ")" + << "," << std::endl; ss << "mEnablesEarlyExit=" << options.mEnablesEarlyExit << "," << std::endl; ss << "mEnablesDelayedEarlyExit=" << options.mEnablesDelayedEarlyExit << "," <<
std::endl; ss << "mEnablesGlobalPtxKnobs=" << options.mEnablesGlobalPtxKnobs << "," << std::endl; @@ -405,6 +462,10 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mK=" << options.mK << "," << std::endl; ss << "mKernelTraits={}" << "," << std::endl; + ss << "mLayoutA=gemm::MatrixLayout(" << static_cast(options.mLayoutA) << ")" + << "," << std::endl; + ss << "mLayoutB=gemm::MatrixLayout(" << static_cast(options.mLayoutB) << ")" + << "," << std::endl; ss << "mM=" << options.mM << "," << std::endl; ss << "mMmaK=" << options.mMmaK << "," << std::endl; ss << "mMmaKind=" @@ -422,37 +483,49 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mNumStagesMmaAcrossWorkTile=" << options.mNumStagesMmaAcrossWorkTile << "," << std::endl; ss << "mNumStagesWorkId=" << options.mNumStagesWorkId << "," << std::endl; ss << "mOutputDebugTensors=" << options.mOutputDebugTensors << "," << std::endl; - ss << "mUseShuffledMatrixA=" << options.mUseShuffledMatrixA << "," << std::endl; + ss << "mPatchF2fp=" << options.mPatchF2fp << "," << std::endl; + if (options.mSfBlockSizeA.has_value()) + { + ss << "mSfBlockSizeA=" << options.mSfBlockSizeA.value() << "," << std::endl; + } + else + { + ss << "mSfBlockSizeA=" + << "std::nullopt" + << ", " << std::endl; + } + ss << "mSfLayoutA=" + << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutA) << ")" + << "," << std::endl; + ss << "mSfLayoutB=" + << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutB) << ")" + << "," << std::endl; + ss << "mSfLayoutC=" + << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutC) << ")" + << "," << std::endl; + ss << "mSfReshapeFactor=" << options.mSfReshapeFactor << "," << std::endl; ss << "mSliceK=" << options.mSliceK << "," << std::endl; ss << "mSplitK=" << "gemm::SplitK(" << static_cast(options.mSplitK) << ")" << "," << std::endl; - ss << "mTransposeMatrixA=" << options.mTransposeMatrixA << "," << std::endl; - ss << "mTransposeMatrixB=" << 
options.mTransposeMatrixB << "," << std::endl; - ss << "mTransposeMmaOutput=" << options.mTransposeMmaOutput << "," << std::endl; + ss << "mTileK=" << options.mTileK << "," << std::endl; ss << "mTileM=" << options.mTileM << "," << std::endl; ss << "mTileN=" << options.mTileN << "," << std::endl; - ss << "mTileK=" << options.mTileK << "," << std::endl; - ss << "mUseUnrollLoop2xForMma=" << options.mUseUnrollLoop2xForMma << "," << std::endl; + ss << "mTileScheduler=" + << "gemm::TileScheduler(" << static_cast(options.mTileScheduler) << ")" + << "," << std::endl; + ss << "mTransposeMmaOutput=" << options.mTransposeMmaOutput << "," << std::endl; ss << "mUseCustomMmaSchedule=" << options.mUseCustomMmaSchedule << "," << std::endl; - ss << "mUseHoistTryWaitForCustomMmaSchedule=" << options.mUseHoistTryWaitForCustomMmaSchedule << "," << std::endl; ss << "mUseDeepSeekFp8=" << options.mUseDeepSeekFp8 << "," << std::endl; + ss << "mUseHoistTryWaitForCustomMmaSchedule=" << options.mUseHoistTryWaitForCustomMmaSchedule << "," << std::endl; ss << "mUsePerTokenSfA=" << options.mUsePerTokenSfA << "," << std::endl; ss << "mUsePerTokenSfB=" << options.mUsePerTokenSfB << "," << std::endl; + ss << "mUseShuffledMatrixA=" << options.mUseShuffledMatrixA << "," << std::endl; ss << "mUseTmaStore=" << options.mUseTmaStore << "," << std::endl; ss << "mUseTwoTmaLoadWarps=" << options.mUseTwoTmaLoadWarps << "," << std::endl; ss << "mUseTwoMmaWarps=" << options.mUseTwoMmaWarps << "," << std::endl; - ss << "mSfLayoutA=" - << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutA) << ")" - << "," << std::endl; - ss << "mSfLayoutB=" - << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutB) << ")" - << "," << std::endl; - ss << "mSfLayoutC=" - << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutC) << ")" - << "," << std::endl; - ss << "mTileScheduler=" - << "gemm::TileScheduler(" << static_cast(options.mTileScheduler) << ")" << std::endl; + ss << "mUseUnrollLoop2xForMma=" << 
options.mUseUnrollLoop2xForMma << "," << std::endl; + ss << "mWorldSize=" << options.mWorldSize << std::endl; return ss.str(); } @@ -487,9 +560,10 @@ inline int32_t getShuffleBlockSize(int epilogueTileM) //////////////////////////////////////////////////////////////////////////////////////////////////// // Check if the options are valid or not. -inline bool checkAndUpdateGemmOptions( - GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true) +inline bool checkAndUpdateGemmOptions(GemmOptions& options, bool isBlackwell, int tpGrpSize, bool updateOptions = true) { + options.mWorldSize = tpGrpSize; + if (options.mDtypeB == tg::Dtype::Void) { if (updateOptions) @@ -502,39 +576,98 @@ inline bool checkAndUpdateGemmOptions( } } + // If not specified, use the input dtypes as MMA dtypes (no cast required). + if (options.mDtypeMmaA == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaA = options.mDtypeA; + } + else + { + return false; + } + } + if (options.mDtypeMmaB == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaB = options.mDtypeB; + } + else + { + return false; + } + } + + // Check that the A cast is supported. + // Currently, we only support {MxFp4, NvFp4} -> Bf16, as well as E2m1 -> E4m3. + TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) + || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) + && options.mDtypeMmaA == tg::Dtype::Bfloat16) + || (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3), + "Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); + + // Check that the B cast is supported. + // Currently, we only support Fp8 -> MxFp8.
+ // TODO: add same support for A (no transpose) + TLLM_CHECK_ERROR((options.mDtypeB == options.mDtypeMmaB) + || (options.mDtypeB == tg::Dtype::E4m3 && options.mDtypeMmaB == tg::Dtype::MxE4m3), + "Unsupported cast for B: ", tg::dtypeToString(options.mDtypeB), " -> ", tg::dtypeToString(options.mDtypeMmaB)); + + if (options.mDtypeA != options.mDtypeMmaA) + { + TLLM_CHECK_ERROR(options.mTileM == 128, "TileM must be 128 when casting the input matrix A before the MMA."); + } + + if (options.mPatchF2fp) + { + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE2m1 && options.mDtypeMmaA == tg::Dtype::Bfloat16, + "PatchF2fp is only supported for MxFp4 to Bf16 casts."); + } + // FIXME: We do not support different dtypes for A and B when not on Blackwell. if (!isBlackwell) { - TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For non-Blackwell, A and B must have the same dtype."); + TLLM_CHECK_ERROR( + options.mDtypeMmaA == options.mDtypeMmaB, "For non-Blackwell, A and B must have the same dtype."); } // Check that the different dtypes for A and B are supported by the tensor core // kind::f8f6f4 - if (options.mDtypeA == tg::Dtype::E4m3 || options.mDtypeA == tg::Dtype::E2m1) + if (options.mDtypeMmaA == tg::Dtype::E4m3 || options.mDtypeMmaA == tg::Dtype::E2m1) { - TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::E4m3 || options.mDtypeB == tg::Dtype::E2m1, - "For E4m3/E2m1 A, B must also be E4m3/E2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 || options.mDtypeMmaB == tg::Dtype::E2m1, + "For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1."); } // kind::mxf8f6f4 - if (options.mDtypeA == tg::Dtype::MxE4m3 || options.mDtypeA == tg::Dtype::MxE2m1) + if (options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1, - "For dtypeA = MxE4m3 or MxE2m1, dtypeB must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == 
tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1, + "For dtypeMmaA = MxE4m3 or MxE2m1, dtypeMmaB must also be MxE4m3 or MxE2m1."); } - if (options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1) + if (options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE4m3 || options.mDtypeA == tg::Dtype::MxE2m1, - "For dtypeB = MxE4m3 or MxE2m1, dtypeA must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1, + "For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1."); } // kind::f16 - if (options.mDtypeA == tg::Dtype::Fp16 || options.mDtypeA == tg::Dtype::Bfloat16) + if (options.mDtypeMmaA == tg::Dtype::Fp16 || options.mDtypeMmaA == tg::Dtype::Bfloat16) { - TLLM_CHECK_ERROR(options.mDtypeB == options.mDtypeA, "For Fp16/Bfloat16 A, B must be the same type as A."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == options.mDtypeMmaA, + "For dtypeMmaA = Fp16/Bfloat16, dtypeMmaB must be the same as dtypeMmaA."); } - // When different dtype are used for A and B, we must use different tile to do the loading. + // When one of the inputs needs to be cast, we must use two load warps. + if ((options.mDtypeMmaA != options.mDtypeA || options.mDtypeMmaB != options.mDtypeB) + && !options.mUseTwoTmaLoadWarps) + { + TLLM_LOG_WARNING("Two TMA load warps must be enabled if any of the inputs needs to be cast."); + } + + // When different dtypes are used for A and B, we must use different tiles to do the loading. // It is not strictly required, but current implementation of SmemAb requires that. 
if (options.mDtypeA != options.mDtypeB) { @@ -547,7 +680,7 @@ inline bool checkAndUpdateGemmOptions( { if (updateOptions) { - options.mMmaKind = dtypeGetMmaKind(options.mDtypeA, options.mDtypeB); + options.mMmaKind = dtypeGetMmaKind(options.mDtypeMmaA, options.mDtypeMmaB); } else { @@ -555,11 +688,6 @@ inline bool checkAndUpdateGemmOptions( } } - if (options.mMmaKind == tg::MmaKind::Fp16) - { - TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For Fp16 MMA, A and B must have the same dtype."); - } - if ((options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) && options.mMmaK != 32) { @@ -626,10 +754,20 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); - TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4, - "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); - - int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 64 : 32; + int mmaK = 32; + if (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) + { + if (options.mMmaK == 96) + { + mmaK = 96; + TLLM_CHECK_ERROR(options.mTileK == 768, "When mmaK == 96, only tileK == 768 is supported"); + TLLM_CHECK_ERROR(options.mTileN <= 128, "When mmaK == 96, only tileN <= 128 is supported"); + } + else + { + mmaK = 64; + } + } if (options.mMmaK != mmaK) { int newTileK = mmaK * divUp(options.mTileK, mmaK); @@ -646,21 +784,74 @@ inline bool checkAndUpdateGemmOptions( } } - // TileN must be a multiple of the number of rows per SF tile. - int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 128 : 8; - TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsB == 0, "TileN (", options.mTileN, ") must be a multiple of ", - numSfTileRowsB, " for B SF layout ", tg::sfLayoutToString(options.mSfLayoutB)); // The MMA N may only be smaller than 64 if it is equal to the tile N. 
TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, ") must be >= 64 or equal to TileN (", options.mTileN, ")"); + } - int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); - int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); + if (options.mSfBlockSizeA.has_value()) + { + // Only E2m1 x E4m3 is tested. MxE2m1 x bf16 may also work. + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeB == tg::Dtype::E4m3, + "sfBlockSizeA is only supported for E2m1 and E4m3 types. Found dtypeA=", tg::dtypeToString(options.mDtypeA), + " dtypeB=", tg::dtypeToString(options.mDtypeB)); + + // sfBlockSizeA must be 16 or 32. + // SfBlockSizeA can also support 64 and 128, although they are not officially supported NVIDIA + // formats. Note that the type conversion needs to happen before TCs. + // For example, convert e2m1 to e4m3 inside TmemCastA. + // If we want to support sfBlockSizeA=8, we can write another version of convertE2m1ToSfE4m3, + // which only packs 8 e2m1 elements. 
+ TLLM_CHECK_ERROR(options.mSfBlockSizeA.value() == 16 || options.mSfBlockSizeA.value() == 32, "SfBlockSizeA (", + options.mSfBlockSizeA.value(), ") must be 16 or 32."); + } + + if (tg::dtypeIsBlockFmt(options.mDtypeA)) + { + int numEltsPerSfA = options.mSfBlockSizeA.value_or(tg::dtypeNumEltsPerSf(options.mDtypeA)); TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); + auto const numEltsPerSfAInK = options.mK / numEltsPerSfA; + TLLM_CHECK_ERROR(numEltsPerSfAInK % 4 == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, + ") must be a multiple of 4"); + } + if (tg::dtypeIsBlockFmt(options.mDtypeB)) + { + TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 + || options.mSfLayoutB == tg::SfLayout::Linear, + "Only the 128x4, 8x4 and linear SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); + + // TileN must be a multiple of the number of rows per SF tile. + int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 
128 : 8; + TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsB == 0, "TileN (", options.mTileN, ") must be a multiple of ", + numSfTileRowsB, " for B SF layout ", tg::sfLayoutToString(options.mSfLayoutB)); + + int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfB) == 0, "TileK (", options.mTileK, ") must be a multiple of ", (4 * numEltsPerSfB), " for typeB ", gemm::toString(options.mDtypeB)); + auto const numEltsPerSfBInK = options.mK / numEltsPerSfB; + TLLM_CHECK_ERROR(numEltsPerSfBInK % 4 == 0, "K dimension of scaling factors for B (", numEltsPerSfBInK, + ") must be a multiple of 4"); + } + + int32_t padMultiplierA = 1; + int32_t padMultiplierB = 1; + if (options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) + { + if (options.mDtypeA == tg::Dtype::MxE2m1) + { + padMultiplierA = 2; + } + if (options.mDtypeB == tg::Dtype::MxE2m1) + { + padMultiplierB = 2; + } } + TLLM_CHECK_ERROR((padMultiplierA * tg::dtypeGetNumBits(options.mDtypeA) * options.mK / 8) % 16 == 0, + "K dimension of A must be aligned to 16 bytes."); + TLLM_CHECK_ERROR((padMultiplierB * tg::dtypeGetNumBits(options.mDtypeB) * options.mK / 8) % 16 == 0, + "K dimension of B must be aligned to 16 bytes."); + if (options.mDtypeC == tg::Dtype::E2m1 || options.mDtypeC == tg::Dtype::MxE4m3) { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); @@ -668,8 +859,10 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mSfLayoutC == tg::SfLayout::R128c4 || options.mSfLayoutC == tg::SfLayout::R8c4, "Only the 128x4 and 8x4 SF layouts are supported for C."); int const numSfTileRowsC = options.mSfLayoutC == tg::SfLayout::R128c4 ? 128 : 8; - TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsC == 0, "TileN (", options.mTileN, ") must be a multiple of ", - numSfTileRowsC, " for C SF layout ", tg::sfLayoutToString(options.mSfLayoutC)); + int const tileTokenDim = options.mTransposeMmaOutput ? 
options.mTileN : options.mTileM; + TLLM_CHECK_ERROR_FMT(tileTokenDim % numSfTileRowsC == 0, + "Tile%s (%d) must be a multiple of %d for C SF layout %s", options.mTransposeMmaOutput ? "N" : "M", + tileTokenDim, numSfTileRowsC, tg::sfLayoutToString(options.mSfLayoutC).c_str()); int const hiddenDim = options.mTransposeMmaOutput ? options.mM : options.mN; int const hiddenGranularity = 4 * tg::dtypeNumEltsPerSf(options.mDtypeC); @@ -753,7 +946,6 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mM > 0 && options.mN > 0 && options.mK > 0, "M, N and K must be larger than 0"); TLLM_CHECK_ERROR(options.mNumSlicesForSplitK > 0, "Split K must be larger than 0."); - TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); if (options.mUseShuffledMatrixA) { @@ -774,6 +966,13 @@ inline bool checkAndUpdateGemmOptions( options.mClusterDimZ == 1 || options.mNumSlicesForSplitK > 1, "Cluster DimZ is only allowed for split-k."); TLLM_CHECK_ERROR(options.mTileM <= 128, "GEMM does not support TileM > 128."); + // FIXME: this is a bug in DeepSeek Fp8. + if (options.mUseDeepSeekFp8) + { + TLLM_CHECK_ERROR(options.mK % (options.mNumSlicesForSplitK * options.mTileK) == 0, + "K must be a multiple of TileK * numSlicesForSplitK for DeepSeekFp8"); + } + // When the A-matrix is shuffled, the output must be transposed. if (options.mUseShuffledMatrixA) { @@ -911,6 +1110,11 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR( options.mNumStagesMmaWithinWorkTile == 1, "Non-DeepSeekFp8 requires numStagesMmaWithinWorkTile == 1"); + if (options.mNumStagesMma > 1) + { + TLLM_CHECK_ERROR(options.mTileScheduler == TileScheduler::Persistent, + "Non-DeepSeekFp8 requires persistent scheduler when using numStagesMma >1"); + } } if (options.mUseDeepSeekFp8) { @@ -923,6 +1127,7 @@ inline bool checkAndUpdateGemmOptions( // Check that TileK = 128 for correct scaling of every 128 channels. 
TLLM_CHECK_ERROR(options.mTileK == 128, "Tile-K must be equal to 128 for DeepSeek Fp8"); + TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); // Tile sizes of the output hidden dimension. auto hiddenDimPerOutputTile = options.mTransposeMmaOutput ? options.mTileM : options.mTileN; auto hiddenDimPerEpilogueTile = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; @@ -995,17 +1200,36 @@ inline bool checkAndUpdateGemmOptions( ")"); } + // Number of iterations in K dimension after padding. + // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations. + // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is + // + // ceil(512 / (128*3)) * (128*3) = 768 + // + int const paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK); + int const perCtaK = paddedK / options.mNumSlicesForSplitK; + // However, number of iterations is clamped to multiples of tileK within individual CTAs + // E.g., K = 448, TileK = 64, numSlicesForSplitK = 4. + // + // paddedK = 512 + // perCtaK = 128 + // clampedPerCtaK for CTA 0, 1, 2 = 128 + // clampedPerCtaK for CTA 3 = 64 + int const paddingForK = paddedK - options.mK; + int const clampedAndPaddedPerCtaK = divUpMul(perCtaK - paddingForK, options.mTileK); if (options.mUseUnrollLoop2xForMma) { - bool notSupported = (options.mK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; - // Check that the 2*TileK is a multiple of MmaK when UnrollLoop2x is enabled. - // This is to avoid deadlock when mma runs even-numbered loop while the other warps run - // odd-numbered loop. + // Check that the padded K and clamped padded K (K rounded to next multiple of tileK) is a + // multiple of 2*TileK when UnrollLoop2x is enabled. This is to avoid deadlock when mma runs + // even-numbered loop while the other warps run odd-numbered loop. 
+ // + bool notSupported + = (perCtaK % (options.mTileK * 2) != 0) || (clampedAndPaddedPerCtaK % (options.mTileK * 2) != 0); if (notSupported) { TLLM_LOG_WARNING("Size K / splitK must be a multiple of TileK * 2. Found TileK=", options.mTileK, - " and K=", options.mK, " and numSlicesForSplitK=", options.mNumSlicesForSplitK, - ". Disabling unrollLoop2xForMma."); + " and K=", options.mK, " (paddedK=", paddedK, " clampedAndPaddedPerCtaK=", clampedAndPaddedPerCtaK, + ") and numSlicesForSplitK=", options.mNumSlicesForSplitK, ". Disabling unrollLoop2xForMma."); if (updateOptions) { options.mUseUnrollLoop2xForMma = false; @@ -1016,6 +1240,11 @@ inline bool checkAndUpdateGemmOptions( } } } + if (options.mNumSlicesForSplitK > 1) + { + TLLM_CHECK_ERROR(perCtaK * (options.mNumSlicesForSplitK - 1) < options.mK, + "K must be greater than perCtaK * (numSlicesForSplitK - 1) to ensure each CTA has work"); + } if (!isBlackwell && options.mTileScheduler == TileScheduler::Persistent) { @@ -1059,43 +1288,108 @@ inline bool checkAndUpdateGemmOptions( // // Kernel 1: ----PREEXIT-----------FLUSH // Kernel 2: -------PREEXIT----ACQBULK---FLUSH - // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible ----------------------- - // Warp 1: ---- (!) We normally assume that 1 is visible is not yet visible- - // Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible ---------- + // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible + // ----------------------- + // Warp 1: ---- (!) 
We normally assume that 1 is visible is not yet + // visible- Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible + // ---------- TLLM_CHECK_ERROR((options.mGridWaitForPrimaryA || !options.mGridTriggerSecondaryA), "A: If a task triggers a secondary kernel, it must also wait for primary kernel."); TLLM_CHECK_ERROR((options.mGridWaitForPrimaryB || !options.mGridTriggerSecondaryB), "B: If a task triggers a secondary kernel, it must also wait for primary kernel."); + if (options.mUsePerTokenSfA || options.mUsePerTokenSfB) + { + // Checks applicable to both MetaFP8 and RoutingScalesOnInput + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "DeepSeek FP8 and per-token scaling are not compatible"); + TLLM_CHECK_ERROR(isBlackwell, "Per-token scaling is not supported for Hopper"); + if (options.mUsePerTokenSfA && options.mUsePerTokenSfB) + { + // MetaFP8 case + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, + "A and B dtype must be E4m3 for Meta Fp8. 
Found dtypeA=", tg::dtypeToString(options.mDtypeA), + " dtypeB=", tg::dtypeToString(options.mDtypeB)); + } + else + { + // RoutingScalesOnInput case + TLLM_CHECK_ERROR((options.mUsePerTokenSfA && !options.mTransposeMmaOutput) + || (options.mUsePerTokenSfB && options.mTransposeMmaOutput), + "In RoutingScalesOnInput mode, perToken scales must be used on activations"); + } + } + // The generation should support non K-major layouts for both A and B; however, it is unclear if // there is a use-case - TLLM_CHECK_ERROR(!options.mTransposeMatrixA || options.mTransposeMatrixB, - "TransposeA true and TransposeB false is not supported"); + TLLM_CHECK_ERROR((options.mLayoutA == MatrixLayout::MajorK) || (options.mLayoutB == MatrixLayout::MajorK), + "At least one matrix must be in k-major layout"); // Some features are currently only support when both matrices are in K-major format - if (options.mTransposeMatrixA || !options.mTransposeMatrixB) + if (options.mLayoutA != MatrixLayout::MajorK || options.mLayoutB != MatrixLayout::MajorK) { TLLM_CHECK_ERROR(isBlackwell, "Non K-major layouts are only supported on Blackwell"); TLLM_CHECK_ERROR(options.mSplitK == SplitK::None, "Non K-major layouts do not support split K"); } - if (options.mTransposeMatrixA) + if (options.mLayoutA == MatrixLayout::MajorMn) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeA) >= 8, "Subbyte types only support K major layout"); } - if (!options.mTransposeMatrixB) + if (options.mLayoutB == MatrixLayout::MajorMn) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeB) >= 8, "Subbyte types only support K major layout"); } + if ((options.mLayoutA == MatrixLayout::BlockMajorK) || (options.mLayoutB == MatrixLayout::BlockMajorK)) + { + bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; + + // Block K size must be 128B. 
+ // TODO Leaving this as an option for now in case we want to experiment with other block sizes + // As the user is not expected to set this, do not fail if updateOptions is false + int32_t const elemSizeInBits + = (isBlockA) ? tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); + int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; + + if (options.mBlockK != elemsIn128B) + { + if (updateOptions) + { + options.mBlockK = elemsIn128B; + } + else + { + return false; + } + } + + if (options.mBlockK > options.mTileK) + { + TLLM_CHECK_ERROR(options.mBlockK % options.mTileK == 0, + "If block size is greater than tile size, block size must be a multiple of tile size"); + } + else if (options.mBlockK < options.mTileK) + { + TLLM_CHECK_ERROR(options.mTileK % options.mBlockK == 0, + "If tile size is greater than block size, tile size must be a multiple of block size"); + } + } + + if (!isBiasTypeNone(options.mBiasType)) + { + TLLM_CHECK_ERROR(!isBiasTypeMn(options.mBiasType), "BiasType::Mn is not supported"); + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "Bias is not supported for DeepSeek Fp8"); + TLLM_CHECK_ERROR(!(options.mUsePerTokenSfA && options.mUsePerTokenSfB), "Bias is not supported for Meta Fp8"); + } + if (updateOptions) { // Init kernel traits. 
options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc, - options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, options.mEpilogueTileM, - options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, options.mNumSlicesForSplitK, - options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, options.mTransposeMmaOutput, - options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, options.mUseDeepSeekFp8, - options.mUsePerTokenSfA, options.mUsePerTokenSfB); + options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mMmaK, options.mTileM, options.mTileN, + options.mTileK, options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, + options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, + options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, + options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType); } return true; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h index 5d55ff418b4..190997b5fc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h @@ -28,113 +28,175 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "744dc79e" -#define TLLM_GEN_EXPORT_VERSION "6.0" +#define TLLM_GEN_COMMIT "9c8dacbc" +#define TLLM_GEN_EXPORT_VERSION "7.0" -static constexpr size_t tllmGenGemmListLen = 46; +static constexpr size_t tllmGenGemmListLen = 76; #ifndef EXCLUDE_SM_100 -extern unsigned char GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned 
char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; 
-extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned 
char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char 
Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char 
Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char 
Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int 
GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int 
GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int 
GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int 
Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int 
Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int 
Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const gemm::GemmConfig tllmGenGemmList[] = { #ifndef EXCLUDE_SM_100 -{GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 150528, "gemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "e6d3dc384d67ec4c7800c348329fa27e78de2f9c75a99773c32bdaa218c6f7c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK 
*/ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -142,6 +204,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -158,6 +222,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -173,185 +239,209 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 0 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 128 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "bf4b912ef49aecd710a765e65f102200730ab034f486451363aee8aef4b7a469", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* 
mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 128 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, 
-{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "5f0e1254acfb37e63c838b6fd47d45a5fe9e9213f2c95fbd62df13c56825bda8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* 
mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 128 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, 
-{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 168960, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "724a4aa2483618e369cceccdd8809ad2ef94cfead480b3740619011538a79830", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* 
mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 -, /* mMmaN */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 2 , /* mNumStagesMma */ 1 @@ -359,464 +449,524 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, 
-{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "e2e8e1949e682b104646123cec2efd2e200edd0bd203cbbd8af540d4f94460b5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* 
mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, 
-{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "d1121bb8d3332bdb949fe9a60bedc665ba066978beb35d0777eda26b8996af63", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 1 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB 
*/ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "ed0af1af18a02f6f642202a898f65e5fd34cb0d243d0f013bab9e86b74913823", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryA */ 0 , 
/* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, 
gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "e1c9baccce47d5a341502cb4df5f20b19a6f72fc62542502e54302d878719b6e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 1 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 , 
/* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 +, /* mMmaM */ 128 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* 
mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "3c938c1017505ab5cc2d4a8845de078772611ee36a57aac495bb81107f8d2cd7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 1 +, /* 
mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "f857c48a0bfc7af6d69eb2cb9e3dc2575dc0deb2b1d09ec8b30aee2e7c08f4c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 1 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* 
mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ 
trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "d297f004c49cbe68ce3b450b250356ad4b85ac616d750932fd32f6692d0f04ee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* 
mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "13a961a704d5c176d112037d12fb58cbfb313de3054eb438c427d357cd0e5330", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -824,27 +974,31 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* 
mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 8 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -855,30 +1009,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, 
GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 217088, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "660709d5dcefc3a486cffff988d56f1b3645c366194c26a6a767d682dbffc286", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -886,199 +1044,225 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 , /* mHoistLoadTaskInit */ 1 -, /* 
mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 64 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, 
GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 168960, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "f4da56a4f2a0f149ae01be597605cd9f86f1f4d1ce93a45265a9617f410436cd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* 
mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 2 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 225280, 
"gemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "0060cbd3ef7e85bee35925e8f85252fc963b9fc1a8d1d3f030e81ddfd44c9219", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ 
gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 158720, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "b2720b46204d7aaa71efb6f4c231f50eba5a4efe8a29824622b5b7eaac1ccff9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileN */ 16 , /* mGridTriggerSecondaryA */ 1 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -1088,11 +1272,13 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 
256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -1103,44 +1289,50 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 128 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 158720, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* 
mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "690f6d0591ee95a2a06f2609ebf414374e3e9ea3cf15b10b25f71a6aa0f61d1f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileN */ 32 , /* mGridTriggerSecondaryA */ 1 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -1150,11 +1342,13 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -1165,106 +1359,120 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA 
*/ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 128 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 218112, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, 
"gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "eb2f813b85c713797e2253a58f7531f863ccb4df7af63131f4fbba0441b5a308", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 , /* mGridTriggerSecondaryA */ 1 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 0 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 +, /* mMmaM */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* 
mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 0 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 82944, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, 
"2cea16187c12841a47cbabec23f1c313b96a88c5732441cf073e5989babc79c3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 , /* mGridTriggerSecondaryA */ 1 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -1274,11 +1482,13 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 16 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -1289,44 +1499,50 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileM */ 128 
+, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 82944, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "f0cacc9e31348d5606fff0450d5598f3fa303b528a7d2e4152fad04eaba02ae7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ 
trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 , /* mGridTriggerSecondaryA */ 1 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -1336,11 +1552,13 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 16 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -1351,58 +1569,66 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* 
mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 93184, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "919a6393be3c1e23bc2bccb06897836ff35fadb5d24fe4489cbd9ec3243f1cf7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 1 -, /* 
mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 32 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -1413,58 +1639,66 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, 
gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 93184, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "bbbc5003dea9407d41e8367623bd3c22293d5671fcbbeb832a51e2fbcfec18c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* 
mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 32 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -1475,347 +1709,2493 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 115712, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 217088, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "918ef95236bc2f3c81e2ed9989be3bfb016e6eb65d271e42d3d8eb186eddd5c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 
0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, 
"gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "307418470d4ef813c1a7c5575055c58c371609a62851093f5995c8d0d3e8ace2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(0) - }, 
gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 115712, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 225280, "gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "5b9975720db352c071a1b858fa786d49a8a000bf9b6342fc946d55bdfd8cc5b6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* 
mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 
, /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "e20bc9d5f9616133a3437631e9402f94b1d5d5139d34d2bfabe56f18d799fb6a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 
, /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 
124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "c5ed5c93263514c16d0ebef8c8a485c184a7922debc9845a4f3e219a61f37b89", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ 
gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "273f8a4cadeab850f02a525f9977d3896b5b1f01061fe901af63dc28eb260b8d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ 
trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "588b30af891d54e8368ad0c6f3e01e26f18973c414148447c805a756c36c48d2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ 
trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "43c6f1dee40555f5e2a90e928a236ed4d9ded2f20839cffa094691eaac22f54b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* 
mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "943937ffbda786b260fb77c3e364edbd9938cbae533b71d0372ab2dcb8751860", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* 
mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "c6fcdecba1038f8536f359dcbae20a1ea65380cc2c405a093ef86ea6291a576d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) 
+, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "c69fa4e954d2d67b37f5fe623540d89b4035fc5cbcbfc31d1840098856c1153b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "1bf023d6823fee204172f676a8a4c0ccaa402ebb7e72a516a32b13710167f7c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* 
mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "35d746b34db52e771410ad7a31f7d527208e4dd3413bda287ef92eb320d51cb3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* 
mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 
+, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 158720, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "3b40d2d78135ddfb5cf0405934dddb16108610969752875056fdedbbb7ce51dd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, 
/* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 158720, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "26f71a8873dc4a3f462e21db7a63386478b10a900dd747a38a19e423583a99db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* 
mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 218112, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "f9ab9cdce6f8ea34183ead171b97e210c7e5d66d2fca298a03be7a7d3738d1f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ 
trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, 
gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 82944, "gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "17b8288b4273a425e1548c8bbad822e7467b4a428a53b07a97cb9667442bdaed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 82944, "gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "ec20681a8abfc2dcaadda085086ae17d37db4273fa42aa7f508bb1d32cb9b10b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* 
mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 93184, "gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "57ca8aca293859dcdc2f4482d3349f88c1f58a20da17390c03a54d5eb227a1d1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ 
trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, 
gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 93184, "gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "88f8de20196e4d8e1ba1f3f1459962cd1244eea7962603910b042df1633a383f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 115712, "gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "85cbfa66d3701626b7c73c4ea6b60b997f8b58f9aa014f3e85b4d7aabfb2b875", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* 
mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 115712, "gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "9c15d9d240ab1f460b50077f258db134d99f499c1a2813335fcad3265b7ab1b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc 
*/ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, 
gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "42e71f1fc0c608ed658311ece12ab39d89b0eee567ed7cc5020526d0ff30a896", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* 
mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "b1abe932dd0269b773df87b58a2d6d4d4ead87fc0cf2bf654dac1d2c2107a214", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* 
mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 1 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 216064, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "b22f88d0a8867940becf94cae6655aa123bf379564faa93395c060f06cb4111b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, 
/* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "ef0d52a9af301dad63d2f051fb61980e733eb831029c273ecd45299f97eaaba2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "8b12b2e138bc46504264df8cdf60bf7da8b720dcb49dc91a7e451aa25e3bea17", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} 
+, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "77f3ca8ee57365b17eb722db2f3f58bc3ff672a17db48486820158c8a4555ddf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "2a110d111ba58d98d5e1b0ffe643fc7c023b980b2d09f7d03552b0a4672b5084", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "edfc7011e09fabc382600cbab66146991be33cef1f7b9ebcf41532dea225e814", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, 
/* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "18501bcb133c5d65eabe1eab28353e5125159d2b00dd7421ffa04fb989bb2c91", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX 
*/ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* 
mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "7df5042c43431af636fb38ac5c37eea71a56fd283e1f268f77cd8195c11c6c9e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 
0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "4896c2a3cba708b1b0514460c6d9b1fb830b5fbd0eb37bd1e9b4fe7422347c0c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 
+, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* 
mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "232fe5043b773dd2ded2b3bbdd4f87bf058811cb93fca1becd8b8b28d8d1766a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 , /* 
mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 +, /* mMmaM */ 128 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) 
-, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 216064, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "824abe26e01fc2e993f7d2328314ff8c7753bc042388fcf63d9211bb75e49206", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN 
*/ 64 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 2 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "4e1d452d16e5c2e0c4d9b1cb1f333bd2a814cb7b032efca3877b07093297dacf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1824,14 +4204,16 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* 
mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1841,98 +4223,110 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mN */ 256 , /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 2 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, 
GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 150528, "gemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "ab99969c21da234acc1a4ea9e7651f807d383d20a0797e13ecd5f7381708c92d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 +, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ 
gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 -, /* mMmaN */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 2 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "d1b195aa2274d75333b046cf0af6872cde53652df63e9bd0b1bd440e8ce11071", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1940,6 +4334,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1956,6 +4352,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1971,30 +4369,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* 
mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 128 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, 
"gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "bdeae336af848195a201753690a53bbca4b983f9a1d358264d76f365beb3765b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2002,6 +4404,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2018,6 +4422,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2033,30 +4439,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 128 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , 
/* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 168960, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 168960, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "8da09f053ef3779830cd84c7fbf5134f2ea72dbab75a50f433c9e75eaf78906b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2064,6 +4474,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2080,6 +4492,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA 
*/ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2095,30 +4509,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 0 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 128 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, 
/* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "f56950feba67b6d510e0fcc3c3ffee5951e6f595c2e906eee7d6210fd0ed471e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2126,6 +4544,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2142,6 +4562,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2157,30 +4579,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileK 
*/ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "8ba67335fa749050e10cefddd57aab6ba85b763d3f51195208ec4cf159c0c630", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2188,6 +4614,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ 
trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2204,6 +4632,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2219,30 +4649,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "d1a1757399bd715d73a6832ee2c72171a4c0acffbb0e4137aaf93120123b870a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2250,6 +4684,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2266,6 +4702,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2281,30 +4719,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 
std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, 
"4ae92b97c4a99ae20154561832c1a6c3d5c126e4b732e0f1bea71ac58a7f3dfc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2312,6 +4754,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2328,6 +4772,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2343,30 +4789,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps 
*/ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "dc668f964bbe93deac8a8fc69b1d8f4b25266bd46b8d7ddcfdf0081f6a2bc4f3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2374,6 +4824,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2390,6 +4842,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, 
/* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2405,30 +4859,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "ab2766c1b754f66bc83f7cb4aeb80f71664dadb22b1a8cff9c34e155cf105f9f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2436,6 +4894,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2452,6 +4912,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2467,30 +4929,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 
+, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "e1b9436e0c49e0a569110c3506dfdb9fcc9918f46dd3cefaacd1fe4fc22b54ef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2498,6 +4964,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* 
mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2514,6 +4982,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2529,30 +4999,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "229c795f95014f0f3c7ed840159c88a0c3fb98e349dec882c3e9639dcb7fd7e4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2560,6 +5034,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2576,6 +5052,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2591,30 +5069,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt 
+, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 128 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 1 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 217088, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 217088, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "4269d0c59e4d46511d9df9fe97ac1cbf12f163e933e43405b8787d35cf7b7b07", { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2622,6 +5104,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2638,6 +5122,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2653,30 +5139,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ 
trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "ad5759c6ab7008c37008fab54349e12c05f8590073ca9b2a61d2580c3475f27d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 @@ -2684,6 +5174,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2700,6 +5192,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* 
mMmaKind */ trtllm::gen::MmaKind(2) @@ -2715,30 +5209,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 225280, "gemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, 
Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 225280, "gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "e07aa099d0dcf6e0d778beb056bbe1fd6578ee2a9883952c31fc4c2045187741", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2746,6 +5244,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2762,6 +5262,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -2777,30 +5279,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 183296, "gemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 183296, "gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "8ec82dff1e9ec601f811eca74c26f5152ddfba65f0f4d3cba2b575d8b517ca22", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2808,6 +5314,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2824,6 
+5332,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -2839,30 +5349,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 0 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 128 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 227328, 
"gemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 227328, "gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "6c4322f586dce2bd0c54ecf5ac20810aa07978d6b05016e342863ccd3bf0210d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2870,6 +5384,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2886,6 +5402,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -2901,30 +5419,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* 
mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 224256, "gemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 224256, "gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "6626a12d3f1b7fb2d19b62df0fa6f12c3bee4fa861a5ae1ac339f7bdc7c4b17e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY 
*/ 1 , /* mClusterDimZ */ 1 @@ -2932,6 +5454,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2948,6 +5472,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -2963,33 +5489,34 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 +, /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, 
gemm::SmVersion::Sm100a }, +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, #endif // EXCLUDE_SM_100 }; // clang-format on - } // namespace kernels } // namespace tensorrt_llm } // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h index 142e9728dcc..2776b90aadf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h @@ -22,6 +22,10 @@ #include "Enums.h" #include "TmaDescriptor.h" +// NOTE: keep this code dependency free. It has to be included by the device code and has to be +// compilable with NVRTC. +#include "KernelParamsDecl.h" + namespace gemm { @@ -29,535 +33,307 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// - namespace tg = trtllm::gen; -//////////////////////////////////////////////////////////////////////////////////////////////////// - -struct KernelParams +namespace KernelParamsSetup { #ifdef TLLM_ENABLE_CUDA - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Gemm parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // TMA descriptor for A. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If transposeMatrixA is false - // Logical shape is [M, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileM, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeA. - // - // If transposeMatrixA is true - // Logical shape is [K, M]. - // Logical strides are [M, 1]. - // Tile box shape is [tileK, tileM]. - // Tile box strides are [tileM, 1]. 
- // Dtype is set from options.mDtypeA. - CUtensorMap tmaA; - - // TMA descriptor for B. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If transposeMatrixB is true - // Logical shape is [N, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileN, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeB. - // - // If transposeMatrixB is false - // Logical shape is [K, N]. - // Logical strides are [N, 1]. - // Tile box shape is [tileK, tileN]. - // Tile box strides are [tileN, 1]. - // Dtype is set from options.mDtypeB. - CUtensorMap tmaB; - - // TMA descriptor for C, (when useTmaStore is true) - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideC. - // - // If transposeMmaOutput is false, - // Logical shape is [M, N]. - // Logical strides are [N, 1]. - // Tile box shape is [epilogueTileM, epilogueTileN]. - // Tile box strides are [epilogueTileN, 1]. - // Dtype is set from options.mDtypeC. - // - // If transposeMmaOutput is true, - // Logical shape is [N, M]. - // Logical strides are [M, 1]. - // Tile box shape is [epilogueTileN, epilogueTileM]. - // Tile box strides are [epilogueTileM, 1]. - // Dtype is set from options.mDtypeC. - CUtensorMap tmaC; - - // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of scaling factors for A is always R128c4 - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // K must be a multiple of 4P. - // The "logical" shape is: [M, K / P]. - // The R128c4 layout is: [⌈M / 128⌉, K / P / 4, 512]. - // The shape we use for TMA is: [⌈M / 128⌉, K / P / 4, 2, 256]. - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. 
- CUtensorMap tmaSfA; - - // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of scaling factors for B is controlled by options.mSfLayoutB. - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // The "logical" shape is: [N, K / P] - // - // If the layout is R128c4, - // K must be a multiple of 4P. - // The R128c4 layout is: [⌈N / 128⌉, K / P / 4, 512] - // The shape we use for TMA is: [⌈N / 128⌉, K / P / 4, 2, 256] - // - // If the layout is R8c4, - // K must be a multiple of 4P. - // The R8c4 layout is: [⌈N / 8⌉, K / P / 4, 32] - // The shape we use for TMA is: [⌈N / 8⌉, K / P / 4 / r, r * 32] - // where r = min(tileK / P / 4, 8) - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. - CUtensorMap tmaSfB; - - // The output matrix C. The data type is controlled by options.mDtypeC. - // - // When transposeMmaOutput is true, the shape is [N, M]. - // Otherwise, the shape is [M, N]. - // Elements in a given row are stored contiguously in memory (row-major). - void* ptrC; - - // The block scaling factors to dequantize A. - // - // If DeepSeek FP8 recipe is used: - // If transposeMmaOutput is false, shape is [K / 128, M]. - // Otherwise, shape is [M / 128, K / 128]. - // The rightmost dimension is contiguous in memory. - // - // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: - // The layout and data type is the same as explained in tmaSfA. - // - // Otherwise should be set to nullptr. - void const* ptrSfA; - - // The scaling factors to dequantize B. - // - // If DeepSeek FP8 recipe is used: - // If transposeMmaOutput is false, shape is [N / 128, K / 128]. - // Otherwise, shape is [K / 128, N]. - // The rightmost dimension is contiguous in memory. 
- // - // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: - // The layout and data type is the same as explained in tmaSfB. - // - // Otherwise should be set to nullptr. - void const* ptrSfB; - - // The per-token scaling factors from scale A. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is not - // transposed). The dtype is Dtype::Bfloat16 - // - // The shape is [M] - void const* ptrPerTokenSfA; - - // The per-token scaling factors from scale B. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is - // transposed). The dtype is Dtype::Bfloat16 - // - // The shape is [N] - void const* ptrPerTokenSfB; - - // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also - // used for the DeepSeek FP8 recipe. - // - // For DeepSeek FP8 recipe: - // If transposeMmaOutput is false, shape is [N / 128, M]. - // Otherwise, shape is [M / 128, N]. - // The rightmost dimension is contiguous in memory. - // - // For MxFp{4,8} and NvFp4 formats: - // If transposeMmaOutput is false, shape is [M, N / 16]. - // Otherwise, shape is [N, M / 16]. - // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). - void* ptrSfC; - - // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. - // Shape is [1]. - float const* ptrScaleC; - - // The M dimension. - // It is the total number of tokens if A is the activation matrix. - // It is the total number of output channels if A is the weight matrix. - int32_t m; - // The N dimension. 
- // It is the total number of tokens if B is the activation matrix. - // It is the total number of output channels if B is the weight matrix. - int32_t n; - // The K dimension. It is the hidden dimension of the input matrices. - int32_t k; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // All-reduce parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // The rank id of the current device in the multi-gpu space. - int rank; - // The number of peer devices in tensor-parallel group. - int tpGrpSize; - // Pointer for output with multicast mapping. It is used by the "reduce" op (LDGMC.ADD) of the - // two-shot reduce-scatter phase. - // The shape is [M, N] and the dtype is float. - void* multimemC; - - // The barriers in global memory. - // - // The kernel arrives at (with release ordering) the multicast mapping of the barrier to broadcast - // amongst peer devices. It then waits (with acquire ordering) for the unicast mapping of the - // barrier. - // - // Flags in global memory that sync on "entrance" of reduce-scatter phase in two-shot all-reduce. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the unicast memory created with IpcNvlsHandle. - // Must be set to 0 before the kernel launch. - void* ptrTileBars; - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the multicast memory created with IpcNvlsHandle. - void* multimemTileBars; - - // Flags in global memory that sync on "exit" after the all-reduce finishes. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the unicast memory created with IpcNvlsHandle. - // Must be set to 0 before the kernel launch. - void* ptrCompletionBars; - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. 
- // The pointer to the multicast memory created with IpcNvlsHandle - void* multimemCompletionBars; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Miscellaneous parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // The barriers in global memory for Split-k reduction with exchange in GMEM. - // Each CTAs arrives at the barrier and blockIdx.z == gridDim.Z - 1 waits for the barrier to flip - // to perform a reduction. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // For DeepSeek FP8 recipe, the shape is [numTilesM * numTilesN * 2]. - // The memory must be set to 0 before the kernel launch. - void* ptrSplitKCompletionBars; - - // Pointer to the memory holding the partial sums for split-K in GMEM. - // The shape is [numSlicesForSplitK, numSlicesForSliceK, numTilesM * tileM, numTilesN * tileN]. - // The dtype is dtypeAcc, i.e. float. - void* ptrPartialSumsForSplitK; - - // In some cases, some CTAs need to exit early. E.g. when the grid is statically set, but the - // actual workload is decided at runtime. This device pointer maps to the number of non exiting - // CTAs in the X dim of the grid when transposeMmaOutput is false. And the Y dim, otherwise. - // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. - int32_t* ptrNumNonExitingCtas; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Miscellaneous parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - enum class MatrixType - { - MatrixA = 0, - MatrixB - }; - // Create the TMA shape/stride for A/B. - template - static auto makeTmaShapeStrideAb(GemmOptions const& options, MatrixType matrixType) - { - // The outer dimension. - auto numTokens = (matrixType == MatrixType::MatrixA) ? 
options.mM : options.mN; - // The inner dimension. - auto hiddenSize = options.mK; - // The cute tensor shape for A/B: (numTokens, hiddenSize). - // Note that TMA descriptor expects the first dimension's stride to be - // 1, so swap the first two dimension so that the hiddenSize dimension comes first. - auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; - - // Assemble the stride (strideTokens, 1). - // Swap the first two dimension as mentioned before. - auto stride = std::vector{1, static_cast(hiddenSize)}; +using MatrixType = KernelParams::MatrixType; +// Create the TMA shape/stride for A/B. +template +static auto makeTmaShapeStrideAb(GemmOptions const& options, MatrixType matrixType) +{ + // The outer dimension. + auto numTokens = (matrixType == MatrixType::MatrixA) ? options.mM : options.mN; + // The outer dimension tile size. + auto tileMn = (matrixType == MatrixType::MatrixA) ? options.mTileM : options.mTileN; + // The inner dimension. + auto hiddenSize = options.mK; + // The cute tensor shape for A/B: (numTokens, hiddenSize). + // Note that TMA descriptor expects the first dimension's stride to be + // 1, so swap the first two dimension so that the hiddenSize dimension comes first. + auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; + + // Assemble the stride (strideTokens, 1). + // Swap the first two dimension as mentioned before. + auto stride = std::vector{1, static_cast(hiddenSize)}; + + // Assemble the box shape + std::vector tileShape = {options.mTileK, tileMn}; + + MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? 
options.mLayoutA : options.mLayoutB; + if (layout == MatrixLayout::MajorMn) + { // Apply transpose if necessary - if ((matrixType == MatrixType::MatrixA && options.mTransposeMatrixA) - || (matrixType == MatrixType::MatrixB && !options.mTransposeMatrixB)) + std::swap(shape[0], shape[1]); + stride[1] = numTokens; + std::swap(tileShape[0], tileShape[1]); + } + else if (layout == MatrixLayout::BlockMajorK) + { + // Set shapes based on blocking layout + shape = {static_cast(options.mBlockK), static_cast(numTokens), + static_cast(options.mK / options.mBlockK)}; + stride = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK)}; + + // If blockK > tileK, then the inner most box size will be based on the tile + int32_t const tileBlockK = std::min(options.mBlockK, options.mTileK); + tileShape = {tileBlockK, tileMn, options.mTileK / tileBlockK}; + } + + return std::make_tuple(shape, stride, tileShape); +} + +// Create the TMA shape/stride for C. +template +static auto makeTmaShapeStrideC(GemmOptions const& options) +{ + // The number of tokens. + auto numTokens = options.mTransposeMmaOutput ? options.mN : options.mM; + // The hidden dimension. + auto hiddenSize = options.mTransposeMmaOutput ? options.mM : options.mN; + // Note that TMA descriptor expects the first dimension's stride to be + // 1, so swap the first two dimension so that the hiddenSize dimension comes first. + auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; + + // Assemble the stride (strideTokens, 1). + // Swap the first two dimension as mentioned before. + auto stride = std::vector{1, static_cast(hiddenSize)}; + + return std::make_tuple(shape, stride); +} + +// Create the TMA shape/stride for A/B block scaling factors. +template +static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) +{ + // The outer dimension. + auto numTokens = matrixType == MatrixType::MatrixA ? 
options.mM : options.mN; + // The inner dimension. + auto hiddenSize = options.mK; + // The outer tile dimension. + auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; + // The inner tile dimension. + auto hiddenSizePerTile = options.mTileK; + // The dtype of the matrix. + tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; + // Number of elements per scaling factor. + int32_t const numEltsPerSf = (matrixType == MatrixType::MatrixA && options.mSfBlockSizeA.has_value()) + ? options.mSfBlockSizeA.value() + : (tg::dtypeIsBlockFmt(matrixDtype) ? tg::dtypeNumEltsPerSf(matrixDtype) : 32); + + switch (layout) + { + case tg::SfLayout::R128c4: + { + // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. + // The 512B block maps to a 32x16B (32x128b) block in TMEM. + // See https://nvbugspro.nvidia.com/bug/4165523 + // + // Additionally, we have to meet constraints of TMA that the box dimensions are less + // than 256 and boxDim[0] is a multiple of 16B. + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The aforementioned format is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 512] + // The shape we use for TMA is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 2, 256] + + auto shape = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)), + static_cast(tg::ceilDiv(numTokens, 128))}; + + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) { - std::swap(shape[0], shape[1]); - stride[1] = numTokens; + stride[i] = shape[i - 1] * stride[i - 1]; } - return std::make_tuple(shape, stride); - } + auto tileShapes + = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), + static_cast(tg::ceilDiv(numTokensPerTile, 128))}; - // Create the TMA shape/stride for C. - template - static auto makeTmaShapeStrideC(GemmOptions const& options) - { - // The number of tokens. 
- auto numTokens = options.mTransposeMmaOutput ? options.mN : options.mM; - // The hidden dimension. - auto hiddenSize = options.mTransposeMmaOutput ? options.mM : options.mN; - // Note that TMA descriptor expects the first dimension's stride to be - // 1, so swap the first two dimension so that the hiddenSize dimension comes first. - auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; - - // Assemble the stride (strideTokens, 1). - // Swap the first two dimension as mentioned before. - auto stride = std::vector{1, static_cast(hiddenSize)}; - - return std::make_tuple(shape, stride); + return std::make_tuple(shape, stride, tileShapes); } - // Create the TMA shape/stride for A/B block scaling factors. - template - static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) + case tg::SfLayout::R8c4: { - // The outer dimension. - auto numTokens = matrixType == MatrixType::MatrixA ? options.mM : options.mN; - // The inner dimension. - auto hiddenSize = options.mK; - // The outer tile dimension. - auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; - // The inner tile dimension. - auto hiddenSizePerTile = options.mTileK; - // The dtype of the matrix. - tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; - // Number of elements per scaling factor. - int32_t const numEltsPerSf = (matrixDtype == tg::Dtype::E2m1) ? 16 : 32; - - switch (layout) + // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. + // + // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use + // fewer read requests, if the tile dimensions allow. It does not reduce the number of + // instructions. 
+ // + // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] + // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] + // + // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of + // NumRepeats * numEltsPerSf * 4. + + // Detect if the supplied factor is power of 2. E.g., 0b0100 and (0b0100 - 1) == 0b0000. + int const r = options.mSfReshapeFactor; + if (r > 0 && (r & (r - 1)) != 0) { - case tg::SfLayout::R128c4: - { - // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. - // The 512B block maps to a 32x16B (32x128b) block in TMEM. - // See https://nvbugspro.nvidia.com/bug/4165523 - // - // Additionally, we have to meet constraints of TMA that the box dimensions are less - // than 256 and boxDim[0] is a multiple of 16B. - // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The aforementioned format is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 512] - // The shape we use for TMA is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 2, 256] - - auto shape = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)), - static_cast(tg::ceilDiv(numTokens, 128))}; - - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) - { - stride[i] = shape[i - 1] * stride[i - 1]; - } - - auto tileShapes - = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), - static_cast(tg::ceilDiv(numTokensPerTile, 128))}; - - return std::make_tuple(shape, stride, tileShapes); + throw std::runtime_error("mSfReshapeFactor must be positive and a power of 2. Found " + std::to_string(r)); } - case tg::SfLayout::R8c4: + // Sanitize number of repeats so it doesn't exceed the dimension. 
+ int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); + + // Detect if the input hidden size K is a multiple of the repeats. + if (tg::ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) { - // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. - // - // As the inner dimension (k) is required to be a multiple of the tile size, we - // can reshape to use fewer read requests, if the tile dimensions allow. - // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) - // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The 8x4 SF layout is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 32] - // The TMA tensor shape is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf * r), r * 32] - - int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), 8); - - auto shape = std::vector{static_cast(repeats * 32), - static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), - static_cast(tg::ceilDiv(numTokens, 8))}; - - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) - { - stride[i] = shape[i - 1] * stride[i - 1]; - } - - auto tileShapes = std::vector{static_cast(repeats * 32), - static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), - static_cast(tg::ceilDiv(numTokensPerTile, 8))}; - - return std::make_tuple(shape, stride, tileShapes); + throw std::runtime_error("SF hiddenSize K (" + std::to_string(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)) + + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); } - default: throw std::runtime_error("Unsupported SF layout"); + auto shape = std::vector{static_cast(repeats * 32), + static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), + static_cast(tg::ceilDiv(numTokens, 8))}; + + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) + { + stride[i] = shape[i - 1] * stride[i - 1]; } - return 
std::make_tuple(std::vector{}, std::vector{}, std::vector{}); + + auto tileShapes = std::vector{static_cast(repeats * 32), + static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), + static_cast(tg::ceilDiv(numTokensPerTile, 8))}; + + return std::make_tuple(shape, stride, tileShapes); } - // Setup the kernel parameters. - template - static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, - void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void* ptrC, - void* ptrSfC, void* multimemC, float* ptrScaleC, void* ptrPartialSumsForSplitK, void* ptrTileBars, - void* multimemTileBars, void* ptrCompletionBars, void* multimemCompletionBars, void* ptrSplitKCompletionBars, - int32_t* ptrNumNonExitingCtas, int rank, int tpGrpSize) - { + default: throw std::runtime_error("Unsupported SF layout"); + } + return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); +} + +// Setup the kernel parameters. +template +static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, + void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void const* ptrBias, + void* ptrC, void* ptrSfC, void* multimemC, float* ptrScaleC, void* ptrPartialSumsForSplitK, void* ptrTileBars, + void* multimemTileBars, void* ptrCompletionBars, void* multimemCompletionBars, void* ptrSplitKCompletionBars, + int32_t* ptrNumNonExitingCtas, int rank, int tpGrpSize) +{ - // Is one-shot all-reduce? - bool const oneShotAr{options.mAllReduceAlgo == AllReduceAlgo::OneShot}; - // Is two-shot all-reduce? - bool const twoShotAr{options.mAllReduceAlgo == AllReduceAlgo::TwoShot}; - // Are there peer devices? - bool const multiDevice{tpGrpSize > 1}; - - // Create the return struct. - KernelParams params; - - // Shape/stride for gmem tensor A. 
- auto [shapeA, strideA] = makeTmaShapeStrideAb(options, MatrixType::MatrixA); - // Build tma descriptor for A. - params.tmaA = gemm::buildNdTmaDescriptor(options.mDtypeA, options.mMmaKind, shapeA, strideA, - options.mTransposeMatrixA ? options.mTileK : options.mTileM, - options.mTransposeMatrixA ? options.mTileM : options.mTileK, const_cast(ptrA)); - - // Shape/stride for gmem tensor B. - auto [shapeB, strideB] = makeTmaShapeStrideAb(options, MatrixType::MatrixB); - // Build tma descriptor for B. - params.tmaB = gemm::buildNdTmaDescriptor(options.mDtypeB, options.mMmaKind, shapeB, strideB, - !options.mTransposeMatrixB ? options.mTileK : options.mTileN, - !options.mTransposeMatrixB ? options.mTileN : options.mTileK, const_cast(ptrB), - /* swizzle */ !options.mSliceK); - - if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE2m1 - || options.mDtypeA == tg::Dtype::MxE4m3) - { - tg::Dtype const dTypeSfA = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + // Is one-shot all-reduce? + bool const oneShotAr{options.mAllReduceAlgo == AllReduceAlgo::OneShot}; + // Is two-shot all-reduce? + bool const twoShotAr{options.mAllReduceAlgo == AllReduceAlgo::TwoShot}; + // Are there peer devices? + bool const multiDevice{tpGrpSize > 1}; + + // Create the return struct. + KernelParams params; + + // Shape/stride for gmem tensor A. + auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAb(options, MatrixType::MatrixA); + // Build tma descriptor for A. + params.tmaA = gemm::buildNdTmaDescriptor( + options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); + + // Shape/stride for gmem tensor B. + auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAb(options, MatrixType::MatrixB); + // Build tma descriptor for B. 
+ params.tmaB = gemm::buildNdTmaDescriptor(options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, + const_cast(ptrB), + /* swizzle */ !options.mSliceK); + + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE2m1 + || options.mDtypeA == tg::Dtype::MxE4m3) + { + tg::Dtype const dTypeSfA = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - // Build TMA descriptor for gmem A block scaling factors. - auto [shapeSfA, strideSfA, tileShapesSfA] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); - params.tmaSfA - = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); - } + // Build TMA descriptor for gmem A block scaling factors. + auto [shapeSfA, strideSfA, tileShapesSfA] + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); + params.tmaSfA + = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); + } - if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE2m1 - || options.mDtypeB == tg::Dtype::MxE4m3) - { - tg::Dtype const dTypeSfB = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE2m1 + || options.mDtypeB == tg::Dtype::MxE4m3) + { + tg::Dtype const dTypeSfB = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - // Build TMA descriptor for gmem B block scaling factors. - auto [shapeSfB, strideSfB, tileShapesSfB] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); - params.tmaSfB - = gemm::buildSfTmaDescriptor(dTypeSfB, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); - } + // Build TMA descriptor for gmem B block scaling factors. 
+ auto [shapeSfB, strideSfB, tileShapesSfB] + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); + params.tmaSfB + = gemm::buildSfTmaDescriptor(dTypeSfB, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); + } - if (options.mUseTmaStore) + if (options.mUseTmaStore) + { + // Shape/stride for gmem tensor C. + auto [shapeC, strideC] = makeTmaShapeStrideC(options); + + // Swap M and N tiles for the M-major epilogue. + auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; + auto outputTileN = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; + + // One-shot performs TMA reduction on multicast mapping of the output buffer directly. + // Two-shot performs TMA store on unicast mapping of the output buffer. The reduction happens + // in the next phase. + void* ptrTmaC{oneShotAr && multiDevice ? multimemC : ptrC}; + auto dtypeC{options.mDtypeC}; + // Regardless of output dtype, two-shot all-reduce store partial + // accumulation results to global memory in float32 precision. + if (twoShotAr && multiDevice) { - // Shape/stride for gmem tensor C. - auto [shapeC, strideC] = makeTmaShapeStrideC(options); - - // Swap M and N tiles for the M-major epilogue. - auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; - auto outputTileN = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; - - // One-shot performs TMA reduction on multicast mapping of the output buffer directly. - // Two-shot performs TMA store on unicast mapping of the output buffer. The reduction happens - // in the next phase. - void* ptrTmaC{oneShotAr && multiDevice ? multimemC : ptrC}; - auto dtypeC{options.mDtypeC}; - // Regardless of output dtype, two-shot all-reduce store partial - // accumulation results to global memory in float32 precision. 
- if (twoShotAr && multiDevice) - { - dtypeC = options.mDtypeAcc; - } - - // Build tma descriptor for C. - params.tmaC = gemm::buildNdTmaDescriptor( - dtypeC, tg::MmaKind::Auto, shapeC, strideC, outputTileM, outputTileN, const_cast(ptrTmaC)); + dtypeC = options.mDtypeAcc; } - // Set the dequantization factors for A and B when DeepSeek FP8 recipe is used. - params.ptrSfA = ptrSfA; - params.ptrSfB = ptrSfB; + // Build tma descriptor for C. + params.tmaC = gemm::buildNdTmaDescriptor(dtypeC, tg::MmaKind::Auto, shapeC, strideC, + std::vector{outputTileN, outputTileM}, const_cast(ptrTmaC)); + } - // Set the per-token scale factors for MetaFP8 or scale inputs - params.ptrPerTokenSfA = ptrPerTokenSfA; - params.ptrPerTokenSfB = ptrPerTokenSfB; + // Set the dequantization factors for A and B when DeepSeek FP8 recipe is used. + params.ptrSfA = ptrSfA; + params.ptrSfB = ptrSfB; - // Also set ptrC (it may be used by the NCCL reduction code in "layers/Llama"). - params.ptrC = ptrC; - params.ptrScaleC = ptrScaleC; + // Set the per-token scale factors for MetaFP8 or scale inputs + params.ptrPerTokenSfA = ptrPerTokenSfA; + params.ptrPerTokenSfB = ptrPerTokenSfB; - // The block scaling factors of C for MxFp{4,8} and NvFp4 formats. - // (not to be confused with the tensor-level scaling factor stored in ptrScaleC) - params.ptrSfC = ptrSfC; + // Set the bias. + params.ptrBias = ptrBias; - params.m = options.mM; - params.n = options.mN; - params.k = options.mK; + // Also set ptrC (it may be used by the NCCL reduction code in "layers/Llama"). + params.ptrC = ptrC; + params.ptrScaleC = ptrScaleC; - params.rank = rank; - params.tpGrpSize = tpGrpSize; + // The block scaling factors of C for MxFp{4,8} and NvFp4 formats. 
+ // (not to be confused with the tensor-level scaling factor stored in ptrScaleC) + params.ptrSfC = ptrSfC; - params.multimemC = multimemC; - params.ptrPartialSumsForSplitK = ptrPartialSumsForSplitK; - params.ptrTileBars = ptrTileBars; - params.multimemTileBars = multimemTileBars; - params.ptrCompletionBars = ptrCompletionBars; - params.multimemCompletionBars = multimemCompletionBars; + params.m = options.mM; + params.n = options.mN; + params.k = options.mK; - params.ptrSplitKCompletionBars = ptrSplitKCompletionBars; - params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; - return params; - } + params.rank = rank; + params.tpGrpSize = tpGrpSize; - // Setup the kernel parameters. - template - static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrB, void* ptrC, - void* multimemC, float const* ptrScaleC, void* ptrTileBars, void* multimemTileBars, void* ptrCompletionBars, - void* multimemCompletionBars, int rank, int tpGrpSize) - { - return setKernelParams(options, ptrA, nullptr, ptrB, nullptr, ptrC, multimemC, ptrScaleC, ptrTileBars, - multimemTileBars, ptrCompletionBars, multimemCompletionBars, rank, tpGrpSize); - } + params.multimemC = multimemC; + params.ptrPartialSumsForSplitK = ptrPartialSumsForSplitK; + params.ptrTileBars = ptrTileBars; + params.multimemTileBars = multimemTileBars; + params.ptrCompletionBars = ptrCompletionBars; + params.multimemCompletionBars = multimemCompletionBars; + + params.ptrSplitKCompletionBars = ptrSplitKCompletionBars; + params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; + return params; +} #endif -}; +}; // namespace KernelParamsSetup //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h new file mode 100644 index 00000000000..f248278acc2 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h @@ -0,0 +1,324 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +// NOTE: keep this code dependency free. It has to be included by the device code and has to be +// compilable with NVRTC. + +namespace gemm +{ + +namespace gemm +{ + +struct KernelParams +{ +#ifdef TLLM_ENABLE_CUDA + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Gemm parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // TMA descriptor for A. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAb. + // + // If layoutA is MatrixLayout::MajorK + // Logical shape is [M, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileM, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeA. + // + // If layoutA is MatrixLayout::MajorMn + // Logical shape is [K, M]. + // Logical strides are [M, 1]. + // Tile box shape is [tileK, tileM]. + // Tile box strides are [tileM, 1]. + // Dtype is set from options.mDtypeA. + // + // If layoutA is MatrixLayout::BlockMajorK + // Logical shape is [K / blockK, M, blockK]. 
+ // Logical strides are [M * blockK, blockK, 1]. + // Tile box shape is [tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. + // Tile box strides are [tileM * min(blockK, tileK), min(blockK, tileK), 1]. + // Dtype is set from options.mDtypeA, and blockK is 128B. + CUtensorMap tmaA; + + // TMA descriptor for B. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAb. + // + // If layoutB is MatrixLayout::MajorK + // Logical shape is [N, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileN, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeB. + // + // If layoutB is MatrixLayout::MajorMn + // Logical shape is [K, N]. + // Logical strides are [N, 1]. + // Tile box shape is [tileK, tileN]. + // Tile box strides are [tileN, 1]. + // Dtype is set from options.mDtypeB. + // + // If layoutB is MatrixLayout::BlockMajorK + // Logical shape is [K / blockK, N, blockK]. + // Logical strides are [N * blockK, blockK, 1]. + // Tile box shape is [tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. + // Tile box strides are [tileN * min(blockK, tileK), min(blockK, tileK), 1]. + // Dtype is set from options.mDtypeB, and blockK is 128B. + CUtensorMap tmaB; + + // TMA descriptor for C, (when useTmaStore is true) + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideC. + // + // If transposeMmaOutput is false, + // Logical shape is [M, N]. + // Logical strides are [N, 1]. + // Tile box shape is [epilogueTileM, epilogueTileN]. + // Tile box strides are [epilogueTileN, 1]. + // Dtype is set from options.mDtypeC. + // + // If transposeMmaOutput is true, + // Logical shape is [N, M]. + // Logical strides are [M, 1]. + // Tile box shape is [epilogueTileN, epilogueTileM]. + // Tile box strides are [epilogueTileM, 1]. + // Dtype is set from options.mDtypeC. 
+ CUtensorMap tmaC; + + // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of scaling factors for A is always R128c4 + // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // K must be a multiple of 4P. + // The "logical" shape is: [M, K / P]. + // The R128c4 layout is: [⌈M / 128⌉, K / P / 4, 512]. + // The shape we use for TMA is: [⌈M / 128⌉, K / P / 4, 2, 256]. + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfA; + + // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of scaling factors for B is controlled by options.mSfLayoutB. + // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // The "logical" shape is: [N, K / P] + // + // If the layout is R128c4, + // K must be a multiple of 4P. + // The R128c4 layout is: [⌈N / 128⌉, K / P / 4, 512] + // The shape we use for TMA is: [⌈N / 128⌉, K / P / 4, 2, 256] + // + // If the layout is R8c4, + // K must be a multiple of 4P. + // The R8c4 layout is: [⌈N / 8⌉, K / P / 4, 32] + // The shape we use for TMA is: [⌈N / 8⌉, K / P / 4 / r, r * 32] + // where r = min(tileK / P / 4, 8) + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfB; + + // The output matrix C. The data type is controlled by options.mDtypeC. + // + // When transposeMmaOutput is true, the shape is [N, M]. + // Otherwise, the shape is [M, N]. + // Elements in a given row are stored contiguously in memory (row-major). + void* ptrC; + + // The block scaling factors to dequantize A. + // + // If DeepSeek FP8 recipe is used: + // If transposeMmaOutput is false, shape is [K / 128, M]. 
+ // Otherwise, shape is [M / 128, K / 128]. + // The rightmost dimension is contiguous in memory. + // + // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: + // The layout and data type is the same as explained in tmaSfA. + // + // Otherwise should be set to nullptr. + void const* ptrSfA; + + // The scaling factors to dequantize B. + // + // If DeepSeek FP8 recipe is used: + // If transposeMmaOutput is false, shape is [N / 128, K / 128]. + // Otherwise, shape is [K / 128, N]. + // The rightmost dimension is contiguous in memory. + // + // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: + // The layout and data type is the same as explained in tmaSfB. + // + // Otherwise should be set to nullptr. + void const* ptrSfB; + + // The bias applied after the GEMM. + // The bias is applied before applying the global scaling factor. I.e. + // C' = (A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // if BiasType is N, the shape is [N]. + // The bias is broadcasted along the M dimension. + // + // if BiasType is M, the shape is [M]. + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* ptrBias; + + // The per-token scaling factors from scale A. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is not + // transposed). The dtype is Dtype::Bfloat16 + // + // The shape is [M] + void const* ptrPerTokenSfA; + + // The per-token scaling factors from scale B. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is + // transposed). 
The dtype is Dtype::Bfloat16 + // + // The shape is [N] + void const* ptrPerTokenSfB; + + // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also + // used for the DeepSeek FP8 recipe. + // + // For DeepSeek FP8 recipe: + // If transposeMmaOutput is false, shape is [N / 128, M]. + // Otherwise, shape is [M / 128, N]. + // The rightmost dimension is contiguous in memory. + // + // For MxFp{4,8} and NvFp4 formats: + // If transposeMmaOutput is false, shape is [M, N / 16]. + // Otherwise, shape is [N, M / 16]. + // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). + void* ptrSfC; + + // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. + // TensorRT-LLM API requires a scaling factor on the device. + // Shape is [1]. + float const* ptrScaleC; + + // The M dimension. + // It is the total number of tokens if A is the activation matrix. + // It is the total number of output channels if A is the weight matrix. + int32_t m; + // The N dimension. + // It is the total number of tokens if B is the activation matrix. + // It is the total number of output channels if B is the weight matrix. + int32_t n; + // The K dimension. It is the hidden dimension of the input matrices. + int32_t k; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // All-reduce parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // The rank id of the current device in the multi-gpu space. + int rank; + // The number of peer devices in tensor-parallel group. + int tpGrpSize; + // Pointer for output with multicast mapping. It is used by the "reduce" op (LDGMC.ADD) of the + // two-shot reduce-scatter phase. + // The shape is [M, N] and the dtype is float. + void* multimemC; + + // The barriers in global memory. 
+ // + // The kernel arrives at (with release ordering) the multicast mapping of the barrier to broadcast + // amongst peer devices. It then waits (with acquire ordering) for the unicast mapping of the + // barrier. + // + // Flags in global memory that sync on "entrance" of reduce-scatter phase in two-shot all-reduce. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the unicast memory created with IpcNvlsHandle. + // Must be set to 0 before the kernel launch. + void* ptrTileBars; + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the multicast memory created with IpcNvlsHandle. + void* multimemTileBars; + + // Flags in global memory that sync on "exit" after the all-reduce finishes. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the unicast memory created with IpcNvlsHandle. + // Must be set to 0 before the kernel launch. + void* ptrCompletionBars; + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the multicast memory created with IpcNvlsHandle + void* multimemCompletionBars; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Miscellaneous parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // The barriers in global memory for Split-k reduction with exchange in GMEM. + // Each CTAs arrives at the barrier and blockIdx.z == gridDim.Z - 1 waits for the barrier to flip + // to perform a reduction. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // For DeepSeek FP8 recipe, the shape is [numTilesM * numTilesN * 2]. + // The memory must be set to 0 before the kernel launch. + void* ptrSplitKCompletionBars; + + // Pointer to the memory holding the partial sums for split-K in GMEM. 
+ // The shape is [numSlicesForSplitK, numSlicesForSliceK, numTilesM * tileM, numTilesN * tileN]. + // The dtype is dtypeAcc, i.e. float. + void* ptrPartialSumsForSplitK; + + // In some cases, some CTAs need to exit early. E.g. when the grid is statically set, but the + // actual workload is decided at runtime. This device pointer maps to the number of non exiting + // CTAs in the X dim of the grid when transposeMmaOutput is false. And the Y dim, otherwise. + // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. + int32_t* ptrNumNonExitingCtas; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Miscellaneous parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + enum class MatrixType + { + MatrixA = 0, + MatrixB + }; +#endif +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm + +} // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h index 9a4db96c7c9..0e528a7774d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h @@ -19,7 +19,9 @@ #include "Enums.h" #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/DtypeDecl.h" +#include "trtllm/gen/MmaDecl.h" #include +#include namespace gemm { @@ -77,18 +79,29 @@ class MemAllocatorHelper } // Returns the offset of the ith chunk - int32_t getChunkOffset(int32_t ii) const + int32_t getChunkOffsetByName(std::string const& name) const { - if (mFirstChunkReuse[ii]) + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) { - // Reuse the offset of the 0th chunk. 
- return getChunkOffset(0); + if (mSmemChunkNames[ii] == name) + { + return getChunkOffset(ii); + } } + throw std::runtime_error("Name not found: " + name); + } - // Get offset of ii chunks. - auto offset = getOffsetBeforeChunk(ii); - // Ensure alignment for the current chunk - return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); + // Returns the first chunk reuse flag given chunk name. + int getFirstChunkReuseFlagByName(std::string const& name) const + { + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) + { + if (mSmemChunkNames[ii] == name) + { + return getFirstChunkReuseFlag(ii); + } + } + throw std::runtime_error("Name not found: " + name); } // Function to calculate the total size of the SMEM array @@ -97,12 +110,6 @@ class MemAllocatorHelper return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); } - // Returns the first chunk reuse flag for the ith chunk. - int getFirstChunkReuseFlag(int32_t ii) const - { - return mFirstChunkReuse[ii]; - } - // Print the contents of this object. void print() const { @@ -115,6 +122,26 @@ class MemAllocatorHelper } private: + int32_t getChunkOffset(int32_t ii) const + { + if (mFirstChunkReuse[ii]) + { + // Reuse the offset of the 0th chunk. + return getChunkOffset(0); + } + + // Get offset of ii chunks. + auto offset = getOffsetBeforeChunk(ii); + // Ensure alignment for the current chunk + return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); + } + + // Returns the first chunk reuse flag for the ith chunk. 
+ int getFirstChunkReuseFlag(int32_t ii) const + { + return mFirstChunkReuse[ii]; + } + // Helper function to calculate padded size int32_t getSizePaddedToAlignment(int32_t size, int32_t alignment) const { @@ -139,9 +166,7 @@ int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) { if (mmaKind == tg::MmaKind::Auto) { - std::cout << "mmaKind != tg::MmaKind::Auto" << std::endl; - assert(false); - return -1; + throw std::runtime_error("mmaKind != tg::MmaKind::Auto"); } if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) { @@ -162,11 +187,12 @@ class KernelTraits KernelTraits() {} // The constructor. - KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::MmaKind mmaKind, - int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, - int32_t numStagesMma, int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, + KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t mmaK, int32_t tileM, int32_t tileN, int32_t tileK, + int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, + int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, - bool usePerTokenSfA, bool usePerTokenSfB) + bool usePerTokenSfA, bool usePerTokenSfB, BiasType biasType) : mMmaKind{mmaKind} { // @@ -181,16 +207,17 @@ class KernelTraits // [rowMax ] (16B aligned) (if needed) // [sliceK ] (16B aligned) (if needed) // [per-token SF ] (16B aligned) (if needed) + // [bias ] (16B aligned) (if needed) // // SMEM for smemA and smemB might be repurposed and used for gmemC0 and gmemC1: // // [..smemA..][..smemB..][..smemBShuffle..] - // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..] 
+ // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..][..per-token SF..][..bias..] // if (mMmaKind == tg::MmaKind::Auto) { - mMmaKind = dtypeGetMmaKind(dtypeA, dtypeB); + mMmaKind = dtypeGetMmaKind(dtypeMmaA, dtypeMmaB); } std::vector> numBytesAndAlignmentPerSmemChunk; @@ -344,6 +371,29 @@ class KernelTraits firstChunkReuseSmem.emplace_back(false); } + // Bias + { + int32_t numBytesSmemBias = 0; + if (isBiasTypeN(biasType)) + { + numBytesSmemBias = tileN * sizeof(float); + } + else if (isBiasTypeM(biasType)) + { + numBytesSmemBias = tileM * sizeof(float); + } + else if (isBiasTypeMn(biasType)) + { + numBytesSmemBias = tileM * tileN * sizeof(float); + } + // Number of bytes alignment for bias + auto const numBytesAlignmentBias = 16; + // Add info. + smemChunkNames.emplace_back("smemBias"); + numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); + firstChunkReuseSmem.emplace_back(false); + } + // Per-block absolute maximum for multi-warp reduction. { // Number of bytes: number of epilogue warps * number of tile columns. @@ -358,6 +408,25 @@ class KernelTraits firstChunkReuseSmem.emplace_back(false); } + // SmemConstSfBuf + // A buffer used to copy constant values to TMEM. + { + // Do we need the buffer? + bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; + // Number of bytes for the buffer. + auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; + // Number of bytes for the alignment of the buffer. + auto const numBytesAlignmentConstSfBuf = 16; + // No need to reuse the first chunk. + auto const reuseChunksSmemConstSfBuf = false; + + // Add info. + smemChunkNames.emplace_back("smemConstSfBuf"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); + firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); + } + // Create SMEM helper object. 
mSmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); @@ -401,10 +470,12 @@ class KernelTraits // Matrix A { + // We use TMEM for A if we use slice-K or if we need to cast A. + bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); // Number of columns for A. - auto const numTmemColsA = numSlicesForSliceK > 1 ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeA)) - : 0; + auto const numTmemColsA = useTmemA ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) + : 0; // Number of columns for A alignment. auto const numColsAlignmentA = 4; // No need to reuse TMEM. @@ -418,12 +489,16 @@ class KernelTraits // Sf A { - bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeA); + // Does the MMA require block scales in TMEM for A? + bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); + // Are the block scales constant? + bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); // Number of columns for scaling factors of A. - auto const numTmemColsSfA - = useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0; + auto const numTmemColsSfA = useConstSfA + ? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK), 4) + : (useBlockScalingA ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK)) * numStages : 0); // Number of columns for Sf alignment. - auto const numColsAlignmentSfA = 2; + auto const numColsAlignmentSfA = 4; // No need to reuse TMEM. auto const reuseChunksTmemSfA = false; @@ -435,12 +510,16 @@ class KernelTraits // Sf B { - bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeB); + // Does the MMA require block scales in TMEM for B? + bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); + // Are the block scales constant? 
+ bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); // Number of columns for scaling factors of B. - auto const numTmemColsSfB - = useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0; + auto const numTmemColsSfB = useConstSfB + ? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK), 4) + : (useBlockScalingB ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK)) * numStages : 0); // Number of columns for Sf alignment. - auto const numColsAlignmentSfB = 2; + auto const numColsAlignmentSfB = 4; // No need to reuse TMEM. auto const reuseChunksTmemSfB = false; @@ -487,14 +566,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits) inline int32_t getSmemOffsetLoadA(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(0); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetLoadB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(1); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -508,50 +587,63 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits) inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(2); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx)); } 
//////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetRowMax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(5); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetSliceK(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(6); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(7); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetBias(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(8); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { - // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). 
- return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx); + return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -562,28 +654,28 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) inline int32_t getTmemOffsetD(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(0); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(1); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(2); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfB(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(3); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h index 0b7574260ef..a246ac35b37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h @@ -41,14 +41,14 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA inline 
CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, std::vector const& shapes, - std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr, bool doSwizzle = true) + std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doSwizzle = true) { // The multiplication factor of the data padding in SMEM. int32_t padMultiplier = 1; CUtensorMap desc{}; // The data type. CUtensorMapDataType tmaDataFormat{CU_TENSOR_MAP_DATA_TYPE_FLOAT32}; - if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3) + if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3 || dtype == tg::Dtype::UE8m0) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_UINT8; } @@ -71,15 +71,11 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st padMultiplier = 2; tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B; } - else if (mmaKind == tg::MmaKind::MxFp4NvFp4 || mmaKind == tg::MmaKind::Auto) - { - tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; - } else { - std::cerr << "Invalid dtype / mmaKind combination " << tg::dtypeToString(dtype) << "/" - << tg::mmaKindToString(mmaKind) << std::endl; - assert(false); + // Note: this is used with the MMA kind MxFp4NvFp4 and also when casting to a higher-precision + // type such as Bfloat16 before the MMA. + tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; } } else if (dtype == tg::Dtype::Fp32) @@ -94,24 +90,30 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The swizzle type. 
CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; + int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; if (doSwizzle) { - if ((tileKSizeInBytes % 128) == 0) + if ((fastestDimTileSizeBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((tileKSizeInBytes % 64) == 0) + else if ((fastestDimTileSizeBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((tileKSizeInBytes % 32) == 0) + else if ((fastestDimTileSizeBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; + // This path is only for the scaling factors. + } + else if ((fastestDimTileSizeBytes % 16) == 0 && (dtype == tg::Dtype::UE8m0 || dtype == tg::Dtype::E4m3)) + { + swizzleType = CU_TENSOR_MAP_SWIZZLE_NONE; } else { - std::cerr << "buildNdTmaDescriptor: unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes + << std::endl; assert(false); } } @@ -121,8 +123,9 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions. - assert(dim == 2 || dim == 3); + // Expect 2 dimensions for regular gemm, 3 dimensions for batched gemm or blocked layout, and 4 + // dimensions for batched gemm with blocked layout. + assert(dim == 2 || dim == 3 || dim == 4); // Check shape range. for (int32_t ii = 0; ii < dim; ++ii) { @@ -147,59 +150,74 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The number of elements in 128B. auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. 
- auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); + auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); - // Build tile shapes. - std::vector tileShapes(dim, 1); - tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK - tileShapes[1] = tileSizeMn; // tileSizeMn + // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. + assert(static_cast(tileShapes.size()) <= dim); + std::vector boxDim(dim, 1); + boxDim[0] = numEltsInClampedFastestTileSize; + for (size_t ii = 1; ii < tileShapes.size(); ++ii) + { + if (tileShapes[ii] > 256) + { + std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; + assert(false); + } + else + { + boxDim[ii] = tileShapes[ii]; + } + } // Set tile strides to 1; std::vector tileStrides(dim, 1); // Build the descriptor. CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + char const* errorString; + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "Shape: "; + ss << "Shape: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << shapes[ii] << " "; + ss << shapes[ii] << " "; } - std::cerr << std::endl; + ss << 
std::endl; - std::cerr << "Stride: "; + ss << "Stride: "; for (int ii = 0; ii < dim - 1; ++ii) { - std::cerr << stridesInBytes[ii] << " "; + ss << stridesInBytes[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes: "; + ss << "tileShapes: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileShapes[ii] << " "; + ss << boxDim[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides: "; + ss << "tileStrides: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileStrides[ii] << " "; + ss << tileStrides[ii] << " "; } - std::cerr << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << std::endl; + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; + throw std::runtime_error(ss.str()); } return desc; @@ -267,41 +285,44 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; + char const* errorString; + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "shape:"; + ss << "shape:"; for (uint32_t shape_i : shapes) { - std::cerr << " " << shape_i; + ss << " " << shape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "stridesInBytes:"; + ss << "stridesInBytes:"; for (uint32_t stride_i : stridesInBytes) { - std::cerr << " " << stride_i; + ss << " " << stride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes:"; + ss << "tileShapes:"; for (uint32_t tileShape_i : 
tileShapes) { - std::cerr << " " << tileShape_i; + ss << " " << tileShape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides:"; + ss << "tileStrides:"; for (uint32_t tileStride_i : tileStrides) { - std::cerr << " " << tileStride_i; + ss << " " << tileStride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; + throw std::runtime_error(ss.str()); } return desc; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json index fbbcdfa0594..b9d77fb25e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json @@ -12,7 +12,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 4, - "numMmaStages": 1, + "numStagesMma": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -43,7 +43,7 @@ "epilogueTileM": 128, "epilogueTileN": 128, "numStages": 3, - "numMmaStages": 1, + "numStagesMma": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -75,7 +75,7 @@ "epilogueTileM": 64, "epilogueTileN": 8, "numStages": 3, - "numMmaStages": 1, + "numStagesMma": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -105,7 +105,7 @@ "epilogueTileM": 128, "epilogueTileN": 128, "numStages": 3, - "numMmaStages": 1, + "numStagesMma": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -135,7 +135,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, - "numMmaStages": 1, + "numStagesMma": 1, "numSlicesForSplitK": 2, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -182,7 +182,6 @@ "numStagesMma": 2, 
"numStagesMmaWithinWorkTile": 2, "useTwoMmaWarps": true, - "useMetaFp8": false, "usePdl": true }, "GemmDeepSeekFp8Throughput": { @@ -212,7 +211,6 @@ "numStagesMma": 2, "numStagesMmaWithinWorkTile": 2, "useTwoMmaWarps": true, - "useMetaFp8": false, "usePdl": true, "gridTriggerSecondaryA": true, "gridTriggerSecondaryB": false, @@ -232,7 +230,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, - "numMmaStages": 1, + "numStagesMma": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -246,11 +244,38 @@ "useCustomMmaSchedule": true, "sfLayoutB": "8x4", "sfLayoutC": "8x4", - "useMetaFp8": false, "gridTriggerSecondaryB": true, "gridWaitForPrimaryA": false, "gridWaitForPrimaryB": true, "usePdl": true + }, + "GemmFp4xFp8": { + "dtypeA": "e2m1", + "dtypeMmaA": "e4m3", + "dtypeB": "e4m3", + "dtypeC": "fp16", + "sfBlockSizeA": 32, + "sfLayoutA" : "128x4", + "mmaM": 128, + "mmaN": 8, + "mmaK": 32, + "tileM": 128, + "tileN": 8, + "tileK": 512, + "epilogueTileM": 128, + "epilogueTileN": 8, + "numStages": 2, + "numSlicesForSplitK": 1, + "clusterDimZ": 1, + "useTwoTmaLoadWarps": true, + "sliceK": false, + "transposeMmaOutput": true, + "useShuffledMatrixA": true, + "useCustomMmaSchedule": true, + "numStagesMma": 1, + "useTwoMmaWarps": false, + "usePdl": true, + "tileScheduler": "persistent" } }, "configs": [ @@ -323,6 +348,13 @@ { "_template": "GemmMxE2m1MxE4m3LowLatency", "dtypeC": ["bf16", "fp16", "fp32", "mxe4m3"] + }, + { + "_template": "GemmFp4xFp8", + "_comment": "Tile 8 to 128", + "dtypeC": ["bf16", "fp16", "e4m3"], + "useUnrollLoop2xForMma": [true, false], + "mmaN,tileN,epilogueTileN,tileK,numSlicesForSplitK,clusterDimZ": [[8, 8, 8, 512, 2, 2], [16, 16, 16, 512, 2, 2], [32, 32, 32, 512, 2, 2], [64, 64, 64, 512, 2, 2], [128, 128, 128, 256, 1, 1]] } ] } diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index 3d32c2ee250..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:856ce9e462068d464a244eb5179277c6aeb4eba8c9767b354d664eb6eafee0d3 -size 416980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 1657a0701fe..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c461d5767472f619e7cffd41cc609bb9bf244b78342c55a1b42ae344ccc87292 -size 523680 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index c21ee7f925f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d393b7e86991ea2757655b479ef75bfe660f3a1846f46c38e6f55c6ba9d6a25 -size 558316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index 2f35e766210..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ecd12d9d9e7d4cec0e7c530e72328420c868f37bab285ed55864776fc6eeec7 -size 304696 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 15535b511a7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff5b953f3226300d647adc3328d04fae0888b2de91f39a27f5ce7efc6f88f15e -size 401032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 6c80cc53812..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f921e229f0d48546a2087d02f526e0c5c8d5189696ec2e71349227182d1bee0 -size 438480 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 2a9bb9cb191..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1acf78f5c7f9505a95f782a4c781c94a9b34bb5958c8f511f32058da40f81868 -size 418890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index d191118f3b8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32f834741c1c2f721409b71b5aaa45b79e7d337c5fc422af33a1bbe1b56b3da5 -size 455548 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 9708ab6fef5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:87785c72e84d52ff962f252e98868e1cc3f2595aaa1e9aaf2924fa50e886aba2 -size 458160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 00515a3f3b0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6055448d6d6cbb547b3d5656fecb5044465d88be3121e42e6b3c39f96e3bd828 -size 495608 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 9f4c2f41873..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:df7b94d53bc5517b94f6c2c5c7e6108695a32809cbc55e7d83124f07c06a786c -size 426334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 41791455686..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b64cf6bf660b14299c1957170426f089af93e19fccc93295fd32f0c5df77951d -size 463830 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index bacabe8b3a0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9359453507f25d7d90bf1f6ac4a453756ba9f8006d6c76fcb3ff09a1ba8cf71a -size 305610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 76d4f50448e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52a9dc57d86dc176ad59234a764959c0ebec01d9738889a8989fbfca925cf72f -size 338088 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index daef06303dc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:631e364a1ae1a29d386f624849176118c5d4f7b01e38f7c973f190d89e7136f2 -size 506554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 5d0f3377e65..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d52dfd5ac422362ede96cbda888383f4452df7dc39d6653f8529a560d6b12d37 -size 687361 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index adfbdfa4ad2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b8ed23721a4d1eb0260e36dff2bad3cb1b603d287ef7584cf18c5db73ee869f -size 722835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index 354888f2829..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12cea6d18fe0c95f6eb073e3956ca785835b2740ee65b7d8f934313f709cae87 -size 317072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index e52810e22e6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5901f4239aff1497ca81fd9f853fdb35fb2a14ab35c89b1acbde9c87fc909da1 -size 423322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 005124d8237..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db42b1e4d61dc69b6a5ac4304fb3e89eb04e9c858af216ee88f877099f400013 -size 460770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index f280d1b90d7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca8b1eb8b32d9211ed9f4842548e57128ee6ede550d4dcc3b6ac804de45a9f2b -size 460470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index c458bd9b23e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70dc60c09fae6aa9bfa809107032a931b5edfc211d66b374b5092192c902c222 -size 497918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 3ac72b581fd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae575f338e0db3bb053f9cfcee4804a75611b330b87718e18f7692f865ca6984 -size 538960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 97410829904..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c59fa6c8e96dedcb4956d1fbe3282ed1f809dcd34499bb9b4d7b06bc168b579d -size 575568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 0ef28473e53..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c6028820f760089f09b0a2e4b32fa0c76725f56ebdfd97fc6c53331616282d6 -size 438116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 516f77e48ac..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0760d0c99a44691f281310268a6751ea62292b2205719b285ff9e7429fcffaf1 -size 475614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 03aad1c232d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:173abead241a103af02a25daf432d028324561d57f64d9a6d2087cd444c45758 -size 304814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 02617fa32b5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92b924875fff3571d0efe3478f2fdddb11a8ec60796830f3dd172a7b4f24acac -size 335712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file 
mode 100644 index 9c05d1e6c2d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba1a393cd48175f1cfc18436435ea638de675e0d98847510d97bd03d8bae234d -size 421756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 406e016af29..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:54e20ad645aafe8ff45abd1c6c8d418c6d0f2c7cd38eb01d6969a32b7cea60de -size 528458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index f475229583d..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b74282de9bcfced0c173d9aee838a54c8ba286bbc84719fcdb81dd9cbac7f15 -size 563882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index c765b304d33..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9527b1cee4c45bf8d4346493fddb1b0f8dd3e6abf90ab358dfdf8864533a130 -size 309474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 96c0d7a7acc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8669d845fb8bdacda6073d674ca9325a0b241591e1d020020b81977006cf71f -size 400234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 872c8bccec0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c5e32724bfdbe9ac14ad6d54822778c6a1ec63772db04160719aaaed8d29f51 -size 438472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 3d70d3b63cc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3339a4104298437c3628354d2b14431933314e9f2d2ecdce7b88865dccbee038 -size 418882 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index d9d12366664..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f4bfa0a6a96895cc2165a02f0c83f2627cf625ff6807a5be9c6f7bdbd210428 -size 456330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 340e167a2dc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac4fe726d14a219bb03ce7bd9bbdeeb5ea7e3195d64a4c1b9cf0018d53aa3df5 -size 459780 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index d44d09ccfb6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de6c8769b5814f20ca34ba8df001c795dc54ae81267488bb18510fe5d334f510 -size 498016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 59566f3babb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a8b5e0bdece2e73f72e300afab464d0680aa00b9cbcbe08903ba0210780e149 -size 425536 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 44856c74819..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdb1d9d1b80de2668ddd79db5c491d3a768188b4b3f5c5d10efb335af7ff28ae -size 463034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 43281689853..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0835489e53c9690803dd1e44ec4290b6ddd9d4294bcb6c6d76b9f39680d12964 -size 304814 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index e8a2a1a09b8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:520f1eed6f696ceaa3b40bc534a96120a1f01b616f6a93d60606b9e72a13c3d6 -size 336502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 0a252fa5e72..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a9fa4a70c46c56dbc57d38d6578b413554d5f24ee9bd9e223c9a14dca856db1 -size 504966 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index f5ac8a259db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e36ec5821ab21cdaa67e44755daaea4896b21d8324ad95b1227cd5060ba06df -size 421164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 61c6525168e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:675e4a797e20b6a606fa670c6a02d23dedc06affba9bedbe9d5eef4c9bf4fc28 -size 505954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 8d2a38bce3f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51dd2e16aae507d655121a3869b4a9db92ed11a69d4a04f56d1716743740d38a -size 516120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..46a72cc582e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f811d33d80842160a2ee26716d3e5bf4ad7fd2a497e8761d067a72f1608053b +size 402540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..f7000af18d2 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c31e771760a040cdccdecc2cb115b4ac88a216ed1dacd36a571beaafce00eef +size 435250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..2b05842e5b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5f970c1d7af7eb0dae865d637e3ceabc741541569ef080a021d3aa0dc2fde54 +size 441322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..1034f8a06da --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5468140541cba6b53f5c5233ddd8d451dba42c66fe7d66c6580ccff36b7fec1 +size 539502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..0ebaf17ba0b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883067370db4a8dcf2a52c6e148a8c59e8d74d4491d0b94670e29869fb9a1e7e +size 610644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..73d54192665 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0985fb3d4c681ff1ad3c13d20f340c3a148eaab5f4df15d0bed35094f92ce113 +size 561012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..2848c4fc94f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9e112e3175ee6d64c1ff76cc3b404dd75c3b2141a7146160ecbb9a57bbe4c6 +size 632155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..e633b411b6c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772428f18b342613c83eeef97d6b38f09cf399997f218f4440654d511f2cc8d5 +size 615920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..fc76cde68b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1df03bdccc4f41b298be4d2df28a764cda834c6ae9ed8ff91f1fd5b45b2e8d16 +size 686965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..2866e55118c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80af264b83492c79130ce08a95c9cbd1d58fa0d36da7759dd0a495209f83485d +size 528742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..16fc0a47f26 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fe35dc86d38bcef3c3c689cbb561199cf8cf98597f9ff4ae9895eb58e2a369 +size 599884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..a3ae20e88de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:903142bcccbe54f2d7b494eb99d121880c22335051d45faf0b95104b874eb463 +size 511260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..ec39cacdd15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f0fadcfe6a193d113bcb19062514c82d2995f186d60695565e3f483f83f67b2 +size 542788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..9f3e071b679 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c05018c4da492492d9c8bab66c424917c46117f8b7a67208043f8fc7fba704 +size 291096 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..af7f18ab3d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6d934483b0c258a701b7d865e40fe7aa7c477336634007cb7dca7fae95871d +size 388906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..c78985c3871 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1aef3173b48379b2bc6088c7b3171b335da30373340f181e6b3128c51691b1 +size 423592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..aef20e1e5a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ea06d167a693a818dd590b56401af415f897c092ced523d0fbfa6b0c5906eb +size 405976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..be2fca7fa79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:805c814d02aca70430eaec401317f931b3854d7fe4f07d655ca958ca42533205 +size 441450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..68b7d6137d2 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aebbf5a4b7e2c5551b9da7f161a7e4e595034ddb85c7edbca1cb4f163c2ce108 +size 446034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..d696c006cf0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9145c573e096aefb3bbb56c1cadd31eba4b6cd3ea8db4c2f219be2ff3b5bf35 +size 480720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..5cd0db1c400 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5102b2d333b6351fd81c87fd3b6d565b0a11487c18ac757c5fcfb4bdf07ab6e2 +size 414998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..cbeee501083 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2331b555c625adbcc552f000423a1563943199c4cf83743d1e0dff865fbe77b1 +size 449684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..9515434f028 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:810dd29b0147752e090272e476795327299eefe3594b07ad6a916b997b7715dc +size 291990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..9536164b2d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cfa36aa48820d645e8baf93101c186d6a1d47c959ef0f1858be993477046449 +size 296100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..f38c33b4fa2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fad520c7710f04f82b728c641b17747f07dd2dfc430aeab12bd294cde2a295ac +size 482720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..536183ccb00 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:849b6eb121c1125dcd8944ebd5fa0bbdb5c25ae0cd083aff8942623d3b1ee6cc +size 435242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..ef0f2f5c9f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0def6d94d10c80e70f7c57deed553f8847bf7b32fb02d8d4de8d4e744897ee6 +size 442104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..5e1e27e6dc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1b25ce03959310c1ceb7b738e23b42314ae77185d874dcb3136a54bdb3afb8b8 +size 538112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..860e5cc1d1e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98270c39f23adcdc847e40ec4af0906bce0eadfadc0478cd80b6c5b01b67d4d2 +size 610044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..fc9feecf248 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cfe52e1a652b3c98a7036e8183dd54c19eb656b93c6cf1c4bb8515aaa0cb618 +size 559128 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..c33fa7b16f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b3bdf48cff6be67299350858d2e179db6a168b8b4499df4024bf8abec748534 +size 631851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..1d45797d3c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8237552efb5f053c7b93118145411ea1c440d7303da2d9596f113ae9bca687a3 +size 614332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..afd5351d773 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857632aea04ed78ead18b560bde571e98d1b91d2e255ff596333273f8106a1a6 +size 686907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..fefc2f2c089 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a335e06b9157f5a1d3662a31fca4db8eadcd523dddf5a4acd4aabd35ad83d3fc +size 526464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 
100644 index 00000000000..c31883a1f2b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864aaafa89f55cc352096dd0fab344b2c26e4ce1bc838a5ddf2310126b9294e2 +size 598346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..7a9d7cd90ab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d770bfe3bd1bbfc7ba7e1d998356e7c5fa30128e531475630beddbf6a89518 +size 674989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..1e4dc695588 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:7592d2fcdd5334457804c60e997caab7b55bca0b98e49b704338851b1f26fcc5 +size 707257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..cd29ebc7e89 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20d9bf427d6963d127d8fd7ffd455164e97dafe74bc129187cb13aa2c1afb7b +size 304112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..c74352b14c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e42191612ceffb7de8b7d112433f54a2d136f215c39782aa46384fdef72a04 +size 411986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..4ba43e65a48 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aea2a209820f76e2408c7204a09b51d49b98691b1bc42ab2ed41e6ba279741d +size 446672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..ea163fc42ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b2788fd26d7f32f515f331d68bc1b1d9f52ab7e58795f74f568e3ea7114cf3 +size 449134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..95a23f8861d --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e4858f8eb635c01dedb7a10c17a948aa7d792438052e278f12083c504ef5344 +size 483820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..9f3e1a78714 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239d1614204c6114a67842a21de11fa88f2da2c838c819609428f409c1a692f0 +size 525206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..f31f184e3be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7f920f02c0aa0dcdf604ce1b6a9f6f3e076a0974ec301cbb49f325ddd4639e54 +size 561470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..7328fbeebe4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151143fe03078b16180e32fb803c0cbb67e3e4cda70d7e080971cc590ca14cf7 +size 425992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..54d8705b655 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35d609db47a06ef1df2365983a4f6f4aeee5326731d42d21cd3f763c35ed8dfe +size 460676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..eb16c193a96 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ad4bf1c626ebc7772502f08694368ff1f40f05902595dd245fa2b7afa9c087 +size 289614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..3d395110fdb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a182c886f003dbfe1ad202c81e5be1463778b39479bd173727a1cb0a2b951f +size 294514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..b38d75fbef0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03723b80c6a02fe986e0a7c64447fbd7233e63d49f5638ededf5ecaf079e9f0d +size 408106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..2db8cda6f3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:910dc558f3954db07df445f2c8655eb8762e5cef44eb402ac15ac3da61422a1b +size 433664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..33bed2a1e53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e8d96b98b3533bd19e0124278967d196210549a599628fc5f52cb6e0d66398fa +size 439736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..c1cef9b98f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61ddd2d6ae324ef8a5c839309502b6a982e08baa5f128e07c93898e9cc6d4e53 +size 537916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..f66d57f17f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d654e8e5d2f397db9703173d40e5f58456dda50529862439ba6aae270d7c63 +size 609848 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..3f6ab4e2189 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:173f10fb5ca2918b6b06d40565bcd3d14e65ac140e635b8afed42483bcbf5da1 +size 559424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..683358bc4c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa64839c032fab6d686ebffa24e91b4130679d1a15f565d9cdcd2bd1ab54032f +size 631357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..82f77bde995 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50acb10dbf8cff161f0e2941dc48b55b14b4b4e34919ec3bdb0ed6910ba26a82 +size 614332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..b117040fb8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d23121e6d98070f720303795537dc52b5954b634942b7b1c310863c0ec8e73 +size 686167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file 
mode 100644 index 00000000000..7079efe8328 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36adc60c8995b5c59a9a90490e71acbcd52b51b6ed3a1c0572a6c3f07f524d78 +size 527154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp new file mode 100644 index 00000000000..6d0840e5331 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349d96b3edf1d26f24e2759259138852f5fd8c7009b023a527b7b47426b66a66 +size 599086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..f1ee2e385c0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a3ab2575bbf63668d4d98ac7cb793b166927851e6b02115c7c6487a125d94d +size 516826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..25813a13179 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca0d8b95ff673b929d053661ee7ead43c9266d3f1b4d26e48cf296fb4b043230 +size 548354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..8136d3ac77f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30df1127d3d5369c72284d12c02f1ede7e3afb71d056eda013314a3ac8348b65 +size 
295874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..2ea25c21e53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12dc356d968c7a8240b43f8867a4ff1b2986247d3908ee77f5b00a8405bec539 +size 388110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..d756383c64f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d4691d95f92cc35fada0692f792067b8488982a3b4a97d2887c39a1eb85d62c +size 423584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..c3549eee3e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6326d91b1f02c0c08c9e20215991f448f64227bb6ec07ac4460bff1252d95bc8 +size 405968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..6293ef820b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523ab36a22353f35c6cb75ffd9b860da48224744638708f20f497e1a21ca7022 +size 442232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..242df541c04 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d64c5fac8420ebdcda0978e078db135e5b32781f7cbafc58b8c11db49ab026e4 +size 447654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..95e0b0a206c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65325433325aeea2737935b5f5311e8f37655932bd4b3248de1972b62c3eec1f +size 483080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..0a5d67dd37a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0e7fe657be9045ecb106287a93886e232b7e21c643f5648a3e8a60be70b676f4 +size 413412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..c23f0106e0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bffa59cbfec3de0b7a7202ac20181f297dff52658674d9f47463d677669dd701 +size 448886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..b74dc38d76f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:577927ec58bb493d60903142b46e9b6f30ee78df4caa40c80b6297cc8da6c56d +size 290404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..dfdecce4dce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103cb758d7a7fe46834445d92bc2d104866fdb4c394b79641ae740c3cfe6126b +size 295304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..57343c4c570 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4851646f0bedbe03b4a08b747b94fac2fc5f00daf3ee73de4e699a9a67481216 +size 481134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..e833d3ca6aa --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba61f2e390f6164922619594edc3d1b36a97774a2d3d5f0a6cf1c7ddcffea13 +size 406676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..8ed2e66da93 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8960d87aa6e44cc8aca9d97f5d1631cc645ae478fffa96029bae45b3c700c60 +size 482120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 00000000000..f67aa8e463e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a2bca446d9ee7df95e3d960e4acc6872f281cc86b409c25740f5ca99653377 +size 492288 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/trtllm/gen/MmaDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/trtllm/gen/MmaDecl.h index 6169f1abbe0..27efe1db867 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/trtllm/gen/MmaDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/trtllm/gen/MmaDecl.h @@ -16,6 +16,15 @@ */ #pragma once +#include +#include +#include +#ifndef TLLM_GEN_EXPORT_INTERFACE +#include "trtllm/gen/CommonUtils.h" +#else // TLLM_GEN_EXPORT_INTERFACE +#include "CommonUtils.h" +#endif // TLLM_GEN_EXPORT_INTERFACE + namespace gemm { @@ -84,6 +93,16 @@ inline std::string mmaKindToString(MmaKind mmaKind) //////////////////////////////////////////////////////////////////////////////////////////////////// +// function to get the TMEM column stride per group (i.e., 64 K elements) +inline int32_t getTmemColStridePerGroup(int32_t tileMn, int32_t mmaK) +{ + // Calculate the stride of TMEM column for every 64 elements in the K dimension + int32_t div = 2 * ceilDiv(tileMn, 64); + return mmaK == 96 ? 
std::max(4, div) : div; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gen } // namespace trtllm diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index 6224c0d2c96..aabc79e6406 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -56,6 +56,7 @@ add_library( fp8BatchedGemmTrtllmGen.cpp fp4Quantize.cpp fp4BatchedQuantize.cpp + fp4xFp8GemmTrtllmGen.cpp fp8BlockScalingGemm.cpp fp8RowwiseGemm.cpp fp8Quantize.cpp diff --git a/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp index 28f8c090d36..6b923336d1b 100644 --- a/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp +++ b/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp @@ -40,7 +40,7 @@ void runGemm(at::Tensor& out, at::Tensor const& mat1, at::Tensor const& mat2, at auto eltType = tg::Dtype::E2m1; tensorrt_llm::kernels::TrtllmGenGemmRunnerOptions options - = {.eltType = eltType, .outputType = outDtype, .deepSeekFp8 = false}; + = {.eltTypeA = eltType, .outputType = outDtype, .deepSeekFp8 = false}; tensorrt_llm::kernels::TrtllmGenGemmRunner runner(options); diff --git a/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp new file mode 100644 index 00000000000..8ed81c4aa99 --- /dev/null +++ b/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" +#include "tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h" +#include "tensorrt_llm/thop/thUtils.h" + +#include +#include + +#include + +#include + +namespace torch_ext +{ + +namespace +{ + +namespace tg = gemm::trtllm::gen; + +template +void runGemm(at::Tensor& out, at::Tensor const& mat1, at::Tensor const& mat2, at::Tensor const& mat2Scale, + at::Tensor const& globalScale, int64_t m, int64_t n, int64_t k) +{ + tensorrt_llm::kernels::TrtllmGenGemmRunnerOptions options = {.eltTypeA = tg::Dtype::E2m1, + .eltTypeB = tg::Dtype::E4m3, + .outputType = outDtype, + .deepSeekFp8 = false, + .transposeMmaOutput = true}; + + tensorrt_llm::kernels::TrtllmGenGemmRunner runner(options); + + int64_t const numBytesWorkspace = runner.getWorkspaceSizeInBytes(m, n, k); + at::Tensor workspace + = at::detail::empty_cuda({numBytesWorkspace}, at::ScalarType::Char, torch::kCUDA, std::nullopt); + + auto stream = at::cuda::getCurrentCUDAStream(mat1.get_device()); + + float* mat2ScalePtr = static_cast(mat2Scale.data_ptr()); + float* outScalePtr = globalScale.data_ptr(); + + runner.run(m, n, k, mat1.const_data_ptr(), nullptr, mat2.const_data_ptr(), /* bScale */ mat2ScalePtr, + out.data_ptr(), outScalePtr, /* cScalePtr */ nullptr, workspace.data_ptr(), stream.stream(), mat1.get_device()); +} + +at::Tensor fp4_fp8_gemm_impl(at::Tensor const& mat1, at::Tensor const& mat2, at::Tensor const& mat2Scale, + at::Tensor const& globalScale, std::optional out_dtype) +{ + using tensorrt_llm::kernels::Data_type; + + CHECK_INPUT(mat1, c10::ScalarType::Float8_e4m3fn); + CHECK_INPUT(mat2, FLOAT4_E2M1X2); + + CHECK_INPUT(mat2Scale, c10::ScalarType::Float8_e4m3fn); + + CHECK_INPUT(globalScale, c10::ScalarType::Float); + + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a 
matrix"); + + TORCH_CHECK(mat1.sizes()[1] == mat2.sizes()[1] * 2, "mat1 and mat2 shapes cannot be multiplied (", mat1.sizes()[0], + "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + + auto const m = mat1.sizes()[0]; + auto const n = mat2.sizes()[0]; + auto const k = mat1.sizes()[1]; + + if (!out_dtype) + { + out_dtype = torch::kHalf; + } + TORCH_CHECK(out_dtype == torch::kFloat8_e4m3fn || out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, + "out_dtype must be one of fp8/fp16/bf16. It defaults to fp16."); + + at::Tensor out = at::detail::empty_cuda({m, n}, out_dtype.value(), mat1.device(), std::nullopt); + + switch (out_dtype.value()) + { + case at::ScalarType::Float8_e4m3fn: + runGemm(out, mat1, mat2, mat2Scale, globalScale, m, n, k); + break; + case at::ScalarType::Half: runGemm(out, mat1, mat2, mat2Scale, globalScale, m, n, k); break; + case at::ScalarType::BFloat16: + runGemm(out, mat1, mat2, mat2Scale, globalScale, m, n, k); + break; + default: C10_THROW_ERROR(NotImplementedError, "out_dtype must be one of fp8/fp16/bf16."); + } + return out; +} + +} // namespace + +at::Tensor fp4_fp8_gemm_trtllmgen(at::Tensor const& mat1, at::Tensor const& mat2, at::Tensor const& mat2Scale, + at::Tensor const& globalScale, std::optional out_dtype) +{ + return fp4_fp8_gemm_impl(mat1, mat2, mat2Scale, globalScale, out_dtype); +} + +} // namespace torch_ext + +TORCH_LIBRARY_FRAGMENT(trtllm, m) +{ + m.def( + "fp4_fp8_gemm_trtllmgen(Tensor mat1, Tensor mat2, Tensor mat2Scale, Tensor globalScale, " + "ScalarType? 
out_dtype=None) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(trtllm, CUDA, m) +{ + m.impl("fp4_fp8_gemm_trtllmgen", &torch_ext::fp4_fp8_gemm_trtllmgen); +} diff --git a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp index b64f9b71110..d3b845bc2a2 100644 --- a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp @@ -182,7 +182,7 @@ torch::Tensor fp8_block_scale_gemm_blackwell(torch::Tensor const& mat1, torch::T float* outScalePtr = nullptr; // transposeMmaOutput is hardcoded for now - tensorrt_llm::kernels::TrtllmGenGemmRunnerOptions options = {.eltType = gemm::trtllm::gen::Dtype::E4m3, + tensorrt_llm::kernels::TrtllmGenGemmRunnerOptions options = {.eltTypeA = gemm::trtllm::gen::Dtype::E4m3, .outputType = gemm::trtllm::gen::Dtype::Bfloat16, .deepSeekFp8 = true, .transposeMmaOutput = true}; diff --git a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp index f0fd52868f8..5de525a6727 100644 --- a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp @@ -37,7 +37,7 @@ void runGemm(at::Tensor& out, at::Tensor const& mat1, at::Tensor const& mat2, at auto eltType = gemm::trtllm::gen::Dtype::E4m3; tensorrt_llm::kernels::TrtllmGenGemmRunnerOptions options - = {.eltType = eltType, .outputType = outDtype, .deepSeekFp8 = false, .transposeMmaOutput = lowLatencyKernel}; + = {.eltTypeA = eltType, .outputType = outDtype, .deepSeekFp8 = false, .transposeMmaOutput = lowLatencyKernel}; tensorrt_llm::kernels::TrtllmGenGemmRunner runner(options); diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index 686cca16ba6..a95519f22ce 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -1085,6 +1085,8 @@ def update_quant_config(self, 
new_quant_config: Optional[QuantConfig]): self.has_fp8_rowwise = self.quant_config.layer_quant_mode.has_fp8_rowwise( ) self.has_nvfp4 = self.quant_config.layer_quant_mode.has_nvfp4() + self.has_w4a8_nvfp4_fp8 = self.quant_config.layer_quant_mode.has_w4a8_nvfp4_fp8( + ) def get_local_layer_idx(self, metadata: TrtllmAttentionMetadata) -> int: if metadata.kv_cache_manager is None: @@ -1194,8 +1196,9 @@ def forward( # Use UINT8 as the container dtype for NVFP4. out_dtype = torch.uint8 elif (self.has_fp8_qdq or self.has_nvfp4 or self.has_fp8_block_wise - or self.has_fp8_rowwise) and (self.has_fp8_kv_cache - or self.has_fp4_kv_cache): + or self.has_fp8_rowwise + or self.has_w4a8_nvfp4_fp8) and (self.has_fp8_kv_cache + or self.has_fp4_kv_cache): # TODO(qijun): revisit fp8_context_fmha logic out_dtype = torch.float8_e4m3fn diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index abf703c11ce..994b04cf12a 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -315,7 +315,8 @@ def create_output(self, q: torch.Tensor): if self.attn_backend == "TRTLLM": has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales - or self.o_proj.has_fp8_rowwise) + or self.o_proj.has_fp8_rowwise + or self.o_proj.has_w4a8_nvfp4_fp8) if has_quant_scale and (self.attn.has_fp8_kv_cache or self.attn.has_fp4_kv_cache): out_dtype = torch.float8_e4m3fn @@ -356,7 +357,8 @@ def _attn_impl( out_scale_sf = None has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales - or self.o_proj.has_fp8_rowwise) + or self.o_proj.has_fp8_rowwise + or self.o_proj.has_w4a8_nvfp4_fp8) if has_quant_scale: out_scale = self.o_proj.inv_input_scale if self.o_proj.has_nvfp4 and self.support_nvfp4_output and enable_attn_nvfp4_output: diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py index 
d7c20fe8f04..f177c418850 100644 --- a/tensorrt_llm/_torch/modules/gated_mlp.py +++ b/tensorrt_llm/_torch/modules/gated_mlp.py @@ -100,7 +100,7 @@ def __init__(self, def _apply_activation(self, x): if self.activation == F.silu: - if self.down_proj.has_fp8_qdq: + if self.down_proj.has_fp8_qdq or self.down_proj.has_w4a8_nvfp4_fp8: return swiglu(x, quant_scale=self.down_proj.input_scale, quant_type=torch.float8_e4m3fn) diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index 40f8d9a9367..c91e4532ab4 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -899,6 +899,198 @@ def load_weights_fused_gate_up_linear(self, module: Linear, copy_weight(module.alpha, alpha) +class W4A8NVFP4FP8LinearMethod(LinearMethodBase): + + def create_weights(self, module: Linear, in_features: int, + out_features: int, bias: bool, dtype: torch.dtype): + module.epilogue_tile_m = 128 + module.scaling_vector_size = 32 + assert in_features % module.scaling_vector_size == 0, ( + f"in_features {in_features} must be divisible by scaling_vector_size {module.scaling_vector_size}" + ) + + # Quantized weights + module.weight = Parameter( + torch.empty([out_features, in_features // 2], + dtype=fp4_utils.float4_e2m1x2), + requires_grad=False, + ) + + # FP8 per-block scaling factors. dtype must be aligned with SF_DTYPE + # Padding is required. 
See computeSFSize in quantization.h + nrows = fp4_utils.pad_up(out_features, 128) + ncols = fp4_utils.pad_up(in_features // module.scaling_vector_size, 4) + module.weight_scale = Parameter(torch.empty( + [nrows * ncols], dtype=fp4_utils.float4_sf_dtype), + requires_grad=False) + + # amax_input / 448 + module.input_scale = Parameter(torch.empty([1], dtype=torch.float32), + requires_grad=False) + module.inv_input_scale = Parameter(torch.tensor(1., + dtype=torch.float32), + requires_grad=False) + # amax_weight / 448 + module.weight_scale_2 = Parameter(torch.empty([1], dtype=torch.float32), + requires_grad=False) + # (amax_input * amax_weight) / (448 * 448) + module.alpha = Parameter(torch.empty([1], dtype=torch.float32), + requires_grad=False) + + if bias: + module.bias = Parameter(torch.empty((out_features), dtype=dtype), + requires_grad=False) + else: + module.register_parameter("bias", None) + + def apply(self, module: Linear, input: torch.Tensor, + bias: Optional[torch.Tensor]): + alpha = module.alpha + if input.dtype != torch.float8_e4m3fn: + if module.input_scale is not None and not module.force_dynamic_quantization: + # Static quantization + fp8_input, _ = torch.ops.tensorrt_llm.static_quantize_e4m3_per_tensor( + input, module.input_scale) + else: + # Dynamic quantization + fp8_input, input_scale = torch.ops.tensorrt_llm.quantize_e4m3_per_tensor( + input) + alpha = module.weight_scale_2 * input_scale.to(torch.float32) + + else: + fp8_input = input + output = torch.ops.trtllm.fp4_fp8_gemm_trtllmgen( + fp8_input, module.weight, + module.weight_scale.view(dtype=torch.float8_e4m3fn), alpha, + module.dtype) + if bias is not None: + output = output + bias + return output + + def load_weight_scales( + self, + weights: List[Dict], + tp_size: int = 1, + tp_rank: int = 0, + tp_mode: Optional[TensorParallelMode] = None, + ): + # For concatenated weights (qkv_proj / up_gate_proj), the global scaling factors and input scaling factors should be shared. 
+ input_scale = None + weight_scale_2 = None + weight_scale = [] + + device = torch.device("cuda") + + for w in weights: + if "input_scale" in w: + if input_scale is None: + input_scale = w["input_scale"][...] + else: + assert input_scale == w["input_scale"][ + ...], "The input_scale should be same for all the weights" + if "weight_scale" in w: + ws = load_weight_shard(w["weight_scale"], + tp_size, + tp_rank, + tp_mode, + device=device).contiguous() + assert ws.dtype == torch.float8_e4m3fn + # The kernel we use will convert nvfp4 to e4m3 before matmul, + # so the range of the scale factor can only be [0,448/6]. + ws = (ws.to(torch.float32) / 6.0).to(torch.float8_e4m3fn) + weight_scale.append(ws.view(dtype=fp4_utils.float4_sf_dtype)) + if "weight_scale_2" in w: + if weight_scale_2 is None: + weight_scale_2 = w["weight_scale_2"][...] * 6.0 + else: + assert weight_scale_2 == w["weight_scale_2"][...] * 6.0, ( + f"The weight_scale_2 should be same for all the weights: {weight_scale_2} vs. {w['weight_scale_2']}*6" + ) + + # TODO: ModelOpt's o_proj.weight_scale_2 is bfloat16, which should be float32 + input_scale = input_scale.to(torch.float32) + weight_scale_2 = weight_scale_2.to(torch.float32) + alpha = input_scale * weight_scale_2 + return input_scale, weight_scale, weight_scale_2, alpha + + def load_weights_vanilla(self, module: Linear, weights: List[Dict]) -> None: + # FIXME: this depends on the kernel internals + load_weights_vanilla_helper( + module, weights, + lambda w: fp4_utils.shuffle_matrix_a(w, module.epilogue_tile_m)) + + input_scale, weight_scale, weight_scale_2, alpha = self.load_weight_scales( + weights, + tp_size=module.tp_size, + tp_rank=module.tp_rank, + tp_mode=module.tp_mode) + + assert len(weights) == 1 + weight_scale = weight_scale[0] + # Shuffle and Swizzle weight scale + weight_scale = fp4_utils.shuffle_matrix_sf_a(weight_scale, + module.epilogue_tile_m, + module.scaling_vector_size) + copy_weight(module.input_scale, input_scale) + 
copy_weight(module.inv_input_scale, 1.0 / input_scale) + copy_weight(module.weight_scale, weight_scale) + copy_weight(module.weight_scale_2, weight_scale_2) + copy_weight(module.alpha, alpha) + + def load_weights_fused_qkv_linear(self, module: Linear, + weights: List[Dict]) -> None: + q_weight, k_weight, v_weight = load_weights_fused_qkv_helper( + module, weights) + + input_scale, weight_scales, weight_scale_2, alpha = self.load_weight_scales( + weights, + tp_size=module.tp_size, + tp_rank=module.tp_rank, + tp_mode=module.tp_mode) + # Swizzle weight scales after concatenation + weight_scale = torch.cat(weight_scales, 0) + # Shuffle and Swizzle weight scale + weight_scale = fp4_utils.shuffle_matrix_sf_a(weight_scale, + module.epilogue_tile_m, + module.scaling_vector_size) + copy_weight(module.input_scale, input_scale) + copy_weight(module.inv_input_scale, 1.0 / input_scale) + copy_weight(module.weight_scale, weight_scale) + copy_weight(module.weight_scale_2, weight_scale_2) + copy_weight(module.alpha, alpha) + + fused_weight = torch.cat((q_weight, k_weight, v_weight)) + fused_weight = fp4_utils.shuffle_matrix_a(fused_weight, + module.epilogue_tile_m) + copy_weight(module.weight, fused_weight) + + def load_weights_fused_gate_up_linear(self, module: Linear, + weights: List[Dict]) -> None: + gate_weight, up_weight = load_weights_fused_gate_up_helper( + module, weights) + fused_weight = torch.cat((gate_weight, up_weight)) + fused_weight = fp4_utils.shuffle_matrix_a(fused_weight, + module.epilogue_tile_m) + copy_weight(module.weight, fused_weight) + + input_scale, weight_scales, weight_scale_2, alpha = self.load_weight_scales( + weights, + tp_size=module.tp_size, + tp_rank=module.tp_rank, + tp_mode=module.tp_mode) + # Swizzle weight scales after concatenation + weight_scale = torch.cat(weight_scales, 0) + # Shuffle and Swizzle weight scale + weight_scale = fp4_utils.shuffle_matrix_sf_a(weight_scale, + module.epilogue_tile_m, + module.scaling_vector_size) + 
copy_weight(module.input_scale, input_scale) + copy_weight(module.inv_input_scale, 1.0 / input_scale) + copy_weight(module.weight_scale, weight_scale) + copy_weight(module.weight_scale_2, weight_scale_2) + copy_weight(module.alpha, alpha) + + class W4A8MXFP4FP8LinearMethod(LinearMethodBase): def create_weights(self, module: Linear, in_features: int, @@ -1559,6 +1751,8 @@ def get_quant_method(quant_config: Optional[QuantConfig] = None): return FP8BlockScalesLinearMethod() if quant_config.layer_quant_mode.has_nvfp4(): return NVFP4LinearMethod() + if quant_config.layer_quant_mode.has_w4a8_nvfp4_fp8(): + return W4A8NVFP4FP8LinearMethod() if quant_config.layer_quant_mode.has_w4a8_mxfp4_fp8(): return W4A8MXFP4FP8LinearMethod() if quant_config.layer_quant_mode.is_weight_only( @@ -1713,6 +1907,12 @@ def has_w4a8_awq(self): return self.quant_config is not None and self.quant_config.layer_quant_mode.is_int4_weight_only_per_group( ) and self.quant_config.quant_algo == QuantAlgo.W4A8_AWQ + @property + def has_w4a8_nvfp4_fp8(self): + assert self._weights_created + return self.quant_config is not None and self.quant_config.layer_quant_mode.has_w4a8_nvfp4_fp8( + ) + @property def has_w4a8_mxfp4_fp8(self): assert self._weights_created diff --git a/tensorrt_llm/quantization/mode.py b/tensorrt_llm/quantization/mode.py index a8b38d885f7..4615bc1376f 100644 --- a/tensorrt_llm/quantization/mode.py +++ b/tensorrt_llm/quantization/mode.py @@ -40,6 +40,7 @@ class QuantAlgo(StrEnum, metaclass=BaseEnumMeta): INT8 = auto() MIXED_PRECISION = auto() NVFP4 = auto() + W4A8_NVFP4_FP8 = auto() W4A8_MXFP4_FP8 = auto() W4A8_MXFP4_MXFP8 = auto() W4A16_MXFP4 = auto() @@ -90,6 +91,8 @@ class QuantMode(IntFlag): # FP4 NVFP4 = auto() NVFP4_KV_CACHE = auto() + # W4A8 NVFP4 + W4A8_NVFP4_FP8 = auto() # W4A8 MXFP4 W4A8_MXFP4_FP8 = auto() W4A8_MXFP4_MXFP8 = auto() @@ -179,6 +182,9 @@ def has_fp8_rowwise(self): def has_nvfp4(self): return self._any(self.NVFP4) + def has_w4a8_nvfp4_fp8(self): + return 
self._any(self.W4A8_NVFP4_FP8) + def has_w4a8_mxfp4_fp8(self): return self._any(self.W4A8_MXFP4_FP8) @@ -203,6 +209,7 @@ def has_any_quant(self, exclude_kv_cache: bool = False): | self.W4A8_QSERVE | self.FP8_1x128_128x128 | self.NVFP4 + | self.W4A8_NVFP4_FP8 | self.W4A8_MXFP4_FP8 | self.W4A16_MXFP4 | self.W4A8_MXFP4_MXFP8) @@ -240,6 +247,7 @@ def from_description(quantize_weights=False, use_fp8_block_scales=False, use_fp8_rowwise=False, use_nvfp4=False, + use_w4a8_nvfp4_fp8=False, use_w4a8_qserve=False, use_w4a8_mxfp4_fp8=False, use_w4a8_mxfp4_mxfp8=False, @@ -313,6 +321,9 @@ def raise_error(): if use_nvfp4: mode = mode | QuantMode.NVFP4 + if use_w4a8_nvfp4_fp8: + mode = mode | QuantMode.W4A8_NVFP4_FP8 + # W4A8 QServe if use_w4a8_qserve: mode = mode | QuantMode.W4A8_QSERVE @@ -399,6 +410,8 @@ def from_quant_algo( quant_mode = QuantMode.from_description(use_fp8_block_scales=True) elif quant_algo == QuantAlgo.NVFP4: quant_mode = QuantMode.from_description(use_nvfp4=True) + elif quant_algo == QuantAlgo.W4A8_NVFP4_FP8: + quant_mode = QuantMode.from_description(use_w4a8_nvfp4_fp8=True) elif quant_algo == QuantAlgo.W4A8_MXFP4_FP8: quant_mode = QuantMode.from_description(use_w4a8_mxfp4_fp8=True) elif quant_algo == QuantAlgo.W4A8_MXFP4_MXFP8: @@ -437,6 +450,8 @@ def to_dict(self): self.has_fp8_block_scales(), 'enable_nvfp4': self.has_nvfp4(), + 'enable_w4a8_nvfp4_fp8': + self.has_w4a8_nvfp4_fp8(), 'enable_w4a8_mxfp4_fp8': self.has_w4a8_mxfp4_fp8(), 'enable_w4a8_mxfp4_mxfp8': diff --git a/tests/unittest/_torch/thop/parallel/test_custom_ops.py b/tests/unittest/_torch/thop/parallel/test_custom_ops.py index f14c1d60201..0743b13ec8d 100644 --- a/tests/unittest/_torch/thop/parallel/test_custom_ops.py +++ b/tests/unittest/_torch/thop/parallel/test_custom_ops.py @@ -80,6 +80,7 @@ def test_register_fake(custom_ops): "trtllm::fp4_batched_quantize", "trtllm::fp4_gemm_trtllmgen", "trtllm::fp4_bmm", + "trtllm::fp4_fp8_gemm_trtllmgen", "trtllm::cuda_scaled_mm", 
"trtllm::initialize_static_lowprecision_buffers", "trtllm::cutlass_scaled_mm", diff --git a/tests/unittest/_torch/thop/parallel/test_fp4_gemm_quantize.py b/tests/unittest/_torch/thop/parallel/test_fp4_gemm_quantize.py index f1faf281095..0790ed339e8 100644 --- a/tests/unittest/_torch/thop/parallel/test_fp4_gemm_quantize.py +++ b/tests/unittest/_torch/thop/parallel/test_fp4_gemm_quantize.py @@ -127,6 +127,49 @@ def test_fp4_quantize_gemm_trtllmgen(self, m, n, k): c_pt = torch.nn.functional.linear(a_pt, b_pt) self.assertTrue(torch.allclose(c_pt, c, atol=1e-2, rtol=1e-2)) + @parameterized.expand( + list([ + [1024, 1024, 1024], + [128, 8, 256], + ]), + name_func=unittest_name_func, + ) + @skip_pre_blackwell_unittest + @skip_blackwell_geforce + def test_fp4_fp8_gemm_trtllmgen(self, m, n, k): + a = torch.randn([m, k], dtype=torch.float32) + b = torch.randn([n, k], dtype=torch.float32) + b_fp8, b_global_sf = torch.ops.tensorrt_llm.quantize_e4m3_per_tensor( + b.cuda()) + b_fp8 = b_fp8.view(torch.float8_e4m3fn) + a_global_sf = 448.0 / a.abs().max().float() + + # FIXME: this depends on the kernel internals + epilogue_tile_m = 128 + sf_vec_size = 32 + + a_fp4, a_sf, rep_float = torch.ops.tensorrt_llm.float_to_e2m1_and_ufp8sf_scale( + a * a_global_sf, sf_vec_size, 1, False) + a_pt = e2m1_and_ufp8_scale_to_float_tensor_v2(a_fp4, a_sf, + 1.0 / a_global_sf, + sf_vec_size, 1, False) + b_pt = (b_fp8.to(torch.float32) * b_global_sf).cpu() + c_pt = torch.nn.functional.linear(b_pt, a_pt) + + a_fp4_shuffled = fp4_utils.shuffle_matrix_a(a_fp4, epilogue_tile_m) + # sf is swizzled as well. 
+ a_sf_shuffled = fp4_utils.shuffle_matrix_sf_a(a_sf.reshape( + (m, -1)), epilogue_tile_m, sf_vec_size) + + ab_global_sf = b_global_sf / a_global_sf + c = torch.ops.trtllm.fp4_fp8_gemm_trtllmgen( + b_fp8, a_fp4_shuffled.cuda(), + a_sf_shuffled.view(dtype=torch.float8_e4m3fn).cuda(), + ab_global_sf.cuda()) + torch.cuda.synchronize() + c = c.float().cpu() + self.assertTrue(torch.allclose(c_pt, c, atol=1e-2, rtol=1e-2)) + @parameterized.expand(list([[1024, 1024, torch.half, False, True], [2, 512, torch.bfloat16, False, True], [2, 512, torch.bfloat16, True, True], diff --git a/tests/unittest/trt/quantization/test_mode.py b/tests/unittest/trt/quantization/test_mode.py index d211a4a0ed7..f4538eaa4b8 100644 --- a/tests/unittest/trt/quantization/test_mode.py +++ b/tests/unittest/trt/quantization/test_mode.py @@ -44,7 +44,7 @@ def test_any(self): def test_count(self): # Make sure the COUNT value is as expected - change that test if you add a new flag. - self.assertEqual(QuantMode.COUNT.value, 1 << 17) + self.assertEqual(QuantMode.COUNT.value, 1 << 18) def test_from_description(self): # Test weight only.