diff --git a/tensilelite/Tensile/Contractions.py b/tensilelite/Tensile/Contractions.py index c2391f0b0d..05c52bc0f2 100644 --- a/tensilelite/Tensile/Contractions.py +++ b/tensilelite/Tensile/Contractions.py @@ -507,12 +507,13 @@ def CompoundPredicates(cls, state, problemType): if ('WorkGroupMappingXCC' in state) and ('WorkGroupMappingXCCGroup' in state): rv += [cls("WorkgroupMappingXCCCheck", value=[state['WorkGroupMappingXCC'], state['WorkGroupMappingXCCGroup']])] - # TODO- To improve the perf of these non-multiples cases if state['ProblemType']['SwizzleTensorA']: rv += [cls('SwizzleTensorA', value=state['ProblemType']['SwizzleTensorA'])] - rv += [cls("Free0SizeMultiple", index=0, value=state['MacroTile0'])] - rv += [cls("BoundSizeMultiple", index=-1, value=state['DepthU'])] + # TODO- (TT + DTVA) tail-loop is not working yet. + if state['ProblemType']['TransposeB']: + rv += [cls("BoundSizeMultiple", index=-1, value=state['DepthU'])] + # TODO- Will remove the size predicate once we have SWZ-B request if state['ProblemType']['SwizzleTensorB']: rv += [cls('SwizzleTensorB', value=state['ProblemType']['SwizzleTensorB'])] rv += [cls("Free1SizeMultiple", index=0, value=state['MacroTile1'])] diff --git a/tensilelite/Tensile/KernelWriter.py b/tensilelite/Tensile/KernelWriter.py index 1b9780dd06..3d06faaa9e 100644 --- a/tensilelite/Tensile/KernelWriter.py +++ b/tensilelite/Tensile/KernelWriter.py @@ -994,7 +994,7 @@ def checkLocalReadFIFOFull(currentMFMA, fifo, lrItems, numLR, numLREven): return numToBeIssued oneBufferScheduling = kernel["1LDSBuffer"] or kernel["DirectToLdsA"] or kernel["DirectToLdsB"] - + def hasDependency(lr: DSLoadInstruction, inst: Instruction) -> bool: lrDataReg = lr.dst @@ -2569,7 +2569,7 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): # last NLL or pack DTV case, no deep copy for pack # pack code for local prefetch is generated in noLoadLoopBody and used for DTV even deepCopyPack = pack - else: + else: # deepCopy packCode for 
OptNLL noLoadLoop deepCopyPack = fastdeepcopy(pack) module.add(self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=False, isNGLL=False, pack=deepCopyPack, NLLindex=NLLindex, NLLnum=NLLnum)) @@ -2636,15 +2636,21 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): kernel["tailLoopOpt"] == False) else 0 globalReadMode2nd = 2 if (((tensorParameters2nd["glvw"] * tensorParameters2nd["bpeGR"]) < 4) or \ kernel["tailLoopOpt"] == False) else 0 + + # if we have swizzled A or B, then size-K is already guarded, we don't have to use guarded-k GR again + hasSwizzled = tensorParametersA["isSwizzled"] or tensorParametersB["isSwizzled"] + globalReadMode1st = 0 if hasSwizzled else globalReadMode1st + globalReadMode2nd = 0 if hasSwizzled else globalReadMode2nd + module.addComment1("Update M0 for DTLDS") moduleTmp = self.directToLdsM0Update(kernel, 1, tensorParameters1st) module.add(replaceHolder(moduleTmp, 0)) - module.addComment1("global read %s"%tc1) + module.addComment1("Tail global read %s"%tc1) module.add(self.globalReadDo(kernel, globalReadMode1st, tensorParameters1st)) module.addComment1("Update M0 for DTLDS") moduleTmp = self.directToLdsM0Update(kernel, 1, tensorParameters2nd) module.add(replaceHolder(moduleTmp, 0)) - module.addComment1("global read %s"%tc2) + module.addComment1("Tail global read %s"%tc2) module.add(self.globalReadDo(kernel, globalReadMode2nd, tensorParameters2nd)) if kernel["tailLoopOpt"] and \ (((tensorParameters1st["glvw"] * tensorParameters1st["bpeGR"]) >= 4) or \ @@ -2679,13 +2685,13 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): # tail: re-init local read addresses if kernel["PrefetchGlobalRead"]: - module.addComment1("local read reset offsets a") + module.addComment1("Tail: local read reset offsets a") module.add(self.localReadResetOffsets(kernel, tensorParametersA)) - module.addComment1("local read reset offsets b") + module.addComment1("Tail: local read reset offsets b") 
module.add(self.localReadResetOffsets(kernel, tensorParametersB)) - module.addComment1("local read init pointers a") + module.addComment1("Tail: local read init pointers a") module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersA)) - module.addComment1("local read init pointers b") + module.addComment1("Tail: local read init pointers b") module.add(self.localReadInitPointers(kernel, tensorParametersA, tensorParametersB)) if kernel["ProblemType"]["Sparse"] and not kernel["DirectToVgprSparseMetadata"]: module.addComment1("local read reset offsets metadata") @@ -2908,7 +2914,7 @@ def initKernel(self, kernel, tensorParametersA, tensorParametersB): self.states.asmCaps = self.ti.getAsmCaps() self.states.archCaps = self.ti.getArchCaps() self.states.regCaps = self.ti.getRegCaps() - + self.asmAssert = Assert(self.states.laneSGPRCount, kernel["WavefrontSize"], self.db["EnableAsserts"]) # Only assembly supports scheduling diff --git a/tensilelite/Tensile/KernelWriterAssembly.py b/tensilelite/Tensile/KernelWriterAssembly.py index 166232e8e1..07da43ebc7 100644 --- a/tensilelite/Tensile/KernelWriterAssembly.py +++ b/tensilelite/Tensile/KernelWriterAssembly.py @@ -5328,11 +5328,13 @@ def mfmaIter(self, kernel, tPA, tPB, u, innerUnroll, vregSetIdx, unrollLoopIdx = if kernel["LocalSplitU"] > 1: shiftK.add(SMinI32(dst=sgpr(loopCntSgpr), src0=sgpr(loopCounterName), src1=sgpr("LSUTailLoopOffset"), comment="check lsu bound")) shiftK.add(VCmpGEI32(dst=sgpr(tmpSgprX2, self.states.laneSGPRCount), src0=vgpr(kReg), src1=sgpr(loopCntSgpr), comment="check K index >= Size L")) - for bk in range(0, vgprPerSet0Group): - for a in range(0, kernel["MIWaveTileA"]): - for iui in range(0, innerUnroll): - aStr = vgpr(self.generateSrcStrForMFMA(kernel, tPA, innerUnroll, vregSetIdx, vgprPerInputA, m, u, iui, a, bk=bk + group * vgprPerSet0Group), 1) - shiftK.add(VCndMaskB32(dst=aStr, src0=aStr, src1=hex(0), src2=sgpr(tmpSgprX2, self.states.laneSGPRCount), comment="set 0 if K_idx 
>= sizeL")) if kernel["ProblemType"]["Sparse"] == 2 and numMIInput//8 >= 1: shiftK.add(vectorStaticRemainder(dummy, kReg, "Serial", kernel["WavefrontSize"], tmpVgpr, tmpSgprInfo)) diff --git a/tensilelite/Tensile/SolutionStructs.py b/tensilelite/Tensile/SolutionStructs.py index 5f8949b470..2d8fb2c7dd 100644 --- a/tensilelite/Tensile/SolutionStructs.py +++ b/tensilelite/Tensile/SolutionStructs.py @@ -2776,16 +2776,17 @@ def calcLdsNumBytes(ldsPadA: int, LdsBlockSizePerPadA: int, ldsPadB: int, LdsBlo reject(state, f"SwizzleTensor{tc} requires VectorWidth{tc} ({VW_TC}) == 1") if state["ProblemType"]["SwizzleTensorA"]: - if state["ProblemType"]["TransposeA"] is False: - reject(state, f"Tensor A swizzling supports TN or TT only") - if state["DirectToVgprA"] is False: + if not state["DirectToVgprA"]: reject(state, f"Tensor A swizzling requires DirectToVgprA") + if not state["ProblemType"]["TransposeA"]: + reject(state, f"Tensor A swizzling supports TN or TT only") if state["ProblemType"]["SwizzleTensorB"]: - if state["ProblemType"]["TransposeB"] is True: - reject(state, f"Tensor B swizzling supports NN or TN only") - if state["DirectToVgprB"] is False: + if not state["DirectToVgprB"]: reject(state, f"Tensor B swizzling requires DirectToVgprB") + # TODO- NN fails validation because DTVB + Tail-Loop is not working correctly + if not (state["ProblemType"]["TransposeA"] and not state["ProblemType"]["TransposeB"]): + reject(state, f"Tensor B swizzling supports TN only") def
calcOptGRVW(lrvw: int, unrollMajorLDS: bool, datatype: DataType) -> int: # with UnrollMajorLDS, GRVW need to less or equal than LRVW to have conflict free LDS read with padding. diff --git a/tensilelite/Tensile/Tests/common/gemm/dtvA_swizzleA.yaml b/tensilelite/Tensile/Tests/common/gemm/dtvA_swizzleA.yaml index 2075074364..815ee20911 100644 --- a/tensilelite/Tensile/Tests/common/gemm/dtvA_swizzleA.yaml +++ b/tensilelite/Tensile/Tests/common/gemm/dtvA_swizzleA.yaml @@ -11,6 +11,8 @@ GlobalParameters: MergeFiles: False KernelTime: True MaxWorkspaceSize: 13421772800 + DataInitTypeA: 13 + DataInitTypeB: 13 DataInitTypeAlpha: 1 DataInitTypeBeta: 1 BoundsCheck: 2 @@ -55,9 +57,11 @@ BenchmarkProblems: - [16, 16, 16, 1, 1, 4, 16, 4,1 ] # MT = 256x256 - [16, 16, 16, 1, 1, 8, 8, 4,1 ] # MT = 512x128 - [16, 16, 16, 1, 1, 8, 16, 4,1 ] # MT = 512x256 - - [16, 16, 16, 1, 1, 5, 8, 2, 1 ] # MT = 160x128 + - [16, 16, 16, 1, 1, 5, 8, 2,1 ] # MT = 160x128 + - AssertFree0ElementMultiple: [16] + - AssertSummationElementMultiple: [32] - GlobalReadVectorWidthA: [8] - - GlobalReadVectorWidthB: [2,4,8] + - GlobalReadVectorWidthB: [-1] - PrefetchGlobalRead: [1,2] - PrefetchLocalRead: [1,2,4] - ClusterLocalRead: [1] @@ -68,7 +72,7 @@ BenchmarkProblems: - LocalWritePerMfma: [-1] - StaggerU: [4] - StaggerUStride: [256] - - StaggerUMapping: [2] + - StaggerUMapping: [0] - WorkGroupMappingXCC: [8] - ScheduleIterAlg: [3] - LdsBlockSizePerPadA: [-1] @@ -87,9 +91,18 @@ BenchmarkProblems: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: + - Exact: [160, 256, 1, 224] - Exact: [160, 256, 1, 256] - - Exact: [1600, 512, 1, 1024] + - Exact: [160, 256, 1, 288] + + # Enable the big sizes when GSU3 + MBSK is ready. + # - Exact: [1600, 512, 1, 992] + # - Exact: [1600, 512, 1, 1024] + # - Exact: [1600, 512, 1, 1056] + + - Exact: [512, 256, 1, 224] - Exact: [512, 256, 1, 256] + - Exact: [512, 256, 1, 288] - BiasTypeArgs: ['h'] - ActivationArgs: - [Enum: none]