From 07252765b9a3a8d41491f28f2f715c676b3bdd6b Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Fri, 22 Nov 2024 16:09:01 +0800 Subject: [PATCH 01/32] feat(backend): NewDispatch --- src/main/scala/xiangshan/Parameters.scala | 26 +- .../scala/xiangshan/backend/Backend.scala | 48 +- .../scala/xiangshan/backend/CtrlBlock.scala | 120 ++- .../backend/dispatch/NewDispatch.scala | 868 ++++++++++++++++++ .../xiangshan/backend/exu/ExeUnitParams.scala | 2 + .../backend/issue/IssueBlockParams.scala | 8 +- .../xiangshan/backend/issue/Scheduler.scala | 230 +---- .../backend/regcache/RegCacheTagTable.scala | 21 +- .../xiangshan/backend/rename/BusyTable.scala | 54 +- .../xiangshan/backend/rename/Rename.scala | 9 +- .../scala/xiangshan/backend/rob/Rob.scala | 2 +- .../xiangshan/mem/lsqueue/LSQWrapper.scala | 4 +- .../backend/dispatch/Dispatch2IqMain.scala | 20 - 13 files changed, 1070 insertions(+), 342 deletions(-) create mode 100644 src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala delete mode 100644 src/test/scala/xiangshan/backend/dispatch/Dispatch2IqMain.scala diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala index 586fe9d52b0..19821cb841b 100644 --- a/src/main/scala/xiangshan/Parameters.scala +++ b/src/main/scala/xiangshan/Parameters.scala @@ -483,31 +483,31 @@ case class XSCoreParameters SchdBlockParams(Seq( IssueBlockParams(Seq( ExeUnitParams("STA0", Seq(StaCfg, MouCfg), Seq(FakeIntWB()), Seq(Seq(IntRD(7, 2)))), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("STA1", Seq(StaCfg, MouCfg), Seq(FakeIntWB()), Seq(Seq(IntRD(6, 2)))), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("LDU0", Seq(LduCfg), Seq(IntWB(5, 0), FpWB(5, 0)), Seq(Seq(IntRD(8, 0))), true, 2), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("LDU1", Seq(LduCfg), Seq(IntWB(6, 0), FpWB(6, 0)), Seq(Seq(IntRD(9, 0))), true, 2), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("LDU2", Seq(LduCfg), Seq(IntWB(7, 0), FpWB(7, 0)), Seq(Seq(IntRD(10, 0))), true, 2), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("VLSU0", Seq(VlduCfg, VstuCfg, VseglduSeg, VsegstuCfg), Seq(VfWB(4, 0), V0WB(4, 0), VlWB(port = 2, 0)), Seq(Seq(VfRD(6, 0)), Seq(VfRD(7, 0)), Seq(VfRD(8, 0)), Seq(V0RD(2, 0)), Seq(VlRD(2, 0)))), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("VLSU1", Seq(VlduCfg, VstuCfg), Seq(VfWB(5, 0), V0WB(5, 0), VlWB(port = 3, 0)), Seq(Seq(VfRD(9, 0)), Seq(VfRD(10, 0)), Seq(VfRD(11, 0)), Seq(V0RD(3, 0)), Seq(VlRD(3, 0)))), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("STD0", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(5, 2), FpRD(12, 0)))), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("STD1", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(3, 2), FpRD(13, 0)))), - ), numEntries = 16, numEnq = 1, numComp = 15), + ), numEntries = 16, numEnq = 2, numComp = 14), ), numPregs = intPreg.numEntries max 
vfPreg.numEntries, numDeqOutside = 0, @@ -530,14 +530,6 @@ case class XSCoreParameters Seq("FEX0", "FEX1", "FEX2", "FEX3") -> Seq("FEX0", "FEX1", "FEX2", "FEX3", "FEX4", "FEX5") ), - WakeUpConfig( - Seq("FEX0", "FEX1", "FEX2", "FEX3") -> - Seq("STD0", "STD1") - ), -// WakeUpConfig( -// Seq("VFEX0", "VFEX1", "VFEX2", "VFEX3") -> -// Seq("VFEX0", "VFEX1", "VFEX2", "VFEX3") -// ), ).flatten } diff --git a/src/main/scala/xiangshan/backend/Backend.scala b/src/main/scala/xiangshan/backend/Backend.scala index 10146f0e7ad..6c5e9c4d1b0 100644 --- a/src/main/scala/xiangshan/backend/Backend.scala +++ b/src/main/scala/xiangshan/backend/Backend.scala @@ -242,8 +242,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame private val backendCriticalError = Wire(Bool()) - ctrlBlock.io.intIQValidNumVec := intScheduler.io.intIQValidNumVec - ctrlBlock.io.fpIQValidNumVec := fpScheduler.io.fpIQValidNumVec ctrlBlock.io.fromTop.hartId := io.fromTop.hartId ctrlBlock.io.frontend <> io.frontend ctrlBlock.io.fromCSR.toDecode := intExuBlock.io.csrToDecode.get @@ -253,6 +251,41 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame ctrlBlock.io.fromMem.violation <> io.mem.memoryViolation ctrlBlock.io.lqCanAccept := io.mem.lqCanAccept ctrlBlock.io.sqCanAccept := io.mem.sqCanAccept + + io.mem.lsqEnqIO <> ctrlBlock.io.toMem.lsqEnqIO + ctrlBlock.io.fromMemToDispatch.scommit := io.mem.sqDeq + ctrlBlock.io.fromMemToDispatch.lcommit := io.mem.lqDeq + ctrlBlock.io.fromMemToDispatch.sqDeqPtr := io.mem.sqDeqPtr + ctrlBlock.io.fromMemToDispatch.lqDeqPtr := io.mem.lqDeqPtr + ctrlBlock.io.fromMemToDispatch.sqCancelCnt := io.mem.sqCancelCnt + ctrlBlock.io.fromMemToDispatch.lqCancelCnt := io.mem.lqCancelCnt + ctrlBlock.io.toDispatch.wakeUpInt := intScheduler.io.toSchedulers.wakeupVec + ctrlBlock.io.toDispatch.wakeUpFp := fpScheduler.io.toSchedulers.wakeupVec + ctrlBlock.io.toDispatch.wakeUpVec := vfScheduler.io.toSchedulers.wakeupVec + ctrlBlock.io.toDispatch.wakeUpMem := memScheduler.io.toSchedulers.wakeupVec + ctrlBlock.io.toDispatch.IQValidNumVec := intScheduler.io.IQValidNumVec ++ fpScheduler.io.IQValidNumVec ++ vfScheduler.io.IQValidNumVec ++ memScheduler.io.IQValidNumVec + ctrlBlock.io.toDispatch.ldCancel := io.mem.ldCancel + ctrlBlock.io.toDispatch.og0Cancel := og0Cancel + ctrlBlock.io.toDispatch.wbPregsInt.zip(wbDataPath.io.toIntPreg).map(x => { + x._1.valid := x._2.wen && x._2.intWen + x._1.bits := x._2.addr + }) + ctrlBlock.io.toDispatch.wbPregsFp.zip(wbDataPath.io.toFpPreg).map(x => { + x._1.valid := x._2.wen && x._2.fpWen + x._1.bits := x._2.addr + }) + ctrlBlock.io.toDispatch.wbPregsVec.zip(wbDataPath.io.toVfPreg).map(x => { + x._1.valid := x._2.wen && x._2.vecWen + x._1.bits := x._2.addr + }) + ctrlBlock.io.toDispatch.wbPregsV0.zip(wbDataPath.io.toV0Preg).map(x => { + x._1.valid := x._2.wen && x._2.v0Wen + x._1.bits := x._2.addr + }) + ctrlBlock.io.toDispatch.wbPregsVl.zip(wbDataPath.io.toVlPreg).map(x => { + x._1.valid := x._2.wen && x._2.vlWen + x._1.bits := x._2.addr + }) ctrlBlock.io.csrCtrl <> intExuBlock.io.csrio.get.customCtrl ctrlBlock.io.robio.csr.intrBitSet := intExuBlock.io.csrio.get.interrupt ctrlBlock.io.robio.csr.trapTarget := intExuBlock.io.csrio.get.trapTarget @@ -264,14 +297,13 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame ctrlBlock.io.robio.debug_ls <> io.mem.debugLS ctrlBlock.io.debugEnqLsq.canAccept := io.mem.lsqEnqIO.canAccept ctrlBlock.io.debugEnqLsq.resp := io.mem.lsqEnqIO.resp - 
ctrlBlock.io.debugEnqLsq.req := memScheduler.io.memIO.get.lsqEnqIO.req - ctrlBlock.io.debugEnqLsq.needAlloc := memScheduler.io.memIO.get.lsqEnqIO.needAlloc - ctrlBlock.io.debugEnqLsq.iqAccept := memScheduler.io.memIO.get.lsqEnqIO.iqAccept + ctrlBlock.io.debugEnqLsq.req := ctrlBlock.io.toMem.lsqEnqIO.req + ctrlBlock.io.debugEnqLsq.needAlloc := ctrlBlock.io.toMem.lsqEnqIO.needAlloc + ctrlBlock.io.debugEnqLsq.iqAccept := ctrlBlock.io.toMem.lsqEnqIO.iqAccept ctrlBlock.io.fromVecExcpMod.busy := vecExcpMod.o.status.busy intScheduler.io.fromTop.hartId := io.fromTop.hartId intScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush - intScheduler.io.fromDispatch.allocPregs <> ctrlBlock.io.toIssueBlock.allocPregs intScheduler.io.fromDispatch.uops <> ctrlBlock.io.toIssueBlock.intUops intScheduler.io.intWriteBack := wbDataPath.io.toIntPreg intScheduler.io.fpWriteBack := 0.U.asTypeOf(intScheduler.io.fpWriteBack) @@ -291,7 +323,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame fpScheduler.io.fromTop.hartId := io.fromTop.hartId fpScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush - fpScheduler.io.fromDispatch.allocPregs <> ctrlBlock.io.toIssueBlock.allocPregs fpScheduler.io.fromDispatch.uops <> ctrlBlock.io.toIssueBlock.fpUops fpScheduler.io.intWriteBack := 0.U.asTypeOf(fpScheduler.io.intWriteBack) fpScheduler.io.fpWriteBack := wbDataPath.io.toFpPreg @@ -310,7 +341,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame memScheduler.io.fromTop.hartId := io.fromTop.hartId memScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush - memScheduler.io.fromDispatch.allocPregs <> ctrlBlock.io.toIssueBlock.allocPregs memScheduler.io.fromDispatch.uops <> ctrlBlock.io.toIssueBlock.memUops memScheduler.io.intWriteBack := wbDataPath.io.toIntPreg memScheduler.io.fpWriteBack := wbDataPath.io.toFpPreg @@ -350,7 +380,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame vfScheduler.io.fromTop.hartId := io.fromTop.hartId vfScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush - vfScheduler.io.fromDispatch.allocPregs <> ctrlBlock.io.toIssueBlock.allocPregs vfScheduler.io.fromDispatch.uops <> ctrlBlock.io.toIssueBlock.vfUops vfScheduler.io.intWriteBack := 0.U.asTypeOf(vfScheduler.io.intWriteBack) vfScheduler.io.fpWriteBack := 0.U.asTypeOf(vfScheduler.io.fpWriteBack) @@ -740,7 +769,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame ctrlBlock.io.robio.robHeadLsIssue := io.mem.issueUops.map(deq => deq.fire && deq.bits.uop.robIdx === ctrlBlock.io.robio.robDeqPtr).reduce(_ || _) // mem io - io.mem.lsqEnqIO <> memScheduler.io.memIO.get.lsqEnqIO io.mem.robLsqIO <> ctrlBlock.io.robio.lsq io.mem.storeDebugInfo <> ctrlBlock.io.robio.storeDebugInfo diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 5afe7e1d813..0666fbb2f68 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -24,18 +24,19 @@ import utility._ import utils._ import xiangshan.ExceptionNO._ import xiangshan._ -import xiangshan.backend.Bundles.{DecodedInst, DynInst, ExceptionInfo, ExuOutput, StaticInst, TrapInstInfo} +import xiangshan.backend.Bundles.{DecodedInst, DynInst, ExceptionInfo, ExuOutput, ExuVec, StaticInst, TrapInstInfo} import xiangshan.backend.ctrlblock.{DebugLSIO, DebugLsInfoBundle, LsTopdownInfo, MemCtrl, RedirectGenerator} -import 
xiangshan.backend.datapath.DataConfig.VAddrData +import xiangshan.backend.datapath.DataConfig.{FpData, IntData, V0Data, VAddrData, VecData, VlData} import xiangshan.backend.decode.{DecodeStage, FusionDecoder} import xiangshan.backend.dispatch.{CoreDispatchTopDownIO, Dispatch, DispatchQueue} +import xiangshan.backend.dispatch.NewDispatch import xiangshan.backend.fu.PFEvent import xiangshan.backend.fu.vector.Bundles.{VType, Vl} import xiangshan.backend.fu.wrapper.CSRToDecode import xiangshan.backend.rename.{Rename, RenameTableWrapper, SnapshotGenerator} import xiangshan.backend.rob.{Rob, RobCSRIO, RobCoreTopDownIO, RobDebugRollingIO, RobLsqIO, RobPtr} import xiangshan.frontend.{FtqPtr, FtqRead, Ftq_RF_Components} -import xiangshan.mem.{LqPtr, LsqEnqIO} +import xiangshan.mem.{LqPtr, LsqEnqIO, SqPtr} import xiangshan.backend.issue.{FpScheduler, IntScheduler, MemScheduler, VfScheduler} import xiangshan.backend.trace._ @@ -86,17 +87,12 @@ class CtrlBlockImp( val io = IO(new CtrlBlockIO()) + val dispatch = Module(new NewDispatch) val gpaMem = wrapper.gpaMem.module val decode = Module(new DecodeStage) val fusionDecoder = Module(new FusionDecoder) val rat = Module(new RenameTableWrapper) val rename = Module(new Rename) - val dispatch = Module(new Dispatch) - val intDq0 = Module(new DispatchQueue(dpParams.IntDqSize, RenameWidth, dpParams.IntDqDeqWidth/2, dqIndex = 0)) - val intDq1 = Module(new DispatchQueue(dpParams.IntDqSize, RenameWidth, dpParams.IntDqDeqWidth/2, dqIndex = 1)) - val fpDq = Module(new DispatchQueue(dpParams.FpDqSize, RenameWidth, dpParams.VecDqDeqWidth)) - val vecDq = Module(new DispatchQueue(dpParams.FpDqSize, RenameWidth, dpParams.VecDqDeqWidth)) - val lsDq = Module(new DispatchQueue(dpParams.LsDqSize, RenameWidth, dpParams.LsDqDeqWidth)) val redirectGen = Module(new RedirectGenerator) private def hasRen: Boolean = true private val pcMem = Module(new SyncDataModuleTemplate(new Ftq_RF_Components, FtqSize, numPcMemRead, 1, "BackendPC", hasRen = hasRen)) @@ -473,6 +469,8 @@ class CtrlBlockImp( decodePipeRename(i).ready := rename.io.in(i).ready rename.io.in(i).valid := decodePipeRename(i).valid && !fusionDecoder.io.clear(i) rename.io.in(i).bits := decodePipeRename(i).bits + dispatch.io.renameIn(i).valid := decodePipeRename(i).valid && !fusionDecoder.io.clear(i) && !decodePipeRename(i).bits.isMove + dispatch.io.renameIn(i).bits := decodePipeRename(i).bits } for (i <- 0 until RenameWidth - 1) { @@ -483,6 +481,7 @@ class CtrlBlockImp( decode.io.fusion(i) := fusionDecoder.io.out(i).valid && rename.io.out(i).fire when (fusionDecoder.io.out(i).valid) { fusionDecoder.io.out(i).bits.update(rename.io.in(i).bits) + fusionDecoder.io.out(i).bits.update(dispatch.io.renameIn(i).bits) // TODO: remove this dirty code for ftq update val sameFtqPtr = rename.io.in(i).bits.ftqPtr.value === rename.io.in(i + 1).bits.ftqPtr.value val ftqOffset0 = rename.io.in(i).bits.ftqOffset @@ -573,43 +572,38 @@ class CtrlBlockImp( // pipeline between rename and dispatch PipeGroupConnect(renameOut, dispatch.io.fromRename, s1_s3_redirect.valid, dispatch.io.toRenameAllFire, "renamePipeDispatch") - dispatch.io.intIQValidNumVec := io.intIQValidNumVec - dispatch.io.fpIQValidNumVec := io.fpIQValidNumVec - dispatch.io.fromIntDQ.intDQ0ValidDeq0Num := intDq0.io.validDeq0Num - dispatch.io.fromIntDQ.intDQ0ValidDeq1Num := intDq0.io.validDeq1Num - dispatch.io.fromIntDQ.intDQ1ValidDeq0Num := intDq1.io.validDeq0Num - dispatch.io.fromIntDQ.intDQ1ValidDeq1Num := intDq1.io.validDeq1Num - - dispatch.io.hartId := io.fromTop.hartId + 
dispatch.io.redirect := s1_s3_redirect dispatch.io.enqRob <> rob.io.enq dispatch.io.robHead := rob.io.debugRobHead dispatch.io.stallReason <> rename.io.stallReason.out dispatch.io.lqCanAccept := io.lqCanAccept dispatch.io.sqCanAccept := io.sqCanAccept + dispatch.io.fromMem.lcommit := io.fromMemToDispatch.lcommit + dispatch.io.fromMem.scommit := io.fromMemToDispatch.scommit + dispatch.io.fromMem.lqDeqPtr := io.fromMemToDispatch.lqDeqPtr + dispatch.io.fromMem.sqDeqPtr := io.fromMemToDispatch.sqDeqPtr + dispatch.io.fromMem.lqCancelCnt := io.fromMemToDispatch.lqCancelCnt + dispatch.io.fromMem.sqCancelCnt := io.fromMemToDispatch.sqCancelCnt + io.toMem.lsqEnqIO <> dispatch.io.toMem.lsqEnqIO + dispatch.io.wakeUpAll.wakeUpInt := io.toDispatch.wakeUpInt + dispatch.io.wakeUpAll.wakeUpFp := io.toDispatch.wakeUpFp + dispatch.io.wakeUpAll.wakeUpVec := io.toDispatch.wakeUpVec + dispatch.io.wakeUpAll.wakeUpMem := io.toDispatch.wakeUpMem + dispatch.io.IQValidNumVec := io.toDispatch.IQValidNumVec + dispatch.io.ldCancel := io.toDispatch.ldCancel + dispatch.io.og0Cancel := io.toDispatch.og0Cancel + dispatch.io.wbPregsInt := io.toDispatch.wbPregsInt + dispatch.io.wbPregsFp := io.toDispatch.wbPregsFp + dispatch.io.wbPregsVec := io.toDispatch.wbPregsVec + dispatch.io.wbPregsV0 := io.toDispatch.wbPregsV0 + dispatch.io.wbPregsVl := io.toDispatch.wbPregsVl dispatch.io.robHeadNotReady := rob.io.headNotReady dispatch.io.robFull := rob.io.robFull dispatch.io.singleStep := GatedValidRegNext(io.csrCtrl.singlestep) - intDq0.io.enq <> dispatch.io.toIntDq0 - intDq0.io.redirect <> s2_s4_redirect - intDq1.io.enq <> dispatch.io.toIntDq1 - intDq1.io.redirect <> s2_s4_redirect - - fpDq.io.enq <> dispatch.io.toFpDq - fpDq.io.redirect <> s2_s4_redirect - - vecDq.io.enq <> dispatch.io.toVecDq - vecDq.io.redirect <> s2_s4_redirect - - lsDq.io.enq <> dispatch.io.toLsDq - lsDq.io.redirect <> s2_s4_redirect - - io.toIssueBlock.intUops <> (intDq0.io.deq :++ intDq1.io.deq) - io.toIssueBlock.fpUops <> fpDq.io.deq - io.toIssueBlock.vfUops <> vecDq.io.deq - io.toIssueBlock.memUops <> lsDq.io.deq - io.toIssueBlock.allocPregs <> dispatch.io.allocPregs + val toIssueBlockUops = Seq(io.toIssueBlock.intUops, io.toIssueBlock.fpUops, io.toIssueBlock.vfUops, io.toIssueBlock.memUops).flatten + toIssueBlockUops.zip(dispatch.io.toIssueQueues).map(x => x._1 <> x._2) io.toIssueBlock.flush <> s2_s4_redirect pcMem.io.wen.head := GatedValidRegNext(io.frontend.fromFtq.pc_mem_wen) @@ -703,11 +697,11 @@ class CtrlBlockImp( io.debugRolling := rob.io.debugRolling io.perfInfo.ctrlInfo.robFull := GatedValidRegNext(rob.io.robFull) - io.perfInfo.ctrlInfo.intdqFull := GatedValidRegNext(intDq0.io.dqFull || intDq1.io.dqFull) - io.perfInfo.ctrlInfo.fpdqFull := GatedValidRegNext(vecDq.io.dqFull) - io.perfInfo.ctrlInfo.lsdqFull := GatedValidRegNext(lsDq.io.dqFull) + io.perfInfo.ctrlInfo.intdqFull := false.B + io.perfInfo.ctrlInfo.fpdqFull := false.B + io.perfInfo.ctrlInfo.lsdqFull := false.B - val perfEvents = Seq(decode, rename, dispatch, intDq0, intDq1, vecDq, lsDq, rob).flatMap(_.getPerfEvents) + val perfEvents = Seq(decode, rename, dispatch, rob).flatMap(_.getPerfEvents) generatePerfEvent() val criticalErrors = rob.getCriticalErrors @@ -728,11 +722,45 @@ class CtrlBlockIO()(implicit p: Parameters, params: BackendParams) extends XSBun } val toIssueBlock = new Bundle { val flush = ValidIO(new Redirect) - val allocPregs = Vec(RenameWidth, Output(new ResetPregStateReq)) - val intUops = Vec(dpParams.IntDqDeqWidth, DecoupledIO(new DynInst)) - val vfUops = 
Vec(dpParams.VecDqDeqWidth, DecoupledIO(new DynInst)) - val fpUops = Vec(dpParams.FpDqDeqWidth, DecoupledIO(new DynInst)) - val memUops = Vec(dpParams.LsDqDeqWidth, DecoupledIO(new DynInst)) + val intUopsNum = backendParams.intSchdParams.get.issueBlockParams.map(_.numEnq).sum + val fpUopsNum = backendParams.fpSchdParams.get.issueBlockParams.map(_.numEnq).sum + val vfUopsNum = backendParams.vfSchdParams.get.issueBlockParams.map(_.numEnq).sum + val memUopsNum = backendParams.memSchdParams.get.issueBlockParams.filter(x => x.StdCnt == 0).map(_.numEnq).sum + val intUops = Vec(intUopsNum, DecoupledIO(new DynInst)) + val fpUops = Vec(fpUopsNum, DecoupledIO(new DynInst)) + val vfUops = Vec(vfUopsNum, DecoupledIO(new DynInst)) + val memUops = Vec(memUopsNum, DecoupledIO(new DynInst)) + } + val fromMemToDispatch = new Bundle { + val lcommit = Input(UInt(log2Up(CommitWidth + 1).W)) + val scommit = Input(UInt(log2Ceil(EnsbufferWidth + 1).W)) // connected to `memBlock.io.sqDeq` instead of ROB + val lqDeqPtr = Input(new LqPtr) + val sqDeqPtr = Input(new SqPtr) + // from lsq + val lqCancelCnt = Input(UInt(log2Up(VirtualLoadQueueSize + 1).W)) + val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W)) + } + //toMem + val toMem = new Bundle { + val lsqEnqIO = Flipped(new LsqEnqIO) + } + val toDispatch = new Bundle { + val wakeUpInt = Flipped(backendParams.intSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpFp = Flipped(backendParams.fpSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpVec = Flipped(backendParams.vfSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpMem = Flipped(backendParams.memSchdParams.get.genIQWakeUpOutValidBundle) + val allIssueParams = backendParams.allIssueParams.filter(_.StdCnt == 0) + val allExuParams = allIssueParams.map(_.exuBlockParams).flatten + val exuNum = allExuParams.size + val maxIQSize = allIssueParams.map(_.numEntries).max + val IQValidNumVec = Vec(exuNum, Input(UInt(maxIQSize.U.getWidth.W))) + val og0Cancel = Input(ExuVec()) + val ldCancel = Vec(backendParams.LdExuCnt, Flipped(new LoadCancelIO)) + val wbPregsInt = Vec(backendParams.numPregWb(IntData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsFp = Vec(backendParams.numPregWb(FpData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsVec = Vec(backendParams.numPregWb(VecData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsV0 = Vec(backendParams.numPregWb(V0Data()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsVl = Vec(backendParams.numPregWb(VlData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) } val toDataPath = new Bundle { val flush = ValidIO(new Redirect) @@ -743,8 +771,6 @@ class CtrlBlockIO()(implicit p: Parameters, params: BackendParams) extends XSBun val toCSR = new Bundle { val trapInstInfo = Output(ValidIO(new TrapInstInfo)) } - val intIQValidNumVec = Input(MixedVec(params.genIntIQValidNumBundle)) - val fpIQValidNumVec = Input(MixedVec(params.genFpIQValidNumBundle)) val fromWB = new Bundle { val wbData = Flipped(MixedVec(params.genWrite2CtrlBundles)) } diff --git a/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala new file mode 100644 index 00000000000..fbcf99eb89c --- /dev/null +++ b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala @@ -0,0 +1,868 @@ +/*************************************************************************************** +* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences +* Copyright (c) 2020-2021 Peng Cheng Laboratory 
+* +* XiangShan is licensed under Mulan PSL v2. +* You can use this software according to the terms and conditions of the Mulan PSL v2. +* You may obtain a copy of Mulan PSL v2 at: +* http://license.coscl.org.cn/MulanPSL2 +* +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +* +* See the Mulan PSL v2 for more details. +***************************************************************************************/ + +package xiangshan.backend.dispatch + +import org.chipsalliance.cde.config.Parameters +import chisel3._ +import chisel3.util._ +import utility._ +import xiangshan.ExceptionNO._ +import xiangshan._ +import xiangshan.backend.MemCoreTopDownIO +import xiangshan.backend.rob.{RobDispatchTopDownIO, RobEnqIO} +import xiangshan.mem.mdp._ +import xiangshan.mem.{HasVLSUParameters, _} +import xiangshan.backend.Bundles.{DecodedInst, DynInst, ExuOH, ExuVec, IssueQueueIQWakeUpBundle} +import xiangshan.backend.fu.{FuConfig, FuType} +import xiangshan.backend.rename.BusyTable +import chisel3.util.experimental.decode._ +import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} +import xiangshan.backend.fu.{FuConfig, FuType} +import xiangshan.backend.rename.BusyTableReadIO +import xiangshan.backend.datapath.DataConfig._ +import xiangshan.backend.datapath.WbConfig._ +import xiangshan.backend.datapath.DataSource +import xiangshan.backend.datapath.WbConfig.VfWB +import xiangshan.backend.fu.FuType.FuTypeOrR +import xiangshan.backend.dispatch.Dispatch2IqFpImp +import xiangshan.backend.regcache.{RCTagTableReadPort, RegCacheTagTable} + + +// TODO delete trigger message from frontend to iq +class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents with HasVLSUParameters { + // std IQ donot need dispatch, only copy sta IQ, but need sta IQ's ready && std IQ's ready + val allIssueParams = backendParams.allIssueParams.filter(_.StdCnt == 0) + val allExuParams = allIssueParams.map(_.exuBlockParams).flatten + val allFuConfigs = allExuParams.map(_.fuConfigs).flatten.toSet.toSeq + val sortedFuConfigs = allFuConfigs.sortBy(_.fuType.id) + println(s"[NewDispatch] ${allExuParams.map(_.name)}") + println(s"[NewDispatch] ${allFuConfigs.map(_.name)}") + println(s"[NewDispatch] ${allFuConfigs.map(_.fuType.id)}") + println(s"[NewDispatch] ${sortedFuConfigs.map(_.name)}") + println(s"[NewDispatch] ${sortedFuConfigs.map(_.fuType.id)}") + val fuConfigsInIssueParams = allIssueParams.map(_.allExuParams.map(_.fuConfigs).flatten.toSet.toSeq) + val fuMapIQIdx = sortedFuConfigs.map( fu => { + val fuInIQIdx = fuConfigsInIssueParams.zipWithIndex.filter { case (f, i) => f.contains(fu) }.map(_._2) + (fu -> fuInIQIdx) + } + ) + fuMapIQIdx.map { case (fu, iqidx) => + println(s"[NewDispatch] ${fu.name} $iqidx") + } + val sameIQIdxFus = fuMapIQIdx.map{ case (fu, iqidx) => + fuMapIQIdx.filter(_._2 == iqidx).map(_._1) -> iqidx + }.toSet.toSeq + val needMultiIQ = sameIQIdxFus.sortBy(_._1.head.fuType.id).filter(_._2.size > 1) + val needSingleIQ = sameIQIdxFus.sortBy(_._1.head.fuType.id).filter(_._2.size == 1) + needMultiIQ.map { case (fus, iqidx) => + println(s"[NewDispatch] needMultiIQ: ${fus.map(_.name)} $iqidx") + } + needSingleIQ.map { case (fus, iqidx) => + println(s"[NewDispatch] needSingleIQ: ${fus.map(_.name)} $iqidx") + } + val fuConfigsInExuParams = allExuParams.map(_.fuConfigs) + val fuMapExuIdx = sortedFuConfigs.map { case fu => { + val fuInExuIdx = 
fuConfigsInExuParams.zipWithIndex.filter { case (f, i) => f.contains(fu) }.map(_._2) + (fu -> fuInExuIdx) + } + } + val sameExuIdxFus = fuMapExuIdx.map { case (fu, exuidx) => + fuMapExuIdx.filter(_._2 == exuidx).map(_._1) -> exuidx + }.toSet.toSeq + val needMultiExu = sameExuIdxFus.sortBy(_._1.head.fuType.id).filter(_._2.size > 1).filter{ x => + x._1.map(y => fuMapIQIdx.filter(_._1 == y).head._2.size > 1).reduce(_ && _) + } + + val exuNum = allExuParams.size + val maxIQSize = allIssueParams.map(_.numEntries).max + val IQEnqSum = allIssueParams.map(_.numEnq).sum + + val io = IO(new Bundle { + // from rename + val renameIn = Vec(RenameWidth, Flipped(ValidIO(new DecodedInst))) + val fromRename = Vec(RenameWidth, Flipped(DecoupledIO(new DynInst))) + val toRenameAllFire = Output(Bool()) + // enq Rob + val enqRob = Flipped(new RobEnqIO) + // IssueQueues + val IQValidNumVec = Vec(exuNum, Input(UInt(maxIQSize.U.getWidth.W))) + val toIssueQueues = Vec(IQEnqSum, DecoupledIO(new DynInst)) + // to busyTable + // set preg state to ready (write back regfile) + val wbPregsInt = Vec(backendParams.numPregWb(IntData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsFp = Vec(backendParams.numPregWb(FpData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsVec = Vec(backendParams.numPregWb(VecData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsV0 = Vec(backendParams.numPregWb(V0Data()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wbPregsVl = Vec(backendParams.numPregWb(VlData()), Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val wakeUpAll = new Bundle { + val wakeUpInt: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(backendParams.intSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpFp: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(backendParams.fpSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpVec: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(backendParams.vfSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpMem: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(backendParams.memSchdParams.get.genIQWakeUpOutValidBundle) + } + val og0Cancel = Input(ExuVec()) + val ldCancel = Vec(backendParams.LdExuCnt, Flipped(new LoadCancelIO)) + // from MemBlock + val fromMem = new Bundle { + val lcommit = Input(UInt(log2Up(CommitWidth + 1).W)) + val scommit = Input(UInt(log2Ceil(EnsbufferWidth + 1).W)) // connected to `memBlock.io.sqDeq` instead of ROB + val lqDeqPtr = Input(new LqPtr) + val sqDeqPtr = Input(new SqPtr) + // from lsq + val lqCancelCnt = Input(UInt(log2Up(VirtualLoadQueueSize + 1).W)) + val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W)) + } + //toMem + val toMem = new Bundle { + val lsqEnqIO = Flipped(new LsqEnqIO) + } + // redirect + val redirect = Flipped(ValidIO(new Redirect)) + // singleStep + val singleStep = Input(Bool()) + // lfst + val lfst = new DispatchLFSTIO + + // perf only + val robHead = Input(new DynInst) + val stallReason = Flipped(new StallReasonIO(RenameWidth)) + val lqCanAccept = Input(Bool()) + val sqCanAccept = Input(Bool()) + val robHeadNotReady = Input(Bool()) + val robFull = Input(Bool()) + val debugTopDown = new Bundle { + val fromRob = Flipped(new RobDispatchTopDownIO) + val fromCore = new CoreDispatchTopDownIO + } + }) + // Deq for std's IQ is not assigned in Dispatch2Iq, so add one more src for it. 
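As an editorial aside (not part of the patch): the elaboration-time grouping above (fuMapIQIdx, sameIQIdxFus, needMultiIQ, needSingleIQ) is easiest to follow on a toy configuration. The sketch below is plain Scala with made-up FU names and issue-queue layouts; it only mirrors the collection operations, not the real XiangShan parameters.

object FuIqGroupingSketch extends App {
  // Toy layout (hypothetical): IQ0 and IQ1 both accept "alu"; "mul" only IQ0; "div" only IQ1; "ldu" only IQ2.
  val fuConfigsInIssueParams: Seq[Seq[String]] = Seq(Seq("alu", "mul"), Seq("alu", "div"), Seq("ldu"))
  val allFuConfigs: Seq[String] = fuConfigsInIssueParams.flatten.distinct

  // For every FU, the indices of the issue queues that can accept it.
  val fuMapIQIdx: Seq[(String, Seq[Int])] = allFuConfigs.map { fu =>
    fu -> fuConfigsInIssueParams.zipWithIndex.collect { case (fus, i) if fus.contains(fu) => i }
  }

  // FUs that share exactly the same IQ index set are dispatched as one group.
  val sameIQIdxFus = fuMapIQIdx.map { case (_, idx) =>
    fuMapIQIdx.filter(_._2 == idx).map(_._1) -> idx
  }.distinct

  // Groups reachable from more than one IQ need load balancing (minIQSel below); the rest target a fixed IQ.
  val (needMultiIQ, needSingleIQ) = sameIQIdxFus.partition(_._2.size > 1)
  println(s"multi-IQ groups : $needMultiIQ")  // List((List(alu),List(0, 1)))
  println(s"single-IQ groups: $needSingleIQ") // mul -> IQ0, div -> IQ1, ldu -> IQ2
}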
+ val issueBlockParams = backendParams.allIssueParams + val renameIn = io.renameIn + val fromRename = io.fromRename + io.toRenameAllFire := io.fromRename.map(x => !x.valid || x.fire).reduce(_ && _) + val fromRenameUpdate = Wire(Vec(RenameWidth, Flipped(ValidIO(new DynInst)))) + fromRenameUpdate := fromRename + val renameWidth = io.fromRename.size + val issueQueueCount = io.IQValidNumVec + val issueQueueNum = allIssueParams.size + // int fp vec v0 vl + val numRegType = 5 + val idxRegTypeInt = allFuConfigs.map(x => { + x.srcData.map(xx => { + xx.zipWithIndex.filter(y => IntRegSrcDataSet.contains(y._1)).map(_._2) + }).flatten + }).flatten.toSet.toSeq.sorted + val idxRegTypeFp = allFuConfigs.map(x => { + x.srcData.map(xx => { + xx.zipWithIndex.filter(y => FpRegSrcDataSet.contains(y._1)).map(_._2) + }).flatten + }).flatten.toSet.toSeq.sorted + val idxRegTypeVec = allFuConfigs.map(x => { + x.srcData.map(xx => { + xx.zipWithIndex.filter(y => VecRegSrcDataSet.contains(y._1)).map(_._2) + }).flatten + }).flatten.toSet.toSeq.sorted + val idxRegTypeV0 = allFuConfigs.map(x => { + x.srcData.map(xx => { + xx.zipWithIndex.filter(y => V0RegSrcDataSet.contains(y._1)).map(_._2) + }).flatten + }).flatten.toSet.toSeq.sorted + val idxRegTypeVl = allFuConfigs.map(x => { + x.srcData.map(xx => { + xx.zipWithIndex.filter(y => VlRegSrcDataSet.contains(y._1)).map(_._2) + }).flatten + }).flatten.toSet.toSeq.sorted + println(s"[NewDispatch] idxRegTypeInt: $idxRegTypeInt") + println(s"[NewDispatch] idxRegTypeFp: $idxRegTypeFp") + println(s"[NewDispatch] idxRegTypeVec: $idxRegTypeVec") + println(s"[NewDispatch] idxRegTypeV0: $idxRegTypeV0") + println(s"[NewDispatch] idxRegTypeVl: $idxRegTypeVl") + val numRegSrc: Int = issueBlockParams.map(_.exuBlockParams.map( + x => if (x.hasStdFu) x.numRegSrc + 1 else x.numRegSrc + ).max).max + + val numRegSrcInt: Int = issueBlockParams.map(_.exuBlockParams.map( + x => if (x.hasStdFu) x.numIntSrc + 1 else x.numIntSrc + ).max).max + val numRegSrcFp: Int = issueBlockParams.map(_.exuBlockParams.map( + x => if (x.hasStdFu) x.numFpSrc + 1 else x.numFpSrc + ).max).max + val numRegSrcVf: Int = issueBlockParams.map(_.exuBlockParams.map( + x => x.numVecSrc + ).max).max + val numRegSrcV0: Int = issueBlockParams.map(_.exuBlockParams.map( + x => x.numV0Src + ).max).max + val numRegSrcVl: Int = issueBlockParams.map(_.exuBlockParams.map( + x => x.numVlSrc + ).max).max + + println(s"[Dispatch2Iq] numRegSrc: ${numRegSrc}, numRegSrcInt: ${numRegSrcInt}, numRegSrcFp: ${numRegSrcFp}, " + + s"numRegSrcVf: ${numRegSrcVf}, numRegSrcV0: ${numRegSrcV0}, numRegSrcVl: ${numRegSrcVl}") + + // RegCacheTagTable Module + val rcTagTable = Module(new RegCacheTagTable(numRegSrcInt * renameWidth)) + // BusyTable Modules + val intBusyTable = Module(new BusyTable(numRegSrcInt * renameWidth, backendParams.numPregWb(IntData()), IntPhyRegs, IntWB())) + val fpBusyTable = Module(new BusyTable(numRegSrcFp * renameWidth, backendParams.numPregWb(FpData()), FpPhyRegs, FpWB())) + val vecBusyTable = Module(new BusyTable(numRegSrcVf * renameWidth, backendParams.numPregWb(VecData()), VfPhyRegs, VfWB())) + val v0BusyTable = Module(new BusyTable(numRegSrcV0 * renameWidth, backendParams.numPregWb(V0Data()), V0PhyRegs, V0WB())) + val vlBusyTable = Module(new BusyTable(numRegSrcVl * renameWidth, backendParams.numPregWb(VlData()), VlPhyRegs, VlWB())) + val busyTables = Seq(intBusyTable, fpBusyTable, vecBusyTable, v0BusyTable, vlBusyTable) + val wbPregs = Seq(io.wbPregsInt, io.wbPregsFp, io.wbPregsVec, io.wbPregsV0, io.wbPregsVl) + val 
idxRegType = Seq(idxRegTypeInt, idxRegTypeFp, idxRegTypeVec, idxRegTypeV0, idxRegTypeVl) + val allocPregsValid = Wire(Vec(busyTables.size, Vec(RenameWidth, Bool()))) + allocPregsValid(0) := VecInit(fromRename.map(x => x.valid && x.bits.rfWen && !x.bits.eliminatedMove)) + allocPregsValid(1) := VecInit(fromRename.map(x => x.valid && x.bits.fpWen)) + allocPregsValid(2) := VecInit(fromRename.map(x => x.valid && x.bits.vecWen)) + allocPregsValid(3) := VecInit(fromRename.map(x => x.valid && x.bits.v0Wen)) + allocPregsValid(4) := VecInit(fromRename.map(x => x.valid && x.bits.vlWen)) + val allocPregs = Wire(Vec(busyTables.size, Vec(RenameWidth, ValidIO(UInt(PhyRegIdxWidth.W))))) + allocPregs.zip(allocPregsValid).map(x =>{ + x._1.zip(x._2).zipWithIndex.map{case ((sink, source), i) => { + sink.valid := source + sink.bits := fromRename(i).bits.pdest + }} + }) + val wakeUp = io.wakeUpAll.wakeUpInt ++ io.wakeUpAll.wakeUpFp ++ io.wakeUpAll.wakeUpVec ++ io.wakeUpAll.wakeUpMem + busyTables.zip(wbPregs).zip(allocPregs).map{ case ((b, w), a) => { + b.io.wakeUpInt := io.wakeUpAll.wakeUpInt + b.io.wakeUpFp := io.wakeUpAll.wakeUpFp + b.io.wakeUpVec := io.wakeUpAll.wakeUpVec + b.io.wakeUpMem := io.wakeUpAll.wakeUpMem + b.io.og0Cancel := io.og0Cancel + b.io.ldCancel := io.ldCancel + b.io.wbPregs := w + b.io.allocPregs := a + }} + rcTagTable.io.allocPregs.zip(allocPregs(0)).map(x => x._1 := x._2) + rcTagTable.io.wakeupFromIQ := io.wakeUpAll.wakeUpInt ++ io.wakeUpAll.wakeUpMem + rcTagTable.io.og0Cancel := io.og0Cancel + rcTagTable.io.ldCancel := io.ldCancel + busyTables.zip(idxRegType).zipWithIndex.map { case ((b, idxseq), i) => { + val readAddr = VecInit(fromRename.map(x => x.bits.psrc.zipWithIndex.filter(xx => idxseq.contains(xx._2)).map(_._1)).flatten) + val readValid = VecInit(fromRename.map(x => x.bits.psrc.zipWithIndex.filter(xx => idxseq.contains(xx._2)).map(y => x.valid && SrcType.isXp(x.bits.srcType(y._2)))).flatten) + b.io.read.map(_.req).zip(readAddr).map(x => x._1 := x._2) + // only int src need srcLoadDependency, src0 src1 + if (i == 0) { + val srcLoadDependencyUpdate = fromRenameUpdate.map(x => x.bits.srcLoadDependency.zipWithIndex.filter(x => idxseq.contains(x._2)).map(_._1)).flatten + val srcType = fromRenameUpdate.map(x => x.bits.srcType.zipWithIndex.filter(x => idxseq.contains(x._2)).map(_._1)).flatten + // for std, int src need srcLoadDependency, fp src donot need srcLoadDependency + srcLoadDependencyUpdate.lazyZip(b.io.read.map(_.loadDependency)).lazyZip(srcType).map{ case (sink, source, srctype) => + sink := Mux(SrcType.isXp(srctype), source, 0.U.asTypeOf(sink)) + } + // only int src need rcTag + val rcTagUpdate = fromRenameUpdate.map(x => x.bits.regCacheIdx.zipWithIndex.filter(x => idxseq.contains(x._2)).map(_._1)).flatten + rcTagUpdate.zip(rcTagTable.io.readPorts.map(_.addr)).map(x => x._1 := x._2) + val useRegCacheUpdate = fromRenameUpdate.map(x => x.bits.useRegCache.zipWithIndex.filter(x => idxseq.contains(x._2)).map(_._1)).flatten + useRegCacheUpdate.zip(rcTagTable.io.readPorts.map(_.valid)).map(x => x._1 := x._2) + rcTagTable.io.readPorts.map(_.ren).zip(readValid).map(x => x._1 := x._2) + rcTagTable.io.readPorts.map(_.tag).zip(readAddr).map(x => x._1 := x._2) + } + }} + val allSrcState = Wire(Vec(renameWidth, Vec(numRegSrc, Vec(numRegType, Bool())))) + for (i <- 0 until renameWidth){ + for (j <- 0 until numRegSrc){ + for (k <- 0 until numRegType){ + if (!idxRegType(k).contains(j)) { + allSrcState(i)(j)(k) := false.B + } + else { + val readidx = i * idxRegType(k).size + 
idxRegType(k).indexOf(j) + val readEn = k match { + case 0 => SrcType.isXp(fromRename(i).bits.srcType(j)) + case 1 => SrcType.isFp(fromRename(i).bits.srcType(j)) + case 2 => SrcType.isVp(fromRename(i).bits.srcType(j)) + case 3 => SrcType.isV0(fromRename(i).bits.srcType(j)) + case 4 => true.B + } + allSrcState(i)(j)(k) := readEn && busyTables(k).io.read(readidx).resp || SrcType.isImm(fromRename(i).bits.srcType(j)) + } + } + } + } + + + + val minIQSelAll = Wire(Vec(needMultiExu.size, Vec(renameWidth, Vec(issueQueueNum, Bool())))) + needMultiExu.zipWithIndex.map{ case ((fus, exuidx), needMultiExuidx) => { + val suffix = fus.map(_.name).mkString("_") + val iqNum = exuidx.size + val iqidx = allIssueParams.map(_.exuBlockParams.map(_.fuConfigs).flatten.toSet.toSeq).zipWithIndex.filter{x => fus.toSet.subsetOf(x._1.toSet)}.map(_._2) + println(s"[NewDispatch] ${fus.map(_.name)};iqidx:$iqidx;exuIdx:$exuidx") + val compareMatrix = Wire(Vec(iqNum, Vec(iqNum, Bool()))).suggestName(s"compareMatrix_$suffix") + for (i <- 0 until iqNum) { + for (j <- 0 until iqNum) { + if (i == j) compareMatrix(i)(j) := false.B + else if (i < j) compareMatrix(i)(j) := issueQueueCount(exuidx(i)) < issueQueueCount(exuidx(j)) + else compareMatrix(i)(j) := !compareMatrix(j)(i) + } + } + val IQSort = Reg(Vec(iqNum, Vec(iqNum, Bool()))).suggestName(s"IQSort_$suffix}") + for (i <- 0 until iqNum){ + // i = 0 minimum iq, i = iqNum - 1 -> maximum iq + IQSort(i) := compareMatrix.map(x => PopCount(x) === (iqNum - 1 - i).U) + } + val minIQSel = Wire(Vec(renameWidth, Vec(issueQueueNum, Bool()))).suggestName(s"minIQSel_$suffix") + for (i <- 0 until renameWidth){ + val minIQSel_ith = IQSort(i % iqNum) + println(s"minIQSel_${i}th_$suffix = IQSort(${i % iqNum})") + for (j <- 0 until issueQueueNum){ + minIQSel(i)(j) := false.B + if (iqidx.contains(j)){ + minIQSel(i)(j) := minIQSel_ith(iqidx.indexOf(j)) + println(s"minIQSel_${suffix}_${i}_${j} = minIQSel_ith(iqidx.indexOf(${j}))") + } + } + } + minIQSelAll(needMultiExuidx) := minIQSel + if (backendParams.debugEn){ + dontTouch(compareMatrix) + dontTouch(IQSort) + dontTouch(minIQSel) + } + } + } + val fuConfigSeq = needMultiExu.map(_._1) + val fuTypeOH = Wire(Vec(renameWidth, Vec(needMultiExu.size, Bool()))) + fuTypeOH.zip(renameIn).map{ case(oh, in) => { + oh := fuConfigSeq.map(x => x.map(xx => in.bits.fuType(xx.fuType.id)).reduce(_ || _) && in.valid) + } + } + // not count itself + val popFuTypeOH = Wire(Vec(renameWidth, Vec(needMultiExu.size, UInt((renameWidth-1).U.getWidth.W)))) + popFuTypeOH.zipWithIndex.map{ case (pop, idx) => { + if (idx == 0){ + pop := 0.U.asTypeOf(pop) + } + else { + pop.zipWithIndex.map{ case (p, i) => { + p := PopCount(fuTypeOH.take(idx).map(x => x(i))) + } + } + } + }} + val uopSelIQ = Reg(Vec(renameWidth, Vec(issueQueueNum, Bool()))) + val fuTypeOHSingle = Wire(Vec(renameWidth, Vec(needSingleIQ.size, Bool()))) + fuTypeOHSingle.zip(renameIn).map{ case (oh, in) => { + oh := needSingleIQ.map(_._1).map(x => x.map(xx => in.valid && in.bits.fuType(xx.fuType.id)).reduce(_ || _)) + }} + val uopSelIQSingle = Wire(Vec(needSingleIQ.size, Vec(issueQueueNum, Bool()))) + uopSelIQSingle := VecInit(needSingleIQ.map(_._2).flatten.map(x => VecInit((1.U(issueQueueNum.W) << x)(issueQueueNum-1, 0).asBools))) + uopSelIQ.zipWithIndex.map{ case (u, i) => { + when(io.toRenameAllFire){ + u := Mux(renameIn(i).valid, + Mux(fuTypeOH(i).asUInt.orR, + Mux1H(fuTypeOH(i), minIQSelAll)(Mux1H(fuTypeOH(i), popFuTypeOH(i))), + Mux1H(fuTypeOHSingle(i), uopSelIQSingle)), + 0.U.asTypeOf(u) + ) + 
}.elsewhen(io.fromRename(i).fire){ + u := 0.U.asTypeOf(u) + } + }} + val uopSelIQMatrix = Wire(Vec(renameWidth, Vec(issueQueueNum, UInt(renameWidth.U.getWidth.W)))) + uopSelIQMatrix.zipWithIndex.map{ case (u, i) => { + u.zipWithIndex.map{ case (uu, j) => { + uu := PopCount(uopSelIQ.take(i+1).map(x => x.zipWithIndex.filter(_._2 == j).map(_._1)).flatten) + }} + }} + val IQSelUop = Wire(Vec(IQEnqSum, ValidIO(new DynInst))) + val uopBlockByIQ = Wire(Vec(renameWidth, Bool())) + val allowDispatch = Wire(Vec(renameWidth, Bool())) + val thisCanActualOut = Wire(Vec(renameWidth, Bool())) + val lsqCanAccept = Wire(Bool()) + for (i <- 0 until RenameWidth){ + // update valid logic + fromRenameUpdate(i).valid := fromRename(i).valid && allowDispatch(i) && !uopBlockByIQ(i) && thisCanActualOut(i) && lsqCanAccept && !fromRename(i).bits.eliminatedMove + fromRename(i).ready := allowDispatch(i) && !uopBlockByIQ(i) && thisCanActualOut(i) && lsqCanAccept + } + var temp = 0 + allIssueParams.zipWithIndex.map{ case(issue, iqidx) => { + for (i <- 0 until issue.numEnq){ + val oh = Wire(Vec(renameWidth, Bool())).suggestName(s"oh_IQSelUop_$temp") + oh := uopSelIQMatrix.map(_(iqidx)).map(_ === (i+1).U) + IQSelUop(temp) := PriorityMux(oh, fromRenameUpdate) + // there only assign valid not use PriorityMuxDefalut for better timing + IQSelUop(temp).valid := PriorityMuxDefault(oh.zip(fromRenameUpdate.map(_.valid)), false.B) + val allFuThisIQ = issue.exuBlockParams.map(_.fuConfigs).flatten.toSet.toSeq + val hasStaFu = !allFuThisIQ.filter(_.name == "sta").isEmpty + for (j <- 0 until numRegSrc){ + val maskForStd = hasStaFu && (j == 1) + val thisSrcHasInt = allFuThisIQ.map(x => {x.srcData.map(xx => {if (j < xx.size) IntRegSrcDataSet.contains(xx(j)) else false}).reduce(_ || _)}).reduce(_ || _) + val thisSrcHasFp = allFuThisIQ.map(x => {x.srcData.map(xx => {if (j < xx.size) FpRegSrcDataSet.contains(xx(j)) else false}).reduce(_ || _)}).reduce(_ || _) + val thisSrcHasVec = allFuThisIQ.map(x => {x.srcData.map(xx => {if (j < xx.size) VecRegSrcDataSet.contains(xx(j)) else false}).reduce(_ || _)}).reduce(_ || _) + val thisSrcHasV0 = allFuThisIQ.map(x => {x.srcData.map(xx => {if (j < xx.size) V0RegSrcDataSet.contains(xx(j)) else false}).reduce(_ || _)}).reduce(_ || _) + val thisSrcHasVl = allFuThisIQ.map(x => {x.srcData.map(xx => {if (j < xx.size) VlRegSrcDataSet.contains(xx(j)) else false}).reduce(_ || _)}).reduce(_ || _) + val selSrcState = Seq(thisSrcHasInt || maskForStd, thisSrcHasFp || maskForStd, thisSrcHasVec, thisSrcHasV0, thisSrcHasVl) + IQSelUop(temp).bits.srcState(j) := PriorityMux(oh, allSrcState)(j).zip(selSrcState).filter(_._2 == true).map(_._1).foldLeft(false.B)(_ || _).asUInt + } + temp = temp + 1 + if (backendParams.debugEn){ + dontTouch(oh) + } + } + }} + temp = 0 + val uopBlockMatrix = Wire(Vec(renameWidth, Vec(issueQueueNum, Bool()))) + val uopBlockMatrixForAssign = allIssueParams.zipWithIndex.map { case (issue, iqidx) => { + val result = uopSelIQMatrix.map(_(iqidx)).map(x => Mux(io.toIssueQueues(temp).ready, x > issue.numEnq.U, x.orR)) + temp = temp + issue.numEnq + result + }}.transpose + uopBlockMatrix.zip(uopBlockMatrixForAssign).map(x => x._1 := VecInit(x._2)) + uopBlockByIQ := uopBlockMatrix.map(_.reduce(_ || _)) + io.toIssueQueues.zip(IQSelUop).map(x => { + x._1.valid := x._2.valid + x._1.bits := x._2.bits + }) + if (backendParams.debugEn){ + dontTouch(uopSelIQMatrix) + dontTouch(IQSelUop) + dontTouch(fromRenameUpdate) + dontTouch(uopBlockByIQ) + dontTouch(allowDispatch) + dontTouch(thisCanActualOut) + 
dontTouch(popFuTypeOH) + dontTouch(fuTypeOH) + dontTouch(fuTypeOHSingle) + dontTouch(minIQSelAll) + } + /////////////////////////////////////////////////////////// + + val lsqEnqCtrl = Module(new LsqEnqCtrl) + + // TODO: check lsqEnqCtrl redirect logic + // here is RegNext because dispatch2iq use s2_s4_redirect, newDispatch use s1_s3_redirect + lsqEnqCtrl.io.redirect := RegNext(io.redirect) + lsqEnqCtrl.io.lcommit := io.fromMem.lcommit + lsqEnqCtrl.io.scommit := io.fromMem.scommit + lsqEnqCtrl.io.lqCancelCnt := io.fromMem.lqCancelCnt + lsqEnqCtrl.io.sqCancelCnt := io.fromMem.sqCancelCnt + lsqEnqCtrl.io.enq.iqAccept := io.fromRename.map(x => !x.valid || x.fire) + io.toMem.lsqEnqIO <> lsqEnqCtrl.io.enqLsq + + private val enqLsqIO = lsqEnqCtrl.io.enq + private val lqFreeCount = lsqEnqCtrl.io.lqFreeCount + private val sqFreeCount = lsqEnqCtrl.io.sqFreeCount + + private val numLoadDeq = LSQLdEnqWidth + private val numStoreAMODeq = LSQStEnqWidth + private val numVLoadDeq = LoadPipelineWidth + private val numDeq = enqLsqIO.req.size + lsqCanAccept := enqLsqIO.canAccept + + private val isLoadVec = VecInit(fromRename.map(x => x.valid && FuType.isLoad(x.bits.fuType))) + private val isStoreVec = VecInit(fromRename.map(x => x.valid && FuType.isStore(x.bits.fuType))) + private val isAMOVec = fromRename.map(x => x.valid && FuType.isAMO(x.bits.fuType)) + private val isStoreAMOVec = fromRename.map(x => x.valid && (FuType.isStore(x.bits.fuType) || FuType.isAMO(x.bits.fuType))) + private val isVLoadVec = VecInit(fromRename.map(x => x.valid && FuType.isVLoad(x.bits.fuType))) + private val isVStoreVec = VecInit(fromRename.map(x => x.valid && FuType.isVStore(x.bits.fuType))) + + private val loadCntVec = VecInit(isLoadVec.indices.map(x => PopCount(isLoadVec.slice(0, x + 1)))) + private val storeAMOCntVec = VecInit(isStoreAMOVec.indices.map(x => PopCount(isStoreAMOVec.slice(0, x + 1)))) + private val vloadCntVec = VecInit(isVLoadVec.indices.map(x => PopCount(isVLoadVec.slice(0, x + 1)))) + + private val s0_enqLsq_resp = Wire(enqLsqIO.resp.cloneType) + for (i <- 0 until RenameWidth) { + // update lqIdx sqIdx + fromRenameUpdate(i).bits.lqIdx := s0_enqLsq_resp(i).lqIdx + fromRenameUpdate(i).bits.sqIdx := s0_enqLsq_resp(i).sqIdx + } + + val loadBlockVec = VecInit(loadCntVec.map(_ > numLoadDeq.U)) + val storeAMOBlockVec = VecInit(storeAMOCntVec.map(_ > numStoreAMODeq.U)) + val vloadBlockVec = VecInit(vloadCntVec.map(_ > numVLoadDeq.U)) + val lsStructBlockVec = VecInit((loadBlockVec.zip(storeAMOBlockVec)).zip(vloadBlockVec).map(x => x._1._1 || x._1._2 || x._2)) + if (backendParams.debugEn) { + dontTouch(loadBlockVec) + dontTouch(storeAMOBlockVec) + dontTouch(lsStructBlockVec) + dontTouch(vloadBlockVec) + dontTouch(isLoadVec) + dontTouch(isVLoadVec) + dontTouch(loadCntVec) + } + + private val uop = fromRename.map(_.bits) + private val fuType = uop.map(_.fuType) + private val fuOpType = uop.map(_.fuOpType) + private val vtype = uop.map(_.vpu.vtype) + private val sew = vtype.map(_.vsew) + private val lmul = vtype.map(_.vlmul) + private val eew = uop.map(_.vpu.veew) + private val mop = fuOpType.map(fuOpTypeItem => LSUOpType.getVecLSMop(fuOpTypeItem)) + private val nf = fuOpType.zip(uop.map(_.vpu.nf)).map { case (fuOpTypeItem, nfItem) => Mux(LSUOpType.isWhole(fuOpTypeItem), 0.U, nfItem) } + private val emul = fuOpType.zipWithIndex.map { case (fuOpTypeItem, index) => + Mux( + LSUOpType.isWhole(fuOpTypeItem), + GenUSWholeEmul(nf(index)), + Mux( + LSUOpType.isMasked(fuOpTypeItem), + 0.U(mulBits.W), + EewLog2(eew(index)) - 
sew(index) + lmul(index)
+      )
+    )
+  }
+
+  private val isVlsType = fuType.map(fuTypeItem => FuType.isVls(fuTypeItem)).zip(fromRename.map(_.valid)).map(x => x._1 && x._2)
+  private val isSegment = fuType.map(fuTypeItem => FuType.isVsegls(fuTypeItem)).zip(fromRename.map(_.valid)).map(x => x._1 && x._2)
+  // TODO
+  private val isUnitStride = fuOpType.map(fuOpTypeItem => LSUOpType.isAllUS(fuOpTypeItem))
+  private val isVecUnitType = isVlsType.zip(isUnitStride).map { case (isVlsTypeItme, isUnitStrideItem) =>
+    isVlsTypeItme && isUnitStrideItem
+  }
+  private val isfofFixVlUop = uop.map { x => x.vpu.isVleff && x.lastUop }
+  private val instType = isSegment.zip(mop).map { case (isSegementItem, mopItem) => Cat(isSegementItem, mopItem) }
+  // There is no way to calculate the 'flow' of a 'unit-stride' access exactly here:
+  // whether a 'unit-stride' access needs to be split is only known once its address is available.
+  // Scalar instructions are not handled here; they are assigned differently later, depending on the situation.
+  private val numLsElem = VecInit(uop.map(_.numLsElem))
+
+  // The maximum 'numLsElem' that can be emitted per port is:
+  //    16  2  2  2  2  2.
+  // The 'allowDispatch' calculation is done conservatively for timing reasons:
+  //   the flow of a scalar instruction is counted as 1,
+  //   the flow of a vector 'unit-stride' instruction is counted as 2, and the flow of any other vector instruction as 16.
+  private val conserveFlows = isVlsType.zipWithIndex.map { case (isVlsTyepItem, index) =>
+    Mux(
+      isVlsTyepItem,
+      if (index == 0) Mux(isUnitStride(index), VecMemUnitStrideMaxFlowNum.U, 16.U) else VecMemUnitStrideMaxFlowNum.U,
+      1.U
+    )
+  }
+
+  // A conservative allocation strategy is adopted here.
+  // Vector 'unit-stride' instructions and scalar instructions can be issued from all six ports,
+  // while other vector instructions can only be issued from the first port.
+  // A segment instruction must be dispatched to Vldst_RS0, so every port except port 0 is stalled for it.
+  // The allocation must satisfy a few conditions (a small elaboration-time sketch of this flow
+  // accounting follows below):
+  //  1) The lsq has enough free entries.
+  //  2) The accumulated number of flows does not exceed VecMemDispatchMaxNumber.
+  //  3) Vector instructions other than 'unit-stride' can only be issued on the first port.
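To make the flow accounting above concrete, here is a minimal elaboration-time sketch (editor's illustration, not part of the patch). The instruction mix, the free-entry count and the assumed value of VecMemUnitStrideMaxFlowNum (2) are made up, and the sketch checks a single free-entry count where the real logic checks LQ and SQ separately, chains allowDispatch(i) with allowDispatch(i - 1), and restricts non-unit-stride vector accesses to port 0.

object ConservativeFlowSketch extends App {
  sealed trait Kind
  case object Scalar        extends Kind
  case object VecUnitStride extends Kind
  case object VecOther      extends Kind

  val vecMemUnitStrideMaxFlowNum = 2 // assumed value of VecMemUnitStrideMaxFlowNum

  // Conservative per-uop flow estimate, as described in the comment above.
  def conserveFlow(k: Kind): Int = k match {
    case Scalar        => 1
    case VecUnitStride => vecMemUnitStrideMaxFlowNum
    case VecOther      => 16 // worst case; such uops are only legal on port 0
  }

  // A sample rename group: scalar load, unit-stride vector load, strided vector store, scalar store.
  val group = Seq(Scalar, VecUnitStride, VecOther, Scalar)
  // flowTotal(i) is the prefix sum of the estimates up to and including slot i.
  val flowTotal = group.indices.map(i => group.take(i + 1).map(conserveFlow).sum)
  val lqFreeCount = 20 // pretend number of free queue entries

  // Slot i may dispatch only while its prefix sum still fits into the free entries.
  val allowDispatch = flowTotal.map(lqFreeCount > _)
  println(s"flowTotal     = $flowTotal")     // Vector(1, 3, 19, 20)
  println(s"allowDispatch = $allowDispatch") // Vector(true, true, true, false): the last uop must wait
}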
+ + + for (index <- allowDispatch.indices) { + val flowTotal = Wire(UInt(log2Up(VirtualLoadQueueMaxStoreQueueSize + 1).W)) + flowTotal := conserveFlows.take(index + 1).reduce(_ +& _) + if (index == 0) { + when(isStoreVec(index) || isVStoreVec(index)) { + allowDispatch(index) := sqFreeCount > flowTotal + }.elsewhen(isLoadVec(index) || isVLoadVec(index)) { + allowDispatch(index) := lqFreeCount > flowTotal + }.elsewhen(isAMOVec(index)) { + allowDispatch(index) := true.B + }.otherwise { + allowDispatch(index) := true.B + } + } + else { + when(isStoreVec(index) || isVStoreVec(index)) { + allowDispatch(index) := (sqFreeCount > flowTotal) && (isVecUnitType(index) || !isVlsType(index)) && allowDispatch(index - 1) + }.elsewhen(isLoadVec(index) || isVLoadVec(index)) { + allowDispatch(index) := (lqFreeCount > flowTotal) && (isVecUnitType(index) || !isVlsType(index)) && allowDispatch(index - 1) + }.elsewhen(isAMOVec(index)) { + allowDispatch(index) := allowDispatch(index - 1) + }.otherwise { + allowDispatch(index) := allowDispatch(index - 1) + } + } + } + + + // enqLsq io + require(enqLsqIO.req.size == enqLsqIO.resp.size) + for (i <- enqLsqIO.req.indices) { + when(!io.fromRename(i).fire) { + enqLsqIO.needAlloc(i) := 0.U + }.elsewhen(isStoreVec(i) || isVStoreVec(i)) { + enqLsqIO.needAlloc(i) := 2.U // store | vstore + }.elsewhen(isLoadVec(i) || isVLoadVec(i)){ + enqLsqIO.needAlloc(i) := 1.U // load | vload + }.otherwise { + enqLsqIO.needAlloc(i) := 0.U + } + enqLsqIO.req(i).valid := io.fromRename(i).fire && !isAMOVec(i) && !isSegment(i) && !isfofFixVlUop(i) + enqLsqIO.req(i).bits := io.fromRename(i).bits + + // This is to make it easier to calculate in LSQ. + // Both scalar instructions and vector instructions with FLOW equal to 1 have a NUM value of 1.” + // But, the 'numLsElem' that is not a vector is set to 0 when passed to IQ + enqLsqIO.req(i).bits.numLsElem := Mux(isVlsType(i), numLsElem(i), 1.U) + s0_enqLsq_resp(i) := enqLsqIO.resp(i) + } + + + + + + + + + + + + + val isFp = VecInit(fromRename.map(req => FuType.isFArith(req.bits.fuType))) + val isVec = VecInit(fromRename.map(req => FuType.isVArith (req.bits.fuType) || + FuType.isVsetRvfWvf(req.bits.fuType))) + val isMem = VecInit(fromRename.map(req => FuType.isMem(req.bits.fuType) || + FuType.isVls (req.bits.fuType))) + val isLs = VecInit(fromRename.map(req => FuType.isLoadStore(req.bits.fuType))) + val isVls = VecInit(fromRename.map(req => FuType.isVls (req.bits.fuType))) + val isStore = VecInit(fromRename.map(req => FuType.isStore(req.bits.fuType))) + val isVStore = VecInit(fromRename.map(req => FuType.isVStore(req.bits.fuType))) + val isAMO = VecInit(fromRename.map(req => FuType.isAMO(req.bits.fuType))) + val isBlockBackward = VecInit(fromRename.map(x => x.valid && x.bits.blockBackward)) + val isWaitForward = VecInit(fromRename.map(x => x.valid && x.bits.waitForward)) + + // Singlestep should only commit one machine instruction after dret, and then hart enter debugMode according to singlestep exception. 
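Before the state machine below, a small behavioural model may help (editor's sketch, not part of the patch; plain Scala, robIdx reduced to an Int, and the enqRob.req(0).valid qualification folded into firstUopFire). It mimics the wire/register split that follows: the allowed robIdx tracks the incoming instruction while the tracker is in the update state and is held once single-step latches, so only the first instruction dispatched after single-step is enabled escapes the single-step exception.

object SingleStepModelSketch extends App {
  sealed trait State
  case object UpdateRobidx extends State
  case object HoldRobidx   extends State

  var state: State = UpdateRobidx // models the singleStepState register
  var robidxStepReg: Int = 0      // models the latched robIdx of the stepped instruction

  // One call models one cycle; returns the robIdx that is still allowed to commit.
  def cycle(singleStep: Boolean, firstUopFire: Boolean, firstUopRobIdx: Int): Int = {
    val robidxStepNext = if (singleStep && firstUopFire) firstUopRobIdx else 0
    // combinational output: follow the incoming value in the update state, the register in the hold state
    val robidxCanCommitStepping = if (state == UpdateRobidx) robidxStepNext else robidxStepReg
    // register updates, visible from the next cycle on
    if (state == UpdateRobidx) robidxStepReg = robidxStepNext
    if (!singleStep) state = UpdateRobidx
    else if (firstUopFire) state = HoldRobidx
    robidxCanCommitStepping
  }

  // Cycle 0: single-step enabled, uop with robIdx 7 dispatches -> 7 may commit.
  println(cycle(singleStep = true, firstUopFire = true, firstUopRobIdx = 7)) // 7
  // Cycle 1: a later uop (robIdx 8) dispatches -> 7 is still the only allowed robIdx,
  // so the new uop gets updatedUop.singleStep = true and will take the single-step exception.
  println(cycle(singleStep = true, firstUopFire = true, firstUopRobIdx = 8)) // 7
}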
+ val s_holdRobidx :: s_updateRobidx :: Nil = Enum(2) + val singleStepState = RegInit(s_updateRobidx) + + val robidxStepNext = WireInit(0.U.asTypeOf(fromRename(0).bits.robIdx)) + val robidxStepReg = RegInit(0.U.asTypeOf(fromRename(0).bits.robIdx)) + val robidxCanCommitStepping = WireInit(0.U.asTypeOf(fromRename(0).bits.robIdx)) + + when(!io.singleStep) { + singleStepState := s_updateRobidx + }.elsewhen(io.singleStep && fromRename(0).fire && io.enqRob.req(0).valid) { + singleStepState := s_holdRobidx + robidxStepNext := fromRename(0).bits.robIdx + } + + when(singleStepState === s_updateRobidx) { + robidxStepReg := robidxStepNext + robidxCanCommitStepping := robidxStepNext + }.elsewhen(singleStepState === s_holdRobidx) { + robidxStepReg := robidxStepReg + robidxCanCommitStepping := robidxStepReg + } + + val updatedUop = Wire(Vec(RenameWidth, new DynInst)) + val checkpoint_id = RegInit(0.U(64.W)) + checkpoint_id := checkpoint_id + PopCount((0 until RenameWidth).map(i => + fromRename(i).fire + )) + + + for (i <- 0 until RenameWidth) { + + updatedUop(i) := fromRename(i).bits + updatedUop(i).debugInfo.eliminatedMove := fromRename(i).bits.eliminatedMove + // For the LUI instruction: psrc(0) is from register file and should always be zero. + when (fromRename(i).bits.isLUI) { + updatedUop(i).psrc(0) := 0.U + } + //TODO: vec ls mdp + io.lfst.req(i).valid := fromRename(i).fire && updatedUop(i).storeSetHit + io.lfst.req(i).bits.isstore := isStore(i) + io.lfst.req(i).bits.ssid := updatedUop(i).ssid + io.lfst.req(i).bits.robIdx := updatedUop(i).robIdx // speculatively assigned in rename + + // override load delay ctrl signal with store set result + if(StoreSetEnable) { + updatedUop(i).loadWaitBit := io.lfst.resp(i).bits.shouldWait + updatedUop(i).waitForRobIdx := io.lfst.resp(i).bits.robIdx + } else { + updatedUop(i).loadWaitBit := isLs(i) && !isStore(i) && fromRename(i).bits.loadWaitBit + } + // // update singleStep, singleStep exception only enable in next machine instruction. + updatedUop(i).singleStep := io.singleStep && (fromRename(i).bits.robIdx =/= robidxCanCommitStepping) + when (fromRename(i).fire) { + XSDebug(TriggerAction.isDmode(updatedUop(i).trigger) || updatedUop(i).exceptionVec(breakPoint), s"Debug Mode: inst ${i} has frontend trigger exception\n") + XSDebug(updatedUop(i).singleStep, s"Debug Mode: inst ${i} has single step exception\n") + } + if (env.EnableDifftest) { + // debug runahead hint + val debug_runahead_checkpoint_id = Wire(checkpoint_id.cloneType) + if(i == 0){ + debug_runahead_checkpoint_id := checkpoint_id + } else { + debug_runahead_checkpoint_id := checkpoint_id + PopCount((0 until i).map(i => + fromRename(i).fire + )) + } + } + } + + // store set perf count + XSPerfAccumulate("waittable_load_wait", PopCount((0 until RenameWidth).map(i => + fromRename(i).fire && fromRename(i).bits.loadWaitBit && !isStore(i) && isLs(i) + ))) + XSPerfAccumulate("storeset_load_wait", PopCount((0 until RenameWidth).map(i => + fromRename(i).fire && updatedUop(i).loadWaitBit && !isStore(i) && isLs(i) + ))) + XSPerfAccumulate("storeset_load_strict_wait", PopCount((0 until RenameWidth).map(i => + fromRename(i).fire && updatedUop(i).loadWaitBit && updatedUop(i).loadWaitStrict && !isStore(i) && isLs(i) + ))) + XSPerfAccumulate("storeset_store_wait", PopCount((0 until RenameWidth).map(i => + fromRename(i).fire && updatedUop(i).loadWaitBit && isStore(i) + ))) + + val allResourceReady = io.enqRob.canAccept + + // Instructions should enter dispatch queues in order. 
+ // blockedByWaitForward: this instruction is blocked by itself (based on waitForward) + // nextCanOut: next instructions can out (based on blockBackward) + // notBlockedByPrevious: previous instructions can enqueue + val hasException = VecInit(fromRename.zip(updatedUop).map { + case (fromRename: DecoupledIO[DynInst], uop: DynInst) => + fromRename.bits.hasException || uop.singleStep + }) + + private val blockedByWaitForward = Wire(Vec(RenameWidth, Bool())) + blockedByWaitForward(0) := !io.enqRob.isEmpty && isWaitForward(0) + for (i <- 1 until RenameWidth) { + blockedByWaitForward(i) := blockedByWaitForward(i - 1) || (!io.enqRob.isEmpty || Cat(fromRename.take(i).map(_.valid)).orR) && isWaitForward(i) + } + if(backendParams.debugEn){ + dontTouch(blockedByWaitForward) + } + + // Only the uop with block backward flag will block the next uop + val nextCanOut = VecInit((0 until RenameWidth).map(i => + !isBlockBackward(i) + )) + val notBlockedByPrevious = VecInit((0 until RenameWidth).map(i => + if (i == 0) true.B + else Cat((0 until i).map(j => nextCanOut(j))).andR + )) + + // for noSpecExec: (robEmpty || !this.noSpecExec) && !previous.noSpecExec + // For blockBackward: + // this instruction can actually dequeue: 3 conditions + // (1) resources are ready + // (2) previous instructions are ready + thisCanActualOut := VecInit((0 until RenameWidth).map(i => !blockedByWaitForward(i) && notBlockedByPrevious(i) && io.enqRob.canAccept)) + val thisActualOut = (0 until RenameWidth).map(i => io.enqRob.req(i).valid && io.enqRob.canAccept) + val hasValidException = fromRename.zip(hasException).map(x => x._1.valid && x._2) + + // input for ROB, LSQ + for (i <- 0 until RenameWidth) { + // needAlloc no use, need deleted + io.enqRob.needAlloc(i) := fromRename(i).valid + io.enqRob.req(i).valid := fromRename(i).fire + io.enqRob.req(i).bits := updatedUop(i) + io.enqRob.req(i).bits.hasException := updatedUop(i).hasException || updatedUop(i).singleStep + io.enqRob.req(i).bits.numWB := Mux(updatedUop(i).singleStep, 0.U, updatedUop(i).numWB) + } + + val hasValidInstr = VecInit(fromRename.map(_.valid)).asUInt.orR + val hasSpecialInstr = Cat((0 until RenameWidth).map(i => isBlockBackward(i))).orR + + private val canAccept = !hasValidInstr || !hasSpecialInstr && io.enqRob.canAccept + + val isWaitForwardOrBlockBackward = isWaitForward.asUInt.orR || isBlockBackward.asUInt.orR + val renameFireCnt = PopCount(fromRename.map(_.fire)) + + val stall_rob = hasValidInstr && !io.enqRob.canAccept + val stall_int_dq = hasValidInstr && io.enqRob.canAccept + val stall_int_dq0 = hasValidInstr && io.enqRob.canAccept + val stall_int_dq1 = hasValidInstr && io.enqRob.canAccept + val stall_fp_dq = hasValidInstr && io.enqRob.canAccept + val stall_ls_dq = hasValidInstr && io.enqRob.canAccept + + XSPerfAccumulate("in_valid_count", PopCount(fromRename.map(_.valid))) + XSPerfAccumulate("in_fire_count", PopCount(fromRename.map(_.fire))) + XSPerfAccumulate("in_valid_not_ready_count", PopCount(fromRename.map(x => x.valid && !x.ready))) + XSPerfAccumulate("wait_cycle", !fromRename.head.valid && allResourceReady) + + XSPerfAccumulate("stall_cycle_rob", stall_rob) + XSPerfAccumulate("stall_cycle_int_dq0", stall_int_dq0) + XSPerfAccumulate("stall_cycle_int_dq1", stall_int_dq1) + XSPerfAccumulate("stall_cycle_fp_dq", stall_fp_dq) + XSPerfAccumulate("stall_cycle_ls_dq", stall_ls_dq) + + val notIssue = !io.debugTopDown.fromRob.robHeadLsIssue + val tlbReplay = io.debugTopDown.fromCore.fromMem.robHeadTlbReplay + val tlbMiss = 
io.debugTopDown.fromCore.fromMem.robHeadTlbMiss + val vioReplay = io.debugTopDown.fromCore.fromMem.robHeadLoadVio + val mshrReplay = io.debugTopDown.fromCore.fromMem.robHeadLoadMSHR + val l1Miss = io.debugTopDown.fromCore.fromMem.robHeadMissInDCache + val l2Miss = io.debugTopDown.fromCore.l2MissMatch + val l3Miss = io.debugTopDown.fromCore.l3MissMatch + + val ldReason = Mux(l3Miss, TopDownCounters.LoadMemStall.id.U, + Mux(l2Miss, TopDownCounters.LoadL3Stall.id.U, + Mux(l1Miss, TopDownCounters.LoadL2Stall.id.U, + Mux(notIssue, TopDownCounters.MemNotReadyStall.id.U, + Mux(tlbMiss, TopDownCounters.LoadTLBStall.id.U, + Mux(tlbReplay, TopDownCounters.LoadTLBStall.id.U, + Mux(mshrReplay, TopDownCounters.LoadMSHRReplayStall.id.U, + Mux(vioReplay, TopDownCounters.LoadVioReplayStall.id.U, + TopDownCounters.LoadL1Stall.id.U)))))))) + + val decodeReason = RegNextN(io.stallReason.reason, 2) + val renameReason = RegNext(io.stallReason.reason) + + val stallReason = Wire(chiselTypeOf(io.stallReason.reason)) + val firedVec = fromRename.map(_.fire) + io.stallReason.backReason.valid := !canAccept + io.stallReason.backReason.bits := TopDownCounters.OtherCoreStall.id.U + stallReason.zip(io.stallReason.reason).zip(firedVec).zipWithIndex.map { case (((update, in), fire), idx) => + val headIsInt = FuType.isInt(io.robHead.getDebugFuType) && io.robHeadNotReady + val headIsFp = FuType.isFArith(io.robHead.getDebugFuType) && io.robHeadNotReady + val headIsDiv = FuType.isDivSqrt(io.robHead.getDebugFuType) && io.robHeadNotReady + val headIsLd = io.robHead.getDebugFuType === FuType.ldu.U && io.robHeadNotReady || !io.lqCanAccept + val headIsSt = io.robHead.getDebugFuType === FuType.stu.U && io.robHeadNotReady || !io.sqCanAccept + val headIsAmo = io.robHead.getDebugFuType === FuType.mou.U && io.robHeadNotReady + val headIsLs = headIsLd || headIsSt + val robLsFull = io.robFull || !io.lqCanAccept || !io.sqCanAccept + + import TopDownCounters._ + update := MuxCase(OtherCoreStall.id.U, Seq( + // fire + (fire ) -> NoStall.id.U , + // dispatch not stall / core stall from decode or rename + (in =/= OtherCoreStall.id.U && in =/= NoStall.id.U ) -> in , + // rob stall + (headIsAmo ) -> AtomicStall.id.U , + (headIsSt ) -> StoreStall.id.U , + (headIsLd ) -> ldReason , + (headIsDiv ) -> DivStall.id.U , + (headIsInt ) -> IntNotReadyStall.id.U , + (headIsFp ) -> FPNotReadyStall.id.U , + (renameReason(idx) =/= NoStall.id.U ) -> renameReason(idx) , + (decodeReason(idx) =/= NoStall.id.U ) -> decodeReason(idx) , + )) + } + + TopDownCounters.values.foreach(ctr => XSPerfAccumulate(ctr.toString(), PopCount(stallReason.map(_ === ctr.id.U)))) + + val robTrueCommit = io.debugTopDown.fromRob.robTrueCommit + TopDownCounters.values.foreach(ctr => XSPerfRolling("td_"+ctr.toString(), PopCount(stallReason.map(_ === ctr.id.U)), + robTrueCommit, 1000, clock, reset)) + + XSPerfHistogram("slots_fire", PopCount(thisActualOut), true.B, 0, RenameWidth+1, 1) + // Explaination: when out(0) not fire, PopCount(valid) is not meaningfull + XSPerfHistogram("slots_valid_pure", PopCount(io.enqRob.req.map(_.valid)), thisActualOut(0), 0, RenameWidth+1, 1) + XSPerfHistogram("slots_valid_rough", PopCount(io.enqRob.req.map(_.valid)), true.B, 0, RenameWidth+1, 1) + + val perfEvents = Seq( + ("dispatch_in", PopCount(fromRename.map(_.valid && fromRename(0).ready)) ), + ("dispatch_empty", !hasValidInstr ), + ("dispatch_utili", PopCount(fromRename.map(_.valid)) ), + ("dispatch_waitinstr", PopCount(fromRename.map(!_.valid && canAccept)) ), + ("dispatch_stall_cycle_lsq", false.B 
), + ("dispatch_stall_cycle_rob", stall_rob ), + ("dispatch_stall_cycle_int_dq", stall_int_dq ), + ("dispatch_stall_cycle_fp_dq", stall_fp_dq ), + ("dispatch_stall_cycle_ls_dq", stall_ls_dq ) + ) + generatePerfEvent() +} diff --git a/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala b/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala index 2adfa8458b6..bfeb1095442 100644 --- a/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala +++ b/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala @@ -258,6 +258,8 @@ case class ExeUnitParams( def hasBrhFu = fuConfigs.map(_.fuType == FuType.brh).reduce(_ || _) + def hasAluFu = fuConfigs.map(_.fuType == FuType.alu).reduce(_ || _) + def hasi2vFu = fuConfigs.map(_.fuType == FuType.i2v).reduce(_ || _) def hasJmpFu = fuConfigs.map(_.fuType == FuType.jmp).reduce(_ || _) diff --git a/src/main/scala/xiangshan/backend/issue/IssueBlockParams.scala b/src/main/scala/xiangshan/backend/issue/IssueBlockParams.scala index b615b56676d..ff87e9ab70f 100644 --- a/src/main/scala/xiangshan/backend/issue/IssueBlockParams.scala +++ b/src/main/scala/xiangshan/backend/issue/IssueBlockParams.scala @@ -272,7 +272,13 @@ case class IssueBlockParams( def needWakeupFromIntWBPort = backendParam.allExuParams.filter(x => !wakeUpInExuSources.map(_.name).contains(x.name) && this.readIntRf).groupBy(x => x.getIntWBPort.getOrElse(IntWB(port = -1)).port).filter(_._1 != -1) - def needWakeupFromFpWBPort = backendParam.allExuParams.filter(x => !wakeUpInExuSources.map(_.name).contains(x.name) && this.readFpRf).groupBy(x => x.getFpWBPort.getOrElse(FpWB(port = -1)).port).filter(_._1 != -1) + def needWakeupFromFpWBPort = if (this.exuBlockParams.map(_.hasStdFu).reduce(_ || _)) { + // here add fp load WB wakeup to std + backendParam.allExuParams.filter(x => (!wakeUpInExuSources.map(_.name).contains(x.name) || x.hasLoadExu) && this.readFpRf).groupBy(x => x.getFpWBPort.getOrElse(FpWB(port = -1)).port).filter(_._1 != -1) + } + else { + backendParam.allExuParams.filter(x => !wakeUpInExuSources.map(_.name).contains(x.name) && this.readFpRf).groupBy(x => x.getFpWBPort.getOrElse(FpWB(port = -1)).port).filter(_._1 != -1) + } def needWakeupFromVfWBPort = backendParam.allExuParams.filter(x => !wakeUpInExuSources.map(_.name).contains(x.name) && this.readVecRf).groupBy(x => x.getVfWBPort.getOrElse(VfWB(port = -1)).port).filter(_._1 != -1) diff --git a/src/main/scala/xiangshan/backend/issue/Scheduler.scala b/src/main/scala/xiangshan/backend/issue/Scheduler.scala index 14d49f04ac9..ec113ef9248 100644 --- a/src/main/scala/xiangshan/backend/issue/Scheduler.scala +++ b/src/main/scala/xiangshan/backend/issue/Scheduler.scala @@ -12,7 +12,6 @@ import xiangshan.backend.datapath.DataConfig._ import xiangshan.backend.datapath.WbConfig._ import xiangshan.backend.fu.FuType import xiangshan.backend.regfile.RfWritePortWithConfig -import xiangshan.backend.rename.{BusyTable, VlBusyTable} import xiangshan.mem.{LsqEnqCtrl, LsqEnqIO, MemWaitUpdateReq, SqPtr, LqPtr} import xiangshan.backend.datapath.WbConfig.V0WB import xiangshan.backend.regfile.VlPregParams @@ -35,7 +34,6 @@ class Scheduler(val params: SchdBlockParams)(implicit p: Parameters) extends Laz val numV0StateWrite = backendParams.numPregWb(V0Data()) val numVlStateWrite = backendParams.numPregWb(VlData()) - val dispatch2Iq = LazyModule(new Dispatch2Iq(params)) val issueQueue = params.issueBlockParams.map(x => LazyModule(new IssueQueue(x).suggestName(x.getIQName))) lazy val module: SchedulerImpBase = params.schdType match { @@ -50,7 +48,10 @@ class 
Scheduler(val params: SchdBlockParams)(implicit p: Parameters) extends Laz class SchedulerIO()(implicit params: SchdBlockParams, p: Parameters) extends XSBundle { // params alias private val LoadQueueSize = VirtualLoadQueueSize - + val fromDispatchUopNum = params.issueBlockParams.filter(x => x.StdCnt == 0).map(_.numEnq).sum + val allIssueParams = params.issueBlockParams.filter(_.StdCnt == 0) + val IssueQueueDeqSum = allIssueParams.map(_.numDeq).sum + val maxIQSize = allIssueParams.map(_.numEntries).max val fromTop = new Bundle { val hartId = Input(UInt(8.W)) } @@ -58,15 +59,13 @@ class SchedulerIO()(implicit params: SchdBlockParams, p: Parameters) extends XSB val fuBusyTableRead = MixedVec(params.issueBlockParams.map(x => Input(x.genWbFuBusyTableReadBundle))) } val wbFuBusyTable = MixedVec(params.issueBlockParams.map(x => Output(x.genWbFuBusyTableWriteBundle))) - val intIQValidNumVec = Output(MixedVec(backendParams.genIntIQValidNumBundle)) - val fpIQValidNumVec = Output(MixedVec(backendParams.genFpIQValidNumBundle)) + val IQValidNumVec = Output(Vec(IssueQueueDeqSum, UInt((maxIQSize).U.getWidth.W))) val fromCtrlBlock = new Bundle { val flush = Flipped(ValidIO(new Redirect)) } val fromDispatch = new Bundle { - val allocPregs = Vec(RenameWidth, Input(new ResetPregStateReq)) - val uops = Vec(params.numUopIn, Flipped(DecoupledIO(new DynInst))) + val uops = Vec(fromDispatchUopNum, Flipped(DecoupledIO(new DynInst))) } val intWriteBack = MixedVec(Vec(backendParams.numPregWb(IntData()), new RfWritePortWithConfig(backendParams.intPregParams.dataCfg, backendParams.intPregParams.addrWidth))) @@ -113,9 +112,6 @@ class SchedulerIO()(implicit params: SchdBlockParams, p: Parameters) extends XSB val ldCancel = Vec(backendParams.LduCnt + backendParams.HyuCnt, Flipped(new LoadCancelIO)) - val memIO = if (params.isMemSchd) Some(new Bundle { - val lsqEnqIO = Flipped(new LsqEnqIO) - }) else None val fromMem = if (params.isMemSchd) Some(new Bundle { val ldaFeedback = Flipped(Vec(params.LduCnt, new MemRSFeedbackIO)) val staFeedback = Flipped(Vec(params.StaCnt, new MemRSFeedbackIO)) @@ -151,156 +147,9 @@ abstract class SchedulerImpBase(wrapper: Scheduler)(implicit params: SchdBlockPa private val schdType = params.schdType // Modules - val dispatch2Iq: Dispatch2IqImp = wrapper.dispatch2Iq.module val issueQueues: Seq[IssueQueueImp] = wrapper.issueQueue.map(_.module) - io.intIQValidNumVec := 0.U.asTypeOf(io.intIQValidNumVec) - io.fpIQValidNumVec := 0.U.asTypeOf(io.fpIQValidNumVec) - if (params.isIntSchd) { - dispatch2Iq.io.intIQValidNumVec.get := io.intIQValidNumVec - io.intIQValidNumVec := MixedVecInit(issueQueues.map(_.io.validCntDeqVec)) - } - else if (params.isFpSchd) { - dispatch2Iq.io.fpIQValidNumVec.get := io.fpIQValidNumVec - io.fpIQValidNumVec := MixedVecInit(issueQueues.map(_.io.validCntDeqVec)) - } - - // valid count - dispatch2Iq.io.iqValidCnt := issueQueues.filter(_.params.StdCnt == 0).map(_.io.status.validCnt) - - // BusyTable Modules - val intBusyTable = schdType match { - case IntScheduler() | MemScheduler() => Some(Module(new BusyTable(dispatch2Iq.numIntStateRead, wrapper.numIntStateWrite, IntPhyRegs, IntWB()))) - case _ => None - } - val fpBusyTable = schdType match { - case FpScheduler() | MemScheduler() => Some(Module(new BusyTable(dispatch2Iq.numFpStateRead, wrapper.numFpStateWrite, FpPhyRegs, FpWB()))) - case _ => None - } - val vfBusyTable = schdType match { - case VfScheduler() | MemScheduler() => Some(Module(new BusyTable(dispatch2Iq.numVfStateRead, wrapper.numVfStateWrite, VfPhyRegs, 
VfWB()))) - case _ => None - } - val v0BusyTable = schdType match { - case VfScheduler() | MemScheduler() => Some(Module(new BusyTable(dispatch2Iq.numV0StateRead, wrapper.numV0StateWrite, V0PhyRegs, V0WB()))) - case _ => None - } - val vlBusyTable = schdType match { - case VfScheduler() | MemScheduler() => Some(Module(new VlBusyTable(dispatch2Iq.numVlStateRead, wrapper.numVlStateWrite, VlPhyRegs, VlWB()))) - case _ => None - } - - // RegCacheTagTable Module - val rcTagTable = schdType match { - case IntScheduler() | MemScheduler() => Some(Module(new RegCacheTagTable(dispatch2Iq.numRCTagTableStateRead))) - case _ => None - } - - dispatch2Iq.io match { case dp2iq => - dp2iq.redirect <> io.fromCtrlBlock.flush - dp2iq.in <> io.fromDispatch.uops - dp2iq.readIntState.foreach(_ <> intBusyTable.get.io.read) - dp2iq.readFpState.foreach(_ <> fpBusyTable.get.io.read) - dp2iq.readVfState.foreach(_ <> vfBusyTable.get.io.read) - dp2iq.readV0State.foreach(_ <> v0BusyTable.get.io.read) - dp2iq.readVlState.foreach(_ <> vlBusyTable.get.io.read) - dp2iq.readVlInfo.foreach(_ <> vlBusyTable.get.io_vl_read.vlReadInfo) - dp2iq.readRCTagTableState.foreach(_ <> rcTagTable.get.io.readPorts) - } - - intBusyTable match { - case Some(bt) => - bt.io.allocPregs.zip(io.fromDispatch.allocPregs).foreach { case (btAllocPregs, dpAllocPregs) => - btAllocPregs.valid := dpAllocPregs.isInt - btAllocPregs.bits := dpAllocPregs.preg - } - bt.io.wbPregs.zipWithIndex.foreach { case (wb, i) => - wb.valid := io.intWriteBack(i).wen && io.intWriteBack(i).intWen - wb.bits := io.intWriteBack(i).addr - } - bt.io.wakeUp := io.fromSchedulers.wakeupVec - bt.io.og0Cancel := io.fromDataPath.og0Cancel - bt.io.ldCancel := io.ldCancel - case None => - } - - fpBusyTable match { - case Some(bt) => - bt.io.allocPregs.zip(io.fromDispatch.allocPregs).foreach { case (btAllocPregs, dpAllocPregs) => - btAllocPregs.valid := dpAllocPregs.isFp - btAllocPregs.bits := dpAllocPregs.preg - } - bt.io.wbPregs.zipWithIndex.foreach { case (wb, i) => - wb.valid := io.fpWriteBack(i).wen && io.fpWriteBack(i).fpWen - wb.bits := io.fpWriteBack(i).addr - } - bt.io.wakeUp := io.fromSchedulers.wakeupVec - bt.io.og0Cancel := io.fromDataPath.og0Cancel - bt.io.ldCancel := io.ldCancel - case None => - } - - vfBusyTable match { - case Some(bt) => - bt.io.allocPregs.zip(io.fromDispatch.allocPregs).foreach { case (btAllocPregs, dpAllocPregs) => - btAllocPregs.valid := dpAllocPregs.isVec - btAllocPregs.bits := dpAllocPregs.preg - } - bt.io.wbPregs.zipWithIndex.foreach { case (wb, i) => - wb.valid := io.vfWriteBack(i).wen && io.vfWriteBack(i).vecWen - wb.bits := io.vfWriteBack(i).addr - } - bt.io.wakeUp := io.fromSchedulers.wakeupVec - bt.io.og0Cancel := io.fromDataPath.og0Cancel - bt.io.ldCancel := io.ldCancel - case None => - } - - v0BusyTable match { - case Some(bt) => - bt.io.allocPregs.zip(io.fromDispatch.allocPregs).foreach { case (btAllocPregs, dpAllocPregs) => - btAllocPregs.valid := dpAllocPregs.isV0 - btAllocPregs.bits := dpAllocPregs.preg - } - bt.io.wbPregs.zipWithIndex.foreach { case (wb, i) => - wb.valid := io.v0WriteBack(i).wen && io.v0WriteBack(i).v0Wen - wb.bits := io.v0WriteBack(i).addr - } - bt.io.wakeUp := io.fromSchedulers.wakeupVec - bt.io.og0Cancel := io.fromDataPath.og0Cancel - bt.io.ldCancel := io.ldCancel - case None => - } - - vlBusyTable match { - case Some(bt) => - bt.io.allocPregs.zip(io.fromDispatch.allocPregs).foreach { case (btAllocPregs, dpAllocPregs) => - btAllocPregs.valid := dpAllocPregs.isVl - btAllocPregs.bits := dpAllocPregs.preg - } - 
bt.io.wbPregs.zipWithIndex.foreach { case (wb, i) => - wb.valid := io.vlWriteBack(i).wen && io.vlWriteBack(i).vlWen - wb.bits := io.vlWriteBack(i).addr - } - bt.io.wakeUp := io.fromSchedulers.wakeupVec - bt.io.og0Cancel := io.fromDataPath.og0Cancel - bt.io.ldCancel := io.ldCancel - - bt.io_vl_Wb.vlWriteBackInfo := io.vlWriteBackInfo - case None => - } - - rcTagTable match { - case Some(rct) => - rct.io.allocPregs.zip(io.fromDispatch.allocPregs).foreach { case (btAllocPregs, dpAllocPregs) => - btAllocPregs.valid := dpAllocPregs.isInt - btAllocPregs.bits := dpAllocPregs.preg - } - rct.io.wakeupFromIQ := io.fromSchedulers.wakeupVec - rct.io.og0Cancel := io.fromDataPath.og0Cancel - rct.io.ldCancel := io.ldCancel - case None => - } + io.IQValidNumVec := issueQueues.filter(_.params.StdCnt == 0).map(_.io.validCntDeqVec).flatten val wakeupFromIntWBVec = Wire(params.genIntWBWakeUpSinkValidBundle) val wakeupFromFpWBVec = Wire(params.genFpWBWakeUpSinkValidBundle) val wakeupFromVfWBVec = Wire(params.genVfWBWakeUpSinkValidBundle) @@ -465,13 +314,11 @@ abstract class SchedulerImpBase(wrapper: Scheduler)(implicit params: SchdBlockPa } // perfEvent - val lastCycleDp2IqOutFireVec = RegNext(VecInit(dispatch2Iq.io.out.flatten.map(_.fire))) val lastCycleIqEnqFireVec = RegNext(VecInit(issueQueues.map(_.io.enq.map(_.fire)).flatten)) val lastCycleIqFullVec = RegNext(VecInit(issueQueues.map(_.io.enq.head.ready))) val issueQueueFullVecPerf = issueQueues.zip(lastCycleIqFullVec)map{ case (iq, full) => (iq.params.getIQName + s"_full", full) } val basePerfEvents = Seq( - ("dispatch2Iq_out_fire_cnt", PopCount(lastCycleDp2IqOutFireVec) ), ("issueQueue_enq_fire_cnt", PopCount(lastCycleIqEnqFireVec) ) ) ++ issueQueueFullVecPerf @@ -487,14 +334,10 @@ class SchedulerArithImp(override val wrapper: Scheduler)(implicit params: SchdBl with HasXSParameter with HasPerfEvents { -// dontTouch(io.vfWbFuBusyTable) - println(s"[SchedulerArithImp] " + - s"has intBusyTable: ${intBusyTable.nonEmpty}, " + - s"has vfBusyTable: ${vfBusyTable.nonEmpty}") - + val issueQueuesUopIn = issueQueues.map(_.io.enq).flatten + issueQueuesUopIn.zip(io.fromDispatch.uops).map(x => x._1 <> x._2) issueQueues.zipWithIndex.foreach { case (iq, i) => iq.io.flush <> io.fromCtrlBlock.flush - iq.io.enq <> dispatch2Iq.io.out(i) if (!iq.params.needLoadDependency) { iq.io.enq.map(x => x.bits.srcLoadDependency := 0.U.asTypeOf(x.bits.srcLoadDependency)) } @@ -519,10 +362,16 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc with HasXSParameter with HasPerfEvents { - println(s"[SchedulerMemImp] " + - s"has intBusyTable: ${intBusyTable.nonEmpty}, " + - s"has vfBusyTable: ${vfBusyTable.nonEmpty}") + val issueQueuesUopIn = issueQueues.filter(_.params.StdCnt == 0).map(_.io.enq).flatten + issueQueuesUopIn.zip(io.fromDispatch.uops).map(x => x._1 <> x._2) + val noStdExuParams = params.issueBlockParams.map(x => Seq.fill(x.numEnq)(x.exuBlockParams)).flatten.filter{x => x.map(!_.hasStdFu).reduce(_ && _)} + val staIdx = noStdExuParams.zipWithIndex.filter{x => x._1.map(_.hasStoreAddrFu).reduce(_ || _)}.map(_._2) + val staReady = issueQueues.filter(iq => iq.params.StaCnt > 0).map(_.io.enq.map(_.ready)).flatten + val stdReady = issueQueues.filter(iq => iq.params.StdCnt > 0).map(_.io.enq.map(_.ready)).flatten + staIdx.zipWithIndex.map{ case (sta, i) => { + io.fromDispatch.uops(sta).ready := staReady(i) && stdReady(i) + }} val memAddrIQs = issueQueues.filter(_.params.isMemAddrIQ) val stAddrIQs = issueQueues.filter(iq => iq.params.StaCnt > 0) // included 
in memAddrIQs val ldAddrIQs = issueQueues.filter(iq => iq.params.LduCnt > 0) @@ -545,7 +394,6 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc memAddrIQs.zipWithIndex.foreach { case (iq, i) => iq.io.flush <> io.fromCtrlBlock.flush - iq.io.enq <> dispatch2Iq.io.out(i) if (!iq.params.needLoadDependency) { iq.io.enq.map(x => x.bits.srcLoadDependency := 0.U.asTypeOf(x.bits.srcLoadDependency)) } @@ -604,23 +452,6 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc require(hyaEnqs.size == hydEnqs.size, s"number of enq ports of hybrid address IQs(${hyaEnqs.size}) " + s"should be equal to number of enq ports of hybrid data IQs(${hydEnqs.size})") - val d2IqStaOut = dispatch2Iq.io.out.zipWithIndex.filter(staIdxSeq contains _._2).unzip._1.flatten - d2IqStaOut.zip(staEnqs).zip(stdEnqs).foreach{ case((dp, staIQ), stdIQ) => - val isAllReady = staIQ.ready && stdIQ.ready - dp.ready := isAllReady - val isDropAmocasSta = dp.bits.isAMOCAS && dp.bits.uopIdx(0) === 1.U - staIQ.valid := dp.valid && isAllReady && !isDropAmocasSta - stdIQ.valid := dp.valid && isAllReady && FuType.FuTypeOrR(dp.bits.fuType, FuType.stu, FuType.mou) - } - - val d2IqHyaOut = dispatch2Iq.io.out.zipWithIndex.filter(hyaIdxSeq contains _._2).unzip._1.flatten - d2IqHyaOut.zip(hyaEnqs).zip(hydEnqs).foreach{ case((dp, hyaIQ), hydIQ) => - val isAllReady = hyaIQ.ready && hydIQ.ready - dp.ready := isAllReady - hyaIQ.valid := dp.valid && isAllReady - hydIQ.valid := dp.valid && isAllReady && FuType.FuTypeOrR(dp.bits.fuType, FuType.stu, FuType.mou) - } - stDataIQs.zipWithIndex.foreach { case (iq, i) => iq.io.flush <> io.fromCtrlBlock.flush iq.io.wakeupFromWB.zip( @@ -630,9 +461,12 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc wakeupFromV0WBVec.zipWithIndex.filter(x => iq.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ wakeupFromVlWBVec.zipWithIndex.filter(x => iq.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ).foreach{ case (sink, source) => sink := source} + // here disable fp load fast wakeup to std, and no FEX wakeup to std + iq.io.wakeupFromIQ.map(_.bits.fpWen := false.B) } (stdEnqs ++ hydEnqs).zip(staEnqs ++ hyaEnqs).zipWithIndex.foreach { case ((stdIQEnq, staIQEnq), i) => + stdIQEnq.valid := staIQEnq.valid && FuType.FuTypeOrR(staIQEnq.bits.fuType, FuType.stu, FuType.mou) stdIQEnq.bits := staIQEnq.bits // Store data reuses store addr src(1) in dispatch2iq // [dispatch2iq] --src*------src*(0)--> [staIQ|hyaIQ] @@ -679,26 +513,6 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc sink := source } - val lsqEnqCtrl = Module(new LsqEnqCtrl) - - lsqEnqCtrl.io.redirect <> io.fromCtrlBlock.flush - lsqEnqCtrl.io.enq <> dispatch2Iq.io.enqLsqIO.get - lsqEnqCtrl.io.lcommit := io.fromMem.get.lcommit - lsqEnqCtrl.io.scommit := io.fromMem.get.scommit - lsqEnqCtrl.io.lqCancelCnt := io.fromMem.get.lqCancelCnt - lsqEnqCtrl.io.sqCancelCnt := io.fromMem.get.sqCancelCnt - dispatch2Iq.io.lqFreeCount.get := lsqEnqCtrl.io.lqFreeCount - dispatch2Iq.io.sqFreeCount.get := lsqEnqCtrl.io.sqFreeCount - io.memIO.get.lsqEnqIO <> lsqEnqCtrl.io.enqLsq - - dontTouch(io.vecLoadIssueResp) - - val intBusyTablePerf = intBusyTable.get - val fpBusyTablePerf = fpBusyTable.get - val vecBusyTablePerf = vfBusyTable.get - val v0BusyTablePerf = v0BusyTable.get - val vlBusyTablePerf = vlBusyTable.get - - val perfEvents = basePerfEvents ++ Seq(intBusyTablePerf, fpBusyTablePerf, vecBusyTablePerf, 
v0BusyTablePerf, vlBusyTablePerf).flatten(_.getPerfEvents) + val perfEvents = basePerfEvents generatePerfEvent() } diff --git a/src/main/scala/xiangshan/backend/regcache/RegCacheTagTable.scala b/src/main/scala/xiangshan/backend/regcache/RegCacheTagTable.scala index 6a7e7a4deea..709a57111e5 100644 --- a/src/main/scala/xiangshan/backend/regcache/RegCacheTagTable.scala +++ b/src/main/scala/xiangshan/backend/regcache/RegCacheTagTable.scala @@ -27,14 +27,14 @@ import xiangshan.backend.BackendParams import xiangshan.backend.issue.SchdBlockParams import freechips.rocketchip.util.SeqToAugmentedSeq -class RegCacheTagTable(numReadPorts: Int)(implicit p: Parameters, schdParams: SchdBlockParams) extends XSModule { +class RegCacheTagTable(numReadPorts: Int)(implicit p: Parameters) extends XSModule { val io = IO(new RegCacheTagTableIO(numReadPorts)) println(s"[RegCacheTagTable] readPorts: ${numReadPorts}, " + s"writePorts: ${backendParams.getIntExuRCWriteSize} + ${backendParams.getMemExuRCWriteSize}") - println(s"[RegCacheTagTable] addrWidth: ${RegCacheIdxWidth}, tagWidth: ${schdParams.pregIdxWidth}") + println(s"[RegCacheTagTable] addrWidth: ${RegCacheIdxWidth}, tagWidth: ${IntPhyRegIdxWidth}") private val IntRegCacheReadSize = numReadPorts private val IntRegCacheWriteSize = backendParams.getIntExuRCWriteSize @@ -42,10 +42,10 @@ class RegCacheTagTable(numReadPorts: Int)(implicit p: Parameters, schdParams: Sc private val MemRegCacheWriteSize = backendParams.getMemExuRCWriteSize val IntRCTagTable = Module(new RegCacheTagModule("IntRCTagTable", IntRegCacheSize, IntRegCacheReadSize, IntRegCacheWriteSize, - RegCacheIdxWidth - 1, schdParams.pregIdxWidth)) + RegCacheIdxWidth - 1, IntPhyRegIdxWidth)) val MemRCTagTable = Module(new RegCacheTagModule("MemRCTagTable", MemRegCacheSize, MemRegCacheReadSize, MemRegCacheWriteSize, - RegCacheIdxWidth - 1, schdParams.pregIdxWidth)) + RegCacheIdxWidth - 1, IntPhyRegIdxWidth)) // read io.readPorts @@ -55,7 +55,8 @@ class RegCacheTagTable(numReadPorts: Int)(implicit p: Parameters, schdParams: Sc r_mem.ren := r_in.ren r_int.tag := r_in.tag r_mem.tag := r_in.tag - r_in.valid := r_int.valid || r_mem.valid + val matchAlloc = io.allocPregs.map(x => x.valid && r_in.tag === x.bits).reduce(_ || _) + r_in.valid := (r_int.valid || r_mem.valid) && !matchAlloc r_in.addr := Mux(r_int.valid, Cat("b0".U, r_int.addr), Cat("b1".U, r_mem.addr)) } @@ -65,7 +66,7 @@ class RegCacheTagTable(numReadPorts: Int)(implicit p: Parameters, schdParams: Sc require(wakeupFromIQNeedWriteRC.size == IntRegCacheWriteSize + MemRegCacheWriteSize, "wakeup size should be equal to RC write size") - shiftLoadDependency.zip(wakeupFromIQNeedWriteRC.map(_.bits.loadDependency)).zip(schdParams.wakeUpInExuSources.map(_.name)).foreach { + shiftLoadDependency.zip(wakeupFromIQNeedWriteRC.map(_.bits.loadDependency)).zip(backendParams.intSchdParams.get.wakeUpInExuSources.map(_.name)).foreach { case ((deps, originalDeps), name) => deps.zip(originalDeps).zipWithIndex.foreach { case ((dep, originalDep), deqPortIdx) => if (backendParams.getLdExuIdx(backendParams.allExuParams.find(_.name == name).get) == deqPortIdx) @@ -108,14 +109,14 @@ class RegCacheTagTable(numReadPorts: Int)(implicit p: Parameters, schdParams: Sc } } -class RegCacheTagTableIO(numReadPorts: Int)(implicit p: Parameters, schdParams: SchdBlockParams) extends XSBundle { +class RegCacheTagTableIO(numReadPorts: Int)(implicit p: Parameters) extends XSBundle { - val readPorts = Vec(numReadPorts, new RCTagTableReadPort(RegCacheIdxWidth, schdParams.pregIdxWidth)) + val 
readPorts = Vec(numReadPorts, new RCTagTableReadPort(RegCacheIdxWidth, IntPhyRegIdxWidth)) - val wakeupFromIQ: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(schdParams.genIQWakeUpInValidBundle) + val wakeupFromIQ: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(backendParams.intSchdParams.get.genIQWakeUpInValidBundle) // set preg state to invalid - val allocPregs = Vec(RenameWidth, Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) + val allocPregs = Vec(RenameWidth, Flipped(ValidIO(UInt(IntPhyRegIdxWidth.W)))) // cancelFromDatapath val og0Cancel = Input(ExuVec()) diff --git a/src/main/scala/xiangshan/backend/rename/BusyTable.scala b/src/main/scala/xiangshan/backend/rename/BusyTable.scala index 85c6674a89a..0db6a488ba0 100644 --- a/src/main/scala/xiangshan/backend/rename/BusyTable.scala +++ b/src/main/scala/xiangshan/backend/rename/BusyTable.scala @@ -33,19 +33,23 @@ class BusyTableReadIO(implicit p: Parameters) extends XSBundle { val loadDependency = Vec(LoadPipelineWidth, Output(UInt(LoadDependencyWidth.W))) } + class VlBusyTableReadIO(implicit p: Parameters) extends XSBundle { val is_zero = Output(Bool()) val is_vlmax = Output(Bool()) } -class BusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: PregWB)(implicit p: Parameters, params: SchdBlockParams) extends XSModule with HasPerfEvents { +class BusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: PregWB)(implicit p: Parameters) extends XSModule with HasPerfEvents { val io = IO(new Bundle() { // set preg state to busy val allocPregs = Vec(RenameWidth, Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) // set preg state to ready (write back regfile + rob walk) val wbPregs = Vec(numWritePorts, Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) // fast wakeup - val wakeUp: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(params.genIQWakeUpInValidBundle) + val wakeUpInt = Flipped(backendParams.intSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpFp = Flipped(backendParams.fpSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpVec = Flipped(backendParams.vfSchdParams.get.genIQWakeUpOutValidBundle) + val wakeUpMem = Flipped(backendParams.memSchdParams.get.genIQWakeUpOutValidBundle) // cancelFromDatapath val og0Cancel = Input(ExuVec()) // cancelFromMem @@ -54,13 +58,11 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: val read = Vec(numReadPorts, new BusyTableReadIO) }) - val allExuParams = params.backendParam.allExuParams + val allExuParams = backendParams.allExuParams val intBusyTableNeedLoadCancel = allExuParams.map(x => x.needLoadDependency && x.writeIntRf && x.iqWakeUpSourcePairs.map(y => y.sink.getExuParam(allExuParams).readIntRf).foldLeft(false)(_ || _) ).reduce(_ || _) - val fpBusyTableNeedLoadCancel = allExuParams.map(x => - x.needLoadDependency && x.writeFpRf && x.iqWakeUpSourcePairs.map(y => y.sink.getExuParam(allExuParams).readFpRf).foldLeft(false)(_ || _) - ).reduce(_ || _) + val fpBusyTableNeedLoadCancel = false val vfBusyTableNeedLoadCancel = allExuParams.map(x => x.needLoadDependency && x.writeVfRf && x.iqWakeUpSourcePairs.map(y => y.sink.getExuParam(allExuParams).readVecRf).foldLeft(false)(_ || _) ).reduce(_ || _) @@ -80,12 +82,13 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: } if (!needLoadCancel) println(s"[BusyTable]: WbConfig ${pregWB} busyTable don't need loadCancel") val loadCancel = if (needLoadCancel) io.ldCancel else 0.U.asTypeOf(io.ldCancel) + val allWakeUp = io.wakeUpInt ++ io.wakeUpFp ++ io.wakeUpVec ++ 
io.wakeUpMem val wakeUpIn = pregWB match { - case IntWB(_, _) => io.wakeUp.filter(_.bits.params.writeIntRf) - case FpWB(_, _) => io.wakeUp.filter(_.bits.params.writeFpRf) - case VfWB(_, _) => io.wakeUp.filter(_.bits.params.writeVfRf) - case V0WB(_, _) => io.wakeUp.filter(_.bits.params.writeV0Rf) - case VlWB(_, _) => io.wakeUp.filter(_.bits.params.writeVlRf) + case IntWB(_, _) => allWakeUp.filter{x => x.bits.params.writeIntRf && (x.bits.params.hasLoadExu || x.bits.params.hasAluFu)} + case FpWB(_, _) => allWakeUp.filter{x => x.bits.params.writeFpRf && !x.bits.params.hasLoadExu} + case VfWB(_, _) => allWakeUp.filter(_.bits.params.writeVfRf) + case V0WB(_, _) => allWakeUp.filter(_.bits.params.writeV0Rf) + case VlWB(_, _) => allWakeUp.filter(_.bits.params.writeVlRf) case _ => throw new IllegalArgumentException(s"WbConfig ${pregWB} is not permitted") } val loadDependency = RegInit(0.U.asTypeOf(Vec(numPhyPregs, Vec(LoadPipelineWidth, UInt(LoadDependencyWidth.W))))) @@ -131,9 +134,9 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: when(wakeUpMask(idx)) { ldDp := (if (wakeUpIn.nonEmpty) Mux1H(wakeupOHVec(idx), shiftLoadDependency) else 0.U.asTypeOf(ldDp)) } - .elsewhen(ldDp.map(x => x.orR).reduce(_ | _)) { - ldDp := VecInit(ldDp.map(x => x << 1)) - } + .elsewhen(ldDp.map(x => x.orR).reduce(_ | _)) { + ldDp := VecInit(ldDp.map(x => x << 1)) + } } /* @@ -153,21 +156,22 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: when(wakeUpMask(idx) || wbMask(idx)) { update := false.B //ready } - .elsewhen(allocMask(idx) || ldCancelMask(idx)) { - update := true.B //busy - if (idx == 0 && pregWB.isInstanceOf[IntWB]) { + .elsewhen(allocMask(idx) || ldCancelMask(idx)) { + update := true.B //busy + if (idx == 0 && pregWB.isInstanceOf[IntWB]) { // Int RegFile 0 is always ready update := false.B + } + } + .otherwise { + update := table(idx) } - } - .otherwise { - update := table(idx) - } } io.read.foreach{ case res => - res.resp := !table(res.req) - res.loadDependency := loadDependency(res.req) + val readBypass = VecInit(io.allocPregs.map(x => x.valid && x.bits === res.req)) + res.resp := !(table(res.req) || readBypass.asUInt.orR) + res.loadDependency := (if (needLoadCancel) loadDependency(res.req) else 0.U.asTypeOf(res.loadDependency)) } val oddTable = table.asBools.zipWithIndex.filter(_._2 % 2 == 1).map(_._1) @@ -183,9 +187,10 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: ("bt_std_freelist_4_4_valid", busyCount > (numPhyPregs * 3 / 4).U ) ) generatePerfEvent() + } -class VlBusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: PregWB)(implicit p: Parameters, params: SchdBlockParams) extends BusyTable(numReadPorts, numWritePorts, numPhyPregs, pregWB) { +class VlBusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregWB: PregWB)(implicit p: Parameters) extends BusyTable(numReadPorts, numWritePorts, numPhyPregs, pregWB) { val io_vl_Wb = IO(new Bundle() { val vlWriteBackInfo = new Bundle { @@ -249,3 +254,4 @@ class VlBusyTable(numReadPorts: Int, numWritePorts: Int, numPhyPregs: Int, pregW vlRes.is_vlmax := !vlmaxTable(res.req) } } + diff --git a/src/main/scala/xiangshan/backend/rename/Rename.scala b/src/main/scala/xiangshan/backend/rename/Rename.scala index 360850ac02d..57b3fdef48b 100644 --- a/src/main/scala/xiangshan/backend/rename/Rename.scala +++ b/src/main/scala/xiangshan/backend/rename/Rename.scala @@ -398,6 +398,10 @@ class Rename(implicit p: Parameters) extends XSModule 
with HasCircularQueuePtrHe io.out(i).valid := io.in(i).valid && intFreeList.io.canAllocate && fpFreeList.io.canAllocate && vecFreeList.io.canAllocate && v0FreeList.io.canAllocate && vlFreeList.io.canAllocate && !io.rabCommits.isWalk io.out(i).bits := uops(i) + // dirty code + if (i == 0) { + io.out(i).bits.psrc(0) := Mux(io.out(i).bits.isLUI, 0.U, uops(i).psrc(0)) + } // Todo: move these shit in decode stage // dirty code for fence. The lsrc is passed by imm. when (io.out(i).bits.fuType === FuType.fence.U) { @@ -534,9 +538,10 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe } bypassCond(j)(i - 1) := VecInit(destToSrc).asUInt } - io.out(i).bits.psrc(0) := io.out.take(i).map(_.bits.pdest).zip(bypassCond(0)(i-1).asBools).foldLeft(uops(i).psrc(0)) { + // For the LUI instruction: psrc(0) is from register file and should always be zero. + io.out(i).bits.psrc(0) := Mux(io.out(i).bits.isLUI, 0.U, io.out.take(i).map(_.bits.pdest).zip(bypassCond(0)(i-1).asBools).foldLeft(uops(i).psrc(0)) { (z, next) => Mux(next._2, next._1, z) - } + }) io.out(i).bits.psrc(1) := io.out.take(i).map(_.bits.pdest).zip(bypassCond(1)(i-1).asBools).foldLeft(uops(i).psrc(1)) { (z, next) => Mux(next._2, next._1, z) } diff --git a/src/main/scala/xiangshan/backend/rob/Rob.scala b/src/main/scala/xiangshan/backend/rob/Rob.scala index 4ef7252587b..1615a7cf6f9 100644 --- a/src/main/scala/xiangshan/backend/rob/Rob.scala +++ b/src/main/scala/xiangshan/backend/rob/Rob.scala @@ -983,7 +983,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val isFirstEnq = !robEntries(i).valid && instCanEnqFlag val realDestEnqNum = PopCount(enqNeedWriteRFSeq.zip(uopCanEnqSeq).map { case (writeFlag, valid) => writeFlag && valid }) when(isFirstEnq){ - robEntries(i).realDestSize := Mux(hasExcpFlag, 0.U, realDestEnqNum) + robEntries(i).realDestSize := realDestEnqNum //Mux(hasExcpFlag, 0.U, realDestEnqNum) }.elsewhen(robEntries(i).valid && Cat(uopCanEnqSeq).orR){ robEntries(i).realDestSize := robEntries(i).realDestSize + realDestEnqNum } diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index f8e87d9898b..0d1222b40d8 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -325,8 +325,8 @@ class LsqEnqCtrl(implicit p: Parameters) extends XSModule val blockVec = io.enq.iqAccept.map(!_) :+ true.B val numLsElem = io.enq.req.map(_.bits.numLsElem) - val needEnqLoadQueue = VecInit(io.enq.req.map(x => FuType.isLoad(x.bits.fuType) || FuType.isVNonsegLoad(x.bits.fuType))) - val needEnqStoreQueue = VecInit(io.enq.req.map(x => FuType.isStore(x.bits.fuType) || FuType.isVNonsegStore(x.bits.fuType))) + val needEnqLoadQueue = VecInit(io.enq.req.map(x => x.valid && (FuType.isLoad(x.bits.fuType) || FuType.isVNonsegLoad(x.bits.fuType)))) + val needEnqStoreQueue = VecInit(io.enq.req.map(x => x.valid && (FuType.isStore(x.bits.fuType) || FuType.isVNonsegStore(x.bits.fuType)))) val loadQueueElem = needEnqLoadQueue.zip(numLsElem).map(x => Mux(x._1, x._2, 0.U)) val storeQueueElem = needEnqStoreQueue.zip(numLsElem).map(x => Mux(x._1, x._2, 0.U)) val loadFlowPopCount = 0.U +: loadQueueElem.zipWithIndex.map{ case (l, i) => diff --git a/src/test/scala/xiangshan/backend/dispatch/Dispatch2IqMain.scala b/src/test/scala/xiangshan/backend/dispatch/Dispatch2IqMain.scala deleted file mode 100644 index 4abec8d75d1..00000000000 --- 
a/src/test/scala/xiangshan/backend/dispatch/Dispatch2IqMain.scala +++ /dev/null @@ -1,20 +0,0 @@ -package xiangshan.backend.dispatch - -import chisel3._ -import freechips.rocketchip.diplomacy.{DisableMonitors, LazyModule} -import top.{ArgParser, Generator, XSTop} -import xiangshan.XSCoreParamsKey - - -object Dispatch2IqMain extends App { - val (config, firrtlOpts, firtoolOpts) = ArgParser.parse(args) - - val backendParams = config(XSCoreParamsKey).backendParams - val soc = DisableMonitors(p => LazyModule(new XSTop()(p)))(config) - - Generator.execute( - firrtlOpts, - soc.core_with_l2(0).core.backend.inner.intScheduler.get.dispatch2Iq.module, - firtoolOpts - ) -} From 84e81007d626526436e25428a5320206082bc7ca Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Fri, 22 Nov 2024 18:43:32 +0800 Subject: [PATCH 02/32] area(backend): reduce 4 fexu to 3 fexu --- src/main/scala/xiangshan/Parameters.scala | 29 ++++++++++------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala index 19821cb841b..19880f71c4b 100644 --- a/src/main/scala/xiangshan/Parameters.scala +++ b/src/main/scala/xiangshan/Parameters.scala @@ -410,7 +410,7 @@ case class XSCoreParameters ), numEntries = IssueQueueSize, numEnq = 2, numComp = IssueQueueCompEntrySize), IssueBlockParams(Seq( ExeUnitParams("ALU2", Seq(AluCfg), Seq(IntWB(port = 2, 0)), Seq(Seq(IntRD(4, 0)), Seq(IntRD(5, 0))), true, 2), - ExeUnitParams("BJU2", Seq(BrhCfg, JmpCfg, I2fCfg, VSetRiWiCfg, VSetRiWvfCfg, I2vCfg), Seq(IntWB(port = 4, 0), VfWB(2, 0), V0WB(port = 2, 0), VlWB(port = intSchdVlWbPort, 0), FpWB(port = 4, 0)), Seq(Seq(IntRD(2, 1)), Seq(IntRD(3, 1)))), + ExeUnitParams("BJU2", Seq(BrhCfg, JmpCfg, I2fCfg, VSetRiWiCfg, VSetRiWvfCfg, I2vCfg), Seq(IntWB(port = 4, 0), VfWB(2, 0), V0WB(port = 2, 0), VlWB(port = intSchdVlWbPort, 0), FpWB(port = 2, 1)), Seq(Seq(IntRD(2, 1)), Seq(IntRD(3, 1)))), ), numEntries = IssueQueueSize, numEnq = 2, numComp = IssueQueueCompEntrySize), IssueBlockParams(Seq( ExeUnitParams("ALU3", Seq(AluCfg), Seq(IntWB(port = 3, 0)), Seq(Seq(IntRD(6, 0)), Seq(IntRD(7, 0))), true, 2), @@ -430,19 +430,14 @@ case class XSCoreParameters SchdBlockParams(Seq( IssueBlockParams(Seq( ExeUnitParams("FEX0", Seq(FaluCfg, FcvtCfg, F2vCfg, FmacCfg), Seq(FpWB(port = 0, 0), IntWB(port = 0, 2), VfWB(port = 3, 0), V0WB(port = 3, 0)), Seq(Seq(FpRD(0, 0)), Seq(FpRD(1, 0)), Seq(FpRD(2, 0)))), + ExeUnitParams("FEX1", Seq(FdivCfg), Seq(FpWB(port = 3, 1)), Seq(Seq(FpRD(2, 1)), Seq(FpRD(5, 1)))), ), numEntries = 18, numEnq = 2, numComp = 16), IssueBlockParams(Seq( - ExeUnitParams("FEX1", Seq(FaluCfg, FmacCfg), Seq(FpWB(port = 1, 0), IntWB(port = 1, 2)), Seq(Seq(FpRD(3, 0)), Seq(FpRD(4, 0)), Seq(FpRD(5, 0)))), + ExeUnitParams("FEX2", Seq(FaluCfg, FmacCfg), Seq(FpWB(port = 1, 0), IntWB(port = 1, 2)), Seq(Seq(FpRD(3, 0)), Seq(FpRD(4, 0)), Seq(FpRD(5, 0)))), + ExeUnitParams("FEX3", Seq(FdivCfg), Seq(FpWB(port = 4, 1)), Seq(Seq(FpRD(8, 1)), Seq(FpRD(9, 1)))), ), numEntries = 18, numEnq = 2, numComp = 16), IssueBlockParams(Seq( - ExeUnitParams("FEX2", Seq(FaluCfg, FmacCfg), Seq(FpWB(port = 2, 0), IntWB(port = 2, 2)), Seq(Seq(FpRD(6, 0)), Seq(FpRD(7, 0)), Seq(FpRD(8, 0)))), - ), numEntries = 18, numEnq = 2, numComp = 16), - IssueBlockParams(Seq( - ExeUnitParams("FEX3", Seq(FaluCfg, FmacCfg), Seq(FpWB(port = 3, 0), IntWB(port = 3, 2)), Seq(Seq(FpRD(9, 0)), Seq(FpRD(10, 0)), Seq(FpRD(11, 0)))), - ), numEntries = 18, numEnq = 2, numComp = 16), - 
IssueBlockParams(Seq( - ExeUnitParams("FEX4", Seq(FdivCfg), Seq(FpWB(port = 4, 1)), Seq(Seq(FpRD(2, 1)), Seq(FpRD(5, 1)))), - ExeUnitParams("FEX5", Seq(FdivCfg), Seq(FpWB(port = 3, 1)), Seq(Seq(FpRD(8, 1)), Seq(FpRD(11, 1)))), + ExeUnitParams("FEX4", Seq(FaluCfg, FmacCfg), Seq(FpWB(port = 2, 0), IntWB(port = 2, 1)), Seq(Seq(FpRD(6, 0)), Seq(FpRD(7, 0)), Seq(FpRD(8, 0)))), ), numEntries = 18, numEnq = 2, numComp = 16), ), numPregs = fpPreg.numEntries, @@ -488,13 +483,13 @@ case class XSCoreParameters ExeUnitParams("STA1", Seq(StaCfg, MouCfg), Seq(FakeIntWB()), Seq(Seq(IntRD(6, 2)))), ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( - ExeUnitParams("LDU0", Seq(LduCfg), Seq(IntWB(5, 0), FpWB(5, 0)), Seq(Seq(IntRD(8, 0))), true, 2), + ExeUnitParams("LDU0", Seq(LduCfg), Seq(IntWB(5, 0), FpWB(3, 0)), Seq(Seq(IntRD(8, 0))), true, 2), ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( - ExeUnitParams("LDU1", Seq(LduCfg), Seq(IntWB(6, 0), FpWB(6, 0)), Seq(Seq(IntRD(9, 0))), true, 2), + ExeUnitParams("LDU1", Seq(LduCfg), Seq(IntWB(6, 0), FpWB(4, 0)), Seq(Seq(IntRD(9, 0))), true, 2), ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( - ExeUnitParams("LDU2", Seq(LduCfg), Seq(IntWB(7, 0), FpWB(7, 0)), Seq(Seq(IntRD(10, 0))), true, 2), + ExeUnitParams("LDU2", Seq(LduCfg), Seq(IntWB(7, 0), FpWB(5, 0)), Seq(Seq(IntRD(10, 0))), true, 2), ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("VLSU0", Seq(VlduCfg, VstuCfg, VseglduSeg, VsegstuCfg), Seq(VfWB(4, 0), V0WB(4, 0), VlWB(port = 2, 0)), Seq(Seq(VfRD(6, 0)), Seq(VfRD(7, 0)), Seq(VfRD(8, 0)), Seq(V0RD(2, 0)), Seq(VlRD(2, 0)))), @@ -503,10 +498,10 @@ case class XSCoreParameters ExeUnitParams("VLSU1", Seq(VlduCfg, VstuCfg), Seq(VfWB(5, 0), V0WB(5, 0), VlWB(port = 3, 0)), Seq(Seq(VfRD(9, 0)), Seq(VfRD(10, 0)), Seq(VfRD(11, 0)), Seq(V0RD(3, 0)), Seq(VlRD(3, 0)))), ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( - ExeUnitParams("STD0", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(5, 2), FpRD(12, 0)))), + ExeUnitParams("STD0", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(5, 2), FpRD(9, 0)))), ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( - ExeUnitParams("STD1", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(3, 2), FpRD(13, 0)))), + ExeUnitParams("STD1", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(3, 2), FpRD(10, 0)))), ), numEntries = 16, numEnq = 2, numComp = 14), ), numPregs = intPreg.numEntries max vfPreg.numEntries, @@ -527,8 +522,8 @@ case class XSCoreParameters ), // TODO: add load -> fp slow wakeup WakeUpConfig( - Seq("FEX0", "FEX1", "FEX2", "FEX3") -> - Seq("FEX0", "FEX1", "FEX2", "FEX3", "FEX4", "FEX5") + Seq("FEX0", "FEX2", "FEX4") -> + Seq("FEX0", "FEX1", "FEX2", "FEX3", "FEX4") ), ).flatten } From 6a1210dc4c37a3b332f108a98ff6d8605fe622a2 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Mon, 25 Nov 2024 18:35:33 +0800 Subject: [PATCH 03/32] area(Backend): merge pcMem and pcTargetMem --- src/main/scala/xiangshan/XSCore.scala | 1 - .../scala/xiangshan/backend/Backend.scala | 14 +-------- .../scala/xiangshan/backend/CtrlBlock.scala | 31 ++++++++++++++++--- .../scala/xiangshan/backend/MemBlock.scala | 11 ++++--- .../xiangshan/backend/datapath/DataPath.scala | 4 +-- .../xiangshan/backend/exu/ExeUnitParams.scala | 3 -- .../scala/xiangshan/backend/fu/Fence.scala | 5 ++- .../scala/xiangshan/backend/fu/FuConfig.scala | 4 +-- .../scala/xiangshan/backend/fu/FuncUnit.scala | 2 -- 
.../xiangshan/backend/issue/IssueQueue.scala | 1 + 10 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/main/scala/xiangshan/XSCore.scala b/src/main/scala/xiangshan/XSCore.scala index 11196085d05..633511b225d 100644 --- a/src/main/scala/xiangshan/XSCore.scala +++ b/src/main/scala/xiangshan/XSCore.scala @@ -195,7 +195,6 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer) // By default, instructions do not have exceptions when they enter the function units. memBlock.io.ooo_to_mem.issueUops.map(_.bits.uop.clearExceptions()) - memBlock.io.ooo_to_mem.loadPc := backend.io.mem.loadPcRead memBlock.io.ooo_to_mem.storePc := backend.io.mem.storePcRead memBlock.io.ooo_to_mem.hybridPc := backend.io.mem.hyuPcRead memBlock.io.ooo_to_mem.flushSb := backend.io.fenceio.sbuffer.flushSb diff --git a/src/main/scala/xiangshan/backend/Backend.scala b/src/main/scala/xiangshan/backend/Backend.scala index 6c5e9c4d1b0..58efddecc61 100644 --- a/src/main/scala/xiangshan/backend/Backend.scala +++ b/src/main/scala/xiangshan/backend/Backend.scala @@ -176,7 +176,6 @@ class BackendInlined(val params: BackendParams)(implicit p: Parameters) extends println(s"[Backend] copyPdestInfo ${params.copyPdestInfo}") params.allExuParams.map(_.copyNum) val ctrlBlock = LazyModule(new CtrlBlock(params)) - val pcTargetMem = LazyModule(new PcTargetMem(params)) val intScheduler = params.intSchdParams.map(x => LazyModule(new Scheduler(x))) val fpScheduler = params.fpSchdParams.map(x => LazyModule(new Scheduler(x))) val vfScheduler = params.vfSchdParams.map(x => LazyModule(new Scheduler(x))) @@ -199,7 +198,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame val io = IO(new BackendIO()(p, wrapper.params)) private val ctrlBlock = wrapper.ctrlBlock.module - private val pcTargetMem = wrapper.pcTargetMem.module private val intScheduler: SchedulerImpBase = wrapper.intScheduler.get.module private val fpScheduler = wrapper.fpScheduler.get.module private val vfScheduler = wrapper.vfScheduler.get.module @@ -484,8 +482,7 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame } } - pcTargetMem.io.fromFrontendFtq := io.frontend.fromFtq - pcTargetMem.io.toDataPath <> dataPath.io.fromPcTargetMem + ctrlBlock.io.toDataPath.pcToDataPathIO <> dataPath.io.fromPcTargetMem private val csrin = intExuBlock.io.csrin.get csrin.hartId := io.fromTop.hartId @@ -744,13 +741,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame io.mem.sfence := fenceio.sfence io.mem.isStoreException := CommitType.lsInstIsStore(ctrlBlock.io.robio.exception.bits.commitType) io.mem.isVlsException := ctrlBlock.io.robio.exception.bits.vls - require(io.mem.loadPcRead.size == params.LduCnt) - io.mem.loadPcRead.zipWithIndex.foreach { case (loadPcRead, i) => - loadPcRead := ctrlBlock.io.memLdPcRead(i).data - ctrlBlock.io.memLdPcRead(i).valid := io.mem.issueLda(i).valid - ctrlBlock.io.memLdPcRead(i).ptr := io.mem.issueLda(i).bits.uop.ftqPtr - ctrlBlock.io.memLdPcRead(i).offset := io.mem.issueLda(i).bits.uop.ftqOffset - } io.mem.storePcRead.zipWithIndex.foreach { case (storePcRead, i) => storePcRead := ctrlBlock.io.memStPcRead(i).data @@ -806,7 +796,6 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame ModuleNode(wbDataPath) )) val leftResetTree = ResetGenNode(Seq( - ModuleNode(pcTargetMem), ModuleNode(intScheduler), ModuleNode(fpScheduler), ModuleNode(vfScheduler), @@ -883,7 +872,6 @@ class BackendMemIO(implicit p: Parameters, params: 
BackendParams) extends XSBund val vlduIqFeedback = Flipped(Vec(params.VlduCnt, new MemRSFeedbackIO(isVector = true))) val ldCancel = Vec(params.LdExuCnt, Input(new LoadCancelIO)) val wakeup = Vec(params.LdExuCnt, Flipped(Valid(new DynInst))) - val loadPcRead = Vec(params.LduCnt, Output(UInt(VAddrBits.W))) val storePcRead = Vec(params.StaCnt, Output(UInt(VAddrBits.W))) val hyuPcRead = Vec(params.HyuCnt, Output(UInt(VAddrBits.W))) // Input diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 0666fbb2f68..4914f2c218d 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -72,6 +72,8 @@ class CtrlBlockImp( "redirect" -> 1, "memPred" -> 1, "robFlush" -> 1, + "bjuPc" -> params.BrhCnt, + "bjuTarget" -> params.BrhCnt, "load" -> params.LduCnt, "hybrid" -> params.HyuCnt, "store" -> (if(EnableStorePrefetchSMS) params.StaCnt else 0), @@ -213,11 +215,32 @@ class CtrlBlockImp( pcMem.io.raddr(pcMemRdIndexes("memPred").head) := memViolation.bits.stFtqIdx.value redirectGen.io.memPredPcRead.data := pcMem.io.rdata(pcMemRdIndexes("memPred").head).getPc(RegEnable(memViolation.bits.stFtqOffset, memViolation.valid)) + for ((pcMemIdx, i) <- pcMemRdIndexes("bjuPc").zipWithIndex) { + val ren = io.toDataPath.pcToDataPathIO.fromDataPathValid(i) + val raddr = io.toDataPath.pcToDataPathIO.fromDataPathFtqPtr(i).value + val roffset = io.toDataPath.pcToDataPathIO.fromDataPathFtqOffset(i) + pcMem.io.ren.get(pcMemIdx) := ren + pcMem.io.raddr(pcMemIdx) := raddr + io.toDataPath.pcToDataPathIO.toDataPathPC(i) := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(roffset, ren)) + } + + for ((pcMemIdx, i) <- pcMemRdIndexes("bjuTarget").zipWithIndex) { + val ren = io.toDataPath.pcToDataPathIO.fromDataPathValid(i) + val raddr = io.toDataPath.pcToDataPathIO.fromDataPathFtqPtr(i).value + 1.U + pcMem.io.ren.get(pcMemIdx) := ren + pcMem.io.raddr(pcMemIdx) := raddr + io.toDataPath.pcToDataPathIO.toDataPathTargetPC(i) := pcMem.io.rdata(pcMemIdx).startAddr + } + + val baseIdx = params.BrhCnt for ((pcMemIdx, i) <- pcMemRdIndexes("load").zipWithIndex) { // load read pcMem (s0) -> get rdata (s1) -> reg next in Memblock (s2) -> reg next in Memblock (s3) -> consumed by pf (s3) - pcMem.io.ren.get(pcMemIdx) := io.memLdPcRead(i).valid - pcMem.io.raddr(pcMemIdx) := io.memLdPcRead(i).ptr.value - io.memLdPcRead(i).data := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(io.memLdPcRead(i).offset, io.memLdPcRead(i).valid)) + val ren = io.toDataPath.pcToDataPathIO.fromDataPathValid(baseIdx+i) + val raddr = io.toDataPath.pcToDataPathIO.fromDataPathFtqPtr(baseIdx+i).value + val roffset = io.toDataPath.pcToDataPathIO.fromDataPathFtqOffset(baseIdx+i) + pcMem.io.ren.get(pcMemIdx) := ren + pcMem.io.raddr(pcMemIdx) := raddr + io.toDataPath.pcToDataPathIO.toDataPathPC(baseIdx+i) := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(roffset, ren)) } for ((pcMemIdx, i) <- pcMemRdIndexes("hybrid").zipWithIndex) { @@ -764,6 +787,7 @@ class CtrlBlockIO()(implicit p: Parameters, params: BackendParams) extends XSBun } val toDataPath = new Bundle { val flush = ValidIO(new Redirect) + val pcToDataPathIO = new PcToDataPathIO(params) } val toExuBlock = new Bundle { val flush = ValidIO(new Redirect) @@ -779,7 +803,6 @@ class CtrlBlockIO()(implicit p: Parameters, params: BackendParams) extends XSBun val stIn = Vec(params.StaExuCnt, Flipped(ValidIO(new DynInst))) // use storeSetHit, ssid, robIdx val violation = Flipped(ValidIO(new Redirect)) } - val memLdPcRead = 
Vec(params.LduCnt, Flipped(new FtqRead(UInt(VAddrBits.W)))) val memStPcRead = Vec(params.StaCnt, Flipped(new FtqRead(UInt(VAddrBits.W)))) val memHyPcRead = Vec(params.HyuCnt, Flipped(new FtqRead(UInt(VAddrBits.W)))) diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 3b2c6f1ee70..dbb0cdea14a 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -106,7 +106,6 @@ class ooo_to_mem(implicit p: Parameters) extends MemBlockBundle { val enqLsq = new LsqEnqIO val flushSb = Input(Bool()) - val loadPc = Vec(LduCnt, Input(UInt(VAddrBits.W))) // for hw prefetch val storePc = Vec(StaCnt, Input(UInt(VAddrBits.W))) // for hw prefetch val hybridPc = Vec(HyuCnt, Input(UInt(VAddrBits.W))) // for hw prefetch @@ -417,10 +416,11 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) source.bits.miss || isFromStride(source.bits.meta_prefetch) ) l1Prefetcher.stride_train(i).bits := source.bits + val loadPc = RegNext(io.ooo_to_mem.issueLda(i).bits.uop.pc) // for s1 l1Prefetcher.stride_train(i).bits.uop.pc := Mux( loadUnits(i).io.s2_ptr_chasing, - RegEnable(io.ooo_to_mem.loadPc(i), loadUnits(i).io.s2_prefetch_spec), - RegEnable(RegEnable(io.ooo_to_mem.loadPc(i), loadUnits(i).io.s1_prefetch_spec), loadUnits(i).io.s2_prefetch_spec) + RegEnable(loadPc, loadUnits(i).io.s2_prefetch_spec), + RegEnable(RegEnable(loadPc, loadUnits(i).io.s1_prefetch_spec), loadUnits(i).io.s2_prefetch_spec) ) } for (i <- 0 until HyuCnt) { @@ -877,10 +877,11 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) source.valid && source.bits.isFirstIssue && source.bits.miss ) pf.io.ld_in(i).bits := source.bits + val loadPc = RegNext(io.ooo_to_mem.issueLda(i).bits.uop.pc) // for s1 pf.io.ld_in(i).bits.uop.pc := Mux( loadUnits(i).io.s2_ptr_chasing, - RegEnable(io.ooo_to_mem.loadPc(i), loadUnits(i).io.s2_prefetch_spec), - RegEnable(RegEnable(io.ooo_to_mem.loadPc(i), loadUnits(i).io.s1_prefetch_spec), loadUnits(i).io.s2_prefetch_spec) + RegEnable(loadPc, loadUnits(i).io.s2_prefetch_spec), + RegEnable(RegEnable(loadPc, loadUnits(i).io.s1_prefetch_spec), loadUnits(i).io.s2_prefetch_spec) ) }) l1PrefetcherOpt.foreach(pf => { diff --git a/src/main/scala/xiangshan/backend/datapath/DataPath.scala b/src/main/scala/xiangshan/backend/datapath/DataPath.scala index c019dc56be0..de237ca7f46 100644 --- a/src/main/scala/xiangshan/backend/datapath/DataPath.scala +++ b/src/main/scala/xiangshan/backend/datapath/DataPath.scala @@ -257,7 +257,7 @@ class DataPathImp(override val wrapper: DataPath)(implicit p: Parameters, params private val vlRfWaddr = Wire(Vec(io.fromVlWb.length, UInt(log2Up(VlPhyRegs).W))) private val vlRfWdata = Wire(Vec(io.fromVlWb.length, UInt(VlData().dataWidth.W))) - val pcReadFtqPtrFormIQ = fromIntIQ.flatten.filter(x => x.bits.exuParams.needPc) + val pcReadFtqPtrFormIQ = (fromIntIQ ++ fromMemIQ).flatten.filter(x => x.bits.exuParams.needPc) assert(pcReadFtqPtrFormIQ.size == pcReadFtqPtr.size, s"pcReadFtqPtrFormIQ.size ${pcReadFtqPtrFormIQ.size} not equal pcReadFtqPtr.size ${pcReadFtqPtr.size}") pcReadValid.zip(pcReadFtqPtrFormIQ.map(_.valid)).map(x => x._1 := x._2) pcReadFtqPtr.zip(pcReadFtqPtrFormIQ.map(_.bits.common.ftqIdx.get)).map(x => x._1 := x._2) @@ -711,7 +711,7 @@ class DataPathImp(override val wrapper: DataPath)(implicit p: Parameters, params if (readRfMap.nonEmpty) sinkData.src(k) := Mux1H(readRfMap) } - if (sinkData.params.hasJmpFu) { + if 
(sinkData.params.hasJmpFu || sinkData.params.hasLoadFu) { val index = pcReadFtqPtrFormIQ.map(_.bits.exuParams).indexOf(sinkData.params) sinkData.pc.get := pcRdata(index) } diff --git a/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala b/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala index bfeb1095442..62877662b2b 100644 --- a/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala +++ b/src/main/scala/xiangshan/backend/exu/ExeUnitParams.scala @@ -115,9 +115,6 @@ case class ExeUnitParams( val wbIndex: Seq[Int] = Seq(wbIntIndex, wbFpIndex, wbVecIndex, wbV0Index, wbVlIndex) - - require(needPc && needTarget || !needPc && !needTarget, "The ExeUnit must need both PC and Target PC") - def copyNum: Int = { val setIQ = mutable.Set[IssueBlockParams]() iqWakeUpSourcePairs.map(_.sink).foreach{ wakeupSink => diff --git a/src/main/scala/xiangshan/backend/fu/Fence.scala b/src/main/scala/xiangshan/backend/fu/Fence.scala index 2f59ddd49fa..481e4e1f7fc 100644 --- a/src/main/scala/xiangshan/backend/fu/Fence.scala +++ b/src/main/scala/xiangshan/backend/fu/Fence.scala @@ -84,15 +84,14 @@ class Fence(cfg: FuConfig)(implicit p: Parameters) extends FuncUnit(cfg) { io.out.valid := state =/= s_idle && state =/= s_wait io.out.bits.res.data := 0.U io.out.bits.ctrl.robIdx := uop.ctrl.robIdx - io.out.bits.res.pc.get := uop.data.pc.get io.out.bits.ctrl.pdest := uop.ctrl.pdest io.out.bits.ctrl.flushPipe.get := uop.ctrl.flushPipe.get io.out.bits.ctrl.exceptionVec.get := 0.U.asTypeOf(io.out.bits.ctrl.exceptionVec.get) io.out.bits.perfDebugInfo := io.in.bits.perfDebugInfo - XSDebug(io.in.valid, p"In(${io.in.valid} ${io.in.ready}) state:${state} Inpc:0x${Hexadecimal(io.in.bits.data.pc.get)} InrobIdx:${io.in.bits.ctrl.robIdx}\n") + XSDebug(io.in.valid, p"In(${io.in.valid} ${io.in.ready}) state:${state} InrobIdx:${io.in.bits.ctrl.robIdx}\n") XSDebug(state =/= s_idle, p"state:${state} sbuffer(flush:${sbuffer} empty:${sbEmpty}) fencei:${fencei} sfence:${sfence}\n") - XSDebug(io.out.valid, p" Out(${io.out.valid} ${io.out.ready}) state:${state} Outpc:0x${Hexadecimal(io.out.bits.res.pc.get)} OutrobIdx:${io.out.bits.ctrl.robIdx}\n") + XSDebug(io.out.valid, p" Out(${io.out.valid} ${io.out.ready}) state:${state} OutrobIdx:${io.out.bits.ctrl.robIdx}\n") assert(!io.out.valid || io.out.ready, "when fence is out valid, out ready should always be true") } diff --git a/src/main/scala/xiangshan/backend/fu/FuConfig.scala b/src/main/scala/xiangshan/backend/fu/FuConfig.scala index 46080d6c331..a7928c80934 100644 --- a/src/main/scala/xiangshan/backend/fu/FuConfig.scala +++ b/src/main/scala/xiangshan/backend/fu/FuConfig.scala @@ -146,12 +146,12 @@ case class FuConfig ( def hasPredecode: Boolean = Seq(FuType.jmp, FuType.brh, FuType.csr, FuType.ldu).contains(fuType) - def needTargetPc: Boolean = Seq(FuType.jmp, FuType.brh, FuType.csr).contains(fuType) + def needTargetPc: Boolean = Seq(FuType.jmp, FuType.brh).contains(fuType) // predict info def needPdInfo: Boolean = Seq(FuType.jmp, FuType.brh, FuType.csr).contains(fuType) - def needPc: Boolean = Seq(FuType.jmp, FuType.brh, FuType.fence).contains(fuType) + def needPc: Boolean = Seq(FuType.jmp, FuType.brh, FuType.ldu).contains(fuType) def needFPUCtrl: Boolean = { import FuType._ diff --git a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala index 334c3fd699a..6a133f04b66 100644 --- a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala @@ -65,7 +65,6 @@ class 
FuncUnitDataOutput(cfg: FuConfig)(implicit p: Parameters) extends XSBundle val data = UInt(cfg.destDataBits.W) val fflags = OptionWrapper(cfg.writeFflags, UInt(5.W)) val vxsat = OptionWrapper(cfg.writeVxsat, Vxsat()) - val pc = OptionWrapper(cfg.isFence, UInt(VAddrData().dataWidth.W)) val redirect = OptionWrapper(cfg.hasRedirect, ValidIO(new Redirect)) } @@ -221,7 +220,6 @@ trait HasPipelineReg { this: FuncUnit => io.in.ready := fixRdyVec.head io.out.valid := fixValidVec.last - io.out.bits.res.pc.zip(pcVec.last).foreach { case (l, r) => l := r } io.out.bits.ctrl.robIdx := fixCtrlVec.last.robIdx io.out.bits.ctrl.pdest := fixCtrlVec.last.pdest diff --git a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala b/src/main/scala/xiangshan/backend/issue/IssueQueue.scala index b34892b0e8f..a0912165616 100644 --- a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala +++ b/src/main/scala/xiangshan/backend/issue/IssueQueue.scala @@ -1131,6 +1131,7 @@ class IssueQueueMemAddrImp(override val wrapper: IssueQueue)(implicit p: Paramet require(!loadWakeUpIter.hasNext) deqBeforeDly.zipWithIndex.foreach { case (deq, i) => + deq.bits.common.pc.foreach(_ := 0.U) deq.bits.common.loadWaitBit.foreach(_ := deqEntryVec(i).bits.payload.loadWaitBit) deq.bits.common.waitForRobIdx.foreach(_ := deqEntryVec(i).bits.payload.waitForRobIdx) deq.bits.common.storeSetHit.foreach(_ := deqEntryVec(i).bits.payload.storeSetHit) From 4a8537f9e4445c6d6a25f5b2d5b68bec3838f7ca Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Tue, 26 Nov 2024 11:07:34 +0800 Subject: [PATCH 04/32] area(backend): remove memCtrl and disble mdp --- src/main/scala/xiangshan/backend/CtrlBlock.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 4914f2c218d..dc817977d43 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -558,6 +558,10 @@ class CtrlBlockImp( RegEnable(waittable2rename, decodeOut.fire) } rename.io.ssit := memCtrl.io.ssit2Rename + // disble mdp + dispatch.io.lfst.resp := 0.U.asTypeOf(dispatch.io.lfst.resp) + rename.io.waittable := 0.U.asTypeOf(rename.io.waittable) + rename.io.ssit := 0.U.asTypeOf(rename.io.ssit) rename.io.intReadPorts := VecInit(rat.io.intReadPorts.map(x => VecInit(x.map(_.data)))) rename.io.fpReadPorts := VecInit(rat.io.fpReadPorts.map(x => VecInit(x.map(_.data)))) rename.io.vecReadPorts := VecInit(rat.io.vecReadPorts.map(x => VecInit(x.map(_.data)))) From 93d3c27ace833c48ee92a715c1a5d5617d7ee651 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Tue, 26 Nov 2024 14:05:22 +0800 Subject: [PATCH 05/32] area(Backend): reduce VfScheduler iq num from 3 to 2 and remove a vfcvt fu --- src/main/scala/xiangshan/Parameters.scala | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala index 19880f71c4b..5f6730137bc 100644 --- a/src/main/scala/xiangshan/Parameters.scala +++ b/src/main/scala/xiangshan/Parameters.scala @@ -452,16 +452,13 @@ case class XSCoreParameters implicit val schdType: SchedulerType = VfScheduler() SchdBlockParams(Seq( IssueBlockParams(Seq( - ExeUnitParams("VFEX0", Seq(VfmaCfg, VialuCfg, VimacCfg, VppuCfg), Seq(VfWB(port = 0, 0), V0WB(port = 0, 0)), Seq(Seq(VfRD(0, 0)), Seq(VfRD(1, 0)), Seq(VfRD(2, 0)), Seq(V0RD(0, 0)), Seq(VlRD(0, 0)))), - ExeUnitParams("VFEX1", Seq(VfaluCfg, VfcvtCfg, 
VipuCfg, VSetRvfWvfCfg), Seq(VfWB(port = 0, 1), V0WB(port = 0, 1), VlWB(port = vfSchdVlWbPort, 0), IntWB(port = 1, 1), FpWB(port = 0, 1)), Seq(Seq(VfRD(0, 1)), Seq(VfRD(1, 1)), Seq(VfRD(2, 1)), Seq(V0RD(0, 1)), Seq(VlRD(0, 1)))), - ), numEntries = 16, numEnq = 2, numComp = 14), - IssueBlockParams(Seq( - ExeUnitParams("VFEX2", Seq(VfmaCfg, VialuCfg), Seq(VfWB(port = 1, 0), V0WB(port = 1, 0)), Seq(Seq(VfRD(3, 0)), Seq(VfRD(4, 0)), Seq(VfRD(5, 0)), Seq(V0RD(1, 0)), Seq(VlRD(1, 0)))), - ExeUnitParams("VFEX3", Seq(VfaluCfg, VfcvtCfg), Seq(VfWB(port = 2, 1), V0WB(port = 2, 1), FpWB(port = 1, 1)), Seq(Seq(VfRD(3, 1)), Seq(VfRD(4, 1)), Seq(VfRD(5, 1)), Seq(V0RD(1, 1)), Seq(VlRD(1, 1)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ExeUnitParams("VFEX0", Seq(VialuCfg, VfaluCfg, VfmaCfg, VipuCfg, VSetRvfWvfCfg), Seq(VfWB(port = 0, 0), V0WB(port = 0, 0), VlWB(port = vfSchdVlWbPort, 0), IntWB(port = 1, 1), FpWB(port = 0, 1)), Seq(Seq(VfRD(0, 0)), Seq(VfRD(1, 0)), Seq(VfRD(2, 0)), Seq(V0RD(0, 0)), Seq(VlRD(0, 0)))), + ExeUnitParams("VFEX1", Seq(VfcvtCfg, VimacCfg, VppuCfg), Seq(VfWB(port = 2, 1), V0WB(port = 0, 1)), Seq(Seq(VfRD(0, 1)), Seq(VfRD(1, 1)), Seq(VfRD(2, 1)), Seq(V0RD(0, 1)), Seq(VlRD(0, 1)))), + ), numEntries = 18, numEnq = 2, numComp = 14), IssueBlockParams(Seq( - ExeUnitParams("VFEX4", Seq(VfdivCfg, VidivCfg), Seq(VfWB(port = 3, 1), V0WB(port = 3, 1)), Seq(Seq(VfRD(3, 2)), Seq(VfRD(4, 2)), Seq(VfRD(5, 2)), Seq(V0RD(1, 2)), Seq(VlRD(1, 2)))), - ), numEntries = 10, numEnq = 2, numComp = 8), + ExeUnitParams("VFEX2", Seq(VialuCfg, VfaluCfg, VfmaCfg), Seq(VfWB(port = 1, 0), V0WB(port = 1, 0), FpWB(port = 1, 1)), Seq(Seq(VfRD(3, 0)), Seq(VfRD(4, 0)), Seq(VfRD(5, 0)), Seq(V0RD(1, 0)), Seq(VlRD(1, 0)))), + ExeUnitParams("VFEX3", Seq(VfdivCfg, VidivCfg), Seq(VfWB(port = 3, 1), V0WB(port = 2, 1)), Seq(Seq(VfRD(3, 1)), Seq(VfRD(4, 1)), Seq(VfRD(5, 1)), Seq(V0RD(1, 1)), Seq(VlRD(1, 1)))), + ), numEntries = 18, numEnq = 2, numComp = 14), ), numPregs = vfPreg.numEntries, numDeqOutside = 0, From 2f6b6d07d7fb883bd1a93dadfbfb4192bc9c6a68 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Thu, 28 Nov 2024 16:35:21 +0800 Subject: [PATCH 06/32] timing(backend): pipe robCommits for better timing and area --- src/main/scala/xiangshan/backend/rename/RenameTable.scala | 5 +++-- .../xiangshan/backend/rename/freelist/BaseFreeList.scala | 4 ++-- src/main/scala/xiangshan/backend/rob/Rob.scala | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/scala/xiangshan/backend/rename/RenameTable.scala b/src/main/scala/xiangshan/backend/rename/RenameTable.scala index a02dd081912..91f0e3b15e3 100644 --- a/src/main/scala/xiangshan/backend/rename/RenameTable.scala +++ b/src/main/scala/xiangshan/backend/rename/RenameTable.scala @@ -129,6 +129,7 @@ class RenameTable(reg_t: RegType)(implicit p: Parameters) extends XSModule with val t1_wSpec = RegNext(Mux(io.redirect, 0.U.asTypeOf(io.specWritePorts), io.specWritePorts)) val t1_snpt = RegNext(io.snpt, 0.U.asTypeOf(io.snpt)) + val t2_snpt = RegNext(t1_snpt, 0.U.asTypeOf(io.snpt)) val snapshots = SnapshotGenerator(spec_table, t1_snpt.snptEnq, t1_snpt.snptDeq, t1_redirect, t1_snpt.flushVec) @@ -139,8 +140,8 @@ class RenameTable(reg_t: RegType)(implicit p: Parameters) extends XSModule with val wMatch = ParallelPriorityMux(matchVec.reverse, t1_wSpec.map(_.data).reverse) // When there's a flush, we use arch_table to update spec_table. 
next := Mux( - t1_redirect, - Mux(t1_snpt.useSnpt, snapshots(t1_snpt.snptSelect)(i), arch_table(i)), + RegNext(t1_redirect), + Mux(t2_snpt.useSnpt, snapshots(t2_snpt.snptSelect)(i), arch_table(i)), Mux(VecInit(matchVec).asUInt.orR, wMatch, spec_table(i)) ) } diff --git a/src/main/scala/xiangshan/backend/rename/freelist/BaseFreeList.scala b/src/main/scala/xiangshan/backend/rename/freelist/BaseFreeList.scala index 2d1fc7129ec..130570386a4 100644 --- a/src/main/scala/xiangshan/backend/rename/freelist/BaseFreeList.scala +++ b/src/main/scala/xiangshan/backend/rename/freelist/BaseFreeList.scala @@ -57,8 +57,8 @@ abstract class BaseFreeList(size: Int, numLogicRegs:Int = 32)(implicit p: Parame } } - val lastCycleRedirect = GatedValidRegNext(io.redirect, false.B) - val lastCycleSnpt = RegNext(io.snpt, 0.U.asTypeOf(io.snpt)) + val lastCycleRedirect = RegNext(RegNext(io.redirect)) + val lastCycleSnpt = RegNext(RegNext(io.snpt, 0.U.asTypeOf(io.snpt))) val headPtr = RegInit(FreeListPtr(false, 0)) val headPtrOH = RegInit(1.U(size.W)) diff --git a/src/main/scala/xiangshan/backend/rob/Rob.scala b/src/main/scala/xiangshan/backend/rob/Rob.scala index 1615a7cf6f9..d15bc2f9943 100644 --- a/src/main/scala/xiangshan/backend/rob/Rob.scala +++ b/src/main/scala/xiangshan/backend/rob/Rob.scala @@ -347,7 +347,8 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP rab.io.snpt := io.snpt rab.io.snpt.snptEnq := snptEnq - io.rabCommits := rab.io.commits + // pipe rab commits for better timing and area + io.rabCommits := RegNext(rab.io.commits) io.diffCommits.foreach(_ := rab.io.diffCommits.get) /** From 3e64debd84a91a5c7868ef7f16440ccbb68b4a2b Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Fri, 29 Nov 2024 17:35:58 +0800 Subject: [PATCH 07/32] Revert "area(Backend): reduce VfScheduler iq num from 3 to 2 and remove a vfcvt fu" This reverts commit 10b44fa68ead2a8d79ce215b6bb116912f72f3a4. 
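The timing-oriented patches in this series (the registered robCommits in PATCH 06 above, the delayed writeback and wakeup copies such as intWriteBackDelayed and wakeupVecDelayed in PATCH 09, and the registered enqRob requests in PATCH 11) all lean on the same one-cycle retiming idiom: capture a bundle in a register so that consumers see it one cycle later, off the critical path. Below is a minimal sketch of that idiom in plain chisel3; the PipeOnce helper name is illustrative only and is not part of the XiangShan tree.

  import chisel3._
  import chisel3.util._

  // Delay a Valid bundle by exactly one cycle: valid is re-registered every
  // cycle, while the payload is captured only when valid is high (RegEnable),
  // mirroring the RegNext/RegEnable pairs used in these patches.
  object PipeOnce {
    def apply[T <: Data](in: Valid[T]): Valid[T] = {
      val out = Wire(chiselTypeOf(in))
      out.valid := RegNext(in.valid, false.B)
      out.bits  := RegEnable(in.bits, in.valid)
      out
    }
  }

Whoever consumes the delayed copy must tolerate the extra cycle of latency, which is why PATCH 11 recomputes canAccept and isEmpty around the registered ROB enqueue requests rather than using the ROB's own flags directly.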
--- src/main/scala/xiangshan/Parameters.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala index 5f6730137bc..19880f71c4b 100644 --- a/src/main/scala/xiangshan/Parameters.scala +++ b/src/main/scala/xiangshan/Parameters.scala @@ -452,13 +452,16 @@ case class XSCoreParameters implicit val schdType: SchedulerType = VfScheduler() SchdBlockParams(Seq( IssueBlockParams(Seq( - ExeUnitParams("VFEX0", Seq(VialuCfg, VfaluCfg, VfmaCfg, VipuCfg, VSetRvfWvfCfg), Seq(VfWB(port = 0, 0), V0WB(port = 0, 0), VlWB(port = vfSchdVlWbPort, 0), IntWB(port = 1, 1), FpWB(port = 0, 1)), Seq(Seq(VfRD(0, 0)), Seq(VfRD(1, 0)), Seq(VfRD(2, 0)), Seq(V0RD(0, 0)), Seq(VlRD(0, 0)))), - ExeUnitParams("VFEX1", Seq(VfcvtCfg, VimacCfg, VppuCfg), Seq(VfWB(port = 2, 1), V0WB(port = 0, 1)), Seq(Seq(VfRD(0, 1)), Seq(VfRD(1, 1)), Seq(VfRD(2, 1)), Seq(V0RD(0, 1)), Seq(VlRD(0, 1)))), - ), numEntries = 18, numEnq = 2, numComp = 14), + ExeUnitParams("VFEX0", Seq(VfmaCfg, VialuCfg, VimacCfg, VppuCfg), Seq(VfWB(port = 0, 0), V0WB(port = 0, 0)), Seq(Seq(VfRD(0, 0)), Seq(VfRD(1, 0)), Seq(VfRD(2, 0)), Seq(V0RD(0, 0)), Seq(VlRD(0, 0)))), + ExeUnitParams("VFEX1", Seq(VfaluCfg, VfcvtCfg, VipuCfg, VSetRvfWvfCfg), Seq(VfWB(port = 0, 1), V0WB(port = 0, 1), VlWB(port = vfSchdVlWbPort, 0), IntWB(port = 1, 1), FpWB(port = 0, 1)), Seq(Seq(VfRD(0, 1)), Seq(VfRD(1, 1)), Seq(VfRD(2, 1)), Seq(V0RD(0, 1)), Seq(VlRD(0, 1)))), + ), numEntries = 16, numEnq = 2, numComp = 14), + IssueBlockParams(Seq( + ExeUnitParams("VFEX2", Seq(VfmaCfg, VialuCfg), Seq(VfWB(port = 1, 0), V0WB(port = 1, 0)), Seq(Seq(VfRD(3, 0)), Seq(VfRD(4, 0)), Seq(VfRD(5, 0)), Seq(V0RD(1, 0)), Seq(VlRD(1, 0)))), + ExeUnitParams("VFEX3", Seq(VfaluCfg, VfcvtCfg), Seq(VfWB(port = 2, 1), V0WB(port = 2, 1), FpWB(port = 1, 1)), Seq(Seq(VfRD(3, 1)), Seq(VfRD(4, 1)), Seq(VfRD(5, 1)), Seq(V0RD(1, 1)), Seq(VlRD(1, 1)))), + ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( - ExeUnitParams("VFEX2", Seq(VialuCfg, VfaluCfg, VfmaCfg), Seq(VfWB(port = 1, 0), V0WB(port = 1, 0), FpWB(port = 1, 1)), Seq(Seq(VfRD(3, 0)), Seq(VfRD(4, 0)), Seq(VfRD(5, 0)), Seq(V0RD(1, 0)), Seq(VlRD(1, 0)))), - ExeUnitParams("VFEX3", Seq(VfdivCfg, VidivCfg), Seq(VfWB(port = 3, 1), V0WB(port = 2, 1)), Seq(Seq(VfRD(3, 1)), Seq(VfRD(4, 1)), Seq(VfRD(5, 1)), Seq(V0RD(1, 1)), Seq(VlRD(1, 1)))), - ), numEntries = 18, numEnq = 2, numComp = 14), + ExeUnitParams("VFEX4", Seq(VfdivCfg, VidivCfg), Seq(VfWB(port = 3, 1), V0WB(port = 3, 1)), Seq(Seq(VfRD(3, 2)), Seq(VfRD(4, 2)), Seq(VfRD(5, 2)), Seq(V0RD(1, 2)), Seq(VlRD(1, 2)))), + ), numEntries = 10, numEnq = 2, numComp = 8), ), numPregs = vfPreg.numEntries, numDeqOutside = 0, From 2e6fb1b355b873abe2ddcc9be3719150c90b0cdb Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Fri, 29 Nov 2024 17:38:09 +0800 Subject: [PATCH 08/32] area(backend): reduce a vfcvt for better area --- src/main/scala/xiangshan/Parameters.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala index 19880f71c4b..24fbcc0af91 100644 --- a/src/main/scala/xiangshan/Parameters.scala +++ b/src/main/scala/xiangshan/Parameters.scala @@ -457,7 +457,7 @@ case class XSCoreParameters ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("VFEX2", Seq(VfmaCfg, VialuCfg), Seq(VfWB(port = 1, 0), V0WB(port = 1, 0)), Seq(Seq(VfRD(3, 0)), Seq(VfRD(4, 
0)), Seq(VfRD(5, 0)), Seq(V0RD(1, 0)), Seq(VlRD(1, 0)))), - ExeUnitParams("VFEX3", Seq(VfaluCfg, VfcvtCfg), Seq(VfWB(port = 2, 1), V0WB(port = 2, 1), FpWB(port = 1, 1)), Seq(Seq(VfRD(3, 1)), Seq(VfRD(4, 1)), Seq(VfRD(5, 1)), Seq(V0RD(1, 1)), Seq(VlRD(1, 1)))), + ExeUnitParams("VFEX3", Seq(VfaluCfg), Seq(VfWB(port = 2, 1), V0WB(port = 2, 1), FpWB(port = 1, 1)), Seq(Seq(VfRD(3, 1)), Seq(VfRD(4, 1)), Seq(VfRD(5, 1)), Seq(V0RD(1, 1)), Seq(VlRD(1, 1)))), ), numEntries = 16, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("VFEX4", Seq(VfdivCfg, VidivCfg), Seq(VfWB(port = 3, 1), V0WB(port = 3, 1)), Seq(Seq(VfRD(3, 2)), Seq(VfRD(4, 2)), Seq(VfRD(5, 2)), Seq(V0RD(1, 2)), Seq(VlRD(1, 2)))), From e2bf2ed35066c6173d9f7f6e8232eebdf83c8e4b Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Mon, 2 Dec 2024 18:27:34 +0800 Subject: [PATCH 09/32] area(backend): only pipe wakeupFromIQ and wakeupFromWB once --- .../scala/xiangshan/backend/Backend.scala | 71 ++++++++++ .../xiangshan/backend/issue/Entries.scala | 6 +- .../xiangshan/backend/issue/IssueQueue.scala | 14 ++ .../xiangshan/backend/issue/Scheduler.scala | 130 +++++++++++------- 4 files changed, 171 insertions(+), 50 deletions(-) diff --git a/src/main/scala/xiangshan/backend/Backend.scala b/src/main/scala/xiangshan/backend/Backend.scala index 58efddecc61..5dec900522f 100644 --- a/src/main/scala/xiangshan/backend/Backend.scala +++ b/src/main/scala/xiangshan/backend/Backend.scala @@ -219,6 +219,18 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame memScheduler.io.toSchedulers.wakeupVec ).map(x => (x.bits.exuIdx, x)).toMap + private val iqWakeUpMappedBundleDelayed: Map[Int, ValidIO[IssueQueueIQWakeUpBundle]] = ( + intScheduler.io.toSchedulers.wakeupVec ++ + fpScheduler.io.toSchedulers.wakeupVec ++ + vfScheduler.io.toSchedulers.wakeupVec ++ + memScheduler.io.toSchedulers.wakeupVec + ).map{ case x => + val delayed = Wire(chiselTypeOf(x)) + // TODO: add clock gate use Wen, remove issuequeue wakeupToIQ logic Wen = Wen && valid + delayed := RegNext(x) + (x.bits.exuIdx, delayed) + }.toMap + println(s"[Backend] iq wake up keys: ${iqWakeUpMappedBundle.keys}") wbFuBusyTable.io.in.intSchdBusyTable := intScheduler.io.wbFuBusyTable @@ -300,6 +312,41 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame ctrlBlock.io.debugEnqLsq.iqAccept := ctrlBlock.io.toMem.lsqEnqIO.iqAccept ctrlBlock.io.fromVecExcpMod.busy := vecExcpMod.o.status.busy + val intWriteBackDelayed = Wire(chiselTypeOf(wbDataPath.io.toIntPreg)) + intWriteBackDelayed.zip(wbDataPath.io.toIntPreg).map{ case (sink, source) => + sink := DontCare + sink.wen := RegNext(source.wen) + sink.intWen := RegNext(source.intWen) + sink.addr := RegEnable(source.addr, source.wen) + } + val fpWriteBackDelayed = Wire(chiselTypeOf(wbDataPath.io.toFpPreg)) + fpWriteBackDelayed.zip(wbDataPath.io.toFpPreg).map { case (sink, source) => + sink := DontCare + sink.wen := RegNext(source.wen) + sink.fpWen := RegNext(source.fpWen) + sink.addr := RegEnable(source.addr, source.wen) + } + val vfWriteBackDelayed = Wire(chiselTypeOf(wbDataPath.io.toVfPreg)) + vfWriteBackDelayed.zip(wbDataPath.io.toVfPreg).map { case (sink, source) => + sink := DontCare + sink.wen := RegNext(source.wen) + sink.vecWen := RegNext(source.vecWen) + sink.addr := RegEnable(source.addr, source.wen) + } + val v0WriteBackDelayed = Wire(chiselTypeOf(wbDataPath.io.toV0Preg)) + v0WriteBackDelayed.zip(wbDataPath.io.toV0Preg).map { case (sink, source) => + sink := DontCare + 
sink.wen := RegNext(source.wen) + sink.v0Wen := RegNext(source.v0Wen) + sink.addr := RegEnable(source.addr, source.wen) + } + val vlWriteBackDelayed = Wire(chiselTypeOf(wbDataPath.io.toVlPreg)) + vlWriteBackDelayed.zip(wbDataPath.io.toVlPreg).map { case (sink, source) => + sink := DontCare + sink.wen := RegNext(source.wen) + sink.vlWen := RegNext(source.vlWen) + sink.addr := RegEnable(source.addr, source.wen) + } intScheduler.io.fromTop.hartId := io.fromTop.hartId intScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush intScheduler.io.fromDispatch.uops <> ctrlBlock.io.toIssueBlock.intUops @@ -308,8 +355,14 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame intScheduler.io.vfWriteBack := 0.U.asTypeOf(intScheduler.io.vfWriteBack) intScheduler.io.v0WriteBack := 0.U.asTypeOf(intScheduler.io.v0WriteBack) intScheduler.io.vlWriteBack := 0.U.asTypeOf(intScheduler.io.vlWriteBack) + intScheduler.io.intWriteBackDelayed := intWriteBackDelayed + intScheduler.io.fpWriteBackDelayed := 0.U.asTypeOf(intScheduler.io.fpWriteBackDelayed) + intScheduler.io.vfWriteBackDelayed := 0.U.asTypeOf(intScheduler.io.vfWriteBackDelayed) + intScheduler.io.v0WriteBackDelayed := 0.U.asTypeOf(intScheduler.io.v0WriteBackDelayed) + intScheduler.io.vlWriteBackDelayed := 0.U.asTypeOf(intScheduler.io.vlWriteBackDelayed) intScheduler.io.fromDataPath.resp := dataPath.io.toIntIQ intScheduler.io.fromSchedulers.wakeupVec.foreach { wakeup => wakeup := iqWakeUpMappedBundle(wakeup.bits.exuIdx) } + intScheduler.io.fromSchedulers.wakeupVecDelayed.foreach { wakeup => wakeup := iqWakeUpMappedBundleDelayed(wakeup.bits.exuIdx) } intScheduler.io.fromDataPath.og0Cancel := og0Cancel intScheduler.io.fromDataPath.og1Cancel := og1Cancel intScheduler.io.ldCancel := io.mem.ldCancel @@ -327,8 +380,14 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame fpScheduler.io.vfWriteBack := 0.U.asTypeOf(fpScheduler.io.vfWriteBack) fpScheduler.io.v0WriteBack := 0.U.asTypeOf(fpScheduler.io.v0WriteBack) fpScheduler.io.vlWriteBack := 0.U.asTypeOf(fpScheduler.io.vlWriteBack) + fpScheduler.io.intWriteBackDelayed := 0.U.asTypeOf(intWriteBackDelayed) + fpScheduler.io.fpWriteBackDelayed := fpWriteBackDelayed + fpScheduler.io.vfWriteBackDelayed := 0.U.asTypeOf(intScheduler.io.vfWriteBackDelayed) + fpScheduler.io.v0WriteBackDelayed := 0.U.asTypeOf(intScheduler.io.v0WriteBackDelayed) + fpScheduler.io.vlWriteBackDelayed := 0.U.asTypeOf(intScheduler.io.vlWriteBackDelayed) fpScheduler.io.fromDataPath.resp := dataPath.io.toFpIQ fpScheduler.io.fromSchedulers.wakeupVec.foreach { wakeup => wakeup := iqWakeUpMappedBundle(wakeup.bits.exuIdx) } + fpScheduler.io.fromSchedulers.wakeupVecDelayed.foreach { wakeup => wakeup := iqWakeUpMappedBundleDelayed(wakeup.bits.exuIdx) } fpScheduler.io.fromDataPath.og0Cancel := og0Cancel fpScheduler.io.fromDataPath.og1Cancel := og1Cancel fpScheduler.io.ldCancel := io.mem.ldCancel @@ -345,6 +404,11 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame memScheduler.io.vfWriteBack := wbDataPath.io.toVfPreg memScheduler.io.v0WriteBack := wbDataPath.io.toV0Preg memScheduler.io.vlWriteBack := wbDataPath.io.toVlPreg + memScheduler.io.intWriteBackDelayed := intWriteBackDelayed + memScheduler.io.fpWriteBackDelayed := fpWriteBackDelayed + memScheduler.io.vfWriteBackDelayed := vfWriteBackDelayed + memScheduler.io.v0WriteBackDelayed := v0WriteBackDelayed + memScheduler.io.vlWriteBackDelayed := vlWriteBackDelayed memScheduler.io.fromMem.get.scommit := 
io.mem.sqDeq memScheduler.io.fromMem.get.lcommit := io.mem.lqDeq memScheduler.io.fromMem.get.wakeup := io.mem.wakeup @@ -366,6 +430,7 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame memScheduler.io.fromMem.get.vstuFeedback := io.mem.vstuIqFeedback memScheduler.io.fromMem.get.vlduFeedback := io.mem.vlduIqFeedback memScheduler.io.fromSchedulers.wakeupVec.foreach { wakeup => wakeup := iqWakeUpMappedBundle(wakeup.bits.exuIdx) } + memScheduler.io.fromSchedulers.wakeupVecDelayed.foreach { wakeup => wakeup := iqWakeUpMappedBundleDelayed(wakeup.bits.exuIdx) } memScheduler.io.fromDataPath.og0Cancel := og0Cancel memScheduler.io.fromDataPath.og1Cancel := og1Cancel memScheduler.io.ldCancel := io.mem.ldCancel @@ -384,8 +449,14 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame vfScheduler.io.vfWriteBack := wbDataPath.io.toVfPreg vfScheduler.io.v0WriteBack := wbDataPath.io.toV0Preg vfScheduler.io.vlWriteBack := wbDataPath.io.toVlPreg + vfScheduler.io.intWriteBackDelayed := 0.U.asTypeOf(intWriteBackDelayed) + vfScheduler.io.fpWriteBackDelayed := 0.U.asTypeOf(fpWriteBackDelayed) + vfScheduler.io.vfWriteBackDelayed := vfWriteBackDelayed + vfScheduler.io.v0WriteBackDelayed := v0WriteBackDelayed + vfScheduler.io.vlWriteBackDelayed := vlWriteBackDelayed vfScheduler.io.fromDataPath.resp := dataPath.io.toVfIQ vfScheduler.io.fromSchedulers.wakeupVec.foreach { wakeup => wakeup := iqWakeUpMappedBundle(wakeup.bits.exuIdx) } + vfScheduler.io.fromSchedulers.wakeupVecDelayed.foreach { wakeup => wakeup := iqWakeUpMappedBundleDelayed(wakeup.bits.exuIdx) } vfScheduler.io.fromDataPath.og0Cancel := og0Cancel vfScheduler.io.fromDataPath.og1Cancel := og1Cancel vfScheduler.io.ldCancel := io.mem.ldCancel diff --git a/src/main/scala/xiangshan/backend/issue/Entries.scala b/src/main/scala/xiangshan/backend/issue/Entries.scala index c5284506702..5f04f0c1faa 100644 --- a/src/main/scala/xiangshan/backend/issue/Entries.scala +++ b/src/main/scala/xiangshan/backend/issue/Entries.scala @@ -135,8 +135,8 @@ class Entries(implicit p: Parameters, params: IssueBlockParams) extends XSModule enqEntry.io.commonIn.transSel := (if (params.isAllComp || params.isAllSimp) enqCanTrans2Others.get && othersTransSelVec.get(entryIdx).valid else enqCanTrans2Simp.get && simpTransSelVec.get(entryIdx).valid || enqCanTrans2Comp.get && compTransSelVec.get(entryIdx).valid) EntriesConnect(enqEntry.io.commonIn, enqEntry.io.commonOut, entryIdx) - enqEntry.io.enqDelayIn1.wakeUpFromWB := RegEnable(io.wakeUpFromWB, io.enq(entryIdx).valid) - enqEntry.io.enqDelayIn1.wakeUpFromIQ := RegEnable(io.wakeUpFromIQ, io.enq(entryIdx).valid) + enqEntry.io.enqDelayIn1.wakeUpFromWB := io.wakeUpFromWBDelayed + enqEntry.io.enqDelayIn1.wakeUpFromIQ := io.wakeUpFromIQDelayed enqEntry.io.enqDelayIn1.srcLoadDependency := RegEnable(VecInit(io.enq(entryIdx).bits.payload.srcLoadDependency.take(params.numRegSrc)), io.enq(entryIdx).valid) enqEntry.io.enqDelayIn1.og0Cancel := RegNext(io.og0Cancel) enqEntry.io.enqDelayIn1.ldCancel := RegNext(io.ldCancel) @@ -533,6 +533,8 @@ class EntriesIO(implicit p: Parameters, params: IssueBlockParams) extends XSBund // wakeup val wakeUpFromWB: MixedVec[ValidIO[IssueQueueWBWakeUpBundle]] = Flipped(params.genWBWakeUpSinkValidBundle) val wakeUpFromIQ: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(params.genIQWakeUpSinkValidBundle) + val wakeUpFromWBDelayed: MixedVec[ValidIO[IssueQueueWBWakeUpBundle]] = Flipped(params.genWBWakeUpSinkValidBundle) + val wakeUpFromIQDelayed: 
MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(params.genIQWakeUpSinkValidBundle) val vlFromIntIsZero = Input(Bool()) val vlFromIntIsVlmax = Input(Bool()) val vlFromVfIsZero = Input(Bool()) diff --git a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala b/src/main/scala/xiangshan/backend/issue/IssueQueue.scala index a0912165616..7d8c6b9bff1 100644 --- a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala +++ b/src/main/scala/xiangshan/backend/issue/IssueQueue.scala @@ -57,6 +57,8 @@ class IssueQueueIO()(implicit p: Parameters, params: IssueBlockParams) extends X val wbBusyTableWrite = Output(params.genWbFuBusyTableWriteBundle) val wakeupFromWB: MixedVec[ValidIO[IssueQueueWBWakeUpBundle]] = Flipped(params.genWBWakeUpSinkValidBundle) val wakeupFromIQ: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(params.genIQWakeUpSinkValidBundle) + val wakeupFromWBDelayed: MixedVec[ValidIO[IssueQueueWBWakeUpBundle]] = Flipped(params.genWBWakeUpSinkValidBundle) + val wakeupFromIQDelayed: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(params.genIQWakeUpSinkValidBundle) val vlFromIntIsZero = Input(Bool()) val vlFromIntIsVlmax = Input(Bool()) val vlFromVfIsZero = Input(Bool()) @@ -277,6 +279,16 @@ class IssueQueueImp(override val wrapper: IssueQueue)(implicit p: Parameters, va w := w_src } } + val wakeupFromIQDelayed = Wire(chiselTypeOf(io.wakeupFromIQDelayed)) + wakeupFromIQDelayed.zip(io.wakeupFromIQDelayed).foreach { case (w, w_src) => + if (!params.inVfSchd && params.readVfRf && params.hasWakeupFromVf && w_src.bits.params.isVfExeUnit) { + val noCancel = !LoadShouldCancel(Some(w_src.bits.loadDependency), io.ldCancel) + w := RegNext(Mux(noCancel, w_src, 0.U.asTypeOf(w))) + w.bits.loadDependency.zip(w_src.bits.loadDependency).foreach { case (ld, ld_src) => ld := RegNext(Mux(noCancel, ld_src << 1, 0.U.asTypeOf(ld))) } + } else { + w := w_src + } + } /** * Connection of [[entries]] @@ -362,6 +374,8 @@ class IssueQueueImp(override val wrapper: IssueQueue)(implicit p: Parameters, va } entriesIO.wakeUpFromWB := io.wakeupFromWB entriesIO.wakeUpFromIQ := wakeupFromIQ + entriesIO.wakeUpFromWBDelayed := io.wakeupFromWBDelayed + entriesIO.wakeUpFromIQDelayed := wakeupFromIQDelayed entriesIO.vlFromIntIsZero := io.vlFromIntIsZero entriesIO.vlFromIntIsVlmax := io.vlFromIntIsVlmax entriesIO.vlFromVfIsZero := io.vlFromVfIsZero diff --git a/src/main/scala/xiangshan/backend/issue/Scheduler.scala b/src/main/scala/xiangshan/backend/issue/Scheduler.scala index ec113ef9248..917d6072cd1 100644 --- a/src/main/scala/xiangshan/backend/issue/Scheduler.scala +++ b/src/main/scala/xiangshan/backend/issue/Scheduler.scala @@ -77,6 +77,16 @@ class SchedulerIO()(implicit params: SchdBlockParams, p: Parameters) extends XSB new RfWritePortWithConfig(backendParams.v0PregParams.dataCfg, backendParams.v0PregParams.addrWidth))) val vlWriteBack = MixedVec(Vec(backendParams.numPregWb(VlData()), new RfWritePortWithConfig(backendParams.vlPregParams.dataCfg, backendParams.vlPregParams.addrWidth))) + val intWriteBackDelayed = MixedVec(Vec(backendParams.numPregWb(IntData()), + new RfWritePortWithConfig(backendParams.intPregParams.dataCfg, backendParams.intPregParams.addrWidth))) + val fpWriteBackDelayed = MixedVec(Vec(backendParams.numPregWb(FpData()), + new RfWritePortWithConfig(backendParams.fpPregParams.dataCfg, backendParams.fpPregParams.addrWidth))) + val vfWriteBackDelayed = MixedVec(Vec(backendParams.numPregWb(VecData()), + new RfWritePortWithConfig(backendParams.vfPregParams.dataCfg, 
backendParams.vfPregParams.addrWidth))) + val v0WriteBackDelayed = MixedVec(Vec(backendParams.numPregWb(V0Data()), + new RfWritePortWithConfig(backendParams.v0PregParams.dataCfg, backendParams.v0PregParams.addrWidth))) + val vlWriteBackDelayed = MixedVec(Vec(backendParams.numPregWb(VlData()), + new RfWritePortWithConfig(backendParams.vlPregParams.dataCfg, backendParams.vlPregParams.addrWidth))) val toDataPathAfterDelay: MixedVec[MixedVec[DecoupledIO[IssueQueueIssueBundle]]] = MixedVec(params.issueBlockParams.map(_.genIssueDecoupledBundle)) val vlWriteBackInfo = new Bundle { @@ -88,6 +98,7 @@ class SchedulerIO()(implicit params: SchdBlockParams, p: Parameters) extends XSB val fromSchedulers = new Bundle { val wakeupVec: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(params.genIQWakeUpInValidBundle) + val wakeupVecDelayed: MixedVec[ValidIO[IssueQueueIQWakeUpBundle]] = Flipped(params.genIQWakeUpInValidBundle) } val toSchedulers = new Bundle { @@ -144,6 +155,8 @@ abstract class SchedulerImpBase(wrapper: Scheduler)(implicit params: SchdBlockPa // alias private val iqWakeUpInMap: Map[Int, ValidIO[IssueQueueIQWakeUpBundle]] = io.fromSchedulers.wakeupVec.map(x => (x.bits.exuIdx, x)).toMap + private val iqWakeUpInMapDelayed: Map[Int, ValidIO[IssueQueueIQWakeUpBundle]] = + io.fromSchedulers.wakeupVecDelayed.map(x => (x.bits.exuIdx, x)).toMap private val schdType = params.schdType // Modules @@ -155,57 +168,39 @@ abstract class SchedulerImpBase(wrapper: Scheduler)(implicit params: SchdBlockPa val wakeupFromVfWBVec = Wire(params.genVfWBWakeUpSinkValidBundle) val wakeupFromV0WBVec = Wire(params.genV0WBWakeUpSinkValidBundle) val wakeupFromVlWBVec = Wire(params.genVlWBWakeUpSinkValidBundle) - - wakeupFromIntWBVec.zip(io.intWriteBack).foreach { case (sink, source) => - sink.valid := source.wen - sink.bits.rfWen := source.intWen - sink.bits.fpWen := source.fpWen - sink.bits.vecWen := source.vecWen - sink.bits.v0Wen := source.v0Wen - sink.bits.vlWen := source.vlWen - sink.bits.pdest := source.addr - } - - wakeupFromFpWBVec.zip(io.fpWriteBack).foreach { case (sink, source) => - sink.valid := source.wen - sink.bits.rfWen := source.intWen - sink.bits.fpWen := source.fpWen - sink.bits.vecWen := source.vecWen - sink.bits.v0Wen := source.v0Wen - sink.bits.vlWen := source.vlWen - sink.bits.pdest := source.addr - } - - wakeupFromVfWBVec.zip(io.vfWriteBack).foreach { case (sink, source) => - sink.valid := source.wen - sink.bits.rfWen := source.intWen - sink.bits.fpWen := source.fpWen - sink.bits.vecWen := source.vecWen - sink.bits.v0Wen := source.v0Wen - sink.bits.vlWen := source.vlWen - sink.bits.pdest := source.addr - } - - wakeupFromV0WBVec.zip(io.v0WriteBack).foreach { case (sink, source) => - sink.valid := source.wen - sink.bits.rfWen := source.intWen - sink.bits.fpWen := source.fpWen - sink.bits.vecWen := source.vecWen - sink.bits.v0Wen := source.v0Wen - sink.bits.vlWen := source.vlWen - sink.bits.pdest := source.addr + val wakeupFromIntWBVecDelayed = Wire(params.genIntWBWakeUpSinkValidBundle) + val wakeupFromFpWBVecDelayed = Wire(params.genFpWBWakeUpSinkValidBundle) + val wakeupFromVfWBVecDelayed = Wire(params.genVfWBWakeUpSinkValidBundle) + val wakeupFromV0WBVecDelayed = Wire(params.genV0WBWakeUpSinkValidBundle) + val wakeupFromVlWBVecDelayed = Wire(params.genVlWBWakeUpSinkValidBundle) + + val wakeupFromWBVec = Seq(wakeupFromIntWBVec, wakeupFromFpWBVec, wakeupFromVfWBVec, wakeupFromV0WBVec, wakeupFromVlWBVec) + val allWriteBack = Seq(io.intWriteBack, io.fpWriteBack, io.vfWriteBack, io.v0WriteBack, 
io.vlWriteBack) + wakeupFromWBVec.zip(allWriteBack).map{ case (sinks, sources) => + sinks.zip(sources).map{ case (sink, source) => + sink.valid := source.wen + sink.bits.rfWen := source.intWen + sink.bits.fpWen := source.fpWen + sink.bits.vecWen := source.vecWen + sink.bits.v0Wen := source.v0Wen + sink.bits.vlWen := source.vlWen + sink.bits.pdest := source.addr + } } - wakeupFromVlWBVec.zip(io.vlWriteBack).foreach { case (sink, source) => - sink.valid := source.wen - sink.bits.rfWen := source.intWen - sink.bits.fpWen := source.fpWen - sink.bits.vecWen := source.vecWen - sink.bits.v0Wen := source.v0Wen - sink.bits.vlWen := source.vlWen - sink.bits.pdest := source.addr + val wakeupFromWBVecDelayed = Seq(wakeupFromIntWBVecDelayed, wakeupFromFpWBVecDelayed, wakeupFromVfWBVecDelayed, wakeupFromV0WBVecDelayed, wakeupFromVlWBVecDelayed) + val allWriteBackDelayed = Seq(io.intWriteBackDelayed, io.fpWriteBackDelayed, io.vfWriteBackDelayed, io.v0WriteBackDelayed, io.vlWriteBackDelayed) + wakeupFromWBVecDelayed.zip(allWriteBackDelayed).map { case (sinks, sources) => + sinks.zip(sources).map { case (sink, source) => + sink.valid := source.wen + sink.bits.rfWen := source.intWen + sink.bits.fpWen := source.fpWen + sink.bits.vecWen := source.vecWen + sink.bits.v0Wen := source.v0Wen + sink.bits.vlWen := source.vlWen + sink.bits.pdest := source.addr + } } - // Connect bundles having the same wakeup source issueQueues.zipWithIndex.foreach { case(iq, i) => iq.io.wakeupFromIQ.foreach { wakeUp => @@ -230,6 +225,15 @@ abstract class SchedulerImpBase(wrapper: Scheduler)(implicit params: SchdBlockPa if (iq.params.numV0Src == 0) wakeUp.bits.v0Wen := false.B if (iq.params.numVlSrc == 0) wakeUp.bits.vlWen := false.B } + iq.io.wakeupFromIQDelayed.foreach { wakeUp => + val wakeUpIn = iqWakeUpInMapDelayed(wakeUp.bits.exuIdx) + connectSamePort(wakeUp, wakeUpIn) + if (iq.params.numIntSrc == 0) wakeUp.bits.rfWen := false.B + if (iq.params.numFpSrc == 0) wakeUp.bits.fpWen := false.B + if (iq.params.numVfSrc == 0) wakeUp.bits.vecWen := false.B + if (iq.params.numV0Src == 0) wakeUp.bits.v0Wen := false.B + if (iq.params.numVlSrc == 0) wakeUp.bits.vlWen := false.B + } iq.io.og0Cancel := io.fromDataPath.og0Cancel iq.io.og1Cancel := io.fromDataPath.og1Cancel if (iq.params.needLoadDependency) @@ -349,7 +353,16 @@ class SchedulerArithImp(override val wrapper: Scheduler)(implicit params: SchdBl wakeupFromVlWBVec.zipWithIndex.filter(x => iq.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1)) case _ => null } + val intWBIQDelayed = params.schdType match { + case IntScheduler() => wakeupFromIntWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromIntWBPort.keys.toSeq.contains(x._2)).map(_._1) + case FpScheduler() => wakeupFromFpWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromFpWBPort.keys.toSeq.contains(x._2)).map(_._1) + case VfScheduler() => (wakeupFromVfWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromVfWBPort.keys.toSeq.contains(x._2)).map(_._1) ++ + wakeupFromV0WBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1) ++ + wakeupFromVlWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1)) + case _ => null + } iq.io.wakeupFromWB.zip(intWBIQ).foreach{ case (sink, source) => sink := source} + iq.io.wakeupFromWBDelayed.zip(intWBIQDelayed).foreach{ case (sink, source) => sink := source} } val perfEvents = basePerfEvents @@ -404,6 +417,13 @@ class SchedulerMemImp(override val wrapper: 
Scheduler)(implicit params: SchdBloc wakeupFromV0WBVec.zipWithIndex.filter(x => iq.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1) ++ wakeupFromVlWBVec.zipWithIndex.filter(x => iq.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1) ).foreach{ case (sink, source) => sink := source} + iq.io.wakeupFromWBDelayed.zip( + wakeupFromIntWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromIntWBPort.keys.toSeq.contains(x._2)).map(_._1) ++ + wakeupFromFpWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromFpWBPort.keys.toSeq.contains(x._2)).map(_._1) ++ + wakeupFromVfWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromVfWBPort.keys.toSeq.contains(x._2)).map(_._1) ++ + wakeupFromV0WBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1) ++ + wakeupFromVlWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1) + ).foreach { case (sink, source) => sink := source } } ldAddrIQs.zipWithIndex.foreach { @@ -461,6 +481,13 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc wakeupFromV0WBVec.zipWithIndex.filter(x => iq.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ wakeupFromVlWBVec.zipWithIndex.filter(x => iq.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ).foreach{ case (sink, source) => sink := source} + iq.io.wakeupFromWBDelayed.zip( + wakeupFromIntWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromIntWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromFpWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromFpWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromVfWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromVfWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromV0WBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromVlWBVecDelayed.zipWithIndex.filter(x => iq.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq + ).foreach { case (sink, source) => sink := source } // here disable fp load fast wakeup to std, and no FEX wakeup to std iq.io.wakeupFromIQ.map(_.bits.fpWen := false.B) } @@ -500,6 +527,13 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc wakeupFromV0WBVec.zipWithIndex.filter(x => imp.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ wakeupFromVlWBVec.zipWithIndex.filter(x => imp.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ).foreach{ case (sink, source) => sink := source} + imp.io.wakeupFromWBDelayed.zip( + wakeupFromIntWBVecDelayed.zipWithIndex.filter(x => imp.params.needWakeupFromIntWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromFpWBVecDelayed.zipWithIndex.filter(x => imp.params.needWakeupFromFpWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromVfWBVecDelayed.zipWithIndex.filter(x => imp.params.needWakeupFromVfWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromV0WBVecDelayed.zipWithIndex.filter(x => imp.params.needWakeupFromV0WBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq ++ + wakeupFromVlWBVecDelayed.zipWithIndex.filter(x => imp.params.needWakeupFromVlWBPort.keys.toSeq.contains(x._2)).map(_._1).toSeq + ).foreach { case (sink, source) => sink := source } case _ => } From 1498053213f9da11f9ee4c54527e6a54e67c5a93 Mon Sep 17 00:00:00 2001 From: 
xiaofeibao <1441675923@qq.com> Date: Mon, 2 Dec 2024 18:37:47 +0800 Subject: [PATCH 10/32] area(backend): only use startAddr in pcMem --- .../scala/xiangshan/backend/Backend.scala | 2 +- .../scala/xiangshan/backend/Bundles.scala | 4 +++- .../scala/xiangshan/backend/CtrlBlock.scala | 20 ++++++++++--------- .../backend/datapath/BypassNetwork.scala | 17 +++++++++++++++- .../scala/xiangshan/backend/exu/ExeUnit.scala | 1 + .../scala/xiangshan/backend/fu/FuncUnit.scala | 1 + .../scala/xiangshan/backend/fu/Jump.scala | 13 +++++------- .../backend/fu/wrapper/BranchUnit.scala | 15 +++++++++----- .../backend/fu/wrapper/JumpUnit.scala | 5 +++-- .../xiangshan/backend/issue/IssueQueue.scala | 1 + 10 files changed, 52 insertions(+), 27 deletions(-) diff --git a/src/main/scala/xiangshan/backend/Backend.scala b/src/main/scala/xiangshan/backend/Backend.scala index 5dec900522f..65572fa29d9 100644 --- a/src/main/scala/xiangshan/backend/Backend.scala +++ b/src/main/scala/xiangshan/backend/Backend.scala @@ -789,7 +789,7 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame sink.bits.uop.v0Wen := source.bits.v0Wen.getOrElse(false.B) sink.bits.uop.vlWen := source.bits.vlWen.getOrElse(false.B) sink.bits.uop.flushPipe := source.bits.flushPipe.getOrElse(false.B) - sink.bits.uop.pc := source.bits.pc.getOrElse(0.U) + sink.bits.uop.pc := source.bits.pc.getOrElse(0.U) + (source.bits.ftqOffset.getOrElse(0.U) << instOffsetBits) sink.bits.uop.loadWaitBit := Mux(enableMdp, source.bits.loadWaitBit.getOrElse(false.B), false.B) sink.bits.uop.waitForRobIdx := Mux(enableMdp, source.bits.waitForRobIdx.getOrElse(0.U.asTypeOf(new RobPtr)), 0.U.asTypeOf(new RobPtr)) sink.bits.uop.storeSetHit := Mux(enableMdp, source.bits.storeSetHit.getOrElse(false.B), false.B) diff --git a/src/main/scala/xiangshan/backend/Bundles.scala b/src/main/scala/xiangshan/backend/Bundles.scala index 4a738d65f44..5def275061a 100644 --- a/src/main/scala/xiangshan/backend/Bundles.scala +++ b/src/main/scala/xiangshan/backend/Bundles.scala @@ -596,7 +596,8 @@ object Bundles { val fuType = FuType() val fuOpType = FuOpType() val src = Vec(params.numRegSrc, UInt(params.srcDataBitsMax.W)) - val imm = UInt(32.W) + val imm = UInt(64.W) + val nextPcOffset = OptionWrapper(params.hasBrhFu, UInt((log2Up(PredictWidth) + 1).W)) val robIdx = new RobPtr val iqIdx = UInt(log2Up(MemIQSizeMax).W)// Only used by store yet val isFirstIssue = Bool() // Only used by store yet @@ -666,6 +667,7 @@ object Bundles { this.flushPipe .foreach(_ := source.common.flushPipe.get) this.pc .foreach(_ := source.common.pc.get) this.preDecode .foreach(_ := source.common.preDecode.get) + this.nextPcOffset .foreach(_ := source.common.nextPcOffset.get) this.ftqIdx .foreach(_ := source.common.ftqIdx.get) this.ftqOffset .foreach(_ := source.common.ftqOffset.get) this.predictInfo .foreach(_ := source.common.predictInfo.get) diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index dc817977d43..f72820f4e0a 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -110,7 +110,7 @@ class CtrlBlockImp( pcMem.io.ren.get(pcMemRdIndexes("robFlush").head) := s0_robFlushRedirect.valid pcMem.io.raddr(pcMemRdIndexes("robFlush").head) := s0_robFlushRedirect.bits.ftqIdx.value - private val s1_robFlushPc = pcMem.io.rdata(pcMemRdIndexes("robFlush").head).getPc(RegEnable(s0_robFlushRedirect.bits.ftqOffset, s0_robFlushRedirect.valid)) + private val s1_robFlushPc = 
pcMem.io.rdata(pcMemRdIndexes("robFlush").head).startAddr + (RegEnable(s0_robFlushRedirect.bits.ftqOffset, s0_robFlushRedirect.valid) << instOffsetBits) private val s3_redirectGen = redirectGen.io.stage2Redirect private val s1_s3_redirect = Mux(s1_robFlushRedirect.valid, s1_robFlushRedirect, s3_redirectGen) private val s2_s4_pendingRedirectValid = RegInit(false.B) @@ -213,7 +213,7 @@ class CtrlBlockImp( pcMem.io.raddr(pcMemRdIndexes("redirect").head) := memViolation.bits.ftqIdx.value pcMem.io.ren.get(pcMemRdIndexes("memPred").head) := memViolation.valid pcMem.io.raddr(pcMemRdIndexes("memPred").head) := memViolation.bits.stFtqIdx.value - redirectGen.io.memPredPcRead.data := pcMem.io.rdata(pcMemRdIndexes("memPred").head).getPc(RegEnable(memViolation.bits.stFtqOffset, memViolation.valid)) + redirectGen.io.memPredPcRead.data := pcMem.io.rdata(pcMemRdIndexes("memPred").head).startAddr + (RegEnable(memViolation.bits.stFtqOffset, memViolation.valid) << instOffsetBits) for ((pcMemIdx, i) <- pcMemRdIndexes("bjuPc").zipWithIndex) { val ren = io.toDataPath.pcToDataPathIO.fromDataPathValid(i) @@ -221,7 +221,7 @@ class CtrlBlockImp( val roffset = io.toDataPath.pcToDataPathIO.fromDataPathFtqOffset(i) pcMem.io.ren.get(pcMemIdx) := ren pcMem.io.raddr(pcMemIdx) := raddr - io.toDataPath.pcToDataPathIO.toDataPathPC(i) := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(roffset, ren)) + io.toDataPath.pcToDataPathIO.toDataPathPC(i) := pcMem.io.rdata(pcMemIdx).startAddr } for ((pcMemIdx, i) <- pcMemRdIndexes("bjuTarget").zipWithIndex) { @@ -240,21 +240,21 @@ class CtrlBlockImp( val roffset = io.toDataPath.pcToDataPathIO.fromDataPathFtqOffset(baseIdx+i) pcMem.io.ren.get(pcMemIdx) := ren pcMem.io.raddr(pcMemIdx) := raddr - io.toDataPath.pcToDataPathIO.toDataPathPC(baseIdx+i) := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(roffset, ren)) + io.toDataPath.pcToDataPathIO.toDataPathPC(baseIdx+i) := pcMem.io.rdata(pcMemIdx).startAddr } for ((pcMemIdx, i) <- pcMemRdIndexes("hybrid").zipWithIndex) { // load read pcMem (s0) -> get rdata (s1) -> reg next in Memblock (s2) -> reg next in Memblock (s3) -> consumed by pf (s3) pcMem.io.ren.get(pcMemIdx) := io.memHyPcRead(i).valid pcMem.io.raddr(pcMemIdx) := io.memHyPcRead(i).ptr.value - io.memHyPcRead(i).data := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(io.memHyPcRead(i).offset, io.memHyPcRead(i).valid)) + io.memHyPcRead(i).data := pcMem.io.rdata(pcMemIdx).startAddr + (RegEnable(io.memHyPcRead(i).offset, io.memHyPcRead(i).valid) << instOffsetBits) } if (EnableStorePrefetchSMS) { for ((pcMemIdx, i) <- pcMemRdIndexes("store").zipWithIndex) { pcMem.io.ren.get(pcMemIdx) := io.memStPcRead(i).valid pcMem.io.raddr(pcMemIdx) := io.memStPcRead(i).ptr.value - io.memStPcRead(i).data := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(io.memStPcRead(i).offset, io.memStPcRead(i).valid)) + io.memStPcRead(i).data := pcMem.io.rdata(pcMemIdx).startAddr + (RegEnable(io.memStPcRead(i).offset, io.memStPcRead(i).valid) << instOffsetBits) } } else { io.memStPcRead.foreach(_.data := 0.U) @@ -302,10 +302,12 @@ class CtrlBlockImp( redirectGen.io.oldestExuOutPredecode.valid := GatedValidRegNext(oldestExuPredecode.valid) redirectGen.io.oldestExuOutPredecode := RegEnable(oldestExuPredecode, oldestExuPredecode.valid) redirectGen.io.loadReplay <> loadReplay - val loadRedirectPcRead = pcMem.io.rdata(pcMemRdIndexes("redirect").head).getPc(RegEnable(memViolation.bits.ftqOffset, memViolation.valid)) + val loadRedirectOffset = Mux(memViolation.bits.flushItself(), 0.U, Mux(memViolation.bits.isRVC, 2.U, 4.U)) + val 
loadRedirectPcFtqOffset = RegEnable((memViolation.bits.ftqOffset << instOffsetBits).asUInt +& loadRedirectOffset, memViolation.valid) + val loadRedirectPcRead = pcMem.io.rdata(pcMemRdIndexes("redirect").head).startAddr + loadRedirectPcFtqOffset + redirectGen.io.loadReplay.bits.cfiUpdate.pc := loadRedirectPcRead - val load_pc_offset = Mux(loadReplay.bits.flushItself(), 0.U, Mux(loadReplay.bits.isRVC, 2.U, 4.U)) - val load_target = loadRedirectPcRead + load_pc_offset + val load_target = loadRedirectPcRead redirectGen.io.loadReplay.bits.cfiUpdate.target := load_target redirectGen.io.robFlush := s1_robFlushRedirect diff --git a/src/main/scala/xiangshan/backend/datapath/BypassNetwork.scala b/src/main/scala/xiangshan/backend/datapath/BypassNetwork.scala index ae6e8055e8a..9e157c23371 100644 --- a/src/main/scala/xiangshan/backend/datapath/BypassNetwork.scala +++ b/src/main/scala/xiangshan/backend/datapath/BypassNetwork.scala @@ -4,13 +4,14 @@ import org.chipsalliance.cde.config.Parameters import chisel3._ import chisel3.util._ import utility.{GatedValidRegNext, SignExt, ZeroExt} -import xiangshan.{XSBundle, XSModule} +import xiangshan.{JumpOpType, SelImm, XSBundle, XSModule} import xiangshan.backend.BackendParams import xiangshan.backend.Bundles.{ExuBypassBundle, ExuInput, ExuOutput, ExuVec, ImmInfo} import xiangshan.backend.issue.{FpScheduler, ImmExtractor, IntScheduler, MemScheduler, VfScheduler} import xiangshan.backend.datapath.DataConfig.RegDataMaxWidth import xiangshan.backend.decode.ImmUnion import xiangshan.backend.regcache._ +import xiangshan.backend.fu.FuType class BypassNetworkIO()(implicit p: Parameters, params: BackendParams) extends XSBundle { // params @@ -169,6 +170,20 @@ class BypassNetwork()(implicit p: Parameters, params: BackendParams) extends XSM ) ) } + if (exuInput.bits.params.hasBrhFu) { + val immWidth = exuInput.bits.params.immType.map(x => SelImm.getImmUnion(x).len).max + val nextPcOffset = exuInput.bits.ftqOffset.get +& Mux(exuInput.bits.preDecode.get.isRVC, 1.U, 2.U) + val imm = ImmExtractor( + immInfo(exuIdx).imm, + immInfo(exuIdx).immType, + exuInput.bits.params.destDataBitsMax, + exuInput.bits.params.immType.map(_.litValue) + ) + val isJALR = FuType.isJump(exuInput.bits.fuType) && JumpOpType.jumpOpisJalr(exuInput.bits.fuOpType) + val immBJU = imm + Mux(isJALR, 0.U, (exuInput.bits.ftqOffset.getOrElse(0.U) << instOffsetBits).asUInt) + exuInput.bits.imm := immBJU + exuInput.bits.nextPcOffset.get := nextPcOffset + } } // to reg cache diff --git a/src/main/scala/xiangshan/backend/exu/ExeUnit.scala b/src/main/scala/xiangshan/backend/exu/ExeUnit.scala index 50fe87e485b..48fd073fe0e 100644 --- a/src/main/scala/xiangshan/backend/exu/ExeUnit.scala +++ b/src/main/scala/xiangshan/backend/exu/ExeUnit.scala @@ -233,6 +233,7 @@ class ExeUnitImp( sink.bits.data.src.zip(source.bits.src).foreach { case(fuSrc, exuSrc) => fuSrc := exuSrc } sink.bits.data.pc .foreach(x => x := source.bits.pc.get) + sink.bits.data.nextPcOffset.foreach(x => x := source.bits.nextPcOffset.get) sink.bits.data.imm := source.bits.imm sink.bits.ctrl.fuOpType := source.bits.fuOpType sink.bits.ctrl.robIdx := source.bits.robIdx diff --git a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala index 6a133f04b66..186083cd7c4 100644 --- a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala @@ -56,6 +56,7 @@ class FuncUnitDataInput(cfg: FuConfig)(implicit p: Parameters) extends XSBundle val src = 
MixedVec(cfg.genSrcDataVec) val imm = UInt(cfg.destDataBits.W) val pc = OptionWrapper(cfg.needPc, UInt(VAddrData().dataWidth.W)) + val nextPcOffset = OptionWrapper(cfg.needPc, UInt((log2Up(PredictWidth) + 1).W)) def getSrcVConfig : UInt = src(cfg.vconfigIdx) def getSrcMask : UInt = src(cfg.maskSrcIdx) diff --git a/src/main/scala/xiangshan/backend/fu/Jump.scala b/src/main/scala/xiangshan/backend/fu/Jump.scala index 61f3a4bb027..c57e9a03182 100644 --- a/src/main/scala/xiangshan/backend/fu/Jump.scala +++ b/src/main/scala/xiangshan/backend/fu/Jump.scala @@ -35,23 +35,20 @@ class JumpDataModule(implicit p: Parameters) extends XSModule { val io = IO(new Bundle() { val src = Input(UInt(XLEN.W)) val pc = Input(UInt(XLEN.W)) // sign-ext to XLEN - val immMin = Input(UInt(ImmUnion.maxLen.W)) + val imm = Input(UInt(33.W)) // imm-U need 32 bits, highest bit is sign bit + val nextPcOffset = Input(UInt((log2Up(PredictWidth) + 1).W)) val func = Input(FuOpType()) val isRVC = Input(Bool()) val result, target = Output(UInt(XLEN.W)) val isAuipc = Output(Bool()) }) - val (src1, pc, immMin, func, isRVC) = (io.src, io.pc, io.immMin, io.func, io.isRVC) + val (src1, pc, imm, func, isRVC) = (io.src, io.pc, io.imm, io.func, io.isRVC) val isJalr = JumpOpType.jumpOpisJalr(func) val isAuipc = JumpOpType.jumpOpisAuipc(func) - val offset = SignExt(ParallelMux(Seq( - isJalr -> ImmUnion.I.toImm32(immMin), - isAuipc -> ImmUnion.U.toImm32(immMin), - !(isJalr || isAuipc) -> ImmUnion.J.toImm32(immMin) - )), XLEN) + val offset = SignExt(imm, XLEN) - val snpc = pc + Mux(isRVC, 2.U, 4.U) + val snpc = pc + (io.nextPcOffset << instOffsetBits).asUInt val target = Mux(JumpOpType.jumpOpisJalr(func), src1, pc) + offset // NOTE: src1 is (pc/rf(rs1)), src2 is (offset) // RISC-V spec for JALR: diff --git a/src/main/scala/xiangshan/backend/fu/wrapper/BranchUnit.scala b/src/main/scala/xiangshan/backend/fu/wrapper/BranchUnit.scala index ad6f4153dfd..d11c67e1070 100644 --- a/src/main/scala/xiangshan/backend/fu/wrapper/BranchUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/wrapper/BranchUnit.scala @@ -2,24 +2,28 @@ package xiangshan.backend.fu.wrapper import org.chipsalliance.cde.config.Parameters import chisel3._ +import chisel3.util.log2Up import utility.SignExt import xiangshan.backend.decode.ImmUnion import xiangshan.backend.fu.{BranchModule, FuConfig, FuncUnit} import xiangshan.backend.datapath.DataConfig.VAddrData -import xiangshan.{RedirectLevel, XSModule} +import xiangshan.{RedirectLevel, SelImm, XSModule} class AddrAddModule(implicit p: Parameters) extends XSModule { val io = IO(new Bundle { val pc = Input(UInt(VAddrBits.W)) val taken = Input(Bool()) val isRVC = Input(Bool()) - val offset = Input(UInt(12.W)) // branch inst only support 12 bits immediate num + val imm = Input(UInt(32.W)) // branch inst only support 12 bits immediate num val target = Output(UInt(XLEN.W)) + val nextPcOffset = Input(UInt((log2Up(PredictWidth) + 1).W)) }) val pcExtend = SignExt(io.pc, VAddrBits + 1) + val immMinWidth = FuConfig.BrhCfg.immType.map(x => SelImm.getImmUnion(x).len).max + print(s"[Branch]: immMinWidth = $immMinWidth\n") io.target := SignExt(Mux(io.taken, - pcExtend + SignExt(ImmUnion.B.toImm32(io.offset), VAddrBits + 1), - pcExtend + Mux(io.isRVC, 2.U, 4.U) + pcExtend + SignExt(io.imm(immMinWidth + 2, 0), VAddrBits + 1), + pcExtend + (io.nextPcOffset << instOffsetBits).asUInt ), XLEN) } @@ -32,9 +36,10 @@ class BranchUnit(cfg: FuConfig)(implicit p: Parameters) extends FuncUnit(cfg) { dataModule.io.pred_taken := 
io.in.bits.ctrl.predictInfo.get.taken addModule.io.pc := io.in.bits.data.pc.get // pc - addModule.io.offset := io.in.bits.data.imm // imm + addModule.io.imm := io.in.bits.data.imm // imm addModule.io.taken := dataModule.io.taken addModule.io.isRVC := io.in.bits.ctrl.preDecode.get.isRVC + addModule.io.nextPcOffset := io.in.bits.data.nextPcOffset.get io.out.valid := io.in.valid io.in.ready := io.out.ready diff --git a/src/main/scala/xiangshan/backend/fu/wrapper/JumpUnit.scala b/src/main/scala/xiangshan/backend/fu/wrapper/JumpUnit.scala index b0bc6ee7f96..c0c92731b90 100644 --- a/src/main/scala/xiangshan/backend/fu/wrapper/JumpUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/wrapper/JumpUnit.scala @@ -16,13 +16,14 @@ class JumpUnit(cfg: FuConfig)(implicit p: Parameters) extends PipedFuncUnit(cfg) // associated with AddrData's position of JmpCfg.srcData private val src = io.in.bits.data.src(0) private val pc = SignExt(io.in.bits.data.pc.get, cfg.destDataBits) - private val immMin = io.in.bits.data.imm + private val imm = io.in.bits.data.imm private val func = io.in.bits.ctrl.fuOpType private val isRVC = io.in.bits.ctrl.preDecode.get.isRVC jumpDataModule.io.src := src jumpDataModule.io.pc := pc - jumpDataModule.io.immMin := immMin + jumpDataModule.io.imm := imm + jumpDataModule.io.nextPcOffset := io.in.bits.data.nextPcOffset.get jumpDataModule.io.func := func jumpDataModule.io.isRVC := isRVC diff --git a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala b/src/main/scala/xiangshan/backend/issue/IssueQueue.scala index 7d8c6b9bff1..fcbc070a0e5 100644 --- a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala +++ b/src/main/scala/xiangshan/backend/issue/IssueQueue.scala @@ -788,6 +788,7 @@ class IssueQueueImp(override val wrapper: IssueQueue)(implicit p: Parameters, va } deq.bits.immType := deqEntryVec(i).bits.payload.selImm deq.bits.common.imm := deqEntryVec(i).bits.imm.getOrElse(0.U) + deq.bits.common.nextPcOffset.foreach(_ := 0.U) deq.bits.rcIdx.foreach(_ := deqEntryVec(i).bits.status.srcStatus.map(_.regCacheIdx.get)) deq.bits.common.perfDebugInfo := deqEntryVec(i).bits.payload.debugInfo From 13ac06807a5b757739f4f6ff9747325bb2744cdc Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Wed, 4 Dec 2024 14:18:39 +0800 Subject: [PATCH 11/32] timing(rob): enqRob pipe for better timing --- .../scala/xiangshan/backend/CtrlBlock.scala | 17 ++++++++++++++++- src/main/scala/xiangshan/backend/rob/Rob.scala | 5 ++++- .../xiangshan/backend/rob/RobBundles.scala | 1 + 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index f72820f4e0a..31dcfda3bc2 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -603,7 +603,22 @@ class CtrlBlockImp( PipeGroupConnect(renameOut, dispatch.io.fromRename, s1_s3_redirect.valid, dispatch.io.toRenameAllFire, "renamePipeDispatch") dispatch.io.redirect := s1_s3_redirect - dispatch.io.enqRob <> rob.io.enq + val enqRob = Wire(chiselTypeOf(rob.io.enq)) + enqRob.canAccept := rob.io.enq.canAccept + enqRob.canAcceptForDispatch := rob.io.enq.canAcceptForDispatch + enqRob.isEmpty := rob.io.enq.isEmpty + enqRob.resp := rob.io.enq.resp + enqRob.needAlloc := RegNext(dispatch.io.enqRob.needAlloc) + enqRob.req.zip(dispatch.io.enqRob.req).map { case (sink, source) => + sink.valid := RegNext(source.valid && !rob.io.redirect.valid) + sink.bits := RegEnable(source.bits, source.valid) + } + 
dispatch.io.enqRob.canAccept := enqRob.canAcceptForDispatch && !enqRob.req.map(x => x.valid && x.bits.blockBackward && enqRob.canAccept).reduce(_ || _) + dispatch.io.enqRob.canAcceptForDispatch := enqRob.canAcceptForDispatch + dispatch.io.enqRob.isEmpty := enqRob.isEmpty && !enqRob.req.map(_.valid).reduce(_ || _) + dispatch.io.enqRob.resp := enqRob.resp + rob.io.enq.needAlloc := enqRob.needAlloc + rob.io.enq.req := enqRob.req dispatch.io.robHead := rob.io.debugRobHead dispatch.io.stallReason <> rename.io.stallReason.out dispatch.io.lqCanAccept := io.lqCanAccept diff --git a/src/main/scala/xiangshan/backend/rob/Rob.scala b/src/main/scala/xiangshan/backend/rob/Rob.scala index d15bc2f9943..189fe15c860 100644 --- a/src/main/scala/xiangshan/backend/rob/Rob.scala +++ b/src/main/scala/xiangshan/backend/rob/Rob.scala @@ -159,6 +159,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val walkPtrTrue = Reg(new RobPtr) val lastWalkPtr = Reg(new RobPtr) val allowEnqueue = RegInit(true.B) + val allowEnqueueForDispatch = RegInit(true.B) val vecExcpInfo = RegInit(ValidIO(new VecExcpInfo).Lit( _.valid -> false.B, )) @@ -175,6 +176,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val walkPtr = walkPtrVec(0) val allocatePtrVec = VecInit((0 until RenameWidth).map(i => enqPtrVec(PopCount(io.enq.req.take(i).map(req => req.valid && req.bits.firstUop))))) io.enq.canAccept := allowEnqueue && !hasBlockBackward && rab.io.canEnq && vtypeBuffer.io.canEnq && !io.fromVecExcpMod.busy + io.enq.canAcceptForDispatch := allowEnqueueForDispatch && !hasBlockBackward && rab.io.canEnq && vtypeBuffer.io.canEnq && !io.fromVecExcpMod.busy io.enq.resp := allocatePtrVec val canEnqueue = VecInit(io.enq.req.map(req => req.valid && req.bits.firstUop && io.enq.canAccept)) val timer = GTimer() @@ -896,7 +898,8 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val numValidEntries = distanceBetween(enqPtr, deqPtr) val commitCnt = PopCount(io.commits.commitValid) - allowEnqueue := numValidEntries + dispatchNum <= (RobSize - CommitWidth).U + allowEnqueue := numValidEntries + dispatchNum <= (RobSize - RenameWidth).U + allowEnqueueForDispatch := numValidEntries + dispatchNum <= (RobSize - 2 * RenameWidth).U val redirectWalkDistance = distanceBetween(io.redirect.bits.robIdx, deqPtrVec_next(0)) when(io.redirect.valid) { diff --git a/src/main/scala/xiangshan/backend/rob/RobBundles.scala b/src/main/scala/xiangshan/backend/rob/RobBundles.scala index 410832fbf8c..8e55bd8b6a0 100644 --- a/src/main/scala/xiangshan/backend/rob/RobBundles.scala +++ b/src/main/scala/xiangshan/backend/rob/RobBundles.scala @@ -247,6 +247,7 @@ class RobLsqIO(implicit p: Parameters) extends XSBundle { class RobEnqIO(implicit p: Parameters) extends XSBundle { val canAccept = Output(Bool()) + val canAcceptForDispatch = Output(Bool()) val isEmpty = Output(Bool()) // valid vector, for robIdx gen and walk val needAlloc = Vec(RenameWidth, Input(Bool())) From 5878755023b17d9494c63cc1595c0d99d2204ae5 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Wed, 4 Dec 2024 15:37:00 +0800 Subject: [PATCH 12/32] timing(jumpUnit): fix target timing --- src/main/scala/xiangshan/backend/fu/Jump.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/fu/Jump.scala b/src/main/scala/xiangshan/backend/fu/Jump.scala index c57e9a03182..227951229b7 100644 --- a/src/main/scala/xiangshan/backend/fu/Jump.scala +++ 
b/src/main/scala/xiangshan/backend/fu/Jump.scala @@ -49,7 +49,7 @@ class JumpDataModule(implicit p: Parameters) extends XSModule { val offset = SignExt(imm, XLEN) val snpc = pc + (io.nextPcOffset << instOffsetBits).asUInt - val target = Mux(JumpOpType.jumpOpisJalr(func), src1, pc) + offset // NOTE: src1 is (pc/rf(rs1)), src2 is (offset) + val target = Mux(JumpOpType.jumpOpisJalr(func), src1 + offset, pc + offset) // NOTE: src1 is (pc/rf(rs1)), src2 is (offset) // RISC-V spec for JALR: // The target address is obtained by adding the sign-extended 12-bit I-immediate to the register rs1, From c938f7d881c0c9778423fc20f2bce348d63b2e86 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Thu, 5 Dec 2024 13:03:14 +0800 Subject: [PATCH 13/32] timing(redirectGen): fix timing of addr trans type exception --- src/main/scala/xiangshan/backend/Backend.scala | 1 + src/main/scala/xiangshan/backend/CtrlBlock.scala | 6 ++++++ .../backend/ctrlblock/RedirectGenerator.scala | 12 +++++++++--- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/main/scala/xiangshan/backend/Backend.scala b/src/main/scala/xiangshan/backend/Backend.scala index 65572fa29d9..bea489e5ac3 100644 --- a/src/main/scala/xiangshan/backend/Backend.scala +++ b/src/main/scala/xiangshan/backend/Backend.scala @@ -256,6 +256,7 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame ctrlBlock.io.frontend <> io.frontend ctrlBlock.io.fromCSR.toDecode := intExuBlock.io.csrToDecode.get ctrlBlock.io.fromCSR.traceCSR := intExuBlock.io.csrio.get.traceCSR + ctrlBlock.io.fromCSR.instrAddrTransType := RegNext(intExuBlock.io.csrio.get.instrAddrTransType) ctrlBlock.io.fromWB.wbData <> wbDataPath.io.toCtrlBlock.writeback ctrlBlock.io.fromMem.stIn <> io.mem.stIn ctrlBlock.io.fromMem.violation <> io.mem.memoryViolation diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 31dcfda3bc2..d086decc772 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -196,6 +196,10 @@ class CtrlBlockImp( out.bits := x.bits.redirect.get.bits out.bits.debugIsCtrl := true.B out.bits.debugIsMemVio := false.B + // for fix timing, next cycle assgin + out.bits.cfiUpdate.backendIAF := false.B + out.bits.cfiUpdate.backendIPF := false.B + out.bits.cfiUpdate.backendIGPF := false.B out }).toSeq private val oldestOneHot = Redirect.selectOldestRedirect(exuRedirects) @@ -299,6 +303,7 @@ class CtrlBlockImp( redirectGen.io.hartId := io.fromTop.hartId redirectGen.io.oldestExuRedirect.valid := GatedValidRegNext(oldestExuRedirect.valid) redirectGen.io.oldestExuRedirect.bits := RegEnable(oldestExuRedirect.bits, oldestExuRedirect.valid) + redirectGen.io.instrAddrTransType := RegNext(io.fromCSR.instrAddrTransType) redirectGen.io.oldestExuOutPredecode.valid := GatedValidRegNext(oldestExuPredecode.valid) redirectGen.io.oldestExuOutPredecode := RegEnable(oldestExuPredecode, oldestExuPredecode.valid) redirectGen.io.loadReplay <> loadReplay @@ -763,6 +768,7 @@ class CtrlBlockIO()(implicit p: Parameters, params: BackendParams) extends XSBun val fromCSR = new Bundle{ val toDecode = Input(new CSRToDecode) val traceCSR = Input(new TraceCSR) + val instrAddrTransType = Input(new AddrTransType) } val toIssueBlock = new Bundle { val flush = ValidIO(new Redirect) diff --git a/src/main/scala/xiangshan/backend/ctrlblock/RedirectGenerator.scala b/src/main/scala/xiangshan/backend/ctrlblock/RedirectGenerator.scala index 
9237193b70a..addb100ce48 100644 --- a/src/main/scala/xiangshan/backend/ctrlblock/RedirectGenerator.scala +++ b/src/main/scala/xiangshan/backend/ctrlblock/RedirectGenerator.scala @@ -5,7 +5,7 @@ import chisel3.util._ import chisel3._ import utility.{HasCircularQueuePtrHelper, XORFold, GatedValidRegNext} import xiangshan.frontend.{FtqRead, PreDecodeInfo} -import xiangshan.{MemPredUpdateReq, Redirect, XSBundle, XSModule} +import xiangshan.{MemPredUpdateReq, Redirect, XSBundle, XSModule, AddrTransType} class RedirectGenerator(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper { @@ -15,6 +15,7 @@ class RedirectGenerator(implicit p: Parameters) extends XSModule val hartId = Input(UInt(8.W)) val oldestExuRedirect = Flipped(ValidIO(new Redirect)) + val instrAddrTransType = Input(new AddrTransType) val oldestExuOutPredecode = Input(new PreDecodeInfo) // guarded by exuRedirect.valid val loadReplay = Flipped(ValidIO(new Redirect)) val robFlush = Flipped(ValidIO(new Redirect)) @@ -29,12 +30,17 @@ class RedirectGenerator(implicit p: Parameters) extends XSModule val loadRedirect = io.loadReplay val robFlush = io.robFlush - val allRedirect: Vec[ValidIO[Redirect]] = VecInit(io.oldestExuRedirect, loadRedirect) + val oldestExuRedirect = Wire(chiselTypeOf(io.oldestExuRedirect)) + oldestExuRedirect := io.oldestExuRedirect + oldestExuRedirect.bits.fullTarget := Cat(io.oldestExuRedirect.bits.fullTarget.head(XLEN - VAddrBits), io.oldestExuRedirect.bits.cfiUpdate.target) + oldestExuRedirect.bits.cfiUpdate.backendIAF := io.instrAddrTransType.checkAccessFault(oldestExuRedirect.bits.fullTarget) + oldestExuRedirect.bits.cfiUpdate.backendIPF := io.instrAddrTransType.checkPageFault(oldestExuRedirect.bits.fullTarget) + oldestExuRedirect.bits.cfiUpdate.backendIGPF := io.instrAddrTransType.checkGuestPageFault(oldestExuRedirect.bits.fullTarget) + val allRedirect: Vec[ValidIO[Redirect]] = VecInit(oldestExuRedirect, loadRedirect) val oldestOneHot = Redirect.selectOldestRedirect(allRedirect) val flushAfter = RegInit(0.U.asTypeOf(ValidIO(new Redirect))) val needFlushVec = VecInit(allRedirect.map(_.bits.robIdx.needFlush(flushAfter) || robFlush.valid)) val oldestValid = VecInit(oldestOneHot.zip(needFlushVec).map { case (v, f) => v && !f }).asUInt.orR - val oldestExuRedirect = io.oldestExuRedirect val oldestExuPredecode = io.oldestExuOutPredecode val oldestRedirect = Mux1H(oldestOneHot, allRedirect) val s1_redirect_bits_reg = RegEnable(oldestRedirect.bits, oldestValid) From 2c7f1c80b3ef1dc7acff12b94c37592c1e0ca066 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Thu, 5 Dec 2024 18:41:32 +0800 Subject: [PATCH 14/32] fix(rob): fix bug of canAcceptForDispatch --- src/main/scala/xiangshan/backend/rob/Rab.scala | 3 +++ src/main/scala/xiangshan/backend/rob/Rob.scala | 2 +- src/main/scala/xiangshan/backend/rob/VTypeBuffer.scala | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/rob/Rab.scala b/src/main/scala/xiangshan/backend/rob/Rab.scala index 2f785c84d6f..34d8c106e01 100644 --- a/src/main/scala/xiangshan/backend/rob/Rab.scala +++ b/src/main/scala/xiangshan/backend/rob/Rab.scala @@ -49,6 +49,7 @@ class RenameBuffer(size: Int)(implicit p: Parameters) extends XSModule with HasC val snpt = Input(new SnapshotPort) val canEnq = Output(Bool()) + val canEnqForDispatch = Output(Bool()) val enqPtrVec = Output(Vec(RenameWidth, new RenameBufferPtr)) val commits = Output(new RabCommitIO) @@ -260,8 +261,10 @@ class RenameBuffer(size: Int)(implicit p: 
Parameters) extends XSModule with HasC val numValidEntries = distanceBetween(enqPtr, deqPtr) val allowEnqueue = GatedValidRegNext(numValidEntries + enqCount <= (size - RenameWidth).U, true.B) + val allowEnqueueForDispatch = GatedValidRegNext(numValidEntries + enqCount <= (size - 2*RenameWidth).U, true.B) io.canEnq := allowEnqueue && state === s_idle + io.canEnqForDispatch := allowEnqueueForDispatch && state === s_idle io.enqPtrVec := enqPtrVec io.status.walkEnd := walkEndNext diff --git a/src/main/scala/xiangshan/backend/rob/Rob.scala b/src/main/scala/xiangshan/backend/rob/Rob.scala index 189fe15c860..d824f65f418 100644 --- a/src/main/scala/xiangshan/backend/rob/Rob.scala +++ b/src/main/scala/xiangshan/backend/rob/Rob.scala @@ -176,7 +176,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val walkPtr = walkPtrVec(0) val allocatePtrVec = VecInit((0 until RenameWidth).map(i => enqPtrVec(PopCount(io.enq.req.take(i).map(req => req.valid && req.bits.firstUop))))) io.enq.canAccept := allowEnqueue && !hasBlockBackward && rab.io.canEnq && vtypeBuffer.io.canEnq && !io.fromVecExcpMod.busy - io.enq.canAcceptForDispatch := allowEnqueueForDispatch && !hasBlockBackward && rab.io.canEnq && vtypeBuffer.io.canEnq && !io.fromVecExcpMod.busy + io.enq.canAcceptForDispatch := allowEnqueueForDispatch && !hasBlockBackward && rab.io.canEnqForDispatch && vtypeBuffer.io.canEnqForDispatch && !io.fromVecExcpMod.busy io.enq.resp := allocatePtrVec val canEnqueue = VecInit(io.enq.req.map(req => req.valid && req.bits.firstUop && io.enq.canAccept)) val timer = GTimer() diff --git a/src/main/scala/xiangshan/backend/rob/VTypeBuffer.scala b/src/main/scala/xiangshan/backend/rob/VTypeBuffer.scala index 2a6f76a5273..0f47de6ba13 100644 --- a/src/main/scala/xiangshan/backend/rob/VTypeBuffer.scala +++ b/src/main/scala/xiangshan/backend/rob/VTypeBuffer.scala @@ -43,6 +43,7 @@ class VTypeBufferIO(size: Int)(implicit p: Parameters) extends XSBundle { val snpt = Input(new SnapshotPort) val canEnq = Output(Bool()) + val canEnqForDispatch = Output(Bool()) val toDecode = Output(new Bundle { val isResumeVType = Bool() @@ -263,6 +264,10 @@ class VTypeBuffer(size: Int)(implicit p: Parameters) extends XSModule with HasCi numValidEntries + enqCount <= (size - RenameWidth).U, true.B ) + val allowEnqueueForDispatch = GatedValidRegNext( + numValidEntries + enqCount <= (size - 2*RenameWidth).U, + true.B + ) private val decodeResumeVType = RegInit(0.U.asTypeOf(new ValidIO(VType()))) private val newestVType = PriorityMux(walkValidVec.zip(infoVec).map { case(walkValid, info) => walkValid -> info }.reverse) @@ -292,6 +297,7 @@ class VTypeBuffer(size: Int)(implicit p: Parameters) extends XSModule with HasCi } io.canEnq := allowEnqueue && state === s_idle + io.canEnqForDispatch := allowEnqueueForDispatch && state === s_idle io.status.walkEnd := walkEndNext // update vtype in decode when VTypeBuffer resumes from walk state // note that VTypeBuffer can still send resuming request in the first cycle of s_idle From 60a5bebb3beb7970fb8ece7a32584e6fac628f41 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Sun, 8 Dec 2024 17:51:06 +0800 Subject: [PATCH 15/32] area(exu): ctrl signals only pipe once in exu --- .../scala/xiangshan/backend/exu/ExeUnit.scala | 40 ++++++++++++++++ .../scala/xiangshan/backend/fu/FuConfig.scala | 2 + .../scala/xiangshan/backend/fu/FuncUnit.scala | 46 +++++++++---------- .../scala/xiangshan/backend/fu/fpu/FMA.scala | 8 ++-- 4 files changed, 69 insertions(+), 27 deletions(-) diff 
--git a/src/main/scala/xiangshan/backend/exu/ExeUnit.scala b/src/main/scala/xiangshan/backend/exu/ExeUnit.scala index 48fd073fe0e..9d5e632028f 100644 --- a/src/main/scala/xiangshan/backend/exu/ExeUnit.scala +++ b/src/main/scala/xiangshan/backend/exu/ExeUnit.scala @@ -225,6 +225,20 @@ class ExeUnitImp( in1ToN.io.in.bits := io.in.bits io.in.ready := !busy && in1ToN.io.in.ready + def pipelineReg(init: ExuInput, valid: Bool, latency: Int, flush: ValidIO[Redirect]): (Seq[ExuInput], Seq[Bool]) = { + val validVec = valid +: Seq.fill(latency)(RegInit(false.B)) + val inVec = init +: Seq.fill(latency)(Reg(new ExuInput(exuParams))) + val robIdxVec = inVec.map(_.robIdx) + // if flush(0), valid 0 will not be given, so set flushVec(0) to false.B + val flushVec = validVec.zip(robIdxVec).map(x => x._1 && x._2.needFlush(flush)) + for (i <- 1 to latency) { + validVec(i) := validVec(i - 1) && !flushVec(i - 1) + inVec(i) := inVec(i - 1) + } + (inVec, validVec) + } + val latencyMax = fuCfgs.map(_.latency.latencyVal.getOrElse(0)).max + val inPipe = pipelineReg(io.in.bits, io.in.valid, latencyMax, io.flush) // Dispatcher.out <---> FunctionUnits in1ToN.io.out.zip(funcUnits.map(_.io.in)).foreach { case (source: DecoupledIO[ExuInput], sink: DecoupledIO[FuncUnitInput]) => @@ -255,6 +269,32 @@ class ExeUnitImp( sink.bits.ctrl.vpu .foreach(x => x.fpu.isFP64Instr := 0.U) sink.bits.perfDebugInfo := source.bits.perfDebugInfo } + funcUnits.filter(_.cfg.latency.latencyVal.nonEmpty).map{ fu => + val latency = fu.cfg.latency.latencyVal.getOrElse(0) + for (i <- 0 until (latency+1)) { + val sink = fu.io.in.bits.ctrlPipe.get(i) + val source = inPipe._1(i) + fu.io.in.bits.validPipe.get(i) := inPipe._2(i) + sink.fuOpType := source.fuOpType + sink.robIdx := source.robIdx + sink.pdest := source.pdest + sink.rfWen.foreach(x => x := source.rfWen.get) + sink.fpWen.foreach(x => x := source.fpWen.get) + sink.vecWen.foreach(x => x := source.vecWen.get) + sink.v0Wen.foreach(x => x := source.v0Wen.get) + sink.vlWen.foreach(x => x := source.vlWen.get) + sink.flushPipe.foreach(x => x := source.flushPipe.get) + sink.preDecode.foreach(x => x := source.preDecode.get) + sink.ftqIdx.foreach(x => x := source.ftqIdx.get) + sink.ftqOffset.foreach(x => x := source.ftqOffset.get) + sink.predictInfo.foreach(x => x := source.predictInfo.get) + sink.fpu.foreach(x => x := source.fpu.get) + sink.vpu.foreach(x => x := source.vpu.get) + sink.vpu.foreach(x => x.fpu.isFpToVecInst := 0.U) + sink.vpu.foreach(x => x.fpu.isFP32Instr := 0.U) + sink.vpu.foreach(x => x.fpu.isFP64Instr := 0.U) + } + } private val OutresVecs = funcUnits.map { fu => def latDiff :Int = fu.cfg.latency.extraLatencyVal.getOrElse(0) diff --git a/src/main/scala/xiangshan/backend/fu/FuConfig.scala b/src/main/scala/xiangshan/backend/fu/FuConfig.scala index a7928c80934..dbf77b2ffda 100644 --- a/src/main/scala/xiangshan/backend/fu/FuConfig.scala +++ b/src/main/scala/xiangshan/backend/fu/FuConfig.scala @@ -191,6 +191,8 @@ case class FuConfig ( def isSta: Boolean = name.contains("sta") + def isStd: Boolean = name.contains("std") + def ckAlwaysEn: Boolean = isCsr || isFence /** diff --git a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala index 186083cd7c4..50374327be5 100644 --- a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala @@ -70,7 +70,10 @@ class FuncUnitDataOutput(cfg: FuConfig)(implicit p: Parameters) extends XSBundle } class FuncUnitInput(cfg: FuConfig)(implicit p: Parameters) 
extends XSBundle { + val needCtrlPipe = cfg.latency.latencyVal.nonEmpty && (!cfg.isStd) val ctrl = new FuncUnitCtrlInput(cfg) + val ctrlPipe = OptionWrapper(needCtrlPipe, Vec(cfg.latency.latencyVal.get + 1, new FuncUnitCtrlInput(cfg))) + val validPipe = OptionWrapper(needCtrlPipe, Vec(cfg.latency.latencyVal.get + 1, Bool())) val data = new FuncUnitDataInput(cfg) val perfDebugInfo = new PerfDebugInfo() } @@ -176,13 +179,11 @@ trait HasPipelineReg { this: FuncUnit => rdyVec(i) := !validVec(i + 1) || rdyVec(i + 1).asTypeOf(Bool()) } for (i <- 1 to latency) { - when(rdyVec(i - 1) && validVec(i - 1) && !flushVec(i - 1)) { - validVec(i) := validVec(i - 1) + validVec(i) := validVec(i - 1) + when(rdyVec(i - 1) && validVec(i - 1)) { ctrlVec(i) := ctrlVec(i - 1) dataVec(i) := dataVec(i - 1) perfVec(i) := perfVec(i - 1) - }.elsewhen(flushVec(i) || rdyVec(i)) { - validVec(i) := false.B } } @@ -190,58 +191,57 @@ trait HasPipelineReg { this: FuncUnit => case(( ctrl,data), perf) => { val out = Wire(new FuncUnitInput(cfg)) out.ctrl := ctrl + out.ctrlPipe.foreach(_ := 0.U.asTypeOf(out.ctrlPipe.get)) + out.validPipe.foreach(_ := 0.U.asTypeOf(out.validPipe.get)) out.data := data out.perfDebugInfo := perf out } },validVec, rdyVec) } - val (pipeReg : Seq[FuncUnitInput],validVec ,rdyVec ) = pipelineReg(io.in.bits, io.in.valid,io.out.ready,preLat, io.flush) - val ctrlVec = pipeReg.map(_.ctrl) + val (pipeReg : Seq[FuncUnitInput], validVecThisFu ,rdyVec ) = pipelineReg(io.in.bits, io.in.valid,io.out.ready,preLat, io.flush) + val validVec = io.in.bits.validPipe.get.zip(validVecThisFu).map(x => x._1 && x._2) + val ctrlVec = io.in.bits.ctrlPipe.get val dataVec = pipeReg.map(_.data) val perfVec = pipeReg.map(_.perfDebugInfo) - val robIdxVec = ctrlVec.map(_.robIdx) - val pipeflushVec = validVec.zip(robIdxVec).map(x => x._1 && x._2.needFlush(io.flush)) val fixtiminginit = Wire(new FuncUnitInput(cfg)) fixtiminginit.ctrl := ctrlVec.last + fixtiminginit.ctrlPipe.foreach(_ := 0.U.asTypeOf(fixtiminginit.ctrlPipe.get)) + fixtiminginit.validPipe.foreach(_ := 0.U.asTypeOf(fixtiminginit.validPipe.get)) fixtiminginit.data := dataVec.last fixtiminginit.perfDebugInfo := perfVec.last // fixtiming pipelinereg val (fixpipeReg : Seq[FuncUnitInput], fixValidVec, fixRdyVec) = pipelineReg(fixtiminginit, validVec.last,rdyVec.head ,latdiff, io.flush) - val fixCtrlVec = fixpipeReg.map(_.ctrl) val fixDataVec = fixpipeReg.map(_.data) val fixPerfVec = fixpipeReg.map(_.perfDebugInfo) - val fixrobIdxVec = ctrlVec.map(_.robIdx) - val fixflushVec = fixValidVec.zip(fixrobIdxVec).map(x => x._1 && x._2.needFlush(io.flush)) - val flushVec = pipeflushVec ++ fixflushVec val pcVec = fixDataVec.map(_.pc) io.in.ready := fixRdyVec.head io.out.valid := fixValidVec.last - io.out.bits.ctrl.robIdx := fixCtrlVec.last.robIdx - io.out.bits.ctrl.pdest := fixCtrlVec.last.pdest - io.out.bits.ctrl.rfWen.foreach(_ := fixCtrlVec.last.rfWen.get) - io.out.bits.ctrl.fpWen.foreach(_ := fixCtrlVec.last.fpWen.get) - io.out.bits.ctrl.vecWen.foreach(_ := fixCtrlVec.last.vecWen.get) - io.out.bits.ctrl.v0Wen.foreach(_ := fixCtrlVec.last.v0Wen.get) - io.out.bits.ctrl.vlWen.foreach(_ := fixCtrlVec.last.vlWen.get) - io.out.bits.ctrl.fpu.foreach(_ := fixCtrlVec.last.fpu.get) - io.out.bits.ctrl.vpu.foreach(_ := fixCtrlVec.last.vpu.get) + io.out.bits.ctrl.robIdx := ctrlVec.last.robIdx + io.out.bits.ctrl.pdest := ctrlVec.last.pdest + io.out.bits.ctrl.rfWen.foreach(_ := ctrlVec.last.rfWen.get) + io.out.bits.ctrl.fpWen.foreach(_ := ctrlVec.last.fpWen.get) + 
io.out.bits.ctrl.vecWen.foreach(_ := ctrlVec.last.vecWen.get) + io.out.bits.ctrl.v0Wen.foreach(_ := ctrlVec.last.v0Wen.get) + io.out.bits.ctrl.vlWen.foreach(_ := ctrlVec.last.vlWen.get) + io.out.bits.ctrl.fpu.foreach(_ := ctrlVec.last.fpu.get) + io.out.bits.ctrl.vpu.foreach(_ := ctrlVec.last.vpu.get) io.out.bits.perfDebugInfo := fixPerfVec.last // vstart illegal if (cfg.exceptionOut.nonEmpty) { - val outVstart = fixCtrlVec.last.vpu.get.vstart + val outVstart = ctrlVec.last.vpu.get.vstart val vstartIllegal = outVstart =/= 0.U io.out.bits.ctrl.exceptionVec.get := 0.U.asTypeOf(io.out.bits.ctrl.exceptionVec.get) io.out.bits.ctrl.exceptionVec.get(illegalInstr) := vstartIllegal } - def regEnable(i: Int): Bool = validVec(i - 1) && rdyVec(i - 1) && !flushVec(i - 1) + def regEnable(i: Int): Bool = validVec(i - 1) && rdyVec(i - 1) def PipelineReg[TT <: Data](i: Int)(next: TT) = { val lat = preLat min i diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FMA.scala b/src/main/scala/xiangshan/backend/fu/fpu/FMA.scala index 680864e5040..48ee87b5013 100644 --- a/src/main/scala/xiangshan/backend/fu/fpu/FMA.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/FMA.scala @@ -51,7 +51,7 @@ class FMUL_pipe(cfg: FuConfig, val mulLat: Int = 2)(implicit p: Parameters) val toAdd = IO(Output(new MulToAddIO(FPU.ftypes))) - val robIdx = robIdxVec(0) + val robIdx = io.in.bits.ctrl.robIdx val fpCtrl = DataHoldBypass(io.in.bits.ctrl.fpu.get, io.in.fire) val typeTagIn = fpCtrl.typeTagIn @@ -100,14 +100,14 @@ class FMUL_pipe(cfg: FuConfig, val mulLat: Int = 2)(implicit p: Parameters) toAdd.rm := S2Reg(S1Reg(rm)) toAdd.mul_out.zip(s3.map(_.io.to_fadd)).foreach(x => x._1 := x._2) toAdd.fpCtrl := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get)) - toAdd.robIdx := robIdxVec(latency) + toAdd.robIdx := io.in.bits.ctrlPipe.get(latency).robIdx toAdd.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest)) toAdd.fpWen := S2Reg(S1Reg(io.in.bits.ctrl.fpWen.get)) io.out.bits.res.data := Mux1H(outSel, s3.zip(FPU.ftypes).map{ case (mod, t) => FPU.box(mod.io.result, t) }) io.out.bits.res.fflags.get := Mux1H(outSel, s3.map(_.io.fflags)) - io.out.bits.ctrl.robIdx := robIdxVec(latency) + io.out.bits.ctrl.robIdx := io.in.bits.ctrlPipe.get(latency).robIdx io.out.bits.ctrl.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest)) io.out.bits.ctrl.fpu.get := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get)) } @@ -163,7 +163,7 @@ class FADD_pipe(cfg: FuConfig, val addLat: Int = 2)(implicit p: Parameters) exte case (mod, t) => FPU.box(mod.io.result, t) }) io.out.bits.res.fflags.get := Mux1H(outSel, s2.map(_.io.fflags)) - io.out.bits.ctrl.robIdx := robIdxVec(latency) + io.out.bits.ctrl.robIdx := io.in.bits.ctrlPipe.get(latency).robIdx io.out.bits.ctrl.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest)) io.out.bits.ctrl.fpu.get := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get)) } From f94811c4116cbf4345186d716237d45cc9744f9e Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Mon, 9 Dec 2024 14:23:32 +0800 Subject: [PATCH 16/32] timing(intRegfile): use IntRegFileSplit for better timing --- .../xiangshan/backend/datapath/DataPath.scala | 2 +- .../xiangshan/backend/regfile/Regfile.scala | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/datapath/DataPath.scala b/src/main/scala/xiangshan/backend/datapath/DataPath.scala index de237ca7f46..cd0ee96fa30 100644 --- a/src/main/scala/xiangshan/backend/datapath/DataPath.scala +++ b/src/main/scala/xiangshan/backend/datapath/DataPath.scala @@ -306,7 +306,7 @@ class DataPathImp(override val 
wrapper: DataPath)(implicit p: Parameters, params io.diffVl.foreach(_ := vlDiffReadData.get) - IntRegFile("IntRegFile", intSchdParams.numPregs, intRfRaddr, intRfRdata, intRfWen, intRfWaddr, intRfWdata, + IntRegFileSplit("IntRegFile", intSchdParams.numPregs, splitNum = 2, intRfRaddr, intRfRdata, intRfWen, intRfWaddr, intRfWdata, bankNum = 1, debugReadAddr = intDiffRead.map(_._1), debugReadData = intDiffRead.map(_._2) diff --git a/src/main/scala/xiangshan/backend/regfile/Regfile.scala b/src/main/scala/xiangshan/backend/regfile/Regfile.scala index bad3c41cb20..05a4c05d37e 100644 --- a/src/main/scala/xiangshan/backend/regfile/Regfile.scala +++ b/src/main/scala/xiangshan/backend/regfile/Regfile.scala @@ -19,6 +19,7 @@ package xiangshan.backend.regfile import org.chipsalliance.cde.config.Parameters import chisel3._ import chisel3.util._ +import utils.OptionWrapper import xiangshan._ import xiangshan.backend.datapath.DataConfig._ import xiangshan.backend.exu.ExeUnitParams @@ -223,6 +224,46 @@ object IntRegFile { } } +object IntRegFileSplit { + // non-return version + def apply( + name : String, + numEntries : Int, + splitNum : Int, + raddr : Seq[UInt], + rdata : Vec[UInt], + wen : Seq[Bool], + waddr : Seq[UInt], + wdata : Seq[UInt], + debugReadAddr: Option[Seq[UInt]], + debugReadData: Option[Vec[UInt]], + withReset : Boolean = false, + bankNum : Int, + )(implicit p: Parameters): Unit = { + require(Seq(1, 2, 4, 8).contains(splitNum)) + val rdataVec = Wire(Vec(splitNum, Vec(rdata.length, UInt((rdata.head.getWidth / splitNum).W)))) + rdata.zipWithIndex.map{ case (r, i) => + r := Cat((0 until splitNum).map(x => rdataVec(x)(i)).reverse) + } + val debugReadDataVec = OptionWrapper(debugReadData.nonEmpty, Wire(Vec(splitNum, Vec(debugReadData.get.length, UInt((debugReadData.get.head.getWidth / splitNum).W))))) + if (debugReadData.nonEmpty) { + debugReadData.get.zipWithIndex.map { case (r, i) => + r := Cat((0 until splitNum).map(x => debugReadDataVec.get(x)(i)).reverse) + } + } + for (i <- 0 until splitNum){ + val wdataThisPart = wdata.map { case x => + val widthThisPart = x.getWidth / splitNum + x((i + 1) * widthThisPart - 1, i * widthThisPart) + } + val nameSuffix = if (splitNum > 1) s"Part${i}" else "" + Regfile( + name + nameSuffix, numEntries, raddr, rdataVec(i), wen, waddr, wdataThisPart, + hasZero = true, withReset, bankNum, debugReadAddr, OptionWrapper(debugReadData.nonEmpty, debugReadDataVec.get(i))) + } + } +} + object FpRegFile { // non-return version def apply( From baad73ffa86063b1c1bdf65d634a45a688299393 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Mon, 9 Dec 2024 14:25:58 +0800 Subject: [PATCH 17/32] timing(DecodeUnit): remove fpToVecDecoder --- .../xiangshan/backend/decode/DecodeUnit.scala | 69 +++++++++---------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala b/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala index 719e3a23e43..1b3d82c2fcb 100644 --- a/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala +++ b/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala @@ -1019,43 +1019,36 @@ class DecodeUnit(implicit p: Parameters) extends XSModule with DecodeUnitConstan decodedInst.wfflags := wfflagsInsts.map(_ === inst.ALL).reduce(_ || _) decodedInst.needFrm.scalaNeedFrm := scalaNeedFrmInsts.map(_ === inst.ALL).reduce(_ || _) decodedInst.needFrm.vectorNeedFrm := vectorNeedFrmInsts.map(_ === inst.ALL).reduce(_ || _) - val fpToVecDecoder = Module(new FPToVecDecoder()) - 
fpToVecDecoder.io.instr := inst.asUInt - val isFpToVecInst = fpToVecDecoder.io.vpuCtrl.fpu.isFpToVecInst decodedInst.vpu := 0.U.asTypeOf(decodedInst.vpu) // Todo: Connect vpu decoder - when(isFpToVecInst){ - decodedInst.vpu := fpToVecDecoder.io.vpuCtrl - }.otherwise{ - decodedInst.vpu.vill := io.enq.vtype.illegal - decodedInst.vpu.vma := io.enq.vtype.vma - decodedInst.vpu.vta := io.enq.vtype.vta - decodedInst.vpu.vsew := io.enq.vtype.vsew - decodedInst.vpu.vlmul := io.enq.vtype.vlmul - decodedInst.vpu.vm := inst.VM - decodedInst.vpu.nf := inst.NF - decodedInst.vpu.veew := inst.WIDTH - decodedInst.vpu.isReverse := needReverseInsts.map(_ === inst.ALL).reduce(_ || _) - decodedInst.vpu.isExt := vextInsts.map(_ === inst.ALL).reduce(_ || _) - val isNarrow = narrowInsts.map(_ === inst.ALL).reduce(_ || _) - val isDstMask = maskDstInsts.map(_ === inst.ALL).reduce(_ || _) - val isOpMask = maskOpInsts.map(_ === inst.ALL).reduce(_ || _) - val isVload = FuType.isVLoad(decodedInst.fuType) - val isVlx = isVload && (decodedInst.fuOpType === VlduType.vloxe || decodedInst.fuOpType === VlduType.vluxe) - val isVle = isVload && (decodedInst.fuOpType === VlduType.vle || decodedInst.fuOpType === VlduType.vleff || decodedInst.fuOpType === VlduType.vlse) - val isVlm = isVload && (decodedInst.fuOpType === VlduType.vlm) - val isFof = isVload && (decodedInst.fuOpType === VlduType.vleff) - val isWritePartVd = decodedInst.uopSplitType === UopSplitType.VEC_VRED || decodedInst.uopSplitType === UopSplitType.VEC_0XV || decodedInst.uopSplitType === UopSplitType.VEC_VWW - val isVma = vmaInsts.map(_ === inst.ALL).reduce(_ || _) - val emulIsFrac = Cat(~decodedInst.vpu.vlmul(2), decodedInst.vpu.vlmul(1, 0)) +& decodedInst.vpu.veew < 4.U +& decodedInst.vpu.vsew - val vstartIsNotZero = io.enq.vstart =/= 0.U - decodedInst.vpu.isNarrow := isNarrow - decodedInst.vpu.isDstMask := isDstMask - decodedInst.vpu.isOpMask := isOpMask - decodedInst.vpu.isDependOldVd := isVppu || isVecOPF || isVStore || (isDstMask && !isOpMask) || isNarrow || isVlx || isVma || isFof || vstartIsNotZero - decodedInst.vpu.isWritePartVd := isWritePartVd || isVlm || isVle && emulIsFrac - decodedInst.vpu.vstart := io.enq.vstart - decodedInst.vpu.isVleff := isFof && inst.NF === 0.U - } + decodedInst.vpu.vill := io.enq.vtype.illegal + decodedInst.vpu.vma := io.enq.vtype.vma + decodedInst.vpu.vta := io.enq.vtype.vta + decodedInst.vpu.vsew := io.enq.vtype.vsew + decodedInst.vpu.vlmul := io.enq.vtype.vlmul + decodedInst.vpu.vm := inst.VM + decodedInst.vpu.nf := inst.NF + decodedInst.vpu.veew := inst.WIDTH + decodedInst.vpu.isReverse := needReverseInsts.map(_ === inst.ALL).reduce(_ || _) + decodedInst.vpu.isExt := vextInsts.map(_ === inst.ALL).reduce(_ || _) + val isNarrow = narrowInsts.map(_ === inst.ALL).reduce(_ || _) + val isDstMask = maskDstInsts.map(_ === inst.ALL).reduce(_ || _) + val isOpMask = maskOpInsts.map(_ === inst.ALL).reduce(_ || _) + val isVload = FuType.isVLoad(decodedInst.fuType) + val isVlx = isVload && (decodedInst.fuOpType === VlduType.vloxe || decodedInst.fuOpType === VlduType.vluxe) + val isVle = isVload && (decodedInst.fuOpType === VlduType.vle || decodedInst.fuOpType === VlduType.vleff || decodedInst.fuOpType === VlduType.vlse) + val isVlm = isVload && (decodedInst.fuOpType === VlduType.vlm) + val isFof = isVload && (decodedInst.fuOpType === VlduType.vleff) + val isWritePartVd = decodedInst.uopSplitType === UopSplitType.VEC_VRED || decodedInst.uopSplitType === UopSplitType.VEC_0XV || decodedInst.uopSplitType === UopSplitType.VEC_VWW + val isVma 
= vmaInsts.map(_ === inst.ALL).reduce(_ || _) + val emulIsFrac = Cat(~decodedInst.vpu.vlmul(2), decodedInst.vpu.vlmul(1, 0)) +& decodedInst.vpu.veew < 4.U +& decodedInst.vpu.vsew + val vstartIsNotZero = io.enq.vstart =/= 0.U + decodedInst.vpu.isNarrow := isNarrow + decodedInst.vpu.isDstMask := isDstMask + decodedInst.vpu.isOpMask := isOpMask + decodedInst.vpu.isDependOldVd := isVppu || isVecOPF || isVStore || (isDstMask && !isOpMask) || isNarrow || isVlx || isVma || isFof || vstartIsNotZero + decodedInst.vpu.isWritePartVd := isWritePartVd || isVlm || isVle && emulIsFrac + decodedInst.vpu.vstart := io.enq.vstart + decodedInst.vpu.isVleff := isFof && inst.NF === 0.U decodedInst.vpu.specVill := io.enq.vtype.illegal decodedInst.vpu.specVma := io.enq.vtype.vma decodedInst.vpu.specVta := io.enq.vtype.vta @@ -1064,8 +1057,8 @@ class DecodeUnit(implicit p: Parameters) extends XSModule with DecodeUnitConstan decodedInst.vlsInstr := isVls - decodedInst.srcType(3) := Mux(inst.VM === 0.U && !isFpToVecInst, SrcType.vp, SrcType.DC) // mask src - decodedInst.srcType(4) := Mux(!isFpToVecInst, SrcType.vp, SrcType.DC) // vconfig + decodedInst.srcType(3) := Mux(inst.VM === 0.U, SrcType.vp, SrcType.DC) // mask src + decodedInst.srcType(4) := SrcType.vp // vconfig val uopInfoGen = Module(new UopInfoGen) uopInfoGen.io.in.preInfo.typeOfSplit := decodedInst.uopSplitType From e31c5cbf7c59985b81097b7ef410d17c1190fb53 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Tue, 10 Dec 2024 10:56:33 +0800 Subject: [PATCH 18/32] area(exu): data signals only pipe once in exu --- src/main/scala/xiangshan/backend/exu/ExeUnit.scala | 6 ++++++ src/main/scala/xiangshan/backend/fu/FuncUnit.scala | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/exu/ExeUnit.scala b/src/main/scala/xiangshan/backend/exu/ExeUnit.scala index 9d5e632028f..6e7b54ea9f6 100644 --- a/src/main/scala/xiangshan/backend/exu/ExeUnit.scala +++ b/src/main/scala/xiangshan/backend/exu/ExeUnit.scala @@ -293,6 +293,12 @@ class ExeUnitImp( sink.vpu.foreach(x => x.fpu.isFpToVecInst := 0.U) sink.vpu.foreach(x => x.fpu.isFP32Instr := 0.U) sink.vpu.foreach(x => x.fpu.isFP64Instr := 0.U) + val sinkData = fu.io.in.bits.dataPipe.get(i) + val sourceData = inPipe._1(i) + sinkData.src.zip(sourceData.src).foreach { case (fuSrc, exuSrc) => fuSrc := exuSrc } + sinkData.pc.foreach(x => x := sourceData.pc.get) + sinkData.nextPcOffset.foreach(x => x := sourceData.nextPcOffset.get) + sinkData.imm := sourceData.imm } } diff --git a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala index 50374327be5..68f6e48acae 100644 --- a/src/main/scala/xiangshan/backend/fu/FuncUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/FuncUnit.scala @@ -75,6 +75,7 @@ class FuncUnitInput(cfg: FuConfig)(implicit p: Parameters) extends XSBundle { val ctrlPipe = OptionWrapper(needCtrlPipe, Vec(cfg.latency.latencyVal.get + 1, new FuncUnitCtrlInput(cfg))) val validPipe = OptionWrapper(needCtrlPipe, Vec(cfg.latency.latencyVal.get + 1, Bool())) val data = new FuncUnitDataInput(cfg) + val dataPipe = OptionWrapper(needCtrlPipe, Vec(cfg.latency.latencyVal.get + 1, new FuncUnitDataInput(cfg))) val perfDebugInfo = new PerfDebugInfo() } @@ -193,6 +194,7 @@ trait HasPipelineReg { this: FuncUnit => out.ctrl := ctrl out.ctrlPipe.foreach(_ := 0.U.asTypeOf(out.ctrlPipe.get)) out.validPipe.foreach(_ := 0.U.asTypeOf(out.validPipe.get)) + out.dataPipe.foreach(_ := 0.U.asTypeOf(out.dataPipe.get)) 
out.data := data out.perfDebugInfo := perf out @@ -202,7 +204,7 @@ trait HasPipelineReg { this: FuncUnit => val (pipeReg : Seq[FuncUnitInput], validVecThisFu ,rdyVec ) = pipelineReg(io.in.bits, io.in.valid,io.out.ready,preLat, io.flush) val validVec = io.in.bits.validPipe.get.zip(validVecThisFu).map(x => x._1 && x._2) val ctrlVec = io.in.bits.ctrlPipe.get - val dataVec = pipeReg.map(_.data) + val dataVec = io.in.bits.dataPipe.get val perfVec = pipeReg.map(_.perfDebugInfo) @@ -210,6 +212,7 @@ trait HasPipelineReg { this: FuncUnit => fixtiminginit.ctrl := ctrlVec.last fixtiminginit.ctrlPipe.foreach(_ := 0.U.asTypeOf(fixtiminginit.ctrlPipe.get)) fixtiminginit.validPipe.foreach(_ := 0.U.asTypeOf(fixtiminginit.validPipe.get)) + fixtiminginit.dataPipe.foreach(_ := 0.U.asTypeOf(fixtiminginit.dataPipe.get)) fixtiminginit.data := dataVec.last fixtiminginit.perfDebugInfo := perfVec.last From 5f0b975dba9f594fbb4b263e02c9108a9946c527 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Tue, 10 Dec 2024 10:57:57 +0800 Subject: [PATCH 19/32] timing(backend): each IQ has at least two simple entries --- src/main/scala/xiangshan/Parameters.scala | 30 +++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala index 24fbcc0af91..94f4082569e 100644 --- a/src/main/scala/xiangshan/Parameters.scala +++ b/src/main/scala/xiangshan/Parameters.scala @@ -431,14 +431,14 @@ case class XSCoreParameters IssueBlockParams(Seq( ExeUnitParams("FEX0", Seq(FaluCfg, FcvtCfg, F2vCfg, FmacCfg), Seq(FpWB(port = 0, 0), IntWB(port = 0, 2), VfWB(port = 3, 0), V0WB(port = 3, 0)), Seq(Seq(FpRD(0, 0)), Seq(FpRD(1, 0)), Seq(FpRD(2, 0)))), ExeUnitParams("FEX1", Seq(FdivCfg), Seq(FpWB(port = 3, 1)), Seq(Seq(FpRD(2, 1)), Seq(FpRD(5, 1)))), - ), numEntries = 18, numEnq = 2, numComp = 16), + ), numEntries = 18, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("FEX2", Seq(FaluCfg, FmacCfg), Seq(FpWB(port = 1, 0), IntWB(port = 1, 2)), Seq(Seq(FpRD(3, 0)), Seq(FpRD(4, 0)), Seq(FpRD(5, 0)))), ExeUnitParams("FEX3", Seq(FdivCfg), Seq(FpWB(port = 4, 1)), Seq(Seq(FpRD(8, 1)), Seq(FpRD(9, 1)))), - ), numEntries = 18, numEnq = 2, numComp = 16), + ), numEntries = 18, numEnq = 2, numComp = 14), IssueBlockParams(Seq( ExeUnitParams("FEX4", Seq(FaluCfg, FmacCfg), Seq(FpWB(port = 2, 0), IntWB(port = 2, 1)), Seq(Seq(FpRD(6, 0)), Seq(FpRD(7, 0)), Seq(FpRD(8, 0)))), - ), numEntries = 18, numEnq = 2, numComp = 16), + ), numEntries = 18, numEnq = 2, numComp = 14), ), numPregs = fpPreg.numEntries, numDeqOutside = 0, @@ -454,14 +454,14 @@ case class XSCoreParameters IssueBlockParams(Seq( ExeUnitParams("VFEX0", Seq(VfmaCfg, VialuCfg, VimacCfg, VppuCfg), Seq(VfWB(port = 0, 0), V0WB(port = 0, 0)), Seq(Seq(VfRD(0, 0)), Seq(VfRD(1, 0)), Seq(VfRD(2, 0)), Seq(V0RD(0, 0)), Seq(VlRD(0, 0)))), ExeUnitParams("VFEX1", Seq(VfaluCfg, VfcvtCfg, VipuCfg, VSetRvfWvfCfg), Seq(VfWB(port = 0, 1), V0WB(port = 0, 1), VlWB(port = vfSchdVlWbPort, 0), IntWB(port = 1, 1), FpWB(port = 0, 1)), Seq(Seq(VfRD(0, 1)), Seq(VfRD(1, 1)), Seq(VfRD(2, 1)), Seq(V0RD(0, 1)), Seq(VlRD(0, 1)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("VFEX2", Seq(VfmaCfg, VialuCfg), Seq(VfWB(port = 1, 0), V0WB(port = 1, 0)), Seq(Seq(VfRD(3, 0)), Seq(VfRD(4, 0)), Seq(VfRD(5, 0)), Seq(V0RD(1, 0)), Seq(VlRD(1, 0)))), ExeUnitParams("VFEX3", Seq(VfaluCfg), Seq(VfWB(port = 2, 1), V0WB(port = 2, 
1), FpWB(port = 1, 1)), Seq(Seq(VfRD(3, 1)), Seq(VfRD(4, 1)), Seq(VfRD(5, 1)), Seq(V0RD(1, 1)), Seq(VlRD(1, 1)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("VFEX4", Seq(VfdivCfg, VidivCfg), Seq(VfWB(port = 3, 1), V0WB(port = 3, 1)), Seq(Seq(VfRD(3, 2)), Seq(VfRD(4, 2)), Seq(VfRD(5, 2)), Seq(V0RD(1, 2)), Seq(VlRD(1, 2)))), - ), numEntries = 10, numEnq = 2, numComp = 8), + ), numEntries = 10, numEnq = 2, numComp = 6), ), numPregs = vfPreg.numEntries, numDeqOutside = 0, @@ -478,31 +478,31 @@ case class XSCoreParameters SchdBlockParams(Seq( IssueBlockParams(Seq( ExeUnitParams("STA0", Seq(StaCfg, MouCfg), Seq(FakeIntWB()), Seq(Seq(IntRD(7, 2)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("STA1", Seq(StaCfg, MouCfg), Seq(FakeIntWB()), Seq(Seq(IntRD(6, 2)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("LDU0", Seq(LduCfg), Seq(IntWB(5, 0), FpWB(3, 0)), Seq(Seq(IntRD(8, 0))), true, 2), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("LDU1", Seq(LduCfg), Seq(IntWB(6, 0), FpWB(4, 0)), Seq(Seq(IntRD(9, 0))), true, 2), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("LDU2", Seq(LduCfg), Seq(IntWB(7, 0), FpWB(5, 0)), Seq(Seq(IntRD(10, 0))), true, 2), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("VLSU0", Seq(VlduCfg, VstuCfg, VseglduSeg, VsegstuCfg), Seq(VfWB(4, 0), V0WB(4, 0), VlWB(port = 2, 0)), Seq(Seq(VfRD(6, 0)), Seq(VfRD(7, 0)), Seq(VfRD(8, 0)), Seq(V0RD(2, 0)), Seq(VlRD(2, 0)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("VLSU1", Seq(VlduCfg, VstuCfg), Seq(VfWB(5, 0), V0WB(5, 0), VlWB(port = 3, 0)), Seq(Seq(VfRD(9, 0)), Seq(VfRD(10, 0)), Seq(VfRD(11, 0)), Seq(V0RD(3, 0)), Seq(VlRD(3, 0)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("STD0", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(5, 2), FpRD(9, 0)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), IssueBlockParams(Seq( ExeUnitParams("STD1", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(3, 2), FpRD(10, 0)))), - ), numEntries = 16, numEnq = 2, numComp = 14), + ), numEntries = 16, numEnq = 2, numComp = 12), ), numPregs = intPreg.numEntries max vfPreg.numEntries, numDeqOutside = 0, From 3d1dc2999ddad2ecbefb8573dce9a32e1030bf3f Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Wed, 11 Dec 2024 16:03:36 +0800 Subject: [PATCH 20/32] fix(pcmem): add read target from newestEntryTarget --- src/main/scala/xiangshan/backend/CtrlBlock.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index d086decc772..9ceebc32bda 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -228,12 +228,18 @@ class CtrlBlockImp( io.toDataPath.pcToDataPathIO.toDataPathPC(i) := pcMem.io.rdata(pcMemIdx).startAddr } + val 
newestEn = RegNext(io.frontend.fromFtq.newest_entry_en) + val newestTarget = RegEnable(io.frontend.fromFtq.newest_entry_target, io.frontend.fromFtq.newest_entry_en) + val newestPtr = RegEnable(io.frontend.fromFtq.newest_entry_ptr, io.frontend.fromFtq.newest_entry_en) + val newestTargetNext = RegEnable(newestTarget, newestEn) for ((pcMemIdx, i) <- pcMemRdIndexes("bjuTarget").zipWithIndex) { val ren = io.toDataPath.pcToDataPathIO.fromDataPathValid(i) + val baseAddr = io.toDataPath.pcToDataPathIO.fromDataPathFtqPtr(i).value val raddr = io.toDataPath.pcToDataPathIO.fromDataPathFtqPtr(i).value + 1.U pcMem.io.ren.get(pcMemIdx) := ren pcMem.io.raddr(pcMemIdx) := raddr - io.toDataPath.pcToDataPathIO.toDataPathTargetPC(i) := pcMem.io.rdata(pcMemIdx).startAddr + val needNewest = RegNext(baseAddr === newestPtr.value) + io.toDataPath.pcToDataPathIO.toDataPathTargetPC(i) := Mux(needNewest, newestTargetNext, pcMem.io.rdata(pcMemIdx).startAddr) } val baseIdx = params.BrhCnt From a79b1248fa5ed720928fb7dea18007d15fe95a21 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Thu, 12 Dec 2024 13:59:04 +0800 Subject: [PATCH 21/32] fix(decode): scalar fp fu's fmt uses fpuCtrl instead of vsew --- src/main/scala/xiangshan/Bundle.scala | 9 - .../xiangshan/backend/decode/FPDecoder.scala | 50 +++- .../scala/xiangshan/backend/fu/FuConfig.scala | 3 +- .../xiangshan/backend/fu/fpu/FDivSqrt.scala | 106 -------- .../scala/xiangshan/backend/fu/fpu/FMA.scala | 246 ------------------ .../xiangshan/backend/fu/fpu/FPToFP.scala | 129 --------- .../xiangshan/backend/fu/fpu/FPToInt.scala | 142 ---------- .../backend/fu/fpu/FpPipedFuncUnit.scala | 3 +- .../xiangshan/backend/fu/wrapper/FALU.scala | 10 +- .../backend/fu/wrapper/FDivSqrt.scala | 16 +- .../xiangshan/backend/fu/wrapper/FMA.scala | 10 +- 11 files changed, 58 insertions(+), 666 deletions(-) delete mode 100644 src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala delete mode 100644 src/main/scala/xiangshan/backend/fu/fpu/FMA.scala delete mode 100644 src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala delete mode 100644 src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala diff --git a/src/main/scala/xiangshan/Bundle.scala b/src/main/scala/xiangshan/Bundle.scala index a5875e8494e..261286f8581 100644 --- a/src/main/scala/xiangshan/Bundle.scala +++ b/src/main/scala/xiangshan/Bundle.scala @@ -167,19 +167,10 @@ class CtrlFlow(implicit p: Parameters) extends XSBundle { class FPUCtrlSignals(implicit p: Parameters) extends XSBundle { - val isAddSub = Bool() // swap23 - val typeTagIn = UInt(2.W) // H S D val typeTagOut = UInt(2.W) // H S D - val fromInt = Bool() val wflags = Bool() - val fpWen = Bool() - val fmaCmd = UInt(2.W) - val div = Bool() - val sqrt = Bool() - val fcvt = Bool() val typ = UInt(2.W) val fmt = UInt(2.W) - val ren3 = Bool() //TODO: remove SrcType.fp val rm = UInt(3.W) } diff --git a/src/main/scala/xiangshan/backend/decode/FPDecoder.scala b/src/main/scala/xiangshan/backend/decode/FPDecoder.scala index add730fd7e4..842ec4b4469 100644 --- a/src/main/scala/xiangshan/backend/decode/FPDecoder.scala +++ b/src/main/scala/xiangshan/backend/decode/FPDecoder.scala @@ -274,14 +274,45 @@ class FPDecoder(implicit p: Parameters) extends XSModule{ val decoder = DecodeLogic(io.instr, default, table) val ctrl = io.fpCtrl - val sigs = Seq( - ctrl.isAddSub, ctrl.typeTagIn, ctrl.typeTagOut, - ctrl.fromInt, ctrl.wflags, ctrl.fpWen, - ctrl.div, ctrl.sqrt, ctrl.fcvt - ) - sigs.zip(decoder).foreach({case (s, d) => s := d}) + val sigs = Seq(ctrl.typeTagOut, ctrl.wflags) 
+ // TODO dirty code + sigs(0) := decoder(2) + sigs(1) := decoder(4) ctrl.typ := inst.TYP - ctrl.fmt := inst.FMT + // scalar cvt inst + val isSew2Cvts = Seq( + FCVT_W_S, FCVT_WU_S, FCVT_L_S, FCVT_LU_S, + FCVT_W_D, FCVT_WU_D, FCVT_S_D, FCVT_D_S, + FMV_X_W, + // zfa inst + FCVTMOD_W_D, + ) + /* + The optype for FCVT_D_H and FCVT_H_D is the same, + so the two instructions are distinguished by sew. + FCVT_H_D:VSew.e64 + FCVT_D_H:VSew.e16 + */ + val isSew2Cvth = Seq( + FCVT_S_H, FCVT_H_S, FCVT_D_H, + FMV_X_H, + FCVT_W_H, FCVT_L_H, FCVT_H_W, + FCVT_H_L, FCVT_H_WU, FCVT_H_LU, + FCVT_WU_H, FCVT_LU_H, + ) + val simpleFmt = Mux1H( + // scala format to vsew format, when inst.FMT === "b11".U, ctrl.fmt := "b00".U + Seq( + (inst.FMT === "b00".U) -> "b10".U, // S + (inst.FMT === "b01".U) -> "b11".U, // D + (inst.FMT === "b10".U) -> "b01".U, // H + ) + ) + val isSew2Cvt32 = isSew2Cvts.map(io.instr === _).reduce(_ || _) + val isSew2Cvt16 = isSew2Cvth.map(io.instr === _).reduce(_ || _) + val complexFmt = Mux(isSew2Cvt32, VSew.e32, VSew.e16) + val isCompFmt = isSew2Cvt32 || isSew2Cvt16 + ctrl.fmt := Mux(isCompFmt, complexFmt, simpleFmt) ctrl.rm := inst.RM val fmaTable: Array[(BitPat, List[BitPat])] = Array( @@ -308,9 +339,4 @@ class FPDecoder(implicit p: Parameters) extends XSModule{ FNMSUB_H-> List(BitPat("b10"),Y) ) val fmaDefault = List(BitPat("b??"), N) - Seq(ctrl.fmaCmd, ctrl.ren3).zip( - DecodeLogic(io.instr, fmaDefault, fmaTable) - ).foreach({ - case (s, d) => s := d - }) } diff --git a/src/main/scala/xiangshan/backend/fu/FuConfig.scala b/src/main/scala/xiangshan/backend/fu/FuConfig.scala index dbf77b2ffda..52fc0414f73 100644 --- a/src/main/scala/xiangshan/backend/fu/FuConfig.scala +++ b/src/main/scala/xiangshan/backend/fu/FuConfig.scala @@ -160,8 +160,7 @@ case class FuConfig ( def needVecCtrl: Boolean = { import FuType._ - Seq(falu, fmac, fDivSqrt, fcvt, - vipu, vialuF, vimac, vidiv, vfpu, vppu, vfalu, vfma, vfdiv, vfcvt, vldu, vstu).contains(fuType) + Seq(vipu, vialuF, vimac, vidiv, vfpu, vppu, vfalu, vfma, vfdiv, vfcvt, vldu, vstu).contains(fuType) } def needCriticalErrors: Boolean = Seq(FuType.csr).contains(fuType) diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala b/src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala deleted file mode 100644 index 3b7cec0731a..00000000000 --- a/src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala +++ /dev/null @@ -1,106 +0,0 @@ -/*************************************************************************************** -* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences -* Copyright (c) 2020-2021 Peng Cheng Laboratory -* -* XiangShan is licensed under Mulan PSL v2. -* You can use this software according to the terms and conditions of the Mulan PSL v2. -* You may obtain a copy of Mulan PSL v2 at: -* http://license.coscl.org.cn/MulanPSL2 -* -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -* -* See the Mulan PSL v2 for more details. 
-***************************************************************************************/ - -package xiangshan.backend.fu.fpu - -import org.chipsalliance.cde.config.Parameters -import chisel3._ -import chisel3.experimental.hierarchy.{Definition, Instance, instantiable, public} -import chisel3.util._ -import fudian.FDIV -import utility.{MaskExpand, RegNextWithEnable} -import xiangshan.backend.fu.FuConfig - -import scala.collection.mutable - -/* - Because fdiv use the decoder and decoder has 'Dedup' bug now, - we use hierarchy API to force FDIV be deduped to avoid the bug. - */ -object FDivGen { - val defMap = new mutable.HashMap[FPU.FType, Definition[InstantiableFDIV]]() - def apply(t: FPU.FType) = { - val divDef = defMap.getOrElseUpdate(t, Definition(new InstantiableFDIV(t))) - Instance(divDef) - } -} - -@instantiable -class InstantiableFDIV(t: FPU.FType) extends Module { - - val div = Module(new FDIV(t.expWidth, t.precision)) - - @public val io = IO(chiselTypeOf(div.io)) - - io <> div.io - -} - -class FDivSqrtDataModule(implicit p: Parameters) extends FPUDataModule { - val in_valid, out_ready = IO(Input(Bool())) - val in_ready, out_valid = IO(Output(Bool())) - val out_validNext = IO(Output(Bool())) - val kill_w = IO(Input(Bool())) - val kill_r = IO(Input(Bool())) - - val in_fire = in_valid && in_ready - val out_fire = out_valid && out_ready - - val fpCtrl = io.in.fpCtrl - val tag = fpCtrl.typeTagIn - val src1 = FPU.unbox(io.in.src(0), tag) - val src2 = FPU.unbox(io.in.src(1), tag) - - val typeSel = VecInit(FPU.ftypes.zipWithIndex.map(_._2.U === tag)) - val outSel = RegEnable(typeSel, VecInit.fill(typeSel.length)(true.B), in_fire) // inelegant - val outDataSel = RegEnable(MaskExpand(typeSel, 64), in_fire) - - val divSqrt = FPU.ftypes.map{ t => - val fdiv = FDivGen(t) - fdiv.io.a := src1 - fdiv.io.b := src2 - fdiv.io.rm := rm - fdiv.io.specialIO.in_valid := in_fire && !kill_w && (FPU.ftypes.indexOf(t).U === tag) - fdiv.io.specialIO.out_ready := out_ready - fdiv.io.specialIO.isSqrt := fpCtrl.sqrt - fdiv.io.specialIO.kill := kill_r - fdiv - } - - in_ready := divSqrt.map(_.io.specialIO.in_ready).foldRight(true.B)(_ && _) - out_validNext := Mux1H(outSel, divSqrt.map(_.io.specialIO.out_valid)) - out_valid := RegNext(out_validNext) - io.out.data := outDataSel.zip(divSqrt.zip(FPU.ftypes).map{ - case (mod, t) => FPU.box(mod.io.result, t) - }).map(x => x._1 & x._2).reduce(_ | _) - io.out.fflags := Mux1H(outSel, divSqrt.map(_.io.fflags)) -} - -class FDivSqrt(cfg: FuConfig)(implicit p: Parameters) extends FPUSubModule(cfg) { - - val robIdxReg = RegEnable(io.in.bits.ctrl.robIdx, io.in.fire) - val kill_r = !io.in.ready && robIdxReg.needFlush(io.flush) - - override val dataModule = Module(new FDivSqrtDataModule) - connectDataModule - dataModule.in_valid := io.in.valid - dataModule.out_ready := io.out.ready - dataModule.kill_w := io.in.bits.ctrl.robIdx.needFlush(io.flush) - dataModule.kill_r := kill_r - io.in.ready := dataModule.in_ready - io.out.valid := dataModule.out_valid - connectNonPipedCtrlSingal -} diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FMA.scala b/src/main/scala/xiangshan/backend/fu/fpu/FMA.scala deleted file mode 100644 index 48ee87b5013..00000000000 --- a/src/main/scala/xiangshan/backend/fu/fpu/FMA.scala +++ /dev/null @@ -1,246 +0,0 @@ -/*************************************************************************************** -* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences -* Copyright (c) 2020-2021 Peng Cheng Laboratory -* -* XiangShan is 
licensed under Mulan PSL v2. -* You can use this software according to the terms and conditions of the Mulan PSL v2. -* You may obtain a copy of Mulan PSL v2 at: -* http://license.coscl.org.cn/MulanPSL2 -* -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -* -* See the Mulan PSL v2 for more details. -***************************************************************************************/ - -package xiangshan.backend.fu.fpu - -import _root_.utils._ -import org.chipsalliance.cde.config.Parameters -import chisel3._ -import chisel3.util._ -import fudian._ -import fudian.utils.Multiplier -import utility._ -import xiangshan._ -import xiangshan.backend.rob.RobPtr -import xiangshan.backend.fu.FuConfig - - -class MulToAddIO(val ftypes: Seq[FPU.FType])(implicit p: Parameters) extends XSBundle { - val mul_out = MixedVec(ftypes.map(t => new FMULToFADD(t.expWidth, t.precision))) - val addend = UInt(ftypes.map(_.len).max.W) - val fpCtrl = new FPUCtrlSignals - val robIdx = new RobPtr - val pdest = UInt(PhyRegIdxWidth.W) - val fpWen = Bool() - val rm = UInt(3.W) - - def getFloat = mul_out.head - def getDouble = mul_out.last -} - -class FMUL_pipe(cfg: FuConfig, val mulLat: Int = 2)(implicit p: Parameters) - extends FPUPipelineModule(cfg) -{ - override def latency: Int = mulLat - override val dataModule: FPUDataModule = null - - private val rm = io.frm.get - - val toAdd = IO(Output(new MulToAddIO(FPU.ftypes))) - - val robIdx = io.in.bits.ctrl.robIdx - val fpCtrl = DataHoldBypass(io.in.bits.ctrl.fpu.get, io.in.fire) - val typeTagIn = fpCtrl.typeTagIn - - val typeSel = VecInit(FPU.ftypes.zipWithIndex.map(_._2.U === typeTagIn)) - - val src1 = FPU.unbox(io.in.bits.data.src(0), typeTagIn) - val src2 = FPU.unbox(io.in.bits.data.src(1), typeTagIn) - - val multiplier = Module(new Multiplier(FPU.ftypes.last.precision+1, pipeAt = Seq(1))) - - val stages = FPU.ftypes.map{ t => - // s1 -> s2 -> s3 - val s1 = Module(new FMUL_s1(t.expWidth, t.precision)) - val s2 = Module(new FMUL_s2(t.expWidth, t.precision)) - val s3 = Module(new FMUL_s3(t.expWidth, t.precision)) - - val in1 = src1 - val in2 = Mux(fpCtrl.fmaCmd(1), invert_sign(src2, t.len), src2) - s1.io.a := in1 - s1.io.b := in2 - s1.io.rm := rm - - s2.io.in := S1Reg(s1.io.out) - s2.io.prod := multiplier.io.result - s3.io.in := S2Reg(s2.io.out) - (s1, s2, s3) - } - - val (s1, s2, s3) = stages.unzip3 - val (mul_a_sel, mul_b_sel) = s1.zipWithIndex.map{ - case (s, i) => - val raw_a = RawFloat.fromUInt(s.io.a, s.expWidth, s.precision) - val raw_b = RawFloat.fromUInt(s.io.b, s.expWidth, s.precision) - ( - (typeTagIn === i.U) -> raw_a.sig, - (typeTagIn === i.U) -> raw_b.sig - ) - }.unzip - multiplier.io.a := Mux1H(mul_a_sel) - multiplier.io.b := Mux1H(mul_b_sel) - multiplier.io.regEnables(0) := regEnable(1) - - val outSel = S2Reg(S1Reg(typeSel)) - - toAdd.addend := S2Reg(S1Reg(io.in.bits.data.src(2))) - toAdd.rm := S2Reg(S1Reg(rm)) - toAdd.mul_out.zip(s3.map(_.io.to_fadd)).foreach(x => x._1 := x._2) - toAdd.fpCtrl := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get)) - toAdd.robIdx := io.in.bits.ctrlPipe.get(latency).robIdx - toAdd.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest)) - toAdd.fpWen := S2Reg(S1Reg(io.in.bits.ctrl.fpWen.get)) - io.out.bits.res.data := Mux1H(outSel, s3.zip(FPU.ftypes).map{ - case (mod, t) => FPU.box(mod.io.result, t) - }) - io.out.bits.res.fflags.get := Mux1H(outSel, s3.map(_.io.fflags)) - 
io.out.bits.ctrl.robIdx := io.in.bits.ctrlPipe.get(latency).robIdx - io.out.bits.ctrl.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest)) - io.out.bits.ctrl.fpu.get := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get)) -} - -class FADD_pipe(cfg: FuConfig, val addLat: Int = 2)(implicit p: Parameters) extends FPUPipelineModule(cfg) { - override val dataModule: FPUDataModule = null - override def latency: Int = addLat - - private val rm = io.frm.get - - val mulToAdd = IO(Input(new MulToAddIO(FPU.ftypes))) - val isFMA = IO(Input(Bool())) - - val src1 = S1Reg(FPU.unbox(io.in.bits.data.src(0), io.in.bits.ctrl.fpu.get.typeTagIn)) - val src2 = S1Reg(FPU.unbox( - Mux(isFMA, mulToAdd.addend, io.in.bits.data.src(1)), io.in.bits.ctrl.fpu.get.typeTagIn - )) - - val fpCtrl = S1Reg(Mux(isFMA, mulToAdd.fpCtrl, io.in.bits.ctrl.fpu.get)) - val typeTagIn = fpCtrl.typeTagIn - - val fma = S1Reg(isFMA) - val mulProd = S1Reg(mulToAdd.mul_out) - - val stages = FPU.ftypes.zipWithIndex.map{ - case (t, i) => - val s1 = Module(new FCMA_ADD_s1(t.expWidth, 2*t.precision, t.precision)) - val s2 = Module(new FCMA_ADD_s2(t.expWidth, 2*t.precision, t.precision)) - val in1 = Mux(fma, - mulProd(i).fp_prod.asUInt, - Cat(src1(t.len - 1, 0), 0.U(t.precision.W)) - ) - val in2 = Cat( - Mux(fpCtrl.fmaCmd(0), invert_sign(src2, t.len), src2(t.len - 1, 0)), - 0.U(t.precision.W) - ) - s1.io.a := in1 - s1.io.b := in2 - s1.io.b_inter_valid := fma - s1.io.b_inter_flags := Mux(fma, - mulProd(i).inter_flags, - 0.U.asTypeOf(s1.io.b_inter_flags) - ) - s1.io.rm := S1Reg(rm) - s2.io.in := S2Reg(s1.io.out) - (s1, s2) - } - - val (s1, s2) = stages.unzip - - val outSel = S2Reg(VecInit(FPU.ftypes.zipWithIndex.map(_._2.U === typeTagIn))) - io.out.bits.res.data := Mux1H(outSel, s2.zip(FPU.ftypes).map{ - case (mod, t) => FPU.box(mod.io.result, t) - }) - io.out.bits.res.fflags.get := Mux1H(outSel, s2.map(_.io.fflags)) - io.out.bits.ctrl.robIdx := io.in.bits.ctrlPipe.get(latency).robIdx - io.out.bits.ctrl.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest)) - io.out.bits.ctrl.fpu.get := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get)) -} - -class FMA(cfg: FuConfig)(implicit p: Parameters) extends FPUSubModule(cfg) { - private val rm = io.frm.get - override val dataModule = null - val mul_pipe = Module(new FMUL_pipe(cfg)) - val add_pipe = Module(new FADD_pipe(cfg)) - - - mul_pipe.io.flush := io.flush - mul_pipe.io.frm.get := rm - - add_pipe.io.flush := io.flush - add_pipe.io.frm.get := rm - - val fpCtrl = io.in.bits.ctrl.fpu.get - mul_pipe.io.in <> io.in - mul_pipe.io.in.valid := io.in.valid && !fpCtrl.isAddSub - - // For better timing, we let out.valid be true even if it's flushed. - val isFMA = mul_pipe.io.out.valid && mul_pipe.io.out.bits.ctrl.fpu.get.ren3 - // However, when sending instructions to add_pipe, we need to determine whether it's flushed. - val mulFlushed = mul_pipe.io.out.bits.ctrl.robIdx.needFlush(io.flush) - val isFMAReg = isFMA && !mulFlushed - - add_pipe.mulToAdd <> mul_pipe.toAdd - - // For FADD, it accepts instructions from io.in and FMUL. - // When FMUL gives an FMA, FADD accepts this instead of io.in. - // Since FADD gets FMUL data from add_pipe.mulToAdd, only uop needs Mux. 
- add_pipe.io.in.valid := io.in.valid && fpCtrl.isAddSub || isFMAReg - add_pipe.io.in.bits := 0.U.asTypeOf(add_pipe.io.in.bits) - add_pipe.io.in.bits.data.src := io.in.bits.data.src - add_pipe.io.in.bits.ctrl.robIdx := Mux(isFMAReg, add_pipe.mulToAdd.robIdx, io.in.bits.ctrl.robIdx) - add_pipe.io.in.bits.ctrl.pdest := Mux(isFMAReg, add_pipe.mulToAdd.pdest, io.in.bits.ctrl.pdest) - add_pipe.io.in.bits.ctrl.fpu.get := Mux(isFMAReg, add_pipe.mulToAdd.fpCtrl, io.in.bits.ctrl.fpu.get) - add_pipe.io.in.bits.ctrl.fpWen.get := Mux(isFMAReg, add_pipe.mulToAdd.fpWen, io.in.bits.ctrl.fpWen.get) - add_pipe.isFMA := isFMAReg - - // When the in uop is Add/Sub, we check FADD, otherwise fmul is checked. - io.in.ready := Mux(fpCtrl.isAddSub, - !isFMAReg && add_pipe.io.in.ready, - mul_pipe.io.in.ready - ) - - // For FMUL: - // (1) It always accept FMA from FADD (if an FMA wants FMUL, it's never blocked). - // (2) It has lower writeback arbitration priority than FADD (and may be blocked when FMUL.out.valid). - XSError(isFMA && !add_pipe.io.in.ready, "FMA should not be blocked\n") - mul_pipe.io.out.ready := isFMA || (io.out.ready && !add_pipe.io.out.valid) - add_pipe.io.out.ready := io.out.ready - - io.out.bits.ctrl.robIdx := Mux(add_pipe.io.out.valid, - add_pipe.io.out.bits.ctrl.robIdx, - mul_pipe.io.out.bits.ctrl.robIdx - ) - io.out.bits.ctrl.fpu.get := Mux(add_pipe.io.out.valid, - add_pipe.io.out.bits.ctrl.fpu.get, - mul_pipe.io.out.bits.ctrl.fpu.get - ) - io.out.bits.ctrl.pdest := Mux(add_pipe.io.out.valid, - add_pipe.io.out.bits.ctrl.pdest, - mul_pipe.io.out.bits.ctrl.pdest - ) - io.out.bits.res.data := Mux(add_pipe.io.out.valid, - add_pipe.io.out.bits.res.data, - mul_pipe.io.out.bits.res.data - ) - io.out.bits.res.fflags.get := Mux(add_pipe.io.out.valid, - add_pipe.io.out.bits.res.fflags.get, - mul_pipe.io.out.bits.res.fflags.get - ) - io.out.valid := add_pipe.io.out.valid || (mul_pipe.io.out.valid && !isFMA) - io.out.bits.ctrl.fpWen.get := Mux(add_pipe.io.out.valid, - add_pipe.io.out.bits.ctrl.fpWen.get, - mul_pipe.io.out.bits.ctrl.fpWen.get - ) -} diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala b/src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala deleted file mode 100644 index 20e241fc554..00000000000 --- a/src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala +++ /dev/null @@ -1,129 +0,0 @@ -/*************************************************************************************** -* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences -* Copyright (c) 2020-2021 Peng Cheng Laboratory -* -* XiangShan is licensed under Mulan PSL v2. -* You can use this software according to the terms and conditions of the Mulan PSL v2. -* You may obtain a copy of Mulan PSL v2 at: -* http://license.coscl.org.cn/MulanPSL2 -* -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -* -* See the Mulan PSL v2 for more details. -***************************************************************************************/ - -// See LICENSE.Berkeley for license details. -// See LICENSE.SiFive for license details. 
- -package xiangshan.backend.fu.fpu - -import org.chipsalliance.cde.config.Parameters -import chisel3._ -import chisel3.util._ -import fudian.{FCMP, FloatPoint} -import xiangshan.backend.fu.FuConfig - -class FPToFPDataModule(latency: Int)(implicit p: Parameters) extends FPUDataModule { - - val regEnables = IO(Input(Vec(latency, Bool()))) - - val ctrlIn = io.in.fpCtrl - val ctrl = RegEnable(ctrlIn, regEnables(0)) - val inTag = ctrl.typeTagIn - val outTag = ctrl.typeTagOut - val wflags = ctrl.wflags - val src1 = RegEnable(FPU.unbox(io.in.src(0), ctrlIn.typeTagIn), regEnables(0)) - val src2 = RegEnable(FPU.unbox(io.in.src(1), ctrlIn.typeTagIn), regEnables(0)) - val rmReg = RegEnable(rm, regEnables(0)) - - val signNum = Mux(rmReg(1), src1 ^ src2, Mux(rmReg(0), ~src2, src2)) - val fsgnj = VecInit(FPU.ftypes.map { t => - Cat(signNum(t.len - 1), src1(t.len - 2, 0)) - })(inTag) - -// val signNum = Mux(rmReg(1), src1 ^ src2, Mux(rmReg(0), ~src2, src2)) -// val fsgnj = Cat(signNum(fLen - 1), src1(fLen - 2, 0)) - - val fsgnjMux = Wire(new Bundle() { - val data = UInt(XLEN.W) - val exc = UInt(5.W) - }) - fsgnjMux.data := fsgnj - fsgnjMux.exc := 0.U - - val scmp = Module(new FCMP(FPU.f32.expWidth, FPU.f32.precision)) - val dcmp = Module(new FCMP(FPU.f64.expWidth, FPU.f64.precision)) - val lt = VecInit(Seq(scmp, dcmp).map { fcmp => - fcmp.io.a := src1 - fcmp.io.b := src2 - fcmp.io.signaling := !rmReg(1) - fcmp.io.lt || (fcmp.io.a.asSInt < 0.S && fcmp.io.b.asSInt >= 0.S) - })(inTag) - - val fminmax = FPU.ftypes map { t => - val fp_a = FloatPoint.fromUInt(src1, t.expWidth, t.precision).decode - val fp_b = FloatPoint.fromUInt(src2, t.expWidth, t.precision).decode - val isnan1 = fp_a.isNaN - val isnan2 = fp_b.isNaN - val isInv = fp_a.isSNaN || fp_b.isSNaN - val isNaNOut = isnan1 && isnan2 - val isLHS = isnan2 || rmReg(0) =/= lt && !isnan1 - val data = Mux(isNaNOut, - FloatPoint.defaultNaNUInt(t.expWidth, t.precision), - Mux(isLHS, src1, src2) - ) - val exc = Cat(isInv, 0.U(4.W)) - (data, exc) - } - val (fminmax_data, fminmax_exc) = fminmax.unzip - when(wflags){ - fsgnjMux.exc := VecInit(fminmax_exc)(inTag) - fsgnjMux.data := VecInit(fminmax_data)(inTag) - } - -// val lt = dcmp.io.lt || (dcmp.io.a.asSInt < 0.S && dcmp.io.b.asSInt >= 0.S) - - val mux = WireInit(fsgnjMux) - - val s2d = Module(new fudian.FPToFP( - FPU.f32.expWidth, FPU.f32.precision, - FPU.f64.expWidth, FPU.f64.precision - )) - - val d2s = Module(new fudian.FPToFP( - FPU.f64.expWidth, FPU.f64.precision, - FPU.f32.expWidth, FPU.f32.precision - )) - - for(fcvt <- Seq(s2d, d2s)){ - fcvt.io.in := src1 - fcvt.io.rm := rmReg - } - - val fcvt_data = Mux(inTag === FPU.D, d2s.io.result, s2d.io.result) - val fcvt_exc = Mux(inTag === FPU.D, d2s.io.fflags, s2d.io.fflags) - - when(ctrl.fcvt){ - mux.data := fcvt_data - mux.exc := fcvt_exc - } - - val boxed_data = Mux(outTag === FPU.S, - FPU.box(mux.data, FPU.S), - FPU.box(mux.data, FPU.D) - ) - - io.out.data := RegEnable(boxed_data, regEnables(1)) - io.out.fflags := RegEnable(mux.exc, regEnables(1)) -} - -class FPToFP(cfg: FuConfig)(implicit p: Parameters) extends FPUPipelineModule(cfg) { - - override def latency: Int = cfg.latency.latencyVal.get - - override val dataModule = Module(new FPToFPDataModule(latency)) - connectDataModule - dataModule.regEnables <> VecInit((1 to latency) map (i => regEnable(i))) -} diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala b/src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala deleted file mode 100644 index 93fcada046f..00000000000 --- 
a/src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala +++ /dev/null @@ -1,142 +0,0 @@ -/*************************************************************************************** -* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences -* Copyright (c) 2020-2021 Peng Cheng Laboratory -* -* XiangShan is licensed under Mulan PSL v2. -* You can use this software according to the terms and conditions of the Mulan PSL v2. -* You may obtain a copy of Mulan PSL v2 at: -* http://license.coscl.org.cn/MulanPSL2 -* -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -* -* See the Mulan PSL v2 for more details. -***************************************************************************************/ - -// See LICENSE.Berkeley for license details. -// See LICENSE.SiFive for license details. - -package xiangshan.backend.fu.fpu - -import org.chipsalliance.cde.config.Parameters -import chisel3._ -import chisel3.util._ -import fudian.FCMP -import utility.SignExt -import xiangshan.backend.fu.FuConfig - - - -class FPToIntDataModule(latency: Int)(implicit p: Parameters) extends FPUDataModule { - val regEnables = IO(Input(Vec(latency, Bool()))) - val (src1, src2) = (io.in.src(0), io.in.src(1)) - - val ctrl = io.in.fpCtrl - - // stage 1: unbox inputs - val src1_d = RegEnable(FPU.unbox(src1, ctrl.typeTagIn), regEnables(0)) - val src2_d = RegEnable(FPU.unbox(src2, ctrl.typeTagIn), regEnables(0)) - val ctrl_reg = RegEnable(ctrl, regEnables(0)) - val rm_reg = RegEnable(rm, regEnables(0)) - - // stage2 - - val src1_ieee = src1_d - val move_out = Mux(ctrl_reg.typeTagIn === FPU.S, - src1_ieee(FPU.f32.len - 1, 0), - src1_ieee - ) - - def classify(x: UInt, ftype: FPU.FType): UInt = { - val float = fudian.FloatPoint.fromUInt(x, ftype.expWidth, ftype.precision) - val decode = float.decode - val isNormal = !decode.expIsOnes && !decode.expIsZero - Cat( - decode.isQNaN, - decode.isSNaN, - decode.isInf && !float.sign, - isNormal && !float.sign, - decode.isSubnormal && !float.sign, - decode.isZero && !float.sign, - decode.isZero && float.sign, - decode.isSubnormal && float.sign, - isNormal && float.sign, - decode.isInf && float.sign - ) - } - - val classify_out = Mux(ctrl_reg.typeTagIn === FPU.S, - classify(src1_d(31, 0), FPU.f32), - classify(src1_d, FPU.f64) - ) - - val scmp = Module(new FCMP(FPU.f32.expWidth, FPU.f32.precision)) - val dcmp = Module(new FCMP(FPU.f64.expWidth, FPU.f64.precision)) - - for(mod <- Seq(scmp, dcmp)){ - mod.io.a := src1_d - mod.io.b := src2_d - mod.io.signaling := !rm_reg(1) - } - val lt = Mux(ctrl_reg.typeTagIn === FPU.S, - scmp.io.lt, - dcmp.io.lt - ) - val eq = Mux(ctrl_reg.typeTagIn === FPU.S, - scmp.io.eq, - dcmp.io.eq - ) - - val cmp_out = ((~rm_reg).asUInt & Cat(lt, eq)).orR - val cmp_exc = Mux(ctrl_reg.typeTagIn === FPU.S, - scmp.io.fflags, - dcmp.io.fflags - ) - - val s2i = Module(new fudian.FPToInt(FPU.f32.expWidth, FPU.f32.precision)) - val d2i = Module(new fudian.FPToInt(FPU.f64.expWidth, FPU.f64.precision)) - - for(f2i <- Seq(s2i, d2i)){ - f2i.io.a := src1_d - f2i.io.rm := rm_reg - f2i.io.op := Cat( - ctrl_reg.typ(1), - !ctrl_reg.typ(0) - ) - } - - val conv_out = Mux(ctrl_reg.typeTagIn === FPU.S, - s2i.io.result, - d2i.io.result - ) - val conv_exc = Mux(ctrl_reg.typeTagIn === FPU.S, - s2i.io.fflags, - d2i.io.fflags - ) - - val intData = Wire(UInt(XLEN.W)) - intData := Mux(ctrl_reg.wflags, - 
Mux(ctrl_reg.fcvt, conv_out, cmp_out), - Mux(rm_reg(0), classify_out, move_out) - ) - val long = Mux(ctrl_reg.fcvt, ctrl_reg.typ(1), ctrl_reg.fmt(0)) - val intValue = RegEnable(Mux(long, - SignExt(intData, XLEN), - SignExt(intData(31, 0), XLEN) - ), regEnables(1)) - - val exc = RegEnable(Mux(ctrl_reg.fcvt, conv_exc, cmp_exc), regEnables(1)) - - io.out.data := intValue - io.out.fflags := exc -} - -class FPToInt(cfg: FuConfig)(implicit p: Parameters) extends FPUPipelineModule(cfg) { - - override def latency = cfg.latency.latencyVal.get - - override val dataModule = Module(new FPToIntDataModule(latency)) - connectDataModule - dataModule.regEnables <> VecInit((1 to latency) map (i => regEnable(i))) -} diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FpPipedFuncUnit.scala b/src/main/scala/xiangshan/backend/fu/fpu/FpPipedFuncUnit.scala index 489f71bbae7..fbb4bb925c3 100644 --- a/src/main/scala/xiangshan/backend/fu/fpu/FpPipedFuncUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/FpPipedFuncUnit.scala @@ -9,8 +9,7 @@ import xiangshan.backend.fu.{FuConfig, FuncUnit, HasPipelineReg} trait FpFuncUnitAlias { this: FuncUnit => protected val inCtrl = io.in.bits.ctrl protected val inData = io.in.bits.data - protected val fpCtrl = inCtrl.vpu.get - protected val fp_fmt = fpCtrl.vsew // TODO: use fpu + protected val fp_fmt = inCtrl.fpu.get.fmt protected val frm = io.frm.getOrElse(0.U(3.W)) protected val instRm = inCtrl.fpu.getOrElse(0.U.asTypeOf(new FPUCtrlSignals)).rm diff --git a/src/main/scala/xiangshan/backend/fu/wrapper/FALU.scala b/src/main/scala/xiangshan/backend/fu/wrapper/FALU.scala index d936c9eee84..c6a6757d268 100644 --- a/src/main/scala/xiangshan/backend/fu/wrapper/FALU.scala +++ b/src/main/scala/xiangshan/backend/fu/wrapper/FALU.scala @@ -21,14 +21,14 @@ class FAlu(cfg: FuConfig)(implicit p: Parameters) extends FpPipedFuncUnit(cfg) { // modules private val falu = Module(new FloatAdder) - val fp_aIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src1.head(32).andR || - fp_fmt === VSew.e16 && !src1.head(48).andR - val fp_bIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src0.head(32).andR || + val fp_aIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src0.head(32).andR || fp_fmt === VSew.e16 && !src0.head(48).andR + val fp_bIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src1.head(32).andR || + fp_fmt === VSew.e16 && !src1.head(48).andR falu.io.fire := io.in.valid - falu.io.fp_a := src1 - falu.io.fp_b := src0 + falu.io.fp_a := src0 + falu.io.fp_b := src1 falu.io.round_mode := rm falu.io.fp_format := fp_fmt falu.io.op_code := opcode diff --git a/src/main/scala/xiangshan/backend/fu/wrapper/FDivSqrt.scala b/src/main/scala/xiangshan/backend/fu/wrapper/FDivSqrt.scala index 1cbaa13f0fe..05a9f449f3b 100644 --- a/src/main/scala/xiangshan/backend/fu/wrapper/FDivSqrt.scala +++ b/src/main/scala/xiangshan/backend/fu/wrapper/FDivSqrt.scala @@ -22,10 +22,10 @@ class FDivSqrt(cfg: FuConfig)(implicit p: Parameters) extends FpNonPipedFuncUnit // modules private val fdiv = Module(new FloatDivider) - val fp_aIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src1.head(32).andR || - fp_fmt === VSew.e16 && !src1.head(48).andR - val fp_bIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src0.head(32).andR || + val fp_aIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src0.head(32).andR || fp_fmt === VSew.e16 && !src0.head(48).andR + val fp_bIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src1.head(32).andR || + fp_fmt === VSew.e16 && !src1.head(48).andR val thisRobIdx = Wire(new RobPtr) when(io.in.ready){ @@ -38,8 +38,8 @@ class FDivSqrt(cfg: 
FuConfig)(implicit p: Parameters) extends FpNonPipedFuncUnit fdiv.io.finish_ready_i := io.out.ready & io.out.valid fdiv.io.flush_i := thisRobIdx.needFlush(io.flush) fdiv.io.fp_format_i := fp_fmt - fdiv.io.opa_i := src1 - fdiv.io.opb_i := src0 + fdiv.io.opa_i := src0 + fdiv.io.opb_i := src1 fdiv.io.is_sqrt_i := opcode fdiv.io.rm_i := rm fdiv.io.fp_aIsFpCanonicalNAN := fp_aIsFpCanonicalNAN @@ -47,9 +47,9 @@ class FDivSqrt(cfg: FuConfig)(implicit p: Parameters) extends FpNonPipedFuncUnit private val resultData = Mux1H( Seq( - (outCtrl.vpu.get.vsew === VSew.e16) -> Cat(Fill(48, 1.U), fdiv.io.fpdiv_res_o(15, 0)), - (outCtrl.vpu.get.vsew === VSew.e32) -> Cat(Fill(32, 1.U), fdiv.io.fpdiv_res_o(31, 0)), - (outCtrl.vpu.get.vsew === VSew.e64) -> fdiv.io.fpdiv_res_o + (outCtrl.fpu.get.fmt === VSew.e16) -> Cat(Fill(48, 1.U), fdiv.io.fpdiv_res_o(15, 0)), + (outCtrl.fpu.get.fmt === VSew.e32) -> Cat(Fill(32, 1.U), fdiv.io.fpdiv_res_o(31, 0)), + (outCtrl.fpu.get.fmt === VSew.e64) -> fdiv.io.fpdiv_res_o ) ) private val fflagsData = fdiv.io.fflags_o diff --git a/src/main/scala/xiangshan/backend/fu/wrapper/FMA.scala b/src/main/scala/xiangshan/backend/fu/wrapper/FMA.scala index 2b5d7e6c0e6..6f1279a7bbd 100644 --- a/src/main/scala/xiangshan/backend/fu/wrapper/FMA.scala +++ b/src/main/scala/xiangshan/backend/fu/wrapper/FMA.scala @@ -22,16 +22,16 @@ class FMA(cfg: FuConfig)(implicit p: Parameters) extends FpPipedFuncUnit(cfg) { // modules private val fma = Module(new FloatFMA) - val fp_aIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src1.head(32).andR || - fp_fmt === VSew.e16 && !src1.head(48).andR - val fp_bIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src0.head(32).andR || + val fp_aIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src0.head(32).andR || fp_fmt === VSew.e16 && !src0.head(48).andR + val fp_bIsFpCanonicalNAN = fp_fmt === VSew.e32 && !src1.head(32).andR || + fp_fmt === VSew.e16 && !src1.head(48).andR val fp_cIsFpCanonicalNAN = !(opcode === VfmaType.vfmul) && (fp_fmt === VSew.e32 && !src2.head(32).andR || fp_fmt === VSew.e16 && !src2.head(48).andR) fma.io.fire := io.in.valid - fma.io.fp_a := src1 - fma.io.fp_b := src0 + fma.io.fp_a := src0 + fma.io.fp_b := src1 fma.io.fp_c := src2 fma.io.round_mode := rm fma.io.fp_format := fp_fmt From ee2f6cd3eba69cda726985f3dc48fb2cae1f4278 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Sun, 15 Dec 2024 14:26:06 +0800 Subject: [PATCH 22/32] fix(ctrlBlock): fix bug of useSnpt when only flag diffrence --- src/main/scala/xiangshan/backend/CtrlBlock.scala | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 9ceebc32bda..2380f881e6d 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -464,10 +464,13 @@ class CtrlBlockImp( val flushVecNext = flushVec zip snpt.io.valids map (x => GatedValidRegNext(x._1 && x._2, false.B)) snpt.io.flushVec := flushVecNext - val useSnpt = VecInit.tabulate(RenameSnapshotNum)(idx => - snpt.io.valids(idx) && (s1_s3_redirect.bits.robIdx > snpt.io.snapshots(idx).robIdx.head || - !s1_s3_redirect.bits.flushItself() && s1_s3_redirect.bits.robIdx === snpt.io.snapshots(idx).robIdx.head) - ).reduceTree(_ || _) + val redirectRobidx = s1_s3_redirect.bits.robIdx + val useSnpt = VecInit.tabulate(RenameSnapshotNum){ case idx => + val snptRobidx = snpt.io.snapshots(idx).robIdx.head + // (redirectRobidx.value =/= snptRobidx.value) for only flag diffrence 
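The extra value comparison added in this hunk covers a corner case of wrap-around queue pointers: a pointer is a (flag, value) pair whose flag toggles on each wrap, and under the usual ordering convention a redirect pointer that shares the snapshot pointer's value but carries the opposite flag still compares as greater, so without the guard such a snapshot would wrongly be selected. Below is a small plain-Scala model of that convention and of the corrected predicate; the ordering rule shown is an assumption mirroring the common circular-pointer helper, not the utility library source.

// Software model of a wrap-around queue pointer: (flag, value), flag toggles on wrap.
final case class QPtr(flag: Boolean, value: Int)

object SnapshotSelectModel {
  // Assumed ordering convention: "greater" iff flags differ XOR value is greater.
  def gt(a: QPtr, b: QPtr): Boolean = (a.flag != b.flag) ^ (a.value > b.value)

  // Corrected predicate from the hunk above: a snapshot is usable only if the redirect
  // is strictly younger by value as well, or points at the same uop without flushing it.
  def useSnapshot(redirect: QPtr, snpt: QPtr, flushItself: Boolean): Boolean =
    (gt(redirect, snpt) && redirect.value != snpt.value) ||
      (!flushItself && redirect == snpt)

  def main(args: Array[String]): Unit = {
    val snpt = QPtr(flag = false, value = 12)
    val sameSlotWrapped = QPtr(flag = true, value = 12)
    assert(gt(sameSlotWrapped, snpt))                               // plain ">" is true on a flag-only difference
    assert(!useSnapshot(sameSlotWrapped, snpt, flushItself = true)) // but the snapshot must not be used
    assert(useSnapshot(QPtr(flag = false, value = 15), snpt, flushItself = true))
    println("snapshot selection model ok")
  }
}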
+ snpt.io.valids(idx) && ((redirectRobidx > snptRobidx) && (redirectRobidx.value =/= snptRobidx.value) || + !s1_s3_redirect.bits.flushItself() && redirectRobidx === snptRobidx) + }.reduceTree(_ || _) val snptSelect = MuxCase( 0.U(log2Ceil(RenameSnapshotNum).W), (1 to RenameSnapshotNum).map(i => (snpt.io.enqPtr - i.U).value).map(idx => From 8593f2a778f32c0c8fc53ed6f4177c6358f7b1e4 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Sun, 15 Dec 2024 14:27:49 +0800 Subject: [PATCH 23/32] timing(vldMgu): fix timing of wbReg's gate enable --- src/main/scala/xiangshan/backend/datapath/VldMergeUnit.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/datapath/VldMergeUnit.scala b/src/main/scala/xiangshan/backend/datapath/VldMergeUnit.scala index 149db61b249..40fc0cee578 100644 --- a/src/main/scala/xiangshan/backend/datapath/VldMergeUnit.scala +++ b/src/main/scala/xiangshan/backend/datapath/VldMergeUnit.scala @@ -20,7 +20,7 @@ class VldMergeUnit(val params: ExeUnitParams)(implicit p: Parameters) extends XS val vdAfterMerge = Wire(UInt(VLEN.W)) val wbFire = !io.writeback.bits.robIdx.needFlush(io.flush) && io.writeback.fire - wbReg.bits := Mux(wbFire, io.writeback.bits, wbReg.bits) + wbReg.bits := Mux(io.writeback.fire, io.writeback.bits, wbReg.bits) wbReg.valid := wbFire mgu.io.in.vd := wbReg.bits.data(0) // oldVd is contained in data and is already masked with new data From 10c8cfef901c577a1c16485e60d5ba86908d3e4d Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Sun, 15 Dec 2024 14:29:22 +0800 Subject: [PATCH 24/32] area(intRegFile): change intRegFile splitNum to 4 --- src/main/scala/xiangshan/backend/datapath/DataPath.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/datapath/DataPath.scala b/src/main/scala/xiangshan/backend/datapath/DataPath.scala index cd0ee96fa30..ee386869806 100644 --- a/src/main/scala/xiangshan/backend/datapath/DataPath.scala +++ b/src/main/scala/xiangshan/backend/datapath/DataPath.scala @@ -306,7 +306,7 @@ class DataPathImp(override val wrapper: DataPath)(implicit p: Parameters, params io.diffVl.foreach(_ := vlDiffReadData.get) - IntRegFileSplit("IntRegFile", intSchdParams.numPregs, splitNum = 2, intRfRaddr, intRfRdata, intRfWen, intRfWaddr, intRfWdata, + IntRegFileSplit("IntRegFile", intSchdParams.numPregs, splitNum = 4, intRfRaddr, intRfRdata, intRfWen, intRfWaddr, intRfWdata, bankNum = 1, debugReadAddr = intDiffRead.map(_._1), debugReadData = intDiffRead.map(_._2) From 293a3b8937d8afcfefcad6194cc4c4cc2a7b94ec Mon Sep 17 00:00:00 2001 From: linzhida Date: Mon, 16 Dec 2024 12:29:34 +0800 Subject: [PATCH 25/32] timing(zacas): move isDropAmocasSta logic gen from Scheduler to NewDispatch --- src/main/scala/xiangshan/backend/Bundles.scala | 1 + src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala | 4 ++++ src/main/scala/xiangshan/backend/issue/Scheduler.scala | 6 +++++- src/main/scala/xiangshan/backend/rename/Rename.scala | 1 + 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/Bundles.scala b/src/main/scala/xiangshan/backend/Bundles.scala index 5def275061a..d60ae05603a 100644 --- a/src/main/scala/xiangshan/backend/Bundles.scala +++ b/src/main/scala/xiangshan/backend/Bundles.scala @@ -202,6 +202,7 @@ object Bundles { val vlsInstr = Bool() val wfflags = Bool() val isMove = Bool() + val isDropAmocasSta = Bool() val uopIdx = UopIdx() val isVset = Bool() val firstUop = Bool() diff --git 
a/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala index fbcf99eb89c..40a9d81d945 100644 --- a/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala +++ b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala @@ -391,6 +391,10 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi fromRenameUpdate(i).valid := fromRename(i).valid && allowDispatch(i) && !uopBlockByIQ(i) && thisCanActualOut(i) && lsqCanAccept && !fromRename(i).bits.eliminatedMove fromRename(i).ready := allowDispatch(i) && !uopBlockByIQ(i) && thisCanActualOut(i) && lsqCanAccept } + for (i <- 0 until RenameWidth){ + // check is drop amocas sta + fromRenameUpdate(i).bits.isDropAmocasSta := fromRename(i).bits.isAMOCAS && fromRename(i).bits.uopIdx(0) === 1.U + } var temp = 0 allIssueParams.zipWithIndex.map{ case(issue, iqidx) => { for (i <- 0 until issue.numEnq){ diff --git a/src/main/scala/xiangshan/backend/issue/Scheduler.scala b/src/main/scala/xiangshan/backend/issue/Scheduler.scala index 917d6072cd1..fba881f2c25 100644 --- a/src/main/scala/xiangshan/backend/issue/Scheduler.scala +++ b/src/main/scala/xiangshan/backend/issue/Scheduler.scala @@ -385,6 +385,10 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc staIdx.zipWithIndex.map{ case (sta, i) => { io.fromDispatch.uops(sta).ready := staReady(i) && stdReady(i) }} + issueQueues.filter(iq => iq.params.StaCnt > 0).zip(staIdx).zipWithIndex.map{ case ((iq, idx),i) => + iq.io.enq(i).valid := io.fromDispatch.uops(idx).valid && !io.fromDispatch.uops(idx).bits.isDropAmocasSta + } + val staValidFromDispatch = staIdx.map(idx => io.fromDispatch.uops(idx).valid) val memAddrIQs = issueQueues.filter(_.params.isMemAddrIQ) val stAddrIQs = issueQueues.filter(iq => iq.params.StaCnt > 0) // included in memAddrIQs val ldAddrIQs = issueQueues.filter(iq => iq.params.LduCnt > 0) @@ -493,7 +497,7 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc } (stdEnqs ++ hydEnqs).zip(staEnqs ++ hyaEnqs).zipWithIndex.foreach { case ((stdIQEnq, staIQEnq), i) => - stdIQEnq.valid := staIQEnq.valid && FuType.FuTypeOrR(staIQEnq.bits.fuType, FuType.stu, FuType.mou) + stdIQEnq.valid := staValidFromDispatch(i) stdIQEnq.bits := staIQEnq.bits // Store data reuses store addr src(1) in dispatch2iq // [dispatch2iq] --src*------src*(0)--> [staIQ|hyaIQ] diff --git a/src/main/scala/xiangshan/backend/rename/Rename.scala b/src/main/scala/xiangshan/backend/rename/Rename.scala index 57b3fdef48b..8d4855ac611 100644 --- a/src/main/scala/xiangshan/backend/rename/Rename.scala +++ b/src/main/scala/xiangshan/backend/rename/Rename.scala @@ -197,6 +197,7 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe uop.useRegCache := DontCare uop.regCacheIdx := DontCare uop.traceBlockInPipe := DontCare + uop.isDropAmocasSta := DontCare }) private val inst = Wire(Vec(RenameWidth, new XSInstBitFields)) private val isCsr = Wire(Vec(RenameWidth, Bool())) From 1250f7e083adc951555d5fc3095d1d6bd117b490 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Mon, 16 Dec 2024 14:28:09 +0800 Subject: [PATCH 26/32] fix(dispatch): fix bug of index vld instr, each uop can be index vld instr --- src/main/scala/xiangshan/Parameters.scala | 2 +- .../backend/dispatch/NewDispatch.scala | 54 ++++++------------- 2 files changed, 18 insertions(+), 38 deletions(-) diff --git a/src/main/scala/xiangshan/Parameters.scala 
b/src/main/scala/xiangshan/Parameters.scala index 94f4082569e..b236e8594fe 100644 --- a/src/main/scala/xiangshan/Parameters.scala +++ b/src/main/scala/xiangshan/Parameters.scala @@ -229,7 +229,7 @@ case class XSCoreParameters VecMemDispatchWidth: Int = 1, VecMemDispatchMaxNumber: Int = 16, VecMemUnitStrideMaxFlowNum: Int = 2, - VecMemLSQEnqIteratorNumberSeq: Seq[Int] = Seq(16, 2, 2, 2, 2, 2), + VecMemLSQEnqIteratorNumberSeq: Seq[Int] = Seq(16, 16, 16, 16, 16, 16), StoreBufferSize: Int = 16, StoreBufferThreshold: Int = 7, EnsbufferWidth: Int = 2, diff --git a/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala index 40a9d81d945..53e29e27aea 100644 --- a/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala +++ b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala @@ -26,7 +26,7 @@ import xiangshan.backend.MemCoreTopDownIO import xiangshan.backend.rob.{RobDispatchTopDownIO, RobEnqIO} import xiangshan.mem.mdp._ import xiangshan.mem.{HasVLSUParameters, _} -import xiangshan.backend.Bundles.{DecodedInst, DynInst, ExuOH, ExuVec, IssueQueueIQWakeUpBundle} +import xiangshan.backend.Bundles.{DecodedInst, DynInst, ExuVec, IssueQueueIQWakeUpBundle} import xiangshan.backend.fu.{FuConfig, FuType} import xiangshan.backend.rename.BusyTable import chisel3.util.experimental.decode._ @@ -524,6 +524,7 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi } private val isVlsType = fuType.map(fuTypeItem => FuType.isVls(fuTypeItem)).zip(fromRename.map(_.valid)).map(x => x._1 && x._2) + private val isLSType = fuType.map(fuTypeItem => FuType.isLoad(fuTypeItem) || FuType.isStore(fuTypeItem)).zip(fromRename.map(_.valid)).map(x => x._1 && x._2) private val isSegment = fuType.map(fuTypeItem => FuType.isVsegls(fuTypeItem)).zip(fromRename.map(_.valid)).map(x => x._1 && x._2) // TODO private val isUnitStride = fuOpType.map(fuOpTypeItem => LSUOpType.isAllUS(fuOpTypeItem)) @@ -542,13 +543,13 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi // The 'allowDispatch' calculations are done conservatively for timing purposes: // The Flow of scalar instructions is considered 1, // The flow of vector 'unit-stride' instructions is considered 2, and the flow of other vector instructions is considered 16. - private val conserveFlows = isVlsType.zipWithIndex.map { case (isVlsTyepItem, index) => + private val conserveFlows = VecInit(isVlsType.zip(isLSType).zipWithIndex.map { case ((isVlsTyepItem, isLSTypeItem), index) => Mux( isVlsTyepItem, - if (index == 0) Mux(isUnitStride(index), VecMemUnitStrideMaxFlowNum.U, 16.U) else VecMemUnitStrideMaxFlowNum.U, - 1.U + Mux(isUnitStride(index), VecMemUnitStrideMaxFlowNum.U, 16.U), + Mux(isLSTypeItem, 1.U, 0.U) ) - } + }) // A conservative allocation strategy is adopted here. 
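The conserveFlows rewrite above budgets load/store queue entries per dispatch slot conservatively: a scalar memory uop counts as one flow, a vector unit-stride uop as VecMemUnitStrideMaxFlowNum (2), any other vector memory uop as the worst-case 16, and a non-memory uop as zero; a slot may then dispatch only if the running prefix sum of flows still fits in the free queue entries and every earlier slot was allowed. A plain-Scala sketch of that accounting follows; it folds the separate load-queue and store-queue counters of the real code into a single freeEntries value, and all names are illustrative.

// Software sketch of the conservative LSQ flow budgeting used by the dispatch hunk above.
object ConserveFlowModel {
  sealed trait UopKind
  case object Scalar extends UopKind        // scalar load/store: 1 flow
  case object VecUnitStride extends UopKind // vector unit-stride: 2 flows (VecMemUnitStrideMaxFlowNum)
  case object VecOther extends UopKind      // other vector memory: worst case 16 flows
  case object NonMem extends UopKind        // not a memory uop: 0 flows

  def flows(k: UopKind): Int = k match {
    case Scalar        => 1
    case VecUnitStride => 2
    case VecOther      => 16
    case NonMem        => 0
  }

  // allowDispatch(i): the inclusive prefix sum of flows up to slot i must fit in freeEntries
  // (non-memory uops skip the count check), and all earlier slots must already be allowed.
  def allowDispatch(kinds: Seq[UopKind], freeEntries: Int): Seq[Boolean] = {
    val totals = kinds.map(flows).scanLeft(0)(_ + _).tail
    kinds.zip(totals).scanLeft(true) { case (prevOk, (kind, total)) =>
      val fits = if (kind == NonMem) true else freeEntries > total
      prevOk && fits
    }.tail
  }

  def main(args: Array[String]): Unit = {
    println(allowDispatch(Seq(Scalar, VecUnitStride, VecOther, Scalar), freeEntries = 8))
    // e.g. List(true, true, false, false): the 16-flow uop and everything after it are blocked
  }
}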
// Vector 'unit-stride' instructions and scalar instructions can be issued from all six ports, @@ -563,27 +564,16 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi for (index <- allowDispatch.indices) { val flowTotal = Wire(UInt(log2Up(VirtualLoadQueueMaxStoreQueueSize + 1).W)) flowTotal := conserveFlows.take(index + 1).reduce(_ +& _) - if (index == 0) { - when(isStoreVec(index) || isVStoreVec(index)) { - allowDispatch(index) := sqFreeCount > flowTotal - }.elsewhen(isLoadVec(index) || isVLoadVec(index)) { - allowDispatch(index) := lqFreeCount > flowTotal - }.elsewhen(isAMOVec(index)) { - allowDispatch(index) := true.B - }.otherwise { - allowDispatch(index) := true.B - } - } - else { - when(isStoreVec(index) || isVStoreVec(index)) { - allowDispatch(index) := (sqFreeCount > flowTotal) && (isVecUnitType(index) || !isVlsType(index)) && allowDispatch(index - 1) - }.elsewhen(isLoadVec(index) || isVLoadVec(index)) { - allowDispatch(index) := (lqFreeCount > flowTotal) && (isVecUnitType(index) || !isVlsType(index)) && allowDispatch(index - 1) - }.elsewhen(isAMOVec(index)) { - allowDispatch(index) := allowDispatch(index - 1) - }.otherwise { - allowDispatch(index) := allowDispatch(index - 1) - } + val allowDispatchPrevious = if (index == 0) true.B else allowDispatch(index - 1) + val allowDispatchThisUop = true.B + when(isStoreVec(index) || isVStoreVec(index)) { + allowDispatch(index) := (sqFreeCount > flowTotal) && allowDispatchThisUop && allowDispatchPrevious + }.elsewhen(isLoadVec(index) || isVLoadVec(index)) { + allowDispatch(index) := (lqFreeCount > flowTotal) && allowDispatchThisUop && allowDispatchPrevious + }.elsewhen(isAMOVec(index)) { + allowDispatch(index) := allowDispatchPrevious + }.otherwise { + allowDispatch(index) := allowDispatchPrevious } } @@ -610,17 +600,6 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi s0_enqLsq_resp(i) := enqLsqIO.resp(i) } - - - - - - - - - - - val isFp = VecInit(fromRename.map(req => FuType.isFArith(req.bits.fuType))) val isVec = VecInit(fromRename.map(req => FuType.isVArith (req.bits.fuType) || FuType.isVsetRvfWvf(req.bits.fuType))) @@ -736,6 +715,7 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi } if(backendParams.debugEn){ dontTouch(blockedByWaitForward) + dontTouch(conserveFlows) } // Only the uop with block backward flag will block the next uop From cd88eb775fdcee6405fd890bfe8a374c7d0405b5 Mon Sep 17 00:00:00 2001 From: chengguanghui Date: Mon, 16 Dec 2024 14:10:11 +0800 Subject: [PATCH 27/32] area(trace, pcMem): Trace only get `startAddr` from pcmem --- src/main/scala/xiangshan/backend/Backend.scala | 3 +-- src/main/scala/xiangshan/backend/CtrlBlock.scala | 9 +++++---- src/main/scala/xiangshan/backend/MemBlock.scala | 9 ++++++--- .../scala/xiangshan/backend/trace/Interface.scala | 3 ++- .../scala/xiangshan/backend/trace/Trace.scala | 15 +++------------ 5 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/main/scala/xiangshan/backend/Backend.scala b/src/main/scala/xiangshan/backend/Backend.scala index bea489e5ac3..b0e208b7583 100644 --- a/src/main/scala/xiangshan/backend/Backend.scala +++ b/src/main/scala/xiangshan/backend/Backend.scala @@ -1039,8 +1039,7 @@ class BackendIO(implicit p: Parameters, params: BackendParams) extends XSBundle val toTop = new BackendToTopBundle - val traceCoreInterface = new TraceCoreInterface - + val traceCoreInterface = new TraceCoreInterface(hasOffset = true) val fenceio = new FenceIO // Todo: merge these 
bundles into BackendFrontendIO val frontend = Flipped(new FrontendToCtrlIO) diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 2380f881e6d..09a4cd8405b 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -278,12 +278,12 @@ class CtrlBlockImp( trace.io.in.fromEncoder.enable := io.traceCoreInterface.fromEncoder.enable trace.io.in.fromRob := rob.io.trace.traceCommitInfo rob.io.trace.blockCommit := trace.io.out.blockRobCommit - + val tracePcStart = Wire(Vec(TraceGroupNum, UInt(IaddrWidth.W))) for ((pcMemIdx, i) <- pcMemRdIndexes("trace").zipWithIndex) { val traceValid = trace.toPcMem.blocks(i).valid pcMem.io.ren.get(pcMemIdx) := traceValid pcMem.io.raddr(pcMemIdx) := trace.toPcMem.blocks(i).bits.ftqIdx.get.value - trace.io.in.fromPcMem(i) := pcMem.io.rdata(pcMemIdx).getPc(RegEnable(trace.toPcMem.blocks(i).bits.ftqOffset.get, traceValid)) + tracePcStart(i) := pcMem.io.rdata(pcMemIdx).startAddr } // Trap/Xret only occur in block(0). @@ -296,7 +296,8 @@ class CtrlBlockImp( io.traceCoreInterface.toEncoder.priv := tracePriv (0 until TraceGroupNum).foreach(i => { io.traceCoreInterface.toEncoder.groups(i).valid := trace.io.out.toEncoder.blocks(i).valid - io.traceCoreInterface.toEncoder.groups(i).bits.iaddr := trace.io.out.toEncoder.blocks(i).bits.iaddr.getOrElse(0.U) + io.traceCoreInterface.toEncoder.groups(i).bits.iaddr := tracePcStart(i) + io.traceCoreInterface.toEncoder.groups(i).bits.ftqOffset.foreach(_ := trace.io.out.toEncoder.blocks(i).bits.ftqOffset.getOrElse(0.U)) io.traceCoreInterface.toEncoder.groups(i).bits.itype := trace.io.out.toEncoder.blocks(i).bits.tracePipe.itype io.traceCoreInterface.toEncoder.groups(i).bits.iretire := trace.io.out.toEncoder.blocks(i).bits.tracePipe.iretire io.traceCoreInterface.toEncoder.groups(i).bits.ilastsize := trace.io.out.toEncoder.blocks(i).bits.tracePipe.ilastsize @@ -878,7 +879,7 @@ class CtrlBlockIO()(implicit p: Parameters, params: BackendParams) extends XSBun val ratOldPest = new RatToVecExcpMod }) - val traceCoreInterface = new TraceCoreInterface + val traceCoreInterface = new TraceCoreInterface(hasOffset = true) val perfInfo = Output(new Bundle{ val ctrlInfo = new Bundle { diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index dbb0cdea14a..82eb56c75e4 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -335,7 +335,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) val toL2Top = Output(Bool()) } val traceCoreInterfaceBypass = new Bundle{ - val fromBackend = Flipped(new TraceCoreInterface) + val fromBackend = Flipped(new TraceCoreInterface(hasOffset = true)) val toL2Top = new TraceCoreInterface } }) @@ -1940,7 +1940,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) traceFromBackend.toEncoder.priv, traceFromBackend.toEncoder.groups(0).valid ) - (0 until TraceGroupNum).foreach{ i => + (0 until TraceGroupNum).foreach { i => traceToL2Top.toEncoder.groups(i).valid := RegNext(traceFromBackend.toEncoder.groups(i).valid) traceToL2Top.toEncoder.groups(i).bits.iretire := RegNext(traceFromBackend.toEncoder.groups(i).bits.iretire) traceToL2Top.toEncoder.groups(i).bits.itype := RegNext(traceFromBackend.toEncoder.groups(i).bits.itype) @@ -1951,7 +1951,10 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) 
traceToL2Top.toEncoder.groups(i).bits.iaddr := RegEnable( traceFromBackend.toEncoder.groups(i).bits.iaddr, traceFromBackend.toEncoder.groups(i).valid - ) + ) + (RegEnable( + traceFromBackend.toEncoder.groups(i).bits.ftqOffset.getOrElse(0.U), + traceFromBackend.toEncoder.groups(i).valid + ) << instOffsetBits) } diff --git a/src/main/scala/xiangshan/backend/trace/Interface.scala b/src/main/scala/xiangshan/backend/trace/Interface.scala index e80066ee343..207e4094209 100644 --- a/src/main/scala/xiangshan/backend/trace/Interface.scala +++ b/src/main/scala/xiangshan/backend/trace/Interface.scala @@ -36,7 +36,7 @@ class FromEncoder extends Bundle { val stall = Bool() } -class TraceCoreInterface(implicit val p: Parameters) extends Bundle with HasXSParameter { +class TraceCoreInterface(hasOffset: Boolean = false)(implicit val p: Parameters) extends Bundle with HasXSParameter { val fromEncoder = Input(new Bundle { val enable = Bool() val stall = Bool() @@ -49,6 +49,7 @@ class TraceCoreInterface(implicit val p: Parameters) extends Bundle with HasXSPa } val groups = Vec(TraceGroupNum, ValidIO(new Bundle{ val iaddr = UInt(IaddrWidth.W) + val ftqOffset = if (hasOffset) Some(UInt(log2Up(PredictWidth).W)) else None val itype = UInt(ItypeWidth.W) val iretire = UInt(IretireWidthCompressed.W) val ilastsize = UInt(IlastsizeWidth.W) diff --git a/src/main/scala/xiangshan/backend/trace/Trace.scala b/src/main/scala/xiangshan/backend/trace/Trace.scala index 49a6f0086bb..01e6ca4eb0b 100644 --- a/src/main/scala/xiangshan/backend/trace/Trace.scala +++ b/src/main/scala/xiangshan/backend/trace/Trace.scala @@ -17,18 +17,17 @@ class TraceIO(implicit val p: Parameters) extends Bundle with HasXSParameter { val in = new Bundle { val fromEncoder = Input(new FromEncoder) val fromRob = Flipped(new TraceBundle(hasIaddr = false, CommitWidth, IretireWidthInPipe)) - val fromPcMem = Input(Vec(TraceGroupNum, UInt(IaddrWidth.W))) } val out = new Bundle { val toPcMem = new TraceBundle(hasIaddr = false, TraceGroupNum, IretireWidthCompressed) - val toEncoder = new TraceBundle(hasIaddr = true, TraceGroupNum, IretireWidthCompressed) + val toEncoder = new TraceBundle(hasIaddr = false, TraceGroupNum, IretireWidthCompressed) val blockRobCommit = Output(Bool()) } } class Trace(implicit val p: Parameters) extends Module with HasXSParameter { val io = IO(new TraceIO) - val (fromEncoder, fromRob, fromPcMem, toPcMem, toEncoder) = (io.in.fromEncoder, io.in.fromRob, io.in.fromPcMem, io.out.toPcMem, io.out.toEncoder) + val (fromEncoder, fromRob, toPcMem, toEncoder) = (io.in.fromEncoder, io.in.fromRob, io.out.toPcMem, io.out.toEncoder) /** * stage 0: CommitInfo from rob @@ -62,13 +61,5 @@ class Trace(implicit val p: Parameters) extends Module with HasXSParameter { val s3_in_groups = s2_out_groups val s3_out_groups = RegNext(s3_in_groups) toPcMem := s3_in_groups - - for(i <- 0 until TraceGroupNum) { - toEncoder.blocks(i).valid := s3_out_groups.blocks(i).valid - toEncoder.blocks(i).bits.iaddr.foreach(_ := Mux(s3_out_groups.blocks(i).valid, fromPcMem(i), 0.U)) - toEncoder.blocks(i).bits.tracePipe := s3_out_groups.blocks(i).bits.tracePipe - } - if(backendParams.debugEn) { - dontTouch(io.out.toEncoder) - } + io.out.toEncoder := s3_out_groups } From ecdf2e869ade0c9fef7bd6dd6654425b7f98a429 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Mon, 16 Dec 2024 15:21:53 +0800 Subject: [PATCH 28/32] fix(scheduler): fix bug of sta valid --- src/main/scala/xiangshan/backend/issue/Scheduler.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/main/scala/xiangshan/backend/issue/Scheduler.scala b/src/main/scala/xiangshan/backend/issue/Scheduler.scala index fba881f2c25..e77b5848ba6 100644 --- a/src/main/scala/xiangshan/backend/issue/Scheduler.scala +++ b/src/main/scala/xiangshan/backend/issue/Scheduler.scala @@ -385,8 +385,8 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc staIdx.zipWithIndex.map{ case (sta, i) => { io.fromDispatch.uops(sta).ready := staReady(i) && stdReady(i) }} - issueQueues.filter(iq => iq.params.StaCnt > 0).zip(staIdx).zipWithIndex.map{ case ((iq, idx),i) => - iq.io.enq(i).valid := io.fromDispatch.uops(idx).valid && !io.fromDispatch.uops(idx).bits.isDropAmocasSta + issueQueues.filter(iq => iq.params.StaCnt > 0).map(_.io.enq).flatten.zipWithIndex.map{ case (iq, idx) => + iq.valid := io.fromDispatch.uops(staIdx(idx)).valid && !io.fromDispatch.uops(staIdx(idx)).bits.isDropAmocasSta } val staValidFromDispatch = staIdx.map(idx => io.fromDispatch.uops(idx).valid) val memAddrIQs = issueQueues.filter(_.params.isMemAddrIQ) From 2898290d0603796f0cd70a087309b94725a9cb41 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Mon, 16 Dec 2024 18:53:09 +0800 Subject: [PATCH 29/32] fix(dispatch): fix bug of hasException's instr send to iq --- src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala index 53e29e27aea..4fc7fe5e67d 100644 --- a/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala +++ b/src/main/scala/xiangshan/backend/dispatch/NewDispatch.scala @@ -388,7 +388,8 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi val lsqCanAccept = Wire(Bool()) for (i <- 0 until RenameWidth){ // update valid logic - fromRenameUpdate(i).valid := fromRename(i).valid && allowDispatch(i) && !uopBlockByIQ(i) && thisCanActualOut(i) && lsqCanAccept && !fromRename(i).bits.eliminatedMove + fromRenameUpdate(i).valid := fromRename(i).valid && allowDispatch(i) && !uopBlockByIQ(i) && thisCanActualOut(i) && + lsqCanAccept && !fromRename(i).bits.eliminatedMove && !fromRename(i).bits.hasException fromRename(i).ready := allowDispatch(i) && !uopBlockByIQ(i) && thisCanActualOut(i) && lsqCanAccept } for (i <- 0 until RenameWidth){ @@ -734,7 +735,6 @@ class NewDispatch(implicit p: Parameters) extends XSModule with HasPerfEvents wi // (2) previous instructions are ready thisCanActualOut := VecInit((0 until RenameWidth).map(i => !blockedByWaitForward(i) && notBlockedByPrevious(i) && io.enqRob.canAccept)) val thisActualOut = (0 until RenameWidth).map(i => io.enqRob.req(i).valid && io.enqRob.canAccept) - val hasValidException = fromRename.zip(hasException).map(x => x._1.valid && x._2) // input for ROB, LSQ for (i <- 0 until RenameWidth) { From 3932d107c3f87b5b27e5414e204ef6fb1bd9e4cc Mon Sep 17 00:00:00 2001 From: Anzooooo Date: Tue, 17 Dec 2024 18:20:58 +0800 Subject: [PATCH 30/32] fix(LSQ): modify the enq logic This commit modifies the previous silly queue entry. This greatly reduces the generated verilog, making: StoreQueue verilog in StoreQueue from 26W lines -> 5W lines verilog in VirtualLoadQueue from 13W lines -> 2W lines Also, we can no longer limit the number of numLsElem per `io.enq`. 
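The rewrite in this commit replaces the old per-port enqueue loop, which iterated over up to VecMemLSQEnqIteratorNumberSeq(i) element slots and wrote entries through dynamically computed indices, with an entry-centric form: every queue entry checks, for each enqueue port, whether its own index falls inside the half-open range [sqIdx, sqIdx + numLsElem), with a wrap-around variant when the range crosses the end of the queue. One pair of comparators per entry and port replaces the wide dynamically indexed write muxes, which is where the quoted Verilog reduction comes from. A plain-Scala sketch of the membership test, with illustrative names and sizes:

// Software model of the entry-centric enqueue hit test used by the new LSQ enq logic.
// low/up are pointer values only; crossesWrap means the allocated range wraps past the
// last entry, i.e. the low and up pointers carry different wrap flags.
object LsqEnqHitModel {
  def entryHit(entry: Int, low: Int, up: Int, crossesWrap: Boolean): Boolean =
    if (crossesWrap) low <= entry || entry < up // [low, size) ++ [0, up)
    else             low <= entry && entry < up // [low, up)

  def main(args: Array[String]): Unit = {
    val size = 8
    // A range of 3 elements starting at entry 6 wraps onto entries 6, 7, 0.
    val hits = (0 until size).filter(i => entryHit(i, low = 6, up = (6 + 3) % size, crossesWrap = true))
    assert(hits == Seq(0, 6, 7))
    // A non-wrapping range of 2 elements starting at entry 2 covers entries 2 and 3.
    assert((0 until size).filter(i => entryHit(i, low = 2, up = 4, crossesWrap = false)) == Seq(2, 3))
    println("lsq enqueue hit model ok")
  }
}

In the actual hunks the per-port hit additionally gates on canEnqueue and !enqCancel, and a priority mux picks the uop bits of the first port that hits a given entry.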
--- .../xiangshan/mem/lsqueue/StoreQueue.scala | 65 +++++++++++-------- .../mem/lsqueue/VirtualLoadQueue.scala | 55 +++++++++------- 2 files changed, 71 insertions(+), 49 deletions(-) diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala index 25590840e20..5fde17a29da 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala @@ -348,6 +348,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule * Enqueue at dispatch * * Currently, StoreQueue only allows enqueue when #emptyEntries > EnqWidth + * Dynamic enq based on numLsElem number */ io.enq.canAccept := allowEnqueue val canEnqueue = io.enq.req.map(_.valid) @@ -357,38 +358,49 @@ class StoreQueue(implicit p: Parameters) extends XSModule val validVStoreOffset = vStoreFlow.zip(io.enq.needAlloc).map{case (flow, needAllocItem) => Mux(needAllocItem, flow, 0.U)} val validVStoreOffsetRShift = 0.U +: validVStoreOffset.take(vStoreFlow.length - 1) + val enqLowBound = io.enq.req.map(_.bits.sqIdx) + val enqUpBound = io.enq.req.map(x => x.bits.sqIdx + x.bits.numLsElem) + val enqCrossLoop = enqLowBound.zip(enqUpBound).map{case (low, up) => low.flag =/= up.flag} + + for(i <- 0 until StoreQueueSize) { + val entryCanEnqSeq = (0 until io.enq.req.length).map { j => + val entryHitBound = Mux( + enqCrossLoop(j), + enqLowBound(j).value <= i.U || i.U < enqUpBound(j).value, + enqLowBound(j).value <= i.U && i.U < enqUpBound(j).value + ) + canEnqueue(j) && !enqCancel(j) && entryHitBound + } + + val entryCanEnq = entryCanEnqSeq.reduce(_ || _) + val selectBits = ParallelPriorityMux(entryCanEnqSeq, io.enq.req.map(_.bits)) + val selectUpBound = ParallelPriorityMux(entryCanEnqSeq, enqUpBound) + when (entryCanEnq) { + uop(i) := selectBits + vecLastFlow(i) := Mux((i + 1).U === selectUpBound.value, selectBits.lastUop, false.B) + allocated(i) := true.B + datavalid(i) := false.B + addrvalid(i) := false.B + unaligned(i) := false.B + cross16Byte(i) := false.B + committed(i) := false.B + pending(i) := false.B + prefetch(i) := false.B + nc(i) := false.B + mmio(i) := false.B + isVec(i) := FuType.isVStore(selectBits.fuType) + vecMbCommit(i) := false.B + hasException(i) := false.B + waitStoreS2(i) := true.B + } + } + for (i <- 0 until io.enq.req.length) { val sqIdx = enqPtrExt(0) + validVStoreOffsetRShift.take(i + 1).reduce(_ + _) val index = io.enq.req(i).bits.sqIdx - val enqInstr = io.enq.req(i).bits.instr.asTypeOf(new XSInstBitFields) when (canEnqueue(i) && !enqCancel(i)) { - // The maximum 'numLsElem' number that can be emitted per dispatch port is: - // 16 2 2 2 2 2. 
- // Therefore, VecMemLSQEnqIteratorNumberSeq = Seq(16, 2, 2, 2, 2, 2) - for (j <- 0 until VecMemLSQEnqIteratorNumberSeq(i)) { - when (j.U < validVStoreOffset(i)) { - uop((index + j.U).value) := io.enq.req(i).bits - // NOTE: the index will be used when replay - uop((index + j.U).value).sqIdx := sqIdx + j.U - vecLastFlow((index + j.U).value) := Mux((j + 1).U === validVStoreOffset(i), io.enq.req(i).bits.lastUop, false.B) - allocated((index + j.U).value) := true.B - datavalid((index + j.U).value) := false.B - addrvalid((index + j.U).value) := false.B - unaligned((index + j.U).value) := false.B - cross16Byte((index + j.U).value) := false.B - committed((index + j.U).value) := false.B - pending((index + j.U).value) := false.B - prefetch((index + j.U).value) := false.B - nc((index + j.U).value) := false.B - mmio((index + j.U).value) := false.B - isVec((index + j.U).value) := FuType.isVStore(io.enq.req(i).bits.fuType) - vecMbCommit((index + j.U).value) := false.B - hasException((index + j.U).value) := false.B - waitStoreS2((index + j.U).value) := true.B XSError(!io.enq.canAccept || !io.enq.lqCanAccept, s"must accept $i\n") XSError(index.value =/= sqIdx.value, s"must be the same entry $i\n") - } - } } io.enq.resp(i) := sqIdx } @@ -800,6 +812,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule mmioState := s_req uncacheUop := uop(deqPtr) uncacheUop.exceptionVec := 0.U.asTypeOf(ExceptionVec()) + uncacheUop.trigger := 0.U.asTypeOf(TriggerAction()) cboFlushedSb := false.B cboMmioPAddr := paddrModule.io.rdata(0) } diff --git a/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala index 13e4451287a..26566749f05 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala @@ -159,35 +159,44 @@ class VirtualLoadQueue(implicit p: Parameters) extends XSModule * Enqueue at dispatch * * Currently, VirtualLoadQueue only allows enqueue when #emptyEntries > EnqWidth + * Dynamic enq based on numLsElem number */ io.enq.canAccept := allowEnqueue + val enqLowBound = io.enq.req.map(_.bits.lqIdx) + val enqUpBound = io.enq.req.map(x => x.bits.lqIdx + x.bits.numLsElem) + val enqCrossLoop = enqLowBound.zip(enqUpBound).map{case (low, up) => low.flag =/= up.flag} + + for(i <- 0 until VirtualLoadQueueSize) { + val entryCanEnqSeq = (0 until io.enq.req.length).map { j => + val entryHitBound = Mux( + enqCrossLoop(j), + enqLowBound(j).value <= i.U || i.U < enqUpBound(j).value, + enqLowBound(j).value <= i.U && i.U < enqUpBound(j).value + ) + canEnqueue(j) && !enqCancel(j) && entryHitBound + } + val entryCanEnq = entryCanEnqSeq.reduce(_ || _) + val selectBits = ParallelPriorityMux(entryCanEnqSeq, io.enq.req.map(_.bits)) + when (entryCanEnq) { + uop(i) := selectBits + allocated(i) := true.B + datavalid(i) := false.B + addrvalid(i) := false.B + isvec(i) := FuType.isVLoad(selectBits.fuType) + veccommitted(i) := false.B + + debug_mmio(i) := false.B + debug_paddr(i) := 0.U + } + + } + for (i <- 0 until io.enq.req.length) { val lqIdx = enqPtrExt(0) + validVLoadOffsetRShift.take(i + 1).reduce(_ + _) val index = io.enq.req(i).bits.lqIdx - val enqInstr = io.enq.req(i).bits.instr.asTypeOf(new XSInstBitFields) when (canEnqueue(i) && !enqCancel(i)) { - // The maximum 'numLsElem' number that can be emitted per dispatch port is: - // 16 2 2 2 2 2. 
- // Therefore, VecMemLSQEnqIteratorNumberSeq = Seq(16, 2, 2, 2, 2, 2) - for (j <- 0 until VecMemLSQEnqIteratorNumberSeq(i)) { - when (j.U < validVLoadOffset(i)) { - allocated((index + j.U).value) := true.B - uop((index + j.U).value) := io.enq.req(i).bits - uop((index + j.U).value).lqIdx := lqIdx + j.U - - // init - addrvalid((index + j.U).value) := false.B - datavalid((index + j.U).value) := false.B - isvec((index + j.U).value) := FuType.isVLoad(io.enq.req(i).bits.fuType) - veccommitted((index + j.U).value) := false.B - - debug_mmio((index + j.U).value) := false.B - debug_paddr((index + j.U).value) := 0.U - - XSError(!io.enq.canAccept || !io.enq.sqCanAccept, s"must accept $i\n") - XSError(index.value =/= lqIdx.value, s"must be the same entry $i\n") - } - } + XSError(!io.enq.canAccept || !io.enq.sqCanAccept, s"must accept $i\n") + XSError(index.value =/= lqIdx.value, s"must be the same entry $i\n") } io.enq.resp(i) := lqIdx } From 4137128f7a724d98a043789c2b2b18b5d959addb Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Tue, 17 Dec 2024 12:38:15 +0800 Subject: [PATCH 31/32] fix(fpDecoder): fix bug of fmt --- .../xiangshan/backend/decode/FPDecoder.scala | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/main/scala/xiangshan/backend/decode/FPDecoder.scala b/src/main/scala/xiangshan/backend/decode/FPDecoder.scala index 842ec4b4469..8d37ef8b4ac 100644 --- a/src/main/scala/xiangshan/backend/decode/FPDecoder.scala +++ b/src/main/scala/xiangshan/backend/decode/FPDecoder.scala @@ -279,6 +279,34 @@ class FPDecoder(implicit p: Parameters) extends XSModule{ sigs(0) := decoder(2) sigs(1) := decoder(4) ctrl.typ := inst.TYP + val isFP16Instrs = Seq( + // zfh inst + FADD_H, FSUB_H, FEQ_H, FLT_H, FLE_H, FMIN_H, FMAX_H, + FMUL_H, FDIV_H, FSQRT_H, + FMADD_H, FMSUB_H, FNMADD_H, FNMSUB_H, + FCLASS_H, FSGNJ_H, FSGNJX_H, FSGNJN_H, + // zfa inst + FLEQ_H, FLTQ_H, FMINM_H, FMAXM_H, + FROUND_H, FROUNDNX_H, + ) + val isFP16Instr = isFP16Instrs.map(io.instr === _).reduce(_ || _) + val isFP32Instrs = Seq( + FADD_S, FSUB_S, FEQ_S, FLT_S, FLE_S, FMIN_S, FMAX_S, + FMUL_S, FDIV_S, FSQRT_S, + FMADD_S, FMSUB_S, FNMADD_S, FNMSUB_S, + FCLASS_S, FSGNJ_S, FSGNJX_S, FSGNJN_S, + // zfa inst + FLEQ_S, FLTQ_S, FMINM_S, FMAXM_S, + FROUND_S, FROUNDNX_S, + ) + val isFP32Instr = isFP32Instrs.map(io.instr === _).reduce(_ || _) + val isFP64Instrs = Seq( + FADD_D, FSUB_D, FEQ_D, FLT_D, FLE_D, FMIN_D, FMAX_D, + FMUL_D, FDIV_D, FSQRT_D, + FMADD_D, FMSUB_D, FNMADD_D, FNMSUB_D, + FCLASS_D, FSGNJ_D, FSGNJX_D, FSGNJN_D, + ) + val isFP64Instr = isFP64Instrs.map(io.instr === _).reduce(_ || _) // scalar cvt inst val isSew2Cvts = Seq( FCVT_W_S, FCVT_WU_S, FCVT_L_S, FCVT_LU_S, @@ -310,9 +338,7 @@ class FPDecoder(implicit p: Parameters) extends XSModule{ ) val isSew2Cvt32 = isSew2Cvts.map(io.instr === _).reduce(_ || _) val isSew2Cvt16 = isSew2Cvth.map(io.instr === _).reduce(_ || _) - val complexFmt = Mux(isSew2Cvt32, VSew.e32, VSew.e16) - val isCompFmt = isSew2Cvt32 || isSew2Cvt16 - ctrl.fmt := Mux(isCompFmt, complexFmt, simpleFmt) + ctrl.fmt := Mux(isFP32Instr || isSew2Cvt32, VSew.e32, Mux(isFP16Instr || isSew2Cvt16, VSew.e16, VSew.e64)) ctrl.rm := inst.RM val fmaTable: Array[(BitPat, List[BitPat])] = Array( From c83a1b7041f04cf735aa21b3d713a7bedb36c314 Mon Sep 17 00:00:00 2001 From: xiaofeibao <1441675923@qq.com> Date: Wed, 18 Dec 2024 14:13:54 +0800 Subject: [PATCH 32/32] fix(rob): fix bug of redirect when all robEntries need flush --- src/main/scala/xiangshan/backend/rob/Rob.scala | 4 
+++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/scala/xiangshan/backend/rob/Rob.scala b/src/main/scala/xiangshan/backend/rob/Rob.scala index d824f65f418..e2e25a259f6 100644 --- a/src/main/scala/xiangshan/backend/rob/Rob.scala +++ b/src/main/scala/xiangshan/backend/rob/Rob.scala @@ -922,9 +922,11 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val redirectValidReg = RegNext(io.redirect.valid) val redirectBegin = Reg(UInt(log2Up(RobSize).W)) val redirectEnd = Reg(UInt(log2Up(RobSize).W)) + val redirectAll = RegInit(false.B) when(io.redirect.valid){ redirectBegin := Mux(io.redirect.bits.flushItself(), io.redirect.bits.robIdx.value - 1.U, io.redirect.bits.robIdx.value) redirectEnd := enqPtr.value + redirectAll := io.redirect.bits.flushItself() && (io.redirect.bits.robIdx.value === enqPtr.value) && (io.redirect.bits.robIdx.flag ^ enqPtr.flag) } // update robEntries valid @@ -933,7 +935,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val commitCond = io.commits.isCommit && io.commits.commitValid.zip(deqPtrVec.map(_.value === i.U)).map(x => x._1 && x._2).reduce(_ || _) assert(PopCount(enqOH) < 2.U, s"robEntries$i enqOH is not one hot") val needFlush = redirectValidReg && Mux( - redirectEnd > redirectBegin, + (redirectEnd > redirectBegin) && !redirectAll, (i.U > redirectBegin) && (i.U < redirectEnd), (i.U > redirectBegin) || (i.U < redirectEnd) )
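The final hunk handles a full-queue flush: when the redirect flushes its own instruction and its robIdx has the same value as enqPtr but the opposite wrap flag, redirectEnd ends up just one past redirectBegin, so the old strictly-between test selects no entries even though every ROB entry is younger than the redirect. The added redirectAll flag forces the wrap-around form of the range test in that case. A small plain-Scala model of the per-entry needFlush computation as rewritten above, with illustrative indices and sizes:

// Software model of the ROB flush-range test, including the full-queue (redirectAll) case.
object RobFlushRangeModel {
  // begin = redirect index minus one (flush-itself), end = enqPtr index, over robSize entries.
  def needFlush(i: Int, begin: Int, end: Int, redirectAll: Boolean): Boolean =
    if (end > begin && !redirectAll) i > begin && i < end
    else                             i > begin || i < end

  def main(args: Array[String]): Unit = {
    val robSize = 8
    // Redirect robIdx.value == enqPtr.value == 5 with opposite flags: the whole queue flushes.
    val flushed = (0 until robSize).count(i => needFlush(i, begin = 4, end = 5, redirectAll = true))
    assert(flushed == robSize)
    // Ordinary case: flush only the entries strictly between begin and end.
    assert((0 until robSize).filter(i => needFlush(i, begin = 2, end = 6, redirectAll = false)) == Seq(3, 4, 5))
    println("rob flush range model ok")
  }
}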