diff --git a/coupledL2 b/coupledL2
index a17d090738..3468725821 160000
--- a/coupledL2
+++ b/coupledL2
@@ -1 +1 @@
-Subproject commit a17d090738e6aceb25b09b5dca3c0e824c46ae8f
+Subproject commit 346872582105980e05be96bbf2ad002f78336389
diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala
index ef4b403f1e..6a9b2cbde6 100644
--- a/src/main/scala/xiangshan/Parameters.scala
+++ b/src/main/scala/xiangshan/Parameters.scala
@@ -254,6 +254,7 @@ case class XSCoreParameters
   EnableAtCommitMissTrigger: Boolean = true,
   EnableStorePrefetchSMS: Boolean = false,
   EnableStorePrefetchSPB: Boolean = false,
+  EnableStorePrefetchASP: Boolean = true,
   HasCMO: Boolean = true,
   MMUAsidLen: Int = 16, // max is 16, 0 is not supported now
   MMUVmidLen: Int = 14,
@@ -809,6 +810,7 @@ trait HasXSParameter {
   def EnableAtCommitMissTrigger = coreParams.EnableAtCommitMissTrigger
   def EnableStorePrefetchSMS = coreParams.EnableStorePrefetchSMS
   def EnableStorePrefetchSPB = coreParams.EnableStorePrefetchSPB
+  def EnableStorePrefetchASP = coreParams.EnableStorePrefetchASP
   def HasCMO = coreParams.HasCMO && p(EnableCHI)
   require(LoadPipelineWidth == backendParams.LdExuCnt, "LoadPipelineWidth must be equal exuParameters.LduCnt!")
   require(StorePipelineWidth == backendParams.StaCnt, "StorePipelineWidth must be equal exuParameters.StuCnt!")
diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala
index af9e975594..5406312b66 100644
--- a/src/main/scala/xiangshan/backend/MemBlock.scala
+++ b/src/main/scala/xiangshan/backend/MemBlock.scala
@@ -534,13 +534,31 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   // load/store prefetch to l2 cache
   prefetcherOpt.foreach(sms_pf => {
     l1PrefetcherOpt.foreach(l1_pf => {
-      val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
       val l1_pf_to_l2 = DelayNWithValid(l1_pf.io.l2_req, 2)
-
-      outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid
-      outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.addr, sms_pf_to_l2.bits.addr)
-      outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.source, sms_pf_to_l2.bits.source)
+      val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
+      val asp_pf_to_l2 = DelayNWithValid(lsq.io.aspPfIO.l2_pf_addr, 2)
+
+      outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid || asp_pf_to_l2.valid
+      outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(
+        l1_pf_to_l2.valid,
+        l1_pf_to_l2.bits.addr,
+        Mux(
+          sms_pf_to_l2.valid,
+          sms_pf_to_l2.bits.addr,
+          asp_pf_to_l2.bits.addr
+        )
+      )
+      outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(
+        l1_pf_to_l2.valid,
+        l1_pf_to_l2.bits.source,
+        Mux(
+          sms_pf_to_l2.valid,
+          sms_pf_to_l2.bits.source,
+          asp_pf_to_l2.bits.source
+        )
+      )
       outer.l2_pf_sender_opt.get.out.head._1.l2_pf_en := RegNextN(io.ooo_to_mem.csrCtrl.l2_pf_enable, 2, Some(true.B))
+      outer.l2_pf_sender_opt.get.out.head._1.needT := !l1_pf_to_l2.valid && !sms_pf_to_l2.valid && asp_pf_to_l2.valid

       sms_pf.io.enable := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable, 2, Some(false.B))
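Note on the hunk above: the nested Mux chain is a fixed-priority select among the three L2 prefetch sources (the L1 prefetcher first, then SMS, then ASP), and needT is asserted only when ASP is the sole requester, since a detected memset will overwrite whole cache lines and benefits from fetching them in a writable state. A minimal equivalent sketch using Chisel's PriorityMux, assuming the three DelayNWithValid results from the hunk are in scope (illustration only, not part of the patch):

```scala
import chisel3._
import chisel3.util._

// Fixed-priority select over the three prefetch sources; the final true.B
// entry acts as the default, which makes ASP the lowest-priority source.
val chosen = PriorityMux(Seq(
  l1_pf_to_l2.valid  -> l1_pf_to_l2.bits,
  sms_pf_to_l2.valid -> sms_pf_to_l2.bits,
  true.B             -> asp_pf_to_l2.bits
))
// Request exclusive (writable) permission only on ASP-only cycles.
val needT = !l1_pf_to_l2.valid && !sms_pf_to_l2.valid && asp_pf_to_l2.valid
```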
@@ -589,14 +607,14 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   // dtlb
   val dtlb_ld_tlb_ld = Module(new TLBNonBlock(LduCnt + HyuCnt + 1, 2, ldtlbParams))
   val dtlb_st_tlb_st = Module(new TLBNonBlock(StaCnt, 1, sttlbParams))
-  val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(2, 2, pftlbParams))
+  val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(3, 2, pftlbParams))
   val dtlb_ld = Seq(dtlb_ld_tlb_ld.io)
   val dtlb_st = Seq(dtlb_st_tlb_st.io)
   val dtlb_prefetch = Seq(dtlb_prefetch_tlb_prefetch.io)
   /* tlb vec && constant variable */
   val dtlb = dtlb_ld ++ dtlb_st ++ dtlb_prefetch
   val (dtlb_ld_idx, dtlb_st_idx, dtlb_pf_idx) = (0, 1, 2)
-  val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 2) // (load + hyu + stream pf, store, sms+l2bop)
+  val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 3) // (load + hyu + stream pf, store, sms+l2bop+asp)
   val DTlbSize = TlbSubSizeVec.sum
   val TlbStartVec = TlbSubSizeVec.scanLeft(0)(_ + _).dropRight(1)
   val TlbEndVec = TlbSubSizeVec.scanLeft(0)(_ + _).drop(1)
@@ -632,7 +650,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
     replace_st.io.apply_sep(dtlb_st.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
   }
   if (pftlbParams.outReplace) {
-    val replace_pf = Module(new TlbReplace(2, pftlbParams))
+    val replace_pf = Module(new TlbReplace(3, pftlbParams))
     replace_pf.io.apply_sep(dtlb_prefetch.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
   }
 }
@@ -1090,6 +1108,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   val StreamDTLBPortIndex = TlbStartVec(dtlb_ld_idx) + LduCnt + HyuCnt
   val PrefetcherDTLBPortIndex = TlbStartVec(dtlb_pf_idx)
   val L2toL1DLBPortIndex = TlbStartVec(dtlb_pf_idx) + 1
+  val ASPDTLBPortIndex = TlbStartVec(dtlb_pf_idx) + 2
   prefetcherOpt match {
     case Some(pf) => dtlb_reqs(PrefetcherDTLBPortIndex) <> pf.io.tlb_req
     case None =>
@@ -1108,6 +1127,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   dtlb_reqs(L2toL1DLBPortIndex).resp.ready := true.B
   io.l2_pmp_resp := pmp_check(L2toL1DLBPortIndex).resp

+  dtlb_reqs(ASPDTLBPortIndex) <> lsq.io.aspPfIO.tlb_req
   // StoreUnit
   for (i <- 0 until StdCnt) {
     stdExeUnits(i).io.flush <> redirect
@@ -1375,6 +1395,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   io.mem_to_ooo.sqDeqPtr := lsq.io.sqDeqPtr
   io.mem_to_ooo.lqDeqPtr := lsq.io.lqDeqPtr
   lsq.io.tl_d_channel <> dcache.io.lsu.tl_d_channel
+  lsq.io.seqStoreDetected <> dcache.io.seqStoreDetected

   // LSQ to store buffer
   lsq.io.sbuffer <> sbuffer.io.in
diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
index 95ffdadcfa..16ded8326d 100644
--- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
+++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
@@ -781,6 +781,7 @@ class DCacheIO(implicit p: Parameters) extends DCacheBundle {
   val pf_ctrl = Output(new PrefetchControlBundle)
   val force_write = Input(Bool())
   val sms_agt_evict_req = DecoupledIO(new AGTEvictReq)
+  val seqStoreDetected = Input(Bool())
   val debugTopDown = new DCacheTopDownIO
   val debugRolling = Flipped(new RobDebugRollingIO)
   val l2_hint = Input(Valid(new L2ToL1Hint()))
@@ -981,6 +982,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
   missQueue.io.hartId := io.hartId
   missQueue.io.l2_pf_store_only := RegNext(io.l2_pf_store_only, false.B)
   missQueue.io.debugTopDown <> io.debugTopDown
+  missQueue.io.seqStoreDetected <> io.seqStoreDetected
   missQueue.io.l2_hint <> RegNext(io.l2_hint)
   missQueue.io.mainpipe_info := mainPipe.io.mainpipe_info
   mainPipe.io.refill_info := missQueue.io.refill_info
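The DTLB port bookkeeping in the MemBlock hunks above is ordinary Scala evaluated at elaboration time; growing the prefetch partition from 2 to 3 ports shifts nothing else. A worked example with illustrative widths LduCnt + HyuCnt + 1 = 5 and StaCnt = 2:

```scala
// Pure Scala, runnable as-is; the widths are assumptions for illustration.
val TlbSubSizeVec = Seq(5, 2, 3) // (load + hyu + stream pf, store, sms + l2bop + asp)
val TlbStartVec = TlbSubSizeVec.scanLeft(0)(_ + _).dropRight(1) // List(0, 5, 7)
val TlbEndVec   = TlbSubSizeVec.scanLeft(0)(_ + _).drop(1)      // List(5, 7, 10)
// With dtlb_pf_idx = 2: PrefetcherDTLBPortIndex = 7, L2toL1DLBPortIndex = 8,
// and the new ASPDTLBPortIndex = TlbStartVec(2) + 2 = 9.
```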
index f943d572a0..859330a41a 100644
--- a/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
+++ b/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
@@ -869,6 +869,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC
     val l2_pf_store_only = Input(Bool())

     val memSetPattenDetected = Output(Bool())
+    val seqStoreDetected = Input(Bool())
     val lqEmpty = Input(Bool())

     val prefetch_info = new Bundle {
@@ -941,19 +942,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC
   assert(PopCount(Seq(alloc && io.req.valid, merge && io.req.valid)) <= 1.U, "allocate and merge a mshr in same cycle!")

-  val source_except_load_cnt = RegInit(0.U(10.W))
-  when(VecInit(req_mshr_handled_vec).asUInt.orR || req_pipeline_reg_handled) {
-    when(io.req.bits.isFromLoad) {
-      source_except_load_cnt := 0.U
-    }.otherwise {
-      when(io.req.bits.isFromStore) {
-        source_except_load_cnt := source_except_load_cnt + 1.U
-      }
-    }
-  }
-  val Threshold = 8
-  val memSetPattenDetected = GatedValidRegNext((source_except_load_cnt >= Threshold.U) && io.lqEmpty)
-
+  val memSetPattenDetected = GatedValidRegNext(io.seqStoreDetected && io.lqEmpty)
   io.memSetPattenDetected := memSetPattenDetected

   val forwardInfo_vec = VecInit(entries.map(_.io.forwardInfo))
diff --git a/src/main/scala/xiangshan/cache/mmu/Repeater.scala b/src/main/scala/xiangshan/cache/mmu/Repeater.scala
index 98d452f356..faef208b80 100644
--- a/src/main/scala/xiangshan/cache/mmu/Repeater.scala
+++ b/src/main/scala/xiangshan/cache/mmu/Repeater.scala
@@ -219,14 +219,23 @@
     canenq(1) := !(Cat(v.drop(Size/2)).andR)
     enqidx(1) := firstValidIndex(v.drop(Size/2), false.B) + (Size/2).U
   } else if (Width == 3) {
-    require(Size == 16, s"load filter Size ($Size) should be 16")
-    canenq(0) := !(Cat(v.take(8)).andR)
-    enqidx(0) := firstValidIndex(v.take(8), false.B)
-    canenq(1) := !(Cat(v.drop(8).take(4)).andR)
-    enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
-    // four entries for prefetch
-    canenq(2) := !(Cat(v.drop(12)).andR)
-    enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
+    require(Size == 16 || Size == 8, s"load/prefetcher filter Size ($Size) should be 16/8")
+    if (Size == 16) {
+      canenq(0) := !(Cat(v.take(8)).andR)
+      enqidx(0) := firstValidIndex(v.take(8), false.B)
+      canenq(1) := !(Cat(v.drop(8).take(4)).andR)
+      enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
+      // four entries for prefetch
+      canenq(2) := !(Cat(v.drop(12)).andR)
+      enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
+    } else {
+      canenq(0) := !(Cat(v.take(4)).andR)
+      enqidx(0) := firstValidIndex(v.take(4), false.B)
+      canenq(1) := !(Cat(v.drop(4).take(2)).andR)
+      enqidx(1) := firstValidIndex(v.drop(4).take(2), false.B) + 4.U
+      canenq(2) := !(Cat(v.drop(6)).andR)
+      enqidx(2) := firstValidIndex(v.drop(6), false.B) + 6.U
+    }
   } else if (Width == 4) {
     require(Size == 16, s"load filter Size ($Size) should be 16")
     for (i <- 0 until Width) {
@@ -359,7 +368,7 @@ class PTWNewFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameter
   })

   val prefetch_filter = VecInit(Seq.fill(1) {
-    val prefetch_entry = Module(new PTWFilterEntry(Width = 2, Size = prefetchfiltersize))
+    val prefetch_entry = Module(new PTWFilterEntry(Width = 3, Size = prefetchfiltersize))
     prefetch_entry.io
   })
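The Width == 3 branch above statically partitions the PTW filter entries per request port (8/4/4 for Size == 16, 4/2/2 for the new Size == 8 prefetcher configuration), so concurrent enqueues can never pick the same slot. A pure-Scala model of the slice-local allocation, assuming firstValidIndex(v, false.B) returns the index of the first entry whose valid bit is false:

```scala
// Hypothetical software model of the partitioned allocation, not the RTL.
def firstFreeIndex(v: Seq[Boolean]): Option[Int] =
  v.indexWhere(_ == false) match { case -1 => None; case i => Some(i) }

val valid = Seq(true, true, false, false, true, false, true, false) // Size == 8
val enq0 = firstFreeIndex(valid.take(4))                    // Some(2), entries 0-3
val enq1 = firstFreeIndex(valid.drop(4).take(2)).map(_ + 4) // Some(5), entries 4-5
val enq2 = firstFreeIndex(valid.drop(6)).map(_ + 6)         // Some(7), entries 6-7
```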
diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
index c4681e9ec4..8dbc1b6ec4 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
@@ -124,6 +124,8 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
     val flushSbuffer = new SbufferFlushBundle
     val force_write = Output(Bool())
     val lqEmpty = Output(Bool())
+    val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO

     // top-down
     val debugTopDown = new LoadQueueTopDownIO
@@ -179,16 +181,19 @@
   storeQueue.io.vecmmioStout <> io.vecmmioStout
   storeQueue.io.rob <> io.rob
   storeQueue.io.exceptionAddr.isStore := DontCare
-  storeQueue.io.sqCancelCnt <> io.sqCancelCnt
-  storeQueue.io.sqDeq <> io.sqDeq
-  storeQueue.io.sqEmpty <> io.sqEmpty
-  storeQueue.io.sqFull <> io.sqFull
-  storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
-  storeQueue.io.force_write <> io.force_write
-  storeQueue.io.cmoOpReq <> io.cmoOpReq
-  storeQueue.io.cmoOpResp <> io.cmoOpResp
-  storeQueue.io.flushSbuffer <> io.flushSbuffer
-  storeQueue.io.maControl <> io.maControl
+  storeQueue.io.sqCancelCnt      <> io.sqCancelCnt
+  storeQueue.io.sqDeq            <> io.sqDeq
+  storeQueue.io.sqEmpty          <> io.sqEmpty
+  storeQueue.io.lqEmpty          <> loadQueue.io.lqEmpty
+  storeQueue.io.sqFull           <> io.sqFull
+  storeQueue.io.forward          <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
+  storeQueue.io.force_write      <> io.force_write
+  storeQueue.io.cmoOpReq         <> io.cmoOpReq
+  storeQueue.io.cmoOpResp        <> io.cmoOpResp
+  storeQueue.io.flushSbuffer     <> io.flushSbuffer
+  storeQueue.io.maControl        <> io.maControl
+  storeQueue.io.seqStoreDetected <> io.seqStoreDetected
+  storeQueue.io.aspPfIO          <> io.aspPfIO

   /* <------- DANGEROUS: Don't change sequence here ! -------> */
diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
index 9541c561bf..414394044e 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
@@ -26,12 +26,14 @@ import utility._
 import utils._
 import xiangshan._
 import xiangshan.cache._
+import xiangshan.cache.mmu.TlbRequestIO
 import xiangshan.cache.{DCacheLineIO, DCacheWordIO, MemoryOpConstants}
 import xiangshan.backend._
 import xiangshan.backend.rob.{RobLsqIO, RobPtr}
 import xiangshan.backend.Bundles.{DynInst, MemExuOutput}
 import xiangshan.backend.decode.isa.bitfield.{Riscv32BitInst, XSInstBitFields}
 import xiangshan.backend.fu.FuConfig._
+import xiangshan.mem.prefetch.L2PrefetchReq
 import xiangshan.backend.fu.FuType
 import xiangshan.ExceptionNO._
 import coupledL2.{CMOReq, CMOResp}
@@ -58,6 +60,11 @@ class SqEnqIO(implicit p: Parameters) extends MemBlockBundle {
   val resp = Vec(LSQEnqWidth, Output(new SqPtr))
 }

+class AspPfIO(implicit p: Parameters) extends MemBlockBundle {
+  val tlb_req = new TlbRequestIO(nRespDups = 2)
+  val l2_pf_addr = ValidIO(new L2PrefetchReq())
+}
+
 class DataBufferEntry (implicit p: Parameters) extends DCacheBundle {
   val addr = UInt(PAddrBits.W)
   val vaddr = UInt(VAddrBits.W)
@@ -187,6 +194,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     val exceptionAddr = new ExceptionAddrIO
     val flushSbuffer = new SbufferFlushBundle
     val sqEmpty = Output(Bool())
+    val lqEmpty = Input(Bool())
     val stAddrReadySqPtr = Output(new SqPtr)
     val stAddrReadyVec = Output(Vec(StoreQueueSize, Bool()))
     val stDataReadySqPtr = Output(new SqPtr)
@@ -198,10 +206,15 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     val sqDeq = Output(UInt(log2Ceil(EnsbufferWidth + 1).W))
     val force_write = Output(Bool())
     val maControl = Flipped(new StoreMaBufToSqControlIO)
+    val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO
   })

   println("StoreQueue: size:" + StoreQueueSize)

+  // ASP prefetcher
+  val asp = Module(new ASP)
+
   // data modules
   val uop = Reg(Vec(StoreQueueSize, new DynInst))
   // val data = Reg(Vec(StoreQueueSize, new LsqEntry))
@@ -1026,6 +1039,16 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     }
   }

+  asp.io.sbuffer.zipWithIndex.foreach {case (s, idx) => {
+    s.valid := io.sbuffer(idx).fire && io.sbuffer(idx).bits.vecValid
+    s.bits := io.sbuffer(idx).bits
+  }}
+  asp.io.sqEmpty := io.sqEmpty
+  asp.io.lqEmpty := io.lqEmpty
+  asp.io.enable := EnableStorePrefetchASP.B
+  io.seqStoreDetected := asp.io.seqStoreDetected
+  io.aspPfIO <> asp.io.aspPfIO
+
   // All vector instruction uop normally dequeue, but the Uop after the exception is raised does not write to the 'sbuffer'.
   // Flags are used to record whether there are any exceptions when the queue is displayed.
   // This is determined each time a write is made to the 'databuffer', prevent subsequent uop of the same instruction from writing to the 'dataBuffer'.
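For readability, here is how the new AspPfIO bundle threads through the hierarchy; every connection below already appears in the hunks above or in the MemBlock changes earlier in this patch:

```scala
// StoreQueue: the ASP instance drives the bundle.
io.aspPfIO <> asp.io.aspPfIO
// LsqWrapper: plain pass-through up to the MemBlock.
storeQueue.io.aspPfIO <> io.aspPfIO
// MemBlock: the TLB side uses the third prefetch DTLB port, and the address
// side joins the L2 prefetch sender (after a 2-cycle delay, lowest priority).
dtlb_reqs(ASPDTLBPortIndex) <> lsq.io.aspPfIO.tlb_req
val asp_pf_to_l2 = DelayNWithValid(lsq.io.aspPfIO.l2_pf_addr, 2)
```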
diff --git a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
index 0d53f5ac45..05519b6515 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
@@ -61,6 +61,9 @@ trait HasSbufferConst extends HasXSParameter {
   val VWordsWidth: Int = log2Up(CacheLineVWords)
   val VWordWidth: Int = log2Up(VDataBytes)
   val VWordOffsetWidth: Int = PAddrBits - VWordWidth
+
+  val FullWriteMaxWaitCycles = CacheLineBytes / EnsbufferWidth
+  val FullWriteMaxWaitBits = log2Up(FullWriteMaxWaitCycles) + 1
 }

 class SbufferEntryState (implicit p: Parameters) extends SbufferBundle {
@@ -206,6 +209,9 @@ class Sbuffer(implicit p: Parameters)
     val force_write = Input(Bool())
   })

+  println("Sbuffer FullWriteMaxWaitBits: " + FullWriteMaxWaitBits)
+  println("Sbuffer FullWriteMaxWaitCycles: " + FullWriteMaxWaitCycles)
+
   val dataModule = Module(new SbufferData)
   dataModule.io.writeReq <> DontCare
   val prefetcher = Module(new StorePfWrapper())
@@ -220,6 +226,7 @@
   val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U.asTypeOf(new SbufferEntryState))))
   val cohCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(EvictCountBits.W))))
   val missqReplayCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(MissqReplayCountBits.W))))
+  val waitCntBeforeFull = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(FullWriteMaxWaitBits.W))))

   val sbuffer_out_s0_fire = Wire(Bool())
@@ -312,7 +319,7 @@
   val activeMask = VecInit(stateVec.map(s => s.isActive()))
   val validMask = VecInit(stateVec.map(s => s.isValid()))
-  val drainIdx = PriorityEncoder(activeMask)
+  val drainIdx = Wire(UInt(SbufferIndexWidth.W))

   val inflightMask = VecInit(stateVec.map(s => s.isInflight()))
@@ -385,8 +392,8 @@
   val do_uarch_drain = GatedValidRegNext(forward_need_uarch_drain) || GatedValidRegNext(GatedValidRegNext(merge_need_uarch_drain))
   XSPerfAccumulate("do_uarch_drain", do_uarch_drain)

-  io.in(0).ready := firstCanInsert
-  io.in(1).ready := secondCanInsert && io.in(0).ready
+  io.in(0).ready := firstCanInsert || mergeVec(0).orR
+  io.in(1).ready := (secondCanInsert || mergeVec(1).orR) && io.in(0).ready

   for (i <- 0 until EnsbufferWidth) {
     // train
@@ -437,6 +444,7 @@
       // missqReplayCount(insertIdx) := 0.U
       ptag(entryIdx) := reqptag
       vtag(entryIdx) := reqvtag // update vtag if a new sbuffer line is allocated
+      waitCntBeforeFull(entryIdx) := FullWriteMaxWaitCycles.U
     }
   })
 }
@@ -468,6 +476,8 @@
     })
   }

+  waitCntBeforeFull.foreach(x => x := Mux(x.orR, x - 1.U, x))
+
   for(((in, vwordOffset), i) <- io.in.zip(Seq(firstWord, secondWord)).zipWithIndex){
     writeReq(i).valid := in.fire && in.bits.vecValid
     writeReq(i).bits.vwordOffset := vwordOffset
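About waitCntBeforeFull: each newly allocated sbuffer line starts a countdown that bounds how long the line is given to become fully written; while the countdown is non-zero, the memset drain path (next hunk) considers the line a candidate only if its write mask is already full. Worked numbers, assuming 64-byte cache lines and EnsbufferWidth = 2 (both assumptions):

```scala
// Pure-Scala arithmetic mirroring the new HasSbufferConst fields.
def log2Up(x: Int): Int = math.ceil(math.log(x) / math.log(2)).toInt
val CacheLineBytes = 64
val EnsbufferWidth = 2
val FullWriteMaxWaitCycles = CacheLineBytes / EnsbufferWidth    // 32 cycles
val FullWriteMaxWaitBits   = log2Up(FullWriteMaxWaitCycles) + 1 // 6 bits
```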
@@ -606,11 +616,22 @@
   val sbuffer_out_s1_ready = Wire(Bool())

+  // ---------------------------------------------------------------------------
+  // Memset Case
+  // ---------------------------------------------------------------------------
+
+  val memSet_needDrain = io.memSetPattenDetected
+  val memSetActiveMask = VecInit(stateVec.zipWithIndex.map{case (s, idx) => {
+    s.isDcacheReqCandidate() && Mux(waitCntBeforeFull(idx).orR, mask(idx).asUInt.andR, true.B)
+  }})
+
+  drainIdx := Mux(memSet_needDrain, PriorityEncoder(memSetActiveMask), PriorityEncoder(activeMask))
+
   // ---------------------------------------------------------------------------
   // sbuffer_out_s0
   // ---------------------------------------------------------------------------

-  val need_drain = needDrain(sbuffer_state)
+  val need_drain = needDrain(sbuffer_state) || memSet_needDrain
   val need_replace = do_eviction || (sbuffer_state === x_replace)
   val sbuffer_out_s0_evictionIdx = Mux(missqReplayHasTimeOut,
     missqReplayTimeOutIdx,
     Mux(need_drain,
       drainIdx,
       Mux(cohHasTimeOut, cohTimeOutIdx, replaceIdx)
     )
   )

+  val sbuffer_out_s0_can_evict = Mux(
+    memSet_needDrain,
+    memSetActiveMask(sbuffer_out_s0_evictionIdx),
+    candidateVec(sbuffer_out_s0_evictionIdx)
+  )
+
   // If there is a inflight dcache req which has same ptag with sbuffer_out_s0_evictionIdx's ptag,
   // current eviction should be blocked.
   val sbuffer_out_s0_valid = missqReplayHasTimeOut ||
-    stateVec(sbuffer_out_s0_evictionIdx).isDcacheReqCandidate() &&
-    (need_drain || cohHasTimeOut || need_replace)
+    sbuffer_out_s0_can_evict && (need_drain || cohHasTimeOut || need_replace)
   assert(!(
-    stateVec(sbuffer_out_s0_evictionIdx).isDcacheReqCandidate() &&
-    !noSameBlockInflight(sbuffer_out_s0_evictionIdx)
+    sbuffer_out_s0_can_evict && !noSameBlockInflight(sbuffer_out_s0_evictionIdx)
   ))
   val sbuffer_out_s0_cango = sbuffer_out_s1_ready
   sbuffer_out_s0_fire := sbuffer_out_s0_valid && sbuffer_out_s0_cango
diff --git a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
index 6455a96fa9..b4fb0f4d3f 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
@@ -23,6 +23,8 @@ import xiangshan._
 import utils._
 import utility._
 import xiangshan.cache._
+import xiangshan.cache.mmu._
+import xiangshan.mem.prefetch.L2PrefetchReq

 trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCacheParameters {
   // common
@@ -34,24 +36,39 @@ trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCachePar
   val ONLY_ON_MEMSET = false
   val SATURATE_COUNTER_BITS = 7
   val BURST_ENGINE_SIZE = 2
+  val SPB_GRANULARITY_BYTES = 4096
+  val SPB_GRANULARITY_BITS = log2Up(SPB_GRANULARITY_BYTES)
   val SPB_N = 48

   // serializer parameters
   val SERIALIZER_SIZE = 12

+  // asp parameters
+  val LOCK_CYCLE = 2048
+  val LOCK_BITS = log2Up(LOCK_CYCLE) + 1
+  val ASP_GRANULARITY_BYTES = 1024 // 1KB
+  val ASP_GRANULARITY_BITS = log2Up(ASP_GRANULARITY_BYTES)
+
   def block_addr(x: UInt): UInt = {
     val offset = log2Up(dcacheParameters.blockBytes)
     x(x.getWidth - 1, offset)
   }

-  // filter logic (granularity: a page)
-  def same_page_addr(addr0: UInt, addr1: UInt): Bool = {
-    addr0(addr0.getWidth - 1, PAGEOFFSET) === addr1(addr1.getWidth - 1, PAGEOFFSET)
+  // filter logic (granularity specified in args)
+  def same_granularity_addr(addr0: UInt, addr1: UInt, granularity: Int): Bool = {
+    addr0(addr0.getWidth - 1, granularity) === addr1(addr1.getWidth - 1, granularity)
   }
-  def filter_by_page_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], incoming_vaddr: UInt) : Bool = {
+
+  def filter_by_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], incoming_vaddr: UInt, granularity: Int) : Bool = {
     val match_vec = (valid_vec zip data_vec).map{
-      case(v, e_vaddr) => v && same_page_addr(e_vaddr, incoming_vaddr)
+      case(v, e_vaddr) => v && same_granularity_addr(e_vaddr, incoming_vaddr, granularity)
+    }
+    VecInit(match_vec).asUInt.orR
+  }
+
+  def filter_by_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], lock_vec: Vec[Bool], incoming_vaddr: UInt, granularity: Int) : Bool = {
+    val match_vec = (valid_vec zip lock_vec zip data_vec).map{
+      case((v, l), e_vaddr) => (v || l) && same_granularity_addr(e_vaddr, incoming_vaddr, granularity)
     }
     VecInit(match_vec).asUInt.orR
   }
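The helpers are generalized from a fixed page granularity to a caller-chosen one: SPB keeps filtering at 4 KiB (SPB_GRANULARITY_BYTES) while ASP filters at 1 KiB (ASP_GRANULARITY_BYTES), and the new filter_by_addr overload also matches entries that are merely locked, i.e. recently completed. A quick pure-Scala check of the comparison, assuming ASP_GRANULARITY_BITS = 10:

```scala
// Addresses are equal "at granularity g" when everything above bit g matches.
def sameRegion(a0: Long, a1: Long, granularityBits: Int): Boolean =
  (a0 >> granularityBits) == (a1 >> granularityBits)

sameRegion(0x1000L, 0x13ffL, 10) // true:  both inside the same 1 KiB region
sameRegion(0x1000L, 0x1400L, 10) // false: the second address crosses the boundary
```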
@@ -79,7 +96,7 @@ trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCachePar
 // L1 Store prefetch component
 // an prefetch request generator used by spb to burst some prefetch request to L1 Dcache
-class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+class PrefetchBurstGenerator(granularity: Int, is_store: Boolean)(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
   val io = IO(new DCacheBundle {
     val alloc = Input(Bool())
     val vaddr = Input(UInt(VAddrBits.W))
@@ -99,12 +116,12 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
   val enq_valids = ~(valids.asUInt)
   val full = !(enq_valids.orR)
   val enq_idx = PriorityEncoder(enq_valids)
-  val enq_filter = filter_by_page_addr(valids, datas, io.vaddr)
+  val enq_filter = filter_by_addr(valids, datas, io.vaddr, granularity)

   when(io.alloc && !full && !enq_filter) {
     valids(enq_idx) := true.B
     datas(enq_idx) := io.vaddr
-    pagebits(enq_idx) := io.vaddr(PAGEOFFSET)
+    pagebits(enq_idx) := io.vaddr(granularity)
   }

   XSPerfAccumulate("burst_generator_alloc_success", io.alloc && !full && !enq_filter)
@@ -127,21 +144,21 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
     out_decouple(0).valid := deq_valid
     out_decouple(0).bits := DontCare
     out_decouple(0).bits.vaddr := data
-    out_decouple(1).valid := deq_valid && data_next(PAGEOFFSET) === pg_bit && out_decouple(0).fire
+    out_decouple(1).valid := deq_valid && data_next(granularity) === pg_bit && out_decouple(0).fire
     out_decouple(1).bits := DontCare
     out_decouple(1).bits.vaddr := data_next
     out_decouple.drop(2).foreach { out => out.valid := false.B; out.bits := DontCare }
     when(out_decouple(1).fire) {
       // fired 2 prefetch reqs
       data := data_next_next
-      when(data_next_next(PAGEOFFSET) =/= pg_bit) {
+      when(data_next_next(granularity) =/= pg_bit) {
         // cross page, invalid this entry
         v := false.B
       }
     }.elsewhen(out_decouple(0).fire) {
       // fired 1 prefetch req
       data := data_next
-      when(data_next(PAGEOFFSET) =/= pg_bit) {
+      when(data_next(granularity) =/= pg_bit) {
         // cross page, invalid this entry
         v := false.B
       }
@@ -164,12 +181,14 @@ class StorePrefetchBursts(implicit p: Parameters) extends DCacheModule with HasS
   })
   require(EnsbufferWidth == 2)

+  private val granularity = SPB_GRANULARITY_BITS
+
   // meta for SPB
   val N = SPB_N
   val last_st_block_addr = RegInit(0.U(VAddrBits.W))
   val saturate_counter = RegInit(0.S(SATURATE_COUNTER_BITS.W))
   val store_count = RegInit(0.U((log2Up(N) + 1).W))
-  val burst_engine = Module(new PrefetchBurstGenerator(is_store = true))
+  val burst_engine = Module(new PrefetchBurstGenerator(granularity, true))

   val sbuffer_fire = io.sbuffer_enq.valid
   val sbuffer_vaddr = io.sbuffer_enq.bits.vaddr
@@ -284,4 +303,251 @@ class StorePfWrapper()(implicit p: Parameters) extends DCacheModule with HasStor
   // fire a prefetch req
   io.prefetch_req <> spb.io.prefetch_req
+}
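The generator added below keeps a valid bit, a lock bit, and a countdown per entry: when an entry finishes its region (cross_page), it is invalidated but stays locked for LOCK_CYCLE cycles, and since the enq filter also matches locked entries, the same region cannot be re-allocated immediately after completion. A small software model of that assumed life cycle:

```scala
// Hypothetical model of one generator entry: Active -> Locked(n) -> Free.
sealed trait EntryState
case object Free extends EntryState
case object Active extends EntryState
case class Locked(cnt: Int) extends EntryState

def tick(s: EntryState): EntryState = s match {
  case Locked(0) => Free          // countdown elapsed: region may be re-allocated
  case Locked(n) => Locked(n - 1) // still shielding the just-finished region
  case other     => other         // Free/Active entries are unaffected by time
}
val afterCrossPage: EntryState = Locked(2048) // LOCK_CYCLE in this patch
```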
+
+// prefetch request generator used by ASP to burst prefetch requests to the L2 cache
+class ASPBurstGenerator(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+  val io = IO(new DCacheBundle {
+    val alloc = Input(Bool())
+    val vaddr = Input(UInt(VAddrBits.W))
+    val aspPfIO = new AspPfIO
+  })
+
+  private val granularity = ASP_GRANULARITY_BITS
+
+  val SIZE = BURST_ENGINE_SIZE
+
+  val valids = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val locks = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val cnts = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U(LOCK_BITS.W)}))
+  val vaddrs = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U.asTypeOf(io.vaddr)}))
+  val paddrs = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U(PAddrBits.W)}))
+  val pa_vs = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val tlb_sent = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+
+  val tlb_req_arb = Module(new RRArbiterInit(new TlbReq, SIZE))
+  val l2_pf_req_arb = Module(new RRArbiterInit(new L2PrefetchReq, SIZE))
+
+  // enq
+  val enq_valids = ~(valids.asUInt)
+  val full = !(enq_valids.orR)
+  val enq_idx = PriorityEncoder(enq_valids)
+  val enq_filter = filter_by_addr(valids, vaddrs, locks, io.vaddr, granularity)
+
+  for (i <- 0 until SIZE) {
+    when (!valids(i) && locks(i) && cnts(i).orR) {
+      cnts(i) := cnts(i) - 1.U
+    }
+
+    when (!valids(i) && locks(i) && !cnts(i).orR) {
+      locks(i) := false.B
+    }
+  }
+
+  when(io.alloc && !full && !enq_filter) {
+    valids(enq_idx) := true.B
+    locks(enq_idx) := false.B
+    cnts(enq_idx) := 0.U
+    vaddrs(enq_idx) := io.vaddr
+    pa_vs(enq_idx) := false.B
+    tlb_sent(enq_idx) := false.B
+  }
+
+  // tlb req
+  val s0_tlb_fire_vec = VecInit((0 until SIZE).map{case i => tlb_req_arb.io.in(i).fire})
+  for(i <- 0 until SIZE) {
+    tlb_req_arb.io.in(i).valid := valids(i) && !pa_vs(i) && !tlb_sent(i)
+    tlb_req_arb.io.in(i).bits := 0.U.asTypeOf(new TlbReq)
+    tlb_req_arb.io.in(i).bits.vaddr := vaddrs(i)
+    tlb_req_arb.io.in(i).bits.cmd := TlbCmd.write
+    tlb_req_arb.io.in(i).bits.size := 3.U
+    tlb_req_arb.io.in(i).bits.kill := false.B
+    tlb_req_arb.io.in(i).bits.no_translate := false.B
+
+    when(tlb_req_arb.io.in(i).fire) {
+      tlb_sent(i) := true.B
+    }
+  }
+  assert(PopCount(s0_tlb_fire_vec) <= 1.U, "s0_tlb_fire_vec should be one-hot or empty")
+
+  val s1_tlb_req_valid = RegInit(false.B)
+  val s1_tlb_req_bits = RegEnable(tlb_req_arb.io.out.bits, tlb_req_arb.io.out.fire)
+  val s1_tlb_req_index = RegEnable(OHToUInt(s0_tlb_fire_vec.asUInt), tlb_req_arb.io.out.fire)
+  when(io.aspPfIO.tlb_req.req.fire) {
+    s1_tlb_req_valid := false.B
+  }
+  when(tlb_req_arb.io.out.fire) {
+    s1_tlb_req_valid := true.B
+  }
+  io.aspPfIO.tlb_req.req.valid := s1_tlb_req_valid
+  io.aspPfIO.tlb_req.req.bits := s1_tlb_req_bits
+  io.aspPfIO.tlb_req.req_kill := false.B
+  tlb_req_arb.io.out.ready := !s1_tlb_req_valid || io.aspPfIO.tlb_req.req.ready
+
+  // tlb resp
+  val s2_tlb_resp = io.aspPfIO.tlb_req.resp
+  val s2_tlb_update_index = RegEnable(s1_tlb_req_index, io.aspPfIO.tlb_req.req.fire)
+  when(s2_tlb_resp.valid) {
+    pa_vs(s2_tlb_update_index) := !s2_tlb_resp.bits.miss
+    tlb_sent(s2_tlb_update_index) := false.B
+
+    when(!s2_tlb_resp.bits.miss) {
+      paddrs(s2_tlb_update_index) := s2_tlb_resp.bits.paddr.head
+      when(s2_tlb_resp.bits.excp.head.pf.st || s2_tlb_resp.bits.excp.head.af.st) {
+        valids(s2_tlb_update_index) := false.B
+      }
+    }
+  }
+  s2_tlb_resp.ready := true.B
+
+  // next prefetch address
+  val paddrs_next = Wire(Vec(SIZE, chiselTypeOf(paddrs(0))))
+  paddrs_next := paddrs.map(_ + Cat(1.U(1.W), 0.U(BLOCKOFFSET.W)))
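In the code above, each entry walks its region one cache block at a time: Cat(1.U, 0.U(BLOCKOFFSET.W)) is simply the block size in bytes, so paddrs_next holds the physical address of the next block. Assuming 64-byte blocks (BLOCKOFFSET = 6):

```scala
val BLOCKOFFSET = 6          // log2 of an assumed 64-byte block
val step = 1L << BLOCKOFFSET // 0x40, the value of Cat(1.U(1.W), 0.U(6.W))
val paddr = 0x80001000L
val paddrNext = paddr + step // 0x80001040, the next block of the same region
```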
+
+  // pf to l2
+  io.aspPfIO.l2_pf_addr.valid := l2_pf_req_arb.io.out.valid
+  io.aspPfIO.l2_pf_addr.bits := l2_pf_req_arb.io.out.bits
+
+  l2_pf_req_arb.io.out.ready := true.B
+
+  for(i <- 0 until SIZE) {
+    l2_pf_req_arb.io.in(i).valid := valids(i) && pa_vs(i)
+    l2_pf_req_arb.io.in(i).bits.addr := paddrs(i)
+    l2_pf_req_arb.io.in(i).bits.source := MemReqSource.Prefetch2L2Stream.id.U
+  }
+
+  when(l2_pf_req_arb.io.out.fire) {
+    val idx = l2_pf_req_arb.io.chosen
+    val cross_page = !same_granularity_addr(paddrs_next(idx), paddrs(idx), granularity)
+    when(cross_page) {
+      valids(idx) := false.B
+      locks(idx) := true.B
+      cnts(idx) := LOCK_CYCLE.U
+    }
+    paddrs(idx) := paddrs_next(idx)
+  }
+
+  XSPerfAccumulate("burst_generator_alloc_success", io.alloc && !full && !enq_filter)
+  XSPerfAccumulate("burst_generator_alloc_fail", io.alloc && full && !enq_filter)
+  XSPerfAccumulate("burst_generator_full", full)
+
+  XSPerfAccumulate("burst_valid_num", PopCount(valids))
+  XSPerfAccumulate("prefetch_req_fire_by_generator", io.aspPfIO.l2_pf_addr.valid)
+}
+
+// an Accurate Store Prefetcher (ASP)
+class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+  val io = IO(new DCacheBundle {
+    val sbuffer = Vec(EnsbufferWidth, Flipped(ValidIO(new DCacheWordReqWithVaddrAndPfFlag)))
+    val sqEmpty = Input(Bool())
+    val lqEmpty = Input(Bool())
+    val enable = Input(Bool())
+    val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO
+  })
+
+  private val granularity = ASP_GRANULARITY_BITS
+
+  // sequential store detection:
+  // store D, (A); store D, (A + K); store D, (A + 2K) ...
+  val DATAHASHBITS = 16
+  val SEQTHRESHOLD = 32
+  val seqStoreDetected = WireInit(false.B)
+  val prevCycleVaddr = RegInit(0.U(VAddrBits.W))
+  val prevCycleDataHash = RegInit(0.U(DATAHASHBITS.W))
+  val seqKStride = RegInit(0.U(6.W))
+  val seqPatternVec = WireInit(VecInit(List.fill(EnsbufferWidth)(false.B)))
+  val seqPatternCnt = RegInit(0.U((log2Up(SEQTHRESHOLD) + 1).W))
+  val sbufferFire = Cat(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire))).orR
+  val sbufferFireCnt = PopCount(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire)))
+  val validKStride = (seqKStride === 1.U || seqKStride === 2.U || seqKStride === 4.U || seqKStride === 8.U)
+
+  for (i <- 0 until EnsbufferWidth) {
+    when (io.sbuffer(i).fire) {
+      val thisCycleVaddr = io.sbuffer(i).bits.vaddr
+      val thisCycleDataHash = io.sbuffer(i).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+      prevCycleVaddr := thisCycleVaddr
+      prevCycleDataHash := thisCycleDataHash
+
+      if (i == 0) {
+        seqKStride := thisCycleVaddr - prevCycleVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - prevCycleVaddr) === seqKStride) &&
+                            (prevCycleDataHash === thisCycleDataHash) &&
+                            (PopCount(io.sbuffer(i).bits.mask) === seqKStride)
+      } else {
+        val lastLoopVaddr = WireInit(prevCycleVaddr)
+        val lastLoopDataHash = WireInit(prevCycleDataHash)
+        for ( j <- 0 until i ) {
+          when (io.sbuffer(j).fire) {
+            lastLoopVaddr := io.sbuffer(j).bits.vaddr
+            lastLoopDataHash := io.sbuffer(j).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+          }
+        }
+        seqKStride := thisCycleVaddr - lastLoopVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - lastLoopVaddr) === seqKStride) &&
+                            (lastLoopDataHash === thisCycleDataHash) &&
+                            (PopCount(io.sbuffer(i).bits.mask) === seqKStride)
+      }
+    } .otherwise {
+      seqPatternVec(i) := true.B
+    }
+  }
+
+  when (sbufferFire) {
+    when (Cat(seqPatternVec).andR) {
+      seqPatternCnt := Mux(seqPatternCnt >= SEQTHRESHOLD.U, seqPatternCnt, seqPatternCnt + sbufferFireCnt)
+    } .otherwise {
+      seqPatternCnt := 0.U
+    }
+  }
+  when (seqPatternCnt >= SEQTHRESHOLD.U && validKStride) {
+    seqStoreDetected := true.B
+  } .otherwise {
+    seqStoreDetected := false.B
+  }
+  when (io.sqEmpty) {
+    seqStoreDetected := false.B
+  }
+  io.seqStoreDetected := seqStoreDetected
+
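The detector above flags a sequential-store (memset-like) pattern when three conditions hold over a run of sbuffer writes: a constant stride K in {1, 2, 4, 8} (sb/sh/sw/sd), a repeating 16-bit XOR-folded data hash, and a write mask whose popcount equals K, i.e. each store writes exactly K bytes. A hypothetical software model of that predicate, not the RTL:

```scala
// Sketch of the detection predicate over an already-collected run of stores.
case class St(vaddr: Long, dataHash: Int, maskPopCount: Int)

def isSequentialRun(run: Seq[St], threshold: Int = 32): Boolean = {
  if (run.size < threshold) false
  else {
    val k = run(1).vaddr - run(0).vaddr
    Seq(1L, 2L, 4L, 8L).contains(k) && run.sliding(2).forall {
      case Seq(a, b) =>
        b.vaddr - a.vaddr == k && b.dataHash == a.dataHash && b.maskPopCount == k
      case _ => true
    }
  }
}

// A memset loop using sd: stride 8, identical data, 8-byte masks => detected.
val memset = (0 until 32).map(i => St(0x80000000L + 8L * i, dataHash = 0xBEEF, maskPopCount = 8))
isSequentialRun(memset) // true
```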
+  // generator
+  val generator = Module(new ASPBurstGenerator)
+
+  generator.io.alloc := false.B
+  generator.io.vaddr := 0.U
+  generator.io.aspPfIO <> io.aspPfIO
+
+  // prefetch depth, tunable from software via Constantin
+  val depthSW = Wire(UInt(10.W))
+  depthSW := Constantin.createRecord("ASP_DEPTH_SW" + p(XSCoreParamsKey).HartId.toString, initValue = 16)
+
+  // The larger the store's access size, the higher the bandwidth at which the store queue
+  // writes into the sbuffer; the sbuffer fills faster, so a larger prefetch distance is needed.
+  val depth = LookupTreeDefault(seqKStride, depthSW, List(
+    1.U -> (depthSW >> 2), // memset using sb
+    2.U -> (depthSW >> 1), // memset using sh
+    4.U -> depthSW,        // memset using sw
+    8.U -> (depthSW << 1)  // memset using sd
+  ))
+
+  val prefetchVaddr = (0 until EnsbufferWidth).map(i => get_block_addr(io.sbuffer(i).bits.vaddr) + Cat(depth, 0.U(log2Up(dcacheParameters.blockBytes).W)))
+  for (i <- 0 until EnsbufferWidth) {
+    when (io.enable) {
+      if (i == 0) {
+        when(io.sbuffer(i).fire) {
+          generator.io.alloc := seqStoreDetected && io.lqEmpty
+          generator.io.vaddr := prefetchVaddr(0)
+        }
+      } else {
+        when(io.sbuffer(i).fire) {
+          generator.io.alloc := seqStoreDetected && io.lqEmpty
+          when (!same_granularity_addr(prefetchVaddr(i), prefetchVaddr(i - 1), granularity)) {
+            generator.io.vaddr := prefetchVaddr(i)
+          }
+        }
+      }
+    }
+  }
+
+  XSPerfAccumulate("seqStoreDetected", seqStoreDetected)
 }
\ No newline at end of file
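A worked example of the depth table with the default ASP_DEPTH_SW of 16: wider stores fill the sbuffer faster, so the prefetch distance grows with the detected stride.

```scala
val depthSW = 16
val depthByStride = Map(
  1 -> (depthSW >> 2), // sb:  4 blocks ahead
  2 -> (depthSW >> 1), // sh:  8 blocks ahead
  4 ->  depthSW,       // sw: 16 blocks ahead
  8 -> (depthSW << 1)  // sd: 32 blocks ahead
)
// With 64-byte blocks, an sd memset around 0x80000000 prefetches the block
// 32 * 64 = 2048 bytes (two full 1 KiB ASP regions) ahead of the store stream:
val pfVaddr = 0x80000000L + depthByStride(8) * 64 // 0x80000800
```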