From 86b30b2fdff3c60f7532687469ae5b550da93209 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Wed, 18 Sep 2024 14:29:55 +0800
Subject: [PATCH 01/12] perf(sbuffer): fix sbuffer enq ready

Previously, the sbuffer was ready only when it had empty entries. When there
are no empty entries but the incoming requests from the sq can be merged into
already-allocated entries, the sbuffer would still refuse the requests, so it
could not run at full throughput.
---
 src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
index 0d53f5ac45..e1dcc9123e 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
@@ -385,8 +385,8 @@ class Sbuffer(implicit p: Parameters)
   val do_uarch_drain = GatedValidRegNext(forward_need_uarch_drain) || GatedValidRegNext(GatedValidRegNext(merge_need_uarch_drain))
   XSPerfAccumulate("do_uarch_drain", do_uarch_drain)

-  io.in(0).ready := firstCanInsert
-  io.in(1).ready := secondCanInsert && io.in(0).ready
+  io.in(0).ready := firstCanInsert || mergeVec(0).orR
+  io.in(1).ready := (secondCanInsert || mergeVec(1).orR) && io.in(0).ready

   for (i <- 0 until EnsbufferWidth) {
     // train
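A note on the change above: the merged condition can be distilled into a minimal,
self-contained Chisel sketch (hypothetical names, not XiangShan code). An enqueue
port is ready when a free entry exists or when the request can be merged into an
already-allocated entry, so a full buffer no longer blocks mergeable stores:

import chisel3._
import chisel3.util._

class MergeAwareEnq(numEntries: Int, tagBits: Int) extends Module {
  val io = IO(new Bundle {
    // the request carries a line tag, standing in for the sbuffer's ptag match
    val in = Flipped(Decoupled(UInt(tagBits.W)))
  })
  val valids = RegInit(VecInit(Seq.fill(numEntries)(false.B)))
  val tags   = Reg(Vec(numEntries, UInt(tagBits.W)))

  val hasFree  = !valids.asUInt.andR
  // the request hits an allocated entry with the same tag, so it can be merged
  val mergeHit = (valids zip tags).map { case (v, t) => v && t === io.in.bits }.reduce(_ || _)

  io.in.ready := hasFree || mergeHit // ready on a free entry OR a mergeable request
  when(io.in.fire && !mergeHit) {    // allocate a new entry only when no merge happened
    val idx = PriorityEncoder(~valids.asUInt)
    valids(idx) := true.B
    tags(idx)   := io.in.bits
  }
}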
From fcee98f8db1f1e20fb000a7b7fb7f4eda8544202 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Wed, 18 Sep 2024 14:34:36 +0800
Subject: [PATCH 02/12] perf(sbuffer): opt specially in memset pattern

If a memset is detected, let each newly allocated sbuffer entry wait for 32
cycles before writing to the dcache. (When a memset saturates the write
bandwidth, at least two sb instructions are executed per cycle, writing 2
bytes, so it takes 32 cycles to fill a cacheline.) This helps improve sbuffer
utilization.
---
 .../scala/xiangshan/mem/sbuffer/Sbuffer.scala | 37 ++++++++++++++++---
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
index e1dcc9123e..05519b6515 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
@@ -61,6 +61,9 @@ trait HasSbufferConst extends HasXSParameter {
   val VWordsWidth: Int = log2Up(CacheLineVWords)
   val VWordWidth: Int = log2Up(VDataBytes)
   val VWordOffsetWidth: Int = PAddrBits - VWordWidth
+
+  val FullWriteMaxWaitCycles = CacheLineBytes / EnsbufferWidth
+  val FullWriteMaxWaitBits = log2Up(FullWriteMaxWaitCycles) + 1
 }

 class SbufferEntryState (implicit p: Parameters) extends SbufferBundle {
@@ -206,6 +209,9 @@ class Sbuffer(implicit p: Parameters)
     val force_write = Input(Bool())
   })

+  println("Sbuffer FullWriteMaxWaitBits: " + FullWriteMaxWaitBits)
+  println("Sbuffer FullWriteMaxWaitCycles: " + FullWriteMaxWaitCycles)
+
   val dataModule = Module(new SbufferData)
   dataModule.io.writeReq <> DontCare
   val prefetcher = Module(new StorePfWrapper())
@@ -220,6 +226,7 @@ class Sbuffer(implicit p: Parameters)
   val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U.asTypeOf(new SbufferEntryState))))
   val cohCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(EvictCountBits.W))))
   val missqReplayCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(MissqReplayCountBits.W))))
+  val waitCntBeforeFull = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(FullWriteMaxWaitBits.W))))

   val sbuffer_out_s0_fire = Wire(Bool())

@@ -312,7 +319,7 @@ class Sbuffer(implicit p: Parameters)
   val activeMask = VecInit(stateVec.map(s => s.isActive()))
   val validMask = VecInit(stateVec.map(s => s.isValid()))
-  val drainIdx = PriorityEncoder(activeMask)
+  val drainIdx = Wire(UInt(SbufferIndexWidth.W))

   val inflightMask = VecInit(stateVec.map(s => s.isInflight()))

@@ -437,6 +444,7 @@
       // missqReplayCount(insertIdx) := 0.U
       ptag(entryIdx) := reqptag
       vtag(entryIdx) := reqvtag // update vtag if a new sbuffer line is allocated
+      waitCntBeforeFull(entryIdx) := FullWriteMaxWaitCycles.U
     }
   })

@@ -468,6 +476,8 @@
     })
   }

+  waitCntBeforeFull.foreach(x => x := Mux(x.orR, x - 1.U, x))
+
   for(((in, vwordOffset), i) <- io.in.zip(Seq(firstWord, secondWord)).zipWithIndex){
     writeReq(i).valid := in.fire && in.bits.vecValid
     writeReq(i).bits.vwordOffset := vwordOffset
@@ -606,11 +616,22 @@

   val sbuffer_out_s1_ready = Wire(Bool())

+  // ---------------------------------------------------------------------------
+  // Memset Case
+  // ---------------------------------------------------------------------------
+
+  val memSet_needDrain = io.memSetPattenDetected
+  val memSetActiveMask = VecInit(stateVec.zipWithIndex.map{case (s, idx) => {
+    s.isDcacheReqCandidate() && Mux(waitCntBeforeFull(idx).orR, mask(idx).asUInt.andR, true.B)
+  }})
+
+  drainIdx := Mux(memSet_needDrain, PriorityEncoder(memSetActiveMask), PriorityEncoder(activeMask))
+
   // ---------------------------------------------------------------------------
   // sbuffer_out_s0
   // ---------------------------------------------------------------------------

-  val need_drain = needDrain(sbuffer_state)
+  val need_drain = needDrain(sbuffer_state) || memSet_needDrain
   val need_replace = do_eviction || (sbuffer_state === x_replace)
   val sbuffer_out_s0_evictionIdx = Mux(missqReplayHasTimeOut,
     missqReplayTimeOutIdx,
@@ -620,14 +641,18 @@
     )
   )

+  val sbuffer_out_s0_can_evict = Mux(
+    memSet_needDrain,
+    memSetActiveMask(sbuffer_out_s0_evictionIdx),
+    candidateVec(sbuffer_out_s0_evictionIdx)
+  )
+
   // If there is a inflight dcache req which has same ptag with sbuffer_out_s0_evictionIdx's ptag,
   // current eviction should be blocked.
   val sbuffer_out_s0_valid = missqReplayHasTimeOut ||
-    stateVec(sbuffer_out_s0_evictionIdx).isDcacheReqCandidate() &&
-    (need_drain || cohHasTimeOut || need_replace)
+    sbuffer_out_s0_can_evict && (need_drain || cohHasTimeOut || need_replace)
   assert(!(
-    stateVec(sbuffer_out_s0_evictionIdx).isDcacheReqCandidate() &&
-    !noSameBlockInflight(sbuffer_out_s0_evictionIdx)
+    sbuffer_out_s0_can_evict && !noSameBlockInflight(sbuffer_out_s0_evictionIdx)
   ))
   val sbuffer_out_s0_cango = sbuffer_out_s1_ready
   sbuffer_out_s0_fire := sbuffer_out_s0_valid && sbuffer_out_s0_cango
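The 32-cycle figure follows directly from the constants introduced above:
FullWriteMaxWaitCycles = CacheLineBytes / EnsbufferWidth = 64 / 2 = 32, i.e. a
byte-wide memset writing 2 bytes per cycle needs 32 cycles to cover a 64-byte
line. A minimal sketch of one entry's grace counter (hypothetical standalone
module, assuming those defaults):

import chisel3._
import chisel3.util._

class EvictWait(lineBytes: Int = 64, enqWidth: Int = 2) extends Module {
  val maxWait = lineBytes / enqWidth  // 64 / 2 = 32 cycles
  val io = IO(new Bundle {
    val alloc    = Input(Bool())      // a new sbuffer entry is allocated
    val maskFull = Input(Bool())      // every byte of the line has been written
    val mayEvict = Output(Bool())     // the entry may be drained to the dcache
  })
  val cnt = RegInit(0.U(log2Up(maxWait + 1).W))
  when(io.alloc) {
    cnt := maxWait.U                  // restart the grace period on allocation
  }.elsewhen(cnt =/= 0.U) {
    cnt := cnt - 1.U                  // count down once per cycle
  }
  // drain as soon as the line is fully written, or once the wait expires
  io.mayEvict := io.maskFull || cnt === 0.U
}

This mirrors the memSetActiveMask condition above: while waitCntBeforeFull is
nonzero, only fully-written lines are eviction candidates.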
From a7ac43f30a361d0f056d3a51fc40268edc030bd6 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Wed, 18 Sep 2024 15:23:02 +0800
Subject: [PATCH 03/12] perf(sq): A more accurate MemSet detection

---
 .../scala/xiangshan/backend/MemBlock.scala    |  1 +
 .../cache/dcache/DCacheWrapper.scala          |  2 +
 .../cache/dcache/mainpipe/MissQueue.scala     | 15 +----
 .../xiangshan/mem/lsqueue/LSQWrapper.scala    | 22 ++++----
 .../xiangshan/mem/lsqueue/StoreQueue.scala    | 54 +++++++++++++++++++
 5 files changed, 71 insertions(+), 23 deletions(-)

diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala
index ea92f330dc..671752ac0d 100644
--- a/src/main/scala/xiangshan/backend/MemBlock.scala
+++ b/src/main/scala/xiangshan/backend/MemBlock.scala
@@ -1371,6 +1371,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   io.mem_to_ooo.sqDeqPtr := lsq.io.sqDeqPtr
   io.mem_to_ooo.lqDeqPtr := lsq.io.lqDeqPtr
   lsq.io.tl_d_channel <> dcache.io.lsu.tl_d_channel
+  lsq.io.seqStoreDetected <> dcache.io.seqStoreDetected

   // LSQ to store buffer
   lsq.io.sbuffer <> sbuffer.io.in
diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
index 0a48aab932..58ca7eab7b 100644
--- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
+++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
@@ -781,6 +781,7 @@ class DCacheIO(implicit p: Parameters) extends DCacheBundle {
   val pf_ctrl = Output(new PrefetchControlBundle)
   val force_write = Input(Bool())
   val sms_agt_evict_req = DecoupledIO(new AGTEvictReq)
+  val seqStoreDetected = Input(Bool())
   val debugTopDown = new DCacheTopDownIO
   val debugRolling = Flipped(new RobDebugRollingIO)
   val l2_hint = Input(Valid(new L2ToL1Hint()))
@@ -981,6 +982,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
   missQueue.io.hartId := io.hartId
   missQueue.io.l2_pf_store_only := RegNext(io.l2_pf_store_only, false.B)
   missQueue.io.debugTopDown <> io.debugTopDown
+  missQueue.io.seqStoreDetected <> io.seqStoreDetected
   missQueue.io.l2_hint <> RegNext(io.l2_hint)
   missQueue.io.mainpipe_info := mainPipe.io.mainpipe_info
   mainPipe.io.refill_info := missQueue.io.refill_info
diff --git a/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala b/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
index f943d572a0..859330a41a 100644
--- a/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
+++ b/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
@@ -869,6 +869,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC
     val l2_pf_store_only = Input(Bool())

     val memSetPattenDetected = Output(Bool())
+    val seqStoreDetected = Input(Bool())
     val lqEmpty = Input(Bool())

     val prefetch_info = new Bundle {
@@ -941,19 +942,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC
   assert(PopCount(Seq(alloc && io.req.valid, merge && io.req.valid)) <= 1.U, "allocate and merge a mshr in same cycle!")

-  val source_except_load_cnt = RegInit(0.U(10.W))
-  when(VecInit(req_mshr_handled_vec).asUInt.orR || req_pipeline_reg_handled) {
-    when(io.req.bits.isFromLoad) {
-      source_except_load_cnt := 0.U
-    }.otherwise {
-      when(io.req.bits.isFromStore) {
-        source_except_load_cnt := source_except_load_cnt + 1.U
-      }
-    }
-  }
-  val Threshold = 8
-  val memSetPattenDetected = GatedValidRegNext((source_except_load_cnt >= Threshold.U) && io.lqEmpty)
-
+  val memSetPattenDetected = GatedValidRegNext(io.seqStoreDetected && io.lqEmpty)
   io.memSetPattenDetected := memSetPattenDetected

   val forwardInfo_vec = VecInit(entries.map(_.io.forwardInfo))
diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
index d369a57a62..832bac4dbc 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
@@ -122,6 +122,7 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
   val flushSbuffer = new SbufferFlushBundle
   val force_write = Output(Bool())
   val lqEmpty = Output(Bool())
+  val seqStoreDetected = Output(Bool())

   // top-down
   val debugTopDown = new LoadQueueTopDownIO
@@ -177,16 +178,17 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
   storeQueue.io.vecmmioStout <> io.vecmmioStout
   storeQueue.io.rob <> io.rob
   storeQueue.io.exceptionAddr.isStore := DontCare
-  storeQueue.io.sqCancelCnt <> io.sqCancelCnt
-  storeQueue.io.sqDeq <> io.sqDeq
-  storeQueue.io.sqEmpty <> io.sqEmpty
-  storeQueue.io.sqFull <> io.sqFull
-  storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
-  storeQueue.io.force_write <> io.force_write
-  storeQueue.io.cmoOpReq <> io.cmoOpReq
-  storeQueue.io.cmoOpResp <> io.cmoOpResp
-  storeQueue.io.flushSbuffer <> io.flushSbuffer
-  storeQueue.io.maControl <> io.maControl
+  storeQueue.io.sqCancelCnt      <> io.sqCancelCnt
+  storeQueue.io.sqDeq            <> io.sqDeq
+  storeQueue.io.sqEmpty          <> io.sqEmpty
+  storeQueue.io.sqFull           <> io.sqFull
+  storeQueue.io.forward          <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
+  storeQueue.io.force_write      <> io.force_write
+  storeQueue.io.cmoOpReq         <> io.cmoOpReq
+  storeQueue.io.cmoOpResp        <> io.cmoOpResp
+  storeQueue.io.flushSbuffer     <> io.flushSbuffer
+  storeQueue.io.maControl        <> io.maControl
+  storeQueue.io.seqStoreDetected <> io.seqStoreDetected

   /* <------- DANGEROUS: Don't change sequence here ! -------> */
diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
index 737183d766..0b6f9402ce 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
@@ -196,6 +196,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     val sqDeq = Output(UInt(log2Ceil(EnsbufferWidth + 1).W))
     val force_write = Output(Bool())
     val maControl = Flipped(new StoreMaBufToSqControlIO)
+    val seqStoreDetected = Output(Bool())
   })

   println("StoreQueue: size:" + StoreQueueSize)
@@ -1019,6 +1020,59 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     }
   }

+  // sequential store detection:
+  // store D, (A); store D, (A + K), store D, (A + 2K) ...
+  val DATAHASHBITS = 16
+  val SEQTHRESHOLD = 64
+  val seqStoreDetected = WireInit(false.B)
+  val prevCycleVaddr = RegInit(0.U(VAddrBits.W))
+  val prevCycleDataHash = RegInit(0.U(DATAHASHBITS.W))
+  val seqKStride = RegInit(0.U(6.W))
+  val seqPatternVec = WireInit(VecInit(List.fill(EnsbufferWidth)(false.B)))
+  val seqPatternCnt = RegInit(0.U(log2Up(SEQTHRESHOLD+1).W))
+  val sbufferFire = Cat(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire))).orR
+  val validKStride = (seqKStride === 1.U || seqKStride === 2.U || seqKStride === 4.U || seqKStride === 8.U)
+
+  for (i <- 0 until EnsbufferWidth) {
+    when(io.sbuffer(i).fire) {
+      val thisCycleVaddr = io.sbuffer(i).bits.vaddr
+      val thisCycleDataHash = io.sbuffer(i).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+      prevCycleVaddr := thisCycleVaddr
+      prevCycleDataHash := thisCycleDataHash
+
+      if(i == 0) {
+        seqKStride := thisCycleVaddr - prevCycleVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - prevCycleVaddr) === seqKStride) &&
+                            (prevCycleDataHash === thisCycleDataHash)
+      }else {
+        val lastLoopVaddr = io.sbuffer(i - 1).bits.vaddr
+        val lastLoopDataHash = io.sbuffer(i - 1).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+        seqKStride := thisCycleVaddr - lastLoopVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - lastLoopVaddr) === seqKStride) &&
+                            (lastLoopDataHash === thisCycleDataHash)
+      }
+    }.otherwise {
+      seqPatternVec(i) := true.B
+    }
+  }
+
+  when(sbufferFire) {
+    when(Cat(seqPatternVec).andR) {
+      seqPatternCnt := Mux(seqPatternCnt === SEQTHRESHOLD.U, seqPatternCnt, seqPatternCnt + 1.U)
+    }.otherwise {
+      seqPatternCnt := 0.U
+    }
+  }
+  when(seqPatternCnt === SEQTHRESHOLD.U && validKStride) {
+    seqStoreDetected := true.B
+  }.otherwise {
+    seqStoreDetected := false.B
+  }
+  when(io.sqEmpty) {
+    seqStoreDetected := false.B
+  }
+  io.seqStoreDetected := seqStoreDetected
+
   // All vector instruction uop normally dequeue, but the Uop after the exception is raised does not write to the 'sbuffer'.
   // Flags are used to record whether there are any exceptions when the queue is displayed.
   // This is determined each time a write is made to the 'databuffer', prevent subsequent uop of the same instruction from writing to the 'dataBuffer'.
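The detection added to StoreQueue above relies on two cheap per-enqueue checks:
the vaddr must advance by a constant stride K, and the store data must repeat,
approximated by XOR-folding the data down to a 16-bit hash. A standalone sketch
of the two helpers (hypothetical names; assumes the data width is a multiple of
the hash width):

import chisel3._
import chisel3.util._

object SeqStoreCheck {
  // fold store data into a small hash so "same data" is one cheap compare
  def dataHash(data: UInt, hashBits: Int = 16): UInt =
    data.asTypeOf(Vec(data.getWidth / hashBits, UInt(hashBits.W))).reduce(_ ^ _)

  // one step of the pattern check: constant stride and a repeating data hash
  def seqStep(curVaddr: UInt, prevVaddr: UInt, stride: UInt,
              curHash: UInt, prevHash: UInt): Bool =
    ((curVaddr - prevVaddr) === stride) && (curHash === prevHash)
}

SEQTHRESHOLD consecutive matches with a stride of 1, 2, 4 or 8 bytes (sb/sh/sw/sd)
flag the memset-like stream, and the flag is dropped as soon as the sq drains empty.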
From f810f99358d0d07b7e8174060f968f3a9f217615 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Mon, 23 Sep 2024 17:13:28 +0800
Subject: [PATCH 04/12] perf(memblock): add ASP store prefetcher

The ASP (Accurate Store Prefetcher) issues store prefetches to the L2 and
only works in the MemSet pattern.
---
 src/main/scala/xiangshan/Parameters.scala     |   2 +
 .../scala/xiangshan/backend/MemBlock.scala    |  36 ++-
 .../scala/xiangshan/cache/mmu/Repeater.scala  |  27 +-
 .../xiangshan/mem/lsqueue/LSQWrapper.scala    |   2 +
 .../xiangshan/mem/lsqueue/StoreQueue.scala    |  71 ++---
 .../mem/sbuffer/StorePrefetchBursts.scala     | 279 +++++++++++++++++-
 6 files changed, 336 insertions(+), 81 deletions(-)

diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala
index 5822f5fb14..a1d63ec606 100644
--- a/src/main/scala/xiangshan/Parameters.scala
+++ b/src/main/scala/xiangshan/Parameters.scala
@@ -252,6 +252,7 @@ case class XSCoreParameters
   EnableAtCommitMissTrigger: Boolean = true,
   EnableStorePrefetchSMS: Boolean = false,
   EnableStorePrefetchSPB: Boolean = false,
+  EnableStorePrefetchASP: Boolean = true,
   HasCMO: Boolean = true,
   MMUAsidLen: Int = 16, // max is 16, 0 is not supported now
   MMUVmidLen: Int = 14,
@@ -801,6 +802,7 @@ trait HasXSParameter {
   def EnableAtCommitMissTrigger = coreParams.EnableAtCommitMissTrigger
   def EnableStorePrefetchSMS = coreParams.EnableStorePrefetchSMS
   def EnableStorePrefetchSPB = coreParams.EnableStorePrefetchSPB
+  def EnableStorePrefetchASP = coreParams.EnableStorePrefetchASP
   def HasCMO = coreParams.HasCMO && p(EnableCHI)
   require(LoadPipelineWidth == backendParams.LdExuCnt, "LoadPipelineWidth must be equal exuParameters.LduCnt!")
   require(StorePipelineWidth == backendParams.StaCnt, "StorePipelineWidth must be equal exuParameters.StuCnt!")
diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala
index 671752ac0d..11c7ca4411 100644
--- a/src/main/scala/xiangshan/backend/MemBlock.scala
+++ b/src/main/scala/xiangshan/backend/MemBlock.scala
@@ -530,13 +530,31 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   // load/store prefetch to l2 cache
   prefetcherOpt.foreach(sms_pf => {
     l1PrefetcherOpt.foreach(l1_pf => {
-      val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
       val l1_pf_to_l2 = DelayNWithValid(l1_pf.io.l2_req, 2)
-
-      outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid
-      outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.addr, sms_pf_to_l2.bits.addr)
-      outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.source, sms_pf_to_l2.bits.source)
+      val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
+      val asp_pf_to_l2 = DelayNWithValid(lsq.io.aspPfIO.l2_pf_addr, 2)
+
+      outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid || asp_pf_to_l2.valid
+      outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(
+        l1_pf_to_l2.valid,
+        l1_pf_to_l2.bits.addr,
+        Mux(
+          sms_pf_to_l2.valid,
+          sms_pf_to_l2.bits.addr,
+          asp_pf_to_l2.bits.addr
+        )
+      )
+      outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(
+        l1_pf_to_l2.valid,
+        l1_pf_to_l2.bits.source,
+        Mux(
+          sms_pf_to_l2.valid,
+          sms_pf_to_l2.bits.source,
+          asp_pf_to_l2.bits.source
+        )
+      )
       outer.l2_pf_sender_opt.get.out.head._1.l2_pf_en := RegNextN(io.ooo_to_mem.csrCtrl.l2_pf_enable, 2, Some(true.B))
+      outer.l2_pf_sender_opt.get.out.head._1.needT := !l1_pf_to_l2.valid && !sms_pf_to_l2.valid && asp_pf_to_l2.valid

       sms_pf.io.enable := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable, 2, Some(false.B))
@@ -585,14 +603,14 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   // dtlb
   val dtlb_ld_tlb_ld = Module(new TLBNonBlock(LduCnt + HyuCnt + 1, 2, ldtlbParams))
   val dtlb_st_tlb_st = Module(new TLBNonBlock(StaCnt, 1, sttlbParams))
-  val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(2, 2, pftlbParams))
+  val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(3, 2, pftlbParams))
   val dtlb_ld = Seq(dtlb_ld_tlb_ld.io)
   val dtlb_st = Seq(dtlb_st_tlb_st.io)
   val dtlb_prefetch = Seq(dtlb_prefetch_tlb_prefetch.io)
   /* tlb vec && constant variable */
   val dtlb = dtlb_ld ++ dtlb_st ++ dtlb_prefetch
   val (dtlb_ld_idx, dtlb_st_idx, dtlb_pf_idx) = (0, 1, 2)
-  val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 2) // (load + hyu + stream pf, store, sms+l2bop)
+  val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 3) // (load + hyu + stream pf, store, sms+l2bop+asp)
   val DTlbSize = TlbSubSizeVec.sum
   val TlbStartVec = TlbSubSizeVec.scanLeft(0)(_ + _).dropRight(1)
   val TlbEndVec = TlbSubSizeVec.scanLeft(0)(_ + _).drop(1)
@@ -628,7 +646,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
     replace_st.io.apply_sep(dtlb_st.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
   }
   if (pftlbParams.outReplace) {
-    val replace_pf = Module(new TlbReplace(2, pftlbParams))
+    val replace_pf = Module(new TlbReplace(3, pftlbParams))
     replace_pf.io.apply_sep(dtlb_prefetch.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
   }
 }
@@ -1086,6 +1104,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   val StreamDTLBPortIndex = TlbStartVec(dtlb_ld_idx) + LduCnt + HyuCnt
   val PrefetcherDTLBPortIndex = TlbStartVec(dtlb_pf_idx)
   val L2toL1DLBPortIndex = TlbStartVec(dtlb_pf_idx) + 1
+  val ASPDTLBPortIndex = TlbStartVec(dtlb_pf_idx) + 2
   prefetcherOpt match {
     case Some(pf) => dtlb_reqs(PrefetcherDTLBPortIndex) <> pf.io.tlb_req
     case None =>
@@ -1104,6 +1123,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   dtlb_reqs(L2toL1DLBPortIndex).resp.ready := true.B
   io.l2_pmp_resp := pmp_check(L2toL1DLBPortIndex).resp

+  dtlb_reqs(ASPDTLBPortIndex) <> lsq.io.aspPfIO.tlb_req
   // StoreUnit
   for (i <- 0 until StdCnt) {
     stdExeUnits(i).io.flush <> redirect
diff --git a/src/main/scala/xiangshan/cache/mmu/Repeater.scala b/src/main/scala/xiangshan/cache/mmu/Repeater.scala
index 98d452f356..faef208b80 100644
--- a/src/main/scala/xiangshan/cache/mmu/Repeater.scala
+++ b/src/main/scala/xiangshan/cache/mmu/Repeater.scala
@@ -219,14 +219,23 @@ class PTWFilterEntry(Width: Int, Size: Int, hasHint: Boolean = false)(implicit p
     canenq(1) := !(Cat(v.drop(Size/2)).andR)
     enqidx(1) := firstValidIndex(v.drop(Size/2), false.B) + (Size/2).U
   } else if (Width == 3) {
-    require(Size == 16, s"load filter Size ($Size) should be 16")
-    canenq(0) := !(Cat(v.take(8)).andR)
-    enqidx(0) := firstValidIndex(v.take(8), false.B)
-    canenq(1) := !(Cat(v.drop(8).take(4)).andR)
-    enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
-    // four entries for prefetch
-    canenq(2) := !(Cat(v.drop(12)).andR)
-    enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
+    require(Size == 16 || Size == 8, s"load/prefetcher filter Size ($Size) should be 16/8")
+    if (Size == 16) {
+      canenq(0) := !(Cat(v.take(8)).andR)
+      enqidx(0) := firstValidIndex(v.take(8), false.B)
+      canenq(1) := !(Cat(v.drop(8).take(4)).andR)
+      enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
+      // four entries for prefetch
+      canenq(2) := !(Cat(v.drop(12)).andR)
+      enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
+    } else {
+      canenq(0) := !(Cat(v.take(4)).andR)
+      enqidx(0) := firstValidIndex(v.take(4), false.B)
+      canenq(1) := !(Cat(v.drop(4).take(2)).andR)
+      enqidx(1) := firstValidIndex(v.drop(4).take(2), false.B) + 4.U
+      canenq(2) := !(Cat(v.drop(6)).andR)
+      enqidx(2) := firstValidIndex(v.drop(6), false.B) + 6.U
+    }
   } else if (Width == 4) {
     require(Size == 16, s"load filter Size ($Size) should be 16")
     for (i <- 0 until Width) {
@@ -359,7 +368,7 @@ class PTWNewFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameter
   })

   val prefetch_filter = VecInit(Seq.fill(1) {
-    val prefetch_entry = Module(new PTWFilterEntry(Width = 2, Size = prefetchfiltersize))
+    val prefetch_entry = Module(new PTWFilterEntry(Width = 3, Size = prefetchfiltersize))
     prefetch_entry.io
   })
diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
index 832bac4dbc..827418b16d 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
@@ -123,6 +123,7 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
   val force_write = Output(Bool())
   val lqEmpty = Output(Bool())
   val seqStoreDetected = Output(Bool())
+  val aspPfIO = new AspPfIO

   // top-down
   val debugTopDown = new LoadQueueTopDownIO
@@ -189,6 +190,7 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
   storeQueue.io.flushSbuffer     <> io.flushSbuffer
   storeQueue.io.maControl        <> io.maControl
   storeQueue.io.seqStoreDetected <> io.seqStoreDetected
+  storeQueue.io.aspPfIO          <> io.aspPfIO

   /* <------- DANGEROUS: Don't change sequence here ! -------> */
diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
index 0b6f9402ce..a0eb796cb3 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
@@ -26,12 +26,14 @@ import utility._
 import utils._
 import xiangshan._
 import xiangshan.cache._
+import xiangshan.cache.mmu.TlbRequestIO
 import xiangshan.cache.{DCacheLineIO, DCacheWordIO, MemoryOpConstants}
 import xiangshan.backend._
 import xiangshan.backend.rob.{RobLsqIO, RobPtr}
 import xiangshan.backend.Bundles.{DynInst, MemExuOutput}
 import xiangshan.backend.decode.isa.bitfield.{Riscv32BitInst, XSInstBitFields}
 import xiangshan.backend.fu.FuConfig._
+import xiangshan.mem.prefetch.L2PrefetchReq
 import xiangshan.backend.fu.FuType
 import xiangshan.ExceptionNO._
 import coupledL2.{CMOReq, CMOResp}
@@ -58,6 +60,11 @@ class SqEnqIO(implicit p: Parameters) extends MemBlockBundle {
   val resp = Vec(LSQEnqWidth, Output(new SqPtr))
 }

+class AspPfIO(implicit p: Parameters) extends MemBlockBundle {
+  val tlb_req = new TlbRequestIO(nRespDups = 2)
+  val l2_pf_addr = ValidIO(new L2PrefetchReq())
+}
+
 class DataBufferEntry (implicit p: Parameters) extends DCacheBundle {
   val addr = UInt(PAddrBits.W)
   val vaddr = UInt(VAddrBits.W)
@@ -197,10 +204,14 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     val force_write = Output(Bool())
     val maControl = Flipped(new StoreMaBufToSqControlIO)
     val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO
   })

   println("StoreQueue: size:" + StoreQueueSize)

+  // ASP prefetcher
+  val asp = Module(new ASP)
+
   // data modules
   val uop = Reg(Vec(StoreQueueSize, new DynInst))
   // val data = Reg(Vec(StoreQueueSize, new LsqEntry))
@@ -1020,58 +1031,14 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     }
   }

-  // sequential store detection:
-  // store D, (A); store D, (A + K), store D, (A + 2K) ...
-  val DATAHASHBITS = 16
-  val SEQTHRESHOLD = 64
-  val seqStoreDetected = WireInit(false.B)
-  val prevCycleVaddr = RegInit(0.U(VAddrBits.W))
-  val prevCycleDataHash = RegInit(0.U(DATAHASHBITS.W))
-  val seqKStride = RegInit(0.U(6.W))
-  val seqPatternVec = WireInit(VecInit(List.fill(EnsbufferWidth)(false.B)))
-  val seqPatternCnt = RegInit(0.U(log2Up(SEQTHRESHOLD+1).W))
-  val sbufferFire = Cat(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire))).orR
-  val validKStride = (seqKStride === 1.U || seqKStride === 2.U || seqKStride === 4.U || seqKStride === 8.U)
-
-  for (i <- 0 until EnsbufferWidth) {
-    when(io.sbuffer(i).fire) {
-      val thisCycleVaddr = io.sbuffer(i).bits.vaddr
-      val thisCycleDataHash = io.sbuffer(i).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
-      prevCycleVaddr := thisCycleVaddr
-      prevCycleDataHash := thisCycleDataHash
-
-      if(i == 0) {
-        seqKStride := thisCycleVaddr - prevCycleVaddr
-        seqPatternVec(i) := ((thisCycleVaddr - prevCycleVaddr) === seqKStride) &&
-                            (prevCycleDataHash === thisCycleDataHash)
-      }else {
-        val lastLoopVaddr = io.sbuffer(i - 1).bits.vaddr
-        val lastLoopDataHash = io.sbuffer(i - 1).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
-        seqKStride := thisCycleVaddr - lastLoopVaddr
-        seqPatternVec(i) := ((thisCycleVaddr - lastLoopVaddr) === seqKStride) &&
-                            (lastLoopDataHash === thisCycleDataHash)
-      }
-    }.otherwise {
-      seqPatternVec(i) := true.B
-    }
-  }
-
-  when(sbufferFire) {
-    when(Cat(seqPatternVec).andR) {
-      seqPatternCnt := Mux(seqPatternCnt === SEQTHRESHOLD.U, seqPatternCnt, seqPatternCnt + 1.U)
-    }.otherwise {
-      seqPatternCnt := 0.U
-    }
-  }
-  when(seqPatternCnt === SEQTHRESHOLD.U && validKStride) {
-    seqStoreDetected := true.B
-  }.otherwise {
-    seqStoreDetected := false.B
-  }
-  when(io.sqEmpty) {
-    seqStoreDetected := false.B
-  }
-  io.seqStoreDetected := seqStoreDetected
-
+  asp.io.sbuffer.zipWithIndex.foreach {case (s, idx) => {
+    s.valid := io.sbuffer(idx).fire
+    s.bits := io.sbuffer(idx).bits
+  }}
+  asp.io.sqEmpty := io.sqEmpty
+  asp.io.enable := EnableStorePrefetchASP.B
+  io.seqStoreDetected := asp.io.seqStoreDetected
+  io.aspPfIO <> asp.io.aspPfIO

   // All vector instruction uop normally dequeue, but the Uop after the exception is raised does not write to the 'sbuffer'.
   // Flags are used to record whether there are any exceptions when the queue is displayed.
   // This is determined each time a write is made to the 'databuffer', prevent subsequent uop of the same instruction from writing to the 'dataBuffer'.
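The ASPBurstGenerator in the diff below pipelines its TLB requests: a round-robin
arbiter picks a candidate entry (s0), a one-entry register stage (s1) holds the
request until the TLB accepts it, and the response updates the entry (s2). The s1
stage is the standard valid/ready holding-register pattern, shown here as a
generic sketch (hypothetical module, not part of the patch):

import chisel3._
import chisel3.util._

class HoldStage[T <: Data](gen: T) extends Module {
  val io = IO(new Bundle {
    val in  = Flipped(Decoupled(gen))
    val out = Decoupled(gen)
  })
  val valid = RegInit(false.B)
  val bits  = RegEnable(io.in.bits, io.in.fire)
  when(io.out.fire) { valid := false.B } // downstream accepted: slot drains
  when(io.in.fire)  { valid := true.B }  // capture wins if both fire this cycle
  io.out.valid := valid
  io.out.bits  := bits
  // accept a new request when empty, or while draining in the same cycle
  io.in.ready  := !valid || io.out.ready
}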
diff --git a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
index 6455a96fa9..d293ef1fa8 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
@@ -23,6 +23,8 @@ import xiangshan._
 import utils._
 import utility._
 import xiangshan.cache._
+import xiangshan.cache.mmu._
+import xiangshan.mem.prefetch.L2PrefetchReq

 trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCacheParameters {
   // common
@@ -34,24 +36,39 @@ trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCachePar
   val ONLY_ON_MEMSET = false
   val SATURATE_COUNTER_BITS = 7
   val BURST_ENGINE_SIZE = 2
+  val SPB_GRANULARITY_BYTES = 4096
+  val SPB_GRANULARITY_BITS = log2Up(SPB_GRANULARITY_BYTES)
   val SPB_N = 48

   // serializer parameters
   val SERIALIZER_SIZE = 12

+  // asp parameters
+  val LOCK_CYCLE = 2048
+  val LOCK_BITS = log2Up(LOCK_CYCLE) + 1
+  val ASP_GRANULARITY_BYTES = 1024 // 1KB
+  val ASP_GRANULARITY_BITS = log2Up(ASP_GRANULARITY_BYTES)
+
   def block_addr(x: UInt): UInt = {
     val offset = log2Up(dcacheParameters.blockBytes)
     x(x.getWidth - 1, offset)
   }

-  // filter logic (granularity: a page)
-  def same_page_addr(addr0: UInt, addr1: UInt): Bool = {
-    addr0(addr0.getWidth - 1, PAGEOFFSET) === addr1(addr1.getWidth - 1, PAGEOFFSET)
+  // filter logic (granularity specified in args)
+  def same_granularity_addr(addr0: UInt, addr1: UInt, granularity: Int): Bool = {
+    addr0(addr0.getWidth - 1, granularity) === addr1(addr1.getWidth - 1, granularity)
   }
-  def filter_by_page_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], incoming_vaddr: UInt) : Bool = {
+
+  def filter_by_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], incoming_vaddr: UInt, granularity: Int) : Bool = {
     val match_vec = (valid_vec zip data_vec).map{
-      case(v, e_vaddr) => v && same_page_addr(e_vaddr, incoming_vaddr)
+      case(v, e_vaddr) => v && same_granularity_addr(e_vaddr, incoming_vaddr, granularity)
+    }
+    VecInit(match_vec).asUInt.orR
+  }
+
+  def filter_by_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], lock_vec: Vec[Bool], incoming_vaddr: UInt, granularity: Int) : Bool = {
+    val match_vec = (valid_vec zip lock_vec zip data_vec).map{
+      case((v, l), e_vaddr) => (v || l) && same_granularity_addr(e_vaddr, incoming_vaddr, granularity)
     }
     VecInit(match_vec).asUInt.orR
   }
@@ -79,7 +96,7 @@ trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCachePar
 // L1 Store prefetch component
 // an prefetch request generator used by spb to burst some prefetch request to L1 Dcache
-class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+class PrefetchBurstGenerator(granularity: Int, is_store: Boolean)(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
   val io = IO(new DCacheBundle {
     val alloc = Input(Bool())
     val vaddr = Input(UInt(VAddrBits.W))
@@ -99,12 +116,12 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
   val enq_valids = ~(valids.asUInt)
   val full = !(enq_valids.orR)
   val enq_idx = PriorityEncoder(enq_valids)
-  val enq_filter = filter_by_page_addr(valids, datas, io.vaddr)
+  val enq_filter = filter_by_addr(valids, datas, io.vaddr, granularity)

   when(io.alloc && !full && !enq_filter) {
     valids(enq_idx) := true.B
     datas(enq_idx) := io.vaddr
-    pagebits(enq_idx) := io.vaddr(PAGEOFFSET)
+    pagebits(enq_idx) := io.vaddr(granularity)
   }

   XSPerfAccumulate("burst_generator_alloc_success", io.alloc && !full && !enq_filter)
@@ -127,21 +144,21 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
     out_decouple(0).valid := deq_valid
     out_decouple(0).bits := DontCare
     out_decouple(0).bits.vaddr := data
-    out_decouple(1).valid := deq_valid && data_next(PAGEOFFSET) === pg_bit && out_decouple(0).fire
+    out_decouple(1).valid := deq_valid && data_next(granularity) === pg_bit && out_decouple(0).fire
     out_decouple(1).bits := DontCare
     out_decouple(1).bits.vaddr := data_next
     out_decouple.drop(2).foreach { out => out.valid := false.B; out.bits := DontCare }
     when(out_decouple(1).fire) {
       // fired 2 prefetch reqs
       data := data_next_next
-      when(data_next_next(PAGEOFFSET) =/= pg_bit) {
+      when(data_next_next(granularity) =/= pg_bit) {
         // cross page, invalid this entry
         v := false.B
       }
     }.elsewhen(out_decouple(0).fire) {
       // fired 1 prefetch req
       data := data_next
-      when(data_next(PAGEOFFSET) =/= pg_bit) {
+      when(data_next(granularity) =/= pg_bit) {
         // cross page, invalid this entry
         v := false.B
       }
@@ -164,12 +181,14 @@ class StorePrefetchBursts(implicit p: Parameters) extends DCacheModule with HasS
   })
   require(EnsbufferWidth == 2)

+  private val granularity = SPB_GRANULARITY_BITS
+
   // meta for SPB
   val N = SPB_N
   val last_st_block_addr = RegInit(0.U(VAddrBits.W))
   val saturate_counter = RegInit(0.S(SATURATE_COUNTER_BITS.W))
   val store_count = RegInit(0.U((log2Up(N) + 1).W))
-  val burst_engine = Module(new PrefetchBurstGenerator(is_store = true))
+  val burst_engine = Module(new PrefetchBurstGenerator(granularity, true))

   val sbuffer_fire = io.sbuffer_enq.valid
   val sbuffer_vaddr = io.sbuffer_enq.bits.vaddr
@@ -284,4 +303,240 @@ class StorePfWrapper()(implicit p: Parameters) extends DCacheModule with HasStor

   // fire a prefetch req
   io.prefetch_req <> spb.io.prefetch_req
+}
+
+// prefetch request generator used by asp to burst some prefetch request to L2 Cache
+class ASPBurstGenerator(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+  val io = IO(new DCacheBundle {
+    val alloc = Input(Bool())
+    val vaddr = Input(UInt(VAddrBits.W))
+    val aspPfIO = new AspPfIO
+  })
+
+  private val granularity = ASP_GRANULARITY_BITS
+
+  val SIZE = BURST_ENGINE_SIZE
+
+  val valids = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val locks = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val cnts = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U(LOCK_BITS.W)}))
+  val vaddrs = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U.asTypeOf(io.vaddr)}))
+  val paddrs = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U(PAddrBits.W)}))
+  val pa_vs = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val tlb_sent = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+
+  val tlb_req_arb = Module(new RRArbiterInit(new TlbReq, SIZE))
+  val l2_pf_req_arb = Module(new RRArbiterInit(new L2PrefetchReq, SIZE))
+
+  // enq
+  val enq_valids = ~(valids.asUInt)
+  val full = !(enq_valids.orR)
+  val enq_idx = PriorityEncoder(enq_valids)
+  val enq_filter = filter_by_addr(valids, vaddrs, locks, io.vaddr, granularity)
+
+  for (i <- 0 until SIZE) {
+    when (!valids(i) && locks(i) && cnts(i).orR) {
+      cnts(i) := cnts(i) - 1.U
+    }
+
+    when (!valids(i) && locks(i) && !cnts(i).orR) {
+      locks(i) := false.B
+    }
+  }
+
+  when(io.alloc && !full && !enq_filter) {
+    valids(enq_idx) := true.B
+    locks(enq_idx) := false.B
+    cnts(enq_idx) := 0.U
+    vaddrs(enq_idx) := io.vaddr
+    pa_vs(enq_idx) := false.B
+    tlb_sent(enq_idx) := false.B
+  }
+
+  // tlb req
+  val s0_tlb_fire_vec = VecInit((0 until SIZE).map{case i => tlb_req_arb.io.in(i).fire})
+  for(i <- 0 until SIZE) {
+    tlb_req_arb.io.in(i).valid := valids(i) && !pa_vs(i) && !tlb_sent(i)
+    tlb_req_arb.io.in(i).bits := 0.U.asTypeOf(new TlbReq)
+    tlb_req_arb.io.in(i).bits.vaddr := vaddrs(i)
+    tlb_req_arb.io.in(i).bits.cmd := TlbCmd.write
+    tlb_req_arb.io.in(i).bits.size := 3.U
+    tlb_req_arb.io.in(i).bits.kill := false.B
+    tlb_req_arb.io.in(i).bits.no_translate := false.B
+
+    when(tlb_req_arb.io.in(i).fire) {
+      tlb_sent(i) := true.B
+    }
+  }
+  assert(PopCount(s0_tlb_fire_vec) <= 1.U, "s0_tlb_fire_vec should be one-hot or empty")
+
+  val s1_tlb_req_valid = Reg(Bool())
+  val s1_tlb_req_bits = RegEnable(tlb_req_arb.io.out.bits, tlb_req_arb.io.out.fire)
+  val s1_tlb_req_index = RegEnable(OHToUInt(s0_tlb_fire_vec.asUInt), tlb_req_arb.io.out.fire)
+  when(io.aspPfIO.tlb_req.req.fire) {
+    s1_tlb_req_valid := false.B
+  }
+  when(tlb_req_arb.io.out.fire) {
+    s1_tlb_req_valid := true.B
+  }
+  io.aspPfIO.tlb_req.req.valid := s1_tlb_req_valid
+  io.aspPfIO.tlb_req.req.bits := s1_tlb_req_bits
+  io.aspPfIO.tlb_req.req_kill := false.B
+  tlb_req_arb.io.out.ready := !s1_tlb_req_valid || io.aspPfIO.tlb_req.req.ready
+
+  // tlb resp
+  val s2_tlb_resp = io.aspPfIO.tlb_req.resp
+  val s2_tlb_update_index = RegEnable(s1_tlb_req_index, io.aspPfIO.tlb_req.req.fire)
+  when(s2_tlb_resp.valid) {
+    pa_vs(s2_tlb_update_index) := !s2_tlb_resp.bits.miss
+    tlb_sent(s2_tlb_update_index) := false.B
+
+    when(!s2_tlb_resp.bits.miss) {
+      paddrs(s2_tlb_update_index) := s2_tlb_resp.bits.paddr.head
+      when(s2_tlb_resp.bits.excp.head.pf.st || s2_tlb_resp.bits.excp.head.af.st) {
+        valids(s2_tlb_update_index) := false.B
+      }
+    }
+  }
+  s2_tlb_resp.ready := true.B
+
+  // next prefetch address
+  val paddrs_next = Wire(Vec(SIZE, chiselTypeOf(paddrs(0))))
+  paddrs_next := paddrs.map(_ + Cat(1.U(1.W), 0.U(BLOCKOFFSET.W)))
+
+  // pf to l2
+  io.aspPfIO.l2_pf_addr.valid := l2_pf_req_arb.io.out.valid
+  io.aspPfIO.l2_pf_addr.bits := l2_pf_req_arb.io.out.bits
+
+  l2_pf_req_arb.io.out.ready := true.B
+
+  for(i <- 0 until SIZE) {
+    l2_pf_req_arb.io.in(i).valid := valids(i) && pa_vs(i)
+    l2_pf_req_arb.io.in(i).bits.addr := paddrs(i)
+    l2_pf_req_arb.io.in(i).bits.source := MemReqSource.Prefetch2L2Stream.id.U
+  }
+
+  when(l2_pf_req_arb.io.out.fire) {
+    val idx = l2_pf_req_arb.io.chosen
+    val cross_page = !same_granularity_addr(paddrs_next(idx), paddrs(idx), granularity)
+    when(cross_page) {
+      valids(idx) := false.B
+      locks(idx) := true.B
+      cnts(idx) := LOCK_CYCLE.U
+    }
+    paddrs(idx) := paddrs_next(idx)
+  }
+
+  XSPerfAccumulate("burst_generator_alloc_success", io.alloc && !full && !enq_filter)
+  XSPerfAccumulate("burst_generator_alloc_fail", io.alloc && full && !enq_filter)
+  XSPerfAccumulate("burst_generator_full", full)
+
+  XSPerfAccumulate("burst_valid_num", PopCount(valids))
+  XSPerfAccumulate("prefetch_req_fire_by_generator", io.aspPfIO.l2_pf_addr.valid)
+}
+
+// an Accurate Store prefetcher
+class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+  val io = IO(new DCacheBundle {
+    val sbuffer = Vec(EnsbufferWidth, Flipped(ValidIO(new DCacheWordReqWithVaddrAndPfFlag)))
+    val sqEmpty = Input(Bool())
+    val enable = Input(Bool())
+    val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO
+  })
+
+  private val granularity = ASP_GRANULARITY_BITS
+
+  // sequential store detection:
+  // store D, (A); store D, (A + K), store D, (A + 2K) ...
+  val DATAHASHBITS = 16
+  val SEQTHRESHOLD = 32
+  val seqStoreDetected = WireInit(false.B)
+  val prevCycleVaddr = RegInit(0.U(VAddrBits.W))
+  val prevCycleDataHash = RegInit(0.U(DATAHASHBITS.W))
+  val seqKStride = RegInit(0.U(6.W))
+  val seqPatternVec = WireInit(VecInit(List.fill(EnsbufferWidth)(false.B)))
+  val seqPatternCnt = RegInit(0.U((log2Up(SEQTHRESHOLD) + 1).W))
+  val sbufferFire = Cat(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire))).orR
+  val sbufferFireCnt = PopCount(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire)))
+  val validKStride = (seqKStride === 1.U || seqKStride === 2.U || seqKStride === 4.U || seqKStride === 8.U)
+
+  for (i <- 0 until EnsbufferWidth) {
+    when(io.sbuffer(i).fire) {
+      val thisCycleVaddr = io.sbuffer(i).bits.vaddr
+      val thisCycleDataHash = io.sbuffer(i).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+      prevCycleVaddr := thisCycleVaddr
+      prevCycleDataHash := thisCycleDataHash
+
+      if(i == 0) {
+        seqKStride := thisCycleVaddr - prevCycleVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - prevCycleVaddr) === seqKStride) &&
+                            (prevCycleDataHash === thisCycleDataHash)
+      }else {
+        val lastLoopVaddr = io.sbuffer(i - 1).bits.vaddr
+        val lastLoopDataHash = io.sbuffer(i - 1).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+        seqKStride := thisCycleVaddr - lastLoopVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - lastLoopVaddr) === seqKStride) &&
+                            (lastLoopDataHash === thisCycleDataHash)
+      }
+    }.otherwise {
+      seqPatternVec(i) := true.B
+    }
+  }
+
+  when(sbufferFire) {
+    when(Cat(seqPatternVec).andR) {
+      seqPatternCnt := Mux(seqPatternCnt >= SEQTHRESHOLD.U, seqPatternCnt, seqPatternCnt + sbufferFireCnt)
+    }.otherwise {
+      seqPatternCnt := 0.U
+    }
+  }
+  when(seqPatternCnt >= SEQTHRESHOLD.U && validKStride) {
+    seqStoreDetected := true.B
+  }.otherwise {
+    seqStoreDetected := false.B
+  }
+  when(io.sqEmpty) {
+    seqStoreDetected := false.B
+  }
+  io.seqStoreDetected := seqStoreDetected
+
+  // generator
+  val generator = Module(new ASPBurstGenerator)
+
+  generator.io.alloc := false.B
+  generator.io.vaddr := 0.U
+  generator.io.aspPfIO <> io.aspPfIO
+
+  // prefetch Depth for SW
+  val depthSW = Wire(UInt(10.W))
+  depthSW := Constantin.createRecord("ASP_DEPTH_SW" + p(XSCoreParamsKey).HartId.toString, initValue = 16)
+
+  // The larger the size of the store instruction, the greater the bandwidth for sq to write to the sbuffer,
+  // causing the sbuffer to fill up faster, so we need a larger distance.
+  val depth = LookupTreeDefault(seqKStride, depthSW, List(
+    1.U -> (depthSW >> 2), // memset using sb
+    2.U -> (depthSW >> 1), // memset using sh
+    4.U -> depthSW,        // memset using sw
+    8.U -> (depthSW << 1)  // memset using sd
+  ))
+
+  val prefetchVaddr = (0 until EnsbufferWidth).map(i => get_block_addr(io.sbuffer(i).bits.vaddr) + Cat(depth, 0.U(log2Up(dcacheParameters.blockBytes).W)))
+  for (i <- 0 until EnsbufferWidth) {
+    when (io.enable) {
+      if (i == 0) {
+        when(io.sbuffer(i).fire) {
+          generator.io.alloc := seqStoreDetected
+          generator.io.vaddr := prefetchVaddr(0)
+        }
+      } else {
+        when(io.sbuffer(i).fire) {
+          generator.io.alloc := seqStoreDetected
+          when (!same_granularity_addr(prefetchVaddr(i), prefetchVaddr(i - 1), granularity)) {
+            generator.io.vaddr := prefetchVaddr(i)
+          }
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
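On the prefetch depth chosen near the end of the patch above: with the default
ASP_DEPTH_SW of 16 cache lines, the LookupTreeDefault scales the distance with
the stride K, because wider stores fill the sbuffer faster. A worked example in
plain Scala (illustrative only; aspDepth is a made-up name):

// prefetch distance in cache lines ahead of the current store, per stride K
def aspDepth(strideBytes: Int, depthSW: Int = 16): Int = strideBytes match {
  case 1 => depthSW / 4 // memset using sb ->  4 lines ahead
  case 2 => depthSW / 2 // memset using sh ->  8 lines ahead
  case 4 => depthSW     // memset using sw -> 16 lines ahead
  case 8 => depthSW * 2 // memset using sd -> 32 lines ahead
  case _ => depthSW     // mirrors the LookupTreeDefault fallback
}

The prefetch vaddr is then the block-aligned store vaddr plus depth lines:
Cat(depth, 0.U(log2Up(blockBytes).W)) equals depth * 64 for 64-byte blocks.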
From eff67f315fe69b734195209ce5e73a275f250d2e Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Mon, 23 Sep 2024 17:29:59 +0800
Subject: [PATCH 05/12] bump(cpl2): add store prefetch needT

---
 coupledL2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coupledL2 b/coupledL2
index 233fb3face..4d3e56237e 160000
--- a/coupledL2
+++ b/coupledL2
@@ -1 +1 @@
-Subproject commit 233fb3face385777ae5444e021bf881f8058b51b
+Subproject commit 4d3e56237e35b1a287dfb4f33679dc996c663877

From 746cdae1b4ca62ab722bb1c1b0362d3626fe814e Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 24 Sep 2024 10:10:34 +0800
Subject: [PATCH 06/12] fix(spb): fix uninitialized Reg

---
 src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
index d293ef1fa8..c62dd4f9e4 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
@@ -370,7 +370,7 @@ class ASPBurstGenerator(implicit p: Parameters) extends DCacheModule with HasSto
   }
   assert(PopCount(s0_tlb_fire_vec) <= 1.U, "s0_tlb_fire_vec should be one-hot or empty")

-  val s1_tlb_req_valid = Reg(Bool())
+  val s1_tlb_req_valid = RegInit(false.B)
   val s1_tlb_req_bits = RegEnable(tlb_req_arb.io.out.bits, tlb_req_arb.io.out.fire)
   val s1_tlb_req_index = RegEnable(OHToUInt(s0_tlb_fire_vec.asUInt), tlb_req_arb.io.out.fire)
   when(io.aspPfIO.tlb_req.req.fire) {

From 0b5982cd8ca46106d4e34c9e5f365b1b608ca8aa Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Thu, 26 Sep 2024 17:59:32 +0800
Subject: [PATCH 07/12] fix(storepf): only trigger asp in Memset

---
 src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala     | 1 +
 src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala     | 2 ++
 .../scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala | 5 +++--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
index 827418b16d..fb0ee2188b 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
@@ -182,6 +182,7 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
   storeQueue.io.sqCancelCnt      <> io.sqCancelCnt
   storeQueue.io.sqDeq            <> io.sqDeq
   storeQueue.io.sqEmpty          <> io.sqEmpty
+  storeQueue.io.lqEmpty          <> loadQueue.io.lqEmpty
   storeQueue.io.sqFull           <> io.sqFull
   storeQueue.io.forward          <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
   storeQueue.io.force_write      <> io.force_write
diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
index a0eb796cb3..ae82e09796 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
@@ -192,6 +192,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     val exceptionAddr = new ExceptionAddrIO
     val flushSbuffer = new SbufferFlushBundle
     val sqEmpty = Output(Bool())
+    val lqEmpty = Input(Bool())
     val stAddrReadySqPtr = Output(new SqPtr)
     val stAddrReadyVec = Output(Vec(StoreQueueSize, Bool()))
     val stDataReadySqPtr = Output(new SqPtr)
@@ -1036,6 +1037,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     s.bits := io.sbuffer(idx).bits
   }}
   asp.io.sqEmpty := io.sqEmpty
+  asp.io.lqEmpty := io.lqEmpty
   asp.io.enable := EnableStorePrefetchASP.B
   io.seqStoreDetected := asp.io.seqStoreDetected
   io.aspPfIO <> asp.io.aspPfIO
diff --git a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
index c62dd4f9e4..a79e9c0af6 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
@@ -440,6 +440,7 @@ class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelp
   val io = IO(new DCacheBundle {
     val sbuffer = Vec(EnsbufferWidth, Flipped(ValidIO(new DCacheWordReqWithVaddrAndPfFlag)))
     val sqEmpty = Input(Bool())
+    val lqEmpty = Input(Bool())
     val enable = Input(Bool())
     val seqStoreDetected = Output(Bool())
     val aspPfIO = new AspPfIO
@@ -526,12 +527,12 @@ class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelp
     when (io.enable) {
       if (i == 0) {
         when(io.sbuffer(i).fire) {
-          generator.io.alloc := seqStoreDetected
+          generator.io.alloc := seqStoreDetected && io.lqEmpty
           generator.io.vaddr := prefetchVaddr(0)
         }
       } else {
         when(io.sbuffer(i).fire) {
-          generator.io.alloc := seqStoreDetected
+          generator.io.alloc := seqStoreDetected && io.lqEmpty
           when (!same_granularity_addr(prefetchVaddr(i), prefetchVaddr(i - 1), granularity)) {
             generator.io.vaddr := prefetchVaddr(i)
           }

From e9e9668069b251194e4f2d53b5f1c0f03984c823 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Thu, 26 Sep 2024 18:03:15 +0800
Subject: [PATCH 08/12] bump(cpl2): bump to latest

---
 coupledL2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coupledL2 b/coupledL2
index 4d3e56237e..4652b53579 160000
--- a/coupledL2
+++ b/coupledL2
@@ -1 +1 @@
-Subproject commit 4d3e56237e35b1a287dfb4f33679dc996c663877
+Subproject commit 4652b535797478e2d3eb7c1b743a847b4e9fd4c9

From 30439139ba3b22a788fa5fcc8e8c567cf8cfcef6 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 8 Oct 2024 11:26:59 +0800
Subject: [PATCH 09/12] spf: refactor code

---
 .../mem/sbuffer/StorePrefetchBursts.scala     | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
index a79e9c0af6..9caddf7cc7 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
@@ -463,41 +463,43 @@ class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelp
   val validKStride = (seqKStride === 1.U || seqKStride === 2.U || seqKStride === 4.U || seqKStride === 8.U)

   for (i <- 0 until EnsbufferWidth) {
-    when(io.sbuffer(i).fire) {
+    when (io.sbuffer(i).fire) {
       val thisCycleVaddr = io.sbuffer(i).bits.vaddr
       val thisCycleDataHash = io.sbuffer(i).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
       prevCycleVaddr := thisCycleVaddr
       prevCycleDataHash := thisCycleDataHash

-      if(i == 0) {
+      if (i == 0) {
         seqKStride := thisCycleVaddr - prevCycleVaddr
         seqPatternVec(i) := ((thisCycleVaddr - prevCycleVaddr) === seqKStride) &&
-                            (prevCycleDataHash === thisCycleDataHash)
-      }else {
+                            (prevCycleDataHash === thisCycleDataHash) &&
+                            (PopCount(io.sbuffer(i).bits.mask) === seqKStride)
+      } else {
         val lastLoopVaddr = io.sbuffer(i - 1).bits.vaddr
         val lastLoopDataHash = io.sbuffer(i - 1).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
         seqKStride := thisCycleVaddr - lastLoopVaddr
         seqPatternVec(i) := ((thisCycleVaddr - lastLoopVaddr) === seqKStride) &&
-                            (lastLoopDataHash === thisCycleDataHash)
+                            (lastLoopDataHash === thisCycleDataHash) &&
+                            (PopCount(io.sbuffer(i).bits.mask) === seqKStride)
       }
-    }.otherwise {
+    } .otherwise {
       seqPatternVec(i) := true.B
     }
   }

-  when(sbufferFire) {
-    when(Cat(seqPatternVec).andR) {
+  when (sbufferFire) {
+    when (Cat(seqPatternVec).andR) {
       seqPatternCnt := Mux(seqPatternCnt >= SEQTHRESHOLD.U, seqPatternCnt, seqPatternCnt + sbufferFireCnt)
-    }.otherwise {
+    } .otherwise {
       seqPatternCnt := 0.U
     }
   }
-  when(seqPatternCnt >= SEQTHRESHOLD.U && validKStride) {
+  when (seqPatternCnt >= SEQTHRESHOLD.U && validKStride) {
     seqStoreDetected := true.B
-  }.otherwise {
+  } .otherwise {
     seqStoreDetected := false.B
   }
-  when(io.sqEmpty) {
+  when (io.sqEmpty) {
     seqStoreDetected := false.B
   }
   io.seqStoreDetected := seqStoreDetected
@@ -540,4 +542,6 @@ class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelp
       }
     }
   }
+
+  XSPerfAccumulate("seqStoreDetected", seqStoreDetected)
 }
\ No newline at end of file

From 356a575c6b7e87b7e80a18dffbc998bb1de9b0df Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 8 Oct 2024 11:33:12 +0800
Subject: [PATCH 10/12] bump cpl2

---
 coupledL2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coupledL2 b/coupledL2
index 4652b53579..3468725821 160000
--- a/coupledL2
+++ b/coupledL2
@@ -1 +1 @@
-Subproject commit 4652b535797478e2d3eb7c1b743a847b4e9fd4c9
+Subproject commit 346872582105980e05be96bbf2ad002f78336389

From 14cfe7baeaa5a8361d5d51d84d750085fc2af32b Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 8 Oct 2024 16:01:26 +0800
Subject: [PATCH 11/12] fix(spf): consider `vecValid` when training spf

---
 src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
index 12102d7365..414394044e 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
@@ -1040,7 +1040,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
   }

   asp.io.sbuffer.zipWithIndex.foreach {case (s, idx) => {
-    s.valid := io.sbuffer(idx).fire
+    s.valid := io.sbuffer(idx).fire && io.sbuffer(idx).bits.vecValid
     s.bits := io.sbuffer(idx).bits
   }}
   asp.io.sqEmpty := io.sqEmpty

From 01306eb64681a02aa7e6548f19efed56acfb34db Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Tue, 8 Oct 2024 18:56:25 +0800
Subject: [PATCH 12/12] fix(spf): fix seqKStride

---
 .../xiangshan/mem/sbuffer/StorePrefetchBursts.scala | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
index 9caddf7cc7..b4fb0f4d3f 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
@@ -475,8 +475,14 @@ class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelp
                             (prevCycleDataHash === thisCycleDataHash) &&
                             (PopCount(io.sbuffer(i).bits.mask) === seqKStride)
       } else {
-        val lastLoopVaddr = io.sbuffer(i - 1).bits.vaddr
-        val lastLoopDataHash = io.sbuffer(i - 1).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+        val lastLoopVaddr = WireInit(prevCycleVaddr)
+        val lastLoopDataHash = WireInit(prevCycleDataHash)
+        for ( j <- 0 until i ) {
+          when (io.sbuffer(j).fire) {
+            lastLoopVaddr := io.sbuffer(j).bits.vaddr
+            lastLoopDataHash := io.sbuffer(j).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+          }
+        }
         seqKStride := thisCycleVaddr - lastLoopVaddr
         seqPatternVec(i) := ((thisCycleVaddr - lastLoopVaddr) === seqKStride) &&
                             (lastLoopDataHash === thisCycleDataHash) &&