Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(memblock): opt memset pattern #3632

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
2 changes: 2 additions & 0 deletions src/main/scala/xiangshan/Parameters.scala
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ case class XSCoreParameters
EnableAtCommitMissTrigger: Boolean = true,
EnableStorePrefetchSMS: Boolean = false,
EnableStorePrefetchSPB: Boolean = false,
EnableStorePrefetchASP: Boolean = true,
HasCMO: Boolean = true,
MMUAsidLen: Int = 16, // max is 16, 0 is not supported now
MMUVmidLen: Int = 14,
Expand Down Expand Up @@ -809,6 +810,7 @@ trait HasXSParameter {
def EnableAtCommitMissTrigger = coreParams.EnableAtCommitMissTrigger
def EnableStorePrefetchSMS = coreParams.EnableStorePrefetchSMS
def EnableStorePrefetchSPB = coreParams.EnableStorePrefetchSPB
def EnableStorePrefetchASP = coreParams.EnableStorePrefetchASP
def HasCMO = coreParams.HasCMO && p(EnableCHI)
require(LoadPipelineWidth == backendParams.LdExuCnt, "LoadPipelineWidth must be equal exuParameters.LduCnt!")
require(StorePipelineWidth == backendParams.StaCnt, "StorePipelineWidth must be equal exuParameters.StuCnt!")
Expand Down
37 changes: 29 additions & 8 deletions src/main/scala/xiangshan/backend/MemBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -534,13 +534,31 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
// load/store prefetch to l2 cache
prefetcherOpt.foreach(sms_pf => {
l1PrefetcherOpt.foreach(l1_pf => {
val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
val l1_pf_to_l2 = DelayNWithValid(l1_pf.io.l2_req, 2)

outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid
outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.addr, sms_pf_to_l2.bits.addr)
outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.source, sms_pf_to_l2.bits.source)
val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
val asp_pf_to_l2 = DelayNWithValid(lsq.io.aspPfIO.l2_pf_addr, 2)

outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid || asp_pf_to_l2.valid
outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(
l1_pf_to_l2.valid,
l1_pf_to_l2.bits.addr,
Mux(
sms_pf_to_l2.valid,
sms_pf_to_l2.bits.addr,
asp_pf_to_l2.bits.addr
)
)
outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(
l1_pf_to_l2.valid,
l1_pf_to_l2.bits.source,
Mux(
sms_pf_to_l2.valid,
sms_pf_to_l2.bits.source,
asp_pf_to_l2.bits.source
)
)
outer.l2_pf_sender_opt.get.out.head._1.l2_pf_en := RegNextN(io.ooo_to_mem.csrCtrl.l2_pf_enable, 2, Some(true.B))
outer.l2_pf_sender_opt.get.out.head._1.needT := !l1_pf_to_l2.valid && !sms_pf_to_l2.valid && asp_pf_to_l2.valid

sms_pf.io.enable := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable, 2, Some(false.B))

Expand Down Expand Up @@ -589,14 +607,14 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
// dtlb
val dtlb_ld_tlb_ld = Module(new TLBNonBlock(LduCnt + HyuCnt + 1, 2, ldtlbParams))
val dtlb_st_tlb_st = Module(new TLBNonBlock(StaCnt, 1, sttlbParams))
val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(2, 2, pftlbParams))
val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(3, 2, pftlbParams))
val dtlb_ld = Seq(dtlb_ld_tlb_ld.io)
val dtlb_st = Seq(dtlb_st_tlb_st.io)
val dtlb_prefetch = Seq(dtlb_prefetch_tlb_prefetch.io)
/* tlb vec && constant variable */
val dtlb = dtlb_ld ++ dtlb_st ++ dtlb_prefetch
val (dtlb_ld_idx, dtlb_st_idx, dtlb_pf_idx) = (0, 1, 2)
val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 2) // (load + hyu + stream pf, store, sms+l2bop)
val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 3) // (load + hyu + stream pf, store, sms+l2bop+asp)
val DTlbSize = TlbSubSizeVec.sum
val TlbStartVec = TlbSubSizeVec.scanLeft(0)(_ + _).dropRight(1)
val TlbEndVec = TlbSubSizeVec.scanLeft(0)(_ + _).drop(1)
Expand Down Expand Up @@ -632,7 +650,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
replace_st.io.apply_sep(dtlb_st.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
}
if (pftlbParams.outReplace) {
val replace_pf = Module(new TlbReplace(2, pftlbParams))
val replace_pf = Module(new TlbReplace(3, pftlbParams))
replace_pf.io.apply_sep(dtlb_prefetch.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
}
}
Expand Down Expand Up @@ -1090,6 +1108,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
val StreamDTLBPortIndex = TlbStartVec(dtlb_ld_idx) + LduCnt + HyuCnt
val PrefetcherDTLBPortIndex = TlbStartVec(dtlb_pf_idx)
val L2toL1DLBPortIndex = TlbStartVec(dtlb_pf_idx) + 1
val ASPDTLBPortIndex = TlbStartVec(dtlb_pf_idx) + 2
prefetcherOpt match {
case Some(pf) => dtlb_reqs(PrefetcherDTLBPortIndex) <> pf.io.tlb_req
case None =>
Expand All @@ -1108,6 +1127,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
dtlb_reqs(L2toL1DLBPortIndex).resp.ready := true.B
io.l2_pmp_resp := pmp_check(L2toL1DLBPortIndex).resp

dtlb_reqs(ASPDTLBPortIndex) <> lsq.io.aspPfIO.tlb_req
// StoreUnit
for (i <- 0 until StdCnt) {
stdExeUnits(i).io.flush <> redirect
Expand Down Expand Up @@ -1375,6 +1395,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
io.mem_to_ooo.sqDeqPtr := lsq.io.sqDeqPtr
io.mem_to_ooo.lqDeqPtr := lsq.io.lqDeqPtr
lsq.io.tl_d_channel <> dcache.io.lsu.tl_d_channel
lsq.io.seqStoreDetected <> dcache.io.seqStoreDetected

// LSQ to store buffer
lsq.io.sbuffer <> sbuffer.io.in
Expand Down
2 changes: 2 additions & 0 deletions src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ class DCacheIO(implicit p: Parameters) extends DCacheBundle {
val pf_ctrl = Output(new PrefetchControlBundle)
val force_write = Input(Bool())
val sms_agt_evict_req = DecoupledIO(new AGTEvictReq)
val seqStoreDetected = Input(Bool())
val debugTopDown = new DCacheTopDownIO
val debugRolling = Flipped(new RobDebugRollingIO)
val l2_hint = Input(Valid(new L2ToL1Hint()))
Expand Down Expand Up @@ -981,6 +982,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
missQueue.io.hartId := io.hartId
missQueue.io.l2_pf_store_only := RegNext(io.l2_pf_store_only, false.B)
missQueue.io.debugTopDown <> io.debugTopDown
missQueue.io.seqStoreDetected <> io.seqStoreDetected
missQueue.io.l2_hint <> RegNext(io.l2_hint)
missQueue.io.mainpipe_info := mainPipe.io.mainpipe_info
mainPipe.io.refill_info := missQueue.io.refill_info
Expand Down
15 changes: 2 additions & 13 deletions src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC
val l2_pf_store_only = Input(Bool())

val memSetPattenDetected = Output(Bool())
val seqStoreDetected = Input(Bool())
val lqEmpty = Input(Bool())

val prefetch_info = new Bundle {
Expand Down Expand Up @@ -941,19 +942,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC

assert(PopCount(Seq(alloc && io.req.valid, merge && io.req.valid)) <= 1.U, "allocate and merge a mshr in same cycle!")

val source_except_load_cnt = RegInit(0.U(10.W))
when(VecInit(req_mshr_handled_vec).asUInt.orR || req_pipeline_reg_handled) {
when(io.req.bits.isFromLoad) {
source_except_load_cnt := 0.U
}.otherwise {
when(io.req.bits.isFromStore) {
source_except_load_cnt := source_except_load_cnt + 1.U
}
}
}
val Threshold = 8
val memSetPattenDetected = GatedValidRegNext((source_except_load_cnt >= Threshold.U) && io.lqEmpty)

val memSetPattenDetected = GatedValidRegNext(io.seqStoreDetected && io.lqEmpty)
io.memSetPattenDetected := memSetPattenDetected

val forwardInfo_vec = VecInit(entries.map(_.io.forwardInfo))
Expand Down
27 changes: 18 additions & 9 deletions src/main/scala/xiangshan/cache/mmu/Repeater.scala
Original file line number Diff line number Diff line change
Expand Up @@ -219,14 +219,23 @@ class PTWFilterEntry(Width: Int, Size: Int, hasHint: Boolean = false)(implicit p
canenq(1) := !(Cat(v.drop(Size/2)).andR)
enqidx(1) := firstValidIndex(v.drop(Size/2), false.B) + (Size/2).U
} else if (Width == 3) {
require(Size == 16, s"load filter Size ($Size) should be 16")
canenq(0) := !(Cat(v.take(8)).andR)
enqidx(0) := firstValidIndex(v.take(8), false.B)
canenq(1) := !(Cat(v.drop(8).take(4)).andR)
enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
// four entries for prefetch
canenq(2) := !(Cat(v.drop(12)).andR)
enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
require(Size == 16 || Size == 8, s"load/prefetcher filter Size ($Size) should be 16/8")
if (Size == 16) {
canenq(0) := !(Cat(v.take(8)).andR)
enqidx(0) := firstValidIndex(v.take(8), false.B)
canenq(1) := !(Cat(v.drop(8).take(4)).andR)
enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
// four entries for prefetch
canenq(2) := !(Cat(v.drop(12)).andR)
enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
} else {
canenq(0) := !(Cat(v.take(4)).andR)
enqidx(0) := firstValidIndex(v.take(4), false.B)
canenq(1) := !(Cat(v.drop(4).take(2)).andR)
enqidx(1) := firstValidIndex(v.drop(4).take(2), false.B) + 4.U
canenq(2) := !(Cat(v.drop(6)).andR)
enqidx(2) := firstValidIndex(v.drop(6), false.B) + 6.U
}
} else if (Width == 4) {
require(Size == 16, s"load filter Size ($Size) should be 16")
for (i <- 0 until Width) {
Expand Down Expand Up @@ -359,7 +368,7 @@ class PTWNewFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameter
})

val prefetch_filter = VecInit(Seq.fill(1) {
val prefetch_entry = Module(new PTWFilterEntry(Width = 2, Size = prefetchfiltersize))
val prefetch_entry = Module(new PTWFilterEntry(Width = 3, Size = prefetchfiltersize))
prefetch_entry.io
})

Expand Down
25 changes: 15 additions & 10 deletions src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
val flushSbuffer = new SbufferFlushBundle
val force_write = Output(Bool())
val lqEmpty = Output(Bool())
val seqStoreDetected = Output(Bool())
val aspPfIO = new AspPfIO

// top-down
val debugTopDown = new LoadQueueTopDownIO
Expand Down Expand Up @@ -179,16 +181,19 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
storeQueue.io.vecmmioStout <> io.vecmmioStout
storeQueue.io.rob <> io.rob
storeQueue.io.exceptionAddr.isStore := DontCare
storeQueue.io.sqCancelCnt <> io.sqCancelCnt
storeQueue.io.sqDeq <> io.sqDeq
storeQueue.io.sqEmpty <> io.sqEmpty
storeQueue.io.sqFull <> io.sqFull
storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
storeQueue.io.force_write <> io.force_write
storeQueue.io.cmoOpReq <> io.cmoOpReq
storeQueue.io.cmoOpResp <> io.cmoOpResp
storeQueue.io.flushSbuffer <> io.flushSbuffer
storeQueue.io.maControl <> io.maControl
storeQueue.io.sqCancelCnt <> io.sqCancelCnt
storeQueue.io.sqDeq <> io.sqDeq
storeQueue.io.sqEmpty <> io.sqEmpty
storeQueue.io.lqEmpty <> loadQueue.io.lqEmpty
storeQueue.io.sqFull <> io.sqFull
storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
storeQueue.io.force_write <> io.force_write
storeQueue.io.cmoOpReq <> io.cmoOpReq
storeQueue.io.cmoOpResp <> io.cmoOpResp
storeQueue.io.flushSbuffer <> io.flushSbuffer
storeQueue.io.maControl <> io.maControl
storeQueue.io.seqStoreDetected <> io.seqStoreDetected
storeQueue.io.aspPfIO <> io.aspPfIO

/* <------- DANGEROUS: Don't change sequence here ! -------> */

Expand Down
23 changes: 23 additions & 0 deletions src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ import utility._
import utils._
import xiangshan._
import xiangshan.cache._
import xiangshan.cache.mmu.TlbRequestIO
import xiangshan.cache.{DCacheLineIO, DCacheWordIO, MemoryOpConstants}
import xiangshan.backend._
import xiangshan.backend.rob.{RobLsqIO, RobPtr}
import xiangshan.backend.Bundles.{DynInst, MemExuOutput}
import xiangshan.backend.decode.isa.bitfield.{Riscv32BitInst, XSInstBitFields}
import xiangshan.backend.fu.FuConfig._
import xiangshan.mem.prefetch.L2PrefetchReq
import xiangshan.backend.fu.FuType
import xiangshan.ExceptionNO._
import coupledL2.{CMOReq, CMOResp}
Expand All @@ -58,6 +60,11 @@ class SqEnqIO(implicit p: Parameters) extends MemBlockBundle {
val resp = Vec(LSQEnqWidth, Output(new SqPtr))
}

/**
 * IO bundle exposed by the ASP store prefetcher.
 *
 * It groups the two channels the prefetcher needs towards the rest of the memory block:
 * an address-translation request channel and a prefetch-request channel towards L2.
 */
class AspPfIO(implicit p: Parameters) extends MemBlockBundle {
// TLB request/response channel used by the prefetcher for address translation;
// the response is requested with two duplicates (nRespDups = 2).
val tlb_req = new TlbRequestIO(nRespDups = 2)
// Valid-qualified L2 prefetch request (no ready/back-pressure signal, since this is a ValidIO).
// NOTE(review): despite the `_addr` suffix the payload is a full L2PrefetchReq, not a bare address.
val l2_pf_addr = ValidIO(new L2PrefetchReq())
}

class DataBufferEntry (implicit p: Parameters) extends DCacheBundle {
val addr = UInt(PAddrBits.W)
val vaddr = UInt(VAddrBits.W)
Expand Down Expand Up @@ -187,6 +194,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
val exceptionAddr = new ExceptionAddrIO
val flushSbuffer = new SbufferFlushBundle
val sqEmpty = Output(Bool())
val lqEmpty = Input(Bool())
val stAddrReadySqPtr = Output(new SqPtr)
val stAddrReadyVec = Output(Vec(StoreQueueSize, Bool()))
val stDataReadySqPtr = Output(new SqPtr)
Expand All @@ -198,10 +206,15 @@ class StoreQueue(implicit p: Parameters) extends XSModule
val sqDeq = Output(UInt(log2Ceil(EnsbufferWidth + 1).W))
val force_write = Output(Bool())
val maControl = Flipped(new StoreMaBufToSqControlIO)
val seqStoreDetected = Output(Bool())
val aspPfIO = new AspPfIO
})

println("StoreQueue: size:" + StoreQueueSize)

// ASP prefetcher
val asp = Module(new ASP)

// data modules
val uop = Reg(Vec(StoreQueueSize, new DynInst))
// val data = Reg(Vec(StoreQueueSize, new LsqEntry))
Expand Down Expand Up @@ -1026,6 +1039,16 @@ class StoreQueue(implicit p: Parameters) extends XSModule
}
}

asp.io.sbuffer.zipWithIndex.foreach {case (s, idx) => {
s.valid := io.sbuffer(idx).fire && io.sbuffer(idx).bits.vecValid
s.bits := io.sbuffer(idx).bits
}}
asp.io.sqEmpty := io.sqEmpty
asp.io.lqEmpty := io.lqEmpty
asp.io.enable := EnableStorePrefetchASP.B
io.seqStoreDetected := asp.io.seqStoreDetected
io.aspPfIO <> asp.io.aspPfIO

// All uops of a vector instruction dequeue normally, but uops that come after an exception is raised do not write to the 'sbuffer'.
// Flags are used to record whether there are any exceptions when the queue is displayed.
// This is checked each time a write is made to the 'dataBuffer', to prevent subsequent uops of the same instruction from writing to the 'dataBuffer'.
Expand Down
Loading
Loading