diff --git a/coupledL2 b/coupledL2
index a17d090738..3468725821 160000
--- a/coupledL2
+++ b/coupledL2
@@ -1 +1 @@
-Subproject commit a17d090738e6aceb25b09b5dca3c0e824c46ae8f
+Subproject commit 346872582105980e05be96bbf2ad002f78336389
diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala
index ef4b403f1e..6a9b2cbde6 100644
--- a/src/main/scala/xiangshan/Parameters.scala
+++ b/src/main/scala/xiangshan/Parameters.scala
@@ -254,6 +254,7 @@ case class XSCoreParameters
   EnableAtCommitMissTrigger: Boolean = true,
   EnableStorePrefetchSMS: Boolean = false,
   EnableStorePrefetchSPB: Boolean = false,
+  EnableStorePrefetchASP: Boolean = true,
   HasCMO: Boolean = true,
   MMUAsidLen: Int = 16, // max is 16, 0 is not supported now
   MMUVmidLen: Int = 14,
@@ -809,6 +810,7 @@ trait HasXSParameter {
   def EnableAtCommitMissTrigger = coreParams.EnableAtCommitMissTrigger
   def EnableStorePrefetchSMS = coreParams.EnableStorePrefetchSMS
   def EnableStorePrefetchSPB = coreParams.EnableStorePrefetchSPB
+  def EnableStorePrefetchASP = coreParams.EnableStorePrefetchASP
   def HasCMO = coreParams.HasCMO && p(EnableCHI)
   require(LoadPipelineWidth == backendParams.LdExuCnt, "LoadPipelineWidth must be equal exuParameters.LduCnt!")
   require(StorePipelineWidth == backendParams.StaCnt, "StorePipelineWidth must be equal exuParameters.StuCnt!")
diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala
index af9e975594..5406312b66 100644
--- a/src/main/scala/xiangshan/backend/MemBlock.scala
+++ b/src/main/scala/xiangshan/backend/MemBlock.scala
@@ -534,13 +534,31 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   // load/store prefetch to l2 cache
   prefetcherOpt.foreach(sms_pf => {
     l1PrefetcherOpt.foreach(l1_pf => {
-      val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
       val l1_pf_to_l2 = DelayNWithValid(l1_pf.io.l2_req, 2)
-
-      outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid
-      outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.addr, sms_pf_to_l2.bits.addr)
-      outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.source, sms_pf_to_l2.bits.source)
+      val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
+      val asp_pf_to_l2 = DelayNWithValid(lsq.io.aspPfIO.l2_pf_addr, 2)
+
+      outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid || asp_pf_to_l2.valid
+      outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(
+        l1_pf_to_l2.valid,
+        l1_pf_to_l2.bits.addr,
+        Mux(
+          sms_pf_to_l2.valid,
+          sms_pf_to_l2.bits.addr,
+          asp_pf_to_l2.bits.addr
+        )
+      )
+      outer.l2_pf_sender_opt.get.out.head._1.pf_source := Mux(
+        l1_pf_to_l2.valid,
+        l1_pf_to_l2.bits.source,
+        Mux(
+          sms_pf_to_l2.valid,
+          sms_pf_to_l2.bits.source,
+          asp_pf_to_l2.bits.source
+        )
+      )
       outer.l2_pf_sender_opt.get.out.head._1.l2_pf_en := RegNextN(io.ooo_to_mem.csrCtrl.l2_pf_enable, 2, Some(true.B))
+      outer.l2_pf_sender_opt.get.out.head._1.needT := !l1_pf_to_l2.valid && !sms_pf_to_l2.valid && asp_pf_to_l2.valid

       sms_pf.io.enable := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable, 2, Some(false.B))
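Note on the hunk above: the nested Mux chain is a fixed-priority select among the three L2 prefetch sources (the L1 prefetcher first, then SMS, then ASP), and needT is asserted only when ASP is the sole requester, since a detected memset will overwrite whole cache lines and benefits from fetching them in a writable state. A minimal equivalent sketch using Chisel's PriorityMux, assuming the three DelayNWithValid results from the hunk are in scope (illustration only, not part of the patch):

```scala
import chisel3._
import chisel3.util._

// Fixed-priority select over the three prefetch sources; the final true.B
// entry acts as the default, which makes ASP the lowest-priority source.
val chosen = PriorityMux(Seq(
  l1_pf_to_l2.valid  -> l1_pf_to_l2.bits,
  sms_pf_to_l2.valid -> sms_pf_to_l2.bits,
  true.B             -> asp_pf_to_l2.bits
))
// Request exclusive (writable) permission only on ASP-only cycles.
val needT = !l1_pf_to_l2.valid && !sms_pf_to_l2.valid && asp_pf_to_l2.valid
```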
@@ -589,14 +607,14 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   // dtlb
   val dtlb_ld_tlb_ld = Module(new TLBNonBlock(LduCnt + HyuCnt + 1, 2, ldtlbParams))
   val dtlb_st_tlb_st = Module(new TLBNonBlock(StaCnt, 1, sttlbParams))
-  val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(2, 2, pftlbParams))
+  val dtlb_prefetch_tlb_prefetch = Module(new TLBNonBlock(3, 2, pftlbParams))
   val dtlb_ld = Seq(dtlb_ld_tlb_ld.io)
   val dtlb_st = Seq(dtlb_st_tlb_st.io)
   val dtlb_prefetch = Seq(dtlb_prefetch_tlb_prefetch.io)
   /* tlb vec && constant variable */
   val dtlb = dtlb_ld ++ dtlb_st ++ dtlb_prefetch
   val (dtlb_ld_idx, dtlb_st_idx, dtlb_pf_idx) = (0, 1, 2)
-  val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 2) // (load + hyu + stream pf, store, sms+l2bop)
+  val TlbSubSizeVec = Seq(LduCnt + HyuCnt + 1, StaCnt, 3) // (load + hyu + stream pf, store, sms+l2bop+asp)
   val DTlbSize = TlbSubSizeVec.sum
   val TlbStartVec = TlbSubSizeVec.scanLeft(0)(_ + _).dropRight(1)
   val TlbEndVec = TlbSubSizeVec.scanLeft(0)(_ + _).drop(1)
@@ -632,7 +650,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
     replace_st.io.apply_sep(dtlb_st.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
   }
   if (pftlbParams.outReplace) {
-    val replace_pf = Module(new TlbReplace(2, pftlbParams))
+    val replace_pf = Module(new TlbReplace(3, pftlbParams))
     replace_pf.io.apply_sep(dtlb_prefetch.map(_.replace), ptwio.resp.bits.data.s1.entry.tag)
   }
 }
@@ -1090,6 +1108,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   val StreamDTLBPortIndex = TlbStartVec(dtlb_ld_idx) + LduCnt + HyuCnt
   val PrefetcherDTLBPortIndex = TlbStartVec(dtlb_pf_idx)
   val L2toL1DLBPortIndex = TlbStartVec(dtlb_pf_idx) + 1
+  val ASPDTLBPortIndex = TlbStartVec(dtlb_pf_idx) + 2
   prefetcherOpt match {
     case Some(pf) => dtlb_reqs(PrefetcherDTLBPortIndex) <> pf.io.tlb_req
     case None =>
@@ -1108,6 +1127,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   dtlb_reqs(L2toL1DLBPortIndex).resp.ready := true.B
   io.l2_pmp_resp := pmp_check(L2toL1DLBPortIndex).resp

+  dtlb_reqs(ASPDTLBPortIndex) <> lsq.io.aspPfIO.tlb_req
   // StoreUnit
   for (i <- 0 until StdCnt) {
     stdExeUnits(i).io.flush <> redirect
@@ -1375,6 +1395,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
   io.mem_to_ooo.sqDeqPtr := lsq.io.sqDeqPtr
   io.mem_to_ooo.lqDeqPtr := lsq.io.lqDeqPtr
   lsq.io.tl_d_channel <> dcache.io.lsu.tl_d_channel
+  lsq.io.seqStoreDetected <> dcache.io.seqStoreDetected

   // LSQ to store buffer
   lsq.io.sbuffer <> sbuffer.io.in
diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
index 95ffdadcfa..16ded8326d 100644
--- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
+++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
@@ -781,6 +781,7 @@ class DCacheIO(implicit p: Parameters) extends DCacheBundle {
   val pf_ctrl = Output(new PrefetchControlBundle)
   val force_write = Input(Bool())
   val sms_agt_evict_req = DecoupledIO(new AGTEvictReq)
+  val seqStoreDetected = Input(Bool())
   val debugTopDown = new DCacheTopDownIO
   val debugRolling = Flipped(new RobDebugRollingIO)
   val l2_hint = Input(Valid(new L2ToL1Hint()))
@@ -981,6 +982,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
   missQueue.io.hartId := io.hartId
   missQueue.io.l2_pf_store_only := RegNext(io.l2_pf_store_only, false.B)
   missQueue.io.debugTopDown <> io.debugTopDown
+  missQueue.io.seqStoreDetected <> io.seqStoreDetected
   missQueue.io.l2_hint <> RegNext(io.l2_hint)
   missQueue.io.mainpipe_info := mainPipe.io.mainpipe_info
   mainPipe.io.refill_info := missQueue.io.refill_info
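The DTLB port bookkeeping in the MemBlock hunks above is ordinary Scala evaluated at elaboration time; growing the prefetch partition from 2 to 3 ports shifts nothing else. A worked example with illustrative widths LduCnt + HyuCnt + 1 = 5 and StaCnt = 2:

```scala
// Pure Scala, runnable as-is; the widths are assumptions for illustration.
val TlbSubSizeVec = Seq(5, 2, 3) // (load + hyu + stream pf, store, sms + l2bop + asp)
val TlbStartVec = TlbSubSizeVec.scanLeft(0)(_ + _).dropRight(1) // List(0, 5, 7)
val TlbEndVec   = TlbSubSizeVec.scanLeft(0)(_ + _).drop(1)      // List(5, 7, 10)
// With dtlb_pf_idx = 2: PrefetcherDTLBPortIndex = 7, L2toL1DLBPortIndex = 8,
// and the new ASPDTLBPortIndex = TlbStartVec(2) + 2 = 9.
```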
index f943d572a0..859330a41a 100644
--- a/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
+++ b/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala
@@ -869,6 +869,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC
     val l2_pf_store_only = Input(Bool())

     val memSetPattenDetected = Output(Bool())
+    val seqStoreDetected = Input(Bool())
     val lqEmpty = Input(Bool())

     val prefetch_info = new Bundle {
@@ -941,19 +942,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC
   assert(PopCount(Seq(alloc && io.req.valid, merge && io.req.valid)) <= 1.U, "allocate and merge a mshr in same cycle!")

-  val source_except_load_cnt = RegInit(0.U(10.W))
-  when(VecInit(req_mshr_handled_vec).asUInt.orR || req_pipeline_reg_handled) {
-    when(io.req.bits.isFromLoad) {
-      source_except_load_cnt := 0.U
-    }.otherwise {
-      when(io.req.bits.isFromStore) {
-        source_except_load_cnt := source_except_load_cnt + 1.U
-      }
-    }
-  }
-  val Threshold = 8
-  val memSetPattenDetected = GatedValidRegNext((source_except_load_cnt >= Threshold.U) && io.lqEmpty)
-
+  val memSetPattenDetected = GatedValidRegNext(io.seqStoreDetected && io.lqEmpty)
   io.memSetPattenDetected := memSetPattenDetected

   val forwardInfo_vec = VecInit(entries.map(_.io.forwardInfo))
diff --git a/src/main/scala/xiangshan/cache/mmu/Repeater.scala b/src/main/scala/xiangshan/cache/mmu/Repeater.scala
index 98d452f356..faef208b80 100644
--- a/src/main/scala/xiangshan/cache/mmu/Repeater.scala
+++ b/src/main/scala/xiangshan/cache/mmu/Repeater.scala
@@ -219,14 +219,23 @@
     canenq(1) := !(Cat(v.drop(Size/2)).andR)
     enqidx(1) := firstValidIndex(v.drop(Size/2), false.B) + (Size/2).U
   } else if (Width == 3) {
-    require(Size == 16, s"load filter Size ($Size) should be 16")
-    canenq(0) := !(Cat(v.take(8)).andR)
-    enqidx(0) := firstValidIndex(v.take(8), false.B)
-    canenq(1) := !(Cat(v.drop(8).take(4)).andR)
-    enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
-    // four entries for prefetch
-    canenq(2) := !(Cat(v.drop(12)).andR)
-    enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
+    require(Size == 16 || Size == 8, s"load/prefetcher filter Size ($Size) should be 16/8")
+    if (Size == 16) {
+      canenq(0) := !(Cat(v.take(8)).andR)
+      enqidx(0) := firstValidIndex(v.take(8), false.B)
+      canenq(1) := !(Cat(v.drop(8).take(4)).andR)
+      enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
+      // four entries for prefetch
+      canenq(2) := !(Cat(v.drop(12)).andR)
+      enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
+    } else {
+      canenq(0) := !(Cat(v.take(4)).andR)
+      enqidx(0) := firstValidIndex(v.take(4), false.B)
+      canenq(1) := !(Cat(v.drop(4).take(2)).andR)
+      enqidx(1) := firstValidIndex(v.drop(4).take(2), false.B) + 4.U
+      canenq(2) := !(Cat(v.drop(6)).andR)
+      enqidx(2) := firstValidIndex(v.drop(6), false.B) + 6.U
+    }
   } else if (Width == 4) {
     require(Size == 16, s"load filter Size ($Size) should be 16")
     for (i <- 0 until Width) {
@@ -359,7 +368,7 @@ class PTWNewFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameter
   })

   val prefetch_filter = VecInit(Seq.fill(1) {
-    val prefetch_entry = Module(new PTWFilterEntry(Width = 2, Size = prefetchfiltersize))
+    val prefetch_entry = Module(new PTWFilterEntry(Width = 3, Size = prefetchfiltersize))
     prefetch_entry.io
   })
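The Width == 3 branch above statically partitions the PTW filter entries per request port (8/4/4 for Size == 16, 4/2/2 for the new Size == 8 prefetcher configuration), so concurrent enqueues can never pick the same slot. A pure-Scala model of the slice-local allocation, assuming firstValidIndex(v, false.B) returns the index of the first entry whose valid bit is false:

```scala
// Hypothetical software model of the partitioned allocation, not the RTL.
def firstFreeIndex(v: Seq[Boolean]): Option[Int] =
  v.indexWhere(_ == false) match { case -1 => None; case i => Some(i) }

val valid = Seq(true, true, false, false, true, false, true, false) // Size == 8
val enq0 = firstFreeIndex(valid.take(4))                    // Some(2), entries 0-3
val enq1 = firstFreeIndex(valid.drop(4).take(2)).map(_ + 4) // Some(5), entries 4-5
val enq2 = firstFreeIndex(valid.drop(6)).map(_ + 6)         // Some(7), entries 6-7
```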
diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
index c4681e9ec4..8dbc1b6ec4 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
@@ -124,6 +124,8 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete
     val flushSbuffer = new SbufferFlushBundle
     val force_write = Output(Bool())
     val lqEmpty = Output(Bool())
+    val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO

     // top-down
     val debugTopDown = new LoadQueueTopDownIO
@@ -179,16 +181,19 @@
   storeQueue.io.vecmmioStout <> io.vecmmioStout
   storeQueue.io.rob <> io.rob
   storeQueue.io.exceptionAddr.isStore := DontCare
-  storeQueue.io.sqCancelCnt <> io.sqCancelCnt
-  storeQueue.io.sqDeq <> io.sqDeq
-  storeQueue.io.sqEmpty <> io.sqEmpty
-  storeQueue.io.sqFull <> io.sqFull
-  storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
-  storeQueue.io.force_write <> io.force_write
-  storeQueue.io.cmoOpReq <> io.cmoOpReq
-  storeQueue.io.cmoOpResp <> io.cmoOpResp
-  storeQueue.io.flushSbuffer <> io.flushSbuffer
-  storeQueue.io.maControl <> io.maControl
+  storeQueue.io.sqCancelCnt      <> io.sqCancelCnt
+  storeQueue.io.sqDeq            <> io.sqDeq
+  storeQueue.io.sqEmpty          <> io.sqEmpty
+  storeQueue.io.lqEmpty          <> loadQueue.io.lqEmpty
+  storeQueue.io.sqFull           <> io.sqFull
+  storeQueue.io.forward          <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
+  storeQueue.io.force_write      <> io.force_write
+  storeQueue.io.cmoOpReq         <> io.cmoOpReq
+  storeQueue.io.cmoOpResp        <> io.cmoOpResp
+  storeQueue.io.flushSbuffer     <> io.flushSbuffer
+  storeQueue.io.maControl        <> io.maControl
+  storeQueue.io.seqStoreDetected <> io.seqStoreDetected
+  storeQueue.io.aspPfIO          <> io.aspPfIO

   /* <------- DANGEROUS: Don't change sequence here ! -------> */
diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
index 9541c561bf..414394044e 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
@@ -26,12 +26,14 @@ import utility._
 import utils._
 import xiangshan._
 import xiangshan.cache._
+import xiangshan.cache.mmu.TlbRequestIO
 import xiangshan.cache.{DCacheLineIO, DCacheWordIO, MemoryOpConstants}
 import xiangshan.backend._
 import xiangshan.backend.rob.{RobLsqIO, RobPtr}
 import xiangshan.backend.Bundles.{DynInst, MemExuOutput}
 import xiangshan.backend.decode.isa.bitfield.{Riscv32BitInst, XSInstBitFields}
 import xiangshan.backend.fu.FuConfig._
+import xiangshan.mem.prefetch.L2PrefetchReq
 import xiangshan.backend.fu.FuType
 import xiangshan.ExceptionNO._
 import coupledL2.{CMOReq, CMOResp}
@@ -58,6 +60,11 @@ class SqEnqIO(implicit p: Parameters) extends MemBlockBundle {
   val resp = Vec(LSQEnqWidth, Output(new SqPtr))
 }

+class AspPfIO(implicit p: Parameters) extends MemBlockBundle {
+  val tlb_req = new TlbRequestIO(nRespDups = 2)
+  val l2_pf_addr = ValidIO(new L2PrefetchReq())
+}
+
 class DataBufferEntry (implicit p: Parameters) extends DCacheBundle {
   val addr = UInt(PAddrBits.W)
   val vaddr = UInt(VAddrBits.W)
@@ -187,6 +194,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     val exceptionAddr = new ExceptionAddrIO
     val flushSbuffer = new SbufferFlushBundle
     val sqEmpty = Output(Bool())
+    val lqEmpty = Input(Bool())
     val stAddrReadySqPtr = Output(new SqPtr)
     val stAddrReadyVec = Output(Vec(StoreQueueSize, Bool()))
     val stDataReadySqPtr = Output(new SqPtr)
@@ -198,10 +206,15 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     val sqDeq = Output(UInt(log2Ceil(EnsbufferWidth + 1).W))
     val force_write = Output(Bool())
     val maControl = Flipped(new StoreMaBufToSqControlIO)
+    val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO
   })

   println("StoreQueue: size:" + StoreQueueSize)

+  // ASP prefetcher
+  val asp = Module(new ASP)
+
   // data modules
   val uop = Reg(Vec(StoreQueueSize, new DynInst))
   // val data = Reg(Vec(StoreQueueSize, new LsqEntry))
@@ -1026,6 +1039,16 @@ class StoreQueue(implicit p: Parameters) extends XSModule
     }
   }

+  asp.io.sbuffer.zipWithIndex.foreach {case (s, idx) => {
+    s.valid := io.sbuffer(idx).fire && io.sbuffer(idx).bits.vecValid
+    s.bits := io.sbuffer(idx).bits
+  }}
+  asp.io.sqEmpty := io.sqEmpty
+  asp.io.lqEmpty := io.lqEmpty
+  asp.io.enable := EnableStorePrefetchASP.B
+  io.seqStoreDetected := asp.io.seqStoreDetected
+  io.aspPfIO <> asp.io.aspPfIO
+
   // All vector instruction uop normally dequeue, but the Uop after the exception is raised does not write to the 'sbuffer'.
   // Flags are used to record whether there are any exceptions when the queue is displayed.
   // This is determined each time a write is made to the 'databuffer', prevent subsequent uop of the same instruction from writing to the 'dataBuffer'.
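For readability, here is how the new AspPfIO bundle threads through the hierarchy; every connection below already appears in the hunks above or in the MemBlock changes earlier in this patch:

```scala
// StoreQueue: the ASP instance drives the bundle.
io.aspPfIO <> asp.io.aspPfIO
// LsqWrapper: plain pass-through up to the MemBlock.
storeQueue.io.aspPfIO <> io.aspPfIO
// MemBlock: the TLB side uses the third prefetch DTLB port, and the address
// side joins the L2 prefetch sender (after a 2-cycle delay, lowest priority).
dtlb_reqs(ASPDTLBPortIndex) <> lsq.io.aspPfIO.tlb_req
val asp_pf_to_l2 = DelayNWithValid(lsq.io.aspPfIO.l2_pf_addr, 2)
```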
diff --git a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
index 0d53f5ac45..05519b6515 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
@@ -61,6 +61,9 @@ trait HasSbufferConst extends HasXSParameter {
   val VWordsWidth: Int = log2Up(CacheLineVWords)
   val VWordWidth: Int = log2Up(VDataBytes)
   val VWordOffsetWidth: Int = PAddrBits - VWordWidth
+
+  val FullWriteMaxWaitCycles = CacheLineBytes / EnsbufferWidth
+  val FullWriteMaxWaitBits = log2Up(FullWriteMaxWaitCycles) + 1
 }

 class SbufferEntryState (implicit p: Parameters) extends SbufferBundle {
@@ -206,6 +209,9 @@ class Sbuffer(implicit p: Parameters)
     val force_write = Input(Bool())
   })

+  println("Sbuffer FullWriteMaxWaitBits: " + FullWriteMaxWaitBits)
+  println("Sbuffer FullWriteMaxWaitCycles: " + FullWriteMaxWaitCycles)
+
   val dataModule = Module(new SbufferData)
   dataModule.io.writeReq <> DontCare
   val prefetcher = Module(new StorePfWrapper())
@@ -220,6 +226,7 @@
   val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U.asTypeOf(new SbufferEntryState))))
   val cohCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(EvictCountBits.W))))
   val missqReplayCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(MissqReplayCountBits.W))))
+  val waitCntBeforeFull = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(FullWriteMaxWaitBits.W))))

   val sbuffer_out_s0_fire = Wire(Bool())
@@ -312,7 +319,7 @@
   val activeMask = VecInit(stateVec.map(s => s.isActive()))
   val validMask = VecInit(stateVec.map(s => s.isValid()))
-  val drainIdx = PriorityEncoder(activeMask)
+  val drainIdx = Wire(UInt(SbufferIndexWidth.W))

   val inflightMask = VecInit(stateVec.map(s => s.isInflight()))
@@ -385,8 +392,8 @@
   val do_uarch_drain = GatedValidRegNext(forward_need_uarch_drain) || GatedValidRegNext(GatedValidRegNext(merge_need_uarch_drain))
   XSPerfAccumulate("do_uarch_drain", do_uarch_drain)

-  io.in(0).ready := firstCanInsert
-  io.in(1).ready := secondCanInsert && io.in(0).ready
+  io.in(0).ready := firstCanInsert || mergeVec(0).orR
+  io.in(1).ready := (secondCanInsert || mergeVec(1).orR) && io.in(0).ready

   for (i <- 0 until EnsbufferWidth) {
     // train
@@ -437,6 +444,7 @@
       // missqReplayCount(insertIdx) := 0.U
       ptag(entryIdx) := reqptag
       vtag(entryIdx) := reqvtag // update vtag if a new sbuffer line is allocated
+      waitCntBeforeFull(entryIdx) := FullWriteMaxWaitCycles.U
     }
   })
 }
@@ -468,6 +476,8 @@
     })
   }

+  waitCntBeforeFull.foreach(x => x := Mux(x.orR, x - 1.U, x))
+
   for(((in, vwordOffset), i) <- io.in.zip(Seq(firstWord, secondWord)).zipWithIndex){
     writeReq(i).valid := in.fire && in.bits.vecValid
     writeReq(i).bits.vwordOffset := vwordOffset
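About waitCntBeforeFull: each newly allocated sbuffer line starts a countdown that bounds how long the line is given to become fully written; while the countdown is non-zero, the memset drain path (next hunk) considers the line a candidate only if its write mask is already full. Worked numbers, assuming 64-byte cache lines and EnsbufferWidth = 2 (both assumptions):

```scala
// Pure-Scala arithmetic mirroring the new HasSbufferConst fields.
def log2Up(x: Int): Int = math.ceil(math.log(x) / math.log(2)).toInt
val CacheLineBytes = 64
val EnsbufferWidth = 2
val FullWriteMaxWaitCycles = CacheLineBytes / EnsbufferWidth    // 32 cycles
val FullWriteMaxWaitBits   = log2Up(FullWriteMaxWaitCycles) + 1 // 6 bits
```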
@@ -606,11 +616,22 @@
   val sbuffer_out_s1_ready = Wire(Bool())

+  // ---------------------------------------------------------------------------
+  // Memset Case
+  // ---------------------------------------------------------------------------
+
+  val memSet_needDrain = io.memSetPattenDetected
+  val memSetActiveMask = VecInit(stateVec.zipWithIndex.map{case (s, idx) => {
+    s.isDcacheReqCandidate() && Mux(waitCntBeforeFull(idx).orR, mask(idx).asUInt.andR, true.B)
+  }})
+
+  drainIdx := Mux(memSet_needDrain, PriorityEncoder(memSetActiveMask), PriorityEncoder(activeMask))
+
   // ---------------------------------------------------------------------------
   // sbuffer_out_s0
   // ---------------------------------------------------------------------------

-  val need_drain = needDrain(sbuffer_state)
+  val need_drain = needDrain(sbuffer_state) || memSet_needDrain
   val need_replace = do_eviction || (sbuffer_state === x_replace)
   val sbuffer_out_s0_evictionIdx = Mux(missqReplayHasTimeOut,
     missqReplayTimeOutIdx,
     Mux(need_drain,
       drainIdx,
       Mux(cohHasTimeOut, cohTimeOutIdx, replaceIdx)
     )
   )

+  val sbuffer_out_s0_can_evict = Mux(
+    memSet_needDrain,
+    memSetActiveMask(sbuffer_out_s0_evictionIdx),
+    candidateVec(sbuffer_out_s0_evictionIdx)
+  )
+
   // If there is a inflight dcache req which has same ptag with sbuffer_out_s0_evictionIdx's ptag,
   // current eviction should be blocked.
   val sbuffer_out_s0_valid = missqReplayHasTimeOut ||
-    stateVec(sbuffer_out_s0_evictionIdx).isDcacheReqCandidate() &&
-    (need_drain || cohHasTimeOut || need_replace)
+    sbuffer_out_s0_can_evict && (need_drain || cohHasTimeOut || need_replace)
   assert(!(
-    stateVec(sbuffer_out_s0_evictionIdx).isDcacheReqCandidate() &&
-    !noSameBlockInflight(sbuffer_out_s0_evictionIdx)
+    sbuffer_out_s0_can_evict && !noSameBlockInflight(sbuffer_out_s0_evictionIdx)
   ))
   val sbuffer_out_s0_cango = sbuffer_out_s1_ready
   sbuffer_out_s0_fire := sbuffer_out_s0_valid && sbuffer_out_s0_cango
diff --git a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
index 6455a96fa9..b4fb0f4d3f 100644
--- a/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
+++ b/src/main/scala/xiangshan/mem/sbuffer/StorePrefetchBursts.scala
@@ -23,6 +23,8 @@ import xiangshan._
 import utils._
 import utility._
 import xiangshan.cache._
+import xiangshan.cache.mmu._
+import xiangshan.mem.prefetch.L2PrefetchReq

 trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCacheParameters {
   // common
@@ -34,24 +36,39 @@ trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCachePar
   val ONLY_ON_MEMSET = false
   val SATURATE_COUNTER_BITS = 7
   val BURST_ENGINE_SIZE = 2
+  val SPB_GRANULARITY_BYTES = 4096
+  val SPB_GRANULARITY_BITS = log2Up(SPB_GRANULARITY_BYTES)
   val SPB_N = 48

   // serializer parameters
   val SERIALIZER_SIZE = 12

+  // asp parameters
+  val LOCK_CYCLE = 2048
+  val LOCK_BITS = log2Up(LOCK_CYCLE) + 1
+  val ASP_GRANULARITY_BYTES = 1024 // 1KB
+  val ASP_GRANULARITY_BITS = log2Up(ASP_GRANULARITY_BYTES)
+
   def block_addr(x: UInt): UInt = {
     val offset = log2Up(dcacheParameters.blockBytes)
     x(x.getWidth - 1, offset)
   }

-  // filter logic (granularity: a page)
-  def same_page_addr(addr0: UInt, addr1: UInt): Bool = {
-    addr0(addr0.getWidth - 1, PAGEOFFSET) === addr1(addr1.getWidth - 1, PAGEOFFSET)
+  // filter logic (granularity specified in args)
+  def same_granularity_addr(addr0: UInt, addr1: UInt, granularity: Int): Bool = {
+    addr0(addr0.getWidth - 1, granularity) === addr1(addr1.getWidth - 1, granularity)
   }
-  def filter_by_page_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], incoming_vaddr: UInt) : Bool = {
+
+  def filter_by_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], incoming_vaddr: UInt, granularity: Int) : Bool = {
     val match_vec = (valid_vec zip data_vec).map{
-      case(v, e_vaddr) => v && same_page_addr(e_vaddr, incoming_vaddr)
+      case(v, e_vaddr) => v && same_granularity_addr(e_vaddr, incoming_vaddr, granularity)
+    }
+    VecInit(match_vec).asUInt.orR
+  }
+
+  def filter_by_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], lock_vec: Vec[Bool], incoming_vaddr: UInt, granularity: Int) : Bool = {
+    val match_vec = (valid_vec zip lock_vec zip data_vec).map{
+      case((v, l), e_vaddr) => (v || l) && same_granularity_addr(e_vaddr, incoming_vaddr, granularity)
     }
     VecInit(match_vec).asUInt.orR
   }
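The helpers are generalized from a fixed page granularity to a caller-chosen one: SPB keeps filtering at 4 KiB (SPB_GRANULARITY_BYTES) while ASP filters at 1 KiB (ASP_GRANULARITY_BYTES), and the new filter_by_addr overload also matches entries that are merely locked, i.e. recently completed. A quick pure-Scala check of the comparison, assuming ASP_GRANULARITY_BITS = 10:

```scala
// Addresses are equal "at granularity g" when everything above bit g matches.
def sameRegion(a0: Long, a1: Long, granularityBits: Int): Boolean =
  (a0 >> granularityBits) == (a1 >> granularityBits)

sameRegion(0x1000L, 0x13ffL, 10) // true:  both inside the same 1 KiB region
sameRegion(0x1000L, 0x1400L, 10) // false: the second address crosses the boundary
```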
@@ -79,7 +96,7 @@ trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCachePar
 // L1 Store prefetch component
 // an prefetch request generator used by spb to burst some prefetch request to L1 Dcache
-class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+class PrefetchBurstGenerator(granularity: Int, is_store: Boolean)(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
   val io = IO(new DCacheBundle {
     val alloc = Input(Bool())
     val vaddr = Input(UInt(VAddrBits.W))
@@ -99,12 +116,12 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
   val enq_valids = ~(valids.asUInt)
   val full = !(enq_valids.orR)
   val enq_idx = PriorityEncoder(enq_valids)
-  val enq_filter = filter_by_page_addr(valids, datas, io.vaddr)
+  val enq_filter = filter_by_addr(valids, datas, io.vaddr, granularity)

   when(io.alloc && !full && !enq_filter) {
     valids(enq_idx) := true.B
     datas(enq_idx) := io.vaddr
-    pagebits(enq_idx) := io.vaddr(PAGEOFFSET)
+    pagebits(enq_idx) := io.vaddr(granularity)
   }

   XSPerfAccumulate("burst_generator_alloc_success", io.alloc && !full && !enq_filter)
@@ -127,21 +144,21 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
     out_decouple(0).valid := deq_valid
     out_decouple(0).bits := DontCare
     out_decouple(0).bits.vaddr := data
-    out_decouple(1).valid := deq_valid && data_next(PAGEOFFSET) === pg_bit && out_decouple(0).fire
+    out_decouple(1).valid := deq_valid && data_next(granularity) === pg_bit && out_decouple(0).fire
     out_decouple(1).bits := DontCare
     out_decouple(1).bits.vaddr := data_next
     out_decouple.drop(2).foreach { out => out.valid := false.B; out.bits := DontCare }
     when(out_decouple(1).fire) {
       // fired 2 prefetch reqs
       data := data_next_next
-      when(data_next_next(PAGEOFFSET) =/= pg_bit) {
+      when(data_next_next(granularity) =/= pg_bit) {
         // cross page, invalid this entry
         v := false.B
       }
     }.elsewhen(out_decouple(0).fire) {
       // fired 1 prefetch req
       data := data_next
-      when(data_next(PAGEOFFSET) =/= pg_bit) {
+      when(data_next(granularity) =/= pg_bit) {
         // cross page, invalid this entry
         v := false.B
       }
@@ -164,12 +181,14 @@ class StorePrefetchBursts(implicit p: Parameters) extends DCacheModule with HasS
   })
   require(EnsbufferWidth == 2)

+  private val granularity = SPB_GRANULARITY_BITS
+
   // meta for SPB
   val N = SPB_N
   val last_st_block_addr = RegInit(0.U(VAddrBits.W))
   val saturate_counter = RegInit(0.S(SATURATE_COUNTER_BITS.W))
   val store_count = RegInit(0.U((log2Up(N) + 1).W))
-  val burst_engine = Module(new PrefetchBurstGenerator(is_store = true))
+  val burst_engine = Module(new PrefetchBurstGenerator(granularity, true))

   val sbuffer_fire = io.sbuffer_enq.valid
   val sbuffer_vaddr = io.sbuffer_enq.bits.vaddr
@@ -284,4 +303,251 @@ class StorePfWrapper()(implicit p: Parameters) extends DCacheModule with HasStor
   // fire a prefetch req
   io.prefetch_req <> spb.io.prefetch_req
+}
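The generator added below keeps a valid bit, a lock bit, and a countdown per entry: when an entry finishes its region (cross_page), it is invalidated but stays locked for LOCK_CYCLE cycles, and since the enq filter also matches locked entries, the same region cannot be re-allocated immediately after completion. A small software model of that assumed life cycle:

```scala
// Hypothetical model of one generator entry: Active -> Locked(n) -> Free.
sealed trait EntryState
case object Free extends EntryState
case object Active extends EntryState
case class Locked(cnt: Int) extends EntryState

def tick(s: EntryState): EntryState = s match {
  case Locked(0) => Free          // countdown elapsed: region may be re-allocated
  case Locked(n) => Locked(n - 1) // still shielding the just-finished region
  case other     => other         // Free/Active entries are unaffected by time
}
val afterCrossPage: EntryState = Locked(2048) // LOCK_CYCLE in this patch
```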
+
+// prefetch request generator used by ASP to burst prefetch requests to the L2 cache
+class ASPBurstGenerator(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+  val io = IO(new DCacheBundle {
+    val alloc = Input(Bool())
+    val vaddr = Input(UInt(VAddrBits.W))
+    val aspPfIO = new AspPfIO
+  })
+
+  private val granularity = ASP_GRANULARITY_BITS
+
+  val SIZE = BURST_ENGINE_SIZE
+
+  val valids = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val locks = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val cnts = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U(LOCK_BITS.W)}))
+  val vaddrs = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U.asTypeOf(io.vaddr)}))
+  val paddrs = RegInit(VecInit(List.tabulate(SIZE){_ => 0.U(PAddrBits.W)}))
+  val pa_vs = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+  val tlb_sent = RegInit(VecInit(List.tabulate(SIZE){_ => false.B}))
+
+  val tlb_req_arb = Module(new RRArbiterInit(new TlbReq, SIZE))
+  val l2_pf_req_arb = Module(new RRArbiterInit(new L2PrefetchReq, SIZE))
+
+  // enq
+  val enq_valids = ~(valids.asUInt)
+  val full = !(enq_valids.orR)
+  val enq_idx = PriorityEncoder(enq_valids)
+  val enq_filter = filter_by_addr(valids, vaddrs, locks, io.vaddr, granularity)
+
+  for (i <- 0 until SIZE) {
+    when (!valids(i) && locks(i) && cnts(i).orR) {
+      cnts(i) := cnts(i) - 1.U
+    }
+
+    when (!valids(i) && locks(i) && !cnts(i).orR) {
+      locks(i) := false.B
+    }
+  }
+
+  when(io.alloc && !full && !enq_filter) {
+    valids(enq_idx) := true.B
+    locks(enq_idx) := false.B
+    cnts(enq_idx) := 0.U
+    vaddrs(enq_idx) := io.vaddr
+    pa_vs(enq_idx) := false.B
+    tlb_sent(enq_idx) := false.B
+  }
+
+  // tlb req
+  val s0_tlb_fire_vec = VecInit((0 until SIZE).map{case i => tlb_req_arb.io.in(i).fire})
+  for(i <- 0 until SIZE) {
+    tlb_req_arb.io.in(i).valid := valids(i) && !pa_vs(i) && !tlb_sent(i)
+    tlb_req_arb.io.in(i).bits := 0.U.asTypeOf(new TlbReq)
+    tlb_req_arb.io.in(i).bits.vaddr := vaddrs(i)
+    tlb_req_arb.io.in(i).bits.cmd := TlbCmd.write
+    tlb_req_arb.io.in(i).bits.size := 3.U
+    tlb_req_arb.io.in(i).bits.kill := false.B
+    tlb_req_arb.io.in(i).bits.no_translate := false.B
+
+    when(tlb_req_arb.io.in(i).fire) {
+      tlb_sent(i) := true.B
+    }
+  }
+  assert(PopCount(s0_tlb_fire_vec) <= 1.U, "s0_tlb_fire_vec should be one-hot or empty")
+
+  val s1_tlb_req_valid = RegInit(false.B)
+  val s1_tlb_req_bits = RegEnable(tlb_req_arb.io.out.bits, tlb_req_arb.io.out.fire)
+  val s1_tlb_req_index = RegEnable(OHToUInt(s0_tlb_fire_vec.asUInt), tlb_req_arb.io.out.fire)
+  when(io.aspPfIO.tlb_req.req.fire) {
+    s1_tlb_req_valid := false.B
+  }
+  when(tlb_req_arb.io.out.fire) {
+    s1_tlb_req_valid := true.B
+  }
+  io.aspPfIO.tlb_req.req.valid := s1_tlb_req_valid
+  io.aspPfIO.tlb_req.req.bits := s1_tlb_req_bits
+  io.aspPfIO.tlb_req.req_kill := false.B
+  tlb_req_arb.io.out.ready := !s1_tlb_req_valid || io.aspPfIO.tlb_req.req.ready
+
+  // tlb resp
+  val s2_tlb_resp = io.aspPfIO.tlb_req.resp
+  val s2_tlb_update_index = RegEnable(s1_tlb_req_index, io.aspPfIO.tlb_req.req.fire)
+  when(s2_tlb_resp.valid) {
+    pa_vs(s2_tlb_update_index) := !s2_tlb_resp.bits.miss
+    tlb_sent(s2_tlb_update_index) := false.B
+
+    when(!s2_tlb_resp.bits.miss) {
+      paddrs(s2_tlb_update_index) := s2_tlb_resp.bits.paddr.head
+      when(s2_tlb_resp.bits.excp.head.pf.st || s2_tlb_resp.bits.excp.head.af.st) {
+        valids(s2_tlb_update_index) := false.B
+      }
+    }
+  }
+  s2_tlb_resp.ready := true.B
+
+  // next prefetch address
+  val paddrs_next = Wire(Vec(SIZE, chiselTypeOf(paddrs(0))))
+  paddrs_next := paddrs.map(_ + Cat(1.U(1.W), 0.U(BLOCKOFFSET.W)))
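In the code above, each entry walks its region one cache block at a time: Cat(1.U, 0.U(BLOCKOFFSET.W)) is simply the block size in bytes, so paddrs_next holds the physical address of the next block. Assuming 64-byte blocks (BLOCKOFFSET = 6):

```scala
val BLOCKOFFSET = 6          // log2 of an assumed 64-byte block
val step = 1L << BLOCKOFFSET // 0x40, the value of Cat(1.U(1.W), 0.U(6.W))
val paddr = 0x80001000L
val paddrNext = paddr + step // 0x80001040, the next block of the same region
```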
+
+  // pf to l2
+  io.aspPfIO.l2_pf_addr.valid := l2_pf_req_arb.io.out.valid
+  io.aspPfIO.l2_pf_addr.bits := l2_pf_req_arb.io.out.bits
+
+  l2_pf_req_arb.io.out.ready := true.B
+
+  for(i <- 0 until SIZE) {
+    l2_pf_req_arb.io.in(i).valid := valids(i) && pa_vs(i)
+    l2_pf_req_arb.io.in(i).bits.addr := paddrs(i)
+    l2_pf_req_arb.io.in(i).bits.source := MemReqSource.Prefetch2L2Stream.id.U
+  }
+
+  when(l2_pf_req_arb.io.out.fire) {
+    val idx = l2_pf_req_arb.io.chosen
+    val cross_page = !same_granularity_addr(paddrs_next(idx), paddrs(idx), granularity)
+    when(cross_page) {
+      valids(idx) := false.B
+      locks(idx) := true.B
+      cnts(idx) := LOCK_CYCLE.U
+    }
+    paddrs(idx) := paddrs_next(idx)
+  }
+
+  XSPerfAccumulate("burst_generator_alloc_success", io.alloc && !full && !enq_filter)
+  XSPerfAccumulate("burst_generator_alloc_fail", io.alloc && full && !enq_filter)
+  XSPerfAccumulate("burst_generator_full", full)
+
+  XSPerfAccumulate("burst_valid_num", PopCount(valids))
+  XSPerfAccumulate("prefetch_req_fire_by_generator", io.aspPfIO.l2_pf_addr.valid)
+}
+
+// an Accurate Store Prefetcher (ASP)
+class ASP(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
+  val io = IO(new DCacheBundle {
+    val sbuffer = Vec(EnsbufferWidth, Flipped(ValidIO(new DCacheWordReqWithVaddrAndPfFlag)))
+    val sqEmpty = Input(Bool())
+    val lqEmpty = Input(Bool())
+    val enable = Input(Bool())
+    val seqStoreDetected = Output(Bool())
+    val aspPfIO = new AspPfIO
+  })
+
+  private val granularity = ASP_GRANULARITY_BITS
+
+  // sequential store detection:
+  // store D, (A); store D, (A + K); store D, (A + 2K) ...
+  val DATAHASHBITS = 16
+  val SEQTHRESHOLD = 32
+  val seqStoreDetected = WireInit(false.B)
+  val prevCycleVaddr = RegInit(0.U(VAddrBits.W))
+  val prevCycleDataHash = RegInit(0.U(DATAHASHBITS.W))
+  val seqKStride = RegInit(0.U(6.W))
+  val seqPatternVec = WireInit(VecInit(List.fill(EnsbufferWidth)(false.B)))
+  val seqPatternCnt = RegInit(0.U((log2Up(SEQTHRESHOLD) + 1).W))
+  val sbufferFire = Cat(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire))).orR
+  val sbufferFireCnt = PopCount(VecInit((0 until EnsbufferWidth).map(i => io.sbuffer(i).fire)))
+  val validKStride = (seqKStride === 1.U || seqKStride === 2.U || seqKStride === 4.U || seqKStride === 8.U)
+
+  for (i <- 0 until EnsbufferWidth) {
+    when (io.sbuffer(i).fire) {
+      val thisCycleVaddr = io.sbuffer(i).bits.vaddr
+      val thisCycleDataHash = io.sbuffer(i).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+      prevCycleVaddr := thisCycleVaddr
+      prevCycleDataHash := thisCycleDataHash
+
+      if (i == 0) {
+        seqKStride := thisCycleVaddr - prevCycleVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - prevCycleVaddr) === seqKStride) &&
+                            (prevCycleDataHash === thisCycleDataHash) &&
+                            (PopCount(io.sbuffer(i).bits.mask) === seqKStride)
+      } else {
+        val lastLoopVaddr = WireInit(prevCycleVaddr)
+        val lastLoopDataHash = WireInit(prevCycleDataHash)
+        for ( j <- 0 until i ) {
+          when (io.sbuffer(j).fire) {
+            lastLoopVaddr := io.sbuffer(j).bits.vaddr
+            lastLoopDataHash := io.sbuffer(j).bits.data.asTypeOf(Vec(VLEN / DATAHASHBITS, UInt(DATAHASHBITS.W))).fold(0.U)(_ ^ _)
+          }
+        }
+        seqKStride := thisCycleVaddr - lastLoopVaddr
+        seqPatternVec(i) := ((thisCycleVaddr - lastLoopVaddr) === seqKStride) &&
+                            (lastLoopDataHash === thisCycleDataHash) &&
+                            (PopCount(io.sbuffer(i).bits.mask) === seqKStride)
+      }
+    } .otherwise {
+      seqPatternVec(i) := true.B
+    }
+  }
+
+  when (sbufferFire) {
+    when (Cat(seqPatternVec).andR) {
+      seqPatternCnt := Mux(seqPatternCnt >= SEQTHRESHOLD.U, seqPatternCnt, seqPatternCnt + sbufferFireCnt)
+    } .otherwise {
+      seqPatternCnt := 0.U
+    }
+  }
+  when (seqPatternCnt >= SEQTHRESHOLD.U && validKStride) {
+    seqStoreDetected := true.B
+  } .otherwise {
+    seqStoreDetected := false.B
+  }
+  when (io.sqEmpty) {
+    seqStoreDetected := false.B
+  }
+  io.seqStoreDetected := seqStoreDetected
+
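The detector above flags a sequential-store (memset-like) pattern when three conditions hold over a run of sbuffer writes: a constant stride K in {1, 2, 4, 8} (sb/sh/sw/sd), a repeating 16-bit XOR-folded data hash, and a write mask whose popcount equals K, i.e. each store writes exactly K bytes. A hypothetical software model of that predicate, not the RTL:

```scala
// Sketch of the detection predicate over an already-collected run of stores.
case class St(vaddr: Long, dataHash: Int, maskPopCount: Int)

def isSequentialRun(run: Seq[St], threshold: Int = 32): Boolean = {
  if (run.size < threshold) false
  else {
    val k = run(1).vaddr - run(0).vaddr
    Seq(1L, 2L, 4L, 8L).contains(k) && run.sliding(2).forall {
      case Seq(a, b) =>
        b.vaddr - a.vaddr == k && b.dataHash == a.dataHash && b.maskPopCount == k
      case _ => true
    }
  }
}

// A memset loop using sd: stride 8, identical data, 8-byte masks => detected.
val memset = (0 until 32).map(i => St(0x80000000L + 8L * i, dataHash = 0xBEEF, maskPopCount = 8))
isSequentialRun(memset) // true
```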
+  // generator
+  val generator = Module(new ASPBurstGenerator)
+
+  generator.io.alloc := false.B
+  generator.io.vaddr := 0.U
+  generator.io.aspPfIO <> io.aspPfIO
+
+  // prefetch depth, tunable from software via Constantin
+  val depthSW = Wire(UInt(10.W))
+  depthSW := Constantin.createRecord("ASP_DEPTH_SW" + p(XSCoreParamsKey).HartId.toString, initValue = 16)
+
+  // The larger the store's access size, the higher the bandwidth at which the store queue
+  // writes into the sbuffer; the sbuffer fills faster, so a larger prefetch distance is needed.
+  val depth = LookupTreeDefault(seqKStride, depthSW, List(
+    1.U -> (depthSW >> 2), // memset using sb
+    2.U -> (depthSW >> 1), // memset using sh
+    4.U -> depthSW,        // memset using sw
+    8.U -> (depthSW << 1)  // memset using sd
+  ))
+
+  val prefetchVaddr = (0 until EnsbufferWidth).map(i => get_block_addr(io.sbuffer(i).bits.vaddr) + Cat(depth, 0.U(log2Up(dcacheParameters.blockBytes).W)))
+  for (i <- 0 until EnsbufferWidth) {
+    when (io.enable) {
+      if (i == 0) {
+        when(io.sbuffer(i).fire) {
+          generator.io.alloc := seqStoreDetected && io.lqEmpty
+          generator.io.vaddr := prefetchVaddr(0)
+        }
+      } else {
+        when(io.sbuffer(i).fire) {
+          generator.io.alloc := seqStoreDetected && io.lqEmpty
+          when (!same_granularity_addr(prefetchVaddr(i), prefetchVaddr(i - 1), granularity)) {
+            generator.io.vaddr := prefetchVaddr(i)
+          }
+        }
+      }
+    }
+  }
+
+  XSPerfAccumulate("seqStoreDetected", seqStoreDetected)
 }
\ No newline at end of file
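A worked example of the depth table with the default ASP_DEPTH_SW of 16: wider stores fill the sbuffer faster, so the prefetch distance grows with the detected stride.

```scala
val depthSW = 16
val depthByStride = Map(
  1 -> (depthSW >> 2), // sb:  4 blocks ahead
  2 -> (depthSW >> 1), // sh:  8 blocks ahead
  4 ->  depthSW,       // sw: 16 blocks ahead
  8 -> (depthSW << 1)  // sd: 32 blocks ahead
)
// With 64-byte blocks, an sd memset around 0x80000000 prefetches the block
// 32 * 64 = 2048 bytes (two full 1 KiB ASP regions) ahead of the store stream:
val pfVaddr = 0x80000000L + depthByStride(8) * 64 // 0x80000800
```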