Revert "perf: use perfUtils in Utility (#179)"

This reverts commit f9dffb2.
OpenXiangShan · Dec 10, 2024 · fd4398f · fd4398f
1 parent 3fc7e7e
commit fd4398f
Show file tree

Hide file tree

Showing 12 changed files with 182 additions and 119 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -18,7 +18,7 @@ jobs:
   # This workflow contains a single job called "build"
   tl-test_L2:
     # The type of runner that the job will run on
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
@@ -35,7 +35,7 @@ jobs:
         uses: coursier/cache-action@v5
 
       - name: Verilator
-        run: sudo apt install verilator libsqlite3-dev
+        run: sudo apt install verilator
 
       - name: Setup Mill
         uses: jodersky/setup-mill@v0.2.3
@@ -60,7 +60,7 @@ jobs:
 
   tl-test_L2L3:
     # The type of runner that the job will run on
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
@@ -77,7 +77,7 @@ jobs:
         uses: coursier/cache-action@v5
 
       - name: Verilator
-        run: sudo apt install verilator libsqlite3-dev
+        run: sudo apt install verilator
 
       - name: Setup Mill
         uses: jodersky/setup-mill@v0.2.3

diff --git a/Utility b/Utility
diff --git a/rocket-chip b/rocket-chip
diff --git a/src/main/scala/huancun/DataStorage.scala b/src/main/scala/huancun/DataStorage.scala
@@ -22,7 +22,7 @@ package huancun
 import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
-import huancun.utils.SRAMWrapper
+import huancun.utils.{SRAMWrapper, XSPerfAccumulate}
 import utility._
 
 class DataStorage(implicit p: Parameters) extends HuanCunModule {
@@ -264,7 +264,7 @@ class DataStorage(implicit p: Parameters) extends HuanCunModule {
   val debug_stack_used = PopCount(bank_en.grouped(stackSize).toList.map(seq => Cat(seq).orR))
 
   for (i <- 1 to nrStacks) {
-    XSPerfAccumulate(s"DS_${i}_stacks_used", debug_stack_used === i.U)
+    XSPerfAccumulate(cacheParams, s"DS_${i}_stacks_used", debug_stack_used === i.U)
   }
 
 }

diff --git a/src/main/scala/huancun/HuanCun.scala b/src/main/scala/huancun/HuanCun.scala
@@ -28,7 +28,7 @@ import freechips.rocketchip.tilelink._
 import freechips.rocketchip.tilelink.TLMessages._
 import freechips.rocketchip.util.{BundleField, BundleFieldBase, UIntToOH1}
 import huancun.prefetch._
-import utils.ResetGen
+import utils.{ResetGen, XSPerfAccumulate}
 import utility.{Pipeline, FastArbiter}
 import huancun.noninclusive.MSHR
 

diff --git a/src/main/scala/huancun/MSHRAlloc.scala b/src/main/scala/huancun/MSHRAlloc.scala
@@ -23,7 +23,7 @@ import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
 import huancun.utils._
-import utility._
+import utility.{ParallelOR, ParallelPriorityMux}
 import freechips.rocketchip.tilelink._
 
 class MSHRSelector(implicit p: Parameters) extends HuanCunModule {
@@ -192,9 +192,9 @@ class MSHRAlloc(implicit p: Parameters) extends HuanCunModule {
       }
       val cntEnable =
         !io.status(i).valid && cnt =/= 0.U && cntStart && cnt < 5000.U // Ignore huge cnt during L3 dir reset
-      XSPerfHistogram("mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 0, 300, 10, right_strict = true)
-      XSPerfHistogram("mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 300, 1000, 50, right_strict = true)
-      XSPerfMax("mshr_latency", cnt, cntEnable)
+      XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 0, 300, 10, rStrict = true)
+      XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 300, 1000, 50, lStrict = true)
+      XSPerfMax(cacheParams, "mshr_latency", cnt, cntEnable)
     }
   }
 
@@ -203,13 +203,13 @@ class MSHRAlloc(implicit p: Parameters) extends HuanCunModule {
       (s.bits.set(block_granularity - 1, 0) === io.a_req.bits.set(block_granularity - 1, 0))
   ))
 
-  XSPerfAccumulate("nrWorkingABCmshr", PopCount(io.status.init.init.map(_.valid)))
-  XSPerfAccumulate("nrWorkingBmshr", io.status.take(mshrs+1).last.valid)
-  XSPerfAccumulate("nrWorkingCmshr", io.status.last.valid)
-  XSPerfAccumulate("conflictA", io.a_req.valid && conflict_a)
-  XSPerfAccumulate("conflictByPrefetch", io.a_req.valid && Cat(pretch_block_vec).orR)
-  XSPerfAccumulate("conflictB", io.b_req.valid && conflict_b)
-  XSPerfAccumulate("conflictC", io.c_req.valid && conflict_c)
+  XSPerfAccumulate(cacheParams, "nrWorkingABCmshr", PopCount(io.status.init.init.map(_.valid)))
+  XSPerfAccumulate(cacheParams, "nrWorkingBmshr", io.status.take(mshrs+1).last.valid)
+  XSPerfAccumulate(cacheParams, "nrWorkingCmshr", io.status.last.valid)
+  XSPerfAccumulate(cacheParams, "conflictA", io.a_req.valid && conflict_a)
+  XSPerfAccumulate(cacheParams, "conflictByPrefetch", io.a_req.valid && Cat(pretch_block_vec).orR)
+  XSPerfAccumulate(cacheParams, "conflictB", io.b_req.valid && conflict_b)
+  XSPerfAccumulate(cacheParams, "conflictC", io.c_req.valid && conflict_c)
   //val perfinfo = IO(new Bundle(){
   //  val perfEvents = Output(new PerfEventsBundle(numPCntHcMSHR))
   //})

diff --git a/src/main/scala/huancun/RequestBuffer.scala b/src/main/scala/huancun/RequestBuffer.scala
@@ -3,7 +3,8 @@ package huancun
 import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
-import utility.{FastArbiter, XSPerfAccumulate}
+import huancun.utils.XSPerfAccumulate
+import utility.FastArbiter
 
 class RequestBuffer(flow: Boolean = true, entries: Int = 16)(implicit p: Parameters) extends HuanCunModule {
 
@@ -91,18 +92,18 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 16)(implicit p: Paramet
     }
   }
 
-  XSPerfAccumulate("req_buffer_merge", dup && !full)
+  XSPerfAccumulate(cacheParams, "req_buffer_merge", dup && !full)
   if(flow){
-    XSPerfAccumulate("req_buffer_flow", no_ready_entry && io.in.fire)
+    XSPerfAccumulate(cacheParams, "req_buffer_flow", no_ready_entry && io.in.fire)
   }
-  XSPerfAccumulate("req_buffer_alloc", alloc)
-  XSPerfAccumulate("req_buffer_full", full)
+  XSPerfAccumulate(cacheParams, "req_buffer_alloc", alloc)
+  XSPerfAccumulate(cacheParams, "req_buffer_full", full)
   for(i <- 0 until entries){
     val update = PopCount(valids) === i.U
-    XSPerfAccumulate(s"req_buffer_util_$i", update)
+    XSPerfAccumulate(cacheParams, s"req_buffer_util_$i", update)
   }
-  XSPerfAccumulate("recv_prefetch", io.in.fire && io.in.bits.isPrefetch.getOrElse(false.B))
-  XSPerfAccumulate("recv_normal", io.in.fire && !io.in.bits.isPrefetch.getOrElse(false.B))
+  XSPerfAccumulate(cacheParams, "recv_prefetch", io.in.fire && io.in.bits.isPrefetch.getOrElse(false.B))
+  XSPerfAccumulate(cacheParams, "recv_normal", io.in.fire && !io.in.bits.isPrefetch.getOrElse(false.B))
   val perfinfo = IO(Output(Vec(numPCntHcReqb, (UInt(6.W)))))
   val perfEvents = Seq(
     ("req_buffer_merge          ", dup && !full                                             ),

diff --git a/src/main/scala/huancun/TopDownMonitor.scala b/src/main/scala/huancun/TopDownMonitor.scala
@@ -4,7 +4,8 @@ import org.chipsalliance.cde.config.Parameters
 import chisel3._
 import chisel3.util._
 import huancun.noninclusive.DirResult
-import utility.{MemReqSource, XSPerfAccumulate, XSPerfHistogram}
+import huancun.utils.{XSPerfAccumulate, XSPerfHistogram}
+import utility.MemReqSource
 
 class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
   val banks = 1 << bankBits
@@ -35,7 +36,7 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
     }
 
     addrMatch := Cat(addrMatchVec.flatten).orR
-    XSPerfAccumulate(s"${cacheParams.name}MissMatch_${hartId}", addrMatch)
+    XSPerfAccumulate(cacheParams, s"${cacheParams.name}MissMatch_${hartId}", addrMatch)
   }
 
   /* ====== PART TWO ======
@@ -55,16 +56,16 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
   // val missVecAll      = allMSHRMatchVec(s => s.fromA && s.is_miss)
 
   val totalMSHRs = banks * mshrsAll
-  XSPerfHistogram("parallel_misses_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
-  XSPerfHistogram("parallel_misses_Pref", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
-  XSPerfHistogram("parallel_misses_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, 32, 1)
+  XSPerfHistogram(cacheParams, "parallel_misses_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
+  XSPerfHistogram(cacheParams, "parallel_misses_Pref", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
+  XSPerfHistogram(cacheParams, "parallel_misses_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, 32, 1)
 
   /* ====== PART THREE ======
  * Distinguish req sources and count num & miss
  */
   // count releases
   val releaseCnt = allMSHRMatchVec(s => s.will_free && s.fromC)
-  XSPerfAccumulate(s"${cacheParams.name}C_ReleaseCnt_Total", PopCount(releaseCnt))
+  XSPerfAccumulate(cacheParams, s"${cacheParams.name}C_ReleaseCnt_Total", PopCount(releaseCnt))
 
   // we can follow the counting logic of Directory to count
   // add reqSource in replacerInfo, set in MSHRAlloc, passes in Directory and get the result in DirResult
@@ -80,7 +81,7 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
     val sourceMatchVecMiss = dirResultMatchVec(r => r.replacerInfo.reqSource === i.U && !r.self.hit)
 
     val sourceName = MemReqSource.apply(i).toString
-    XSPerfAccumulate(s"E2_${cacheParams.name}AReqSource_${sourceName}_Total", PopCount(sourceMatchVec))
-    XSPerfAccumulate(s"E2_${cacheParams.name}AReqSource_${sourceName}_Miss", PopCount(sourceMatchVecMiss))
+    XSPerfAccumulate(cacheParams, s"E2_${cacheParams.name}AReqSource_${sourceName}_Total", PopCount(sourceMatchVec))
+    XSPerfAccumulate(cacheParams, s"E2_${cacheParams.name}AReqSource_${sourceName}_Miss", PopCount(sourceMatchVecMiss))
   }
 }
diff --git a/src/main/scala/huancun/noninclusive/Directory.scala b/src/main/scala/huancun/noninclusive/Directory.scala
@@ -8,7 +8,7 @@ import huancun.MetaData._
 import huancun._
 import huancun.debug.{DirectoryLogger, TypeId}
 import huancun.utils._
-import utility.{GTimer, ParallelMax, ParallelPriorityMux, XSPerfAccumulate}
+import utility.{ParallelMax, ParallelPriorityMux}
 
 trait HasClientInfo { this: HasHuanCunParameters =>
   // assume all clients have same params
@@ -316,18 +316,18 @@ class Directory(implicit p: Parameters)
 
   assert(dirReadPorts == 1)
   val req_r = RegEnable(req.bits, req.fire)
-  XSPerfAccumulate("selfdir_A_req", req_r.replacerInfo.channel(0) && resp.valid)
-  XSPerfAccumulate("selfdir_A_hit", RegNext(req_r.replacerInfo.channel(0) && resp.valid) && resp.bits.self.hit)
-  XSPerfAccumulate("selfdir_B_req", req_r.replacerInfo.channel(1) && resp.valid)
-  XSPerfAccumulate("selfdir_B_hit", RegNext(req_r.replacerInfo.channel(1) && resp.valid) && resp.bits.self.hit)
-  XSPerfAccumulate("selfdir_C_req", req_r.replacerInfo.channel(2) && resp.valid)
-  XSPerfAccumulate("selfdir_C_hit", RegNext(req_r.replacerInfo.channel(2) && resp.valid) && resp.bits.self.hit)
+  XSPerfAccumulate(cacheParams, "selfdir_A_req", req_r.replacerInfo.channel(0) && resp.valid)
+  XSPerfAccumulate(cacheParams, "selfdir_A_hit", RegNext(req_r.replacerInfo.channel(0) && resp.valid) && resp.bits.self.hit)
+  XSPerfAccumulate(cacheParams, "selfdir_B_req", req_r.replacerInfo.channel(1) && resp.valid)
+  XSPerfAccumulate(cacheParams, "selfdir_B_hit", RegNext(req_r.replacerInfo.channel(1) && resp.valid) && resp.bits.self.hit)
+  XSPerfAccumulate(cacheParams, "selfdir_C_req", req_r.replacerInfo.channel(2) && resp.valid)
+  XSPerfAccumulate(cacheParams, "selfdir_C_hit", RegNext(req_r.replacerInfo.channel(2) && resp.valid) && resp.bits.self.hit)
 
-  XSPerfAccumulate("selfdir_dirty", RegNext(resp.valid) && resp.bits.self.dirty)
-  XSPerfAccumulate("selfdir_TIP", RegNext(resp.valid) && resp.bits.self.state === TIP)
-  XSPerfAccumulate("selfdir_BRANCH", RegNext(resp.valid) && resp.bits.self.state === BRANCH)
-  XSPerfAccumulate("selfdir_TRUNK", RegNext(resp.valid) && resp.bits.self.state === TRUNK)
-  XSPerfAccumulate("selfdir_INVALID", RegNext(resp.valid) && resp.bits.self.state === INVALID)
+  XSPerfAccumulate(cacheParams, "selfdir_dirty", RegNext(resp.valid) && resp.bits.self.dirty)
+  XSPerfAccumulate(cacheParams, "selfdir_TIP", RegNext(resp.valid) && resp.bits.self.state === TIP)
+  XSPerfAccumulate(cacheParams, "selfdir_BRANCH", RegNext(resp.valid) && resp.bits.self.state === BRANCH)
+  XSPerfAccumulate(cacheParams, "selfdir_TRUNK", RegNext(resp.valid) && resp.bits.self.state === TRUNK)
+  XSPerfAccumulate(cacheParams, "selfdir_INVALID", RegNext(resp.valid) && resp.bits.self.state === INVALID)
   //val perfinfo = IO(new Bundle(){
   //  val perfEvents = Output(new PerfEventsBundle(numPCntHcDir))
   //})

diff --git a/src/main/scala/huancun/noninclusive/ProbeHelper.scala b/src/main/scala/huancun/noninclusive/ProbeHelper.scala
@@ -5,7 +5,8 @@ import chisel3._
 import chisel3.util._
 import freechips.rocketchip.tilelink.{TLMessages, TLPermissions}
 import huancun.{HuanCunModule, MSHRRequest, MetaData}
-import utility.{MemReqSource, XSPerfAccumulate}
+import huancun.utils.XSPerfAccumulate
+import utility.MemReqSource
 
 class ProbeHelper(entries: Int = 5, enqDelay: Int = 1)(implicit p: Parameters)
   extends HuanCunModule with HasClientInfo
@@ -64,7 +65,7 @@ class ProbeHelper(entries: Int = 5, enqDelay: Int = 1)(implicit p: Parameters)
 
   io.probe <> queue.io.deq
 
-  XSPerfAccumulate("client_dir_conflict", queue.io.enq.fire)
+  XSPerfAccumulate(cacheParams, "client_dir_conflict", queue.io.enq.fire)
   //val perfinfo = IO(new Bundle(){
   //  val perfEvents = Output(new PerfEventsBundle(numPCntHcReqb))
   //})

diff --git a/src/main/scala/huancun/utils/XSPerfAccumulate.scala b/src/main/scala/huancun/utils/XSPerfAccumulate.scala
@@ -0,0 +1,122 @@
+package huancun.utils
+
+import chisel3._
+import huancun.HCCacheParameters
+import utility.{LogPerfHelper, LogPerfIO}
+
+object XSPerfAccumulate { // 64-bit accumulating perf counter, printed when the helper's dump signal is high
+  def apply(params: HCCacheParameters, perfName: String, perfCnt: UInt) = { // perfName: label printed before the value; perfCnt: amount added each cycle
+    if (params.enablePerf && !params.FPGAPlatform) { // Scala-level gate: no hardware is generated unless perf is enabled and this is not an FPGA build
+      val helper = Module(new LogPerfHelper) // a separate helper instance is created for every call
+      val perfClean = helper.io.clean // clears the counter
+      val perfDump = helper.io.dump // triggers the print below
+
+      val counter = RegInit(0.U(64.W))
+      val next_counter = counter + perfCnt // adds the value of perfCnt, so multi-bit inputs (e.g. a PopCount) accumulate by more than one
+      counter := Mux(perfClean, 0.U, next_counter) // clean takes priority over accumulation
+
+      when(perfDump) {
+        XSPerfPrint(p"$perfName, $next_counter\n")(helper.io) // prints next_counter, i.e. the total including this cycle's perfCnt
+      }
+    }
+  }
+}
+
+object XSPerfHistogram {
+  // Instead of accumulating into a single counter, this builds one counter per bin
+  // of width `step` over [start, stop) and prints every bin on dump.
+  def apply(
+    params:   HCCacheParameters, // supplies enablePerf / FPGAPlatform, which gate generation
+    perfName: String, // prefix of each printed bin name
+    perfCnt:  UInt, // value that is sorted into a bin
+    enable:   Bool, // a sample is taken only while this is high
+    start:    Int, // inclusive lower bound of the first bin; must be >= 0
+    stop:     Int, // upper bound of the histogram; must be > start
+    step:     Int, // bin width
+    lStrict:  Boolean = false, // when true, samples below start are dropped instead of landing in the first bin
+    rStrict:  Boolean = false // when true, samples >= stop are dropped instead of landing in the last bin
+  ) = {
+    if (params.enablePerf && !params.FPGAPlatform) { // Scala-level gate: nothing is generated otherwise
+      val helper = Module(new LogPerfHelper) // one helper per histogram, shared by all of its bins
+      val perfClean = helper.io.clean
+      val perfDump = helper.io.dump
+
+      // drop each perfCnt value into a bin
+      val nBins = (stop - start) / step // integer division: if step does not divide (stop - start), samples in the leftover range below stop match no bin
+      require(start >= 0)
+      require(stop > start)
+      require(nBins > 0) // also rejects step > (stop - start)
+
+      (0 until nBins).map { i =>
+        val binRangeStart = start + i * step
+        val binRangeStop = start + (i + 1) * step
+        val inRange = perfCnt >= binRangeStart.U && perfCnt < binRangeStop.U // half-open interval [binRangeStart, binRangeStop)
+
+        // if !lStrict and perfCnt < start, it will go to the first bin
+        val leftOutOfRange = if(!lStrict) perfCnt < start.U && i.U === 0.U else false.B // i.U === 0.U holds only for the first bin
+        // if !rStrict and perfCnt >= stop, it will go to the last bin
+        val rightOutOfRange = if(!rStrict) perfCnt >= stop.U && i.U === (nBins - 1).U else false.B // holds only for the last bin
+        val inc = inRange || leftOutOfRange || rightOutOfRange
+
+        val counter = RegInit(0.U(64.W)) // one 64-bit counter per bin
+        when(perfClean) { // clean takes priority over counting
+          counter := 0.U
+        }.elsewhen(enable && inc) {
+          counter := counter + 1.U // each qualifying sample adds exactly one
+        }
+
+        when(perfDump) {
+          XSPerfPrint(p"${perfName}_${binRangeStart}_${binRangeStop}, $counter\n")(helper.io) // prints the registered value; a sample taken this cycle is not yet included
+        }
+      }
+    }
+  }
+}
+
+object XSPerfMax { // tracks the largest perfCnt observed while enable is high
+  def apply(params: HCCacheParameters, perfName: String, perfCnt: UInt, enable: Bool) = { // perfName is printed with a "_max" suffix
+    if (params.enablePerf && !params.FPGAPlatform) { // Scala-level gate: nothing is generated otherwise
+      val helper = Module(new LogPerfHelper) // a separate helper instance is created for every call
+      val perfClean = helper.io.clean
+      val perfDump = helper.io.dump
+
+      val max = RegInit(0.U(64.W))
+      val next_max = Mux(enable && (perfCnt > max), perfCnt, max) // replaced only by a strictly larger sample taken while enabled
+      max := Mux(perfClean, 0.U, next_max) // clean takes priority and resets the maximum to 0
+
+      when(perfDump) {
+        XSPerfPrint(p"${perfName}_max, $next_max\n")(helper.io) // prints next_max, so a new maximum seen this cycle is included
+      }
+    }
+  }
+}
+
+object TransactionLatencyCounter {
+  // Counts the latency between a start signal and a stop signal.
+  // Whenever the stop signal arrives, the returned pair carries a latency sample.
+  def apply(start: Bool, stop: Bool): (Bool, UInt) = { // returns (sample-valid, latency value)
+    assert(!(start && stop)) // start and stop must not be high in the same cycle
+    val counter = RegInit(0.U(64.W))
+    val next_counter = counter + 1.U
+    counter := Mux(start || stop, 0.U, next_counter) // restarts from 0 on either start or stop, otherwise increments
+    (stop, next_counter) // the value is counter + 1, i.e. it already includes the current cycle
+  }
+}
+
+object XSPerfPrint { // prints a perf line prefixed with a "[PERF ]" tag and the timer value from ctrlInfo
+  def apply(fmt: String, data: Bits*)(ctrlInfo: LogPerfIO): Any = // format-string overload: packs fmt and data into a Printable and delegates
+    apply(Printable.pack(fmt, data: _*))(ctrlInfo)
+
+  def apply(pable: Printable)(ctrlInfo: LogPerfIO): Any = { // Printable overload: performs the actual printf
+    val commonInfo = p"[PERF ][time=${ctrlInfo.timer}] __PERCENTAGE_M__: " // NOTE(review): __PERCENTAGE_M__ is emitted literally here; presumably replaced by later tooling - confirm
+    printf(commonInfo + pable) // prefix and payload are concatenated into one printf
+  }
+}
+
+object GTimer { // builds a free-running 64-bit counter
+  def apply() = { // each call creates its own register
+    val c = RegInit(0.U(64.W)) // starts at 0 on reset
+    c := c + 1.U // incremented unconditionally
+    c // returns the register itself (the value before this cycle's increment)
+  }
+}
+1 −1		src/main/scala/utility/CircularQueuePtr.scala
+12 −4		src/main/scala/utility/ClockGate.scala
+2 −143		src/main/scala/utility/ClockGatedReg.scala
+9 −18		src/main/scala/utility/Constantin.scala
+9 −9		src/main/scala/utility/Hold.scala
+0 −92		src/main/scala/utility/LogUtils.scala
+1 −1		src/main/scala/utility/ParallelMux.scala
+0 −310		src/main/scala/utility/PerfCounterUtils.scala
+0 −1		src/main/scala/utility/TLUtils/BusKeyField.scala
+1 −1		src/main/scala/amba/ahb/RegisterRouter.scala
+1 −1		src/main/scala/amba/apb/RegisterRouter.scala
+1 −1		src/main/scala/amba/axi4/RegisterRouter.scala
+1 −5		src/main/scala/devices/tilelink/CLINT.scala
+1 −1		src/main/scala/diplomacy/Parameters.scala
+1 −1		src/main/scala/jtag/JtagStateMachine.scala
+1 −1		src/main/scala/jtag/package.scala
+4 −4		src/main/scala/regmapper/RegField.scala
+3 −3		src/main/scala/rocket/Frontend.scala
+1 −1		src/main/scala/rocket/HellaCache.scala
+4 −42		src/main/scala/rocket/RVC.scala
+1 −1		src/main/scala/subsystem/BaseSubsystem.scala
+1 −1		src/main/scala/tilelink/PatternPusher.scala
+1 −1		src/main/scala/tilelink/RegisterRouter.scala
+1 −1		src/main/scala/tilelink/Xbar.scala
+3 −3		src/main/scala/util/PlusArg.scala
+1 −1		src/main/scala/util/ShiftQueue.scala