Skip to content

Commit

Permalink
Revert "perf: use perfUtils in Utility (#179)"
Browse files Browse the repository at this point in the history
This reverts commit f9dffb2.
  • Loading branch information
wakafa1 committed Dec 10, 2024
1 parent 3fc7e7e commit fd4398f
Show file tree
Hide file tree
Showing 12 changed files with 182 additions and 119 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
# This workflow contains two jobs: "tl-test_L2" and "tl-test_L2L3"
tl-test_L2:
# The type of runner that the job will run on
runs-on: ubuntu-24.04
runs-on: ubuntu-latest

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
Expand All @@ -35,7 +35,7 @@ jobs:
uses: coursier/cache-action@v5

- name: Verilator
run: sudo apt install verilator libsqlite3-dev
run: sudo apt install verilator

- name: Setup Mill
uses: jodersky/[email protected]
Expand All @@ -60,7 +60,7 @@ jobs:
tl-test_L2L3:
# The type of runner that the job will run on
runs-on: ubuntu-24.04
runs-on: ubuntu-latest

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
Expand All @@ -77,7 +77,7 @@ jobs:
uses: coursier/cache-action@v5

- name: Verilator
run: sudo apt install verilator libsqlite3-dev
run: sudo apt install verilator

- name: Setup Mill
uses: jodersky/[email protected]
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/huancun/DataStorage.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ package huancun
import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import huancun.utils.SRAMWrapper
import huancun.utils.{SRAMWrapper, XSPerfAccumulate}
import utility._

class DataStorage(implicit p: Parameters) extends HuanCunModule {
Expand Down Expand Up @@ -264,7 +264,7 @@ class DataStorage(implicit p: Parameters) extends HuanCunModule {
val debug_stack_used = PopCount(bank_en.grouped(stackSize).toList.map(seq => Cat(seq).orR))

for (i <- 1 to nrStacks) {
XSPerfAccumulate(s"DS_${i}_stacks_used", debug_stack_used === i.U)
XSPerfAccumulate(cacheParams, s"DS_${i}_stacks_used", debug_stack_used === i.U)
}

}
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/huancun/HuanCun.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import freechips.rocketchip.tilelink._
import freechips.rocketchip.tilelink.TLMessages._
import freechips.rocketchip.util.{BundleField, BundleFieldBase, UIntToOH1}
import huancun.prefetch._
import utils.ResetGen
import utils.{ResetGen, XSPerfAccumulate}
import utility.{Pipeline, FastArbiter}
import huancun.noninclusive.MSHR

Expand Down
22 changes: 11 additions & 11 deletions src/main/scala/huancun/MSHRAlloc.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import huancun.utils._
import utility._
import utility.{ParallelOR, ParallelPriorityMux}
import freechips.rocketchip.tilelink._

class MSHRSelector(implicit p: Parameters) extends HuanCunModule {
Expand Down Expand Up @@ -192,9 +192,9 @@ class MSHRAlloc(implicit p: Parameters) extends HuanCunModule {
}
val cntEnable =
!io.status(i).valid && cnt =/= 0.U && cntStart && cnt < 5000.U // Ignore huge cnt during L3 dir reset
XSPerfHistogram("mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 0, 300, 10, right_strict = true)
XSPerfHistogram("mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 300, 1000, 50, right_strict = true)
XSPerfMax("mshr_latency", cnt, cntEnable)
XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 0, 300, 10, rStrict = true)
XSPerfHistogram(cacheParams, "mshr_latency_" + Integer.toString(i, 10), cnt, cntEnable, 300, 1000, 50, lStrict = true)
XSPerfMax(cacheParams, "mshr_latency", cnt, cntEnable)
}
}

Expand All @@ -203,13 +203,13 @@ class MSHRAlloc(implicit p: Parameters) extends HuanCunModule {
(s.bits.set(block_granularity - 1, 0) === io.a_req.bits.set(block_granularity - 1, 0))
))

XSPerfAccumulate("nrWorkingABCmshr", PopCount(io.status.init.init.map(_.valid)))
XSPerfAccumulate("nrWorkingBmshr", io.status.take(mshrs+1).last.valid)
XSPerfAccumulate("nrWorkingCmshr", io.status.last.valid)
XSPerfAccumulate("conflictA", io.a_req.valid && conflict_a)
XSPerfAccumulate("conflictByPrefetch", io.a_req.valid && Cat(pretch_block_vec).orR)
XSPerfAccumulate("conflictB", io.b_req.valid && conflict_b)
XSPerfAccumulate("conflictC", io.c_req.valid && conflict_c)
XSPerfAccumulate(cacheParams, "nrWorkingABCmshr", PopCount(io.status.init.init.map(_.valid)))
XSPerfAccumulate(cacheParams, "nrWorkingBmshr", io.status.take(mshrs+1).last.valid)
XSPerfAccumulate(cacheParams, "nrWorkingCmshr", io.status.last.valid)
XSPerfAccumulate(cacheParams, "conflictA", io.a_req.valid && conflict_a)
XSPerfAccumulate(cacheParams, "conflictByPrefetch", io.a_req.valid && Cat(pretch_block_vec).orR)
XSPerfAccumulate(cacheParams, "conflictB", io.b_req.valid && conflict_b)
XSPerfAccumulate(cacheParams, "conflictC", io.c_req.valid && conflict_c)
//val perfinfo = IO(new Bundle(){
// val perfEvents = Output(new PerfEventsBundle(numPCntHcMSHR))
//})
Expand Down
17 changes: 9 additions & 8 deletions src/main/scala/huancun/RequestBuffer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ package huancun
import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utility.{FastArbiter, XSPerfAccumulate}
import huancun.utils.XSPerfAccumulate
import utility.FastArbiter

class RequestBuffer(flow: Boolean = true, entries: Int = 16)(implicit p: Parameters) extends HuanCunModule {

Expand Down Expand Up @@ -91,18 +92,18 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 16)(implicit p: Paramet
}
}

XSPerfAccumulate("req_buffer_merge", dup && !full)
XSPerfAccumulate(cacheParams, "req_buffer_merge", dup && !full)
if(flow){
XSPerfAccumulate("req_buffer_flow", no_ready_entry && io.in.fire)
XSPerfAccumulate(cacheParams, "req_buffer_flow", no_ready_entry && io.in.fire)
}
XSPerfAccumulate("req_buffer_alloc", alloc)
XSPerfAccumulate("req_buffer_full", full)
XSPerfAccumulate(cacheParams, "req_buffer_alloc", alloc)
XSPerfAccumulate(cacheParams, "req_buffer_full", full)
for(i <- 0 until entries){
val update = PopCount(valids) === i.U
XSPerfAccumulate(s"req_buffer_util_$i", update)
XSPerfAccumulate(cacheParams, s"req_buffer_util_$i", update)
}
XSPerfAccumulate("recv_prefetch", io.in.fire && io.in.bits.isPrefetch.getOrElse(false.B))
XSPerfAccumulate("recv_normal", io.in.fire && !io.in.bits.isPrefetch.getOrElse(false.B))
XSPerfAccumulate(cacheParams, "recv_prefetch", io.in.fire && io.in.bits.isPrefetch.getOrElse(false.B))
XSPerfAccumulate(cacheParams, "recv_normal", io.in.fire && !io.in.bits.isPrefetch.getOrElse(false.B))
val perfinfo = IO(Output(Vec(numPCntHcReqb, (UInt(6.W)))))
val perfEvents = Seq(
("req_buffer_merge ", dup && !full ),
Expand Down
17 changes: 9 additions & 8 deletions src/main/scala/huancun/TopDownMonitor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import huancun.noninclusive.DirResult
import utility.{MemReqSource, XSPerfAccumulate, XSPerfHistogram}
import huancun.utils.{XSPerfAccumulate, XSPerfHistogram}
import utility.MemReqSource

class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
val banks = 1 << bankBits
Expand Down Expand Up @@ -35,7 +36,7 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
}

addrMatch := Cat(addrMatchVec.flatten).orR
XSPerfAccumulate(s"${cacheParams.name}MissMatch_${hartId}", addrMatch)
XSPerfAccumulate(cacheParams, s"${cacheParams.name}MissMatch_${hartId}", addrMatch)
}

/* ====== PART TWO ======
Expand All @@ -55,16 +56,16 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
// val missVecAll = allMSHRMatchVec(s => s.fromA && s.is_miss)

val totalMSHRs = banks * mshrsAll
XSPerfHistogram("parallel_misses_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
XSPerfHistogram("parallel_misses_Pref", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
XSPerfHistogram("parallel_misses_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, 32, 1)
XSPerfHistogram(cacheParams, "parallel_misses_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
XSPerfHistogram(cacheParams, "parallel_misses_Pref", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
XSPerfHistogram(cacheParams, "parallel_misses_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, 32, 1)

/* ====== PART THREE ======
* Distinguish req sources and count num & miss
*/
// count releases
val releaseCnt = allMSHRMatchVec(s => s.will_free && s.fromC)
XSPerfAccumulate(s"${cacheParams.name}C_ReleaseCnt_Total", PopCount(releaseCnt))
XSPerfAccumulate(cacheParams, s"${cacheParams.name}C_ReleaseCnt_Total", PopCount(releaseCnt))

// we can follow the counting logic of Directory to count
// add reqSource in replacerInfo, set in MSHRAlloc, passes in Directory and get the result in DirResult
Expand All @@ -80,7 +81,7 @@ class TopDownMonitor()(implicit p: Parameters) extends HuanCunModule {
val sourceMatchVecMiss = dirResultMatchVec(r => r.replacerInfo.reqSource === i.U && !r.self.hit)

val sourceName = MemReqSource.apply(i).toString
XSPerfAccumulate(s"E2_${cacheParams.name}AReqSource_${sourceName}_Total", PopCount(sourceMatchVec))
XSPerfAccumulate(s"E2_${cacheParams.name}AReqSource_${sourceName}_Miss", PopCount(sourceMatchVecMiss))
XSPerfAccumulate(cacheParams, s"E2_${cacheParams.name}AReqSource_${sourceName}_Total", PopCount(sourceMatchVec))
XSPerfAccumulate(cacheParams, s"E2_${cacheParams.name}AReqSource_${sourceName}_Miss", PopCount(sourceMatchVecMiss))
}
}
24 changes: 12 additions & 12 deletions src/main/scala/huancun/noninclusive/Directory.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import huancun.MetaData._
import huancun._
import huancun.debug.{DirectoryLogger, TypeId}
import huancun.utils._
import utility.{GTimer, ParallelMax, ParallelPriorityMux, XSPerfAccumulate}
import utility.{ParallelMax, ParallelPriorityMux}

trait HasClientInfo { this: HasHuanCunParameters =>
// assume all clients have same params
Expand Down Expand Up @@ -316,18 +316,18 @@ class Directory(implicit p: Parameters)

assert(dirReadPorts == 1)
val req_r = RegEnable(req.bits, req.fire)
XSPerfAccumulate("selfdir_A_req", req_r.replacerInfo.channel(0) && resp.valid)
XSPerfAccumulate("selfdir_A_hit", RegNext(req_r.replacerInfo.channel(0) && resp.valid) && resp.bits.self.hit)
XSPerfAccumulate("selfdir_B_req", req_r.replacerInfo.channel(1) && resp.valid)
XSPerfAccumulate("selfdir_B_hit", RegNext(req_r.replacerInfo.channel(1) && resp.valid) && resp.bits.self.hit)
XSPerfAccumulate("selfdir_C_req", req_r.replacerInfo.channel(2) && resp.valid)
XSPerfAccumulate("selfdir_C_hit", RegNext(req_r.replacerInfo.channel(2) && resp.valid) && resp.bits.self.hit)
XSPerfAccumulate(cacheParams, "selfdir_A_req", req_r.replacerInfo.channel(0) && resp.valid)
XSPerfAccumulate(cacheParams, "selfdir_A_hit", RegNext(req_r.replacerInfo.channel(0) && resp.valid) && resp.bits.self.hit)
XSPerfAccumulate(cacheParams, "selfdir_B_req", req_r.replacerInfo.channel(1) && resp.valid)
XSPerfAccumulate(cacheParams, "selfdir_B_hit", RegNext(req_r.replacerInfo.channel(1) && resp.valid) && resp.bits.self.hit)
XSPerfAccumulate(cacheParams, "selfdir_C_req", req_r.replacerInfo.channel(2) && resp.valid)
XSPerfAccumulate(cacheParams, "selfdir_C_hit", RegNext(req_r.replacerInfo.channel(2) && resp.valid) && resp.bits.self.hit)

XSPerfAccumulate("selfdir_dirty", RegNext(resp.valid) && resp.bits.self.dirty)
XSPerfAccumulate("selfdir_TIP", RegNext(resp.valid) && resp.bits.self.state === TIP)
XSPerfAccumulate("selfdir_BRANCH", RegNext(resp.valid) && resp.bits.self.state === BRANCH)
XSPerfAccumulate("selfdir_TRUNK", RegNext(resp.valid) && resp.bits.self.state === TRUNK)
XSPerfAccumulate("selfdir_INVALID", RegNext(resp.valid) && resp.bits.self.state === INVALID)
XSPerfAccumulate(cacheParams, "selfdir_dirty", RegNext(resp.valid) && resp.bits.self.dirty)
XSPerfAccumulate(cacheParams, "selfdir_TIP", RegNext(resp.valid) && resp.bits.self.state === TIP)
XSPerfAccumulate(cacheParams, "selfdir_BRANCH", RegNext(resp.valid) && resp.bits.self.state === BRANCH)
XSPerfAccumulate(cacheParams, "selfdir_TRUNK", RegNext(resp.valid) && resp.bits.self.state === TRUNK)
XSPerfAccumulate(cacheParams, "selfdir_INVALID", RegNext(resp.valid) && resp.bits.self.state === INVALID)
//val perfinfo = IO(new Bundle(){
// val perfEvents = Output(new PerfEventsBundle(numPCntHcDir))
//})
Expand Down
5 changes: 3 additions & 2 deletions src/main/scala/huancun/noninclusive/ProbeHelper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.{TLMessages, TLPermissions}
import huancun.{HuanCunModule, MSHRRequest, MetaData}
import utility.{MemReqSource, XSPerfAccumulate}
import huancun.utils.XSPerfAccumulate
import utility.MemReqSource

class ProbeHelper(entries: Int = 5, enqDelay: Int = 1)(implicit p: Parameters)
extends HuanCunModule with HasClientInfo
Expand Down Expand Up @@ -64,7 +65,7 @@ class ProbeHelper(entries: Int = 5, enqDelay: Int = 1)(implicit p: Parameters)

io.probe <> queue.io.deq

XSPerfAccumulate("client_dir_conflict", queue.io.enq.fire)
XSPerfAccumulate(cacheParams, "client_dir_conflict", queue.io.enq.fire)
//val perfinfo = IO(new Bundle(){
// val perfEvents = Output(new PerfEventsBundle(numPCntHcReqb))
//})
Expand Down
122 changes: 122 additions & 0 deletions src/main/scala/huancun/utils/XSPerfAccumulate.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package huancun.utils

import chisel3._
import huancun.HCCacheParameters
import utility.{LogPerfHelper, LogPerfIO}

/** Running-total performance counter that is printed on request. */
object XSPerfAccumulate {
  /** Elaborates a 64-bit accumulator for `perfCnt`.
    *
    * Nothing is generated unless `params.enablePerf` is set and
    * `params.FPGAPlatform` is not.
    *
    * @param params   cache parameters that gate whether the counter is generated
    * @param perfName label printed in front of the counter value
    * @param perfCnt  amount added to the running total
    */
  def apply(params: HCCacheParameters, perfName: String, perfCnt: UInt) = {
    if (params.enablePerf && !params.FPGAPlatform) {
      // One LogPerfHelper per counter; it supplies the clean/dump controls
      // here and is also handed to XSPerfPrint below.
      val helper = Module(new LogPerfHelper)
      val perfClean = helper.io.clean
      val perfDump = helper.io.dump

      val counter = RegInit(0.U(64.W))
      val next_counter = counter + perfCnt
      // `clean` wins over accumulation: the register restarts from zero.
      counter := Mux(perfClean, 0.U, next_counter)

      when(perfDump) {
        // Prints next_counter (register value plus the current perfCnt),
        // not the registered value alone.
        XSPerfPrint(p"$perfName, $next_counter\n")(helper.io)
      }
    }
  }
}

/** Histogram-style performance counter: one 64-bit counter per value bin. */
object XSPerfHistogram {
  /** Drops each enabled sample of `perfCnt` into a bin instead of simply
    * accumulating it, and prints every bin on request.
    *
    * `nBins = (stop - start) / step` bins are created (integer division);
    * bin `i` covers the half-open range `[start + i*step, start + (i+1)*step)`.
    * Nothing is generated, and no argument is checked, unless
    * `params.enablePerf` is set and `params.FPGAPlatform` is not.
    *
    * NOTE(review): when `stop - start` is not a multiple of `step`, values in
    * `[start + nBins*step, stop)` match no bin and are not counted. This is
    * kept as-is so existing counts do not change.
    *
    * @param params   cache parameters that gate whether the histogram is generated
    * @param perfName label prefix; each bin prints as `perfName_<binStart>_<binStop>`
    * @param perfCnt  value to classify
    * @param enable   a sample is counted only while this is high
    * @param start    lower bound of the first bin, must be >= 0
    * @param stop     upper bound of the histogram range, must be > start
    * @param step     bin width, must be > 0 and small enough to give at least one bin
    * @param lStrict  when false, values below `start` are counted in the first bin
    * @param rStrict  when false, values at or above `stop` are counted in the last bin
    * @throws IllegalArgumentException if the range or step is invalid
    */
  def apply(
    params: HCCacheParameters,
    perfName: String,
    perfCnt: UInt,
    enable: Bool,
    start: Int,
    stop: Int,
    step: Int,
    lStrict: Boolean = false,
    rStrict: Boolean = false
  ): Unit = {
    if (params.enablePerf && !params.FPGAPlatform) {
      // Validate before dividing so a zero step fails with a clear message
      // rather than an ArithmeticException.
      require(start >= 0, s"XSPerfHistogram($perfName): start must be >= 0, got $start")
      require(stop > start, s"XSPerfHistogram($perfName): stop ($stop) must be > start ($start)")
      require(step > 0, s"XSPerfHistogram($perfName): step must be > 0, got $step")
      val nBins = (stop - start) / step
      require(nBins > 0, s"XSPerfHistogram($perfName): step ($step) leaves no bins in [$start, $stop)")

      val helper = Module(new LogPerfHelper)
      val perfClean = helper.io.clean
      val perfDump = helper.io.dump

      (0 until nBins).foreach { i =>
        val binRangeStart = start + i * step
        val binRangeStop = start + (i + 1) * step
        val inRange = perfCnt >= binRangeStart.U && perfCnt < binRangeStop.U

        // The bin index is known at elaboration time, so decide in Scala
        // which bins absorb out-of-range samples.
        // if !lStrict and perfCnt < start, it will go to the first bin
        val leftOutOfRange = if (!lStrict && i == 0) perfCnt < start.U else false.B
        // if !rStrict and perfCnt >= stop, it will go to the last bin
        val rightOutOfRange = if (!rStrict && i == nBins - 1) perfCnt >= stop.U else false.B
        val inc = inRange || leftOutOfRange || rightOutOfRange

        val counter = RegInit(0.U(64.W))
        // `clean` takes priority over counting.
        when(perfClean) {
          counter := 0.U
        }.elsewhen(enable && inc) {
          counter := counter + 1.U
        }

        when(perfDump) {
          XSPerfPrint(p"${perfName}_${binRangeStart}_${binRangeStop}, $counter\n")(helper.io)
        }
      }
    }
  }
}

/** Performance counter that remembers the largest value seen. */
object XSPerfMax {
  /** Elaborates a 64-bit register holding the maximum of `perfCnt` over the
    * cycles where `enable` is high.
    *
    * Nothing is generated unless `params.enablePerf` is set and
    * `params.FPGAPlatform` is not.
    *
    * @param params   cache parameters that gate whether the logic is generated
    * @param perfName label; the value is printed as `perfName_max`
    * @param perfCnt  value being tracked
    * @param enable   only cycles with this high may raise the maximum
    */
  def apply(params: HCCacheParameters, perfName: String, perfCnt: UInt, enable: Bool) = {
    if (params.enablePerf && !params.FPGAPlatform) {
      val helper = Module(new LogPerfHelper)
      val perfClean = helper.io.clean
      val perfDump = helper.io.dump

      val max = RegInit(0.U(64.W))
      // Keep the old maximum unless an enabled sample exceeds it.
      val next_max = Mux(enable && (perfCnt > max), perfCnt, max)
      // `clean` wins over the update and restarts tracking from zero.
      max := Mux(perfClean, 0.U, next_max)

      when(perfDump) {
        // Prints next_max, so a new maximum arriving in the dump cycle is included.
        XSPerfPrint(p"${perfName}_max, $next_max\n")(helper.io)
      }
    }
  }
}

/** Cycle counter for the latency between a start and a stop event. */
object TransactionLatencyCounter {
  // count the latency between start signal and stop signal
  // whenever stop signals comes, we create a latency sample
  /** @param start restarts the cycle count from zero
    * @param stop  marks the end of a transaction; also restarts the count
    * @return `(stop, latency)`: `stop` is passed through as the sample-valid
    *         flag and `latency` is the current count plus one
    */
  def apply(start: Bool, stop: Bool): (Bool, UInt) = {
    // start and stop must never be asserted together
    // (the argument is a Bool, so this is presumably Chisel's hardware assert).
    assert(!(start && stop))
    val counter = RegInit(0.U(64.W))
    val next_counter = counter + 1.U
    // Either event clears the register; otherwise it counts up every cycle.
    counter := Mux(start || stop, 0.U, next_counter)
    (stop, next_counter)
  }
}

/** Emits one performance-log line with the common `[PERF ]` header. */
object XSPerfPrint {
  /** Format-string variant: packs `fmt` and `data` into a `Printable` and
    * delegates to the `Printable` overload.
    *
    * @param fmt      printf-style format string
    * @param data     hardware values referenced by `fmt`
    * @param ctrlInfo perf-log control bundle; its `timer` is printed in the header
    */
  def apply(fmt: String, data: Bits*)(ctrlInfo: LogPerfIO): Any = {
    val packed: Printable = Printable.pack(fmt, data: _*)
    apply(packed)(ctrlInfo)
  }

  /** Prints `pable` prefixed with the header carrying `ctrlInfo.timer`.
    *
    * The printf is unconditional here; callers wrap the call in their own
    * `when(...)` to decide when it fires.
    *
    * @param pable    message body appended after the header
    * @param ctrlInfo perf-log control bundle; its `timer` is printed in the header
    */
  def apply(pable: Printable)(ctrlInfo: LogPerfIO): Any =
    // NOTE(review): `__PERCENTAGE_M__` is presumably a placeholder that is
    // substituted later in the flow. Confirm against the log tooling.
    printf(p"[PERF ][time=${ctrlInfo.timer}] __PERCENTAGE_M__: " + pable)
}

/** Free-running cycle counter. */
object GTimer {
  /** Elaborates a 64-bit register that starts at zero and increments by one
    * every cycle.
    *
    * Each call creates its own register.
    *
    * @return the counter register
    */
  def apply() = {
    val c = RegInit(0.U(64.W))
    c := c + 1.U
    c
  }
}
Loading

0 comments on commit fd4398f

Please sign in to comment.