Skip to content

Commit

Permalink
coherence: Pack Valid/Dirty SRAM (#36)
Browse files Browse the repository at this point in the history
* sram: Replace by sram_pulp

Does not introduce cuts and allows for parametric byte width

Signed-off-by: Nils Wistoff <[email protected]>

* std_cache: Pack valid/dirty SRAM

Signed-off-by: Nils Wistoff <[email protected]>

* std_cache: Decouple Valid/Dirty WE from Data WE

Signed-off-by: Nils Wistoff <[email protected]>

---------

Signed-off-by: Nils Wistoff <[email protected]>
  • Loading branch information
niwis committed Dec 20, 2023
1 parent fac0db2 commit b8c3895
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 64 deletions.
2 changes: 1 addition & 1 deletion Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ sources:
- include_dirs:
- common/local/util
files:
- common/local/util/sram.sv
- common/local/util/sram_pulp.sv

- target: not(all(fpga, xilinx))
include_dirs:
Expand Down
90 changes: 90 additions & 0 deletions common/local/util/sram_pulp.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba <[email protected]>, ETH Zurich
// Michael Schaffner <[email protected]>, ETH Zurich
// Nils Wistoff <[email protected]>, ETH Zurich
// Date: 15.08.2018
// Description: generic tc_sram wrapper for CVA6
//
// Note: the wrapped module contains two different implementations for
// ALTERA and XILINX tools, since these follow different coding styles for
// inferrable RAMS with byte enable. define `FPGA_TARGET_XILINX or
// `FPGA_TARGET_ALTERA in your build environment (default is ALTERA)

module sram #(
parameter DATA_WIDTH = 64,
parameter BYTE_WIDTH = 8,
parameter USER_WIDTH = 1,
parameter USER_EN = 0,
parameter NUM_WORDS = 1024,
parameter SIM_INIT = "none",
parameter OUT_REGS = 0 // enables output registers in FPGA macro (read lat = 2)
)(
input logic clk_i,
input logic rst_ni,
input logic req_i,
input logic we_i,
input logic [$clog2(NUM_WORDS)-1:0] addr_i,
input logic [USER_WIDTH-1:0] wuser_i,
input logic [DATA_WIDTH-1:0] wdata_i,
input logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i,
output logic [USER_WIDTH-1:0] ruser_o,
output logic [DATA_WIDTH-1:0] rdata_o
);

tc_sram #(
.NumWords ( NUM_WORDS ),
.DataWidth ( DATA_WIDTH ),
.ByteWidth ( BYTE_WIDTH ),
.NumPorts ( 32'd1 ),
.Latency ( 32'd1 ),
.SimInit ( SIM_INIT ),
.PrintSimCfg ( 1'b0 )
) i_tc_sram (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.req_i ( req_i ),
.we_i ( we_i ),
.be_i ( be_i ),
.wdata_i ( wdata_i ),
.addr_i ( addr_i ),
.rdata_o ( rdata_o )
);

if (USER_EN > 0) begin : gen_mem_user
tc_sram #(
.NumWords ( NUM_WORDS ),
.DataWidth ( DATA_WIDTH ),
.ByteWidth ( BYTE_WIDTH ),
.NumPorts ( 32'd1 ),
.Latency ( 32'd1 ),
.SimInit ( SIM_INIT ),
.PrintSimCfg ( 1'b0 )
) i_tc_sram_user (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.req_i ( req_i ),
.we_i ( we_i ),
.be_i ( be_i ),
.wdata_i ( wuser_i ),
.addr_i ( addr_i ),
.rdata_o ( ruser_o )
);

if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width
$fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set).");
end

end else begin
assign ruser_o = '0;
end

endmodule : sram
2 changes: 1 addition & 1 deletion core/Flist.cva6
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer_if.sv
${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv
${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv
${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv
${CVA6_REPO_DIR}/common/local/util/sram.sv
${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv

// MMU Sv39
${CVA6_REPO_DIR}/core/mmu_sv39/mmu.sv
Expand Down
13 changes: 7 additions & 6 deletions core/cache_subsystem/cache_ctrl.sv
Original file line number Diff line number Diff line change
Expand Up @@ -298,14 +298,15 @@ module cache_ctrl
// two memory look-ups on a single-ported SRAM and therefore is non-atomic
if (!mshr_index_matches_i) begin
// store data, write dirty bit
req_o = hit_way_q;
addr_o = mem_req_q.index;
we_o = 1'b1;

be_o.vldrty = hit_way_q;
req_o = hit_way_q;
addr_o = mem_req_q.index;
we_o = 1'b1;

// set the correct byte enable
be_o.data[cl_offset>>3+:8] = mem_req_q.be;
be_o.data[cl_offset>>3+:8] = mem_req_q.be;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data};
end
data_o.data[cl_offset+:64] = mem_req_q.wdata;
// ~> change the state
data_o.dirty[cl_offset>>3+:8] = mem_req_q.be;
Expand Down
19 changes: 12 additions & 7 deletions core/cache_subsystem/miss_handler.sv
Original file line number Diff line number Diff line change
Expand Up @@ -299,11 +299,14 @@ module miss_handler
// we've got a valid response from refill unit
if (valid_miss_fsm) begin

addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0];
req_o = evict_way_q;
we_o = 1'b1;
be_o = '1;
be_o.vldrty = evict_way_q;
addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0];
req_o = evict_way_q;
we_o = 1'b1;
be_o.tag = '1;
be_o.data = '1;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (evict_way_q[i]) be_o.vldrty[i] = '1;
end
data_o.tag = mshr_q.addr[DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH-1:DCACHE_INDEX_WIDTH];
data_o.data = data_miss_fsm;
data_o.valid = 1'b1;
Expand Down Expand Up @@ -350,9 +353,11 @@ module miss_handler
we_o = 1'b1;
data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1;
// invalidate
be_o.vldrty = evict_way_q;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (evict_way_q[i]) be_o.vldrty[i] = '1;
end
// go back to handling the miss or flushing, depending on where we came from
state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS;
state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS;
end
end

Expand Down
84 changes: 37 additions & 47 deletions core/cache_subsystem/std_nbdcache.sv
Original file line number Diff line number Diff line change
Expand Up @@ -55,46 +55,46 @@ module std_nbdcache
// 3. Load Unit
// 4. Accelerator
// 5. Store unit
logic [NumPorts:0][DCACHE_SET_ASSOC-1:0] req;
logic [NumPorts:0][DCACHE_INDEX_WIDTH-1:0] addr;
logic [NumPorts:0] gnt;
cache_line_t [DCACHE_SET_ASSOC-1:0] rdata;
logic [NumPorts:0][DCACHE_TAG_WIDTH-1:0] tag;
logic [ NumPorts:0][ DCACHE_SET_ASSOC-1:0] req;
logic [ NumPorts:0][DCACHE_INDEX_WIDTH-1:0] addr;
logic [ NumPorts:0] gnt;
cache_line_t [ DCACHE_SET_ASSOC-1:0] rdata;
logic [ NumPorts:0][ DCACHE_TAG_WIDTH-1:0] tag;

cache_line_t [NumPorts:0] wdata;
logic [NumPorts:0] we;
cl_be_t [NumPorts:0] be;
logic [DCACHE_SET_ASSOC-1:0] hit_way;
cache_line_t [ NumPorts:0] wdata;
logic [ NumPorts:0] we;
cl_be_t [ NumPorts:0] be;
logic [ DCACHE_SET_ASSOC-1:0] hit_way;
// -------------------------------
// Controller <-> Miss unit
// -------------------------------
logic [NumPorts-1:0] busy;
logic [NumPorts-1:0][55:0] mshr_addr;
logic [NumPorts-1:0] mshr_addr_matches;
logic [NumPorts-1:0] mshr_index_matches;
logic [63:0] critical_word;
logic critical_word_valid;
logic [ NumPorts-1:0] busy;
logic [ NumPorts-1:0][ 55:0] mshr_addr;
logic [ NumPorts-1:0] mshr_addr_matches;
logic [ NumPorts-1:0] mshr_index_matches;
logic [ 63:0] critical_word;
logic critical_word_valid;

logic [NumPorts-1:0][$bits(miss_req_t)-1:0] miss_req;
logic [NumPorts-1:0] miss_gnt;
logic [NumPorts-1:0] active_serving;
logic [ NumPorts-1:0][ $bits(miss_req_t)-1:0] miss_req;
logic [ NumPorts-1:0] miss_gnt;
logic [ NumPorts-1:0] active_serving;

logic [NumPorts-1:0] bypass_gnt;
logic [NumPorts-1:0] bypass_valid;
logic [NumPorts-1:0][63:0] bypass_data;
logic [ NumPorts-1:0] bypass_gnt;
logic [ NumPorts-1:0] bypass_valid;
logic [ NumPorts-1:0][ 63:0] bypass_data;
// -------------------------------
// Arbiter <-> Datram,
// -------------------------------
logic [DCACHE_SET_ASSOC-1:0] req_ram;
logic [DCACHE_INDEX_WIDTH-1:0] addr_ram;
logic we_ram;
cache_line_t wdata_ram;
cache_line_t [DCACHE_SET_ASSOC-1:0] rdata_ram;
cl_be_t be_ram;
logic [(DCACHE_LINE_WIDTH/8+1)*DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram;
logic [ DCACHE_SET_ASSOC-1:0] req_ram;
logic [DCACHE_INDEX_WIDTH-1:0] addr_ram;
logic we_ram;
cache_line_t wdata_ram;
cache_line_t [ DCACHE_SET_ASSOC-1:0] rdata_ram;
cl_be_t be_ram;
vldrty_t [ DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram;

// Busy signals
logic miss_handler_busy;
logic miss_handler_busy;
assign busy_o = |busy | miss_handler_busy;

// ------------------
Expand Down Expand Up @@ -222,30 +222,20 @@ module std_nbdcache
// Valid/Dirty Regs
// ----------------

// align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals.
// note: if you have an SRAM that supports flat bit enables for your target technology,
// you can use it here to save the extra 17x overhead introduced by this workaround.
logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;
vldrty_t [DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;

for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin
for (genvar j = 0; j < DCACHE_LINE_WIDTH / 8; j++) begin
// dirty bits assignment
assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j];
assign rdata_ram[i].dirty[j] = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j];
end
// valid bit assignment
assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid;
assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i];
end

// be construction for valid_dirty_sram
for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin
assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}};
assign dirty_wdata[i] = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid};
assign rdata_ram[i].dirty = dirty_rdata[i].dirty;
assign rdata_ram[i].valid = dirty_rdata[i].valid;
assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid;
assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty;
end

sram #(
.USER_WIDTH(1),
.DATA_WIDTH((DCACHE_LINE_WIDTH + 8) * DCACHE_SET_ASSOC),
.DATA_WIDTH(DCACHE_SET_ASSOC * $bits(vldrty_t)),
.BYTE_WIDTH(1),
.NUM_WORDS (DCACHE_NUM_WORDS)
) valid_dirty_sram (
.clk_i (clk_i),
Expand Down
7 changes: 6 additions & 1 deletion core/include/std_cache_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ package std_cache_pkg;
} bypass_rsp_t;

typedef struct packed {
logic [ariane_pkg::DCACHE_LINE_WIDTH/8-1:0] dirty;
logic valid;
} vldrty_t;

typedef struct packed {
logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag; // tag array
logic [ariane_pkg::DCACHE_LINE_WIDTH-1:0] data; // data array
logic valid; // state array
Expand All @@ -72,7 +77,7 @@ package std_cache_pkg;
typedef struct packed {
logic [(ariane_pkg::DCACHE_TAG_WIDTH+7)/8-1:0] tag; // byte enable into tag array
logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array
logic [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array (valid for a pair of dirty/valid bits)
vldrty_t [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array
} cl_be_t;

// convert one hot to bin for -> needed for cache replacement
Expand Down
2 changes: 1 addition & 1 deletion corev_apu/tb/ariane_tb.sv
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import uvm_pkg::*;

`include "uvm_macros.svh"

`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)]
`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)]
// `define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem.gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)]

import "DPI-C" function read_elf(input string filename);
Expand Down

0 comments on commit b8c3895

Please sign in to comment.