Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

coherence: Pack Valid/Dirty SRAM #36

Merged
merged 3 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ sources:
- include_dirs:
- common/local/util
files:
- common/local/util/sram.sv
- common/local/util/sram_pulp.sv

- target: not(all(fpga, xilinx))
include_dirs:
Expand Down
90 changes: 90 additions & 0 deletions common/local/util/sram_pulp.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba <[email protected]>, ETH Zurich
// Michael Schaffner <[email protected]>, ETH Zurich
// Nils Wistoff <[email protected]>, ETH Zurich
// Date: 15.08.2018
// Description: generic tc_sram wrapper for CVA6
//
// Note: the wrapped module contains two different implementations for
// ALTERA and XILINX tools, since these follow different coding styles for
// inferrable RAMS with byte enable. define `FPGA_TARGET_XILINX or
// `FPGA_TARGET_ALTERA in your build environment (default is ALTERA)

module sram #(
parameter DATA_WIDTH = 64,
parameter BYTE_WIDTH = 8,
parameter USER_WIDTH = 1,
parameter USER_EN = 0,
parameter NUM_WORDS = 1024,
parameter SIM_INIT = "none",
parameter OUT_REGS = 0 // enables output registers in FPGA macro (read lat = 2)
)(
input logic clk_i,
input logic rst_ni,
input logic req_i,
input logic we_i,
input logic [$clog2(NUM_WORDS)-1:0] addr_i,
input logic [USER_WIDTH-1:0] wuser_i,
input logic [DATA_WIDTH-1:0] wdata_i,
input logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i,
output logic [USER_WIDTH-1:0] ruser_o,
output logic [DATA_WIDTH-1:0] rdata_o
);

tc_sram #(
.NumWords ( NUM_WORDS ),
.DataWidth ( DATA_WIDTH ),
.ByteWidth ( BYTE_WIDTH ),
.NumPorts ( 32'd1 ),
.Latency ( 32'd1 ),
.SimInit ( SIM_INIT ),
.PrintSimCfg ( 1'b0 )
) i_tc_sram (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.req_i ( req_i ),
.we_i ( we_i ),
.be_i ( be_i ),
.wdata_i ( wdata_i ),
.addr_i ( addr_i ),
.rdata_o ( rdata_o )
);

if (USER_EN > 0) begin : gen_mem_user
tc_sram #(
.NumWords ( NUM_WORDS ),
.DataWidth ( DATA_WIDTH ),
.ByteWidth ( BYTE_WIDTH ),
.NumPorts ( 32'd1 ),
.Latency ( 32'd1 ),
.SimInit ( SIM_INIT ),
.PrintSimCfg ( 1'b0 )
) i_tc_sram_user (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.req_i ( req_i ),
.we_i ( we_i ),
.be_i ( be_i ),
.wdata_i ( wuser_i ),
.addr_i ( addr_i ),
.rdata_o ( ruser_o )
);

if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width
$fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set).");
end

end else begin
assign ruser_o = '0;
end

endmodule : sram
2 changes: 1 addition & 1 deletion core/Flist.cva6
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,6 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer_if.sv
${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv
${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv
${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv
${CVA6_REPO_DIR}/common/local/util/sram.sv
${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv

// end of manifest
5 changes: 3 additions & 2 deletions core/cache_subsystem/cache_ctrl.sv
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,11 @@ module cache_ctrl import ariane_pkg::*; import std_cache_pkg::*; #(
addr_o = mem_req_q.index;
we_o = 1'b1;

be_o.vldrty = hit_way_q;

// set the correct byte enable
be_o.data[cl_offset>>3 +: 8] = mem_req_q.be;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data};
end
data_o.data[cl_offset +: 64] = mem_req_q.wdata;
// ~> change the state
data_o.dirty[cl_offset>>3 +: 8] = mem_req_q.be;
Expand Down
11 changes: 8 additions & 3 deletions core/cache_subsystem/miss_handler.sv
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,11 @@ module miss_handler import ariane_pkg::*; import std_cache_pkg::*; #(
addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0];
req_o = evict_way_q;
we_o = 1'b1;
be_o = '1;
be_o.vldrty = evict_way_q;
be_o.tag = '1;
be_o.data = '1;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (evict_way_q[i]) be_o.vldrty[i] = '1;
end
data_o.tag = mshr_q.addr[DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH-1:DCACHE_INDEX_WIDTH];
data_o.data = data_miss_fsm;
data_o.valid = 1'b1;
Expand Down Expand Up @@ -345,7 +348,9 @@ module miss_handler import ariane_pkg::*; import std_cache_pkg::*; #(
we_o = 1'b1;
data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1;
// invalidate
be_o.vldrty = evict_way_q;
for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin
if (evict_way_q[i]) be_o.vldrty[i] = '1;
end
// go back to handling the miss or flushing, depending on where we came from
state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS;
end
Expand Down
28 changes: 9 additions & 19 deletions core/cache_subsystem/std_nbdcache.sv
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ import std_cache_pkg::*;
cache_line_t wdata_ram;
cache_line_t [DCACHE_SET_ASSOC-1:0] rdata_ram;
cl_be_t be_ram;
logic [(DCACHE_LINE_WIDTH/8+1)*DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram;
vldrty_t [DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram;

// Busy signals
logic miss_handler_busy;
Expand Down Expand Up @@ -222,30 +222,20 @@ import std_cache_pkg::*;
// Valid/Dirty Regs
// ----------------

// align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals.
// note: if you have an SRAM that supports flat bit enables for your target technology,
// you can use it here to save the extra 17x overhead introduced by this workaround.
logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;
vldrty_t [DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata;

for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin
for (genvar j = 0; j < DCACHE_LINE_WIDTH/8; j++) begin
// dirty bits assignment
assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j];
assign rdata_ram[i].dirty[j] = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j];
end
// valid bit assignment
assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid;
assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i];
end

// be construction for valid_dirty_sram
for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin
assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}};
assign dirty_wdata[i] = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid};
assign rdata_ram[i].dirty = dirty_rdata[i].dirty;
assign rdata_ram[i].valid = dirty_rdata[i].valid;
assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid;
assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty;
end

sram #(
.USER_WIDTH ( 1 ),
.DATA_WIDTH ( (DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC ),
.DATA_WIDTH ( DCACHE_SET_ASSOC*$bits(vldrty_t) ),
.BYTE_WIDTH ( 1 ),
.NUM_WORDS ( DCACHE_NUM_WORDS )
) valid_dirty_sram (
.clk_i ( clk_i ),
Expand Down
11 changes: 8 additions & 3 deletions core/include/std_cache_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ package std_cache_pkg;
logic [63:0] rdata;
} bypass_rsp_t;

typedef struct packed {
logic [ariane_pkg::DCACHE_LINE_WIDTH/8-1:0] dirty;
logic valid;
} vldrty_t;

typedef struct packed {
logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag; // tag array
logic [ariane_pkg::DCACHE_LINE_WIDTH-1:0] data; // data array
Expand All @@ -69,9 +74,9 @@ package std_cache_pkg;

// cache line byte enable
typedef struct packed {
logic [(ariane_pkg::DCACHE_TAG_WIDTH+7)/8-1:0] tag; // byte enable into tag array
logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array
logic [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array (valid for a pair of dirty/valid bits)
logic [(ariane_pkg::DCACHE_TAG_WIDTH+7)/8-1:0] tag; // byte enable into tag array
logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array
vldrty_t [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array
} cl_be_t;

// convert one hot to bin for -> needed for cache replacement
Expand Down
4 changes: 2 additions & 2 deletions corev_apu/tb/ariane_tb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,10 +338,10 @@ int main(int argc, char **argv) {
size_t mem_size = 0xFFFFFF;
#if (VERILATOR_VERSION_INTEGER >= 5000000)
// Verilator v5: Use rootp pointer and .data() accessor.
memif.read(0x80000000, mem_size, (void *)top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram.data());
memif.read(0x80000000, mem_size, (void *)top->rootp->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram.data());
#else
// Verilator v4
memif.read(0x80000000, mem_size, (void *)top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram);
memif.read(0x80000000, mem_size, (void *)top->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram);
#endif
// memif.read(0x84000000, mem_size, (void *)top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem__DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram);

Expand Down
2 changes: 1 addition & 1 deletion corev_apu/tb/ariane_tb.sv
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import uvm_pkg::*;

`include "uvm_macros.svh"

`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)]
`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)]
// `define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem.gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)]

import "DPI-C" function read_elf(input string filename);
Expand Down