diff --git a/Bender.yml b/Bender.yml index 774929d4ba..c84fcf4c55 100644 --- a/Bender.yml +++ b/Bender.yml @@ -201,7 +201,7 @@ sources: - include_dirs: - common/local/util files: - - common/local/util/sram.sv + - common/local/util/sram_pulp.sv - target: not(all(fpga, xilinx)) include_dirs: diff --git a/common/local/util/sram_pulp.sv b/common/local/util/sram_pulp.sv new file mode 100644 index 0000000000..f3c4773bf4 --- /dev/null +++ b/common/local/util/sram_pulp.sv @@ -0,0 +1,90 @@ +// Copyright 2018 ETH Zurich and University of Bologna. +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// +// Author: Florian Zaruba , ETH Zurich +// Michael Schaffner , ETH Zurich +// Nils Wistoff , ETH Zurich +// Date: 15.08.2018 +// Description: generic tc_sram wrapper for CVA6 +// +// Note: the wrapped module contains two different implementations for +// ALTERA and XILINX tools, since these follow different coding styles for +// inferrable RAMS with byte enable. define `FPGA_TARGET_XILINX or +// `FPGA_TARGET_ALTERA in your build environment (default is ALTERA) + +module sram #( + parameter DATA_WIDTH = 64, + parameter BYTE_WIDTH = 8, + parameter USER_WIDTH = 1, + parameter USER_EN = 0, + parameter NUM_WORDS = 1024, + parameter SIM_INIT = "none", + parameter OUT_REGS = 0 // enables output registers in FPGA macro (read lat = 2) +)( + input logic clk_i, + input logic rst_ni, + input logic req_i, + input logic we_i, + input logic [$clog2(NUM_WORDS)-1:0] addr_i, + input logic [USER_WIDTH-1:0] wuser_i, + input logic [DATA_WIDTH-1:0] wdata_i, + input logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i, + output logic [USER_WIDTH-1:0] ruser_o, + output logic [DATA_WIDTH-1:0] rdata_o +); + + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wdata_i ), + .addr_i ( addr_i ), + .rdata_o ( rdata_o ) + ); + + if (USER_EN > 0) begin : gen_mem_user + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram_user ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wuser_i ), + .addr_i ( addr_i ), + .rdata_o ( ruser_o ) + ); + + if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width + $fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set)."); + end + + end else begin + assign ruser_o = '0; + end + +endmodule : sram diff --git a/core/Flist.cva6 b/core/Flist.cva6 index 342f053e3c..aad4adef60 100644 --- a/core/Flist.cva6 +++ b/core/Flist.cva6 @@ -179,7 +179,7 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer_if.sv ${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv ${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv ${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv -${CVA6_REPO_DIR}/common/local/util/sram.sv +${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv // MMU Sv39 ${CVA6_REPO_DIR}/core/mmu_sv39/mmu.sv diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index dfd42a84fb..ee3da0d45f 100644 --- a/core/cache_subsystem/cache_ctrl.sv +++ b/core/cache_subsystem/cache_ctrl.sv @@ -298,14 +298,15 @@ module cache_ctrl // two memory look-ups on a single-ported SRAM and therefore is non-atomic if (!mshr_index_matches_i) begin // store data, write dirty bit - req_o = hit_way_q; - addr_o = mem_req_q.index; - we_o = 1'b1; - - be_o.vldrty = hit_way_q; + req_o = hit_way_q; + addr_o = mem_req_q.index; + we_o = 1'b1; // set the correct byte enable - be_o.data[cl_offset>>3+:8] = mem_req_q.be; + be_o.data[cl_offset>>3+:8] = mem_req_q.be; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data}; + end data_o.data[cl_offset+:64] = mem_req_q.wdata; // ~> change the state data_o.dirty[cl_offset>>3+:8] = mem_req_q.be; diff --git a/core/cache_subsystem/miss_handler.sv b/core/cache_subsystem/miss_handler.sv index ae66229523..aba15bfbbc 100644 --- a/core/cache_subsystem/miss_handler.sv +++ b/core/cache_subsystem/miss_handler.sv @@ -299,11 +299,14 @@ module miss_handler // we've got a valid response from refill unit if (valid_miss_fsm) begin - addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0]; - req_o = evict_way_q; - we_o = 1'b1; - be_o = '1; - be_o.vldrty = evict_way_q; + addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0]; + req_o = evict_way_q; + we_o = 1'b1; + be_o.tag = '1; + be_o.data = '1; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end data_o.tag = mshr_q.addr[DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH-1:DCACHE_INDEX_WIDTH]; data_o.data = data_miss_fsm; data_o.valid = 1'b1; @@ -350,9 +353,11 @@ module miss_handler we_o = 1'b1; data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1; // invalidate - be_o.vldrty = evict_way_q; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end // go back to handling the miss or flushing, depending on where we came from - state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS; + state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS; end end diff --git a/core/cache_subsystem/std_nbdcache.sv b/core/cache_subsystem/std_nbdcache.sv index b82158de31..5d904cf998 100644 --- a/core/cache_subsystem/std_nbdcache.sv +++ b/core/cache_subsystem/std_nbdcache.sv @@ -55,46 +55,46 @@ module std_nbdcache // 3. Load Unit // 4. Accelerator // 5. Store unit - logic [NumPorts:0][DCACHE_SET_ASSOC-1:0] req; - logic [NumPorts:0][DCACHE_INDEX_WIDTH-1:0] addr; - logic [NumPorts:0] gnt; - cache_line_t [DCACHE_SET_ASSOC-1:0] rdata; - logic [NumPorts:0][DCACHE_TAG_WIDTH-1:0] tag; + logic [ NumPorts:0][ DCACHE_SET_ASSOC-1:0] req; + logic [ NumPorts:0][DCACHE_INDEX_WIDTH-1:0] addr; + logic [ NumPorts:0] gnt; + cache_line_t [ DCACHE_SET_ASSOC-1:0] rdata; + logic [ NumPorts:0][ DCACHE_TAG_WIDTH-1:0] tag; - cache_line_t [NumPorts:0] wdata; - logic [NumPorts:0] we; - cl_be_t [NumPorts:0] be; - logic [DCACHE_SET_ASSOC-1:0] hit_way; + cache_line_t [ NumPorts:0] wdata; + logic [ NumPorts:0] we; + cl_be_t [ NumPorts:0] be; + logic [ DCACHE_SET_ASSOC-1:0] hit_way; // ------------------------------- // Controller <-> Miss unit // ------------------------------- - logic [NumPorts-1:0] busy; - logic [NumPorts-1:0][55:0] mshr_addr; - logic [NumPorts-1:0] mshr_addr_matches; - logic [NumPorts-1:0] mshr_index_matches; - logic [63:0] critical_word; - logic critical_word_valid; + logic [ NumPorts-1:0] busy; + logic [ NumPorts-1:0][ 55:0] mshr_addr; + logic [ NumPorts-1:0] mshr_addr_matches; + logic [ NumPorts-1:0] mshr_index_matches; + logic [ 63:0] critical_word; + logic critical_word_valid; - logic [NumPorts-1:0][$bits(miss_req_t)-1:0] miss_req; - logic [NumPorts-1:0] miss_gnt; - logic [NumPorts-1:0] active_serving; + logic [ NumPorts-1:0][ $bits(miss_req_t)-1:0] miss_req; + logic [ NumPorts-1:0] miss_gnt; + logic [ NumPorts-1:0] active_serving; - logic [NumPorts-1:0] bypass_gnt; - logic [NumPorts-1:0] bypass_valid; - logic [NumPorts-1:0][63:0] bypass_data; + logic [ NumPorts-1:0] bypass_gnt; + logic [ NumPorts-1:0] bypass_valid; + logic [ NumPorts-1:0][ 63:0] bypass_data; // ------------------------------- // Arbiter <-> Datram, // ------------------------------- - logic [DCACHE_SET_ASSOC-1:0] req_ram; - logic [DCACHE_INDEX_WIDTH-1:0] addr_ram; - logic we_ram; - cache_line_t wdata_ram; - cache_line_t [DCACHE_SET_ASSOC-1:0] rdata_ram; - cl_be_t be_ram; - logic [(DCACHE_LINE_WIDTH/8+1)*DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram; + logic [ DCACHE_SET_ASSOC-1:0] req_ram; + logic [DCACHE_INDEX_WIDTH-1:0] addr_ram; + logic we_ram; + cache_line_t wdata_ram; + cache_line_t [ DCACHE_SET_ASSOC-1:0] rdata_ram; + cl_be_t be_ram; + vldrty_t [ DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram; // Busy signals - logic miss_handler_busy; + logic miss_handler_busy; assign busy_o = |busy | miss_handler_busy; // ------------------ @@ -222,30 +222,20 @@ module std_nbdcache // Valid/Dirty Regs // ---------------- - // align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals. - // note: if you have an SRAM that supports flat bit enables for your target technology, - // you can use it here to save the extra 17x overhead introduced by this workaround. - logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; + vldrty_t [DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin - for (genvar j = 0; j < DCACHE_LINE_WIDTH / 8; j++) begin - // dirty bits assignment - assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j]; - assign rdata_ram[i].dirty[j] = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j]; - end - // valid bit assignment - assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid; - assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i]; - end - - // be construction for valid_dirty_sram - for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin - assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}}; + assign dirty_wdata[i] = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid}; + assign rdata_ram[i].dirty = dirty_rdata[i].dirty; + assign rdata_ram[i].valid = dirty_rdata[i].valid; + assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid; + assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty; end sram #( .USER_WIDTH(1), - .DATA_WIDTH((DCACHE_LINE_WIDTH + 8) * DCACHE_SET_ASSOC), + .DATA_WIDTH(DCACHE_SET_ASSOC * $bits(vldrty_t)), + .BYTE_WIDTH(1), .NUM_WORDS (DCACHE_NUM_WORDS) ) valid_dirty_sram ( .clk_i (clk_i), diff --git a/core/include/std_cache_pkg.sv b/core/include/std_cache_pkg.sv index 7513cc866b..626a2b4d8b 100644 --- a/core/include/std_cache_pkg.sv +++ b/core/include/std_cache_pkg.sv @@ -62,6 +62,11 @@ package std_cache_pkg; } bypass_rsp_t; typedef struct packed { + logic [ariane_pkg::DCACHE_LINE_WIDTH/8-1:0] dirty; + logic valid; + } vldrty_t; + + typedef struct packed { logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag; // tag array logic [ariane_pkg::DCACHE_LINE_WIDTH-1:0] data; // data array logic valid; // state array @@ -72,7 +77,7 @@ package std_cache_pkg; typedef struct packed { logic [(ariane_pkg::DCACHE_TAG_WIDTH+7)/8-1:0] tag; // byte enable into tag array logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array - logic [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array (valid for a pair of dirty/valid bits) + vldrty_t [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array } cl_be_t; // convert one hot to bin for -> needed for cache replacement diff --git a/corev_apu/tb/ariane_tb.sv b/corev_apu/tb/ariane_tb.sv index ce85ecdaf9..b7997e35cb 100644 --- a/corev_apu/tb/ariane_tb.sv +++ b/corev_apu/tb/ariane_tb.sv @@ -19,7 +19,7 @@ import uvm_pkg::*; `include "uvm_macros.svh" -`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)] +`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)] // `define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem.gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)] import "DPI-C" function read_elf(input string filename);