From bf1e0754402f5db445f902f7b3ffc7df42c023c3 Mon Sep 17 00:00:00 2001 From: Nils Wistoff Date: Tue, 25 Jul 2023 20:52:55 +0200 Subject: [PATCH] Add self-invalidation coherence added files required for compilation Added target to test litmus tests Per byte dirty bit added to std dcache and tested basic support for dual core instantiation Some automation added to the multi-core testing process minor changes temporary ci modifications for working without sudo permissions branch prova prova modified Multi core instantiation made generic Transition between WAIT_CRITICAL_WORD and WAIT_TAG removed if there is a flush - feature tested Changed repo with master branch and added masks for reservation at cacheline granularity because burst not supported Fix the never return problem for non boot cores and dt modified for 2 cores Added master branch of common_cells and compilation of new file in Makefile Added transition between FLUSHING and FLUSHING to avoid multiple flushes during atomics Unused code removed and code commented Added support to use the master branch of the axi_riscv_atomics repository Added support for multiple ariane instances for fpga synthesis Increased stack for big applications and reduced number of harts Peripherals configured to use multiple cores List of issues not solved encountered during the master thesis Co-authored-by: msc22h2 Signed-off-by: Nils Wistoff --- core/cache_subsystem/cache_ctrl.sv | 4 +- core/cache_subsystem/miss_handler.sv | 11 +- core/cache_subsystem/std_nbdcache.sv | 30 +++- corev_apu/tb/common/tb_dcache_pkg.sv | 2 +- corev_apu/tb/common/tb_writeport.sv | 16 +- corev_apu/tb/tb_wb_dcache/Makefile | 2 +- corev_apu/tb/tb_wb_dcache/hdl/tb.sv | 223 +++++++++++++++++++++++++++ corev_apu/tb/tb_wb_dcache/tb.list | 11 ++ 8 files changed, 278 insertions(+), 21 deletions(-) diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index 9491141650..c14aeae0f4 100644 --- a/core/cache_subsystem/cache_ctrl.sv +++ 
b/core/cache_subsystem/cache_ctrl.sv @@ -322,8 +322,8 @@ module cache_ctrl data_o.data[cl_offset+:CVA6Cfg.XLEN] = mem_req_q.wdata; data_o.tag = mem_req_d.tag; // ~> change the state - data_o.dirty = 1'b1; - data_o.valid = 1'b1; + data_o.dirty[cl_offset>>3+:CVA6Cfg.XLEN/8] = 1'b1; + data_o.valid = 1'b1; // got a grant ~> this is finished now if (gnt_i) begin diff --git a/core/cache_subsystem/miss_handler.sv b/core/cache_subsystem/miss_handler.sv index 6a3a84c4ae..f00ed8d5a1 100644 --- a/core/cache_subsystem/miss_handler.sv +++ b/core/cache_subsystem/miss_handler.sv @@ -177,7 +177,7 @@ module miss_handler automatic logic [CVA6Cfg.DCACHE_SET_ASSOC-1:0] evict_way, valid_way; for (int unsigned i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin - evict_way[i] = data_i[i].valid & data_i[i].dirty; + evict_way[i] = data_i[i].valid & (|data_i[i].dirty); valid_way[i] = data_i[i].valid; end // ---------------------- @@ -287,10 +287,11 @@ module miss_handler lfsr_enable = 1'b1; evict_way_d = lfsr_oh; // do we need to write back the cache line? - if (data_i[lfsr_bin].dirty) begin + if (|data_i[lfsr_bin].dirty) begin state_d = WB_CACHELINE_MISS; evict_cl_d.tag = data_i[lfsr_bin].tag; evict_cl_d.data = data_i[lfsr_bin].data; + evict_cl_d.dirty = data_i[lfsr_bin].dirty; cnt_d = mshr_q.addr[CVA6Cfg.DCACHE_INDEX_WIDTH-1:0]; // no - we can request a cache line now end else state_d = REQ_CACHELINE; @@ -328,7 +329,7 @@ module miss_handler data_o.tag = mshr_q.addr[CVA6Cfg.DCACHE_TAG_WIDTH+CVA6Cfg.DCACHE_INDEX_WIDTH-1:CVA6Cfg.DCACHE_INDEX_WIDTH]; data_o.data = data_miss_fsm; data_o.valid = 1'b1; - data_o.dirty = 1'b0; + data_o.dirty = '0; // is this a write? 
if (mshr_q.we) begin @@ -338,7 +339,7 @@ module miss_handler if (mshr_q.be[i]) data_o.data[(cl_offset+i*8)+:8] = mshr_q.wdata[i]; end // its immediately dirty if we write - data_o.dirty = 1'b1; + data_o.dirty[cl_offset>>3+:8] = mshr_q.be; end // reset MSHR mshr_d.valid = 1'b0; @@ -359,7 +360,7 @@ module miss_handler cnt_q[CVA6Cfg.DCACHE_INDEX_WIDTH-1:CVA6Cfg.DCACHE_OFFSET_WIDTH], {{CVA6Cfg.DCACHE_OFFSET_WIDTH} {1'b0}} }; - req_fsm_miss_be = '1; + req_fsm_miss_be = evict_cl_q.dirty; req_fsm_miss_we = 1'b1; req_fsm_miss_wdata = evict_cl_q.data; diff --git a/core/cache_subsystem/std_nbdcache.sv b/core/cache_subsystem/std_nbdcache.sv index 1d7c813b59..4f6c123390 100644 --- a/core/cache_subsystem/std_nbdcache.sv +++ b/core/cache_subsystem/std_nbdcache.sv @@ -62,6 +62,10 @@ module std_nbdcache logic [(CVA6Cfg.DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array logic [CVA6Cfg.DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array (valid for a pair of dirty/valid bits) }; + typedef struct packed { + logic [CVA6Cfg.DCACHE_LINE_WIDTH/8-1:0] dirty; + logic valid; + } vldrty_t; // ------------------------------- // Controller <-> Arbiter @@ -107,6 +111,7 @@ module std_nbdcache cache_line_t wdata_ram; cache_line_t [ CVA6Cfg.DCACHE_SET_ASSOC-1:0] rdata_ram; cl_be_t be_ram; + vldrty_t [ CVA6Cfg.DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram; // Busy signals logic miss_handler_busy; @@ -245,19 +250,28 @@ module std_nbdcache // align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals. // note: if you have an SRAM that supports flat bit enables for your target technology, - // you can use it here to save the extra 4x overhead introduced by this workaround. - logic [4*DCACHE_DIRTY_WIDTH-1:0] dirty_wdata, dirty_rdata; + // you can use it here to save the extra 17x overhead introduced by this workaround. 
+ logic [(CVA6Cfg.DCACHE_LINE_WIDTH+8)*CVA6Cfg.DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin - assign dirty_wdata[8*i] = wdata_ram.dirty; - assign dirty_wdata[8*i+1] = wdata_ram.valid; - assign rdata_ram[i].dirty = dirty_rdata[8*i]; - assign rdata_ram[i].valid = dirty_rdata[8*i+1]; + for (genvar j = 0; j < CVA6Cfg.DCACHE_LINE_WIDTH / 8; j++) begin + // dirty bits assignment + assign dirty_wdata[(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j]; + assign rdata_ram[i].dirty[j] = dirty_rdata[(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i+8*j]; + end + // valid bit assignment + assign dirty_wdata[CVA6Cfg.DCACHE_LINE_WIDTH+(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid; + assign rdata_ram[i].valid = dirty_rdata[CVA6Cfg.DCACHE_LINE_WIDTH+(CVA6Cfg.DCACHE_LINE_WIDTH+8)*i]; + end + + // be construction for valid_dirty_sram + for (genvar i = 0; i < CVA6Cfg.DCACHE_SET_ASSOC; i++) begin + assign be_valid_dirty_ram[i*(CVA6Cfg.DCACHE_LINE_WIDTH/8+1)+:(CVA6Cfg.DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(CVA6Cfg.DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}}; end sram #( .USER_WIDTH(1), - .DATA_WIDTH(4 * DCACHE_DIRTY_WIDTH), + .DATA_WIDTH((CVA6Cfg.DCACHE_LINE_WIDTH + 8) * CVA6Cfg.DCACHE_SET_ASSOC), .NUM_WORDS (CVA6Cfg.DCACHE_NUM_WORDS) ) valid_dirty_sram ( .clk_i (clk_i), @@ -267,7 +281,7 @@ module std_nbdcache .addr_i (addr_ram[CVA6Cfg.DCACHE_INDEX_WIDTH-1:CVA6Cfg.DCACHE_OFFSET_WIDTH]), .wuser_i('0), .wdata_i(dirty_wdata), - .be_i (be_ram.vldrty), + .be_i (be_valid_dirty_ram), .ruser_o(), .rdata_o(dirty_rdata) ); diff --git a/corev_apu/tb/common/tb_dcache_pkg.sv b/corev_apu/tb/common/tb_dcache_pkg.sv index 7584d81a86..4aa4ce6c59 100644 --- a/corev_apu/tb/common/tb_dcache_pkg.sv +++ b/corev_apu/tb/common/tb_dcache_pkg.sv @@ -36,7 +36,7 @@ package tb_pkg; parameter ERROR_CNT_STOP_LEVEL = 1; // use 1 for debugging. 0 runs the complete simulation... 
// tb_readport sequences - typedef enum logic [2:0] { RANDOM_SEQ, LINEAR_SEQ, BURST_SEQ, IDLE_SEQ, WRAP_SEQ, SET_SEQ, CONST_SEQ } seq_t; + typedef enum logic [2:0] { RANDOM_SEQ, LINEAR_SEQ, BURST_SEQ, IDLE_SEQ, WRAP_SEQ, SET_SEQ, CONST_SEQ, HALF_SEQ } seq_t; typedef enum logic [1:0] { OTHER, BYPASS, CACHED } port_type_t; diff --git a/corev_apu/tb/common/tb_writeport.sv b/corev_apu/tb/common/tb_writeport.sv index 46cb362875..6951a86480 100644 --- a/corev_apu/tb/common/tb_writeport.sv +++ b/corev_apu/tb/common/tb_writeport.sv @@ -33,6 +33,8 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( input logic rst_ni, // to testbench master + input logic half_i, + input logic [1:0] max_size_i, ref string test_name_i, input logic [6:0] req_rate_i, input seq_t seq_type_i, @@ -66,13 +68,13 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( automatic logic [CVA6Cfg.XLEN/8-1:0] be; automatic logic [1:0] size; - void'(randomize(size) with {size <= $clog2(CVA6Cfg.XLEN/8);}); + void'(randomize(size) with {size >= 2'b00; size <= max_size_i; size <= $clog2(CVA6Cfg.XLEN/8);}); // align to size, set correct byte enables be = '0; unique case(size) - 2'b00: be[paddr[2:0] +: 1] = '1; - 2'b01: be[paddr[2:1]<<1 +: 2] = '1; - 2'b10: be[paddr[2:2]<<2 +: 4] = '1; + 2'b00: be[int'(paddr[2:0]) +: 1] = '1; + 2'b01: be[int'(paddr[2:1]<<1) +: 2] = '1; + 2'b10: be[int'(paddr[2:2]<<2) +: 4] = '1; 2'b11: be = '1; default: ; endcase @@ -112,6 +114,7 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( dut_req_port_o.data_req = 1'b1; // generate random address void'(randomize(paddr) with {paddr >= 0; paddr < (MemWords<<$clog2(CVA6Cfg.XLEN/8));}); + if (seq_type_i == HALF_SEQ) paddr[int'(max_size_i)] = half_i; applyRandData(); `APPL_WAIT_COMB_SIG(clk_i, dut_req_port_i.data_gnt) end @@ -281,6 +284,11 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( $display("%s> start random sequence with %04d vectors and req_rate %03d", PortName, 
seq_num_vect_i, req_rate_i); genRandReq(); end + HALF_SEQ: begin + $display("%s> start half random sequence with %04d vectors and req_rate %03d", PortName, seq_num_vect_i, req_rate_i); + $display("%s> half = %b and max size = %b", PortName, half_i, max_size_i); + genRandReq(); + end LINEAR_SEQ: begin $display("%s> start linear sequence with %04d vectors and req_rate %03d", PortName, seq_num_vect_i, req_rate_i); genSeqWrite(); diff --git a/corev_apu/tb/tb_wb_dcache/Makefile b/corev_apu/tb/tb_wb_dcache/Makefile index 81c6d411cd..de6df0fafe 100755 --- a/corev_apu/tb/tb_wb_dcache/Makefile +++ b/corev_apu/tb/tb_wb_dcache/Makefile @@ -20,7 +20,7 @@ src := $(shell xargs printf '\n%s' < $(src-list) | cut -b 1-) compile_flag += +cover+i_dut -incr -64 -nologo -svinputport=compat -override_timescale 1ns/1ps -suppress 2583 -suppress 13262 -suppress 2986 +cover sim_opts += -64 -coverage -classdebug -voptargs="+acc" questa_version ?= ${QUESTASIM_VERSION} -incdir += ../common/ ../../../vendor/pulp-platform/axi/include/ +incdir += ../common/ ../../../vendor/pulp-platform/axi/include/ ../../../vendor/pulp-platform/common_cells/include/ # Iterate over all include directories and write them with +incdir+ prefixed # +incdir+ works for Verilator and QuestaSim diff --git a/corev_apu/tb/tb_wb_dcache/hdl/tb.sv b/corev_apu/tb/tb_wb_dcache/hdl/tb.sv index 37e992c220..c8522316b0 100644 --- a/corev_apu/tb/tb_wb_dcache/hdl/tb.sv +++ b/corev_apu/tb/tb_wb_dcache/hdl/tb.sv @@ -118,6 +118,8 @@ module tb import ariane_pkg::*; import std_cache_pkg::*; import tb_pkg::*; #( seq_t [2:0] seq_type; logic [3:0] seq_done; logic [6:0] req_rate[2:0]; + logic half; + logic [1:0] max_size; logic seq_run, seq_last; logic end_of_sim; @@ -259,6 +261,37 @@ module tb import ariane_pkg::*; import std_cache_pkg::*; import tb_pkg::*; #( `APPL_WAIT_CYC(clk_i, 1) endtask : flushCache + //integer fd = $fopen("extern_write.txt","w"); + // Write directly the tb memory + function automatic void external_writer(int unsigned 
pos, int unsigned half); + automatic logic[7:0] val; + for (int k=0; k requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 20 -- random write on half memory(MSB) and external writer on the other half -- max size = 64b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b11; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 21 -- random write on half memory(LSB) and external writer on the other half -- max size = 32b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b10; + half = 0; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + 
////////////////////////////////////////////// + test_name = "TEST 22 -- random write on half memory(MSB) and external writer on the other half -- max size = 32b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b10; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 23 -- random write on half memory(LSB) and external writer on the other half -- max size = 16b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b01; + half = 0; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 24 -- random write on half memory(MSB) and external writer on the other half -- max size = 16b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b01; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled 
~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 25 -- random write on half memory(LSB) and external writer on the other half -- max size = 8b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b00; + half = 0; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 26 -- random write on half memory(MSB) and external writer on the other half -- max size = 8b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b00; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,1); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + end_of_sim = 1; $display("TB> end test 
sequences"); tb_mem_port_t::report_mem(); diff --git a/corev_apu/tb/tb_wb_dcache/tb.list b/corev_apu/tb/tb_wb_dcache/tb.list index d08c0adaef..dbda19fedc 100644 --- a/corev_apu/tb/tb_wb_dcache/tb.list +++ b/corev_apu/tb/tb_wb_dcache/tb.list @@ -39,12 +39,23 @@ hdl/cv64a6_config_pkg.sv ../../../vendor/pulp-platform/common_cells/src/stream_demux.sv ../../../core/cache_subsystem/axi_adapter.sv ../../../common/local/util/sram.sv +../../../common/local/util/tc_sram_wrapper.sv +../../src/tech_cells_generic/src/rtl/tc_sram.sv ../../src/axi_riscv_atomics/src/axi_res_tbl.sv ../../src/axi_riscv_atomics/src/axi_riscv_amos.sv ../../src/axi_riscv_atomics/src/axi_riscv_amos_alu.sv ../../src/axi_riscv_atomics/src/axi_riscv_lrsc.sv ../../src/axi_riscv_atomics/src/axi_riscv_atomics.sv ../../src/axi_riscv_atomics/src/axi_riscv_atomics_wrap.sv +../../../common/submodules/common_cells/src/id_queue.sv +../../../common/submodules/common_cells/src/stream_fork.sv +../../../common/submodules/common_cells/src/stream_filter.sv +../../../common/submodules/common_cells/src/fall_through_register.sv +../../../common/submodules/common_cells/src/stream_register.sv +../../../common/submodules/common_cells/src/spill_register_flushable.sv +../../../common/submodules/common_cells/src/spill_register.sv +../../../common/submodules/common_cells/src/onehot_to_bin.sv +../../axi/src/axi_multicut.sv ../common/tb_dcache_pkg.sv ../common/tb_readport.sv