From 367316b164ea1e2ab2fa77ff332b370e43fd9f6c Mon Sep 17 00:00:00 2001 From: Nils Wistoff Date: Tue, 25 Jul 2023 20:52:55 +0200 Subject: [PATCH] Add self-invalidation coherence added files required for compilation Added target to test litmus tests Per byte dirty bit added to std dcache and tested basic support for dual core instantiation Some automation added to the multi-core testing process minor changes temporary ci modifications for working without sudo permissions branch prova prova modified Multi core instantiation made generic Transition between WAIT_CRITICAL_WORD and WAIT_TAG removed if there is a flush - feature tested Changed repo with master branch and added masks for reservation at cacheline granularity because burst not supported Fix the never return problem for non boot cores and dt modified for 2 cores Added master branch of common_cells and compilation of new file in Makefile Added transition between FLUSHING and FLUSHING to avoid multiple flushes during atomics Unused code removed and code commented Added support to use the master branch of the axi_riscv_atomics repository Added support for multiple ariane instances for fpga synthesis Increased stack for big applications and reduced number of harts Peripherals configured to use multiple cores List of issues not solved encountered during the master thesis Co-authored-by: msc22h2 Signed-off-by: Nils Wistoff --- core/cache_subsystem/cache_ctrl.sv | 16 +- core/cache_subsystem/miss_handler.sv | 11 +- core/cache_subsystem/std_nbdcache.sv | 26 +++- core/include/std_cache_pkg.sv | 8 +- corev_apu/tb/common/tb_dcache_pkg.sv | 2 +- corev_apu/tb/common/tb_writeport.sv | 16 +- corev_apu/tb/tb_wb_dcache/Makefile | 2 +- corev_apu/tb/tb_wb_dcache/hdl/tb.sv | 223 +++++++++++++++++++++++++++ corev_apu/tb/tb_wb_dcache/tb.list | 11 ++ 9 files changed, 284 insertions(+), 31 deletions(-) diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index 2cd60c1f5a..9c7cf28ce8 100644 --- 
a/core/cache_subsystem/cache_ctrl.sv +++ b/core/cache_subsystem/cache_ctrl.sv @@ -298,18 +298,18 @@ module cache_ctrl // two memory look-ups on a single-ported SRAM and therefore is non-atomic if (!mshr_index_matches_i) begin // store data, write dirty bit - req_o = hit_way_q; - addr_o = mem_req_q.index; - we_o = 1'b1; + req_o = hit_way_q; + addr_o = mem_req_q.index; + we_o = 1'b1; - be_o.vldrty = hit_way_q; + be_o.vldrty = hit_way_q; // set the correct byte enable - be_o.data[cl_offset>>3+:8] = mem_req_q.be; - data_o.data[cl_offset+:64] = mem_req_q.wdata; + be_o.data[cl_offset>>3+:8] = mem_req_q.be; + data_o.data[cl_offset+:64] = mem_req_q.wdata; // ~> change the state - data_o.dirty = 1'b1; - data_o.valid = 1'b1; + data_o.dirty[cl_offset>>3+:8] = mem_req_q.be; + data_o.valid = 1'b1; // got a grant ~> this is finished now if (gnt_i) begin diff --git a/core/cache_subsystem/miss_handler.sv b/core/cache_subsystem/miss_handler.sv index 9f5491c229..afcd09d528 100644 --- a/core/cache_subsystem/miss_handler.sv +++ b/core/cache_subsystem/miss_handler.sv @@ -151,7 +151,7 @@ module miss_handler automatic logic [DCACHE_SET_ASSOC-1:0] evict_way, valid_way; for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin - evict_way[i] = data_i[i].valid & data_i[i].dirty; + evict_way[i] = data_i[i].valid & (|data_i[i].dirty); valid_way[i] = data_i[i].valid; end // ---------------------- @@ -258,10 +258,11 @@ module miss_handler lfsr_enable = 1'b1; evict_way_d = lfsr_oh; // do we need to write back the cache line? 
- if (data_i[lfsr_bin].dirty) begin + if (|data_i[lfsr_bin].dirty) begin state_d = WB_CACHELINE_MISS; evict_cl_d.tag = data_i[lfsr_bin].tag; evict_cl_d.data = data_i[lfsr_bin].data; + evict_cl_d.dirty = data_i[lfsr_bin].dirty; cnt_d = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0]; // no - we can request a cache line now end else state_d = REQ_CACHELINE; @@ -300,7 +301,7 @@ module miss_handler data_o.tag = mshr_q.addr[DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH-1:DCACHE_INDEX_WIDTH]; data_o.data = data_miss_fsm; data_o.valid = 1'b1; - data_o.dirty = 1'b0; + data_o.dirty = '0; // is this a write? if (mshr_q.we) begin @@ -310,7 +311,7 @@ module miss_handler if (mshr_q.be[i]) data_o.data[(cl_offset+i*8)+:8] = mshr_q.wdata[i]; end // its immediately dirty if we write - data_o.dirty = 1'b1; + data_o.dirty[cl_offset>>3+:8] = mshr_q.be; end // reset MSHR mshr_d.valid = 1'b0; @@ -331,7 +332,7 @@ module miss_handler cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET} {1'b0}} }; - req_fsm_miss_be = '1; + req_fsm_miss_be = evict_cl_q.dirty; req_fsm_miss_we = 1'b1; req_fsm_miss_wdata = evict_cl_q.data; diff --git a/core/cache_subsystem/std_nbdcache.sv b/core/cache_subsystem/std_nbdcache.sv index 4cdee7e4ce..0cc9647c3f 100644 --- a/core/cache_subsystem/std_nbdcache.sv +++ b/core/cache_subsystem/std_nbdcache.sv @@ -91,6 +91,7 @@ module std_nbdcache cache_line_t wdata_ram; cache_line_t [ DCACHE_SET_ASSOC-1:0] rdata_ram; cl_be_t be_ram; + vldrty_t [ DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram; // Busy signals logic miss_handler_busy; @@ -223,19 +224,28 @@ module std_nbdcache // align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals. // note: if you have an SRAM that supports flat bit enables for your target technology, - // you can use it here to save the extra 4x overhead introduced by this workaround. - logic [4*DCACHE_DIRTY_WIDTH-1:0] dirty_wdata, dirty_rdata; + // you can use it here to save the extra 17x overhead introduced by this workaround. 
+ logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin - assign dirty_wdata[8*i] = wdata_ram.dirty; - assign dirty_wdata[8*i+1] = wdata_ram.valid; - assign rdata_ram[i].dirty = dirty_rdata[8*i]; - assign rdata_ram[i].valid = dirty_rdata[8*i+1]; + for (genvar j = 0; j < DCACHE_LINE_WIDTH / 8; j++) begin + // dirty bits assignment + assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j]; + assign rdata_ram[i].dirty[j] = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j]; + end + // valid bit assignment + assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid; + assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i]; + end + + // be construction for valid_dirty_sram + for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin + assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}}; end sram #( .USER_WIDTH(1), - .DATA_WIDTH(4 * DCACHE_DIRTY_WIDTH), + .DATA_WIDTH((DCACHE_LINE_WIDTH + 8) * DCACHE_SET_ASSOC), .NUM_WORDS (DCACHE_NUM_WORDS) ) valid_dirty_sram ( .clk_i (clk_i), @@ -245,7 +255,7 @@ module std_nbdcache .addr_i (addr_ram[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET]), .wuser_i('0), .wdata_i(dirty_wdata), - .be_i (be_ram.vldrty), + .be_i (be_valid_dirty_ram), .ruser_o(), .rdata_o(dirty_rdata) ); diff --git a/core/include/std_cache_pkg.sv b/core/include/std_cache_pkg.sv index ae812c9974..cdd11a6e45 100644 --- a/core/include/std_cache_pkg.sv +++ b/core/include/std_cache_pkg.sv @@ -62,10 +62,10 @@ package std_cache_pkg; } bypass_rsp_t; typedef struct packed { - logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag; // tag array - logic [ariane_pkg::DCACHE_LINE_WIDTH-1:0] data; // data array - logic valid; // state array - logic dirty; // state array + logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag; // tag array + logic [ariane_pkg::DCACHE_LINE_WIDTH-1:0] 
data; // data array + logic valid; // state array + logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] dirty; // state array } cache_line_t; // cache line byte enable diff --git a/corev_apu/tb/common/tb_dcache_pkg.sv b/corev_apu/tb/common/tb_dcache_pkg.sv index 7584d81a86..4aa4ce6c59 100644 --- a/corev_apu/tb/common/tb_dcache_pkg.sv +++ b/corev_apu/tb/common/tb_dcache_pkg.sv @@ -36,7 +36,7 @@ package tb_pkg; parameter ERROR_CNT_STOP_LEVEL = 1; // use 1 for debugging. 0 runs the complete simulation... // tb_readport sequences - typedef enum logic [2:0] { RANDOM_SEQ, LINEAR_SEQ, BURST_SEQ, IDLE_SEQ, WRAP_SEQ, SET_SEQ, CONST_SEQ } seq_t; + typedef enum logic [2:0] { RANDOM_SEQ, LINEAR_SEQ, BURST_SEQ, IDLE_SEQ, WRAP_SEQ, SET_SEQ, CONST_SEQ, HALF_SEQ } seq_t; typedef enum logic [1:0] { OTHER, BYPASS, CACHED } port_type_t; diff --git a/corev_apu/tb/common/tb_writeport.sv b/corev_apu/tb/common/tb_writeport.sv index babadd91c1..fcc48bca8e 100644 --- a/corev_apu/tb/common/tb_writeport.sv +++ b/corev_apu/tb/common/tb_writeport.sv @@ -30,6 +30,8 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( input logic rst_ni, // to testbench master + input logic half_i, + input logic [1:0] max_size_i, ref string test_name_i, input logic [6:0] req_rate_i, input seq_t seq_type_i, @@ -63,13 +65,13 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( automatic logic [7:0] be; automatic logic [1:0] size; - void'(randomize(size)); + void'(randomize(size) with {size >= 2'b00; size <= max_size_i;}); // align to size, set correct byte enables be = '0; unique case(size) - 2'b00: be[paddr[2:0] +: 1] = '1; - 2'b01: be[paddr[2:1]<<1 +: 2] = '1; - 2'b10: be[paddr[2:2]<<2 +: 4] = '1; + 2'b00: be[int'(paddr[2:0]) +: 1] = '1; + 2'b01: be[int'(paddr[2:1]<<1) +: 2] = '1; + 2'b10: be[int'(paddr[2:2]<<2) +: 4] = '1; 2'b11: be = '1; default: ; endcase @@ -109,6 +111,7 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( dut_req_port_o.data_req = 1'b1; // generate 
random address void'(randomize(paddr) with {paddr >= 0; paddr < (MemWords<<3);}); + if (seq_type_i == HALF_SEQ) paddr[int'(max_size_i)] = half_i; applyRandData(); `APPL_WAIT_COMB_SIG(clk_i, dut_req_port_i.data_gnt) end @@ -278,6 +281,11 @@ program tb_writeport import tb_pkg::*; import ariane_pkg::*; #( $display("%s> start random sequence with %04d vectors and req_rate %03d", PortName, seq_num_vect_i, req_rate_i); genRandReq(); end + HALF_SEQ: begin + $display("%s> start half random sequence with %04d vectors and req_rate %03d", PortName, seq_num_vect_i, req_rate_i); + $display("%s> half = %b and max size = %b", PortName, half_i, max_size_i); + genRandReq(); + end LINEAR_SEQ: begin $display("%s> start linear sequence with %04d vectors and req_rate %03d", PortName, seq_num_vect_i, req_rate_i); genSeqWrite(); diff --git a/corev_apu/tb/tb_wb_dcache/Makefile b/corev_apu/tb/tb_wb_dcache/Makefile index 5877d27e61..affee866b3 100755 --- a/corev_apu/tb/tb_wb_dcache/Makefile +++ b/corev_apu/tb/tb_wb_dcache/Makefile @@ -20,7 +20,7 @@ src := $(shell xargs printf '\n%s' < $(src-list) | cut -b 1-) compile_flag += +cover+i_dut -incr -64 -nologo -svinputport=compat -override_timescale 1ns/1ps -suppress 2583 -suppress 13262 +cover sim_opts += -64 -coverage -classdebug -voptargs="+acc" questa_version ?= ${QUESTASIM_VERSION} -incdir += ../common/ ../../axi/include/ +incdir += ../common/ ../../axi/include/ ../../../common/submodules/common_cells/include/ # Iterate over all include directories and write them with +incdir+ prefixed # +incdir+ works for Verilator and QuestaSim diff --git a/corev_apu/tb/tb_wb_dcache/hdl/tb.sv b/corev_apu/tb/tb_wb_dcache/hdl/tb.sv index 0b4550fec5..c09c6f9dd7 100644 --- a/corev_apu/tb/tb_wb_dcache/hdl/tb.sv +++ b/corev_apu/tb/tb_wb_dcache/hdl/tb.sv @@ -94,6 +94,8 @@ module tb import ariane_pkg::*; import std_cache_pkg::*; import tb_pkg::*; #()() seq_t [2:0] seq_type; logic [3:0] seq_done; logic [6:0] req_rate[2:0]; + logic half; + logic [1:0] max_size; 
logic seq_run, seq_last; logic end_of_sim; @@ -234,6 +236,37 @@ module tb import ariane_pkg::*; import std_cache_pkg::*; import tb_pkg::*; #()() `APPL_WAIT_CYC(clk_i, 1) endtask : flushCache + //integer fd = $fopen("extern_write.txt","w"); + // Write directly the tb memory + function automatic void external_writer(int unsigned pos, int unsigned half); + automatic logic[7:0] val; + for (int k=0; k requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 20 -- random write on half memory(MSB) and external writer on the other half -- max size = 64b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b11; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 21 -- random write on half memory(LSB) and external writer on the other half -- max size = 32b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b10; + half = 0; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use 
cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 22 -- random write on half memory(MSB) and external writer on the other half -- max size = 32b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b10; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 23 -- random write on half memory(LSB) and external writer on the other half -- max size = 16b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b01; + half = 0; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 24 -- random write on half memory(MSB) and external writer on the 
other half -- max size = 16b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b01; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 25 -- random write on half memory(LSB) and external writer on the other half -- max size = 8b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b00; + half = 0; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,0); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + test_name = "TEST 26 -- random write on half memory(MSB) and external writer on the other half -- max size = 8b -- enabled cache + tlb, mem contentions + invalidations"; + + // Config + enable_i = 1; + tlb_rand_en = 1; + mem_rand_en = 1; + inv_rand_en = 1; + max_size = 2'b00; + half = 1; + seq_type = '{HALF_SEQ, RANDOM_SEQ, RANDOM_SEQ}; + req_rate = '{default:50}; + + // cache enabled ~> requests to cached region should use cache port, + // those to uncached regions should use bypass port + 
bypass_mem_port.set_region(0, CachedAddrBeg - 1); + data_mem_port.set_region(CachedAddrBeg, MemBytes - 1); + + runSeq(0,nWriteVectors,1); + external_writer(int'(max_size),int'(!half)); + flushCache(); + tb_mem_port_t::check_mem(); + + ////////////////////////////////////////////// + end_of_sim = 1; $display("TB> end test sequences"); tb_mem_port_t::report_mem(); diff --git a/corev_apu/tb/tb_wb_dcache/tb.list b/corev_apu/tb/tb_wb_dcache/tb.list index 05180e4ee1..7d46ce5028 100644 --- a/corev_apu/tb/tb_wb_dcache/tb.list +++ b/corev_apu/tb/tb_wb_dcache/tb.list @@ -35,12 +35,23 @@ ../../../vendor/pulp-platform/common_cells/src/stream_demux.sv ../../../core/axi_adapter.sv ../../../common/local/util/sram.sv +../../../common/local/util/tc_sram_wrapper.sv +../../src/tech_cells_generic/src/rtl/tc_sram.sv ../../src/axi_riscv_atomics/src/axi_res_tbl.sv ../../src/axi_riscv_atomics/src/axi_riscv_amos.sv ../../src/axi_riscv_atomics/src/axi_riscv_amos_alu.sv ../../src/axi_riscv_atomics/src/axi_riscv_lrsc.sv ../../src/axi_riscv_atomics/src/axi_riscv_atomics.sv ../../src/axi_riscv_atomics/src/axi_riscv_atomics_wrap.sv +../../../common/submodules/common_cells/src/id_queue.sv +../../../common/submodules/common_cells/src/stream_fork.sv +../../../common/submodules/common_cells/src/stream_filter.sv +../../../common/submodules/common_cells/src/fall_through_register.sv +../../../common/submodules/common_cells/src/stream_register.sv +../../../common/submodules/common_cells/src/spill_register_flushable.sv +../../../common/submodules/common_cells/src/spill_register.sv +../../../common/submodules/common_cells/src/onehot_to_bin.sv +../../axi/src/axi_multicut.sv ../common/tb_dcache_pkg.sv ../common/tb_readport.sv