diff --git a/asic/scripts/initialFloorplan.tcl b/asic/scripts/initialFloorplan.tcl index 787552e..f6e6628 100755 --- a/asic/scripts/initialFloorplan.tcl +++ b/asic/scripts/initialFloorplan.tcl @@ -1,5 +1,5 @@ # Floorplan -floorPlan -r 0.4 0.85 10.0 10.0 10.0 10.0 +floorPlan -r 0.33 0.85 10.0 10.0 10.0 10.0 timeDesign -preplace -prefix preplace diff --git a/fpga/scripts/vivado.tcl b/fpga/scripts/vivado.tcl index 8f4355c..7d823e7 100644 --- a/fpga/scripts/vivado.tcl +++ b/fpga/scripts/vivado.tcl @@ -4,9 +4,9 @@ set SCRIPTS_DIR ../scripts source $SCRIPTS_DIR/vivado_config.tcl #Board specific -# source $SCRIPTS_DIR/pynq_z2.tcl +source $SCRIPTS_DIR/pynq_z2.tcl # source $SCRIPTS_DIR/zcu102.tcl -source $SCRIPTS_DIR/zcu104.tcl +# source $SCRIPTS_DIR/zcu104.tcl # CREATE IPs @@ -75,11 +75,11 @@ connect_bd_net [get_bd_pins $PS_CLK] [get_bd_pins dnn_engine_0/aclk] connect_bd_intf_net [get_bd_intf_pins dma_pixels/M_AXIS_MM2S] [get_bd_intf_pins dnn_engine_0/s_axis_pixels] connect_bd_intf_net [get_bd_intf_pins dma_weights/M_AXIS_MM2S] [get_bd_intf_pins dnn_engine_0/s_axis_weights] connect_bd_net [get_bd_pins dnn_engine_0/aresetn] [get_bd_pins axi_smc/aresetn] -connect_bd_net [get_bd_pins dnn_engine_0/done_fill] [get_bd_pins xlconcat_0/In2] -connect_bd_net [get_bd_pins axi_bram_ctrl/bram_addr_a] [get_bd_pins dnn_engine_0/bram_addr_a] -connect_bd_net [get_bd_pins axi_bram_ctrl/bram_rddata_a] [get_bd_pins dnn_engine_0/bram_rddata_a] -connect_bd_net [get_bd_pins axi_bram_ctrl/bram_en_a] [get_bd_pins dnn_engine_0/bram_en_a] -connect_bd_net [get_bd_pins axi_gpio_out/gpio_io_o] [get_bd_pins dnn_engine_0/t_done_proc] +connect_bd_net [get_bd_pins dnn_engine_0/m_done_fill] [get_bd_pins xlconcat_0/In2] +connect_bd_net [get_bd_pins axi_bram_ctrl/bram_addr_a] [get_bd_pins dnn_engine_0/m_ram_addr_a] +connect_bd_net [get_bd_pins axi_bram_ctrl/bram_rddata_a] [get_bd_pins dnn_engine_0/m_ram_rddata_a] +connect_bd_net [get_bd_pins axi_bram_ctrl/bram_en_a] [get_bd_pins dnn_engine_0/m_ram_en_a] 
+connect_bd_net [get_bd_pins axi_gpio_out/gpio_io_o] [get_bd_pins dnn_engine_0/m_t_done_proc] validate_bd_design diff --git a/rtl/dnn_engine.v b/rtl/dnn_engine.v index d3e6895..9fd5b97 100644 --- a/rtl/dnn_engine.v +++ b/rtl/dnn_engine.v @@ -33,11 +33,11 @@ module dnn_engine #( input wire [S_WEIGHTS_WIDTH_LF -1:0] s_axis_weights_tdata, input wire [S_WEIGHTS_WIDTH_LF/8-1:0] s_axis_weights_tkeep, - input wire [(OUT_ADDR_WIDTH+2)-1:0] bram_addr_a, - output wire [ OUT_BITS -1:0] bram_rddata_a, - input wire bram_en_a, - output wire done_fill, - input wire t_done_proc + input wire [(OUT_ADDR_WIDTH+2)-1:0] m_ram_addr_a, + output wire [ OUT_BITS -1:0] m_ram_rddata_a, + input wire m_ram_en_a, + output wire m_done_fill, + input wire m_t_done_proc ); localparam TUSER_WIDTH = `TUSER_WIDTH; @@ -131,11 +131,11 @@ module dnn_engine #( .s_data (out_s_data ), .s_last (out_s_last ), - .bram_addr_a (bram_addr_a ), - .bram_rddata_a(bram_rddata_a ), - .bram_en_a (bram_en_a ), - .done_fill (done_fill ), - .t_done_proc (t_done_proc ) + .m_ram_addr_a (m_ram_addr_a ), + .m_ram_rddata_a (m_ram_rddata_a), + .m_ram_en_a (m_ram_en_a ), + .m_done_fill (m_done_fill ), + .m_t_done_proc (m_t_done_proc ) ); endmodule diff --git a/rtl/out_ram_switch.sv b/rtl/out_ram_switch.sv index 2d49e91..814cbe9 100644 --- a/rtl/out_ram_switch.sv +++ b/rtl/out_ram_switch.sv @@ -15,12 +15,12 @@ module out_ram_switch #( input logic [ROWS -1:0][Y_BITS -1:0] s_data, input logic s_valid, s_last, - input logic [(ADDR_WIDTH+2)-1:0] bram_addr_a, - output logic [ WORD_WIDTH -1:0] bram_rddata_a, - input logic bram_en_a, + input logic [(ADDR_WIDTH+2)-1:0] m_ram_addr_a, + output logic [ WORD_WIDTH -1:0] m_ram_rddata_a, + input logic m_ram_en_a, - output logic done_fill, - input logic t_done_proc + output logic m_done_fill, + input logic m_t_done_proc ); localparam BITS_COLS = $clog2(COLS), BITS_ROWS = $clog2(ROWS); @@ -91,32 +91,32 @@ module out_ram_switch #( // ----- // READ // ----- - // 1. 
fw starts, waits for t_done_fill to toggle - // 2. mod toggles t_done_fill, moving to READ_S, waits for t_done_proc - // 3. fw continues, finishes processing, toggles t_done_proc - // 4. mod senses t_done_proc in READ_S, moves, waits for done_write, toggles t_done_fill - // 5. fw loops to beginning, waits for t_done_fill to toggle + // 1. fw starts, waits for m_t_done_fill to toggle + // 2. mod toggles m_t_done_fill, moving to READ_S, waits for m_t_done_proc + // 3. fw continues, finishes processing, toggles m_t_done_proc + // 4. mod senses m_t_done_proc in READ_S, moves, waits for done_write, toggles m_t_done_fill + // 5. fw loops to beginning, waits for m_t_done_fill to toggle always_comb unique case (state_read) R_IDLE_S : if (done_write [i_read]) state_read_next = R_DONE_FILL_S; R_DONE_FILL_S: state_read_next = R_READ_S; - R_READ_S : if (dp_prev != t_done_proc) state_read_next = R_WAIT_S; + R_READ_S : if (dp_prev != m_t_done_proc) state_read_next = R_WAIT_S; R_WAIT_S : state_read_next = R_SWITCH_S; R_SWITCH_S : state_read_next = R_IDLE_S; endcase - assign ram_r_addr = bram_addr_a[(ADDR_WIDTH+2)-1:2]; - assign bram_rddata_a = WORD_WIDTH'(signed'(ram_dout[i_read])); // pad to 32 - assign done_fill = state_read == R_DONE_FILL_S; // one clock for interrupt + assign ram_r_addr = m_ram_addr_a[(ADDR_WIDTH+2)-1:2]; + assign m_ram_rddata_a = WORD_WIDTH'(signed'(ram_dout[i_read])); // pad to 32 + assign m_done_fill = state_read == R_DONE_FILL_S; // one clock for interrupt // always_ff @(posedge clk) - // if (!rstn) t_done_fill <= 0; - // else if (state_read == R_DONE_FILL_S) t_done_fill <= !t_done_fill; + // if (!rstn) m_t_done_fill <= 0; + // else if (state_read == R_DONE_FILL_S) m_t_done_fill <= !m_t_done_fill; always_ff @(posedge clk) - if (!rstn) dp_prev <= 0; // t_done_proc starts at 0 - else if (state_read_next == R_WAIT_S) dp_prev <= t_done_proc; // sample dp_prev at end of reading + if (!rstn) dp_prev <= 0; // m_t_done_proc starts at 0 + else if (state_read_next
== R_WAIT_S) dp_prev <= m_t_done_proc; // sample dp_prev at end of reading // ----- // PING PONG @@ -140,11 +140,7 @@ module out_ram_switch #( assign ram_addr [i] = (i == i_write && state_write == W_WRITE_S) ? ram_w_addr : ram_r_addr; localparam RAM_ADDR_BITS = $clog2(COLS*ROWS); - ram_output #( - .DEPTH (COLS * ROWS), - .WIDTH (Y_BITS ), - .LATENCY (RAM_LATENCY) - ) RAM ( + ram_output RAM ( .clka (clk), .ena (1'b1), .wea (ram_wen [i] ), diff --git a/test/sv/dnn_engine_tb.sv b/test/sv/dnn_engine_tb.sv index 983e1b8..a4aa4d1 100644 --- a/test/sv/dnn_engine_tb.sv +++ b/test/sv/dnn_engine_tb.sv @@ -5,7 +5,6 @@ module dnn_engine_tb; - localparam DIR_PATH = `DIR_PATH; localparam VALID_PROB = `VALID_PROB, READY_PROB = `READY_PROB; @@ -38,9 +37,9 @@ module dnn_engine_tb; logic [S_WEIGHTS_WIDTH_LF/K_BITS-1:0][K_BITS-1:0] s_axis_weights_tdata; logic [S_WEIGHTS_WIDTH_LF/8-1:0] s_axis_weights_tkeep; - bit bram_en_a, done_fill, t_done_proc; - logic [(OUT_ADDR_WIDTH+2)-1:0] bram_addr_a; - logic [ OUT_BITS -1:0] bram_rddata_a; + bit m_ram_en_a, m_done_fill, m_t_done_proc; + logic [(OUT_ADDR_WIDTH+2)-1:0] m_ram_addr_a; + logic [ OUT_BITS -1:0] m_ram_rddata_a; dnn_engine pipe (.*); @@ -51,12 +50,11 @@ module dnn_engine_tb; DMA_M2S #(S_WEIGHTS_WIDTH_LF, VALID_PROB, 0) source_k (aclk, aresetn, s_axis_weights_tready, s_axis_weights_tvalid, s_axis_weights_tlast, s_axis_weights_tdata, s_axis_weights_tkeep); bit y_done=0, x_done=0, w_done=0; - string w_path, x_path; int w_offset=0, w_bpt=0, x_offset=0, x_bpt=0; import "DPI-C" function void load_x(inout bit x_done, inout int x_offset, x_bpt); import "DPI-C" function void load_w(inout bit w_done, inout int w_offset, w_bpt); - import "DPI-C" function void load_y(inout bit y_done, inout bit t_done_proc, inout bit [31:0] y_sram [ROWS*COLS-1:0]); + import "DPI-C" function void load_y(inout bit y_done, inout bit m_t_done_proc, inout bit [31:0] y_sram [ROWS*COLS-1:0]); import "DPI-C" function void fill_memory(); import "DPI-C" function byte 
get_byte_wx (int addr, int mode); @@ -82,20 +80,20 @@ module dnn_engine_tb; // Y_SRAM int file, y_wpt, dout; initial begin - {bram_addr_a, bram_en_a, t_done_proc} = 0; + {m_ram_addr_a, m_ram_en_a, m_t_done_proc} = 0; wait(aresetn); repeat(2) @(posedge aclk); while (!y_done) begin - wait (done_fill); // callback trigger + wait (m_done_fill); // callback trigger for (int unsigned ir=0; ir < ROWS*COLS; ir++) begin // DPI-C cannot consume time in verilator, so read in advance - bram_addr_a <= ir*(OUT_BITS/8); // 4 byte words - bram_en_a <= 1; + m_ram_addr_a <= ir*(OUT_BITS/8); // 4 byte words + m_ram_en_a <= 1; repeat(2) @(posedge aclk) #1ps; - y_sram[ir] = bram_rddata_a; + y_sram[ir] = m_ram_rddata_a; end - load_y(y_done, t_done_proc, y_sram); + load_y(y_done, m_t_done_proc, y_sram); end end