Update AXI Width to 128

KastnerRG · Dec 4, 2023 · 4fe0d2f · 4fe0d2f
1 parent 2920bd2
commit 4fe0d2f
Show file tree

Hide file tree

Showing 11 changed files with 104 additions and 55 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
 old/
 __pycache__
 
+temp/
+
 run/fpga/*
 
 run/asic/*

diff --git a/deepsocflow/c/runtime.h b/deepsocflow/c/runtime.h
@@ -103,7 +103,7 @@ static inline void write_x(int8_t val, int8_t *p_out_buffer, int32_t ib, int32_t
   mem.debug_tiled[flat_index] = val;
 
   // Pack bits and store
-  int32_t flat_index_with_header = p_offset + flat_index_n2r + (ixp+1)*64/X_BITS;
+  int32_t flat_index_with_header = p_offset + flat_index_n2r + (ixp+1)*(AXI_WIDTH/X_BITS);
   int32_t packed_index           = flat_index_with_header / X_WORDS_PER_BYTE;
   uint8_t packed_position        = flat_index_with_header % X_WORDS_PER_BYTE; // 0,1,2,3
 
@@ -238,9 +238,11 @@ extern EXT_C void load_y (volatile uint8_t *p_done, uint64_t *p_base_addr_next,
       Bundle_t *pb_out = &bundles[ib+1];
       for (int ixp=0; ixp < pb_out->p; ixp++) {
         int32_t offset_words   = (ixp == 0) ? 0 : (pb_out->cm_p0 + (ixp-1)*pb_out->cm)*pb_out->xp_words;
-        int32_t offset_bytes   = offset_words/X_WORDS_PER_BYTE + ixp*8;
-
-        *(uint64_t*)&(p_out_buffer[offset_bytes])     = ixp == 0 ? pb_out->x_header_p0 : pb_out->x_header;
+        int32_t offset_bytes   = offset_words/X_WORDS_PER_BYTE + ixp*(AXI_WIDTH/8);
+        uint64_t *p_header = (uint64_t*)&(p_out_buffer[offset_bytes]);
+        p_header[0] = ixp == 0 ? pb_out->x_header_p0 : pb_out->x_header;
+        if (AXI_WIDTH == 128)
+          p_header[1] = (uint64_t)0;
         // debug_printf("--------ib:%d, ixp:%d offset_bytes:%d\n", ib, ixp, offset_bytes);
       }
     }

diff --git a/deepsocflow/py/bundle.py b/deepsocflow/py/bundle.py
@@ -645,6 +645,34 @@ def get_runtime_params(c, w_shape, x_shape, o_shape, core_d, pool_d, flatten):
         r = namedtuple('Runtime', params)(**params)
         return r
 
+    @staticmethod
+    def predict_performance(hw, r):
+        '''
+        Estimate the clock cycles and memory traffic (in bits) for one bundle.
+
+        Two sets of per-iteration figures are computed: a "_p0" set that uses
+        r.CM_0 and a "_p" set that uses r.CM. The "_p0" figures are counted
+        once and the "_p" figures (r.CP - 1) times.
+
+        Args:
+            hw: object providing X_BITS, K_BITS, ROWS, COLS and KH_MAX.
+            r : object providing IT, XN, XL, XW, XH, KH, CM_0, CM, CO and CP.
+
+        Returns:
+            Tuple (clocks, mem_bits) summed over all r.CP iterations.
+        '''
+
+        def clocks_for(cm):
+            return r.IT*(1 + r.XN*r.XL*r.XW*(1 + cm*r.KH))
+
+        def mem_bits_for(cm):
+            # Three terms, presumably input pixels, weights and outputs
+            # (NOTE(review): inferred from the X_BITS / K_BITS factors - confirm).
+            # The weight term uses the same channel count `cm` as the input
+            # term; the previous code used r.CM_0 here even for the "_p" case.
+            return (
+                hw.X_BITS * (r.IT * r.XN * r.XL * r.XW * cm * (hw.ROWS + hw.KH_MAX-1)) +
+                hw.K_BITS * (r.IT * cm   * r.KH * hw.COLS) +
+                hw.X_BITS * (r.XN * r.XH * r.XW * r.CO)
+            )
+
+        # Accurate mem access (output), not modelled yet:
+        #     - baseline: next bundle input + padding
+        #     - p_add   - write & read
+        #     - pooling - write & read
+        #     - softmax - write & read
+
+        clocks    = clocks_for(r.CM_0)   + (r.CP-1)*clocks_for(r.CM)
+        mem_bits  = mem_bits_for(r.CM_0) + (r.CP-1)*mem_bits_for(r.CM)
+
+        return clocks, mem_bits
+
 
     @staticmethod
     def create_headers(c, r):

diff --git a/deepsocflow/py/model.py b/deepsocflow/py/model.py
@@ -185,6 +185,7 @@ def export_inference(self, x, hw):
             ch.write(f"#define B_TYPE      int{hw.B_BITS}_t\n")
             ch.write(f"#define O_TYPE      {out_type}\n")
             ch.write(f"#define B_WORDS     {b_words}\n")
+            ch.write(f"#define AXI_WIDTH   {hw.IN_BITS}\n")
             ch.write(f'#define DATA_DIR   "../{hw.DATA_DIR}"\n\n')
 
             mask_nums = [(2**hw.X_BITS-1) << (p*hw.X_BITS)  for p in range(8//hw.X_BITS)]
@@ -198,17 +199,20 @@ def export_inference(self, x, hw):
         x_bitstring = b''
         b_bitstring = b''
         x_bitstring_0 = b''
+
+        header_padding = b'\x00\x00\x00\x00\x00\x00\x00\x00' if hw.IN_BITS == 128 else b''
+
         for ib, b in enumerate(bundles):
             x_bitstring_b = b''
             if b.b:
                 b_bitstring += b.be.astype(type_d['np'][hw.B_BITS]).tobytes()
             for ip in range(b.r.CP):
                 xe = Bundle.pack_words_into_bytes(arr=b.xe[ip].flatten(), bits=hw.X_BITS)
-                x_bitstring_b += b.r.x_header_be_p[ip!=0].tobytes() + xe.tobytes()
+                x_bitstring_b += b.r.x_header_be_p[ip!=0].tobytes() + header_padding + xe.tobytes()
 
                 for it in range(b.r.IT):
                     we = Bundle.pack_words_into_bytes(arr=b.we[ip][it].flatten(), bits=hw.K_BITS)
-                    w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + we.tobytes()
+                    w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + header_padding + we.tobytes()
             x_bitstring += x_bitstring_b
             with open(f"{hw.DATA_DIR}/{ib}_x_sim.bin", 'wb') as f: 
                 f.write(x_bitstring_b)
@@ -321,7 +325,18 @@ def verify_inference(self, SIM, SIM_PATH):
                 with open(f'{hw.DATA_DIR}/{ib}_y_packed_sim.bin', 'rb') as f_sim, open(f'{hw.DATA_DIR}/{ib+1}_x_sim.bin', 'rb') as f_exp:
                     y_packed_sim = np.frombuffer(f_sim.read(), dtype=np.uint8)
                     y_packed_exp = np.frombuffer(f_exp.read(), dtype=np.uint8)
-                error = np.sum(np.abs(y_packed_sim-y_packed_exp))
-                assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n"
+                diff  = y_packed_sim-y_packed_exp
+                error = np.sum(np.abs(diff))
+                assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n, diff=\n{diff.tolist()}\n  y_packed_sim=\n{y_packed_sim.tolist()} \n y_packed_exp=\n{y_packed_exp.tolist()}\n"
 
-            print(f"Bundle {b.idx}, Error: {error}. Passed")
+            print(f"Bundle {b.idx}, Error: {error}. Passed")
+
+    def predict_performance(self):
+        '''
+        Predict the run time of the whole model on the hardware.
+
+        Adds up the clock estimate that Bundle.predict_performance returns
+        for every bundle in self.bundles (the memory-bits estimate is
+        ignored) and divides by self.hw.FREQ * 1e6.
+
+        Returns:
+            Predicted time; in seconds if hw.FREQ is in MHz
+            (NOTE(review): unit of FREQ is not shown here - confirm).
+        '''
+        total_clocks = sum(
+            Bundle.predict_performance(hw=self.hw, r=bundle.r)[0]
+            for bundle in self.bundles
+        )
+        return total_clocks / (self.hw.FREQ * 1e6)
diff --git a/deepsocflow/tcl/fpga/zcu104.tcl b/deepsocflow/tcl/fpga/zcu104.tcl
@@ -6,7 +6,7 @@ set_property board_part xilinx.com:zcu104:part0:1.1 [current_project]
 create_bd_design "design_1"
 create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.4 zynq_ultra_ps_e_0
 apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" }  [get_bd_cells zynq_ultra_ps_e_0]
-set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ $FREQ CONFIG.PSU__USE__M_AXI_GP0 {0}] [get_bd_cells zynq_ultra_ps_e_0]
+set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ $FREQ CONFIG.PSU__USE__M_AXI_GP0 {0} CONFIG.PSU__QSPI__PERIPHERAL__ENABLE {0}] [get_bd_cells zynq_ultra_ps_e_0]
 
 set PS_IRQ        "zynq_ultra_ps_e_0/pl_ps_irq0"
 set PS_M_AXI_LITE "/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD"

diff --git a/run/param_test.py b/run/param_test.py
@@ -14,18 +14,18 @@ def product_dict(**kwargs):
 
 @pytest.mark.parametrize("PARAMS", list(product_dict(
                                         processing_elements  = [(8,24)   ],
-                                        frequency_mhz        = [ 250     ],
-                                        bits_input           = [ 4       ],
-                                        bits_weights         = [ 4       ],
-                                        bits_sum             = [ 16      ],
+                                        frequency_mhz        = [ 100     ],
+                                        bits_input           = [ 8       ],
+                                        bits_weights         = [ 8       ],
+                                        bits_sum             = [ 32      ],
                                         bits_bias            = [ 16      ],
                                         max_batch_size       = [ 64      ], 
                                         max_channels_in      = [ 2048    ],
                                         max_kernel_size      = [ 13      ],
                                         max_image_size       = [ 512     ],
                                         ram_weights_depth    = [ 20      ],
                                         ram_edges_depth      = [ 288     ],
-                                        axi_width            = [ 64      ],
+                                        axi_width            = [ 128     ],
                                         target_cpu_int_bits  = [ 32      ],
                                         valid_prob           = [ 0.01    ],
                                         ready_prob           = [ 0.1     ],
@@ -78,4 +78,6 @@ def test_dnn_engine(PARAMS):
     3. EXPORT FOR INFERENCE
     '''
     model.export_inference(x=model.random_input, hw=hw)
-    model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH)
+    model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH)
+
+    print(f"Predicted time on hardware: {1000*model.predict_performance():.5f} ms")