Skip to content

Commit

Permalink
Update AXI Width to 128
Browse files Browse the repository at this point in the history
  • Loading branch information
Aba committed Dec 4, 2023
1 parent 2920bd2 commit 4fe0d2f
Show file tree
Hide file tree
Showing 11 changed files with 104 additions and 55 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
old/
__pycache__

temp/

run/fpga/*

run/asic/*
Expand Down
10 changes: 6 additions & 4 deletions deepsocflow/c/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ static inline void write_x(int8_t val, int8_t *p_out_buffer, int32_t ib, int32_t
mem.debug_tiled[flat_index] = val;

// Pack bits and store
int32_t flat_index_with_header = p_offset + flat_index_n2r + (ixp+1)*64/X_BITS;
int32_t flat_index_with_header = p_offset + flat_index_n2r + (ixp+1)*(AXI_WIDTH/X_BITS);
int32_t packed_index = flat_index_with_header / X_WORDS_PER_BYTE;
uint8_t packed_position = flat_index_with_header % X_WORDS_PER_BYTE; // 0,1,2,3

Expand Down Expand Up @@ -238,9 +238,11 @@ extern EXT_C void load_y (volatile uint8_t *p_done, uint64_t *p_base_addr_next,
Bundle_t *pb_out = &bundles[ib+1];
for (int ixp=0; ixp < pb_out->p; ixp++) {
int32_t offset_words = (ixp == 0) ? 0 : (pb_out->cm_p0 + (ixp-1)*pb_out->cm)*pb_out->xp_words;
int32_t offset_bytes = offset_words/X_WORDS_PER_BYTE + ixp*8;

*(uint64_t*)&(p_out_buffer[offset_bytes]) = ixp == 0 ? pb_out->x_header_p0 : pb_out->x_header;
int32_t offset_bytes = offset_words/X_WORDS_PER_BYTE + ixp*(AXI_WIDTH/8);
uint64_t *p_header = (uint64_t*)&(p_out_buffer[offset_bytes]);
p_header[0] = ixp == 0 ? pb_out->x_header_p0 : pb_out->x_header;
if (AXI_WIDTH == 128)
p_header[1] = (uint64_t)0;
// debug_printf("--------ib:%d, ixp:%d offset_bytes:%d\n", ib, ixp, offset_bytes);
}
}
Expand Down
28 changes: 28 additions & 0 deletions deepsocflow/py/bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,34 @@ def get_runtime_params(c, w_shape, x_shape, o_shape, core_d, pool_d, flatten):
r = namedtuple('Runtime', params)(**params)
return r

@staticmethod
def predict_performance(hw, r):
    """Estimate clock cycles and memory traffic (in bits) for the parameters in `r`.

    One iteration is costed with `r.CM_0` and the remaining `r.CP - 1`
    iterations are costed with `r.CM`; both use the same formulas.

    Args:
        hw: object exposing X_BITS, K_BITS, ROWS, COLS and KH_MAX.
        r: runtime parameters exposing IT, XN, XL, XW, XH, KH, CM_0, CM,
            CO and CP.

    Returns:
        Tuple `(clocks, mem_bits)`.
    """

    def clocks_for(cm):
        return r.IT*(1 + r.XN*r.XL*r.XW*(1 + cm*r.KH))

    def mem_bits_for(cm):
        # The three terms are scaled by X_BITS, K_BITS and X_BITS respectively
        # (presumably input pixels, weights and output pixels - confirm).
        return (
            hw.X_BITS * (r.IT * r.XN * r.XL * r.XW * cm * (hw.ROWS + hw.KH_MAX-1))
            # Fix: the CM-sized iterations previously used r.CM_0 in this term,
            # which disagreed with the other CM-based terms and with the clocks
            # formula.
            + hw.K_BITS * (r.IT * cm * r.KH * hw.COLS)
            + hw.X_BITS * (r.XN * r.XH * r.XW * r.CO)
        )

    # Note carried over from the original implementation:
    # Accurate mem access (output):
    #   - baseline: next bundle input + padding
    #   - p_add   - write & read
    #   - pooling - write & read
    #   - softmax - write & read

    clocks = clocks_for(r.CM_0) + (r.CP-1)*clocks_for(r.CM)
    mem_bits = mem_bits_for(r.CM_0) + (r.CP-1)*mem_bits_for(r.CM)

    return clocks, mem_bits


@staticmethod
def create_headers(c, r):
Expand Down
25 changes: 20 additions & 5 deletions deepsocflow/py/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def export_inference(self, x, hw):
ch.write(f"#define B_TYPE int{hw.B_BITS}_t\n")
ch.write(f"#define O_TYPE {out_type}\n")
ch.write(f"#define B_WORDS {b_words}\n")
ch.write(f"#define AXI_WIDTH {hw.IN_BITS}\n")
ch.write(f'#define DATA_DIR "../{hw.DATA_DIR}"\n\n')

mask_nums = [(2**hw.X_BITS-1) << (p*hw.X_BITS) for p in range(8//hw.X_BITS)]
Expand All @@ -198,17 +199,20 @@ def export_inference(self, x, hw):
x_bitstring = b''
b_bitstring = b''
x_bitstring_0 = b''

header_padding = b'\x00\x00\x00\x00\x00\x00\x00\x00' if hw.IN_BITS == 128 else b''

for ib, b in enumerate(bundles):
x_bitstring_b = b''
if b.b:
b_bitstring += b.be.astype(type_d['np'][hw.B_BITS]).tobytes()
for ip in range(b.r.CP):
xe = Bundle.pack_words_into_bytes(arr=b.xe[ip].flatten(), bits=hw.X_BITS)
x_bitstring_b += b.r.x_header_be_p[ip!=0].tobytes() + xe.tobytes()
x_bitstring_b += b.r.x_header_be_p[ip!=0].tobytes() + header_padding + xe.tobytes()

for it in range(b.r.IT):
we = Bundle.pack_words_into_bytes(arr=b.we[ip][it].flatten(), bits=hw.K_BITS)
w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + we.tobytes()
w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + header_padding + we.tobytes()
x_bitstring += x_bitstring_b
with open(f"{hw.DATA_DIR}/{ib}_x_sim.bin", 'wb') as f:
f.write(x_bitstring_b)
Expand Down Expand Up @@ -321,7 +325,18 @@ def verify_inference(self, SIM, SIM_PATH):
with open(f'{hw.DATA_DIR}/{ib}_y_packed_sim.bin', 'rb') as f_sim, open(f'{hw.DATA_DIR}/{ib+1}_x_sim.bin', 'rb') as f_exp:
y_packed_sim = np.frombuffer(f_sim.read(), dtype=np.uint8)
y_packed_exp = np.frombuffer(f_exp.read(), dtype=np.uint8)
error = np.sum(np.abs(y_packed_sim-y_packed_exp))
assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n"
diff = y_packed_sim-y_packed_exp
error = np.sum(np.abs(diff))
assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n, diff=\n{diff.tolist()}\n y_packed_sim=\n{y_packed_sim.tolist()} \n y_packed_exp=\n{y_packed_exp.tolist()}\n"

print(f"Bundle {b.idx}, Error: {error}. Passed")
print(f"Bundle {b.idx}, Error: {error}. Passed")

def predict_performance(self):
    """Return the predicted run time: total bundle clocks / (hw.FREQ * 1e6).

    Sums the clock estimate that `Bundle.predict_performance` returns for
    every bundle in `self.bundles`; the memory-bit estimate it also returns
    is not used here.
    """
    total_cycles = sum(
        Bundle.predict_performance(hw=self.hw, r=bundle.r)[0]
        for bundle in self.bundles
    )
    # FREQ is scaled by 1e6 (presumably MHz -> Hz, giving seconds - confirm).
    return total_cycles / (self.hw.FREQ * 1e6)
2 changes: 1 addition & 1 deletion deepsocflow/tcl/fpga/zcu104.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set_property board_part xilinx.com:zcu104:part0:1.1 [current_project]
create_bd_design "design_1"
create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.4 zynq_ultra_ps_e_0
apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0]
set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ $FREQ CONFIG.PSU__USE__M_AXI_GP0 {0}] [get_bd_cells zynq_ultra_ps_e_0]
set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ $FREQ CONFIG.PSU__USE__M_AXI_GP0 {0} CONFIG.PSU__QSPI__PERIPHERAL__ENABLE {0}] [get_bd_cells zynq_ultra_ps_e_0]

set PS_IRQ "zynq_ultra_ps_e_0/pl_ps_irq0"
set PS_M_AXI_LITE "/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD"
Expand Down
14 changes: 8 additions & 6 deletions run/param_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@ def product_dict(**kwargs):

@pytest.mark.parametrize("PARAMS", list(product_dict(
processing_elements = [(8,24) ],
frequency_mhz = [ 250 ],
bits_input = [ 4 ],
bits_weights = [ 4 ],
bits_sum = [ 16 ],
frequency_mhz = [ 100 ],
bits_input = [ 8 ],
bits_weights = [ 8 ],
bits_sum = [ 32 ],
bits_bias = [ 16 ],
max_batch_size = [ 64 ],
max_channels_in = [ 2048 ],
max_kernel_size = [ 13 ],
max_image_size = [ 512 ],
ram_weights_depth = [ 20 ],
ram_edges_depth = [ 288 ],
axi_width = [ 64 ],
axi_width = [ 128 ],
target_cpu_int_bits = [ 32 ],
valid_prob = [ 0.01 ],
ready_prob = [ 0.1 ],
Expand Down Expand Up @@ -78,4 +78,6 @@ def test_dnn_engine(PARAMS):
3. EXPORT FOR INFERENCE
'''
model.export_inference(x=model.random_input, hw=hw)
model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH)
model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH)

print(f"Predicted time on hardware: {1000*model.predict_performance():.5f} ms")
Loading

0 comments on commit 4fe0d2f

Please sign in to comment.