diff --git a/README.md b/README.md index d59a324..a2016c5 100644 --- a/README.md +++ b/README.md @@ -15,74 +15,86 @@ We present a highly flexible, high performance accelerator system that can be ad ![System](docs/overall.png) -## User API (WIP) +## User API ```py -from deepsocflow import Hardware, Bundle, QInput, BundleModel, QConvCore, QDenseCore, QAdd, QPool, Softmax, QLeakyReLu +from deepsocflow import Bundle, Hardware, QModel, QInput ''' 0. Specify Hardware ''' -hw = Hardware ( - processing_elements = (8, 96), - frequency = 1000, - bits_input = 8, - bits_weights = 4, - bits_sum = 24, - bits_bias = 16, - max_kernel_size = (13, 13), - max_channels_in = 512, - max_channels_out = 512, - max_image_size = (32,32), +hw = Hardware ( # Alternatively: hw = Hardware.from_json('hardware.json') + processing_elements = (8, 96) , # (rows, columns) of multiply-add units + frequency_mhz = 250 , # + bits_input = 4 , # bit width of input pixels and activations + bits_weights = 4 , # bit width of weights + bits_sum = 16 , # bit width of accumulator + bits_bias = 16 , # bit width of bias + max_batch_size = 64 , # + max_channels_in = 2048 , # + max_kernel_size = 13 , # + max_image_size = 512 , # + ram_weights_depth = 20 , # + ram_edges_depth = 288 , # + axi_width = 64 , # + target_cpu_int_bits = 32 , # + valid_prob = 0.1 , # probability with which the AXI-Stream s_valid signal is toggled in simulation + ready_prob = 0.1 , # probability with which the AXI-Stream m_ready signal is toggled in simulation + data_dir = 'vectors', # directory to store generated test vectors ) -hw.export() # Generates: config_hw.svh, config_hw.tcl, config_hw.json -# Alternatively: hw = Hardware.from_json('config_hw.json') +hw.export() # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json +hw.export_vivado_tcl(board='zcu104') + ''' 1. 
Build Model ''' -x = QInput( input_shape= (8,32,32,3), hw= hw, input_frac_bits= 4) - -x = Bundle( core= QConvCore(filters= 32, kernel_size= (7,7), strides= (2,2), padding= 'same', weights_frac_bits= 4, bias_frac_bits= 8, activation= QLeakyReLu(negative_slope=0.125, frac_bits= 4 )), - pool= QPool(type= 'max', size= (3,3), strides= (1,1), padding= 'same', frac_bits= 4) - )(x) -x_skip = x -x = Bundle( core= QConvCore(filters= 64, kernel_size= (3,3), weights_frac_bits= 4, bias_frac_bits= 8, activation= QLeakyReLu(negative_slope=0, frac_bits= 4)), - pool= QAdd(x_skip), # Residual addition - flatten= True, - )(x) -x = Bundle( dense= QDenseCore(outputs= 10, weights_frac_bits= 4, bias_frac_bits= 8, activation= Softmax()), - )(x) -model = BundleModel(inputs=x_in, outputs=x) -# Alternatively: model = BundleModel.from_json('config_model.json') +XN = 1 +input_shape = (XN,18,18,3) # (XN, XH, XW, CI) + +QINT_BITS = 0 +kq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)' +bq = f'quantized_bits({hw.B_BITS},{QINT_BITS},False,True,1)' +q1 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)' +q2 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,False,1)' +q3 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,True,1)' +q4 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)' + +x = x_in = QInput(shape=input_shape[1:], batch_size=XN, hw=hw, int_bits=QINT_BITS, name='input') + +x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) +x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q2}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':q3}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1},)(x) +x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, flatten= True)(x) +x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4})(x) + +model = QModel(inputs=x_in.raw, outputs=x) +model.compile() +model.summary() ''' 2. TRAIN (using qkeras) ''' -model.compile(...) -model.fit(...) -model.export() # Generates: savedmodel, config_model.json +# model.fit(...) + ''' 3. 
EXPORT FOR INFERENCE - -- Runs forward pass in float32, records intermediate tensors -- Runs forward pass in integer, comparing with float32 pass for zero error -- Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation -- Prints performance estimate (time, latency) -- Generates - - config_firmware.h - - weights.bin - - expected.bin ''' -model.export_inference(x=model.random_input) # -> config_firmware.h, weights.bin +SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado +# SIM, SIM_PATH = 'verilator', "" # For Verilator + +model.export_inference(x=model.random_input, hw=hw) # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin +model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation ''' 4. IMPLEMENTATION -a. FPGA: Run vivado.tcl +a. FPGA: Open vivado, source vivado_flow.tcl b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl -c. Compile C firmware with generated header (model.h) and run on device +c. Compile C firmware with generated header (config_fw.h) and run on device ''' ``` diff --git a/deepsocflow/py/__init__.py b/deepsocflow/py/__init__.py index 77079a9..3ce19c1 100644 --- a/deepsocflow/py/__init__.py +++ b/deepsocflow/py/__init__.py @@ -1,3 +1,5 @@ from . import hardware, bundle from .hardware import * -from .bundle import * \ No newline at end of file +from .bundle import * +from .model import * +from .layers import * \ No newline at end of file diff --git a/deepsocflow/py/layers.py b/deepsocflow/py/layers.py index eb65747..7c8db44 100644 --- a/deepsocflow/py/layers.py +++ b/deepsocflow/py/layers.py @@ -1,20 +1,13 @@ from qkeras import QActivation -from tensorflow.keras.layers import Input, Flatten, Add, MaxPooling2D +from tensorflow.keras.layers import Layer, Input, Flatten, Add, MaxPooling2D import numpy as np -# class QInput(Input): -# def __init__(self, shape, batch_size, hw, frac_bits, name=None): - -# self.hw = hw -# self.input_frac_bits = input_frac_bits -# super().__init__(shape=shape, name=name) - -# int_bits = hw.X_BITS - self.frac_bits + 1 - -# x = Input(shape=shape, batch_size=batch_size, name=name) -# x = QActivation(f'quantized_bits(bits={hw.X_BITS}, integer={int_bits}, False,True,1)')(x) - -# return x +def QInput(shape, batch_size, hw, int_bits, name=None): + x_raw = Input(shape=shape, batch_size=batch_size, name=name) + x = QActivation(f'quantized_bits({hw.X_BITS},{int_bits},False,True,1)')(x_raw) + x.raw = x_raw + x.hw = hw + return x diff --git a/deepsocflow/py/model.py b/deepsocflow/py/model.py index 68d93b4..818d968 100644 --- a/deepsocflow/py/model.py +++ b/deepsocflow/py/model.py @@ -1,6 +1,306 @@ from qkeras import Model +import numpy as np +import tensorflow.keras +import os +from deepsocflow.py.bundle import Bundle class QModel(Model): def __init__(self, inputs, outputs, name=None): - super().__init__(inputs, outputs, name=name) \ No newline at end of file + super().__init__(inputs, outputs, name=name) + + + @property + def random_input(self): + tensorflow.keras.utils.set_random_seed(0) + return np.clip(np.random.randn(*self.input.shape), -1.0, 1.0) + + + def export_inference(self, x, hw): + + type_d = { 'np': {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} } + + y = self(x, training=False) + self.hw = hw + + inp_act_model = Model(inputs=self.input, outputs=self.layers[1].output) + 
inp_tensor = inp_act_model(x, training=False) + + inp = { + 'bits':hw.X_BITS, + 'frac':hw.X_BITS-1 - self.layers[1].quantizer.integer, + 'tensor':inp_tensor, + 'int':inp_tensor.numpy() * 2**(hw.X_BITS-1) + } + + bundles = self.layers[2:] + self.bundles = bundles + + ''' + Export + ''' + + ''' Clean the data directory''' + os.makedirs(hw.DATA_DIR, exist_ok=True) + for file in os.scandir(hw.DATA_DIR): + os.remove(file.path) + + buffer_map = [] + for ib, b in enumerate(bundles): + print(f'-----------------{b.idx}-----------------------') + b.process(inp if b.idx==0 else None, hw) + b.export(hw, False) #ib==len(bundles)-1 + + ''' + Buffer allocation for add bundle + ''' + print(f'input_map:{buffer_map}') + + '''Find and assign a free buffer. If not, add new buffer''' + b.add_out_buffer_idx = -1 + if len(b.add_tensor_dest) != 0: + for im in range(len(buffer_map)): + if buffer_map[im] is None: + buffer_map[im] = {'in':ib, 'out':b.add_tensor_dest} + b.add_out_buffer_idx = im + break + else: #m if break is not hit + b.add_out_buffer_idx = len(buffer_map) + buffer_map += [{'in':ib, 'out':b.add_tensor_dest}] + + print('add_out_buffer_idx:', b.add_out_buffer_idx) + + '''Free the buffers whose last destination is current bundle''' + for im in range(len(buffer_map)): + buf = buffer_map[im] + if buf is not None: + if buf['out'][-1] == ib: + buffer_map[im] = None + + print(f'output_map:{buffer_map}') + + + ''' + Write Runtime Headers + ''' + x_bytes_all = x_bytes = w_bytes = b_words = x_bytes_max = nhwc_words_max = o_bytes_max = o_words_max = 0 + out_buffer_idx = 1 + with open (f'./config_fw.h', 'w') as ch: + + ch.write(f"#define N_BUNDLES {len(bundles)}\n") + ch.write(f"Bundle_t bundles [N_BUNDLES] = {{\n") + + for ib, b in enumerate(bundles): + w_bpt = (hw.K_BITS*b.we[-1][0].size + hw.IN_BITS)//8 + w_bpt_p0 = (hw.K_BITS*b.we[0][0].size + hw.IN_BITS )//8 + x_bpt = (hw.X_BITS*b.xe[-1].size + hw.IN_BITS )//8 + x_bpt_p0 = (hw.X_BITS*b.xe[0].size + hw.IN_BITS )//8 + + if ib == len(bundles)-1: + o_words_b = b.o_int.size + o_bytes_b = o_words_b*4 # int or float + o_words = o_words_b + else: + b_next = bundles[ib+1] + o_wpt = b_next.xe[-1].size + o_wpt_p0 = b_next.xe[0].size + o_words_b = o_wpt_p0 + (b_next.r.CP-1)*o_wpt + + o_bpt = (hw.X_BITS*b_next.xe[-1].size + hw.IN_BITS)//8 + o_bpt_p0 = (hw.X_BITS*b_next.xe[0].size + hw.IN_BITS)//8 + o_bytes_b = o_bpt_p0 + (b_next.r.CP-1)*o_bpt + + xp_words = b.r.XN * b.r.XL * b.r.XW * (hw.ROWS+hw.X_PAD) + + w_bytes_b = (w_bpt_p0 + (b.r.CP-1)*w_bpt)*b.r.IT + x_bytes_b = (x_bpt_p0 + (b.r.CP-1)*x_bpt) + nhwc_words_b = b.r.XN * b.r.XH * b.r.XW * b.r.CO + + x_bytes_max = max(x_bytes_max, x_bytes_b) + nhwc_words_max = max(nhwc_words_max, nhwc_words_b) + o_bytes_max = max(o_bytes_max, o_bytes_b) + o_words_max = max(o_words_max, o_words_b) + w_bytes += w_bytes_b + x_bytes_all += x_bytes_b + + if ib == 0: + x_bytes = (x_bpt_p0 + (b.r.CP-1)*x_bpt) + + y_coe = b.r.CO_PRL + y_coe_tl = b.r.CO_PRL if (b.r.CO==b.r.IT*b.r.CO_PRL) else b.r.CO%b.r.IT + y_r_ll = hw.ROWS if b.r.XH==b.r.XL*hw.ROWS else b.r.XH % hw.ROWS + + ca_nzero, ca_shift, ca_pl_scale = b.core['act']['non_zero'], b.core['act']['shift_bits'], b.core['act']['plog_slope'] + + add_act_shift = b.add['act']['shift_bits'] if b.add is not None else 0 + add_out_buffer_idx = b.add_out_buffer_idx + add_in_buffer_idx = b.add['bundle'].add_out_buffer_idx if b.add is not None else -1 + + if b.pool is None: + pool_type = 'POOL_NONE' + elif b.pool['type'] == 'max': + pool_type = 'POOL_MAX' + elif b.pool['type'] == 'avg': + pool_type = 
'POOL_AVG' + pool_act_shift = b.pool['act']['shift_bits'] if b.pool is not None else 0 + + out_buffer_idx = 1*(not out_buffer_idx) if ib != len(bundles)-1 else -1 # alternate between 0 and 1 + + ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<4}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, ") + ch.write( f".w_bpt={w_bpt:<5}, .w_bpt_p0={w_bpt_p0:<5}, .x_bpt={x_bpt:<5}, .x_bpt_p0={x_bpt_p0:<5}, .o_words={o_words_b:<5}, .o_bytes={o_bytes_b:<5}, ") + ch.write( f".out_buffer_idx={out_buffer_idx:<2}, .add_out_buffer_idx={add_out_buffer_idx:<2}, .add_in_buffer_idx={add_in_buffer_idx:<2}, ") + ch.write( f".is_bias={1*(b.b is not None):<3}, .is_flatten={1*b.flatten:<3}, ") + ch.write( f".b_offset={b_words:<3}, .b_val_shift={b.bias_val_shift:<3}, .b_bias_shift={b.bias_b_shift:<3}, ") + ch.write( f".ca_nzero={ca_nzero:<3}, .ca_shift={ca_shift:<3}, .ca_pl_scale={ca_pl_scale:<3}, .add_act_shift={add_act_shift:<3}, .pool_act_shift={pool_act_shift:<3}, ") + ch.write( f".csh={b.r.CSH:<3}, .ch={b.r.CYH:<3}, .csh_shift={b.r.CSH_SHIFT:<3}, .pkh={b.r.PKH:<3}, .psh={b.r.PSH:<3}, .ph={b.r.PYH:<3}, .psh_shift={b.r.PSH_SHIFT:<3}, .csw={b.r.CSW:<3}, .cw={b.r.CYW:<3}, .csw_shift={b.r.CSW_SHIFT:<3}, .pkw={b.r.PKW:<3}, .psw={b.r.PSW:<3}, .pw={b.r.PYW:<3}, .psw_shift={b.r.PSW_SHIFT:<3}, .pool={pool_type:<10}, .on={b.r.ON:<3}, .oh={b.r.OH:<3}, .ow={b.r.OW:<3}, .oc={b.r.OC:<3}, ") + ch.write( f".x_header={b.r.x_header_le_p[-1][0]:>23}u, .x_header_p0={b.r.x_header_le_p[0][0]:>23}u, .w_header={b.r.w_header_le_p[-1][0]:>23}u, .w_header_p0={b.r.x_header_le_p[0][0]:>25}u , ") + ch.write( f".debug_nhwc_words={b.oe_exp_nhwc.size:<5} }}") + + b_words += b.be.size if b.b else 0 + if b.idx != len(bundles)-1: + ch.write(',\n') + + ''' Bit masks for X_BITS ''' + + + ch.write(f"\n}};\n\n") + ch.write(f"#define X_BITS_L2 {int(np.log2(hw.X_BITS))}\n") + ch.write(f"#define W_BITS_L2 {int(np.log2(hw.K_BITS))}\n") + ch.write(f"#define X_PAD {hw.X_PAD}\n") + ch.write(f"#define KH_MAX {hw.KH_MAX}\n") + ch.write(f"#define PE_ROWS {hw.ROWS}\n") + ch.write(f"#define PE_COLS {hw.COLS}\n\n") + + ch.write(f"#define N_ADD_BUF {len(buffer_map) if len(buffer_map) > 0 else ''}\n") + ch.write(f"#define WB_BYTES {w_bytes + (b_words*hw.B_BITS)//8}\n") + ch.write(f"#define W_BYTES {w_bytes}\n") + ch.write(f"#define X_BYTES {x_bytes}\n") + ch.write(f"#define O_WORDS {o_words}\n") + ch.write(f"#define O_WORDS_MAX {o_words_max}\n") + ch.write(f"#define O_BYTES_MAX {o_bytes_max}\n") + ch.write(f"#define X_BYTES_ALL {x_bytes_all}\n") + ch.write(f"#define NHWC_WORDS {nhwc_words_max}\n") + ch.write(f"#define B_TYPE int{hw.B_BITS}_t\n") + ch.write(f"#define B_WORDS {b_words}\n") + ch.write(f'#define DATA_DIR "../{hw.DATA_DIR}"\n\n') + + mask_nums = [(2**hw.X_BITS-1) << (p*hw.X_BITS) for p in range(8//hw.X_BITS)] + mask_nums = ~np.array(mask_nums, dtype=np.uint8) + ch.write(f"static const uint8_t X_POSITION_INVERTED_MASKS [] = {{ {', '.join([str(n) for n in mask_nums])} }};\n") + + ''' + Write Binary Files + ''' + w_bitstring = b'' + x_bitstring = b'' + b_bitstring = b'' + for ib, b in enumerate(bundles): + x_bitstring_b = b'' + if b.b: + b_bitstring += b.be.astype(type_d['np'][hw.B_BITS]).tobytes() + for ip in range(b.r.CP): + xe = Bundle.pack_words_into_bytes(arr=b.xe[ip].flatten(), bits=hw.X_BITS) + x_bitstring_b += 
b.r.x_header_be_p[ip!=0].tobytes() + xe.tobytes() + + for it in range(b.r.IT): + we = Bundle.pack_words_into_bytes(arr=b.we[ip][it].flatten(), bits=hw.K_BITS) + w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + we.tobytes() + x_bitstring += x_bitstring_b + with open(f"{hw.DATA_DIR}/{ib}_x_sim.bin", 'wb') as f: + f.write(x_bitstring_b) + if ib==0: + with open(f"{hw.DATA_DIR}/x.bin", 'wb') as f: + f.write(x_bitstring_b) + + with open(f"{hw.DATA_DIR}/w.bin", 'wb') as f: + f.write(w_bitstring + b_bitstring) + + with open(f"{hw.DATA_DIR}/x_all.bin", 'wb') as f: + f.write(x_bitstring) + + + ''' + Write Text files of vectors + ''' + for b in bundles: + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_exp.txt", b.oe_exp_nhwc.flatten(), fmt='%d') + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_xe.txt", np.concatenate([a.flatten() for a in b.xe]), fmt='%d') + for ip in range(b.r.CP): + CM_p = b.r.CM_0 if ip==0 else b.r.CM + x_config = b.r.x_header_le_p[ip!=0][0] + x_config = format(x_config, f'#0{hw.IN_BITS}b') + x_config_words = [int(x_config[i:i+hw.X_BITS], 2) for i in range(0, len(x_config), hw.X_BITS)] + x_config_words.reverse() + x_config_words = np.array(x_config_words, dtype=np.int8) + + xp = b.xe[ip].flatten() + xp = np.concatenate([x_config_words, xp], axis=0) + assert xp.shape == (hw.IN_BITS/hw.X_BITS +b.r.XN*b.r.XL*b.r.XW*CM_p*(hw.ROWS+hw.X_PAD),) + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_x.txt", xp, fmt='%d') + + + for it in range(b.r.IT): + + w_config = b.r.w_header_le_p[ip!=0][0] + w_config = format(w_config, f'#0{hw.IN_BITS}b') + w_config_words = [int(w_config[i:i+hw.K_BITS], 2) for i in range(0, len(w_config), hw.K_BITS)] + w_config_words.reverse() + w_config_words = np.array(w_config_words,dtype=np.int8) + + wp = b.we[ip][it].flatten() + wp = np.concatenate([w_config_words, wp], axis=0) + assert wp.shape == (hw.IN_BITS/hw.K_BITS + (CM_p*b.r.KH+hw.CONFIG_BEATS)*hw.COLS,) + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_w.txt", wp, fmt='%d') + + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_exp.txt", b.ye_exp_p[ip][it].flatten(), fmt='%d') + print(f'Weights, inputs, outputs saved to {hw.DATA_DIR}/ib_ip_it_*.txt') + + def verify_inference(self, SIM, SIM_PATH): + + hw = self.hw + bundles = self.bundles + + ''' + RUN SIMULATION + ''' + hw.simulate(SIM=SIM, SIM_PATH=SIM_PATH) + + ''' + CHECK ERROR + ''' + for ib, b in enumerate(bundles): + + ''' Verify raw output ''' + for ip in range(b.r.CP): + for it in range(b.r.IT): + y_raw_exp = b.ye_exp_p[ip][it] + y_raw_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_raw_sim.txt", np.int32).reshape(y_raw_exp.shape) + error = np.sum(np.abs(y_raw_exp-y_raw_sim)) + assert error == 0, f"Error={error}, for y_raw_sim at {b.idx=}_{ip=}_{it=}" + + ''' Verify sum output ''' + y_sum_exp = b.oe_sum_exp + y_sum_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_sum_sim.txt", np.int32).reshape(y_sum_exp.shape) + error = np.sum(np.abs(y_sum_exp-y_sum_sim)) + assert error == 0, f"Error={error}, for y_sum_sim at {b.idx=}" + + ''' Verify processed output HWC''' + y_nhwc_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_sim.txt",np.int32).reshape(b.oe_exp_nhwc.shape) + error = np.sum(np.abs(y_nhwc_sim - b.oe_exp_nhwc)) + assert error == 0, f"sim:\n{y_nhwc_sim[0,:,:,0]}\n exp:\n{b.oe_exp_nhwc[0,:,:,0]}\n input:\n{b.before_pool[0,:,:,0] if b.pool else None}" + + ''' Verify tiled output''' + y_tiled_exp = b.o_int if ib == len(bundles)-1 else np.concatenate([a.flatten() for a in bundles[ib+1].xe]) + y_tiled_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_tiled_sim.txt", 
np.int32).reshape(y_tiled_exp.shape) + error = np.sum(np.abs(y_tiled_sim-y_tiled_exp)) + assert error == 0, f"Error={error}, for y_tiled_sim at {b.idx=}" + + ''' Verify packed output''' + if ib != len(bundles)-1: + with open(f'{hw.DATA_DIR}/{ib}_y_packed_sim.bin', 'rb') as f_sim, open(f'{hw.DATA_DIR}/{ib+1}_x_sim.bin', 'rb') as f_exp: + y_packed_sim = np.frombuffer(f_sim.read(), dtype=np.uint8) + y_packed_exp = np.frombuffer(f_exp.read(), dtype=np.uint8) + error = np.sum(np.abs(y_packed_sim-y_packed_exp)) + assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n" + + print(f"Bundle {b.idx}, Error: {error}") \ No newline at end of file diff --git a/deepsocflow/py/utils.py b/deepsocflow/py/utils.py index 7d9f731..e9afe9c 100644 --- a/deepsocflow/py/utils.py +++ b/deepsocflow/py/utils.py @@ -1,4 +1,17 @@ import numpy as np def clog2(x): - return int(np.ceil(np.log2(x))) \ No newline at end of file + return int(np.ceil(np.log2(x))) + + +class QTensor: + def __init__(self, bits, frac, tensor): + self.bits = bits + self.frac = frac + self.tensor = tensor + self.check_and_store(tensor.numpy()) # verifies the scaled tensor is integral and stores it in self.int + + def check_and_store(self, float_np): + int_np = float_np * 2**self.frac + assert np.all(int_np == int_np.astype(int)), f"Integer check failed for tensor: \nfloat:\n{float_np}, \n*2^{self.frac}:\n{int_np}" + self.int = int_np.astype(int) \ No newline at end of file diff --git a/deepsocflow/tcl/fpga/vivado.tcl b/deepsocflow/tcl/fpga/vivado.tcl index 8310a39..79de600 100644 --- a/deepsocflow/tcl/fpga/vivado.tcl +++ b/deepsocflow/tcl/fpga/vivado.tcl @@ -87,8 +87,8 @@ write_hw_platform -fixed -include_bit -force -file design_1_wrapper.xsa # Reports open_run impl_1 -if {![file exists reports]} {exec mkdir reports} -report_timing_summary -delay_type min_max -report_unconstrained -check_timing_verbose -max_paths 100 -input_pins -routable_nets -name timing_1 -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_timing_report.txt +if {![file exists $PROJECT_NAME/reports]} {exec mkdir $PROJECT_NAME/reports} +report_timing_summary -delay_type min_max -report_unconstrained -check_timing_verbose -max_paths 100 -input_pins -routable_nets -name timing_1 -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_timing_report.txt report_utilization -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_utilization_report.txt -name utilization_1 report_power -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_power_1.txt -name {power_1} report_drc -name drc_1 -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_drc_1.txt -ruledecks {default opt_checks placer_checks router_checks bitstream_checks incr_eco_checks eco_checks abs_checks} diff --git a/run/example.py b/run/example.py new file mode 100644 index 0000000..e19d90a --- /dev/null +++ b/run/example.py @@ -0,0 +1,78 @@ +from deepsocflow import Bundle, Hardware, QModel, QInput + +''' +0. 
Specify Hardware +''' +hw = Hardware ( # Alternatively: hw = Hardware.from_json('hardware.json') + processing_elements = (8, 96) , # (rows, columns) of multiply-add units + frequency_mhz = 250 , # + bits_input = 4 , # bit width of input pixels and activations + bits_weights = 4 , # bit width of weights + bits_sum = 16 , # bit width of accumulator + bits_bias = 16 , # bit width of bias + max_batch_size = 64 , # + max_channels_in = 2048 , # + max_kernel_size = 13 , # + max_image_size = 512 , # + ram_weights_depth = 20 , # + ram_edges_depth = 288 , # + axi_width = 64 , # + target_cpu_int_bits = 32 , # + valid_prob = 0.1 , # probability with which the AXI-Stream s_valid signal is toggled in simulation + ready_prob = 0.1 , # probability with which the AXI-Stream m_ready signal is toggled in simulation + data_dir = 'vectors', # directory to store generated test vectors + ) +hw.export() # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json +hw.export_vivado_tcl(board='zcu104') + + +''' +1. Build Model +''' +XN = 1 +input_shape = (XN,18,18,3) # (XN, XH, XW, CI) + +QINT_BITS = 0 +kq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)' +bq = f'quantized_bits({hw.B_BITS},{QINT_BITS},False,True,1)' +q1 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)' +q2 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,False,1)' +q3 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,True,1)' +q4 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)' + +x = x_in = QInput(shape=input_shape[1:], batch_size=XN, hw=hw, int_bits=QINT_BITS, name='input') + +x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) +x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q2}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':q3}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1},)(x) +x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, flatten= True)(x) +x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4})(x) + +model = QModel(inputs=x_in.raw, outputs=x) +model.compile() +model.summary() + +''' +2. TRAIN (using qkeras) +''' +# model.fit(...) + + +''' +3. 
EXPORT FOR INFERENCE +''' +SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado +# SIM, SIM_PATH = 'verilator', "" # For Verilator + +model.export_inference(x=model.random_input, hw=hw) # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin +model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation + +''' +4. IMPLEMENTATION + +a. FPGA: Open vivado, source vivado_flow.tcl +b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl +c. Compile C firmware with generated header (config_fw.h) and run on device +''' \ No newline at end of file diff --git a/run/param_test.py b/run/param_test.py index 8624ff7..bf77e25 100644 --- a/run/param_test.py +++ b/run/param_test.py @@ -1,26 +1,12 @@ -import numpy as np import os import pytest import itertools -from qkeras import * -from tensorflow.keras.layers import Input import sys sys.path.append("../../") -import deepsocflow -from deepsocflow import Bundle, Hardware - +from deepsocflow import Bundle, Hardware, QModel, QInput # Simulator: xsim on windows, verilator otherwise -if os.name=='nt': - SIM = 'xsim' - SIM_PATH = "F:/Xilinx/Vivado/2022.1/bin/" #os.path.join("F:", "Xilinx", "Vivado", "2022.1", "bin") -else: - SIM = 'verilator' - SIM_PATH = '' - - -keras.utils.set_random_seed(0) -type_d = { 'np': {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} } +(SIM, SIM_PATH) = ('xsim', "F:/Xilinx/Vivado/2022.1/bin/") if os.name=='nt' else ('verilator', '') def product_dict(**kwargs): for instance in itertools.product(*(kwargs.values())): @@ -47,7 +33,7 @@ def product_dict(**kwargs): ))) def test_dnn_engine(PARAMS): ''' - 0. Specify Hardware + 0. SPECIFY HARDWARE ''' hw = Hardware (**PARAMS) hw.export_json() @@ -55,304 +41,41 @@ def test_dnn_engine(PARAMS): hw.export() # Generates: config_hw.svh, config_hw.tcl hw.export_vivado_tcl(board='zcu104') - - xq, kq, bq = f'quantized_bits({hw.X_BITS},0,False,True,1)', f'quantized_bits({hw.K_BITS},0,False,True,1)', f'quantized_bits({hw.B_BITS},0,False,True,1)' - inp = {'bits':hw.X_BITS, 'frac':hw.X_BITS-1} - ''' - 1. Build Model + 1. 
BUILD MODEL ''' - input_shape = (8,18,18,3) # (XN, XH, XW, CI) - x = x_in = Input(input_shape[1:], name='input') - x = QActivation(xq)(x) - - x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0)' }, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) - x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)' }, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)' }, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0.125)'}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) - x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0)' },)(x) - x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0.125)'}, flatten= True)(x) - x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0.125)'})(x) + XN = 8 + input_shape = (XN,18,18,3) # (XN, XH, XW, CI) - model = Model(inputs=x_in, outputs=x) + QINT_BITS = 0 + kq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)' + bq = f'quantized_bits({hw.B_BITS},{QINT_BITS},False,True,1)' + q1 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)' + q2 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,False,1)' + q3 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,True,1)' + q4 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)' + x = x_in = QInput(shape=input_shape[1:], batch_size=XN, hw=hw, int_bits=QINT_BITS, name='input') - ''' - Pass Floating Point & Fixed Point Input - ''' - x = np.clip(np.random.randn(*input_shape), -1.0, 1.0) - y = model(x) + x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) + x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q2}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) + x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 
'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':q3}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) + x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) + x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1},)(x) + x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, flatten= True)(x) + x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4})(x) - inp_act_model = Model(inputs=model.input, outputs=model.layers[1].output) - inp['tensor'] = inp_act_model(x, training=False) - inp['int'] = inp['tensor'].numpy() * 2**inp['frac'] + model = QModel(inputs=x_in.raw, outputs=x) + model.compile() + model.summary() ''' - Clean the data directory + 2. TRAIN MODEL ''' - os.makedirs(hw.DATA_DIR, exist_ok=True) - for file in os.scandir(hw.DATA_DIR): - os.remove(file.path) - - bundles = model.layers[2:] - - - ''' - Export - ''' - buffer_map = [] - for ib, b in enumerate(bundles): - print(f'-----------------{b.idx}-----------------------') - b.process(inp if b.idx==0 else None, hw) - b.export(hw, False) #ib==len(bundles)-1 - - ''' - Buffer allocation for add bundle - ''' - print(f'input_map:{buffer_map}') - - '''Find and assign a free buffer. If not, add new buffer''' - b.add_out_buffer_idx = -1 - if len(b.add_tensor_dest) != 0: - for im in range(len(buffer_map)): - if buffer_map[im] is None: - buffer_map[im] = {'in':ib, 'out':b.add_tensor_dest} - b.add_out_buffer_idx = im - break - else: #m if break is not hit - b.add_out_buffer_idx = len(buffer_map) - buffer_map += [{'in':ib, 'out':b.add_tensor_dest}] - - print('add_out_buffer_idx:', b.add_out_buffer_idx) - - '''Free the buffers whose last destination is current bundle''' - for im in range(len(buffer_map)): - buf = buffer_map[im] - if buf is not None: - if buf['out'][-1] == ib: - buffer_map[im] = None - - print(f'output_map:{buffer_map}') - + # model.fit(...) ''' - Write Runtime Headers + 3. 
EXPORT FOR INFERENCE ''' - x_bytes_all = x_bytes = w_bytes = b_words = x_bytes_max = nhwc_words_max = o_bytes_max = o_words_max = 0 - out_buffer_idx = 1 - with open (f'./config_fw.h', 'w') as ch: - - ch.write(f"#define N_BUNDLES {len(bundles)}\n") - ch.write(f"Bundle_t bundles [N_BUNDLES] = {{\n") - - for ib, b in enumerate(bundles): - w_bpt = (hw.K_BITS*b.we[-1][0].size + hw.IN_BITS)//8 - w_bpt_p0 = (hw.K_BITS*b.we[0][0].size + hw.IN_BITS )//8 - x_bpt = (hw.X_BITS*b.xe[-1].size + hw.IN_BITS )//8 - x_bpt_p0 = (hw.X_BITS*b.xe[0].size + hw.IN_BITS )//8 - - if ib == len(bundles)-1: - o_words_b = b.o_int.size - o_bytes_b = o_words_b*4 # int or float - o_words = o_words_b - else: - b_next = bundles[ib+1] - o_wpt = b_next.xe[-1].size - o_wpt_p0 = b_next.xe[0].size - o_words_b = o_wpt_p0 + (b_next.r.CP-1)*o_wpt - - o_bpt = (hw.X_BITS*b_next.xe[-1].size + hw.IN_BITS)//8 - o_bpt_p0 = (hw.X_BITS*b_next.xe[0].size + hw.IN_BITS)//8 - o_bytes_b = o_bpt_p0 + (b_next.r.CP-1)*o_bpt - - xp_words = b.r.XN * b.r.XL * b.r.XW * (hw.ROWS+hw.X_PAD) - - w_bytes_b = (w_bpt_p0 + (b.r.CP-1)*w_bpt)*b.r.IT - x_bytes_b = (x_bpt_p0 + (b.r.CP-1)*x_bpt) - nhwc_words_b = b.r.XN * b.r.XH * b.r.XW * b.r.CO - - x_bytes_max = max(x_bytes_max, x_bytes_b) - nhwc_words_max = max(nhwc_words_max, nhwc_words_b) - o_bytes_max = max(o_bytes_max, o_bytes_b) - o_words_max = max(o_words_max, o_words_b) - w_bytes += w_bytes_b - x_bytes_all += x_bytes_b - - if ib == 0: - x_bytes = (x_bpt_p0 + (b.r.CP-1)*x_bpt) - - y_coe = b.r.CO_PRL - y_coe_tl = b.r.CO_PRL if (b.r.CO==b.r.IT*b.r.CO_PRL) else b.r.CO%b.r.IT - y_r_ll = hw.ROWS if b.r.XH==b.r.XL*hw.ROWS else b.r.XH % hw.ROWS - - ca_nzero, ca_shift, ca_pl_scale = b.core['act']['non_zero'], b.core['act']['shift_bits'], b.core['act']['plog_slope'] - - add_act_shift = b.add['act']['shift_bits'] if b.add is not None else 0 - add_out_buffer_idx = b.add_out_buffer_idx - add_in_buffer_idx = b.add['bundle'].add_out_buffer_idx if b.add is not None else -1 - - if b.pool is None: - pool_type = 'POOL_NONE' - elif b.pool['type'] == 'max': - pool_type = 'POOL_MAX' - elif b.pool['type'] == 'avg': - pool_type = 'POOL_AVG' - pool_act_shift = b.pool['act']['shift_bits'] if b.pool is not None else 0 - - out_buffer_idx = 1*(not out_buffer_idx) if ib != len(bundles)-1 else -1 # alternate between 0 and 1 - - ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<4}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, ") - ch.write( f".w_bpt={w_bpt:<5}, .w_bpt_p0={w_bpt_p0:<5}, .x_bpt={x_bpt:<5}, .x_bpt_p0={x_bpt_p0:<5}, .o_words={o_words_b:<5}, .o_bytes={o_bytes_b:<5}, ") - ch.write( f".out_buffer_idx={out_buffer_idx:<2}, .add_out_buffer_idx={add_out_buffer_idx:<2}, .add_in_buffer_idx={add_in_buffer_idx:<2}, ") - ch.write( f".is_bias={1*(b.b is not None):<3}, .is_flatten={1*b.flatten:<3}, ") - ch.write( f".b_offset={b_words:<3}, .b_val_shift={b.bias_val_shift:<3}, .b_bias_shift={b.bias_b_shift:<3}, ") - ch.write( f".ca_nzero={ca_nzero:<3}, .ca_shift={ca_shift:<3}, .ca_pl_scale={ca_pl_scale:<3}, .add_act_shift={add_act_shift:<3}, .pool_act_shift={pool_act_shift:<3}, ") - ch.write( f".csh={b.r.CSH:<3}, .ch={b.r.CYH:<3}, .csh_shift={b.r.CSH_SHIFT:<3}, .pkh={b.r.PKH:<3}, .psh={b.r.PSH:<3}, .ph={b.r.PYH:<3}, .psh_shift={b.r.PSH_SHIFT:<3}, .csw={b.r.CSW:<3}, .cw={b.r.CYW:<3}, .csw_shift={b.r.CSW_SHIFT:<3}, 
.pkw={b.r.PKW:<3}, .psw={b.r.PSW:<3}, .pw={b.r.PYW:<3}, .psw_shift={b.r.PSW_SHIFT:<3}, .pool={pool_type:<10}, .on={b.r.ON:<3}, .oh={b.r.OH:<3}, .ow={b.r.OW:<3}, .oc={b.r.OC:<3}, ") - ch.write( f".x_header={b.r.x_header_le_p[-1][0]:>23}u, .x_header_p0={b.r.x_header_le_p[0][0]:>23}u, .w_header={b.r.w_header_le_p[-1][0]:>23}u, .w_header_p0={b.r.x_header_le_p[0][0]:>25}u , ") - ch.write( f".debug_nhwc_words={b.oe_exp_nhwc.size:<5} }}") - - b_words += b.be.size if b.b else 0 - if b.idx != len(bundles)-1: - ch.write(',\n') - - ''' Bit masks for X_BITS ''' - - - ch.write(f"\n}};\n\n") - ch.write(f"#define X_BITS_L2 {int(np.log2(hw.X_BITS))}\n") - ch.write(f"#define W_BITS_L2 {int(np.log2(hw.K_BITS))}\n") - ch.write(f"#define X_PAD {hw.X_PAD}\n") - ch.write(f"#define KH_MAX {hw.KH_MAX}\n") - ch.write(f"#define PE_ROWS {hw.ROWS}\n") - ch.write(f"#define PE_COLS {hw.COLS}\n\n") - - ch.write(f"#define N_ADD_BUF {len(buffer_map) if len(buffer_map) > 0 else ''}\n") - ch.write(f"#define WB_BYTES {w_bytes + (b_words*hw.B_BITS)//8}\n") - ch.write(f"#define W_BYTES {w_bytes}\n") - ch.write(f"#define X_BYTES {x_bytes}\n") - ch.write(f"#define O_WORDS {o_words}\n") - ch.write(f"#define O_WORDS_MAX {o_words_max}\n") - ch.write(f"#define O_BYTES_MAX {o_bytes_max}\n") - ch.write(f"#define X_BYTES_ALL {x_bytes_all}\n") - ch.write(f"#define NHWC_WORDS {nhwc_words_max}\n") - ch.write(f"#define B_TYPE int{hw.B_BITS}_t\n") - ch.write(f"#define B_WORDS {b_words}\n") - ch.write(f'#define DATA_DIR "../{hw.DATA_DIR}"\n\n') - - mask_nums = [(2**hw.X_BITS-1) << (p*hw.X_BITS) for p in range(8//hw.X_BITS)] - mask_nums = ~np.array(mask_nums, dtype=np.uint8) - ch.write(f"static const uint8_t X_POSITION_INVERTED_MASKS [] = {{ {', '.join([str(n) for n in mask_nums])} }};\n") - - ''' - Write Binary Files - ''' - w_bitstring = b'' - x_bitstring = b'' - b_bitstring = b'' - for ib, b in enumerate(bundles): - x_bitstring_b = b'' - if b.b: - b_bitstring += b.be.astype(type_d['np'][hw.B_BITS]).tobytes() - for ip in range(b.r.CP): - xe = Bundle.pack_words_into_bytes(arr=b.xe[ip].flatten(), bits=hw.X_BITS) - x_bitstring_b += b.r.x_header_be_p[ip!=0].tobytes() + xe.tobytes() - - for it in range(b.r.IT): - we = Bundle.pack_words_into_bytes(arr=b.we[ip][it].flatten(), bits=hw.K_BITS) - w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + we.tobytes() - x_bitstring += x_bitstring_b - with open(f"{hw.DATA_DIR}/{ib}_x_sim.bin", 'wb') as f: - f.write(x_bitstring_b) - if ib==0: - with open(f"{hw.DATA_DIR}/x.bin", 'wb') as f: - f.write(x_bitstring_b) - - with open(f"{hw.DATA_DIR}/w.bin", 'wb') as f: - f.write(w_bitstring + b_bitstring) - - with open(f"{hw.DATA_DIR}/x_all.bin", 'wb') as f: - f.write(x_bitstring) - - - ''' - Write Text files of vectors - ''' - for b in bundles: - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_exp.txt", b.oe_exp_nhwc.flatten(), fmt='%d') - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_xe.txt", np.concatenate([a.flatten() for a in b.xe]), fmt='%d') - for ip in range(b.r.CP): - CM_p = b.r.CM_0 if ip==0 else b.r.CM - x_config = b.r.x_header_le_p[ip!=0][0] - x_config = format(x_config, f'#0{hw.IN_BITS}b') - x_config_words = [int(x_config[i:i+hw.X_BITS], 2) for i in range(0, len(x_config), hw.X_BITS)] - x_config_words.reverse() - x_config_words = np.array(x_config_words, dtype=np.int8) - - xp = b.xe[ip].flatten() - xp = np.concatenate([x_config_words, xp], axis=0) - assert xp.shape == (hw.IN_BITS/hw.X_BITS +b.r.XN*b.r.XL*b.r.XW*CM_p*(hw.ROWS+hw.X_PAD),) - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_x.txt", xp, fmt='%d') - - - for it in 
range(b.r.IT): - - w_config = b.r.w_header_le_p[ip!=0][0] - w_config = format(w_config, f'#0{hw.IN_BITS}b') - w_config_words = [int(w_config[i:i+hw.K_BITS], 2) for i in range(0, len(w_config), hw.K_BITS)] - w_config_words.reverse() - w_config_words = np.array(w_config_words,dtype=np.int8) - - wp = b.we[ip][it].flatten() - wp = np.concatenate([w_config_words, wp], axis=0) - assert wp.shape == (hw.IN_BITS/hw.K_BITS + (CM_p*b.r.KH+hw.CONFIG_BEATS)*hw.COLS,) - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_w.txt", wp, fmt='%d') - - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_exp.txt", b.ye_exp_p[ip][it].flatten(), fmt='%d') - print(f'Weights, inputs, outputs saved to {hw.DATA_DIR}/ib_ip_it_*.txt') - - - ''' - RUN SIMULATION - ''' - hw.simulate(SIM=SIM, SIM_PATH=SIM_PATH) - - - ''' - CHECK ERROR - ''' - for ib, b in enumerate(bundles): - - ''' Verify raw output ''' - for ip in range(b.r.CP): - for it in range(b.r.IT): - y_raw_exp = b.ye_exp_p[ip][it] - y_raw_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_raw_sim.txt", np.int32).reshape(y_raw_exp.shape) - error = np.sum(np.abs(y_raw_exp-y_raw_sim)) - assert error == 0, f"Error={error}, for y_raw_sim at {b.idx=}_{ip=}_{it=}" - - ''' Verify sum output ''' - y_sum_exp = b.oe_sum_exp - y_sum_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_sum_sim.txt", np.int32).reshape(y_sum_exp.shape) - error = np.sum(np.abs(y_sum_exp-y_sum_sim)) - assert error == 0, f"Error={error}, for y_sum_sim at {b.idx=}" - - ''' Verify processed output HWC''' - y_nhwc_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_sim.txt",np.int32).reshape(b.oe_exp_nhwc.shape) - error = np.sum(np.abs(y_nhwc_sim - b.oe_exp_nhwc)) - assert error == 0, f"sim:\n{y_nhwc_sim[0,:,:,0]}\n exp:\n{b.oe_exp_nhwc[0,:,:,0]}\n input:\n{b.before_pool[0,:,:,0] if b.pool else None}" - - ''' Verify tiled output''' - y_tiled_exp = b.o_int if ib == len(bundles)-1 else np.concatenate([a.flatten() for a in bundles[ib+1].xe]) - y_tiled_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_tiled_sim.txt", np.int32).reshape(y_tiled_exp.shape) - error = np.sum(np.abs(y_tiled_sim-y_tiled_exp)) - assert error == 0, f"Error={error}, for y_tiled_sim at {b.idx=}" - - ''' Verify packed output''' - if ib != len(bundles)-1: - with open(f'{hw.DATA_DIR}/{ib}_y_packed_sim.bin', 'rb') as f_sim, open(f'{hw.DATA_DIR}/{ib+1}_x_sim.bin', 'rb') as f_exp: - y_packed_sim = np.frombuffer(f_sim.read(), dtype=np.uint8) - y_packed_exp = np.frombuffer(f_exp.read(), dtype=np.uint8) - error = np.sum(np.abs(y_packed_sim-y_packed_exp)) - assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n" - - print(f"Bundle {b.idx}, Error: {error}") \ No newline at end of file + model.export_inference(x=model.random_input, hw=hw) + model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) \ No newline at end of file diff --git a/run/work/config_hw.svh b/run/work/config_hw.svh index 6e99d35..aee0e25 100644 --- a/run/work/config_hw.svh +++ b/run/work/config_hw.svh @@ -5,7 +5,7 @@ `define COLS 24 // PE cols, constrained by resources `define X_BITS 4 // Bits per word in input `define K_BITS 4 // Bits per word in input -`define Y_BITS 24 // Bits per word in output of conv +`define Y_BITS 16 // Bits per word in output of conv `define KH_MAX 13 // max of kernel height, across layers `define KW_MAX 13 // max of kernel width, across layers diff --git a/run/work/config_hw.tcl b/run/work/config_hw.tcl index 685521b..d0693d5 100644 --- a/run/work/config_hw.tcl +++ b/run/work/config_hw.tcl @@ -3,7 
+3,7 @@ set FREQ 250 set ROWS 8 -set COLS 96 +set COLS 24 set X_BITS 4 set K_BITS 4 set Y_BITS 16 diff --git a/run/work/hardware.json b/run/work/hardware.json index 471e523..2634367 100644 --- a/run/work/hardware.json +++ b/run/work/hardware.json @@ -6,18 +6,12 @@ "frequency_mhz": 250, "bits_input": 4, "bits_weights": 4, - "bits_sum": 24, + "bits_sum": 16, "bits_bias": 16, "max_batch_size": 64, "max_channels_in": 2048, - "max_kernel_size": [ - 13, - 13 - ], - "max_image_size": [ - 512, - 512 - ], + "max_kernel_size": 13, + "max_image_size": 512, "ram_weights_depth": 20, "ram_edges_depth": 288, "axi_width": 64,