diff --git a/README.md b/README.md index d59a324..a2016c5 100644 --- a/README.md +++ b/README.md @@ -15,74 +15,86 @@ We present a highly flexible, high performance accelerator system that can be ad ![System](docs/overall.png) -## User API (WIP) +## User API ```py -from deepsocflow import Hardware, Bundle, QInput, BundleModel, QConvCore, QDenseCore, QAdd, QPool, Softmax, QLeakyReLu +from deepsocflow import Bundle, Hardware, QModel, QInput ''' 0. Specify Hardware ''' -hw = Hardware ( - processing_elements = (8, 96), - frequency = 1000, - bits_input = 8, - bits_weights = 4, - bits_sum = 24, - bits_bias = 16, - max_kernel_size = (13, 13), - max_channels_in = 512, - max_channels_out = 512, - max_image_size = (32,32), +hw = Hardware ( # Alternatively: hw = Hardware.from_json('hardware.json') + processing_elements = (8, 96) , # (rows, columns) of multiply-add units + frequency_mhz = 250 , # + bits_input = 4 , # bit width of input pixels and activations + bits_weights = 4 , # bit width of weights + bits_sum = 16 , # bit width of accumulator + bits_bias = 16 , # bit width of bias + max_batch_size = 64 , # + max_channels_in = 2048 , # + max_kernel_size = 13 , # + max_image_size = 512 , # + ram_weights_depth = 20 , # + ram_edges_depth = 288 , # + axi_width = 64 , # + target_cpu_int_bits = 32 , # + valid_prob = 0.1 , # probability with which the AXI-Stream s_valid signal is toggled in simulation + ready_prob = 0.1 , # probability with which the AXI-Stream m_ready signal is toggled in simulation + data_dir = 'vectors', # directory to store generated test vectors ) -hw.export() # Generates: config_hw.svh, config_hw.tcl, config_hw.json -# Alternatively: hw = Hardware.from_json('config_hw.json') +hw.export() # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json +hw.export_vivado_tcl(board='zcu104') + ''' 1. 
Build Model ''' -x = QInput( input_shape= (8,32,32,3), hw= hw, input_frac_bits= 4) - -x = Bundle( core= QConvCore(filters= 32, kernel_size= (7,7), strides= (2,2), padding= 'same', weights_frac_bits= 4, bias_frac_bits= 8, activation= QLeakyReLu(negative_slope=0.125, frac_bits= 4 )), - pool= QPool(type= 'max', size= (3,3), strides= (1,1), padding= 'same', frac_bits= 4) - )(x) -x_skip = x -x = Bundle( core= QConvCore(filters= 64, kernel_size= (3,3), weights_frac_bits= 4, bias_frac_bits= 8, activation= QLeakyReLu(negative_slope=0, frac_bits= 4)), - pool= QAdd(x_skip), # Residual addition - flatten= True, - )(x) -x = Bundle( dense= QDenseCore(outputs= 10, weights_frac_bits= 4, bias_frac_bits= 8, activation= Softmax()), - )(x) -model = BundleModel(inputs=x_in, outputs=x) -# Alternatively: model = BundleModel.from_json('config_model.json') +XN = 1 +input_shape = (XN,18,18,3) # (XN, XH, XW, CI) + +QINT_BITS = 0 +kq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)' +bq = f'quantized_bits({hw.B_BITS},{QINT_BITS},False,True,1)' +q1 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)' +q2 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,False,1)' +q3 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,True,1)' +q4 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)' + +x = x_in = QInput(shape=input_shape[1:], batch_size=XN, hw=hw, int_bits=QINT_BITS, name='input') + +x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) +x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q2}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':q3}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1},)(x) +x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, flatten= True)(x) +x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4})(x) + +model = QModel(inputs=x_in.raw, outputs=x) +model.compile() +model.summary() ''' 2. TRAIN (using qkeras) ''' -model.compile(...) -model.fit(...) -model.export() # Generates: savedmodel, config_model.json +# model.fit(...) + ''' 3. 
EXPORT FOR INFERENCE - -- Runs forward pass in float32, records intermediate tensors -- Runs forward pass in integer, comparing with float32 pass for zero error -- Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation -- Prints performance estimate (time, latency) -- Generates - - config_firmware.h - - weights.bin - - expected.bin ''' -model.export_inference(x=model.random_input) # -> config_firmware.h, weights.bin +SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado +# SIM, SIM_PATH = 'verilator', "" # For Verilator + +model.export_inference(x=model.random_input, hw=hw) # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin +model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation ''' 4. IMPLEMENTATION -a. FPGA: Run vivado.tcl +a. FPGA: Open vivado, source vivado_flow.tcl b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl -c. Compile C firmware with generated header (model.h) and run on device +c. Compile C firmware with generated header (config_fw.h) and run on device ''' ``` diff --git a/deepsocflow/py/__init__.py b/deepsocflow/py/__init__.py index 77079a9..3ce19c1 100644 --- a/deepsocflow/py/__init__.py +++ b/deepsocflow/py/__init__.py @@ -1,3 +1,5 @@ from . import hardware, bundle from .hardware import * -from .bundle import * \ No newline at end of file +from .bundle import * +from .model import * +from .layers import * \ No newline at end of file diff --git a/deepsocflow/py/layers.py b/deepsocflow/py/layers.py index eb65747..7c8db44 100644 --- a/deepsocflow/py/layers.py +++ b/deepsocflow/py/layers.py @@ -1,20 +1,13 @@ from qkeras import QActivation -from tensorflow.keras.layers import Input, Flatten, Add, MaxPooling2D +from tensorflow.keras.layers import Layer, Input, Flatten, Add, MaxPooling2D import numpy as np -# class QInput(Input): -# def __init__(self, shape, batch_size, hw, frac_bits, name=None): - -# self.hw = hw -# self.input_frac_bits = input_frac_bits -# super().__init__(shape=shape, name=name) - -# int_bits = hw.X_BITS - self.frac_bits + 1 - -# x = Input(shape=shape, batch_size=batch_size, name=name) -# x = QActivation(f'quantized_bits(bits={hw.X_BITS}, integer={int_bits}, False,True,1)')(x) - -# return x +def QInput(shape, batch_size, hw, int_bits, name=None): + x_raw = Input(shape=shape, batch_size=batch_size, name=name) + x = QActivation(f'quantized_bits({hw.X_BITS},{int_bits},False,True,1)')(x_raw) + x.raw = x_raw + x.hw = hw + return x diff --git a/deepsocflow/py/model.py b/deepsocflow/py/model.py index 68d93b4..818d968 100644 --- a/deepsocflow/py/model.py +++ b/deepsocflow/py/model.py @@ -1,6 +1,306 @@ from qkeras import Model +import numpy as np +import tensorflow.keras +import os +from deepsocflow.py.bundle import Bundle class QModel(Model): def __init__(self, inputs, outputs, name=None): - super().__init__(inputs, outputs, name=name) \ No newline at end of file + super().__init__(inputs, outputs, name=name) + + + @property + def random_input(self): + tensorflow.keras.utils.set_random_seed(0) + return np.clip(np.random.randn(*self.input.shape), -1.0, 1.0) + + + def export_inference(self, x, hw): + + type_d = { 'np': {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} } + + y = self(x, training=False) + self.hw = hw + + inp_act_model = Model(inputs=self.input, outputs=self.layers[1].output) + 
inp_tensor = inp_act_model(x, training=False) + + inp = { + 'bits':hw.X_BITS, + 'frac':hw.X_BITS-1 - self.layers[1].quantizer.integer, + 'tensor':inp_tensor, + 'int':inp_tensor.numpy() * 2**(hw.X_BITS-1) + } + + bundles = self.layers[2:] + self.bundles = bundles + + ''' + Export + ''' + + ''' Clean the data directory''' + os.makedirs(hw.DATA_DIR, exist_ok=True) + for file in os.scandir(hw.DATA_DIR): + os.remove(file.path) + + buffer_map = [] + for ib, b in enumerate(bundles): + print(f'-----------------{b.idx}-----------------------') + b.process(inp if b.idx==0 else None, hw) + b.export(hw, False) #ib==len(bundles)-1 + + ''' + Buffer allocation for add bundle + ''' + print(f'input_map:{buffer_map}') + + '''Find and assign a free buffer. If not, add new buffer''' + b.add_out_buffer_idx = -1 + if len(b.add_tensor_dest) != 0: + for im in range(len(buffer_map)): + if buffer_map[im] is None: + buffer_map[im] = {'in':ib, 'out':b.add_tensor_dest} + b.add_out_buffer_idx = im + break + else: #m if break is not hit + b.add_out_buffer_idx = len(buffer_map) + buffer_map += [{'in':ib, 'out':b.add_tensor_dest}] + + print('add_out_buffer_idx:', b.add_out_buffer_idx) + + '''Free the buffers whose last destination is current bundle''' + for im in range(len(buffer_map)): + buf = buffer_map[im] + if buf is not None: + if buf['out'][-1] == ib: + buffer_map[im] = None + + print(f'output_map:{buffer_map}') + + + ''' + Write Runtime Headers + ''' + x_bytes_all = x_bytes = w_bytes = b_words = x_bytes_max = nhwc_words_max = o_bytes_max = o_words_max = 0 + out_buffer_idx = 1 + with open (f'./config_fw.h', 'w') as ch: + + ch.write(f"#define N_BUNDLES {len(bundles)}\n") + ch.write(f"Bundle_t bundles [N_BUNDLES] = {{\n") + + for ib, b in enumerate(bundles): + w_bpt = (hw.K_BITS*b.we[-1][0].size + hw.IN_BITS)//8 + w_bpt_p0 = (hw.K_BITS*b.we[0][0].size + hw.IN_BITS )//8 + x_bpt = (hw.X_BITS*b.xe[-1].size + hw.IN_BITS )//8 + x_bpt_p0 = (hw.X_BITS*b.xe[0].size + hw.IN_BITS )//8 + + if ib == len(bundles)-1: + o_words_b = b.o_int.size + o_bytes_b = o_words_b*4 # int or float + o_words = o_words_b + else: + b_next = bundles[ib+1] + o_wpt = b_next.xe[-1].size + o_wpt_p0 = b_next.xe[0].size + o_words_b = o_wpt_p0 + (b_next.r.CP-1)*o_wpt + + o_bpt = (hw.X_BITS*b_next.xe[-1].size + hw.IN_BITS)//8 + o_bpt_p0 = (hw.X_BITS*b_next.xe[0].size + hw.IN_BITS)//8 + o_bytes_b = o_bpt_p0 + (b_next.r.CP-1)*o_bpt + + xp_words = b.r.XN * b.r.XL * b.r.XW * (hw.ROWS+hw.X_PAD) + + w_bytes_b = (w_bpt_p0 + (b.r.CP-1)*w_bpt)*b.r.IT + x_bytes_b = (x_bpt_p0 + (b.r.CP-1)*x_bpt) + nhwc_words_b = b.r.XN * b.r.XH * b.r.XW * b.r.CO + + x_bytes_max = max(x_bytes_max, x_bytes_b) + nhwc_words_max = max(nhwc_words_max, nhwc_words_b) + o_bytes_max = max(o_bytes_max, o_bytes_b) + o_words_max = max(o_words_max, o_words_b) + w_bytes += w_bytes_b + x_bytes_all += x_bytes_b + + if ib == 0: + x_bytes = (x_bpt_p0 + (b.r.CP-1)*x_bpt) + + y_coe = b.r.CO_PRL + y_coe_tl = b.r.CO_PRL if (b.r.CO==b.r.IT*b.r.CO_PRL) else b.r.CO%b.r.IT + y_r_ll = hw.ROWS if b.r.XH==b.r.XL*hw.ROWS else b.r.XH % hw.ROWS + + ca_nzero, ca_shift, ca_pl_scale = b.core['act']['non_zero'], b.core['act']['shift_bits'], b.core['act']['plog_slope'] + + add_act_shift = b.add['act']['shift_bits'] if b.add is not None else 0 + add_out_buffer_idx = b.add_out_buffer_idx + add_in_buffer_idx = b.add['bundle'].add_out_buffer_idx if b.add is not None else -1 + + if b.pool is None: + pool_type = 'POOL_NONE' + elif b.pool['type'] == 'max': + pool_type = 'POOL_MAX' + elif b.pool['type'] == 'avg': + pool_type = 
'POOL_AVG' + pool_act_shift = b.pool['act']['shift_bits'] if b.pool is not None else 0 + + out_buffer_idx = 1*(not out_buffer_idx) if ib != len(bundles)-1 else -1 # alternate between 0 and 1 + + ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<4}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, ") + ch.write( f".w_bpt={w_bpt:<5}, .w_bpt_p0={w_bpt_p0:<5}, .x_bpt={x_bpt:<5}, .x_bpt_p0={x_bpt_p0:<5}, .o_words={o_words_b:<5}, .o_bytes={o_bytes_b:<5}, ") + ch.write( f".out_buffer_idx={out_buffer_idx:<2}, .add_out_buffer_idx={add_out_buffer_idx:<2}, .add_in_buffer_idx={add_in_buffer_idx:<2}, ") + ch.write( f".is_bias={1*(b.b is not None):<3}, .is_flatten={1*b.flatten:<3}, ") + ch.write( f".b_offset={b_words:<3}, .b_val_shift={b.bias_val_shift:<3}, .b_bias_shift={b.bias_b_shift:<3}, ") + ch.write( f".ca_nzero={ca_nzero:<3}, .ca_shift={ca_shift:<3}, .ca_pl_scale={ca_pl_scale:<3}, .add_act_shift={add_act_shift:<3}, .pool_act_shift={pool_act_shift:<3}, ") + ch.write( f".csh={b.r.CSH:<3}, .ch={b.r.CYH:<3}, .csh_shift={b.r.CSH_SHIFT:<3}, .pkh={b.r.PKH:<3}, .psh={b.r.PSH:<3}, .ph={b.r.PYH:<3}, .psh_shift={b.r.PSH_SHIFT:<3}, .csw={b.r.CSW:<3}, .cw={b.r.CYW:<3}, .csw_shift={b.r.CSW_SHIFT:<3}, .pkw={b.r.PKW:<3}, .psw={b.r.PSW:<3}, .pw={b.r.PYW:<3}, .psw_shift={b.r.PSW_SHIFT:<3}, .pool={pool_type:<10}, .on={b.r.ON:<3}, .oh={b.r.OH:<3}, .ow={b.r.OW:<3}, .oc={b.r.OC:<3}, ") + ch.write( f".x_header={b.r.x_header_le_p[-1][0]:>23}u, .x_header_p0={b.r.x_header_le_p[0][0]:>23}u, .w_header={b.r.w_header_le_p[-1][0]:>23}u, .w_header_p0={b.r.x_header_le_p[0][0]:>25}u , ") + ch.write( f".debug_nhwc_words={b.oe_exp_nhwc.size:<5} }}") + + b_words += b.be.size if b.b else 0 + if b.idx != len(bundles)-1: + ch.write(',\n') + + ''' Bit masks for X_BITS ''' + + + ch.write(f"\n}};\n\n") + ch.write(f"#define X_BITS_L2 {int(np.log2(hw.X_BITS))}\n") + ch.write(f"#define W_BITS_L2 {int(np.log2(hw.K_BITS))}\n") + ch.write(f"#define X_PAD {hw.X_PAD}\n") + ch.write(f"#define KH_MAX {hw.KH_MAX}\n") + ch.write(f"#define PE_ROWS {hw.ROWS}\n") + ch.write(f"#define PE_COLS {hw.COLS}\n\n") + + ch.write(f"#define N_ADD_BUF {len(buffer_map) if len(buffer_map) > 0 else ''}\n") + ch.write(f"#define WB_BYTES {w_bytes + (b_words*hw.B_BITS)//8}\n") + ch.write(f"#define W_BYTES {w_bytes}\n") + ch.write(f"#define X_BYTES {x_bytes}\n") + ch.write(f"#define O_WORDS {o_words}\n") + ch.write(f"#define O_WORDS_MAX {o_words_max}\n") + ch.write(f"#define O_BYTES_MAX {o_bytes_max}\n") + ch.write(f"#define X_BYTES_ALL {x_bytes_all}\n") + ch.write(f"#define NHWC_WORDS {nhwc_words_max}\n") + ch.write(f"#define B_TYPE int{hw.B_BITS}_t\n") + ch.write(f"#define B_WORDS {b_words}\n") + ch.write(f'#define DATA_DIR "../{hw.DATA_DIR}"\n\n') + + mask_nums = [(2**hw.X_BITS-1) << (p*hw.X_BITS) for p in range(8//hw.X_BITS)] + mask_nums = ~np.array(mask_nums, dtype=np.uint8) + ch.write(f"static const uint8_t X_POSITION_INVERTED_MASKS [] = {{ {', '.join([str(n) for n in mask_nums])} }};\n") + + ''' + Write Binary Files + ''' + w_bitstring = b'' + x_bitstring = b'' + b_bitstring = b'' + for ib, b in enumerate(bundles): + x_bitstring_b = b'' + if b.b: + b_bitstring += b.be.astype(type_d['np'][hw.B_BITS]).tobytes() + for ip in range(b.r.CP): + xe = Bundle.pack_words_into_bytes(arr=b.xe[ip].flatten(), bits=hw.X_BITS) + x_bitstring_b += 
b.r.x_header_be_p[ip!=0].tobytes() + xe.tobytes() + + for it in range(b.r.IT): + we = Bundle.pack_words_into_bytes(arr=b.we[ip][it].flatten(), bits=hw.K_BITS) + w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + we.tobytes() + x_bitstring += x_bitstring_b + with open(f"{hw.DATA_DIR}/{ib}_x_sim.bin", 'wb') as f: + f.write(x_bitstring_b) + if ib==0: + with open(f"{hw.DATA_DIR}/x.bin", 'wb') as f: + f.write(x_bitstring_b) + + with open(f"{hw.DATA_DIR}/w.bin", 'wb') as f: + f.write(w_bitstring + b_bitstring) + + with open(f"{hw.DATA_DIR}/x_all.bin", 'wb') as f: + f.write(x_bitstring) + + + ''' + Write Text files of vectors + ''' + for b in bundles: + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_exp.txt", b.oe_exp_nhwc.flatten(), fmt='%d') + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_xe.txt", np.concatenate([a.flatten() for a in b.xe]), fmt='%d') + for ip in range(b.r.CP): + CM_p = b.r.CM_0 if ip==0 else b.r.CM + x_config = b.r.x_header_le_p[ip!=0][0] + x_config = format(x_config, f'#0{hw.IN_BITS}b') + x_config_words = [int(x_config[i:i+hw.X_BITS], 2) for i in range(0, len(x_config), hw.X_BITS)] + x_config_words.reverse() + x_config_words = np.array(x_config_words, dtype=np.int8) + + xp = b.xe[ip].flatten() + xp = np.concatenate([x_config_words, xp], axis=0) + assert xp.shape == (hw.IN_BITS/hw.X_BITS +b.r.XN*b.r.XL*b.r.XW*CM_p*(hw.ROWS+hw.X_PAD),) + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_x.txt", xp, fmt='%d') + + + for it in range(b.r.IT): + + w_config = b.r.w_header_le_p[ip!=0][0] + w_config = format(w_config, f'#0{hw.IN_BITS}b') + w_config_words = [int(w_config[i:i+hw.K_BITS], 2) for i in range(0, len(w_config), hw.K_BITS)] + w_config_words.reverse() + w_config_words = np.array(w_config_words,dtype=np.int8) + + wp = b.we[ip][it].flatten() + wp = np.concatenate([w_config_words, wp], axis=0) + assert wp.shape == (hw.IN_BITS/hw.K_BITS + (CM_p*b.r.KH+hw.CONFIG_BEATS)*hw.COLS,) + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_w.txt", wp, fmt='%d') + + np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_exp.txt", b.ye_exp_p[ip][it].flatten(), fmt='%d') + print(f'Weights, inputs, outputs saved to {hw.DATA_DIR}/ib_ip_it_*.txt') + + def verify_inference(self, SIM, SIM_PATH): + + hw = self.hw + bundles = self.bundles + + ''' + RUN SIMULATION + ''' + hw.simulate(SIM=SIM, SIM_PATH=SIM_PATH) + + ''' + CHECK ERROR + ''' + for ib, b in enumerate(bundles): + + ''' Verify raw output ''' + for ip in range(b.r.CP): + for it in range(b.r.IT): + y_raw_exp = b.ye_exp_p[ip][it] + y_raw_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_raw_sim.txt", np.int32).reshape(y_raw_exp.shape) + error = np.sum(np.abs(y_raw_exp-y_raw_sim)) + assert error == 0, f"Error={error}, for y_raw_sim at {b.idx=}_{ip=}_{it=}" + + ''' Verify sum output ''' + y_sum_exp = b.oe_sum_exp + y_sum_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_sum_sim.txt", np.int32).reshape(y_sum_exp.shape) + error = np.sum(np.abs(y_sum_exp-y_sum_sim)) + assert error == 0, f"Error={error}, for y_sum_sim at {b.idx=}" + + ''' Verify processed output HWC''' + y_nhwc_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_sim.txt",np.int32).reshape(b.oe_exp_nhwc.shape) + error = np.sum(np.abs(y_nhwc_sim - b.oe_exp_nhwc)) + assert error == 0, f"sim:\n{y_nhwc_sim[0,:,:,0]}\n exp:\n{b.oe_exp_nhwc[0,:,:,0]}\n input:\n{b.before_pool[0,:,:,0] if b.pool else None}" + + ''' Verify tiled output''' + y_tiled_exp = b.o_int if ib == len(bundles)-1 else np.concatenate([a.flatten() for a in bundles[ib+1].xe]) + y_tiled_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_tiled_sim.txt", 
np.int32).reshape(y_tiled_exp.shape) + error = np.sum(np.abs(y_tiled_sim-y_tiled_exp)) + assert error == 0, f"Error={error}, for y_tiled_sim at {b.idx=}" + + ''' Verify packed output''' + if ib != len(bundles)-1: + with open(f'{hw.DATA_DIR}/{ib}_y_packed_sim.bin', 'rb') as f_sim, open(f'{hw.DATA_DIR}/{ib+1}_x_sim.bin', 'rb') as f_exp: + y_packed_sim = np.frombuffer(f_sim.read(), dtype=np.uint8) + y_packed_exp = np.frombuffer(f_exp.read(), dtype=np.uint8) + error = np.sum(np.abs(y_packed_sim-y_packed_exp)) + assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n" + + print(f"Bundle {b.idx}, Error: {error}") \ No newline at end of file diff --git a/deepsocflow/py/utils.py b/deepsocflow/py/utils.py index 7d9f731..e9afe9c 100644 --- a/deepsocflow/py/utils.py +++ b/deepsocflow/py/utils.py @@ -1,4 +1,17 @@ import numpy as np def clog2(x): - return int(np.ceil(np.log2(x))) \ No newline at end of file + return int(np.ceil(np.log2(x))) + + +class QTensor: + def __init__(self, bits, frac, tensor): + self.bits = bits + self.frac = frac + self.tensor = tensor + self.check_and_store(tensor.numpy()) # verifies the scaled tensor is integral and stores it in self.int + + def check_and_store(self, float_np): + int_np = float_np * 2**self.frac + assert np.all(int_np == int_np.astype(int)), f"Integer check failed for tensor: \nfloat:\n{float_np}, \n*2^{self.frac}:\n{int_np}" + self.int = int_np.astype(int) \ No newline at end of file diff --git a/deepsocflow/tcl/fpga/vivado.tcl b/deepsocflow/tcl/fpga/vivado.tcl index 8310a39..79de600 100644 --- a/deepsocflow/tcl/fpga/vivado.tcl +++ b/deepsocflow/tcl/fpga/vivado.tcl @@ -87,8 +87,8 @@ write_hw_platform -fixed -include_bit -force -file design_1_wrapper.xsa # Reports open_run impl_1 -if {![file exists reports]} {exec mkdir reports} -report_timing_summary -delay_type min_max -report_unconstrained -check_timing_verbose -max_paths 100 -input_pins -routable_nets -name timing_1 -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_timing_report.txt +if {![file exists $PROJECT_NAME/reports]} {exec mkdir $PROJECT_NAME/reports} +report_timing_summary -delay_type min_max -report_unconstrained -check_timing_verbose -max_paths 100 -input_pins -routable_nets -name timing_1 -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_timing_report.txt report_utilization -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_utilization_report.txt -name utilization_1 report_power -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_power_1.txt -name {power_1} report_drc -name drc_1 -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_drc_1.txt -ruledecks {default opt_checks placer_checks router_checks bitstream_checks incr_eco_checks eco_checks abs_checks} diff --git a/run/example.py b/run/example.py new file mode 100644 index 0000000..e19d90a --- /dev/null +++ b/run/example.py @@ -0,0 +1,78 @@ +from deepsocflow import Bundle, Hardware, QModel, QInput + +''' +0. 
Specify Hardware +''' +hw = Hardware ( # Alternatively: hw = Hardware.from_json('hardware.json') + processing_elements = (8, 96) , # (rows, columns) of multiply-add units + frequency_mhz = 250 , # + bits_input = 4 , # bit width of input pixels and activations + bits_weights = 4 , # bit width of weights + bits_sum = 16 , # bit width of accumulator + bits_bias = 16 , # bit width of bias + max_batch_size = 64 , # + max_channels_in = 2048 , # + max_kernel_size = 13 , # + max_image_size = 512 , # + ram_weights_depth = 20 , # + ram_edges_depth = 288 , # + axi_width = 64 , # + target_cpu_int_bits = 32 , # + valid_prob = 0.1 , # probability with which the AXI-Stream s_valid signal is toggled in simulation + ready_prob = 0.1 , # probability with which the AXI-Stream m_ready signal is toggled in simulation + data_dir = 'vectors', # directory to store generated test vectors + ) +hw.export() # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json +hw.export_vivado_tcl(board='zcu104') + + +''' +1. Build Model +''' +XN = 1 +input_shape = (XN,18,18,3) # (XN, XH, XW, CI) + +QINT_BITS = 0 +kq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)' +bq = f'quantized_bits({hw.B_BITS},{QINT_BITS},False,True,1)' +q1 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)' +q2 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,False,1)' +q3 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,True,1)' +q4 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)' + +x = x_in = QInput(shape=input_shape[1:], batch_size=XN, hw=hw, int_bits=QINT_BITS, name='input') + +x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) +x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q2}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':q3}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) +x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) +x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1},)(x) +x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, flatten= True)(x) +x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4})(x) + +model = QModel(inputs=x_in.raw, outputs=x) +model.compile() +model.summary() + +''' +2. TRAIN (using qkeras) +''' +# model.fit(...) + + +''' +3. 
EXPORT FOR INFERENCE +''' +SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado +# SIM, SIM_PATH = 'verilator', "" # For Verilator + +model.export_inference(x=model.random_input, hw=hw) # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin +model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation + +''' +4. IMPLEMENTATION + +a. FPGA: Open vivado, source vivado_flow.tcl +b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl +c. Compile C firmware with generated header (config_fw.h) and run on device +''' \ No newline at end of file diff --git a/run/param_test.py b/run/param_test.py index 8624ff7..bf77e25 100644 --- a/run/param_test.py +++ b/run/param_test.py @@ -1,26 +1,12 @@ -import numpy as np import os import pytest import itertools -from qkeras import * -from tensorflow.keras.layers import Input import sys sys.path.append("../../") -import deepsocflow -from deepsocflow import Bundle, Hardware - +from deepsocflow import Bundle, Hardware, QModel, QInput # Simulator: xsim on windows, verilator otherwise -if os.name=='nt': - SIM = 'xsim' - SIM_PATH = "F:/Xilinx/Vivado/2022.1/bin/" #os.path.join("F:", "Xilinx", "Vivado", "2022.1", "bin") -else: - SIM = 'verilator' - SIM_PATH = '' - - -keras.utils.set_random_seed(0) -type_d = { 'np': {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} } +(SIM, SIM_PATH) = ('xsim', "F:/Xilinx/Vivado/2022.1/bin/") if os.name=='nt' else ('verilator', '') def product_dict(**kwargs): for instance in itertools.product(*(kwargs.values())): @@ -47,7 +33,7 @@ def product_dict(**kwargs): ))) def test_dnn_engine(PARAMS): ''' - 0. Specify Hardware + 0. SPECIFY HARDWARE ''' hw = Hardware (**PARAMS) hw.export_json() @@ -55,304 +41,41 @@ def test_dnn_engine(PARAMS): hw.export() # Generates: config_hw.svh, config_hw.tcl hw.export_vivado_tcl(board='zcu104') - - xq, kq, bq = f'quantized_bits({hw.X_BITS},0,False,True,1)', f'quantized_bits({hw.K_BITS},0,False,True,1)', f'quantized_bits({hw.B_BITS},0,False,True,1)' - inp = {'bits':hw.X_BITS, 'frac':hw.X_BITS-1} - ''' - 1. Build Model + 1. 
BUILD MODEL ''' - input_shape = (8,18,18,3) # (XN, XH, XW, CI) - x = x_in = Input(input_shape[1:], name='input') - x = QActivation(xq)(x) - - x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0)' }, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) - x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)' }, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)' }, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0.125)'}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) - x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0)' },)(x) - x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0.125)'}, flatten= True)(x) - x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({hw.X_BITS},0,negative_slope=0.125)'})(x) + XN = 8 + input_shape = (XN,18,18,3) # (XN, XH, XW, CI) - model = Model(inputs=x_in, outputs=x) + QINT_BITS = 0 + kq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)' + bq = f'quantized_bits({hw.B_BITS},{QINT_BITS},False,True,1)' + q1 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)' + q2 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,False,1)' + q3 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,True,1)' + q4 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)' + x = x_in = QInput(shape=input_shape[1:], batch_size=XN, hw=hw, int_bits=QINT_BITS, name='input') - ''' - Pass Floating Point & Fixed Point Input - ''' - x = np.clip(np.random.randn(*input_shape), -1.0, 1.0) - y = model(x) + x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x) + x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q2}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) + x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 
'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':q3}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2) + x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1) + x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1},)(x) + x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, flatten= True)(x) + x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4})(x) - inp_act_model = Model(inputs=model.input, outputs=model.layers[1].output) - inp['tensor'] = inp_act_model(x, training=False) - inp['int'] = inp['tensor'].numpy() * 2**inp['frac'] + model = QModel(inputs=x_in.raw, outputs=x) + model.compile() + model.summary() ''' - Clean the data directory + 2. TRAIN MODEL ''' - os.makedirs(hw.DATA_DIR, exist_ok=True) - for file in os.scandir(hw.DATA_DIR): - os.remove(file.path) - - bundles = model.layers[2:] - - - ''' - Export - ''' - buffer_map = [] - for ib, b in enumerate(bundles): - print(f'-----------------{b.idx}-----------------------') - b.process(inp if b.idx==0 else None, hw) - b.export(hw, False) #ib==len(bundles)-1 - - ''' - Buffer allocation for add bundle - ''' - print(f'input_map:{buffer_map}') - - '''Find and assign a free buffer. If not, add new buffer''' - b.add_out_buffer_idx = -1 - if len(b.add_tensor_dest) != 0: - for im in range(len(buffer_map)): - if buffer_map[im] is None: - buffer_map[im] = {'in':ib, 'out':b.add_tensor_dest} - b.add_out_buffer_idx = im - break - else: #m if break is not hit - b.add_out_buffer_idx = len(buffer_map) - buffer_map += [{'in':ib, 'out':b.add_tensor_dest}] - - print('add_out_buffer_idx:', b.add_out_buffer_idx) - - '''Free the buffers whose last destination is current bundle''' - for im in range(len(buffer_map)): - buf = buffer_map[im] - if buf is not None: - if buf['out'][-1] == ib: - buffer_map[im] = None - - print(f'output_map:{buffer_map}') - + # model.fit(...) ''' - Write Runtime Headers + 3. 
EXPORT FOR INFERENCE ''' - x_bytes_all = x_bytes = w_bytes = b_words = x_bytes_max = nhwc_words_max = o_bytes_max = o_words_max = 0 - out_buffer_idx = 1 - with open (f'./config_fw.h', 'w') as ch: - - ch.write(f"#define N_BUNDLES {len(bundles)}\n") - ch.write(f"Bundle_t bundles [N_BUNDLES] = {{\n") - - for ib, b in enumerate(bundles): - w_bpt = (hw.K_BITS*b.we[-1][0].size + hw.IN_BITS)//8 - w_bpt_p0 = (hw.K_BITS*b.we[0][0].size + hw.IN_BITS )//8 - x_bpt = (hw.X_BITS*b.xe[-1].size + hw.IN_BITS )//8 - x_bpt_p0 = (hw.X_BITS*b.xe[0].size + hw.IN_BITS )//8 - - if ib == len(bundles)-1: - o_words_b = b.o_int.size - o_bytes_b = o_words_b*4 # int or float - o_words = o_words_b - else: - b_next = bundles[ib+1] - o_wpt = b_next.xe[-1].size - o_wpt_p0 = b_next.xe[0].size - o_words_b = o_wpt_p0 + (b_next.r.CP-1)*o_wpt - - o_bpt = (hw.X_BITS*b_next.xe[-1].size + hw.IN_BITS)//8 - o_bpt_p0 = (hw.X_BITS*b_next.xe[0].size + hw.IN_BITS)//8 - o_bytes_b = o_bpt_p0 + (b_next.r.CP-1)*o_bpt - - xp_words = b.r.XN * b.r.XL * b.r.XW * (hw.ROWS+hw.X_PAD) - - w_bytes_b = (w_bpt_p0 + (b.r.CP-1)*w_bpt)*b.r.IT - x_bytes_b = (x_bpt_p0 + (b.r.CP-1)*x_bpt) - nhwc_words_b = b.r.XN * b.r.XH * b.r.XW * b.r.CO - - x_bytes_max = max(x_bytes_max, x_bytes_b) - nhwc_words_max = max(nhwc_words_max, nhwc_words_b) - o_bytes_max = max(o_bytes_max, o_bytes_b) - o_words_max = max(o_words_max, o_words_b) - w_bytes += w_bytes_b - x_bytes_all += x_bytes_b - - if ib == 0: - x_bytes = (x_bpt_p0 + (b.r.CP-1)*x_bpt) - - y_coe = b.r.CO_PRL - y_coe_tl = b.r.CO_PRL if (b.r.CO==b.r.IT*b.r.CO_PRL) else b.r.CO%b.r.IT - y_r_ll = hw.ROWS if b.r.XH==b.r.XL*hw.ROWS else b.r.XH % hw.ROWS - - ca_nzero, ca_shift, ca_pl_scale = b.core['act']['non_zero'], b.core['act']['shift_bits'], b.core['act']['plog_slope'] - - add_act_shift = b.add['act']['shift_bits'] if b.add is not None else 0 - add_out_buffer_idx = b.add_out_buffer_idx - add_in_buffer_idx = b.add['bundle'].add_out_buffer_idx if b.add is not None else -1 - - if b.pool is None: - pool_type = 'POOL_NONE' - elif b.pool['type'] == 'max': - pool_type = 'POOL_MAX' - elif b.pool['type'] == 'avg': - pool_type = 'POOL_AVG' - pool_act_shift = b.pool['act']['shift_bits'] if b.pool is not None else 0 - - out_buffer_idx = 1*(not out_buffer_idx) if ib != len(bundles)-1 else -1 # alternate between 0 and 1 - - ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<4}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, ") - ch.write( f".w_bpt={w_bpt:<5}, .w_bpt_p0={w_bpt_p0:<5}, .x_bpt={x_bpt:<5}, .x_bpt_p0={x_bpt_p0:<5}, .o_words={o_words_b:<5}, .o_bytes={o_bytes_b:<5}, ") - ch.write( f".out_buffer_idx={out_buffer_idx:<2}, .add_out_buffer_idx={add_out_buffer_idx:<2}, .add_in_buffer_idx={add_in_buffer_idx:<2}, ") - ch.write( f".is_bias={1*(b.b is not None):<3}, .is_flatten={1*b.flatten:<3}, ") - ch.write( f".b_offset={b_words:<3}, .b_val_shift={b.bias_val_shift:<3}, .b_bias_shift={b.bias_b_shift:<3}, ") - ch.write( f".ca_nzero={ca_nzero:<3}, .ca_shift={ca_shift:<3}, .ca_pl_scale={ca_pl_scale:<3}, .add_act_shift={add_act_shift:<3}, .pool_act_shift={pool_act_shift:<3}, ") - ch.write( f".csh={b.r.CSH:<3}, .ch={b.r.CYH:<3}, .csh_shift={b.r.CSH_SHIFT:<3}, .pkh={b.r.PKH:<3}, .psh={b.r.PSH:<3}, .ph={b.r.PYH:<3}, .psh_shift={b.r.PSH_SHIFT:<3}, .csw={b.r.CSW:<3}, .cw={b.r.CYW:<3}, .csw_shift={b.r.CSW_SHIFT:<3}, 
.pkw={b.r.PKW:<3}, .psw={b.r.PSW:<3}, .pw={b.r.PYW:<3}, .psw_shift={b.r.PSW_SHIFT:<3}, .pool={pool_type:<10}, .on={b.r.ON:<3}, .oh={b.r.OH:<3}, .ow={b.r.OW:<3}, .oc={b.r.OC:<3}, ") - ch.write( f".x_header={b.r.x_header_le_p[-1][0]:>23}u, .x_header_p0={b.r.x_header_le_p[0][0]:>23}u, .w_header={b.r.w_header_le_p[-1][0]:>23}u, .w_header_p0={b.r.x_header_le_p[0][0]:>25}u , ") - ch.write( f".debug_nhwc_words={b.oe_exp_nhwc.size:<5} }}") - - b_words += b.be.size if b.b else 0 - if b.idx != len(bundles)-1: - ch.write(',\n') - - ''' Bit masks for X_BITS ''' - - - ch.write(f"\n}};\n\n") - ch.write(f"#define X_BITS_L2 {int(np.log2(hw.X_BITS))}\n") - ch.write(f"#define W_BITS_L2 {int(np.log2(hw.K_BITS))}\n") - ch.write(f"#define X_PAD {hw.X_PAD}\n") - ch.write(f"#define KH_MAX {hw.KH_MAX}\n") - ch.write(f"#define PE_ROWS {hw.ROWS}\n") - ch.write(f"#define PE_COLS {hw.COLS}\n\n") - - ch.write(f"#define N_ADD_BUF {len(buffer_map) if len(buffer_map) > 0 else ''}\n") - ch.write(f"#define WB_BYTES {w_bytes + (b_words*hw.B_BITS)//8}\n") - ch.write(f"#define W_BYTES {w_bytes}\n") - ch.write(f"#define X_BYTES {x_bytes}\n") - ch.write(f"#define O_WORDS {o_words}\n") - ch.write(f"#define O_WORDS_MAX {o_words_max}\n") - ch.write(f"#define O_BYTES_MAX {o_bytes_max}\n") - ch.write(f"#define X_BYTES_ALL {x_bytes_all}\n") - ch.write(f"#define NHWC_WORDS {nhwc_words_max}\n") - ch.write(f"#define B_TYPE int{hw.B_BITS}_t\n") - ch.write(f"#define B_WORDS {b_words}\n") - ch.write(f'#define DATA_DIR "../{hw.DATA_DIR}"\n\n') - - mask_nums = [(2**hw.X_BITS-1) << (p*hw.X_BITS) for p in range(8//hw.X_BITS)] - mask_nums = ~np.array(mask_nums, dtype=np.uint8) - ch.write(f"static const uint8_t X_POSITION_INVERTED_MASKS [] = {{ {', '.join([str(n) for n in mask_nums])} }};\n") - - ''' - Write Binary Files - ''' - w_bitstring = b'' - x_bitstring = b'' - b_bitstring = b'' - for ib, b in enumerate(bundles): - x_bitstring_b = b'' - if b.b: - b_bitstring += b.be.astype(type_d['np'][hw.B_BITS]).tobytes() - for ip in range(b.r.CP): - xe = Bundle.pack_words_into_bytes(arr=b.xe[ip].flatten(), bits=hw.X_BITS) - x_bitstring_b += b.r.x_header_be_p[ip!=0].tobytes() + xe.tobytes() - - for it in range(b.r.IT): - we = Bundle.pack_words_into_bytes(arr=b.we[ip][it].flatten(), bits=hw.K_BITS) - w_bitstring += b.r.w_header_be_p[ip!=0].tobytes() + we.tobytes() - x_bitstring += x_bitstring_b - with open(f"{hw.DATA_DIR}/{ib}_x_sim.bin", 'wb') as f: - f.write(x_bitstring_b) - if ib==0: - with open(f"{hw.DATA_DIR}/x.bin", 'wb') as f: - f.write(x_bitstring_b) - - with open(f"{hw.DATA_DIR}/w.bin", 'wb') as f: - f.write(w_bitstring + b_bitstring) - - with open(f"{hw.DATA_DIR}/x_all.bin", 'wb') as f: - f.write(x_bitstring) - - - ''' - Write Text files of vectors - ''' - for b in bundles: - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_exp.txt", b.oe_exp_nhwc.flatten(), fmt='%d') - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_xe.txt", np.concatenate([a.flatten() for a in b.xe]), fmt='%d') - for ip in range(b.r.CP): - CM_p = b.r.CM_0 if ip==0 else b.r.CM - x_config = b.r.x_header_le_p[ip!=0][0] - x_config = format(x_config, f'#0{hw.IN_BITS}b') - x_config_words = [int(x_config[i:i+hw.X_BITS], 2) for i in range(0, len(x_config), hw.X_BITS)] - x_config_words.reverse() - x_config_words = np.array(x_config_words, dtype=np.int8) - - xp = b.xe[ip].flatten() - xp = np.concatenate([x_config_words, xp], axis=0) - assert xp.shape == (hw.IN_BITS/hw.X_BITS +b.r.XN*b.r.XL*b.r.XW*CM_p*(hw.ROWS+hw.X_PAD),) - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_x.txt", xp, fmt='%d') - - - for it in 
range(b.r.IT): - - w_config = b.r.w_header_le_p[ip!=0][0] - w_config = format(w_config, f'#0{hw.IN_BITS}b') - w_config_words = [int(w_config[i:i+hw.K_BITS], 2) for i in range(0, len(w_config), hw.K_BITS)] - w_config_words.reverse() - w_config_words = np.array(w_config_words,dtype=np.int8) - - wp = b.we[ip][it].flatten() - wp = np.concatenate([w_config_words, wp], axis=0) - assert wp.shape == (hw.IN_BITS/hw.K_BITS + (CM_p*b.r.KH+hw.CONFIG_BEATS)*hw.COLS,) - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_w.txt", wp, fmt='%d') - - np.savetxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_exp.txt", b.ye_exp_p[ip][it].flatten(), fmt='%d') - print(f'Weights, inputs, outputs saved to {hw.DATA_DIR}/ib_ip_it_*.txt') - - - ''' - RUN SIMULATION - ''' - hw.simulate(SIM=SIM, SIM_PATH=SIM_PATH) - - - ''' - CHECK ERROR - ''' - for ib, b in enumerate(bundles): - - ''' Verify raw output ''' - for ip in range(b.r.CP): - for it in range(b.r.IT): - y_raw_exp = b.ye_exp_p[ip][it] - y_raw_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_{ip}_{it}_y_raw_sim.txt", np.int32).reshape(y_raw_exp.shape) - error = np.sum(np.abs(y_raw_exp-y_raw_sim)) - assert error == 0, f"Error={error}, for y_raw_sim at {b.idx=}_{ip=}_{it=}" - - ''' Verify sum output ''' - y_sum_exp = b.oe_sum_exp - y_sum_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_sum_sim.txt", np.int32).reshape(y_sum_exp.shape) - error = np.sum(np.abs(y_sum_exp-y_sum_sim)) - assert error == 0, f"Error={error}, for y_sum_sim at {b.idx=}" - - ''' Verify processed output HWC''' - y_nhwc_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_nhwc_sim.txt",np.int32).reshape(b.oe_exp_nhwc.shape) - error = np.sum(np.abs(y_nhwc_sim - b.oe_exp_nhwc)) - assert error == 0, f"sim:\n{y_nhwc_sim[0,:,:,0]}\n exp:\n{b.oe_exp_nhwc[0,:,:,0]}\n input:\n{b.before_pool[0,:,:,0] if b.pool else None}" - - ''' Verify tiled output''' - y_tiled_exp = b.o_int if ib == len(bundles)-1 else np.concatenate([a.flatten() for a in bundles[ib+1].xe]) - y_tiled_sim = np.loadtxt(f"{hw.DATA_DIR}/{b.idx}_y_tiled_sim.txt", np.int32).reshape(y_tiled_exp.shape) - error = np.sum(np.abs(y_tiled_sim-y_tiled_exp)) - assert error == 0, f"Error={error}, for y_tiled_sim at {b.idx=}" - - ''' Verify packed output''' - if ib != len(bundles)-1: - with open(f'{hw.DATA_DIR}/{ib}_y_packed_sim.bin', 'rb') as f_sim, open(f'{hw.DATA_DIR}/{ib+1}_x_sim.bin', 'rb') as f_exp: - y_packed_sim = np.frombuffer(f_sim.read(), dtype=np.uint8) - y_packed_exp = np.frombuffer(f_exp.read(), dtype=np.uint8) - error = np.sum(np.abs(y_packed_sim-y_packed_exp)) - assert error == 0, f"Error={error}, for y_packed_sim at {b.idx=}, y_packed_sim=\n{y_packed_sim[:100]} \n y_packed_exp=\n{y_packed_exp[:100]}\n" - - print(f"Bundle {b.idx}, Error: {error}") \ No newline at end of file + model.export_inference(x=model.random_input, hw=hw) + model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH) \ No newline at end of file diff --git a/run/work/config_hw.svh b/run/work/config_hw.svh index 6e99d35..aee0e25 100644 --- a/run/work/config_hw.svh +++ b/run/work/config_hw.svh @@ -5,7 +5,7 @@ `define COLS 24 // PE cols, constrained by resources `define X_BITS 4 // Bits per word in input `define K_BITS 4 // Bits per word in input -`define Y_BITS 24 // Bits per word in output of conv +`define Y_BITS 16 // Bits per word in output of conv `define KH_MAX 13 // max of kernel height, across layers `define KW_MAX 13 // max of kernel width, across layers diff --git a/run/work/config_hw.tcl b/run/work/config_hw.tcl index 685521b..d0693d5 100644 --- a/run/work/config_hw.tcl +++ b/run/work/config_hw.tcl @@ -3,7 
+3,7 @@ set FREQ 250 set ROWS 8 -set COLS 96 +set COLS 24 set X_BITS 4 set K_BITS 4 set Y_BITS 16 diff --git a/run/work/hardware.json b/run/work/hardware.json index 471e523..2634367 100644 --- a/run/work/hardware.json +++ b/run/work/hardware.json @@ -6,18 +6,12 @@ "frequency_mhz": 250, "bits_input": 4, "bits_weights": 4, - "bits_sum": 24, + "bits_sum": 16, "bits_bias": 16, "max_batch_size": 64, "max_channels_in": 2048, - "max_kernel_size": [ - 13, - 13 - ], - "max_image_size": [ - 512, - 512 - ], + "max_kernel_size": 13, + "max_image_size": 512, "ram_weights_depth": 20, "ram_edges_depth": 288, "axi_width": 64,