Updated API

KastnerRG · Nov 20, 2023 · 9c4e77e · 9c4e77e
1 parent 8d0dd35
commit 9c4e77e
Show file tree

Hide file tree

Showing 11 changed files with 495 additions and 380 deletions.
diff --git a/README.md b/README.md
@@ -15,74 +15,86 @@ We present a highly flexible, high performance accelerator system that can be ad
 
 ![System](docs/overall.png)
 
-## User API (WIP)
+## User API
 
 ```py
-from deepsocflow import Hardware, Bundle, QInput, BundleModel, QConvCore, QDenseCore, QAdd, QPool, Softmax, QLeakyReLu
+from deepsocflow import Bundle, Hardware, QModel, QInput
 
 '''
 0. Specify Hardware
 '''
-hw = Hardware (
-        processing_elements = (8, 96),
-        frequency           = 1000,
-        bits_input          = 8,
-        bits_weights        = 4,
-        bits_sum            = 24,
-        bits_bias           = 16,
-        max_kernel_size     = (13, 13),
-        max_channels_in     = 512,
-        max_channels_out    = 512,
-        max_image_size      = (32,32),
+hw = Hardware (                          # Alternatively: hw = Hardware.from_json('hardware.json')
+        processing_elements = (8, 96)  , # (rows, columns) of multiply-add units
+        frequency_mhz       = 250      , # clock frequency in MHz
+        bits_input          = 4        , # bit width of input pixels and activations
+        bits_weights        = 4        , # bit width of weights
+        bits_sum            = 16       , # bit width of accumulator
+        bits_bias           = 16       , # bit width of bias
+        max_batch_size      = 64       , # 
+        max_channels_in     = 2048     , #
+        max_kernel_size     = 13       , #
+        max_image_size      = 512      , #
+        ram_weights_depth   = 20       , #
+        ram_edges_depth     = 288      , #
+        axi_width           = 64       , #
+        target_cpu_int_bits = 32       , #
+        valid_prob          = 0.1      , # probability with which the AXI-Stream s_valid signal is toggled in simulation
+        ready_prob          = 0.1      , # probability with which the AXI-Stream m_ready signal is toggled in simulation
+        data_dir            = 'vectors', # directory to store generated test vectors
      )
-hw.export() # Generates: config_hw.svh, config_hw.tcl, config_hw.json
-# Alternatively: hw = Hardware.from_json('config_hw.json')
+hw.export() # Generates: config_hw.svh, config_hw.tcl, config_tb.svh, hardware.json
+hw.export_vivado_tcl(board='zcu104')
+
 
 '''
 1. Build Model 
 '''
-x = QInput( input_shape= (8,32,32,3), hw= hw, input_frac_bits= 4)
-
-x = Bundle( core= QConvCore(filters= 32, kernel_size= (7,7), strides= (2,2), padding= 'same', weights_frac_bits= 4, bias_frac_bits= 8, activation= QLeakyReLu(negative_slope=0.125, frac_bits= 4 )),
-            pool= QPool(type= 'max', size= (3,3), strides= (1,1), padding= 'same', frac_bits= 4)
-            )(x)
-x_skip = x
-x = Bundle( core= QConvCore(filters= 64, kernel_size= (3,3), weights_frac_bits= 4, bias_frac_bits= 8, activation= QLeakyReLu(negative_slope=0, frac_bits= 4)),
-            pool= QAdd(x_skip), # Residual addition
-            flatten= True,
-            )(x)
-x = Bundle( dense= QDenseCore(outputs= 10, weights_frac_bits= 4, bias_frac_bits= 8, activation= Softmax()),
-            )(x)
-model = BundleModel(inputs=x_in, outputs=x)
-# Alternatively: model = BundleModel.from_json('config_model.json')
+XN = 1
+input_shape = (XN,18,18,3) # (XN, XH, XW, CI)
+
+QINT_BITS = 0
+kq = f'quantized_bits({hw.K_BITS},{QINT_BITS},False,True,1)'
+bq = f'quantized_bits({hw.B_BITS},{QINT_BITS},False,True,1)'
+q1 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0)'    
+q2 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,False,1)'       
+q3 = f'quantized_bits({hw.X_BITS},{QINT_BITS},False,True,1)'        
+q4 = f'quantized_relu({hw.X_BITS},{QINT_BITS},negative_slope=0.125)'
+
+x = x_in = QInput(shape=input_shape[1:], batch_size=XN, hw=hw, int_bits=QINT_BITS, name='input')
+
+x = x_skip1 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1}, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({hw.X_BITS},0,False,False,1)'})(x)
+x = x_skip2 = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q2}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1)
+x =           Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':q3}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip2)
+x =           Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, add = {'act_str':f'quantized_bits({hw.X_BITS},0,False,True,1)'})(x, x_skip1)
+x =           Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q1},)(x)
+x =           Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4}, flatten= True)(x)
+x =           Bundle( core= {'type':'dense', 'units'  :10,                                                           'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':q4})(x)
+
+model = QModel(inputs=x_in.raw, outputs=x)
+model.compile()
+model.summary()
 
 '''
 2. TRAIN (using qkeras)
 '''
-model.compile(...)
-model.fit(...)
-model.export() # Generates: savedmodel, config_model.json
+# model.fit(...)
+
 
 '''
 3. EXPORT FOR INFERENCE
-
-- Runs forward pass in float32, records intermediate tensors
-- Runs forward pass in integer, comparing with float32 pass for zero error
-- Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation
-- Prints performance estimate (time, latency)
-- Generates 
-      - config_firmware.h
-      - weights.bin
-      - expected.bin
 '''
-model.export_inference(x=model.random_input) # -> config_firmware.h, weights.bin
+SIM, SIM_PATH = 'xsim', "F:/Xilinx/Vivado/2022.1/bin/" # For Xilinx Vivado
+# SIM, SIM_PATH = 'verilator', "" # For Verilator
+
+model.export_inference(x=model.random_input, hw=hw)  # Runs forward pass in float & int, compares them. Generates: config_fw.h (C firmware), weights.bin, expected.bin
+model.verify_inference(SIM=SIM, SIM_PATH=SIM_PATH)   # Runs SystemVerilog testbench with the model & weights, randomizing handshakes, testing with actual C firmware in simulation
 
 '''
 4. IMPLEMENTATION
 
-a. FPGA: Run vivado.tcl
+a. FPGA: Open Vivado and source vivado_flow.tcl
 b. ASIC: Set PDK paths, run syn.tcl & pnr.tcl
-c. Compile C firmware with generated header (model.h) and run on device
+c. Compile C firmware with generated header (config_fw.h) and run on device
 '''
 ```
 

diff --git a/deepsocflow/py/__init__.py b/deepsocflow/py/__init__.py
@@ -1,3 +1,5 @@
 from . import hardware, bundle
 from .hardware import *
-from .bundle import *
+from .bundle import *
+from .model import *
+from .layers import *
diff --git a/deepsocflow/py/layers.py b/deepsocflow/py/layers.py
@@ -1,20 +1,13 @@
 from qkeras import QActivation
-from tensorflow.keras.layers import Input, Flatten, Add, MaxPooling2D
+from tensorflow.keras.layers import Layer, Input, Flatten, Add, MaxPooling2D
 import numpy as np
 
-# class QInput(Input):
-#     def __init__(self, shape, batch_size, hw, frac_bits, name=None):
-
-#         self.hw = hw
-#         self.input_frac_bits = input_frac_bits
-#         super().__init__(shape=shape, name=name)
-
-#         int_bits = hw.X_BITS - self.frac_bits + 1
-
-#         x = Input(shape=shape, batch_size=batch_size, name=name)
-#         x = QActivation(f'quantized_bits(bits={hw.X_BITS}, integer={int_bits}, False,True,1)')(x)
-
-#         return x
+def QInput(shape, batch_size, hw, int_bits, name=None):
+    x_raw = Input(shape=shape, batch_size=batch_size, name=name)
+    x = QActivation(f'quantized_bits({hw.X_BITS},{int_bits},False,True,1)')(x_raw)
+    x.raw = x_raw
+    x.hw = hw
+    return x