Quantize & LRELU in Runtime
Aba committed Sep 16, 2023
1 parent 6e96871 commit 3adc71f
Showing 4 changed files with 40 additions and 21 deletions.
14 changes: 7 additions & 7 deletions c/model.h
@@ -1,12 +1,12 @@
#define N_BUNDLES 7
Bundle_t bundles [N_BUNDLES] = {
{.n=8, .l=2, .kw=11, .coe=2, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=3, .t=8, .p=3, .cm=1, .cm_p0=1, .w_bpt=140, .w_bpt_p0=140, .x_bpt=840, .x_bpt_p0=840, .is_bias=1, .b_offset=0, .b_val_shift=9, .b_bias_shift=0, .x_header=414341061322735616, .x_header_p0=414341061322735616, .w_header=414587437826703360, .w_header_p0=414341061322735616 },
{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=1, .p=1, .cm=20, .cm_p0=16, .w_bpt=200, .w_bpt_p0=200, .x_bpt=13320, .x_bpt_p0=13320, .is_bias=0, .b_offset=16, .b_val_shift=0, .b_bias_shift=0, .x_header=8700964375684448256, .x_header_p0=8700964375684448256, .w_header=8701210795138088960, .w_header_p0=8700964375684448256 },
{.n=8, .l=2, .kw=7, .coe=3, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=5, .t=6, .p=8, .cm=2, .cm_p0=2, .w_bpt=176, .w_bpt_p0=176, .x_bpt=1672, .x_bpt_p0=1672, .is_bias=1, .b_offset=16, .b_val_shift=9, .b_bias_shift=0, .x_header=846686625550303232, .x_header_p0=846686625550303232, .w_header=846933027824074752, .w_header_p0=846686625550303232 },
{.n=8, .l=2, .kw=5, .coe=4, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=6, .t=4, .p=4, .cm=4, .cm_p0=4, .w_bpt=248, .w_bpt_p0=248, .x_bpt=3336, .x_bpt_p0=3336, .is_bias=0, .b_offset=34, .b_val_shift=0, .b_bias_shift=0, .x_header=1927550536119222272, .x_header_p0=1927550536119222272, .w_header=1927796989932601344, .w_header_p0=1927550536119222272 },
{.n=8, .l=2, .kw=3, .coe=8, .coe_tl=8, .r_ll=8, .h=16, .w=8, .w_kw2=7, .t=3, .p=3, .cm=6, .cm_p0=4, .w_bpt=224, .w_bpt_p0=152, .x_bpt=5000, .x_bpt_p0=3336, .is_bias=1, .b_offset=34, .b_val_shift=9, .b_bias_shift=0, .x_header=3008414446688141312, .x_header_p0=1855492942081294336, .w_header=3008660883321651200, .w_header_p0=1855492942081294336 },
{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=3, .p=2, .cm=20, .cm_p0=4, .w_bpt=248, .w_bpt_p0=56, .x_bpt=16648, .x_bpt_p0=3336, .is_bias=0, .b_offset=58, .b_val_shift=0, .b_bias_shift=0, .x_header=11006807384898142208, .x_header_p0=1783435348043366400, .w_header=11007053838711521280, .w_header_p0=1783435348043366400 },
{.n=1, .l=1, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=8, .w=1, .w_kw2=1, .t=1, .p=320, .cm=20, .cm_p0=20, .w_bpt=248, .w_bpt_p0=248, .x_bpt=138, .x_bpt_p0=138, .is_bias=1, .b_offset=58, .b_val_shift=9, .b_bias_shift=0, .x_header=10952754293765046272, .x_header_p0=10952754293765046272, .w_header=10952754456973803520, .w_header_p0=10952754293765046272 }
{.n=8, .l=2, .kw=11, .coe=2, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=3, .t=8, .p=3, .cm=1, .cm_p0=1, .w_bpt=140, .w_bpt_p0=140, .x_bpt=840, .x_bpt_p0=840, .is_bias=1, .b_offset=0, .b_val_shift=9, .b_bias_shift=0, .ca_nzero=1, .ca_shift=15, .ca_pl_scale=3, .x_header=414341061322735616, .x_header_p0=414341061322735616, .w_header=414587437826703360, .w_header_p0=414341061322735616 },
{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=1, .p=1, .cm=20, .cm_p0=16, .w_bpt=200, .w_bpt_p0=200, .x_bpt=13320, .x_bpt_p0=13320, .is_bias=0, .b_offset=16, .b_val_shift=0, .b_bias_shift=0, .ca_nzero=1, .ca_shift=3, .ca_pl_scale=0, .x_header=8700964375684448256, .x_header_p0=8700964375684448256, .w_header=8701210795138088960, .w_header_p0=8700964375684448256 },
{.n=8, .l=2, .kw=7, .coe=3, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=5, .t=6, .p=8, .cm=2, .cm_p0=2, .w_bpt=176, .w_bpt_p0=176, .x_bpt=1672, .x_bpt_p0=1672, .is_bias=1, .b_offset=16, .b_val_shift=9, .b_bias_shift=0, .ca_nzero=1, .ca_shift=12, .ca_pl_scale=0, .x_header=846686625550303232, .x_header_p0=846686625550303232, .w_header=846933027824074752, .w_header_p0=846686625550303232 },
{.n=8, .l=2, .kw=5, .coe=4, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=6, .t=4, .p=4, .cm=4, .cm_p0=4, .w_bpt=248, .w_bpt_p0=248, .x_bpt=3336, .x_bpt_p0=3336, .is_bias=0, .b_offset=34, .b_val_shift=0, .b_bias_shift=0, .ca_nzero=1, .ca_shift=6, .ca_pl_scale=3, .x_header=1927550536119222272, .x_header_p0=1927550536119222272, .w_header=1927796989932601344, .w_header_p0=1927550536119222272 },
{.n=8, .l=2, .kw=3, .coe=8, .coe_tl=8, .r_ll=8, .h=16, .w=8, .w_kw2=7, .t=3, .p=3, .cm=6, .cm_p0=4, .w_bpt=224, .w_bpt_p0=152, .x_bpt=5000, .x_bpt_p0=3336, .is_bias=1, .b_offset=34, .b_val_shift=9, .b_bias_shift=0, .ca_nzero=1, .ca_shift=15, .ca_pl_scale=3, .x_header=3008414446688141312, .x_header_p0=1855492942081294336, .w_header=3008660883321651200, .w_header_p0=1855492942081294336 },
{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=3, .p=2, .cm=20, .cm_p0=4, .w_bpt=248, .w_bpt_p0=56, .x_bpt=16648, .x_bpt_p0=3336, .is_bias=0, .b_offset=58, .b_val_shift=0, .b_bias_shift=0, .ca_nzero=1, .ca_shift=6, .ca_pl_scale=3, .x_header=11006807384898142208, .x_header_p0=1783435348043366400, .w_header=11007053838711521280, .w_header_p0=1783435348043366400 },
{.n=1, .l=1, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=8, .w=1, .w_kw2=1, .t=1, .p=320, .cm=20, .cm_p0=20, .w_bpt=248, .w_bpt_p0=248, .x_bpt=138, .x_bpt_p0=138, .is_bias=1, .b_offset=58, .b_val_shift=9, .b_bias_shift=0, .ca_nzero=1, .ca_shift=15, .ca_pl_scale=3, .x_header=10952754293765046272, .x_header_p0=10952754293765046272, .w_header=10952754456973803520, .w_header_p0=10952754293765046272 }
};

#define X_BITS_L2 2
29 changes: 22 additions & 7 deletions c/runtime.h
@@ -1,12 +1,22 @@
#include <stdio.h>

#ifdef VERILATOR
#define EXT_C "C"
#else
#define EXT_C
#endif

typedef struct {
const int n, l, kw, coe, coe_tl, r_ll, h, w, w_kw2, t, p, cm, cm_p0;
const int w_bpt, w_bpt_p0, x_bpt, x_bpt_p0; // bytes per transfer
const char is_bias;
const int b_offset, b_val_shift, b_bias_shift;
const signed char ca_nzero, ca_shift, ca_pl_scale;
const unsigned long long x_header, x_header_p0, w_header, w_header_p0; // 64 bits (at least)
} Bundle_t;

#include "model.h"
#define X_BITS (1<<X_BITS_L2)

typedef struct {
char w [W_BYTES ];
@@ -16,13 +26,16 @@ typedef struct {
} Memory_st;
Memory_st mem;

#include <stdio.h>
#define clip(x, min, max) ((x < min) ? min : (x > max) ? max : x)
#define shift_round(n, s) ((n + (1<<(s-1)) - (~(n>>s)&1) ) >> s) // === np.around(n/2**s).astype(int)

#ifdef VERILATOR
#define EXT_C "C"
#else
#define EXT_C
#endif

static inline int quant_lrelu(int x, signed char nzero, signed char shift, signed char pl_scale){
x = ((x<0)*x)*nzero + (((x>0)*x) << pl_scale);
x = shift_round(x, shift);
x = clip(x, -(1<<(X_BITS-pl_scale-1)), (1<<(X_BITS-1))-1);
return x;
}


static inline void process_y(int val, int i_py, Bundle_t *p_bundle, int ip, int it_bias){
@@ -43,7 +56,9 @@ static inline void process_y(int val, int i_py, Bundle_t *p_bundle, int ip, int
if (p_bundle->is_bias)
val = (val << p_bundle->b_val_shift) + (mem.b[it_bias] << p_bundle->b_bias_shift);

// ------ RELU + QUANT ------
// ------ CORE ACT ------
val = quant_lrelu(val, p_bundle->ca_nzero, p_bundle->ca_shift, p_bundle->ca_pl_scale);


// ------ MAX/AVG POOL ------

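The new quant_lrelu above is the core of this commit: per-bundle quantization and leaky ReLU now run inside the C runtime, driven by the exported ca_nzero, ca_shift and ca_pl_scale fields. Below is a minimal standalone sketch (not part of the repository) of that arithmetic, assuming X_BITS = 4 (X_BITS_L2 = 2 in model.h) and using bundle 0's parameters (ca_nzero=1, ca_shift=15, ca_pl_scale=3); the main function and the sample inputs are illustrative only.

#include <stdio.h>

#define X_BITS 4  // assumption: 1 << X_BITS_L2, with X_BITS_L2 = 2 from model.h
#define clip(x, min, max) ((x < min) ? min : (x > max) ? max : x)
#define shift_round(n, s) ((n + (1<<(s-1)) - (~(n>>s)&1) ) >> s) // round half to even, like np.around

static inline int quant_lrelu(int x, signed char nzero, signed char shift, signed char pl_scale){
  x = ((x<0)*x)*nzero + (((x>0)*x) << pl_scale); // negative part kept (or zeroed), positive part pre-scaled
  x = shift_round(x, shift);                     // rescale down to the X_BITS fixed-point grid
  x = clip(x, -(1<<(X_BITS-pl_scale-1)), (1<<(X_BITS-1))-1);
  return x;
}

int main(void){
  // bundle 0 parameters from model.h: ca_nzero=1, ca_shift=15, ca_pl_scale=3
  printf("%d\n", quant_lrelu( 40000, 1, 15, 3)); // (40000<<3)/2^15 ~ 9.77 -> 10, clipped to 7
  printf("%d\n", quant_lrelu(-40000, 1, 15, 3)); // -40000/2^15 ~ -1.22 -> -1 (leaky slope of 2^-pl_scale)
  printf("%d\n", quant_lrelu(-40000, 0, 15, 3)); // ca_nzero=0 makes it a plain ReLU: 0
  return 0;
}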
10 changes: 6 additions & 4 deletions test/py/bundle.py
@@ -268,7 +268,6 @@ def add (p, p_frac, p_bits, q, q_frac, q_bits):
assert self.proc['bits'] <= c.INT_BITS, f"After bias addition, resulting bits {self.proc['bits']} are more than bits for integer in CPU {c.INT_BITS}. Reduce bits or increase integer bits of bias to continue"
else:
self.bias_val_shift, self.bias_b_shift = 0, 0
self.y_int_b = self.proc['int']


if 'strides' in self.core and self.core['strides'] != (1,1):
@@ -300,11 +299,14 @@ def apply_act(act_dict):
x = shift_round(x, shift_bits) # = np.around(x/2**shift_bits)
x = np.clip(x, -2**(bits-plog_slope-1), 2**(bits-1)-1).astype(int)

act_dict['shift_bits'] = shift_bits
self.proc['int'], self.proc['bits'], self.proc['frac'] = x, bits, frac
print(f'----------------------- shift:{shift_bits}, plog:{plog_slope}, nzero:{non_zero}')

apply_act(self.core['act'])
assert np.all(self.proc['int'] == self.core['tensor'].numpy() * 2**self.proc['frac']), f"Core + act output of bundle {self.idx} is not fixed point"

self.o_exp = self.proc['int']

if self.add is not None:
a = self.add['bundle']
@@ -424,10 +426,10 @@ def export (self, c):
w_int = self.w ['int'].reshape(1,1,CI,CO) # (CI,CO) -> (KH,KW,CI,CO)
x_int = self.inp['int'].reshape(1,XN,1,CI) # (XN,CI) -> (XN, XH, XW, CI)
y_int = self.y ['int'].reshape(1,XN,1,CO) # (XN,CI) -> (XN, XH, XW, CI)
p_int = self.y_int_b. reshape(1,XN,1,CO)
o_int = self.o_exp. reshape(1,XN,1,CO)
else:
y_int = self.y['int']
p_int = self.y_int_b
o_int = self.o_exp
w_int, x_int = self.w['int'], self.inp['int']

r = self.get_runtime_params(c, w_int.shape, x_int.shape, y_int.shape)
@@ -452,7 +454,7 @@

self.xe = self.reorder_x_q2e_conv(x_int, c, r)
self.ye_exp = self.reorder_y_q2e_conv(y_int, c, r)
self.pe_exp = self.reorder_y_q2e_conv(p_int, c, r)
self.oe_exp = self.reorder_y_q2e_conv(o_int, c, r)
print(f"x reshape: [int]:{self.inp['int'].shape}, int:{x_int.shape}. xe:{self.xe[0].shape}")

'''
8 changes: 5 additions & 3 deletions test/py/param_test.py
@@ -278,7 +278,9 @@ def test_dnn_engine(COMPILE):
y_coe_tl = b.r.CO_PRL if (b.r.CO==b.r.IT*b.r.CO_PRL) else b.r.CO%b.r.IT
y_r_ll = c.ROWS if b.r.XH==b.r.L*c.ROWS else b.r.XH % c.ROWS

ch.write(f" {{.n={b.r.XN}, .l={b.r.L}, .kw={b.r.KW}, .coe={y_coe}, .coe_tl={y_coe_tl}, .r_ll={y_r_ll}, .h={b.r.XH}, .w={b.r.XW}, .w_kw2={b.r.XW-b.r.KW//2}, .t={b.r.IT}, .p={b.r.CP}, .cm={b.r.CM}, .cm_p0={b.r.CM_0}, .w_bpt={w_bpt}, .w_bpt_p0={w_bpt_p0}, .x_bpt={x_bpt}, .x_bpt_p0={x_bpt_p0}, .is_bias={1*(b.b is not None)}, .b_offset={b_words}, .b_val_shift={b.bias_val_shift}, .b_bias_shift={b.bias_b_shift}, .x_header={b.r.x_header_be_p[-1][0]}, .x_header_p0={b.r.x_header_be_p[0][0]}, .w_header={b.r.w_header_be_p[-1][0]}, .w_header_p0={b.r.x_header_be_p[0][0]} }}")
ca_nzero, ca_shift, ca_pl_scale = b.core['act']['non_zero'], b.core['act']['shift_bits'], b.core['act']['plog_slope']

ch.write(f" {{.n={b.r.XN}, .l={b.r.L}, .kw={b.r.KW}, .coe={y_coe}, .coe_tl={y_coe_tl}, .r_ll={y_r_ll}, .h={b.r.XH}, .w={b.r.XW}, .w_kw2={b.r.XW-b.r.KW//2}, .t={b.r.IT}, .p={b.r.CP}, .cm={b.r.CM}, .cm_p0={b.r.CM_0}, .w_bpt={w_bpt}, .w_bpt_p0={w_bpt_p0}, .x_bpt={x_bpt}, .x_bpt_p0={x_bpt_p0}, .is_bias={1*(b.b is not None)}, .b_offset={b_words}, .b_val_shift={b.bias_val_shift}, .b_bias_shift={b.bias_b_shift}, .ca_nzero={ca_nzero}, .ca_shift={ca_shift}, .ca_pl_scale={ca_pl_scale}, .x_header={b.r.x_header_be_p[-1][0]}, .x_header_p0={b.r.x_header_be_p[0][0]}, .w_header={b.r.w_header_be_p[-1][0]}, .w_header_p0={b.r.x_header_be_p[0][0]} }}")

b_words += b.be.size if b.b else 0
if b.idx != len(bundles)-1:
@@ -330,7 +332,7 @@ def test_dnn_engine(COMPILE):
Write Text files of vectors
'''
for b in bundles:
np.savetxt(f"{DATA_DIR}/{b.idx}_y_exp.txt", b.pe_exp.flatten(), fmt='%d')
np.savetxt(f"{DATA_DIR}/{b.idx}_y_exp.txt", b.oe_exp.flatten(), fmt='%d')
for ip in range(b.r.CP):
CM_p = b.r.CM_0 if ip==0 else b.r.CM
x_config = b.r.x_header_le_p[ip!=0][0]
@@ -384,7 +386,7 @@
'''
for b in bundles:
y_sim = np.loadtxt(f"{DATA_DIR}/{b.idx}_y_sim.txt",np.int32).reshape((b.r.IT, b.r.XN*b.r.L*b.r.XW*b.r.CO_PRL*c.ROWS))
error = np.sum(np.abs(y_sim.reshape(b.pe_exp.shape) - b.pe_exp))
error = np.sum(np.abs(y_sim.reshape(b.oe_exp.shape) - b.oe_exp))

print(f"Bundle {b.idx}, Error: {error}")
assert error == 0
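The error == 0 assertion above requires the C runtime to be bit-exact against the Python golden model, so shift_round must reproduce np.around's round-half-to-even tie-breaking. A small hedged sanity check, not part of the repository, using rint as the reference since it also rounds ties to even in the default rounding mode:

#include <stdio.h>
#include <math.h>

#define shift_round(n, s) ((n + (1<<(s-1)) - (~(n>>s)&1) ) >> s)

// Reference: n / 2^s rounded to nearest, ties to even (np.around semantics)
static int round_half_even(int n, int s){
  return (int) rint((double)n / (double)(1 << s));
}

int main(void){
  int s = 6, mismatches = 0;
  for (int n = -1000; n <= 1000; n++)
    if (shift_round(n, s) != round_half_even(n, s)) mismatches++;
  printf("mismatches: %d\n", mismatches); // expected: 0
  return 0;
}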
