Pointnet works

KastnerRG · Aug 1, 2024 · c7f7889 · c7f7889
1 parent 25c6d9b
commit c7f7889
Show file tree

Hide file tree

Showing 6 changed files with 377 additions and 75 deletions.
diff --git a/deepsocflow/py/hardware.py b/deepsocflow/py/hardware.py
@@ -226,9 +226,6 @@ def simulate(self, SIM='verilator', SIM_PATH=''):
             cmd = f'{SIM_PATH}verilator --binary -j 0 -O3 --relative-includes --top {self.TB_MODULE} -I../ -F ../sources.txt -CFLAGS -DSIM -CFLAGS -I../ {self.MODULE_DIR}/c/sim.c -CFLAGS -g --Mdir ./'
             print(cmd)
             assert subprocess.run(cmd.split(' '), cwd='build').returncode == 0
-
-        exit()
-
         print("\n\nSIMULATING...\n\n")
         start = time.time()
 

diff --git a/deepsocflow/py/xbundle.py b/deepsocflow/py/xbundle.py
@@ -47,6 +47,8 @@ def call(self, input_tensor, x_add=None, training=False):
             self.prev_ib = x.ib
             BUNDLES[self.prev_ib].next_ibs += [self.ib]
 
+        print(f"{self.ib} x: {x.shape}, prev:{self.prev_ib}")
+
         x = self.core(x)
         x = self.core.act(x)
 

diff --git a/deepsocflow/py/xmodel.py b/deepsocflow/py/xmodel.py
@@ -39,15 +39,15 @@ def get_config(self):
 
 
 
-def export_inference(model, hw):
+def export_inference(model, hw, batch_size=1):
 
     for b in BUNDLES:
         b.next_ibs.clear()
         b.next_add_ibs.clear()
     BUNDLES.clear()
 
     user_model = model.layers[1]
-    input_shape = (hw.ROWS, *model.inputs[0].shape[1:])
+    input_shape = (batch_size, *model.inputs[0].shape[1:])
     x_keras = tf.random.uniform(input_shape)
     x_qtensor = user_model.input_quant_layer(x_keras)
     out_keras = model(x_keras)

diff --git a/run/param_test.py b/run/param_test.py
@@ -11,6 +11,7 @@
 from keras.utils import to_categorical
 from qkeras.utils import load_qmodel
 import numpy as np
+import pprint
 # import tensorflow as tf
 #tf.keras.utils.set_random_seed(0)
 
@@ -175,24 +176,24 @@ def product_dict(**kwargs):
         yield dict(zip(kwargs.keys(), instance))
 
 @pytest.mark.parametrize("PARAMS", list(product_dict(
-                                        processing_elements  = [(8,24)   ],
+                                        processing_elements  = [(32,32)  ],
                                         frequency_mhz        = [ 250     ],
                                         bits_input           = [ 4       ],
                                         bits_weights         = [ 4       ],
-                                        bits_sum             = [ 32      ],
+                                        bits_sum             = [ 20      ],
                                         bits_bias            = [ 16      ],
                                         max_batch_size       = [ 64      ], 
                                         max_channels_in      = [ 2048    ],
                                         max_kernel_size      = [ 9       ],
                                         max_image_size       = [ 512     ],
                                         max_n_bundles        = [ 64      ],
-                                        ram_weights_depth    = [ 20      ],
+                                        ram_weights_depth    = [ 512     ],
                                         ram_edges_depth      = [ 288     ],
-                                        axi_width            = [ 128      ],
+                                        axi_width            = [ 64      ],
                                         config_baseaddr      = ["B0000000"],
                                         target_cpu_int_bits  = [ 32       ],
-                                        valid_prob           = [ 0.1       ],
-                                        ready_prob           = [ 0.01       ],
+                                        valid_prob           = [ 1       ],
+                                        ready_prob           = [ 1       ],
                                         data_dir             = ['vectors'],
                                     )))
 def test_dnn_engine(PARAMS):
@@ -210,9 +211,10 @@ def test_dnn_engine(PARAMS):
     '''
     VERIFY & EXPORT
     '''
-    export_inference(loaded_model, hw)
+    export_inference(loaded_model, hw, batch_size=1)
     verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)
 
-    seconds, bytes = predict_model_performance(hw)
-    print(f"Predicted time on hardware: {1000*seconds:.5f} ms")
-    print(f"Predicted data movement: {bytes/1000:.5f} kB")
+    d_perf = predict_model_performance(hw)
+    pp = pprint.PrettyPrinter(indent=4)
+    print(f"Predicted Performance")
+    pp.pprint(d_perf)
diff --git a/run/pointnet.py b/run/pointnet.py
@@ -0,0 +1,298 @@
+import os
+import pytest
+import itertools
+import sys
+sys.path.append("../../")
+from tensorflow import keras
+from keras.layers import Input
+from keras.models import Model, save_model
+from keras.datasets import mnist
+from keras.optimizers import Adam
+from keras.utils import to_categorical
+from qkeras.utils import load_qmodel
+import numpy as np
+import pprint
+#from read_point_cloud import * 
+#from preprocess import *
+import tensorflow as tf
+#tf.keras.utils.set_random_seed(0)
+
+from deepsocflow import *
+
+
+(SIM, SIM_PATH) = ('xsim', "F:/Xilinx/Vivado/2022.2/bin/") if os.name=='nt' else ('verilator', '')  # simulator choice: xsim with a local Vivado bin path on Windows, verilator with an empty path prefix elsewhere
+np.random.seed(42)  # fixed NumPy seed for repeatable runs
+
+'''
+Dataset
+'''
+
+NB_EPOCH = 2  # only referenced by the commented-out model.fit() call in this file
+BATCH_SIZE = 64  # only referenced by the disabled data-loading string and the commented-out model.fit()
+VALIDATION_SPLIT = 0.1  # NOTE(review): not referenced anywhere else in this file
+
+#input_shape = x_train.shape[1:]
+
+scale_factor = 80.  # only referenced inside the disabled data-loading string below
+## Load data
+"""
+print("loading data...")
+pmtxyz = get_pmtxyz("./work/pmt_xyz.dat")
+X, y = torch.load("./work/preprocessed_data.pt")
+X = X/100.
+y[:,:] = y[:,:]/3.0
+y[:, :3] = y[:, :3]/scale_factor
+y[:, :3] = y[:,:3]
+#print(y[0])
+X_tf = tf.convert_to_tensor(X.numpy(), dtype=tf.float32)
+y_tf = tf.convert_to_tensor(y.numpy(), dtype=tf.float32)
+X_tf = tf.expand_dims(X_tf, axis=2)
+debug = True 
+if debug:
+    print("debug got called")
+    small = 5000
+    X_tf, y_tf = X_tf[:small], y_tf[:small]
+
+
+# Update batch size
+print(X_tf.shape)
+n_data, n_hits, _, F_dim = X_tf.shape
+
+## switch to match Aobo's syntax (time, charge, x, y, z) -> (x, y, z, label, time, charge)
+## insert "label" feature to tensor. This feature (0 or 1) is the activation of sensor
+new_X = X_tf #preprocess(X_tf)
+
+## Shuffle Data (w/ Seed)
+#np.random.seed(seed=args.seed)
+#set_seed(seed=args.seed)
+idx = np.random.permutation(new_X.shape[0]) 
+#new_X = tf.gather(new_X, idx)
+#y = tf.gather(y_tf, idx)
+## Split and Load data
+train_split = 0.7
+val_split = 0.3
+train_idx = int(new_X.shape[0] * train_split)
+val_idx = int(train_idx + new_X.shape[0] * train_split)
+train = tf.data.Dataset.from_tensor_slices((new_X[:train_idx], y_tf[:train_idx]))
+val = tf.data.Dataset.from_tensor_slices((new_X[train_idx:val_idx], y_tf[train_idx:val_idx]))
+test = tf.data.Dataset.from_tensor_slices((new_X[val_idx:], y_tf[val_idx:]))
+train_loader = train.shuffle(buffer_size=len(new_X)).batch(BATCH_SIZE)
+val_loader = val.batch(BATCH_SIZE)
+test_loader = val.batch(BATCH_SIZE)
+print(f"num. total: {len(new_X)} train: {len(train)}, val: {len(val)}, test: {len(test)}")
+#print(pmtxyz.shape, tf.shape(new_X), y_tf.shape)
+"""
+input_shape = (2126, 1, 5)  # hard-coded stand-in for X_tf.shape[1:] while the data loading above is disabled
+n_hits, _, F_dim = input_shape  # stand-in for unpacking X_tf.shape; n_hits=2126, F_dim=5
+
+'''
+Define Model
+'''
+
+sys_bits = SYS_BITS(x=8, k=8, b=16)  # presumably input/weight/bias bit widths (same numbers as bits_input/bits_weights/bits_bias in the test below) - TODO confirm
+dim = F_dim
+dim_reduce_factor = 2  # divisor applied to the layer widths computed below
+out_dim = 4  # stand-in for y_tf.shape[-1]; used as the width of the last dense bundle
+dimensions = dim
+nhits = 2126  # NOTE(review): duplicates n_hits and is not referenced again in this file
+encoder_input_shapes = [dimensions, 64, int(128 / dim_reduce_factor)]  # evaluates to [5, 64, 64]
+(_, F1, F2), latent_dim = encoder_input_shapes, int(1024 / dim_reduce_factor)  # F1=64, F2=64, latent_dim=512
+decoder_input_shapes = latent_dim, int(512/dim_reduce_factor), int(128/dim_reduce_factor)  # evaluates to (512, 256, 64)
+latent_dim, F3, F4 = decoder_input_shapes  # F3=256, F4=64; latent_dim is re-bound to the same value
+#print("Test", F1, F2, dim, dim_reduce_factor, out_dim, dimensions)
+@keras.saving.register_keras_serializable()  # registers the class with Keras serialization; the model is saved and reloaded further down
+class UserModel(XModel):  # six bundles: three kernel-size-1 conv bundles (the third also pools and flattens), then three dense bundles
+    def __init__(self, sys_bits, x_int_bits, *args, **kwargs):  # every argument is forwarded unchanged to XModel.__init__
+        super().__init__(sys_bits, x_int_bits, *args, **kwargs)
+
+        self.b0 = XBundle(  # conv, kernel_size=1, F1 filters, relu with slope 0
+            core=XConvBN(
+                k_int_bits=0,
+                b_int_bits=0,
+                filters=F1,
+                kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            #core=XDense(
+            #    k_int_bits=0,
+            #    b_int_bits=0,
+            #    units=F1,
+            #    act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
+            #    ),
+            )
+
+        self.b1 = XBundle(  # conv, kernel_size=1, F2 filters, relu with slope 0
+            core=XConvBN(
+                k_int_bits=0,
+                b_int_bits=0,
+                filters=F2,
+                kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            #core=XDense(
+            #    k_int_bits=0,
+            #    b_int_bits=0,
+            #    units=F2,
+            #    act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+        )
+
+        self.b2 = XBundle(  # conv to latent_dim filters, then average pool and flatten
+            core=XConvBN(
+                k_int_bits=0,
+                b_int_bits=0,
+                filters=latent_dim,
+                kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            pool=XPool(
+                type='avg',
+                pool_size=(2126,1),  # hard-coded; equals n_hits (first entry of input_shape) - keep the two in sync
+                strides=(2126,1),  # same value as pool_size
+                padding='same',
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            flatten=True
+            #core=XDense(
+            #    k_int_bits=0,
+            #    b_int_bits=0,
+            #    units=latent_dim,
+            #    act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+        )
+
+
+        self.b3 = XBundle(  # dense, F3 units, relu with slope 0
+            core=XDense(
+                k_int_bits=0,
+                b_int_bits=0,
+                units=F3,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+        )
+
+        self.b4 = XBundle(  # dense, F4 units, relu with slope 0
+            core=XDense(
+                k_int_bits=0,
+                b_int_bits=0,
+                units=F4,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+        )
+
+        self.b5 = XBundle(  # output bundle: dense, out_dim units
+            core=XDense(
+                k_int_bits=0,
+                b_int_bits=0,
+                units=out_dim,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0.125)),  # only bundle with a non-zero slope; presumably a leaky relu - TODO confirm
+            # flatten=True
+        )
+
+    def call (self, x):  # forward pass: quantize the input, then apply b0..b5 in order
+        x = self.input_quant_layer(x)
+        print('input', x.shape)  # debug print; runs every time call() executes
+        x = self.b0(x)
+        x = self.b1(x)
+        x = self.b2(x)
+        x = self.b3(x)
+        x = self.b4(x)
+        x = self.b5(x)
+        return x
+
+x = x_in =  Input(input_shape, name="input")  # symbolic input built from input_shape = (2126, 1, 5)
+user_model = UserModel(sys_bits=sys_bits, x_int_bits=0)
+x = user_model(x_in)  # UserModel is applied as a single nested layer of the outer Model
+
+model = Model(inputs=[x_in], outputs=[x])  # export_inference in xmodel.py reads model.layers[1] as the user model
+
+
+'''
+Train Model
+'''
+model.compile(loss="mse", optimizer=Adam(learning_rate=0.0001), metrics=["mse"])  # compiled only; the model.fit() call below is commented out, so no training runs here
+#history = model.fit(
+#        train_loader,
+#        #x_train, 
+#        #y_train, 
+#        batch_size=BATCH_SIZE,
+#        epochs=NB_EPOCH, 
+#        #initial_epoch=1, 
+#        verbose=True,
+#        )
+
+print(model.submodules)  # dump every nested submodule for inspection
+#print(y[:5], model(X_tf[:5]))
+for layer in model.submodules:
+    try:
+        layer.summary()  # summary() prints by itself and returns None, so it is no longer wrapped in print()
+        for w, weight in enumerate(layer.get_weights()):
+            print(layer.name, w, weight.shape)  # index and shape of each weight array
+    except Exception:  # best effort: skip submodules where summary()/get_weights() fails, but let Ctrl-C and SystemExit propagate
+        pass
+# print_qstats(model.layers[1])
+
+def summary_plus(layer, i=0):  # recursively call summary() on every nested object that has a .layers attribute; returns None
+    if hasattr(layer, 'layers'):  # objects without .layers are treated as leaves and skipped silently
+        if i != 0:  # i == 0 marks the top-level call, whose own summary is not printed here
+            layer.summary()
+        for l in layer.layers:
+            i += 1  # incremented before each child, so every recursive call receives i >= 1
+            summary_plus(l, i=i)
+
+summary_plus(model)  # prints nested summaries itself and returns None, so wrapping it in print() only emitted a stray "None"
+model.summary(expand_nested=True)
+
+
+'''
+Save & Reload
+'''
+
+save_model(model, "mnist.h5")  # NOTE(review): file name looks inherited from an MNIST example although this is the PointNet script - consider renaming
+loaded_model = load_qmodel("mnist.h5")  # reloaded through QKeras; this reloaded model is what test_dnn_engine exports and verifies
+
+#score = loaded_model.evaluate(test_loader, verbose=0)
+#print(f"Test loss:{score[0]}, Test accuracy:{score[1]}")
+
+
+
+
+def product_dict(**kwargs):  # generator: yields one dict per element of the Cartesian product of the keyword-argument value lists
+    for instance in itertools.product(*(kwargs.values())):  # instance holds one value per keyword, in keyword order
+        yield dict(zip(kwargs.keys(), instance))  # re-attach the keyword names to that combination
+
+@pytest.mark.parametrize("PARAMS", list(product_dict(  # every list below holds a single value, so exactly one PARAMS dict is generated
+                                        processing_elements  = [(16,32)   ],
+                                        frequency_mhz        = [ 250     ],
+                                        bits_input           = [ 8       ],  # same number as SYS_BITS x=8 above
+                                        bits_weights         = [ 8       ],  # same number as SYS_BITS k=8 above
+                                        bits_sum             = [ 32      ],
+                                        bits_bias            = [ 16      ],  # same number as SYS_BITS b=16 above
+                                        max_batch_size       = [ 64      ], 
+                                        max_channels_in      = [ 2048    ],
+                                        max_kernel_size      = [ 9       ],
+                                        max_image_size       = [ 2126    ],  # same number as n_hits, the first entry of input_shape
+                                        max_n_bundles        = [ 64      ],
+                                        ram_weights_depth    = [ 20      ],
+                                        ram_edges_depth      = [ 288     ],
+                                        axi_width            = [ 128      ],
+                                        config_baseaddr      = ["B0000000"],
+                                        target_cpu_int_bits  = [ 32       ],
+                                        valid_prob           = [ 1     ],
+                                        ready_prob           = [ 1     ],
+                                        data_dir             = ['vectors'],
+                                    )))
+def test_dnn_engine(PARAMS):  # builds the hardware description, exports and verifies the reloaded model, then prints predicted performance
+
+    '''
+    SPECIFY HARDWARE
+    '''
+    hw = Hardware (**PARAMS)
+    hw.export_json()
+    hw = Hardware.from_json('hardware.json')  # rebuild from the JSON file; presumably the one export_json() just wrote - TODO confirm
+    hw.export() # Generates: config_hw.svh, config_hw.tcl
+    hw.export_vivado_tcl(board='zcu104')
+
+
+    '''
+    VERIFY & EXPORT
+    '''
+    export_inference(loaded_model, hw, hw.ROWS)  # third positional argument is batch_size per the xmodel.py signature; hw.ROWS was the value hard-coded there before this commit
+    verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)
+
+    d_perf = predict_model_performance(hw)  # pretty-printed below as a single object
+    pp = pprint.PrettyPrinter(indent=4)
+    print(f"Predicted Performance")
+    pp.pprint(d_perf)