diff --git a/deepsocflow/py/hardware.py b/deepsocflow/py/hardware.py
index c8d1f58..5b56553 100644
--- a/deepsocflow/py/hardware.py
+++ b/deepsocflow/py/hardware.py
@@ -226,9 +226,6 @@ def simulate(self, SIM='verilator', SIM_PATH=''):
         cmd = f'{SIM_PATH}verilator --binary -j 0 -O3 --relative-includes --top {self.TB_MODULE} -I../ -F ../sources.txt -CFLAGS -DSIM -CFLAGS -I../ {self.MODULE_DIR}/c/sim.c -CFLAGS -g --Mdir ./'
         print(cmd)
         assert subprocess.run(cmd.split(' '), cwd='build').returncode == 0
-
-        exit()
-
 
         print("\n\nSIMULATING...\n\n")
         start = time.time()
diff --git a/deepsocflow/py/xbundle.py b/deepsocflow/py/xbundle.py
index d3c5f82..18ceb41 100644
--- a/deepsocflow/py/xbundle.py
+++ b/deepsocflow/py/xbundle.py
@@ -47,6 +47,8 @@ def call(self, input_tensor, x_add=None, training=False):
             self.prev_ib = x.ib
             BUNDLES[self.prev_ib].next_ibs += [self.ib]
 
+        print(f"{self.ib} x: {x.shape}, prev:{self.prev_ib}")
+
         x = self.core(x)
         x = self.core.act(x)
 
diff --git a/deepsocflow/py/xmodel.py b/deepsocflow/py/xmodel.py
index 284f04f..4828ecd 100644
--- a/deepsocflow/py/xmodel.py
+++ b/deepsocflow/py/xmodel.py
@@ -39,7 +39,7 @@ def get_config(self):
 
 
 
-def export_inference(model, hw):
+def export_inference(model, hw, batch_size=1):
 
     for b in BUNDLES:
         b.next_ibs.clear()
@@ -47,7 +47,7 @@ def export_inference(model, hw):
     BUNDLES.clear()
 
     user_model = model.layers[1]
-    input_shape = (hw.ROWS, *model.inputs[0].shape[1:])
+    input_shape = (batch_size, *model.inputs[0].shape[1:])
     x_keras = tf.random.uniform(input_shape)
     x_qtensor = user_model.input_quant_layer(x_keras)
     out_keras = model(x_keras)
diff --git a/run/param_test.py b/run/param_test.py
index daea4e3..ecb87bd 100644
--- a/run/param_test.py
+++ b/run/param_test.py
@@ -11,6 +11,7 @@
 from keras.utils import to_categorical
 from qkeras.utils import load_qmodel
 import numpy as np
+import pprint
 # import tensorflow as tf
 #tf.keras.utils.set_random_seed(0)
 
@@ -175,24 +176,24 @@ def product_dict(**kwargs):
         yield dict(zip(kwargs.keys(), instance))
 
 @pytest.mark.parametrize("PARAMS", list(product_dict(
-        processing_elements = [(8,24) ],
+        processing_elements = [(32,32) ],
         frequency_mhz       = [ 250 ],
         bits_input          = [ 4 ],
         bits_weights        = [ 4 ],
-        bits_sum            = [ 32 ],
+        bits_sum            = [ 20 ],
         bits_bias           = [ 16 ],
         max_batch_size      = [ 64 ],
         max_channels_in     = [ 2048 ],
         max_kernel_size     = [ 9 ],
         max_image_size      = [ 512 ],
         max_n_bundles       = [ 64 ],
-        ram_weights_depth   = [ 20 ],
+        ram_weights_depth   = [ 512 ],
         ram_edges_depth     = [ 288 ],
-        axi_width           = [ 128 ],
+        axi_width           = [ 64 ],
         config_baseaddr     = ["B0000000"],
         target_cpu_int_bits = [ 32 ],
-        valid_prob          = [ 0.1 ],
-        ready_prob          = [ 0.01 ],
+        valid_prob          = [ 1 ],
+        ready_prob          = [ 1 ],
         data_dir            = ['vectors'],
     )))
 def test_dnn_engine(PARAMS):
@@ -210,9 +211,10 @@ def test_dnn_engine(PARAMS):
     '''
     VERIFY & EXPORT
     '''
-    export_inference(loaded_model, hw)
+    export_inference(loaded_model, hw, batch_size=1)
     verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)
 
-    seconds, bytes = predict_model_performance(hw)
-    print(f"Predicted time on hardware: {1000*seconds:.5f} ms")
-    print(f"Predicted data movement: {bytes/1000:.5f} kB")
+    d_perf = predict_model_performance(hw)
+    pp = pprint.PrettyPrinter(indent=4)
+    print(f"Predicted Performance")
+    pp.pprint(d_perf)
\ No newline at end of file
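Note: the xmodel.py and param_test.py hunks above change the user-facing flow in two ways: export_inference() now takes an explicit batch_size (previously the traced batch dimension was fixed to hw.ROWS), and the result of predict_model_performance() is pretty-printed instead of being unpacked as a (seconds, bytes) tuple. The fragment below is a minimal sketch of the updated flow, not part of the patch; it reuses names defined in run/param_test.py (PARAMS, loaded_model, SIM, SIM_PATH) and assumes predict_model_performance() returns a dict of predicted metrics.

    # Sketch only: mirrors the updated calls in run/param_test.py.
    import pprint
    from deepsocflow import *

    hw = Hardware(**PARAMS)                            # PARAMS as produced by product_dict(...)
    hw.export()                                        # generates config_hw.svh, config_hw.tcl

    export_inference(loaded_model, hw, batch_size=1)   # batch size is now explicit
    verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)

    d_perf = predict_model_performance(hw)             # assumed: dict of predicted metrics
    pprint.PrettyPrinter(indent=4).pprint(d_perf)
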
diff --git a/run/pointnet.py b/run/pointnet.py
new file mode 100644
index 0000000..f8448ae
--- /dev/null
+++ b/run/pointnet.py
@@ -0,0 +1,298 @@
+import os
+import pytest
+import itertools
+import sys
+sys.path.append("../../")
+from tensorflow import keras
+from keras.layers import Input
+from keras.models import Model, save_model
+from keras.datasets import mnist
+from keras.optimizers import Adam
+from keras.utils import to_categorical
+from qkeras.utils import load_qmodel
+import numpy as np
+import pprint
+#from read_point_cloud import *
+#from preprocess import *
+import tensorflow as tf
+#tf.keras.utils.set_random_seed(0)
+
+from deepsocflow import *
+
+
+(SIM, SIM_PATH) = ('xsim', "F:/Xilinx/Vivado/2022.2/bin/") if os.name=='nt' else ('verilator', '')
+np.random.seed(42)
+
+'''
+Dataset
+'''
+
+NB_EPOCH = 2
+BATCH_SIZE = 64
+VALIDATION_SPLIT = 0.1
+
+#input_shape = x_train.shape[1:]
+
+scale_factor = 80.
+## Load data
+"""
+print("loading data...")
+pmtxyz = get_pmtxyz("./work/pmt_xyz.dat")
+X, y = torch.load("./work/preprocessed_data.pt")
+X = X/100.
+y[:,:] = y[:,:]/3.0
+y[:, :3] = y[:, :3]/scale_factor
+y[:, :3] = y[:,:3]
+#print(y[0])
+X_tf = tf.convert_to_tensor(X.numpy(), dtype=tf.float32)
+y_tf = tf.convert_to_tensor(y.numpy(), dtype=tf.float32)
+X_tf = tf.expand_dims(X_tf, axis=2)
+debug = True
+if debug:
+    print("debug got called")
+    small = 5000
+    X_tf, y_tf = X_tf[:small], y_tf[:small]
+
+
+# Update batch size
+print(X_tf.shape)
+n_data, n_hits, _, F_dim = X_tf.shape
+
+## switch to match Aobo's syntax (time, charge, x, y, z) -> (x, y, z, label, time, charge)
+## insert "label" feature to tensor. This feature (0 or 1) is the activation of sensor
+new_X = X_tf #preprocess(X_tf)
+
+## Shuffle Data (w/ Seed)
+#np.random.seed(seed=args.seed)
+#set_seed(seed=args.seed)
+idx = np.random.permutation(new_X.shape[0])
+#new_X = tf.gather(new_X, idx)
+#y = tf.gather(y_tf, idx)
+## Split and Load data
+train_split = 0.7
+val_split = 0.3
+train_idx = int(new_X.shape[0] * train_split)
+val_idx = int(train_idx + new_X.shape[0] * train_split)
+train = tf.data.Dataset.from_tensor_slices((new_X[:train_idx], y_tf[:train_idx]))
+val = tf.data.Dataset.from_tensor_slices((new_X[train_idx:val_idx], y_tf[train_idx:val_idx]))
+test = tf.data.Dataset.from_tensor_slices((new_X[val_idx:], y_tf[val_idx:]))
+train_loader = train.shuffle(buffer_size=len(new_X)).batch(BATCH_SIZE)
+val_loader = val.batch(BATCH_SIZE)
+test_loader = val.batch(BATCH_SIZE)
+print(f"num. total: {len(new_X)} train: {len(train)}, val: {len(val)}, test: {len(test)}")
+#print(pmtxyz.shape, tf.shape(new_X), y_tf.shape)
+"""
+input_shape = (2126, 1, 5) #X_tf.shape[1:]
+n_hits, _, F_dim = input_shape #X_tf.shape
+
+'''
+Define Model
+'''
+
+sys_bits = SYS_BITS(x=8, k=8, b=16)
+dim = F_dim
+dim_reduce_factor = 2
+out_dim = 4 #y_tf.shape[-1]
+dimensions = dim
+nhits = 2126
+encoder_input_shapes = [dimensions, 64, int(128 / dim_reduce_factor)]
+(_, F1, F2), latent_dim = encoder_input_shapes, int(1024 / dim_reduce_factor)
+decoder_input_shapes = latent_dim, int(512/dim_reduce_factor), int(128/dim_reduce_factor)
+latent_dim, F3, F4 = decoder_input_shapes
+#print("Test", F1, F2, dim, dim_reduce_factor, out_dim, dimensions)
+
+@keras.saving.register_keras_serializable()
+class UserModel(XModel):
+    def __init__(self, sys_bits, x_int_bits, *args, **kwargs):
+        super().__init__(sys_bits, x_int_bits, *args, **kwargs)
+
+        self.b0 = XBundle(
+            core=XConvBN(
+                k_int_bits=0,
+                b_int_bits=0,
+                filters=F1,
+                kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            #core=XDense(
+            #    k_int_bits=0,
+            #    b_int_bits=0,
+            #    units=F1,
+            #    act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)
+            #    ),
+            )
+
+        self.b1 = XBundle(
+            core=XConvBN(
+                k_int_bits=0,
+                b_int_bits=0,
+                filters=F2,
+                kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            #core=XDense(
+            #    k_int_bits=0,
+            #    b_int_bits=0,
+            #    units=F2,
+            #    act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+            )
+
+        self.b2 = XBundle(
+            core=XConvBN(
+                k_int_bits=0,
+                b_int_bits=0,
+                filters=latent_dim,
+                kernel_size=1,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0),),
+            pool=XPool(
+                type='avg',
+                pool_size=(2126,1),
+                strides=(2126,1),
+                padding='same',
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type=None),),
+            flatten=True
+            #core=XDense(
+            #    k_int_bits=0,
+            #    b_int_bits=0,
+            #    units=latent_dim,
+            #    act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+            )
+
+
+        self.b3 = XBundle(
+            core=XDense(
+                k_int_bits=0,
+                b_int_bits=0,
+                units=F3,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+            )
+
+        self.b4 = XBundle(
+            core=XDense(
+                k_int_bits=0,
+                b_int_bits=0,
+                units=F4,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0)),
+            )
+
+        self.b5 = XBundle(
+            core=XDense(
+                k_int_bits=0,
+                b_int_bits=0,
+                units=out_dim,
+                act=XActivation(sys_bits=sys_bits, o_int_bits=0, type='relu', slope=0.125)),
+            # flatten=True
+            )
+
+    def call (self, x):
+        x = self.input_quant_layer(x)
+        print('input', x.shape)
+        x = self.b0(x)
+        x = self.b1(x)
+        x = self.b2(x)
+        x = self.b3(x)
+        x = self.b4(x)
+        x = self.b5(x)
+        return x
+
+x = x_in = Input(input_shape, name="input")
+user_model = UserModel(sys_bits=sys_bits, x_int_bits=0)
+x = user_model(x_in)
+
+model = Model(inputs=[x_in], outputs=[x])
+
+
+'''
+Train Model
+'''
+model.compile(loss="mse", optimizer=Adam(learning_rate=0.0001), metrics=["mse"])
+#history = model.fit(
+#    train_loader,
+#    #x_train,
+#    #y_train,
+#    batch_size=BATCH_SIZE,
+#    epochs=NB_EPOCH,
+#    #initial_epoch=1,
+#    verbose=True,
+#    )
+
+print(model.submodules)
+#print(y[:5], model(X_tf[:5]))
+for layer in model.submodules:
+    try:
+        print(layer.summary())
+        for w, weight in enumerate(layer.get_weights()):
+            print(layer.name, w, weight.shape)
+    except:
+        pass
+# print_qstats(model.layers[1])
+
+def summary_plus(layer, i=0):
+    if hasattr(layer, 'layers'):
+        if i != 0:
+            layer.summary()
+        for l in layer.layers:
+            i += 1
+            summary_plus(l, i=i)
+
+print(summary_plus(model)) # OK
+model.summary(expand_nested=True)
+
+
+'''
+Save & Reload
+'''
+
+save_model(model, "mnist.h5")
+loaded_model = load_qmodel("mnist.h5")
+
+#score = loaded_model.evaluate(test_loader, verbose=0)
+#print(f"Test loss:{score[0]}, Test accuracy:{score[1]}")
+
+
+
+
+def product_dict(**kwargs):
+    for instance in itertools.product(*(kwargs.values())):
+        yield dict(zip(kwargs.keys(), instance))
+
+@pytest.mark.parametrize("PARAMS", list(product_dict(
+        processing_elements = [(16,32) ],
+        frequency_mhz       = [ 250 ],
+        bits_input          = [ 8 ],
+        bits_weights        = [ 8 ],
+        bits_sum            = [ 32 ],
+        bits_bias           = [ 16 ],
+        max_batch_size      = [ 64 ],
+        max_channels_in     = [ 2048 ],
+        max_kernel_size     = [ 9 ],
+        max_image_size      = [ 2126 ],
+        max_n_bundles       = [ 64 ],
+        ram_weights_depth   = [ 20 ],
+        ram_edges_depth     = [ 288 ],
+        axi_width           = [ 128 ],
+        config_baseaddr     = ["B0000000"],
+        target_cpu_int_bits = [ 32 ],
+        valid_prob          = [ 1 ],
+        ready_prob          = [ 1 ],
+        data_dir            = ['vectors'],
+    )))
+def test_dnn_engine(PARAMS):
+
+    '''
+    SPECIFY HARDWARE
+    '''
+    hw = Hardware (**PARAMS)
+    hw.export_json()
+    hw = Hardware.from_json('hardware.json')
+    hw.export() # Generates: config_hw.svh, config_hw.tcl
+    hw.export_vivado_tcl(board='zcu104')
+
+
+    '''
+    VERIFY & EXPORT
+    '''
+    export_inference(loaded_model, hw, hw.ROWS)
+    verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)
+
+    d_perf = predict_model_performance(hw)
+    pp = pprint.PrettyPrinter(indent=4)
+    print(f"Predicted Performance")
+    pp.pprint(d_perf)
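Note: the new run/pointnet.py above builds a PointNet-style regressor out of deepsocflow bundles: the shared per-hit MLP is expressed as 1x1 XConvBN bundles over a (2126, 1, 5) tensor, the hit axis is collapsed by an average XPool with pool_size=(2126, 1), and three XDense bundles regress the 4 outputs. For shape intuition only, a rough float-Keras equivalent is sketched below; it is an illustration, not part of the patch, and the quantized X* wrappers, batch norm and the slope-0.125 output activation are intentionally omitted.

    # Rough float-Keras equivalent of the model in run/pointnet.py (shapes only).
    # Plain Conv2D/Dense stand in for the quantized XConvBN/XDense bundles.
    from tensorflow import keras
    from keras import layers

    n_hits, out_dim, dim_reduce_factor = 2126, 4, 2
    F1, F2 = 64, int(128 / dim_reduce_factor)
    latent_dim = int(1024 / dim_reduce_factor)
    F3, F4 = int(512 / dim_reduce_factor), int(128 / dim_reduce_factor)

    x_in = keras.Input((n_hits, 1, 5))                      # (hits, 1, features), as in X_tf
    x = layers.Conv2D(F1, 1, activation='relu')(x_in)       # b0: shared per-hit MLP as 1x1 conv
    x = layers.Conv2D(F2, 1, activation='relu')(x)          # b1
    x = layers.Conv2D(latent_dim, 1, activation='relu')(x)  # b2 core
    x = layers.AveragePooling2D((n_hits, 1))(x)             # b2 pool: collapse the hit axis
    x = layers.Flatten()(x)                                 # b2 flatten -> (latent_dim,)
    x = layers.Dense(F3, activation='relu')(x)              # b3
    x = layers.Dense(F4, activation='relu')(x)              # b4
    x_out = layers.Dense(out_dim)(x)                        # b5: 4-value regression head
    keras.Model(x_in, x_out).summary()
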
diff --git a/run/resnet50.py b/run/resnet50.py
index aa8d9f5..c8f0b1a 100644
--- a/run/resnet50.py
+++ b/run/resnet50.py
@@ -11,6 +11,7 @@
 from keras.utils import to_categorical
 from qkeras.utils import load_qmodel
 import numpy as np
+import pprint
 # import tensorflow as tf
 #tf.keras.utils.set_random_seed(0)
 
@@ -404,60 +405,61 @@ def __init__(self, sys_bits, x_int_bits, *args, **kwargs):
 
     def call (self, x):
         x = self.input_quant_layer(x)
-        x = self.b1(x)
-        x_skip1 = self.sk1(x)
-        x = self.b2(x)
-        x = self.b3(x)
-        x = x_skip2 = self.sk2(x, x_skip1)
-        x = self.b4(x)
-        x = self.b5(x)
-        x = x_skip3 = self.sk3(x, x_skip2)
-        x = self.b6(x)
-        x = self.b7(x)
-        x = x_skip4 = self.sk4(x, x_skip3)
-        x_skip5 = self.sk5(x)
-        x = self.b8(x)
-        x = self.b9(x)
-        x = x_skip6 = self.sk6(x, x_skip5)
-        x = self.b10(x)
-        x = self.b11(x)
-        x = x_skip7 = self.sk7(x, x_skip6)
-        x = self.b12(x)
-        x = self.b13(x)
-        x = x_skip8 = self.sk8(x, x_skip7)
-        x = self.b14(x)
-        x = self.b15(x)
-        x = x_skip9 = self.sk9(x, x_skip8)
-        x_skip10 = self.sk10(x)
-        x = self.b16(x)
-        x = self.b17(x)
-        x = x_skip11 = self.sk11(x, x_skip10)
-        x = self.b18(x)
-        x = self.b19(x)
-        x = x_skip12 = self.sk12(x, x_skip11)
-        x = self.b20(x)
-        x = self.b21(x)
-        x = x_skip13 = self.sk13(x, x_skip12)
-        x = self.b22(x)
-        x = self.b23(x)
-        x = x_skip14 = self.sk14(x, x_skip13)
-        x = self.b24(x)
-        x = self.b25(x)
-        x = x_skip15 = self.sk15(x, x_skip14)
-        x = self.b26(x)
-        x = self.b27(x)
-        x = x_skip16 = self.sk16(x, x_skip15)
-        x_skip17 = self.sk17(x)
-        x = self.b28(x)
-        x = self.b29(x)
-        x = x_skip18 = self.sk18(x, x_skip17)
-        x = self.b30(x)
-        x = self.b31(x)
-        x = x_skip19 = self.sk19(x, x_skip18)
-        x = self.b32(x)
-        x = self.b33(x)
-        x = x_skip20 = self.sk20(x, x_skip19)
-        x = self.b34(x)
+        x = self.b1(x) # 0
+        x_skip1 = self.sk1(x) # 1
+        x = self.b2(x) # 2
+        x = self.b3(x) # 3
+        x = x_skip2 = self.sk2(x, x_skip1) # 4
+        x = self.b4(x) # 5
+        x = self.b5(x) # 6
+        x = x_skip3 = self.sk3(x, x_skip2) # 7
+        x = self.b6(x) # 8
+        x = self.b7(x) # 9
+        x = x_skip4 = self.sk4(x, x_skip3) # 10
+        x_skip5 = self.sk5(x) # 11
+        x = self.b8(x) # 12
+        x = self.b9(x) # 13
+        x = x_skip6 = self.sk6(x, x_skip5) # 14
+        x = self.b10(x) # 15
+        x = self.b11(x) # 16
+        x = x_skip7 = self.sk7(x, x_skip6) # 17
+        x = self.b12(x) # 18
+        x = self.b13(x) # 19
+        x = x_skip8 = self.sk8(x, x_skip7) # 20
+        x = self.b14(x) # 21
+        x = self.b15(x) # 22
+        x = x_skip9 = self.sk9(x, x_skip8) # 23
+        x_skip10 = self.sk10(x) # 24
+        x = self.b16(x) # 25
+        x = self.b17(x) # 26
+        x = x_skip11 = self.sk11(x, x_skip10) # 27
+        x = self.b18(x) # 28
+        x = self.b19(x) # 29
+        x = x_skip12 = self.sk12(x, x_skip11) # 30
+        x = self.b20(x) # 31
+        x = self.b21(x) # 32
+        x = x_skip13 = self.sk13(x, x_skip12) # 33
+        x = self.b22(x) # 34
+        x = self.b23(x) # 35
+        x = x_skip14 = self.sk14(x, x_skip13) # 36
+        x = self.b24(x) # 37
+        x = self.b25(x) # 38
+        x = x_skip15 = self.sk15(x, x_skip14) # 39
+        x = self.b26(x) # 40
+        x = self.b27(x) # 41
+        x = x_skip16 = self.sk16(x, x_skip15) # 42
+        x_skip17 = self.sk17(x) # 43
+        x = self.b28(x) # 44
+        x = self.b29(x) # 45
+        x = x_skip18 = self.sk18(x, x_skip17) # 46
+        x = self.b30(x) # 47
+        x = self.b31(x) # 48
+        x = x_skip19 = self.sk19(x, x_skip18) # 49
+        x = self.b32(x) # 50
+        x = self.b33(x) # 51
+        x = x_skip20 = self.sk20(x, x_skip19) # 52
+        x = self.b34(x) # 53
+        exit()
         return x
 
 x = x_in = Input(input_shape, name="input")
@@ -537,8 +539,8 @@ def product_dict(**kwargs):
         axi_width           = [ 128 ],
         config_baseaddr     = ["B0000000"],
         target_cpu_int_bits = [ 32 ],
-        valid_prob          = [ 0.1 ],
-        ready_prob          = [ 0.01 ],
+        valid_prob          = [ 1 ],
+        ready_prob          = [ 1 ],
         data_dir            = ['vectors'],
     )))
 def test_dnn_engine(PARAMS):
@@ -556,9 +558,10 @@ def test_dnn_engine(PARAMS):
     '''
     VERIFY & EXPORT
     '''
-    export_inference(loaded_model, hw)
+    export_inference(loaded_model, hw, batch_size=1)
     verify_inference(loaded_model, hw, SIM=SIM, SIM_PATH=SIM_PATH)
 
-    seconds, bytes = predict_model_performance(hw)
-    print(f"Predicted time on hardware: {1000*seconds:.5f} ms")
-    print(f"Predicted data movement: {bytes/1000:.5f} kB")
+    d_perf = predict_model_performance(hw)
+    pp = pprint.PrettyPrinter(indent=4)
+    print(f"Predicted Performance")
+    pp.pprint(d_perf)
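Note: both run/param_test.py and run/pointnet.py drive test_dnn_engine through the same product_dict helper, which expands every combination of the listed hardware parameters into one PARAMS dict per pytest case. A tiny standalone illustration is below; the parameter values chosen here are arbitrary and only demonstrate the expansion.

    import itertools

    def product_dict(**kwargs):
        # same helper as in the run/ scripts: cartesian product over the value lists
        for instance in itertools.product(*(kwargs.values())):
            yield dict(zip(kwargs.keys(), instance))

    # two PE geometries x two AXI widths -> four PARAMS dicts, i.e. four test cases
    for params in product_dict(processing_elements=[(16, 32), (32, 32)], axi_width=[64, 128]):
        print(params)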