Add new features and fixes to Sparse Update #47

Status: Open. Wants to merge 95 commits into base: main.

Commits (95):
5e914fe
Fix minors for sparse update and add train loss monitoring during epochs
dnadalini Apr 3, 2024
15ce4e4
Fix code generation (minors)
dnadalini Apr 3, 2024
81cdd47
Enhance GM for Conv2D test (FP32)
dnadalini Apr 3, 2024
dcd195e
Merge branch 'main' of github.com:pulp-platform/pulp-trainlib into pr…
dnadalini Apr 4, 2024
ec037d1
Merge branch 'main' of github.com:pulp-platform/pulp-trainlib into pr…
dnadalini Apr 4, 2024
3aaae79
Fix code generation for sparse update in TrainLib_Deployer without si…
dnadalini Apr 4, 2024
438716b
Introduce sparse update in single buffer more (code generator is bugg…
dnadalini Apr 4, 2024
bd8d8cb
Merge branch 'main' of github.com:pulp-platform/pulp-trainlib into pr…
dnadalini Apr 5, 2024
d0a981e
Merge branch 'main' of github.com:pulp-platform/pulp-trainlib into pr…
dnadalini Apr 5, 2024
d8ebd50
Fix merge issues in TrainLib_Deployer
dnadalini Apr 5, 2024
1c29032
Merge branch 'main' of github.com:pulp-platform/pulp-trainlib into pr…
dnadalini Apr 8, 2024
29d07a3
Merge branch 'main' of github.com:pulp-platform/pulp-trainlib into pr…
dnadalini Apr 8, 2024
b11257a
Partially fix single buffering with sparse update
dnadalini Apr 9, 2024
315fb17
Fix partial update with single buffering mode
dnadalini Apr 9, 2024
bde9f48
Fix single buffer mode (no sparse update)
dnadalini Apr 9, 2024
35c8dc0
Optimize memory occupation and fix single buffer mode with sparse update
dnadalini Apr 9, 2024
a84f8ba
Fix sparse update (still existing bugs) with no buffer and single buf…
dnadalini Apr 9, 2024
793d864
Add error in case of double buffering selection (currently unavailable)
dnadalini Apr 9, 2024
4dd71f8
Fix memory bug with single buffering
dnadalini Apr 10, 2024
b566118
Merge pull request #3 from pulp-platform/main
diaconuccalin Apr 10, 2024
2fe4089
Fix functional bug with memory allocation for ReLU backpropagation wi…
dnadalini Apr 11, 2024
417ae28
Add testing setup for sparse update
dnadalini Apr 11, 2024
c95b43a
Add optimized naive conv2d algorithms for 5x5 kernels with stride 2 a…
dnadalini Apr 11, 2024
e5d34dc
Fix InstanceNorm primitives in both forward and backward (FP32, FP16)
dnadalini Apr 15, 2024
d2e4bd1
Merge with main
dnadalini Apr 15, 2024
216520e
Merge pull request #4 from pulp-platform/main
diaconuccalin Apr 16, 2024
c08ef3a
Merge branch 'main' of github.com:pulp-platform/pulp-trainlib into pr…
dnadalini Apr 17, 2024
f5c6621
Merge pull request #6 from pulp-platform/main
diaconuccalin Jun 6, 2024
8f55d8c
Refactor code for the data assignment between blobs (no and single bu…
dnadalini Jun 25, 2024
378ce6b
Merge pull request #8 from pulp-platform/main
diaconuccalin Jul 1, 2024
3100433
Fix bug of forward pass in backward step. Clean GM.py and net.c files…
diaconuccalin Jul 3, 2024
5dffe02
Merge main into pr/SparseUpdate
dnadalini Jul 3, 2024
115a772
Fix minors in double buffering
dnadalini Jul 3, 2024
bf929cf
Set up experiments to try out sparse update
dnadalini Jul 3, 2024
019d363
Fix issues with the backward step (single buffering) with sparse update
dnadalini Jul 9, 2024
84ec014
Fix several bugs in backward with residual connections
dnadalini Jul 9, 2024
984325a
Fix pointer setup in single buffer mode
dnadalini Jul 9, 2024
60b02e0
Enhance code generation (visually)
dnadalini Jul 9, 2024
caf2aa2
Remove several bugs for residual connections in single buffer mode
dnadalini Jul 9, 2024
83b6554
Add new test setup for the TrainLib_Deployer frontend
dnadalini Jul 9, 2024
d61bf44
Enhance code generation in single buffer mode
dnadalini Jul 9, 2024
184a96f
Enhance features description in main README.md
dnadalini Jul 10, 2024
734882a
Add Leaky ReLU to PULP-TrainLib and add related test
dnadalini Jul 10, 2024
99f5b93
Add Sigmoid and Leaky ReLU to TrainLib_Deployer
dnadalini Jul 10, 2024
8cfd099
Add LeakyReLU and Sigmoid to single buffer mode
dnadalini Jul 10, 2024
c74c81d
Fixed backward step for softmax. Fixed backward step for mhsa. Replac…
diaconuccalin Jul 12, 2024
2594913
Completed comments with explanations. Other clean-ups
diaconuccalin Jul 13, 2024
bb3b82a
Added more debug messages
diaconuccalin Jul 13, 2024
7a68c23
Fix errors
diaconuccalin Jul 13, 2024
5cbb139
Fixed the softmax activation to work on heights that are different fr…
diaconuccalin Jul 15, 2024
9cf9a36
Replicated backward step of mhsa from pulp primitive to the GM PyTorc…
diaconuccalin Jul 16, 2024
a79f739
Parallelized backward pass of softmax
diaconuccalin Jul 16, 2024
4ed09d9
Add pseudo-random number generator and related test
dnadalini Jul 16, 2024
026f010
Partial backward softmax bug fix
diaconuccalin Jul 17, 2024
bda2ba6
Fix in activations test
diaconuccalin Jul 17, 2024
ca16ced
Ported softmax to fp16 for activation test. Ported fast exp softmax t…
diaconuccalin Jul 24, 2024
e754d70
Brought fp32 implementation of mhsa to fp16. NOT yet adapted
diaconuccalin Jul 25, 2024
1b7f8c3
Adapted fp16 to fp32 for mhsa. Other fixes
diaconuccalin Jul 25, 2024
75cdcd2
Add L1Loss in both FP32 and FP16
dnadalini Jul 25, 2024
ce0fd65
Add C implementation of berHu loss, still without test
dnadalini Jul 25, 2024
9c4f745
Switched to relative error check for the mhsa fp16 test. Writing of t…
diaconuccalin Jul 25, 2024
af7c2d6
Add test for berHu loss, loss computation not perfectly matching pytorch
dnadalini Jul 26, 2024
9cdeacf
Made temp buffer size computation dynamic. Removed unused h_buffer. O…
diaconuccalin Jul 27, 2024
5bd75ba
Merge pull request #1 from pulp-platform/pr/SparseUpdate
Dequino Jul 30, 2024
859af2a
Enhance defaults for losses
dnadalini Jul 30, 2024
ae3cf78
Added dropout kernels for FP32 and FP16
Aug 1, 2024
aa4ada7
Changed the mhsa fp32 implementation to replace the previous big line…
diaconuccalin Aug 1, 2024
c75cce6
Changed the mhsa fp16 implementation to replace the previous big line…
diaconuccalin Aug 1, 2024
959cf24
Modified makefile
Aug 5, 2024
6802ead
Merge branch 'mhsa_fix' of github.com:diaconuccalin/pulp-trainlib int…
Aug 5, 2024
4fd1fd3
Merge branch 'diaconuccalin-mhsa_fix' into pulp-trainlib-dev
Aug 5, 2024
778a830
Included biases for input projection layers for the forward fp32 step…
diaconuccalin Aug 6, 2024
6a1e311
Fully included biases for input projection layers for fp32 and fp64. …
diaconuccalin Aug 6, 2024
0db8db7
Add first version of fp32 transposed convolution 2D (incomplete, to b…
dnadalini Aug 27, 2024
78136ef
Merge branch 'mhsa_fix' of github.com:diaconuccalin/pulp-trainlib int…
Sep 6, 2024
95a4f92
Merge branch 'diaconuccalin-mhsa_fix' into pulp-trainlib-dev
Sep 6, 2024
983f336
Merge branch 'pr/SparseUpdate' into pulp-trainlib-dev
Dequino Sep 6, 2024
4249fba
Merge pull request #48 from Dequino/pulp-trainlib-dev
dnadalini Sep 12, 2024
6349d73
Add forward step of transposed convolution 2d (FP32, no optimization)
dnadalini Oct 2, 2024
da982d3
Merge branch 'pr/SparseUpdate' of github.com:pulp-platform/pulp-train…
dnadalini Oct 2, 2024
de5d70f
Fix pulp_train.h
dnadalini Oct 2, 2024
4b44e2b
Fix forward step of transposed conv2d (FP32, no opt)
dnadalini Oct 2, 2024
9ff2b60
Add (partially tested) weight grad of transposed conv2d (FP32, no opt)
dnadalini Oct 2, 2024
d32f938
Add input grad (no padding) for FP32 transposed convolution 2d (no opt)
dnadalini Oct 3, 2024
482de08
Add first working (but unoptimized) transposed convolution 2D kernels…
dnadalini Oct 10, 2024
0cac308
Optimize the weight grad step of FP32 naive transposed conv2d
dnadalini Oct 15, 2024
5ddab65
Parallelize and optimize the forward and backward grad steps of the t…
dnadalini Oct 15, 2024
f36fff1
Add parallelization and code optimizations for all 3 steps of transpo…
dnadalini Oct 15, 2024
1e49fd0
Add fp16 transposed convolution (not optimized more than fp32) and re…
dnadalini Oct 15, 2024
95ea4a0
Add bilinear and nearest neighbour interpolations (fp32, parallel)
dnadalini Oct 16, 2024
cf1b8e7
Add fp16 version of interpolations and fix test to support fp16
dnadalini Oct 16, 2024
662c9bd
Fix minors in test_losses
dnadalini Oct 29, 2024
7f3fbb8
Fix minors in test_losses
dnadalini Oct 29, 2024
1cc482a
Fix test for activations and sizes of convolutions for testing
dnadalini Nov 9, 2024
69d17dd
Fix test for the activation functions
dnadalini Nov 11, 2024
28 changes: 25 additions & 3 deletions README.md
@@ -151,13 +151,15 @@ PULP-TrainLib's repository is organized with these branches:

> Note: checked are complete, unchecked are ongoing

PULP-TrainLib:

- [X] Forward passes for DepthWise, PointWise Convolutions and Conv2D, Fully-Connected (FP32, FP16)
- [X] Weight gradients for DepthWise, PointWise Convolutions and Conv2D, Fully-Connected (FP32, FP16)
- [X] Input gradients for DepthWise, PointWise Convolutions and Conv2D, Fully-Connected (FP32, FP16)
- [X] CWH data layout for DepthWise, PointWise and 2D Convolutions (FP32, FP16)
- [X] HWC data layout for PointWise Convolution (FP32, FP16) and 2D Convolutions (FP32, FP16)
- [X] stride and padding (only naive 2D Convolutions, without im2col+mm optimization)
- [X] ReLU, Sigmoid activation functions (FP32, FP16)
- [X] Stride and Padding (only naive 2D Convolutions, without im2col+mm optimization)
- [X] ReLU, Leaky ReLU, Sigmoid activation functions (FP32, FP16)
- [X] Gradient Descent optimizer (FP32, FP16)
- [X] Max and Average Pooling (FP32, FP16)
- [X] RNN training primitives (FP32)
@@ -173,7 +175,23 @@ PULP-TrainLib's repository is organized with these branches:
- [ ] Biases for DepthWise and PointWise Convolutions (FP32, FP16)
- [ ] Sparse Update (layer-wise) in TrainLib_Deployer
- [ ] Partial Im2Col / Im2Row for Conv2D (FP32, FP16)
- [ ] Integration of biases in TrainLib_Deployer (Conv2D)

TrainLib_Deployer:

- [X] No Buffer and Single Buffer mode, supporting layer-wise execution (tiling not supported)
- [X] Conv2D, PointWise, DepthWise Convolutions, Fully-Connected support (FP32, FP16)
- [X] Average and Max Pooling (FP32, FP16)
- [X] ReLU, LeakyReLU, Sigmoid Activations (FP32, FP16)
- [X] InstanceNorm (FP32, FP16)
- [X] Residual Connections (FP32, FP16, only no buffer mode)
- [ ] Residual Connections (FP32, FP16, single buffer mode)
- [X] SGD Optimizer (FP32, FP16)
- [ ] FP32-FP16 Layer-Wise Mixed Precision Mode
- [X] Layer-Wise Sparse Update
- [X] CHW Data Layout
- [ ] HWC Data Layout
- [X] Online Learning (batch size = 1)
- [ ] Mini-Batch Learning (batch size > 1)

# Known bugs / issues (open for contributions)

@@ -185,6 +203,10 @@ PULP-TrainLib's repository is organized with these branches:
- Missing integration of sigmoid function in TrainLib_Deployer
- Performance of the FP16 sigmoid may need to be optimized with an FP16 exponential (e.g., https://github.com/0xBYTESHIFT/fp16/blob/master/include/half/half.hpp)

TrainLib_Deployer:
- Training does not converge in DNNs generated with TrainLib_Deployer if the last layer is not updated
- With no single/double buffering, not updating a PW layer in a sparse update results in wrong backward computation


# Contributors

86 changes: 68 additions & 18 deletions lib/include/pulp_act_fp16.h
@@ -12,24 +12,33 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Authors: Davide Nadalini, Leonardo Ravaglia, Calin Diaconu
*
* Activation functions configuration structure
*/

/**
* Authors: Davide Nadalini, Leonardo Ravaglia
*/

/**
* Activation functions configuration structure
* @brief Structure for activation functions
* @param input blob structure for the input data of the activation layer
* @param output blob structure for the output data of the activation layer
*/
struct act_args_fp16 {
struct blob_fp16 * input;
struct blob_fp16 * output;
};


/**
* @brief Structure for activation functions
* @brief Structure for leaky relu activation functions
* @param input blob structure for the input data of the activation layer
* @param output blob structure for the output data of the activation layer
*/
struct act_args_fp16 {
struct leakyrelu_args_fp16 {
struct blob_fp16 * input;
struct blob_fp16 * output;
fp16 negative_slope;
};

/**
@@ -39,17 +48,22 @@ struct act_args_fp16 {
* @param output pointer to output vector
* @param sum final sum value of all exponentials
*/
struct softmax_args_fp16{
struct blob_fp16 * input;
struct blob_fp16 * output;
int L;
int n_heads;
fp16 * maxes;
fp16 * sums;
struct softmax_args_fp16 {
fp16 *input_data;        // input activations
fp16 *input_diff;        // gradient w.r.t. the input activations
fp16 *output_data;       // output activations
fp16 *output_diff;       // gradient w.r.t. the output activations
int H;                   // height (rows) of the softmax input
int W;                   // width (columns) of the softmax input
int L;                   // sequence length
int n_heads;             // number of attention heads
fp16 *global_max;        // support buffer for the max reduction
fp16 *partial_exp_sum;   // support buffer for the partial sums of exponentials
fp16 *maxes;             // row-wise maxima (numerically stable softmax)
fp16 *sums;              // row-wise sums of exponentials
};



/**
* Activation functions, both FW and BW
**/
@@ -62,54 +76,88 @@ struct softmax_args_fp16{
*/
void pulp_sigmoid_fp16_fw_cl( void * act_args );


/**
* @brief Backward pass function.
* @param input Input for sigmoid.
* @param output Output of sigmoid.
*/
void pulp_sigmoid_fp16_bw_cl( void * act_args );


/**
* @brief Core function to implement the forward of sigmoid (allows parallelization, parallelize with pi_cl_team_fork(NUM_CORES, sigmoid_core_fw_fp16, &args)).
* @param act_args Input and output data (data only will be used)
*/
void sigmoid_core_fw_fp16( void * act_args );


/**
* @brief Core function to implement the backward of sigmoid (allows parallelization, parallelize with pi_cl_team_fork(NUM_CORES, sigmoid_core_bw_fp16, &args)).
* @param act_args Input and output data (gradients only will be used)
*/
void sigmoid_core_bw_fp16( void * act_args );
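/*
 * Usage sketch (added for illustration, not part of this header): configure an
 * act_args_fp16 structure and run the forward sigmoid on the cluster. The
 * blob_fp16 field names (data, dim) and the example function itself are
 * assumptions based on the rest of PULP-TrainLib, not definitions from this PR;
 * the pi_cl_team_fork() call follows the pattern suggested in the comments above.
 */
static inline void example_sigmoid_fw_fp16(fp16 *in_buf, fp16 *out_buf, int size)
{
    struct blob_fp16 in_blob, out_blob;
    struct act_args_fp16 args;

    // Wrap the raw activation buffers into blobs (only data and dim are used here)
    in_blob.data  = in_buf;   in_blob.dim  = size;
    out_blob.data = out_buf;  out_blob.dim = size;

    args.input  = &in_blob;
    args.output = &out_blob;

    // Parallelize the core forward kernel over the cluster cores
    // (alternatively, call pulp_sigmoid_fp16_fw_cl(&args) directly)
    pi_cl_team_fork(NUM_CORES, sigmoid_core_fw_fp16, &args);
}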



/**
* @brief Forward pass function. Configure and pass a act_args structure pointer as argument.
* @param input Input for relu.
* @param output Output of relu.
*/
void pulp_relu_fp16_fw_cl( void * act_args_fp16 );


/**
* @brief Bakcward pass function.
* @brief Backward pass function.
* @param input Input for relu.
* @param output Output of relu.
*/
void pulp_relu_fp16_bw_cl( void * act_args_fp16 );


/**
* @brief Core function to implement the forward of ReLU (allows parallelization, parallelize with pi_cl_team_fork(NUM_CORES, relu_core_fw_fp16, &args)).
* @param act_args Input and output data (data only will be used)
*/
void relu_core_fw_fp16( void * act_args_fp16 );


/**
* @brief Core function to implement the backward of ReLU (allows parallelization, parallelize with pi_cl_team_fork(NUM_CORES, relu_core_bw_fp16, &args)).
* @param act_args Input and output data (gradients only will be used)
*/
void relu_core_bw_fp16( void * act_args_fp16 );


/**
* @brief Forward pass function. Configure and pass a leakyrelu_args structure pointer as argument.
* @param input Input for leaky relu.
* @param output Output of leaky relu.
*/
void pulp_leakyrelu_fp16_fw_cl( void * leakyrelu_args_fp16 );

/**
* @brief Backward pass function.
* @param input Input for leaky relu.
* @param output Output of leaky relu.
*/
void pulp_leakyrelu_fp16_bw_cl( void * leakyrelu_args_fp16 );

/**
* @brief Core function to implement the forward of Leaky ReLU (allows parallelization, parallelize with pi_cl_team_fork(NUM_CORES, leakyrelu_core_fw_fp16, &leakyrelu_args)).
* @param leakyrelu_args_fp16 Input and output data (data only will be used)
*/
void leakyrelu_core_fw_fp16( void * leakyrelu_args_fp16 );

/**
* @brief Core function to implement the backward of Leaky ReLU (allows parallelization, parallelize with pi_cl_team_fork(NUM_CORES, leakyrelu_core_bw_fp16, &leakyrelu_args)).
* @param leakyrelu_args_fp16 Input and output data (gradients only will be used)
*/
void leakyrelu_core_bw_fp16( void * leakyrelu_args_fp16 );
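/*
 * Usage sketch (added for illustration, not part of this header): the Leaky ReLU
 * entry points take a leakyrelu_args_fp16 structure, which extends the plain
 * activation arguments with the negative_slope coefficient. The blob_fp16 field
 * names (data, dim), the example function and the chosen slope value are
 * assumptions, not definitions from this PR.
 */
static inline void example_leakyrelu_fw_fp16(fp16 *in_buf, fp16 *out_buf, int size)
{
    struct blob_fp16 in_blob, out_blob;
    struct leakyrelu_args_fp16 args;

    in_blob.data  = in_buf;   in_blob.dim  = size;
    out_blob.data = out_buf;  out_blob.dim = size;

    args.input          = &in_blob;
    args.output         = &out_blob;
    args.negative_slope = (fp16) 0.01f;   // slope applied to negative inputs

    // Forward pass entry point (the core kernel can also be forked manually:
    //   pi_cl_team_fork(NUM_CORES, leakyrelu_core_fw_fp16, &args); )
    pulp_leakyrelu_fp16_fw_cl(&args);
}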





/**
* @brief Forward pass function.
@@ -118,16 +166,18 @@ void relu_core_bw_fp16( void * act_args_fp16 );
*/
void pulp_softmax_fp16_fw_cl( void * act_args_fp16 );


/**
* @brief Bakcward pass function.
* @brief Backward pass function.
* @param input Input for softmax.
* @param output Output of softmax.
*/
void pulp_softmax_fp16_bw_cl( void * act_args_fp16 );


/**
* @brief Forward pass function. Configure and pass a act_args structure pointer as argument.
* @param input Input for gelu.
* @param output Output of gelu.
*/
void pulp_gelu_fp16_fw_cl( void* act_args_fp16);
void pulp_gelu_fp16_fw_cl( void* act_args_fp16);