linting

pulp-platform · Nov 28, 2024 · da662e1 · da662e1
1 parent 1bb7d83
commit da662e1
Show file tree

Hide file tree

Showing 19 changed files with 330 additions and 226 deletions.
diff --git a/Deeploy/Targets/Snitch/Deployer.py b/Deeploy/Targets/Snitch/Deployer.py
@@ -29,20 +29,13 @@
 import onnx_graphsurgeon as gs
 
 from Deeploy.AbstractDataTypes import Pointer
-from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
-
-# from Deeploy.NetworkDeployers.SignPropDeployer import SignPropDeployer
 from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer
-
-# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import ReshapeConstOptPass, \
-#    TransposeConstOptPass, TransposeMergePass, TransposeSplitPass
+from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
+    NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
+from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \
     TransposeMergePass, TransposeSplitPass
 
-# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import NCHWtoNHWCPass, \
-#    RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
-from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
-    NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
 
 class SnitchDeployer(SignPropDeployer):
 

diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py
@@ -28,34 +28,17 @@
 
 import numpy as np
 
-# from Deeploy.Bindings.BasicBindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \
-#    BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding
-
-from Deeploy.Targets.Generic.Bindings import BasicGatherBindings,BasicMatMulBinding, BasicPad1DBindings, \
-    BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding
-
 from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
     StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
-
-# from Deeploy.Layers.BasicLayers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer
+from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \
+    BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding
 from Deeploy.Targets.Generic.Layers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer
-
-# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import IntegerDivRequantMergePass, \
-#    MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
-#    SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass
-
-from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
-    MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
-    SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass
-
-# from Deeploy.Parsers.BasicParsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \
-#    UnsqueezeParser
 from Deeploy.Targets.Generic.Parsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \
     UnsqueezeParser
-
-# from Deeploy.Templates.BasicTemplates import AllocateTemplate as BasicAllocateTemplate
 from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
-
+from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
+    MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
+    SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass
 from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate
 
 GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings)

diff --git a/DeeployTest/Platforms/Snitch/main.c b/DeeployTest/Platforms/Snitch/main.c
@@ -44,8 +44,10 @@ int main(void) {
 
   if (snrt_is_dm_core()) {
 #ifndef CI
-    printf("Network running on %d of %d compute cores (+%d DM cores) on %d clusters\r\n", num_compute_cores,
-           snrt_global_compute_core_num(), snrt_cluster_num() * snrt_cluster_dm_core_num(), snrt_cluster_num());
+    printf("Network running on %d of %d compute cores (+%d DM cores) on %d "
+           "clusters\r\n",
+           num_compute_cores, snrt_global_compute_core_num(),
+           snrt_cluster_num() * snrt_cluster_dm_core_num(), snrt_cluster_num());
 #endif
 
     printf("Initializing...\r\n");
@@ -55,13 +57,13 @@ int main(void) {
 #ifndef CI
     for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
       printf("testInputVector%d @ %p\r\n", buf, testInputVector[buf]);
-      printf("DeeployNetwork_input_%d @ %p and %u elements\r\n", buf, DeeployNetwork_inputs[buf],
-             DeeployNetwork_inputs_bytes[buf]);
+      printf("DeeployNetwork_input_%d @ %p and %u elements\r\n", buf,
+             DeeployNetwork_inputs[buf], DeeployNetwork_inputs_bytes[buf]);
     }
     for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
       printf("testInputVector%d @ %p\r\n", buf, testOutputVector[buf]);
-      printf("DeeployNetwork_output_%d @ %p and %u elements\r\n", buf, DeeployNetwork_outputs[buf],
-             DeeployNetwork_outputs_bytes[buf]);
+      printf("DeeployNetwork_output_%d @ %p and %u elements\r\n", buf,
+             DeeployNetwork_outputs[buf], DeeployNetwork_outputs_bytes[buf]);
     }
 
     printf("Initialized\r\n");
@@ -71,7 +73,8 @@ int main(void) {
 
     // WIESEP: Copy inputs to allocated memory
     for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
-      snrt_dma_start_1d(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]);
+      snrt_dma_start_1d(DeeployNetwork_inputs[buf], testInputVector[buf],
+                        DeeployNetwork_inputs_bytes[buf]);
     }
     snrt_dma_wait_all();
 

diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py
@@ -41,7 +41,6 @@
     NeurekaPlatform
 from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
 from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform
-
 from Deeploy.Targets.Snitch.Deployer import SnitchDeployer
 from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform
 
@@ -82,7 +81,6 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]:
     elif platformName == "Snitch":
         Platform = SnitchPlatform()
 
-
     else:
         raise RuntimeError(f"Deployment platform {platformName} is not implemented")
 
@@ -211,7 +209,6 @@ def mapDeployer(platform: DeploymentPlatform,
                                 default_channels_first = default_channels_first,
                                 deeployStateDir = deeployStateDir)
 
-
     elif isinstance(platform, (SnitchPlatform)):
         if loweringOptimizer is None:
             loweringOptimizer = SnitchOptimizer
@@ -228,7 +225,6 @@ def mapDeployer(platform: DeploymentPlatform,
                                   default_channels_first = default_channels_first,
                                   deeployStateDir = deeployStateDir)
 
-
     else:
         raise RuntimeError(f"Deployer for platform {platform} is not implemented")
 

diff --git a/TargetLibraries/Snitch/inc/CycleCounter.h b/TargetLibraries/Snitch/inc/CycleCounter.h
@@ -44,7 +44,8 @@ void StopTimer(void);
 // Returns the current number of cycles according to the internal cycle counter
 uint32_t getCycles(void);
 
-// Returns the current number of instructions according to the internal instructions counter
+// Returns the current number of instructions according to the internal
+// instructions counter
 uint32_t getInstr(void);
 
 #endif //__DEEPLOY_MATH_CYCLE_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/Gemm.h b/TargetLibraries/Snitch/inc/kernel/Gemm.h
@@ -57,19 +57,26 @@
  * simd       = no
  * cleanup    = yes
  */
-void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                             int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, uint32_t M, uint32_t N,
-                             uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t A_offset,
-                             int32_t B_offset, int32_t C_offset, int32_t Y_offset);
+void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
+                             int8_t const *__restrict__ pSrcB,
+                             int32_t const *__restrict__ pSrcC,
+                             int32_t *__restrict__ pDstY, uint32_t M,
+                             uint32_t N, uint32_t P, int32_t alpha,
+                             int32_t beta, int32_t transA, int32_t transB,
+                             int32_t A_offset, int32_t B_offset,
+                             int32_t C_offset, int32_t Y_offset);
 
 // Mapper Functions
 static inline void __attribute__((always_inline))
-Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC,
-                 int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta,
-                 int32_t transA, int32_t transB, int32_t A_offset, int32_t B_offset, int32_t C_offset,
-                 int32_t Y_offset) {
-  Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, A_offset, B_offset,
-                          C_offset, Y_offset);
+Gemm_parallel_s8(int8_t const *__restrict__ pSrcA,
+                 int8_t const *__restrict__ pSrcB,
+                 int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY,
+                 uint32_t M, uint32_t N, uint32_t P, int32_t alpha,
+                 int32_t beta, int32_t transA, int32_t transB, int32_t A_offset,
+                 int32_t B_offset, int32_t C_offset, int32_t Y_offset) {
+  Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta,
+                          transA, transB, A_offset, B_offset, C_offset,
+                          Y_offset);
 }
 
 #endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/MatMul.h b/TargetLibraries/Snitch/inc/kernel/MatMul.h
@@ -58,8 +58,10 @@
  * simd       = no
  * cleanup    = yes
  */
-void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                               int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t A_offset,
+void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
+                               int8_t const *__restrict__ pSrcB,
+                               int32_t *__restrict__ pDstC, uint32_t M,
+                               uint32_t N, uint32_t P, int32_t A_offset,
                                int32_t B_offset, int32_t output_offset);
 
 /*
@@ -71,8 +73,10 @@ void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_
  * simd       = no
  * cleanup    = no
  */
-void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                            int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
+void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
+                                            int8_t const *__restrict__ pSrcB,
+                                            int32_t *__restrict__ pDstC,
+                                            uint32_t M, uint32_t N, uint32_t P);
 
 /*
  * Matrix multiplication ----------------------------------
@@ -83,29 +87,39 @@ void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, in
  * simd       = no
  * cleanup    = no
  */
-void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                                   int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
-                                                   int32_t A_offset, int32_t B_offset, int32_t output_offset);
+void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(
+    int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
+    int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
+    int32_t A_offset, int32_t B_offset, int32_t output_offset);
 
 // Mapper Functions
 static inline void __attribute__((always_inline))
-MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t *__restrict__ pDstC,
-                   uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset) {
-  MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
+MatMul_parallel_s8(int8_t const *__restrict__ pSrcA,
+                   int8_t const *__restrict__ pSrcB,
+                   int32_t *__restrict__ pDstC, uint32_t M, uint32_t N,
+                   uint32_t P, int32_t A_offset, int32_t B_offset,
+                   int32_t output_offset) {
+  MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset,
+                            output_offset);
 }
 
-static inline void __attribute__((always_inline)) MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
-                                                                                  int8_t const *__restrict__ pSrcB,
-                                                                                  int32_t *__restrict__ pDstC,
-                                                                                  uint32_t M, uint32_t N, uint32_t P) {
+static inline void __attribute__((always_inline))
+MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
+                                int8_t const *__restrict__ pSrcB,
+                                int32_t *__restrict__ pDstC, uint32_t M,
+                                uint32_t N, uint32_t P) {
   MatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P);
 }
 
 static inline void __attribute__((always_inline))
-MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                       int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
-                                       int32_t A_offset, int32_t B_offset, int32_t output_offset) {
-  MatMul_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
+MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
+                                       int8_t const *__restrict__ pSrcB,
+                                       int32_t *__restrict__ pDstC, uint32_t M,
+                                       uint32_t N, uint32_t P, int32_t A_offset,
+                                       int32_t B_offset,
+                                       int32_t output_offset) {
+  MatMul_offset_unrolled_2x2_parallel_s8_rv32im(
+      pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
 }
 
 /******************************************************************************/
@@ -121,8 +135,11 @@ MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t
  * simd       = no
  * cleanup    = no
  */
-void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB,
-                                             int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
+void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
+                                             int16_t const *__restrict__ pSrcB,
+                                             int32_t *__restrict__ pDstC,
+                                             uint32_t M, uint32_t N,
+                                             uint32_t P);
 
 /******************************************************************************/
 /*                        Matrix Multiplication (32bit)                       */
@@ -139,7 +156,10 @@ void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
  * other      = loads/stores explicitly written in asm
  *              for optimal register utilization
  */
-void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, int32_t const *__restrict__ pSrcB,
-                                             int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
+void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA,
+                                             int32_t const *__restrict__ pSrcB,
+                                             int32_t *__restrict__ pDstC,
+                                             uint32_t M, uint32_t N,
+                                             uint32_t P);
 
 #endif //__DEEPLOY_MATH_MATMUL_KERNEL_HEADER_