From da662e1f5c37ffe7c129e13a26deac89fa783737 Mon Sep 17 00:00:00 2001 From: tahaelbayad Date: Thu, 28 Nov 2024 18:27:10 +0100 Subject: [PATCH] linting linting --- Deeploy/Targets/Snitch/Deployer.py | 13 +-- Deeploy/Targets/Snitch/Platform.py | 27 ++---- DeeployTest/Platforms/Snitch/main.c | 17 ++-- DeeployTest/testUtils/platformMapping.py | 4 - TargetLibraries/Snitch/inc/CycleCounter.h | 3 +- TargetLibraries/Snitch/inc/kernel/Gemm.h | 27 +++--- TargetLibraries/Snitch/inc/kernel/MatMul.h | 64 +++++++++----- TargetLibraries/Snitch/inc/kernel/RQGemm.h | 65 +++++++------- TargetLibraries/Snitch/inc/kernel/RQMatMul.h | 80 +++++++++++------- .../Snitch/inc/kernel/UniformRequantShift.h | 30 ++++--- TargetLibraries/Snitch/inc/kernel/iSoftmax.h | 12 ++- TargetLibraries/Snitch/inc/macros.h | 2 +- TargetLibraries/Snitch/src/MatMul_s16.c | 7 +- TargetLibraries/Snitch/src/MatMul_s32.c | 7 +- TargetLibraries/Snitch/src/MatMul_s8.c | 23 +++-- TargetLibraries/Snitch/src/RQGemm_s8.c | 38 +++++---- TargetLibraries/Snitch/src/RQMatMul_s8.c | 35 ++++---- .../Snitch/src/UniformRequantShift.c | 84 ++++++++++++------- TargetLibraries/Snitch/src/iSoftmax.c | 18 ++-- 19 files changed, 330 insertions(+), 226 deletions(-) diff --git a/Deeploy/Targets/Snitch/Deployer.py b/Deeploy/Targets/Snitch/Deployer.py index 778f468..ff32066 100644 --- a/Deeploy/Targets/Snitch/Deployer.py +++ b/Deeploy/Targets/Snitch/Deployer.py @@ -29,20 +29,13 @@ import onnx_graphsurgeon as gs from Deeploy.AbstractDataTypes import Pointer -from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer - -# from Deeploy.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer - -# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import ReshapeConstOptPass, \ -# TransposeConstOptPass, TransposeMergePass, TransposeSplitPass +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeSplitPass -# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import NCHWtoNHWCPass, \ -# RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass -from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass class SnitchDeployer(SignPropDeployer): diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index e22a03d..50c1747 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -28,34 +28,17 @@ import numpy as np -# from Deeploy.Bindings.BasicBindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \ -# BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding - -from Deeploy.Targets.Generic.Bindings import BasicGatherBindings,BasicMatMulBinding, BasicPad1DBindings, \ - BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding - from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer - -# from Deeploy.Layers.BasicLayers import 
GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer +from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \ + BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding from Deeploy.Targets.Generic.Layers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer - -# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import IntegerDivRequantMergePass, \ -# MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \ -# SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass - -from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \ - MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \ - SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass - -# from Deeploy.Parsers.BasicParsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \ -# UnsqueezeParser from Deeploy.Targets.Generic.Parsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \ UnsqueezeParser - -# from Deeploy.Templates.BasicTemplates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate - +from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \ + MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \ + SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings) diff --git a/DeeployTest/Platforms/Snitch/main.c b/DeeployTest/Platforms/Snitch/main.c index 78ac940..9ed91aa 100644 --- a/DeeployTest/Platforms/Snitch/main.c +++ b/DeeployTest/Platforms/Snitch/main.c @@ -44,8 +44,10 @@ int main(void) { if (snrt_is_dm_core()) { #ifndef CI - printf("Network running on %d of %d compute cores (+%d DM cores) on %d clusters\r\n", num_compute_cores, - snrt_global_compute_core_num(), snrt_cluster_num() * snrt_cluster_dm_core_num(), snrt_cluster_num()); + printf("Network running on %d of %d compute cores (+%d DM cores) on %d " + "clusters\r\n", + num_compute_cores, snrt_global_compute_core_num(), + snrt_cluster_num() * snrt_cluster_dm_core_num(), snrt_cluster_num()); #endif printf("Initializing...\r\n"); @@ -55,13 +57,13 @@ int main(void) { #ifndef CI for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { printf("testInputVector%d @ %p\r\n", buf, testInputVector[buf]); - printf("DeeployNetwork_input_%d @ %p and %u elements\r\n", buf, DeeployNetwork_inputs[buf], - DeeployNetwork_inputs_bytes[buf]); + printf("DeeployNetwork_input_%d @ %p and %u elements\r\n", buf, + DeeployNetwork_inputs[buf], DeeployNetwork_inputs_bytes[buf]); } for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { printf("testInputVector%d @ %p\r\n", buf, testOutputVector[buf]); - printf("DeeployNetwork_output_%d @ %p and %u elements\r\n", buf, DeeployNetwork_outputs[buf], - DeeployNetwork_outputs_bytes[buf]); + printf("DeeployNetwork_output_%d @ %p and %u elements\r\n", buf, + DeeployNetwork_outputs[buf], DeeployNetwork_outputs_bytes[buf]); } printf("Initialized\r\n"); @@ -71,7 +73,8 @@ int main(void) { // WIESEP: Copy inputs to allocated memory for (uint32_t buf = 0; buf < 
DeeployNetwork_num_inputs; buf++) { - snrt_dma_start_1d(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]); + snrt_dma_start_1d(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); } snrt_dma_wait_all(); diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 05c2795..4e24995 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -41,7 +41,6 @@ NeurekaPlatform from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform - from Deeploy.Targets.Snitch.Deployer import SnitchDeployer from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform @@ -82,7 +81,6 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Snitch": Platform = SnitchPlatform() - else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -211,7 +209,6 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, (SnitchPlatform)): if loweringOptimizer is None: loweringOptimizer = SnitchOptimizer @@ -228,7 +225,6 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - else: raise RuntimeError(f"Deployer for platform {platform} is not implemented") diff --git a/TargetLibraries/Snitch/inc/CycleCounter.h b/TargetLibraries/Snitch/inc/CycleCounter.h index 092e583..f197055 100644 --- a/TargetLibraries/Snitch/inc/CycleCounter.h +++ b/TargetLibraries/Snitch/inc/CycleCounter.h @@ -44,7 +44,8 @@ void StopTimer(void); // Returns the current number of cycles according to the internal cycle counter uint32_t getCycles(void); -// Returns the current number of instructions according to the internal instructions counter +// Returns the current number of instructions according to the internal +// instructions counter uint32_t getInstr(void); #endif //__DEEPLOY_MATH_CYCLE_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Gemm.h b/TargetLibraries/Snitch/inc/kernel/Gemm.h index 70835c3..f320942 100644 --- a/TargetLibraries/Snitch/inc/kernel/Gemm.h +++ b/TargetLibraries/Snitch/inc/kernel/Gemm.h @@ -57,19 +57,26 @@ * simd = no * cleanup = yes */ -void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, - uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset); +void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, + int32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, + int32_t beta, int32_t transA, int32_t transB, + int32_t A_offset, int32_t B_offset, + int32_t C_offset, int32_t Y_offset); // Mapper Functions static inline void __attribute__((always_inline)) -Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC, - int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, - int32_t transA, int32_t transB, int32_t A_offset, int32_t B_offset, int32_t C_offset, - int32_t Y_offset) { - 
Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, A_offset, B_offset, - C_offset, Y_offset); +Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t P, int32_t alpha, + int32_t beta, int32_t transA, int32_t transB, int32_t A_offset, + int32_t B_offset, int32_t C_offset, int32_t Y_offset) { + Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, + transA, transB, A_offset, B_offset, C_offset, + Y_offset); } #endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/MatMul.h b/TargetLibraries/Snitch/inc/kernel/MatMul.h index 75ecb83..be0cdea 100644 --- a/TargetLibraries/Snitch/inc/kernel/MatMul.h +++ b/TargetLibraries/Snitch/inc/kernel/MatMul.h @@ -58,8 +58,10 @@ * simd = no * cleanup = yes */ -void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, +void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset); /* @@ -71,8 +73,10 @@ void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ * simd = no * cleanup = no */ -void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P); +void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, uint32_t P); /* * Matrix multiplication ---------------------------------- @@ -83,29 +87,39 @@ void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, in * simd = no * cleanup = no */ -void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t A_offset, int32_t B_offset, int32_t output_offset); +void MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t B_offset, int32_t output_offset); // Mapper Functions static inline void __attribute__((always_inline)) -MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t *__restrict__ pDstC, - uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset) { - MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset); +MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t A_offset, int32_t B_offset, + int32_t output_offset) { + MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, + output_offset); } -static inline void __attribute__((always_inline)) MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, - int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, - uint32_t M, uint32_t N, uint32_t P) { +static inline void __attribute__((always_inline)) +MatMul_unrolled_2x2_parallel_s8(int8_t 
const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P) { MatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P); } static inline void __attribute__((always_inline)) -MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t A_offset, int32_t B_offset, int32_t output_offset) { - MatMul_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset); +MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, + int32_t B_offset, + int32_t output_offset) { + MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset); } /******************************************************************************/ @@ -121,8 +135,11 @@ MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t * simd = no * cleanup = no */ -void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P); +void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, + int16_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P); /******************************************************************************/ /* Matrix Multiplication (32bit) */ @@ -139,7 +156,10 @@ void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, * other = loads/stores explicitly written in asm * for optimal register utilization */ -void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, int32_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P); +void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, + int32_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P); #endif //__DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/RQGemm.h b/TargetLibraries/Snitch/inc/kernel/RQGemm.h index ae5e366..e86b579 100644 --- a/TargetLibraries/Snitch/inc/kernel/RQGemm.h +++ b/TargetLibraries/Snitch/inc/kernel/RQGemm.h @@ -57,12 +57,13 @@ * simd = no * cleanup = yes */ -void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, - uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset, int8_t output_min, - int8_t output_max); +void RQGemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max); /* * General Requantized Matrix multiplication 
---------------------------------- @@ -73,33 +74,41 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ * simd = no * cleanup = no */ -void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, - uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, - int32_t transA, int32_t transB, int32_t *mul, int32_t *add, - int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset); +void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset); // Mapper Functions -static inline void __attribute__((always_inline)) -RQGemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, - int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul, int32_t *add, - int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, int32_t B_offset, - int32_t C_offset, int32_t Y_offset, int8_t output_min, int8_t output_max) { - RQGemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, add, log2D, rounding, - per_row_quant, A_offset, B_offset, C_offset, Y_offset, output_min, output_max); +static inline void __attribute__((always_inline)) RQGemm_parallel_s8( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max) { + RQGemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, + transA, transB, mul, add, log2D, rounding, + per_row_quant, A_offset, B_offset, C_offset, + Y_offset, output_min, output_max); } // Mapper Functions -static inline void __attribute__((always_inline)) RQGemm_offset_unrolled_2x2_parallel_s8( - int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC, - int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, - int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset) { - RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, - add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset, - Y_offset); +static inline void __attribute__((always_inline)) +RQGemm_offset_unrolled_2x2_parallel_s8( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, 
bool rounding,
+    bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset,
+    int32_t Y_offset) {
+  RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(
+      pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul,
+      add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset,
+      Y_offset);
 }
 
 #endif //__DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/RQMatMul.h b/TargetLibraries/Snitch/inc/kernel/RQMatMul.h
index 72eec46..fdac512 100644
--- a/TargetLibraries/Snitch/inc/kernel/RQMatMul.h
+++ b/TargetLibraries/Snitch/inc/kernel/RQMatMul.h
@@ -32,8 +32,9 @@
 
 #include "DeeploySnitchMath.h"
 
-/* This library implements the requantiyed matrix multiplication for several data widths
- * in multiple different ways. The functions all follow the following format:
+/* This library implements the requantized matrix multiplication for several
+ * data widths in multiple different ways. The functions all use the
+ * following format:
  *
  * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
  * C = AB
@@ -56,10 +57,14 @@
  * simd = no
  * cleanup = yes
  */
-void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                 int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul,
-                                 int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset,
-                                 int32_t B_offset, int32_t output_offset, int8_t output_min, int8_t output_max);
+void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
+                                 int8_t const *__restrict__ pSrcB,
+                                 int8_t *__restrict__ pDstC, uint32_t M,
+                                 uint32_t N, uint32_t P, int32_t *mul,
+                                 int32_t *add, int32_t log2D, bool rounding,
+                                 bool per_row_quant, int32_t A_offset,
+                                 int32_t B_offset, int32_t output_offset,
+                                 int8_t output_min, int8_t output_max);
 
 /*
  * Matrix multiplication ----------------------------------
@@ -70,10 +75,11 @@
  * simd = no
  * cleanup = no
  */
-void RQMatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                              int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
-                                              int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
-                                              bool per_row_quant);
+void RQMatMul_unrolled_2x2_parallel_s8_rv32im(
+    int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
+    int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
+    int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
+    bool per_row_quant);
 
 /*
  * Matrix multiplication ----------------------------------
@@ -84,36 +90,50 @@
  * simd = no
  * cleanup = no
  */
-void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                                     int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
-                                                     int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
-                                                     bool per_row_quant, int32_t A_offset, int32_t B_offset,
-                                                     int32_t output_offset);
+void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(
+    int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
+    int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
+    int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
+    bool per_row_quant, int32_t A_offset, int32_t B_offset,
+    int32_t output_offset);
 
 // Mapper Functions
 static inline void __attribute__((always_inline))
-RQMatMul_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const 
*__restrict__ pSrcB, int8_t *__restrict__ pDstC, - uint32_t M, uint32_t N, uint32_t P, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, - bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t output_offset, int8_t output_min, +RQMatMul_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t *mul, int32_t *add, int32_t log2D, + bool rounding, bool per_row_quant, int32_t A_offset, + int32_t B_offset, int32_t output_offset, int8_t output_min, int8_t output_max) { - RQMatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant, A_offset, - B_offset, output_offset, output_min, output_max); + RQMatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, + rounding, per_row_quant, A_offset, B_offset, + output_offset, output_min, output_max); } static inline void __attribute__((always_inline)) -RQMatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant) { - RQMatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant); +RQMatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant) { + RQMatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, + add, log2D, rounding, per_row_quant); } static inline void __attribute__((always_inline)) -RQMatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, - int32_t A_offset, int32_t B_offset, int32_t output_offset) { - RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, - per_row_quant, A_offset, B_offset, output_offset); +RQMatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, + bool rounding, bool per_row_quant, + int32_t A_offset, int32_t B_offset, + int32_t output_offset) { + RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im( + pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant, + A_offset, B_offset, output_offset); } #endif //__DEEPLOY_MATH_RQMATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h b/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h index b30c35d..79887d2 100644 --- a/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h +++ b/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h @@ -29,18 +29,26 @@ #include "DeeploySnitchMath.h" -void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, int32_t log2D, - int32_t HW, int32_t input_offset, int32_t output_offset, int8_t output_min, +void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, int8_t output_max, bool rounding); 
-void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out,
-                               int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset,
-                               int8_t output_min, int8_t output_max, bool rounding);
+void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul,
+                               int32_t add, int8_t *data_out, int32_t log2D,
+                               int32_t HW, int32_t input_offset,
+                               int32_t output_offset, int8_t output_min,
+                               int8_t output_max, bool rounding);
 
-void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out,
-                                int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset,
-                                int8_t output_min, int8_t output_max, bool rounding);
+void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul,
+                                int32_t add, int8_t *data_out, int32_t log2D,
+                                int32_t HW, int32_t input_offset,
+                                int32_t output_offset, int8_t output_min,
+                                int8_t output_max, bool rounding);
 
-void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out,
-                                int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset,
-                                int8_t output_min, int8_t output_max, bool rounding);
\ No newline at end of file
+void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul,
+                                int32_t add, int8_t *data_out, int32_t log2D,
+                                int32_t HW, int32_t input_offset,
+                                int32_t output_offset, int8_t output_min,
+                                int8_t output_max, bool rounding);
\ No newline at end of file
diff --git a/TargetLibraries/Snitch/inc/kernel/iSoftmax.h b/TargetLibraries/Snitch/inc/kernel/iSoftmax.h
index 603248a..a59c54e 100644
--- a/TargetLibraries/Snitch/inc/kernel/iSoftmax.h
+++ b/TargetLibraries/Snitch/inc/kernel/iSoftmax.h
@@ -29,7 +29,11 @@
 
 #include "DeeploySnitchMath.h"
 
-void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size,
-                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2);
-void StnichSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size,
-                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2);
+void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out,
+                         uint32_t *lastDimBuffer, uint32_t size,
+                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC,
+                         int32_t log2);
+void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out,
+                         uint32_t *lastDimBuffer, uint32_t size,
+                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC,
+                         int32_t log2);
diff --git a/TargetLibraries/Snitch/inc/macros.h b/TargetLibraries/Snitch/inc/macros.h
index aa335cf..44c708e 100644
--- a/TargetLibraries/Snitch/inc/macros.h
+++ b/TargetLibraries/Snitch/inc/macros.h
@@ -29,7 +29,7 @@
 #ifndef __DEEPLOY_MATH_MACROS_HEADER_
 #define __DEEPLOY_MATH_MACROS_HEADER_
 
-// #define log2(x) __builtin_pulp_fl1(x)
+// #define log2(x) __builtin_pulp_fl1(x)
 #define INT_LOG2(x) __builtin_ctz(x)
 
 #endif //__DEEPLOY_MATH_MACROS_HEADER_
diff --git a/TargetLibraries/Snitch/src/MatMul_s16.c b/TargetLibraries/Snitch/src/MatMul_s16.c
index 40b9ea2..f4c56ba 100644
--- a/TargetLibraries/Snitch/src/MatMul_s16.c
+++ b/TargetLibraries/Snitch/src/MatMul_s16.c
@@ -31,8 +31,11 @@
 
 #include "DeeploySnitchMath.h"
 
-void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB,
-                                             int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P) {
+void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
+                                             int16_t const *__restrict__ pSrcB,
+                                             int32_t *__restrict__ pDstC,
+                                             
uint32_t M, uint32_t N, + uint32_t P) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/MatMul_s32.c b/TargetLibraries/Snitch/src/MatMul_s32.c index fe5cbe3..4ba81a7 100644 --- a/TargetLibraries/Snitch/src/MatMul_s32.c +++ b/TargetLibraries/Snitch/src/MatMul_s32.c @@ -31,8 +31,11 @@ #include "DeeploySnitchMath.h" -void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, int32_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P) { +void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, + int32_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/MatMul_s8.c b/TargetLibraries/Snitch/src/MatMul_s8.c index 3993030..e9ad77f 100644 --- a/TargetLibraries/Snitch/src/MatMul_s8.c +++ b/TargetLibraries/Snitch/src/MatMul_s8.c @@ -30,8 +30,10 @@ */ #include "DeeploySnitchMath.h" -void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, +void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset) { uint32_t core_id = snrt_global_compute_core_idx(); @@ -46,15 +48,19 @@ void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t j = c_start; j < c_end; ++j) { int32_t sum = 0; for (uint32_t k = 0; k < N; ++k) { - sum += (int32_t)(pSrcA[i * N + k] + A_offset) * (pSrcB[k * P + j] + B_offset); + sum += (int32_t)(pSrcA[i * N + k] + A_offset) * + (pSrcB[k * P + j] + B_offset); } pDstC[i * P + j] = sum + output_offset; } } } -void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P) { +void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -96,9 +102,10 @@ void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, in } } -void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t A_offset, int32_t B_offset, int32_t output_offset) { +void MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t B_offset, int32_t output_offset) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/RQGemm_s8.c b/TargetLibraries/Snitch/src/RQGemm_s8.c index 5f939df..1850bf0 100644 --- a/TargetLibraries/Snitch/src/RQGemm_s8.c +++ b/TargetLibraries/Snitch/src/RQGemm_s8.c @@ -28,12 +28,13 @@ */ #include "DeeploySnitchMath.h" -void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ 
pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, - uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset, int8_t output_min, - int8_t output_max) { +void RQGemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -58,7 +59,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[m * N + n] + A_offset) * (pSrcB[n * P + p] + B_offset); + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[n * P + p] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -76,7 +78,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[n * M + m] + A_offset) * (pSrcB[n * P + p] + B_offset); + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[n * P + p] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -94,7 +97,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[m * N + n] + A_offset) * (pSrcB[p * N + n] + B_offset); + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[p * N + n] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -112,7 +116,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[n * M + m] + A_offset) * (pSrcB[p * N + n] + B_offset); + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[p * N + n] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -124,12 +129,13 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ } } -void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, - uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, - int32_t transA, int32_t transB, int32_t *mul, int32_t *add, - int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset) { +void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t 
*mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/RQMatMul_s8.c b/TargetLibraries/Snitch/src/RQMatMul_s8.c index 229a75a..0831415 100644 --- a/TargetLibraries/Snitch/src/RQMatMul_s8.c +++ b/TargetLibraries/Snitch/src/RQMatMul_s8.c @@ -28,10 +28,14 @@ */ #include "DeeploySnitchMath.h" -void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t output_offset, int8_t output_min, int8_t output_max) { +void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, + int32_t B_offset, int32_t output_offset, + int8_t output_min, int8_t output_max) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -54,7 +58,8 @@ void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const for (uint32_t j = c_start; j < c_end; ++j) { int32_t sum = 0; for (uint32_t k = 0; k < N; ++k) { - sum += (int32_t)(pSrcA[i * N + k] + A_offset) * (pSrcB[k * P + j] + B_offset); + sum += (int32_t)(pSrcA[i * N + k] + A_offset) * + (pSrcB[k * P + j] + B_offset); } // Requantize value sum = sum * _mul + rqs_bias + _add; @@ -64,10 +69,11 @@ void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const } } -void RQMatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t *mul, int32_t *add, int32_t log2D, bool rounding, - bool per_row_quant) { +void RQMatMul_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -133,11 +139,12 @@ void RQMatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, } } -void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t *mul, int32_t *add, int32_t log2D, bool rounding, - bool per_row_quant, int32_t A_offset, int32_t B_offset, - int32_t output_offset) { +void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, + int32_t output_offset) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/UniformRequantShift.c b/TargetLibraries/Snitch/src/UniformRequantShift.c index 3cb4703..a45a0b7 100644 --- 
a/TargetLibraries/Snitch/src/UniformRequantShift.c +++ b/TargetLibraries/Snitch/src/UniformRequantShift.c @@ -29,9 +29,12 @@ #include "DeeploySnitchMath.h" -void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, int32_t log2D, - int32_t __attribute__((unused)) HW, int32_t input_offset, int32_t output_offset, - int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -57,7 +60,8 @@ void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32 // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -66,7 +70,8 @@ void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32 // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -78,20 +83,25 @@ void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32 reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } } -void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, - int32_t log2D, int32_t __attribute__((unused)) HW, int32_t input_offset, - int32_t output_offset, int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -117,7 +127,8 @@ void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int3 // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = 
(int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -126,7 +137,8 @@ void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int3 // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -138,20 +150,25 @@ void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int3 reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } } -void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, - int32_t log2D, int32_t __attribute__((unused)) HW, int32_t input_offset, - int32_t output_offset, int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -177,7 +194,8 @@ void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -186,7 +204,8 @@ void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -198,20 +217,25 @@ void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + 
output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } } -void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, - int32_t log2D, int32_t __attribute__((unused)) HW, int32_t input_offset, - int32_t output_offset, int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -237,7 +261,8 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -246,7 +271,8 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -258,12 +284,14 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } diff --git a/TargetLibraries/Snitch/src/iSoftmax.c b/TargetLibraries/Snitch/src/iSoftmax.c index 990393a..41194a9 100644 --- a/TargetLibraries/Snitch/src/iSoftmax.c +++ b/TargetLibraries/Snitch/src/iSoftmax.c @@ -29,8 +29,10 @@ #include "DeeploySnitchMath.h" -void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size, - uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2) { +void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2) { uint8_t z; int16_t xTilde, p; uint32_t y_sum; @@ -53,7 +55,8 @@ void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimB lastDimBuffer += lastDimLength * core_id; } - for (uint32_t i = offset; i < offset + (chunk * lastDimLength); i += lastDimLength) { + for (uint32_t i = offset; i < offset + (chunk * lastDimLength); + i += 
lastDimLength) { y_sum = 0; x_max = 0; for (uint32_t j = 0; j < lastDimLength; j++) { @@ -76,8 +79,10 @@ void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimB } } -void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size, - uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2) { +void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2) { uint8_t z; int16_t xTilde, p; uint32_t y_sum; @@ -100,7 +105,8 @@ void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t *lastDimBu lastDimBuffer += lastDimLength * core_id; } - for (uint32_t i = offset; i < offset + (chunk * lastDimLength); i += lastDimLength) { + for (uint32_t i = offset; i < offset + (chunk * lastDimLength); + i += lastDimLength) { y_sum = 0; x_max = -128;