From d66d7ae0fe9dd9c9a6d0806fb8ffb8c9286272ea Mon Sep 17 00:00:00 2001 From: tahaelbayad Date: Thu, 28 Nov 2024 18:42:11 +0100 Subject: [PATCH] linting --- Deeploy/Targets/Snitch/Deployer.py | 13 +-- Deeploy/Targets/Snitch/Platform.py | 27 ++---- TargetLibraries/Snitch/inc/CycleCounter.h | 3 +- TargetLibraries/Snitch/inc/kernel/Gemm.h | 27 +++--- TargetLibraries/Snitch/inc/kernel/MatMul.h | 64 +++++++++----- TargetLibraries/Snitch/inc/kernel/RQGemm.h | 65 +++++++------- TargetLibraries/Snitch/inc/kernel/RQMatMul.h | 80 +++++++++++------- .../Snitch/inc/kernel/UniformRequantShift.h | 30 ++++--- TargetLibraries/Snitch/inc/kernel/iSoftmax.h | 12 ++- TargetLibraries/Snitch/inc/macros.h | 2 +- TargetLibraries/Snitch/src/MatMul_s16.c | 7 +- TargetLibraries/Snitch/src/MatMul_s32.c | 7 +- TargetLibraries/Snitch/src/MatMul_s8.c | 23 +++-- TargetLibraries/Snitch/src/RQGemm_s8.c | 38 +++++---- TargetLibraries/Snitch/src/RQMatMul_s8.c | 35 ++++---- .../Snitch/src/UniformRequantShift.c | 84 ++++++++++++------- TargetLibraries/Snitch/src/iSoftmax.c | 18 ++-- 17 files changed, 320 insertions(+), 215 deletions(-) diff --git a/Deeploy/Targets/Snitch/Deployer.py b/Deeploy/Targets/Snitch/Deployer.py index 778f468b..ff320669 100644 --- a/Deeploy/Targets/Snitch/Deployer.py +++ b/Deeploy/Targets/Snitch/Deployer.py @@ -29,20 +29,13 @@ import onnx_graphsurgeon as gs from Deeploy.AbstractDataTypes import Pointer -from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer - -# from Deeploy.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer - -# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import ReshapeConstOptPass, \ -# TransposeConstOptPass, TransposeMergePass, TransposeSplitPass +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeSplitPass -# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import NCHWtoNHWCPass, \ -# RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass -from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass class SnitchDeployer(SignPropDeployer): diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index e22a03d9..50c1747a 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -28,34 +28,17 @@ import numpy as np -# from Deeploy.Bindings.BasicBindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \ -# BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding - -from Deeploy.Targets.Generic.Bindings import BasicGatherBindings,BasicMatMulBinding, BasicPad1DBindings, \ - BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding - from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer - -# from Deeploy.Layers.BasicLayers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer +from 
Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \ + BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding from Deeploy.Targets.Generic.Layers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer - -# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import IntegerDivRequantMergePass, \ -# MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \ -# SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass - -from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \ - MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \ - SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass - -# from Deeploy.Parsers.BasicParsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \ -# UnsqueezeParser from Deeploy.Targets.Generic.Parsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \ UnsqueezeParser - -# from Deeploy.Templates.BasicTemplates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate - +from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \ + MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \ + SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings) diff --git a/TargetLibraries/Snitch/inc/CycleCounter.h b/TargetLibraries/Snitch/inc/CycleCounter.h index 092e5833..f1970557 100644 --- a/TargetLibraries/Snitch/inc/CycleCounter.h +++ b/TargetLibraries/Snitch/inc/CycleCounter.h @@ -44,7 +44,8 @@ void StopTimer(void); // Returns the current number of cycles according to the internal cycle counter uint32_t getCycles(void); -// Returns the current number of instructions according to the internal instructions counter +// Returns the current number of instructions according to the internal +// instructions counter uint32_t getInstr(void); #endif //__DEEPLOY_MATH_CYCLE_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Gemm.h b/TargetLibraries/Snitch/inc/kernel/Gemm.h index 70835c3a..f3209429 100644 --- a/TargetLibraries/Snitch/inc/kernel/Gemm.h +++ b/TargetLibraries/Snitch/inc/kernel/Gemm.h @@ -57,19 +57,26 @@ * simd = no * cleanup = yes */ -void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, - uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset); +void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, + int32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, + int32_t beta, int32_t transA, int32_t transB, + int32_t A_offset, int32_t B_offset, + int32_t C_offset, int32_t Y_offset); // Mapper Functions static inline void __attribute__((always_inline)) -Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC, - int32_t *__restrict__ pDstY, uint32_t 
M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, - int32_t transA, int32_t transB, int32_t A_offset, int32_t B_offset, int32_t C_offset, - int32_t Y_offset) { - Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, A_offset, B_offset, - C_offset, Y_offset); +Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t P, int32_t alpha, + int32_t beta, int32_t transA, int32_t transB, int32_t A_offset, + int32_t B_offset, int32_t C_offset, int32_t Y_offset) { + Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, + transA, transB, A_offset, B_offset, C_offset, + Y_offset); } #endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/MatMul.h b/TargetLibraries/Snitch/inc/kernel/MatMul.h index 75ecb835..be0cdeac 100644 --- a/TargetLibraries/Snitch/inc/kernel/MatMul.h +++ b/TargetLibraries/Snitch/inc/kernel/MatMul.h @@ -58,8 +58,10 @@ * simd = no * cleanup = yes */ -void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, +void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset); /* @@ -71,8 +73,10 @@ void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ * simd = no * cleanup = no */ -void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P); +void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, uint32_t P); /* * Matrix multiplication ---------------------------------- @@ -83,29 +87,39 @@ void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, in * simd = no * cleanup = no */ -void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t A_offset, int32_t B_offset, int32_t output_offset); +void MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t B_offset, int32_t output_offset); // Mapper Functions static inline void __attribute__((always_inline)) -MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t *__restrict__ pDstC, - uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset) { - MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset); +MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t A_offset, int32_t B_offset, + int32_t output_offset) { + MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, + output_offset); } -static inline void __attribute__((always_inline)) MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, - int8_t const *__restrict__ 
pSrcB, - int32_t *__restrict__ pDstC, - uint32_t M, uint32_t N, uint32_t P) { +static inline void __attribute__((always_inline)) +MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P) { MatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P); } static inline void __attribute__((always_inline)) -MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t A_offset, int32_t B_offset, int32_t output_offset) { - MatMul_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset); +MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, + int32_t B_offset, + int32_t output_offset) { + MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset); } /******************************************************************************/ @@ -121,8 +135,11 @@ MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t * simd = no * cleanup = no */ -void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P); +void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, + int16_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P); /******************************************************************************/ /* Matrix Multiplication (32bit) */ @@ -139,7 +156,10 @@ void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, * other = loads/stores explicitly written in asm * for optimal register utilization */ -void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, int32_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P); +void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, + int32_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P); #endif //__DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/RQGemm.h b/TargetLibraries/Snitch/inc/kernel/RQGemm.h index ae5e3663..e86b579b 100644 --- a/TargetLibraries/Snitch/inc/kernel/RQGemm.h +++ b/TargetLibraries/Snitch/inc/kernel/RQGemm.h @@ -57,12 +57,13 @@ * simd = no * cleanup = yes */ -void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, - uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset, int8_t output_min, - int8_t output_max); +void RQGemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t 
A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max); /* * General Requantized Matrix multiplication ---------------------------------- @@ -73,33 +74,41 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ * simd = no * cleanup = no */ -void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, - uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, - int32_t transA, int32_t transB, int32_t *mul, int32_t *add, - int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset); +void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset); // Mapper Functions -static inline void __attribute__((always_inline)) -RQGemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, - int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul, int32_t *add, - int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, int32_t B_offset, - int32_t C_offset, int32_t Y_offset, int8_t output_min, int8_t output_max) { - RQGemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, add, log2D, rounding, - per_row_quant, A_offset, B_offset, C_offset, Y_offset, output_min, output_max); +static inline void __attribute__((always_inline)) RQGemm_parallel_s8( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max) { + RQGemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, + transA, transB, mul, add, log2D, rounding, + per_row_quant, A_offset, B_offset, C_offset, + Y_offset, output_min, output_max); } // Mapper Functions -static inline void __attribute__((always_inline)) RQGemm_offset_unrolled_2x2_parallel_s8( - int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC, - int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, - int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset) { - RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, - add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset, - Y_offset); +static inline void __attribute__((always_inline)) +RQGemm_offset_unrolled_2x2_parallel_s8( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ 
pDstY, uint32_t M,
+    uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA,
+    int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
+    bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset,
+    int32_t Y_offset) {
+  RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(
+      pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul,
+      add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset,
+      Y_offset);
 }
 
 #endif //__DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/RQMatMul.h b/TargetLibraries/Snitch/inc/kernel/RQMatMul.h
index 72eec46f..fdac5124 100644
--- a/TargetLibraries/Snitch/inc/kernel/RQMatMul.h
+++ b/TargetLibraries/Snitch/inc/kernel/RQMatMul.h
@@ -32,8 +32,9 @@
 
 #include "DeeploySnitchMath.h"
 
-/* This library implements the requantiyed matrix multiplication for several data widths
- * in multiple different ways. The functions all follow the following format:
+/* This library implements the requantized matrix multiplication for several
+ * data widths in multiple different ways. The functions all follow the
+ * following format:
  *
  * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
  * C = AB
@@ -56,10 +57,14 @@
  * simd = no
  * cleanup = yes
  */
-void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                 int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul,
-                                 int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset,
-                                 int32_t B_offset, int32_t output_offset, int8_t output_min, int8_t output_max);
+void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
+                                 int8_t const *__restrict__ pSrcB,
+                                 int8_t *__restrict__ pDstC, uint32_t M,
+                                 uint32_t N, uint32_t P, int32_t *mul,
+                                 int32_t *add, int32_t log2D, bool rounding,
+                                 bool per_row_quant, int32_t A_offset,
+                                 int32_t B_offset, int32_t output_offset,
+                                 int8_t output_min, int8_t output_max);
 
 /*
  * Matrix multiplication ----------------------------------
@@ -70,10 +75,11 @@
  * simd = no
  * cleanup = no
  */
-void RQMatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                              int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
-                                              int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
-                                              bool per_row_quant);
+void RQMatMul_unrolled_2x2_parallel_s8_rv32im(
+    int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
+    int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
+    int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
+    bool per_row_quant);
 
 /*
  * Matrix multiplication ----------------------------------
@@ -84,36 +90,50 @@
  * simd = no
  * cleanup = no
  */
-void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
-                                                     int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
-                                                     int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
-                                                     bool per_row_quant, int32_t A_offset, int32_t B_offset,
-                                                     int32_t output_offset);
+void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(
+    int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
+    int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
+    int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
+    bool per_row_quant, int32_t A_offset, int32_t B_offset,
+    int32_t 
output_offset); // Mapper Functions static inline void __attribute__((always_inline)) -RQMatMul_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int8_t *__restrict__ pDstC, - uint32_t M, uint32_t N, uint32_t P, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, - bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t output_offset, int8_t output_min, +RQMatMul_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t *mul, int32_t *add, int32_t log2D, + bool rounding, bool per_row_quant, int32_t A_offset, + int32_t B_offset, int32_t output_offset, int8_t output_min, int8_t output_max) { - RQMatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant, A_offset, - B_offset, output_offset, output_min, output_max); + RQMatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, + rounding, per_row_quant, A_offset, B_offset, + output_offset, output_min, output_max); } static inline void __attribute__((always_inline)) -RQMatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant) { - RQMatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant); +RQMatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant) { + RQMatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, + add, log2D, rounding, per_row_quant); } static inline void __attribute__((always_inline)) -RQMatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, - int32_t A_offset, int32_t B_offset, int32_t output_offset) { - RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, - per_row_quant, A_offset, B_offset, output_offset); +RQMatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, + bool rounding, bool per_row_quant, + int32_t A_offset, int32_t B_offset, + int32_t output_offset) { + RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im( + pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant, + A_offset, B_offset, output_offset); } #endif //__DEEPLOY_MATH_RQMATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h b/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h index b30c35db..79887d23 100644 --- a/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h +++ b/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h @@ -29,18 +29,26 @@ #include "DeeploySnitchMath.h" -void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, int32_t log2D, - int32_t HW, int32_t input_offset, int32_t output_offset, int8_t output_min, +void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, + 
int32_t add, int8_t *data_out, int32_t log2D,
+                               int32_t HW, int32_t input_offset,
+                               int32_t output_offset, int8_t output_min,
                                int8_t output_max, bool rounding);
 
-void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out,
-                               int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset,
-                               int8_t output_min, int8_t output_max, bool rounding);
+void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul,
+                               int32_t add, int8_t *data_out, int32_t log2D,
+                               int32_t HW, int32_t input_offset,
+                               int32_t output_offset, int8_t output_min,
+                               int8_t output_max, bool rounding);
 
-void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out,
-                                int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset,
-                                int8_t output_min, int8_t output_max, bool rounding);
+void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul,
+                                int32_t add, int8_t *data_out, int32_t log2D,
+                                int32_t HW, int32_t input_offset,
+                                int32_t output_offset, int8_t output_min,
+                                int8_t output_max, bool rounding);
 
-void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out,
-                                int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset,
-                                int8_t output_min, int8_t output_max, bool rounding);
\ No newline at end of file
+void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul,
+                                int32_t add, int8_t *data_out, int32_t log2D,
+                                int32_t HW, int32_t input_offset,
+                                int32_t output_offset, int8_t output_min,
+                                int8_t output_max, bool rounding);
\ No newline at end of file
diff --git a/TargetLibraries/Snitch/inc/kernel/iSoftmax.h b/TargetLibraries/Snitch/inc/kernel/iSoftmax.h
index 603248aa..a59c54e9 100644
--- a/TargetLibraries/Snitch/inc/kernel/iSoftmax.h
+++ b/TargetLibraries/Snitch/inc/kernel/iSoftmax.h
@@ -29,7 +29,11 @@
 
 #include "DeeploySnitchMath.h"
 
-void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size,
-                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2);
-void StnichSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size,
-                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2);
+void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out,
+                         uint32_t *lastDimBuffer, uint32_t size,
+                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC,
+                         int32_t log2);
+void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out,
+                         uint32_t *lastDimBuffer, uint32_t size,
+                         uint32_t lastDimLength, int32_t coeffB, int32_t coeffC,
+                         int32_t log2);
diff --git a/TargetLibraries/Snitch/inc/macros.h b/TargetLibraries/Snitch/inc/macros.h
index aa335cf2..44c708ed 100644
--- a/TargetLibraries/Snitch/inc/macros.h
+++ b/TargetLibraries/Snitch/inc/macros.h
@@ -29,7 +29,7 @@
 #ifndef __DEEPLOY_MATH_MACROS_HEADER_
 #define __DEEPLOY_MATH_MACROS_HEADER_
 
-// #define log2(x) __builtin_pulp_fl1(x)
+// #define log2(x) __builtin_pulp_fl1(x)
 #define INT_LOG2(x) __builtin_ctz(x)
 
 #endif //__DEEPLOY_MATH_MACROS_HEADER_
diff --git a/TargetLibraries/Snitch/src/MatMul_s16.c b/TargetLibraries/Snitch/src/MatMul_s16.c
index 40b9ea2d..f4c56ba1 100644
--- a/TargetLibraries/Snitch/src/MatMul_s16.c
+++ b/TargetLibraries/Snitch/src/MatMul_s16.c
@@ -31,8 +31,11 @@
 
 #include "DeeploySnitchMath.h"
 
-void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB,
-                                             int32_t *__restrict__ pDstC, uint32_t M, uint32_t 
N, uint32_t P) { +void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, + int16_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/MatMul_s32.c b/TargetLibraries/Snitch/src/MatMul_s32.c index fe5cbe38..4ba81a74 100644 --- a/TargetLibraries/Snitch/src/MatMul_s32.c +++ b/TargetLibraries/Snitch/src/MatMul_s32.c @@ -31,8 +31,11 @@ #include "DeeploySnitchMath.h" -void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, int32_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P) { +void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, + int32_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/MatMul_s8.c b/TargetLibraries/Snitch/src/MatMul_s8.c index 39930303..e9ad77f3 100644 --- a/TargetLibraries/Snitch/src/MatMul_s8.c +++ b/TargetLibraries/Snitch/src/MatMul_s8.c @@ -30,8 +30,10 @@ */ #include "DeeploySnitchMath.h" -void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, +void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset) { uint32_t core_id = snrt_global_compute_core_idx(); @@ -46,15 +48,19 @@ void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t j = c_start; j < c_end; ++j) { int32_t sum = 0; for (uint32_t k = 0; k < N; ++k) { - sum += (int32_t)(pSrcA[i * N + k] + A_offset) * (pSrcB[k * P + j] + B_offset); + sum += (int32_t)(pSrcA[i * N + k] + A_offset) * + (pSrcB[k * P + j] + B_offset); } pDstC[i * P + j] = sum + output_offset; } } } -void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P) { +void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -96,9 +102,10 @@ void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, in } } -void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t A_offset, int32_t B_offset, int32_t output_offset) { +void MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t B_offset, int32_t output_offset) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/RQGemm_s8.c b/TargetLibraries/Snitch/src/RQGemm_s8.c index 5f939dfa..1850bf08 100644 --- 
a/TargetLibraries/Snitch/src/RQGemm_s8.c +++ b/TargetLibraries/Snitch/src/RQGemm_s8.c @@ -28,12 +28,13 @@ */ #include "DeeploySnitchMath.h" -void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, - uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset, int8_t output_min, - int8_t output_max) { +void RQGemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -58,7 +59,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[m * N + n] + A_offset) * (pSrcB[n * P + p] + B_offset); + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[n * P + p] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -76,7 +78,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[n * M + m] + A_offset) * (pSrcB[n * P + p] + B_offset); + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[n * P + p] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -94,7 +97,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[m * N + n] + A_offset) * (pSrcB[p * N + n] + B_offset); + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[p * N + n] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -112,7 +116,8 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ for (uint32_t p = c_start; p < c_end; ++p) { int32_t sum = 0; for (uint32_t n = 0; n < N; ++n) { - sum += (int32_t)(pSrcA[n * M + m] + A_offset) * (pSrcB[p * N + n] + B_offset); + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[p * N + n] + B_offset); } // Requantize value sum = alpha * sum + beta * pSrcC[m * P + p] + bias; @@ -124,12 +129,13 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_ } } -void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, - uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, - int32_t transA, int32_t transB, int32_t *mul, int32_t *add, - int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t C_offset, int32_t Y_offset) { +void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t 
const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/RQMatMul_s8.c b/TargetLibraries/Snitch/src/RQMatMul_s8.c index 229a75a5..08314150 100644 --- a/TargetLibraries/Snitch/src/RQMatMul_s8.c +++ b/TargetLibraries/Snitch/src/RQMatMul_s8.c @@ -28,10 +28,14 @@ */ #include "DeeploySnitchMath.h" -void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t *mul, - int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, - int32_t B_offset, int32_t output_offset, int8_t output_min, int8_t output_max) { +void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, + int32_t B_offset, int32_t output_offset, + int8_t output_min, int8_t output_max) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -54,7 +58,8 @@ void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const for (uint32_t j = c_start; j < c_end; ++j) { int32_t sum = 0; for (uint32_t k = 0; k < N; ++k) { - sum += (int32_t)(pSrcA[i * N + k] + A_offset) * (pSrcB[k * P + j] + B_offset); + sum += (int32_t)(pSrcA[i * N + k] + A_offset) * + (pSrcB[k * P + j] + B_offset); } // Requantize value sum = sum * _mul + rqs_bias + _add; @@ -64,10 +69,11 @@ void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const } } -void RQMatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t *mul, int32_t *add, int32_t log2D, bool rounding, - bool per_row_quant) { +void RQMatMul_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -133,11 +139,12 @@ void RQMatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, } } -void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, - int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, - int32_t *mul, int32_t *add, int32_t log2D, bool rounding, - bool per_row_quant, int32_t A_offset, int32_t B_offset, - int32_t output_offset) { +void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, + int32_t output_offset) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = 
snrt_global_compute_core_num(); diff --git a/TargetLibraries/Snitch/src/UniformRequantShift.c b/TargetLibraries/Snitch/src/UniformRequantShift.c index 3cb47035..a45a0b7e 100644 --- a/TargetLibraries/Snitch/src/UniformRequantShift.c +++ b/TargetLibraries/Snitch/src/UniformRequantShift.c @@ -29,9 +29,12 @@ #include "DeeploySnitchMath.h" -void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, int32_t log2D, - int32_t __attribute__((unused)) HW, int32_t input_offset, int32_t output_offset, - int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -57,7 +60,8 @@ void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32 // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -66,7 +70,8 @@ void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32 // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -78,20 +83,25 @@ void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int32 reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } } -void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, - int32_t log2D, int32_t __attribute__((unused)) HW, int32_t input_offset, - int32_t output_offset, int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -117,7 +127,8 @@ void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int3 // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 
<< (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -126,7 +137,8 @@ void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int3 // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -138,20 +150,25 @@ void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int3 reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } } -void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, - int32_t log2D, int32_t __attribute__((unused)) HW, int32_t input_offset, - int32_t output_offset, int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -177,7 +194,8 @@ void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -186,7 +204,8 @@ void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -198,20 +217,25 @@ void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); 
data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } } -void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, - int32_t log2D, int32_t __attribute__((unused)) HW, int32_t input_offset, - int32_t output_offset, int8_t output_min, int8_t output_max, bool rounding) { +void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t __attribute__((unused)) HW, + int32_t input_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + bool rounding) { uint32_t core_id = snrt_global_compute_core_idx(); uint32_t numThreads = snrt_global_compute_core_num(); @@ -237,7 +261,8 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int // Compute i intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[i] = out; @@ -246,7 +271,8 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int // Compute step halfChunkSize + i intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[halfChunkSize + i] = out; } @@ -258,12 +284,14 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int reg_data_in_A = data_in[chunk_stop]; intermediate = (reg_data_in_B + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop - 1] = out; intermediate = (reg_data_in_A + input_offset) * mul + add; - intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + output_offset; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; out = (int8_t)CLAMP(intermediate, output_min, output_max); data_out[chunk_stop] = out; } diff --git a/TargetLibraries/Snitch/src/iSoftmax.c b/TargetLibraries/Snitch/src/iSoftmax.c index 990393a3..41194a96 100644 --- a/TargetLibraries/Snitch/src/iSoftmax.c +++ b/TargetLibraries/Snitch/src/iSoftmax.c @@ -29,8 +29,10 @@ #include "DeeploySnitchMath.h" -void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size, - uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2) { +void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2) { uint8_t z; int16_t xTilde, p; uint32_t y_sum; @@ -53,7 +55,8 @@ void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimB lastDimBuffer += lastDimLength * 
core_id; } - for (uint32_t i = offset; i < offset + (chunk * lastDimLength); i += lastDimLength) { + for (uint32_t i = offset; i < offset + (chunk * lastDimLength); + i += lastDimLength) { y_sum = 0; x_max = 0; for (uint32_t j = 0; j < lastDimLength; j++) { @@ -76,8 +79,10 @@ void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimB } } -void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size, - uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2) { +void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2) { uint8_t z; int16_t xTilde, p; uint32_t y_sum; @@ -100,7 +105,8 @@ void SnitchSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t *lastDimBu lastDimBuffer += lastDimLength * core_id; } - for (uint32_t i = offset; i < offset + (chunk * lastDimLength); i += lastDimLength) { + for (uint32_t i = offset; i < offset + (chunk * lastDimLength); + i += lastDimLength) { y_sum = 0; x_max = -128;
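
For reference, every UniformRequantShift_*_s8 kernel reformatted above applies the same per-element arithmetic; the four variants differ only in the input type and in the Snitch-specific chunking around it. The sketch below is illustrative and not part of the patch: requant_shift_one is a hypothetical helper name, CLAMP is defined locally here (the kernels take it from the Deeploy math headers), log2D >= 1 is assumed, and the per-core index splitting and double-buffered loads are omitted.

#include <stdbool.h>
#include <stdint.h>

#define CLAMP(x, lo, hi) ((x) <= (lo) ? (lo) : ((x) >= (hi) ? (hi) : (x)))

// Scalar reference: requantize one widened input sample down to int8.
static inline int8_t requant_shift_one(int32_t x, int32_t mul, int32_t add,
                                       int32_t log2D, int32_t input_offset,
                                       int32_t output_offset,
                                       int8_t output_min, int8_t output_max,
                                       bool rounding) {
  // Affine step: fold in the input offset, scale, and add the bias.
  int32_t intermediate = (x + input_offset) * mul + add;
  // Divide by 2^log2D; the (1 << (log2D - 1)) half-ulp term turns the
  // truncating shift into round-to-nearest when `rounding` is set.
  intermediate =
      ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) +
      output_offset;
  // Saturate into the requested int8 output range.
  return (int8_t)CLAMP(intermediate, output_min, output_max);
}

With such a helper, UniformRequantShift_s32_s8 reduces to a loop that writes data_out[i] = requant_shift_one(data_in[i], ...); the kernels in this patch additionally unroll that loop two-wide, prefetch the next element into a register, and split the index range across the Snitch compute cores via snrt_global_compute_core_idx() / snrt_global_compute_core_num().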