Skip to content

Commit

Permalink
linting
Browse files Browse the repository at this point in the history
  • Loading branch information
tahaelbayad committed Nov 28, 2024
1 parent 7c5ffd5 commit d66d7ae
Show file tree
Hide file tree
Showing 17 changed files with 320 additions and 215 deletions.
13 changes: 3 additions & 10 deletions Deeploy/Targets/Snitch/Deployer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,13 @@
import onnx_graphsurgeon as gs

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer

# from Deeploy.NetworkDeployers.SignPropDeployer import SignPropDeployer
from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer

# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import ReshapeConstOptPass, \
# TransposeConstOptPass, TransposeMergePass, TransposeSplitPass
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \
TransposeMergePass, TransposeSplitPass

# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import NCHWtoNHWCPass, \
# RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass

class SnitchDeployer(SignPropDeployer):

Expand Down
27 changes: 5 additions & 22 deletions Deeploy/Targets/Snitch/Platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,34 +28,17 @@

import numpy as np

# from Deeploy.Bindings.BasicBindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \
# BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding

from Deeploy.Targets.Generic.Bindings import BasicGatherBindings,BasicMatMulBinding, BasicPad1DBindings, \
BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer

# from Deeploy.Layers.BasicLayers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer
from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \
BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding
from Deeploy.Targets.Generic.Layers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer

# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import IntegerDivRequantMergePass, \
# MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
# SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass

from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass

# from Deeploy.Parsers.BasicParsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \
# UnsqueezeParser
from Deeploy.Targets.Generic.Parsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \
UnsqueezeParser

# from Deeploy.Templates.BasicTemplates import AllocateTemplate as BasicAllocateTemplate
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate

from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass
from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate

GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings)
Expand Down
3 changes: 2 additions & 1 deletion TargetLibraries/Snitch/inc/CycleCounter.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ void StopTimer(void);
// Returns the current number of cycles according to the internal cycle counter
uint32_t getCycles(void);

// Returns the current number of instructions according to the internal instructions counter
// Returns the current number of instructions according to the internal
// instruction counter
uint32_t getInstr(void);

#endif //__DEEPLOY_MATH_CYCLE_HEADER_
27 changes: 17 additions & 10 deletions TargetLibraries/Snitch/inc/kernel/Gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,26 @@
* simd = no
* cleanup = yes
*/
void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, uint32_t M, uint32_t N,
uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t A_offset,
int32_t B_offset, int32_t C_offset, int32_t Y_offset);
void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC,
int32_t *__restrict__ pDstY, uint32_t M,
uint32_t N, uint32_t P, int32_t alpha,
int32_t beta, int32_t transA, int32_t transB,
int32_t A_offset, int32_t B_offset,
int32_t C_offset, int32_t Y_offset);

// Mapper Functions
static inline void __attribute__((always_inline))
Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC,
int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta,
int32_t transA, int32_t transB, int32_t A_offset, int32_t B_offset, int32_t C_offset,
int32_t Y_offset) {
Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, A_offset, B_offset,
C_offset, Y_offset);
Gemm_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY,
uint32_t M, uint32_t N, uint32_t P, int32_t alpha,
int32_t beta, int32_t transA, int32_t transB, int32_t A_offset,
int32_t B_offset, int32_t C_offset, int32_t Y_offset) {
Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta,
transA, transB, A_offset, B_offset, C_offset,
Y_offset);
}

#endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_
64 changes: 42 additions & 22 deletions TargetLibraries/Snitch/inc/kernel/MatMul.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@
* simd = no
* cleanup = yes
*/
void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t A_offset,
void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M,
uint32_t N, uint32_t P, int32_t A_offset,
int32_t B_offset, int32_t output_offset);

/*
Expand All @@ -71,8 +73,10 @@ void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_
* simd = no
* cleanup = no
*/
void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N, uint32_t P);

/*
* Matrix multiplication ----------------------------------
Expand All @@ -83,29 +87,39 @@ void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, in
* simd = no
* cleanup = no
*/
void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
int32_t A_offset, int32_t B_offset, int32_t output_offset);
void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
int32_t A_offset, int32_t B_offset, int32_t output_offset);

// Mapper Functions
static inline void __attribute__((always_inline))
MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset) {
MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
MatMul_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N,
uint32_t P, int32_t A_offset, int32_t B_offset,
int32_t output_offset) {
MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset,
output_offset);
}

static inline void __attribute__((always_inline)) MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N, uint32_t P) {
static inline void __attribute__((always_inline))
MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M,
uint32_t N, uint32_t P) {
MatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P);
}

static inline void __attribute__((always_inline))
MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
int32_t A_offset, int32_t B_offset, int32_t output_offset) {
MatMul_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M,
uint32_t N, uint32_t P, int32_t A_offset,
int32_t B_offset,
int32_t output_offset) {
MatMul_offset_unrolled_2x2_parallel_s8_rv32im(
pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
}

/******************************************************************************/
Expand All @@ -121,8 +135,11 @@ MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t
* simd = no
* cleanup = no
*/
void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
int16_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N,
uint32_t P);

/******************************************************************************/
/* Matrix Multiplication (32bit) */
Expand All @@ -139,7 +156,10 @@ void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
* other = loads/stores explicitly written in asm
* for optimal register utilization
*/
void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, int32_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA,
int32_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N,
uint32_t P);

#endif //__DEEPLOY_MATH_MATMUL_KERNEL_HEADER_
65 changes: 37 additions & 28 deletions TargetLibraries/Snitch/inc/kernel/RQGemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,13 @@
* simd = no
* cleanup = yes
*/
void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N,
uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul,
int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset,
int32_t B_offset, int32_t C_offset, int32_t Y_offset, int8_t output_min,
int8_t output_max);
void RQGemm_parallel_s8_rv32im(
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M,
uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA,
int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset,
int32_t Y_offset, int8_t output_min, int8_t output_max);

/*
* General Requantized Matrix multiplication ----------------------------------
Expand All @@ -73,33 +74,41 @@ void RQGemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_
* simd = no
* cleanup = no
*/
void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY,
uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta,
int32_t transA, int32_t transB, int32_t *mul, int32_t *add,
int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset,
int32_t B_offset, int32_t C_offset, int32_t Y_offset);
void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M,
uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA,
int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset,
int32_t Y_offset);

// Mapper Functions
static inline void __attribute__((always_inline))
RQGemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P,
int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t *mul, int32_t *add,
int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset, int32_t B_offset,
int32_t C_offset, int32_t Y_offset, int8_t output_min, int8_t output_max) {
RQGemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, add, log2D, rounding,
per_row_quant, A_offset, B_offset, C_offset, Y_offset, output_min, output_max);
static inline void __attribute__((always_inline)) RQGemm_parallel_s8(
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M,
uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA,
int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset,
int32_t Y_offset, int8_t output_min, int8_t output_max) {
RQGemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta,
transA, transB, mul, add, log2D, rounding,
per_row_quant, A_offset, B_offset, C_offset,
Y_offset, output_min, output_max);
}

// Mapper Functions
static inline void __attribute__((always_inline)) RQGemm_offset_unrolled_2x2_parallel_s8(
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC,
int8_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA,
int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, bool per_row_quant, int32_t A_offset,
int32_t B_offset, int32_t C_offset, int32_t Y_offset) {
RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul,
add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset,
Y_offset);
static inline void __attribute__((always_inline))
RQGemm_offset_unrolled_2x2_parallel_s8(
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M,
uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA,
int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding,
bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset,
int32_t Y_offset) {
RQGemm_offset_unrolled_2x2_parallel_s8_rv32im(
pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul,
add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset,
Y_offset);
}

#endif //__DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_
Loading

0 comments on commit d66d7ae

Please sign in to comment.