Skip to content

Commit

Permalink
linting
Browse files: browse the repository at this point in the history
linting
  • Loading branch information
tahaelbayad committed Nov 28, 2024
1 parent 1bb7d83 commit da662e1
Show file tree
Hide file tree
Showing 19 changed files with 330 additions and 226 deletions.
13 changes: 3 additions & 10 deletions Deeploy/Targets/Snitch/Deployer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,13 @@
import onnx_graphsurgeon as gs

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer

# from Deeploy.NetworkDeployers.SignPropDeployer import SignPropDeployer
from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer

# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import ReshapeConstOptPass, \
# TransposeConstOptPass, TransposeMergePass, TransposeSplitPass
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \
TransposeMergePass, TransposeSplitPass

# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import NCHWtoNHWCPass, \
# RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass

class SnitchDeployer(SignPropDeployer):

Expand Down
27 changes: 5 additions & 22 deletions Deeploy/Targets/Snitch/Platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,34 +28,17 @@

import numpy as np

# from Deeploy.Bindings.BasicBindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \
# BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding

from Deeploy.Targets.Generic.Bindings import BasicGatherBindings,BasicMatMulBinding, BasicPad1DBindings, \
BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer

# from Deeploy.Layers.BasicLayers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer
from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicMatMulBinding, BasicPad1DBindings, \
BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding
from Deeploy.Targets.Generic.Layers import GatherLayer, MatMulLayer, PadLayer, ReshapeLayer, RQIntegerDivLayer

# from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import IntegerDivRequantMergePass, \
# MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
# SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass

from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass

# from Deeploy.Parsers.BasicParsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \
# UnsqueezeParser
from Deeploy.Targets.Generic.Parsers import GatherParser, MatMulParser, Pad1DParser, Pad2DParser, RQIntegerDivParser, \
UnsqueezeParser

# from Deeploy.Templates.BasicTemplates import AllocateTemplate as BasicAllocateTemplate
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate

from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass
from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate

GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings)
Expand Down
17 changes: 10 additions & 7 deletions DeeployTest/Platforms/Snitch/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,10 @@ int main(void) {

if (snrt_is_dm_core()) {
#ifndef CI
printf("Network running on %d of %d compute cores (+%d DM cores) on %d clusters\r\n", num_compute_cores,
snrt_global_compute_core_num(), snrt_cluster_num() * snrt_cluster_dm_core_num(), snrt_cluster_num());
printf("Network running on %d of %d compute cores (+%d DM cores) on %d "
"clusters\r\n",
num_compute_cores, snrt_global_compute_core_num(),
snrt_cluster_num() * snrt_cluster_dm_core_num(), snrt_cluster_num());
#endif

printf("Initializing...\r\n");
Expand All @@ -55,13 +57,13 @@ int main(void) {
#ifndef CI
for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
printf("testInputVector%d @ %p\r\n", buf, testInputVector[buf]);
printf("DeeployNetwork_input_%d @ %p and %u elements\r\n", buf, DeeployNetwork_inputs[buf],
DeeployNetwork_inputs_bytes[buf]);
printf("DeeployNetwork_input_%d @ %p and %u elements\r\n", buf,
DeeployNetwork_inputs[buf], DeeployNetwork_inputs_bytes[buf]);
}
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
printf("testInputVector%d @ %p\r\n", buf, testOutputVector[buf]);
printf("DeeployNetwork_output_%d @ %p and %u elements\r\n", buf, DeeployNetwork_outputs[buf],
DeeployNetwork_outputs_bytes[buf]);
printf("DeeployNetwork_output_%d @ %p and %u elements\r\n", buf,
DeeployNetwork_outputs[buf], DeeployNetwork_outputs_bytes[buf]);
}

printf("Initialized\r\n");
Expand All @@ -71,7 +73,8 @@ int main(void) {

// WIESEP: Copy inputs to allocated memory
for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
snrt_dma_start_1d(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]);
snrt_dma_start_1d(DeeployNetwork_inputs[buf], testInputVector[buf],
DeeployNetwork_inputs_bytes[buf]);
}
snrt_dma_wait_all();

Expand Down
4 changes: 0 additions & 4 deletions DeeployTest/testUtils/platformMapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
NeurekaPlatform
from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform

from Deeploy.Targets.Snitch.Deployer import SnitchDeployer
from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform

Expand Down Expand Up @@ -82,7 +81,6 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]:
elif platformName == "Snitch":
Platform = SnitchPlatform()


else:
raise RuntimeError(f"Deployment platform {platformName} is not implemented")

Expand Down Expand Up @@ -211,7 +209,6 @@ def mapDeployer(platform: DeploymentPlatform,
default_channels_first = default_channels_first,
deeployStateDir = deeployStateDir)


elif isinstance(platform, (SnitchPlatform)):
if loweringOptimizer is None:
loweringOptimizer = SnitchOptimizer
Expand All @@ -228,7 +225,6 @@ def mapDeployer(platform: DeploymentPlatform,
default_channels_first = default_channels_first,
deeployStateDir = deeployStateDir)


else:
raise RuntimeError(f"Deployer for platform {platform} is not implemented")

Expand Down
3 changes: 2 additions & 1 deletion TargetLibraries/Snitch/inc/CycleCounter.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ void StopTimer(void);
// Returns the current number of cycles according to the internal cycle counter
uint32_t getCycles(void);

// Returns the current number of instructions according to the internal instructions counter
// Returns the current number of instructions according to the internal
// instruction counter
uint32_t getInstr(void);

#endif //__DEEPLOY_MATH_CYCLE_HEADER_
27 changes: 17 additions & 10 deletions TargetLibraries/Snitch/inc/kernel/Gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,26 @@
* simd = no
* cleanup = yes
*/
void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, uint32_t M, uint32_t N,
uint32_t P, int32_t alpha, int32_t beta, int32_t transA, int32_t transB, int32_t A_offset,
int32_t B_offset, int32_t C_offset, int32_t Y_offset);
void Gemm_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC,
int32_t *__restrict__ pDstY, uint32_t M,
uint32_t N, uint32_t P, int32_t alpha,
int32_t beta, int32_t transA, int32_t transB,
int32_t A_offset, int32_t B_offset,
int32_t C_offset, int32_t Y_offset);

// Mapper Functions
static inline void __attribute__((always_inline))
Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t const *__restrict__ pSrcC,
int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t P, int32_t alpha, int32_t beta,
int32_t transA, int32_t transB, int32_t A_offset, int32_t B_offset, int32_t C_offset,
int32_t Y_offset) {
Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, A_offset, B_offset,
C_offset, Y_offset);
Gemm_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY,
uint32_t M, uint32_t N, uint32_t P, int32_t alpha,
int32_t beta, int32_t transA, int32_t transB, int32_t A_offset,
int32_t B_offset, int32_t C_offset, int32_t Y_offset) {
Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta,
transA, transB, A_offset, B_offset, C_offset,
Y_offset);
}

#endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_
64 changes: 42 additions & 22 deletions TargetLibraries/Snitch/inc/kernel/MatMul.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@
* simd = no
* cleanup = yes
*/
void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, int32_t A_offset,
void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M,
uint32_t N, uint32_t P, int32_t A_offset,
int32_t B_offset, int32_t output_offset);

/*
Expand All @@ -71,8 +73,10 @@ void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *_
* simd = no
* cleanup = no
*/
void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N, uint32_t P);

/*
* Matrix multiplication ----------------------------------
Expand All @@ -83,29 +87,39 @@ void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, in
* simd = no
* cleanup = no
*/
void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
int32_t A_offset, int32_t B_offset, int32_t output_offset);
void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
int32_t A_offset, int32_t B_offset, int32_t output_offset);

// Mapper Functions
static inline void __attribute__((always_inline))
MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t output_offset) {
MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
MatMul_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N,
uint32_t P, int32_t A_offset, int32_t B_offset,
int32_t output_offset) {
MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset,
output_offset);
}

static inline void __attribute__((always_inline)) MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N, uint32_t P) {
static inline void __attribute__((always_inline))
MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M,
uint32_t N, uint32_t P) {
MatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P);
}

static inline void __attribute__((always_inline))
MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
int32_t A_offset, int32_t B_offset, int32_t output_offset) {
MatMul_offset_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA,
int8_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M,
uint32_t N, uint32_t P, int32_t A_offset,
int32_t B_offset,
int32_t output_offset) {
MatMul_offset_unrolled_2x2_parallel_s8_rv32im(
pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset);
}

/******************************************************************************/
Expand All @@ -121,8 +135,11 @@ MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, int8_t
* simd = no
* cleanup = no
*/
void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
int16_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N,
uint32_t P);

/******************************************************************************/
/* Matrix Multiplication (32bit) */
Expand All @@ -139,7 +156,10 @@ void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
* other = loads/stores explicitly written in asm
* for optimal register utilization
*/
void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, int32_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P);
void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA,
int32_t const *__restrict__ pSrcB,
int32_t *__restrict__ pDstC,
uint32_t M, uint32_t N,
uint32_t P);

#endif //__DEEPLOY_MATH_MATMUL_KERNEL_HEADER_
Loading

0 comments on commit da662e1

Please sign in to comment.