From d69bd408cd70e88446516fc5bd37c5824c2813fc Mon Sep 17 00:00:00 2001
From: haugoug
Date: Mon, 1 Jun 2020 21:15:39 +0200
Subject: [PATCH] 3.5 Release

---
 CHANGELOG | 19 +
 applications/CannyEdgeDetection/CannyDetect.c | 6 +-
 applications/CannyEdgeDetection/Makefile | 4 -
 applications/FaceDetection/FaceDetGenerator.c | 12 +-
 applications/FaceDetection/Makefile | 7 +-
 applications/FaceDetection/main.c | 14 +-
 applications/FaceDetection/testset.cfg | 4 +-
 applications/jpeg_encoder/Makefile | 3 +
 configs/common.sh | 7 +-
 examples/autotiler/Cifar10/Makefile | 24 +-
 examples/autotiler/IntegralImage/Makefile | 6 +-
 examples/autotiler/IntegralImage/main.c | 2 +-
 examples/autotiler/Mnist/Makefile | 21 +-
 examples/autotiler/MnistGraph/Makefile | 26 +-
 .../native/freeRTOS/periph/timer/test_timer.c | 4 +-
 examples/nntool/common/model_decl.mk | 80 +-
 examples/nntool/common/model_rules.mk | 48 +-
 examples/nntool/kws/Makefile | 14 +-
 examples/nntool/kws/emul.mk | 4 +-
 examples/nntool/kws/images/features_0_1.pgm | Bin 0 -> 7857 bytes
 examples/nntool/kws/images/features_1_3.pgm | Bin 0 -> 7857 bytes
 examples/nntool/kws/images/features_2_4.pgm | Bin 0 -> 7857 bytes
 examples/nntool/kws/images/features_3_4.pgm | Bin 0 -> 7857 bytes
 examples/nntool/kws/images/features_4_2.pgm | Bin 0 -> 7857 bytes
 examples/nntool/kws/kws.c | 44 +-
 examples/nntool/kws/max.log | 8 +
 examples/nntool/kws/model/nntool_script16 | 5 +-
 examples/nntool/kws/model/nntool_script8 | 5 +-
 .../nntool/kws/model/nntool_script_emul16 | 5 +-
 examples/nntool/kws/model_decl.mk | 75 +-
 examples/nntool/kws/model_rules.mk | 28 +-
 examples/nntool/mnist/Makefile | 66 +-
 examples/nntool/mnist/README.md | 21 +-
 examples/nntool/mnist/emul.mk | 44 +-
 examples/nntool/mnist/mnist.c | 112 +-
 examples/nntool/mnist/mnist.h | 7 -
 examples/nntool/mnist/mnist_emul.c | 125 +
 examples/nntool/mnist/model/mnist.h5 | Bin 0 -> 288232 bytes
 examples/nntool/mnist/model/mnist.tflite | Bin 0 -> 87456 bytes
 examples/nntool/mnist/model/nntool_script | 10 +
 examples/nntool/mnist/model/nntool_script16 | 10 +-
 .../nntool/mnist/model/nntool_script_emul | 11 +
 .../nntool/mnist/model/nntool_script_emul16 | 10 +-
 examples/nntool/mnist/model/train.py | 25 +-
 examples/nntool/mnist/train_model.mk | 21 +
 examples/nntool/visual_wake/Makefile | 24 +-
 examples/nntool/visual_wake/README.md | 6 +-
 examples/nntool/visual_wake/common.mk | 1 -
 examples/nntool/visual_wake/emul.mk | 31 +-
 .../nntool/visual_wake/model/nntool_script | 30 +-
 .../visual_wake/model/nntool_script_emul | 15 +
 .../model/visual_wake_quant.tflite | Bin 0 -> 309136 bytes
 examples/nntool/visual_wake/vww.c | 190 +-
 examples/nntool/visual_wake/vww.h | 1 -
 examples/nntool/visual_wake/vww_emul.c | 95 +
 examples/nntool/visual_wake/vww_emul.h | 18 +
 .../pmsis/test_periph/i2s/output/Makefile | 36 +
 examples/pmsis/test_periph/i2s/output/test.c | 120 +
 .../test_periph/test_camera_gc0308/Makefile | 3 +-
 .../test_periph/test_camera_gc0308/test.c | 148 +-
 .../pmsis/test_periph/test_camera_io/Makefile | 5 +-
 .../pmsis/test_periph/test_camera_io/test.c | 14 +-
 .../test_periph/test_camera_lcd/Makefile | 4 +-
 .../test_camera_lcd/test_camera_lcd.c | 4 +-
 .../test_periph/test_camera_ov5640/test.c | 45 +-
 gvsoc/gvsoc/bin/gvsoc_analyze_insn | 205 +
 gvsoc/gvsoc/dpi-wrapper/Makefile | 2 +-
 gvsoc/gvsoc/dpi-wrapper/src/dpi.cpp | 1 +
 gvsoc/gvsoc/engine/include/gv/gvsoc.h | 2 +-
 gvsoc/gvsoc/models/Makefile | 3 +-
 .../models/cpu/iss/include/isa_lib/int.h | 35 +
 .../models/cpu/iss/include/isa_lib/macros.h | 3 +
gvsoc/gvsoc/models/cpu/iss/include/iss.hpp | 1 + gvsoc/gvsoc/models/cpu/iss/include/regs.hpp | 22 + .../gvsoc/models/cpu/iss/include/rvXint64.hpp | 321 + gvsoc/gvsoc/models/cpu/iss/include/types.hpp | 6 + gvsoc/gvsoc/models/cpu/iss/include/utils.hpp | 15 + gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_gen.py | 8 + .../models/cpu/iss/isa_gen/isa_riscv_gen.py | 81 +- gvsoc/gvsoc/models/cpu/iss/src/trace.cpp | 26 +- gvsoc/gvsoc/models/devices/testbench/Makefile | 2 + .../models/devices/testbench/testbench.cpp | 303 + gvsoc/gvsoc/models/utils/dpi_chip_wrapper.cpp | 71 + libs/gap_lib/Makefile | 2 + libs/gap_lib/img_io/ImgIO.c | 865 +- libs/gap_lib/include/gaplib/ImgIO.h | 32 +- libs/gap_lib/include/gaplib/fs_switch.h | 80 + .../drivers/udma/i2s/i2s_internal.c | 2 +- .../include/pmsis/implem/drivers/perf/perf.h | 38 + .../pmsis_api/include/pmsis/drivers/uart.h | 9 +- .../pmsis_bsp/rules/freertos_bsp_rules.mk | 2 +- rtos/pulp/pulp-os/drivers/drivers.mk | 2 - rtos/pulp/pulp-os/drivers/gpio/gpio-v3.c | 177 + rtos/pulp/pulp-os/drivers/pwm/pwm-v1.c | 3 - rtos/pulp/pulp-os/libs/io/prf.c | 669 +- tools/autotiler_v3/Makefile | 2 +- .../autotiler_v3/generators/CNN/CNN_AT_Misc.c | 253 + .../generators/CNN/CNN_Activation_SQ8.c | 1208 +++ .../generators/CNN/CNN_BasicKernels.h | 121 +- .../generators/CNN/CNN_BasicKernels_SQ8.h | 745 ++ .../CNN/CNN_BiasReLULinear_BasicKernels.c | 1118 +-- .../generators/CNN/CNN_Bias_Linear_SQ8.c | 464 ++ .../generators/CNN/CNN_Conv_BasicKernels.c | 2 +- .../generators/CNN/CNN_Conv_DP_BasicKernels.c | 195 +- .../generators/CNN/CNN_Conv_DW_SQ8.c | 7038 +++++++++++++++++ .../generators/CNN/CNN_Conv_SQ8.c | 4443 +++++++++++ .../generators/CNN/CNN_Generator_Util.c | 204 + .../generators/CNN/CNN_Generator_Util.h | 46 + .../generators/CNN/CNN_Generators.c | 612 +- .../generators/CNN/CNN_Generators.h | 106 +- .../generators/CNN/CNN_Generators_SQ8.c | 3056 +++++++ .../generators/CNN/CNN_Generators_SQ8.h | 605 ++ .../generators/CNN/CNN_MatAlgebra.c | 5483 +++++-------- .../generators/CNN/CNN_MatAlgebra_SQ8.c | 3441 ++++++++ .../generators/CNN/CNN_Pooling_SQ8.c | 2281 ++++++ .../autotiler_v3/generators/CNN/CNN_SoftMax.c | 1 + .../generators/CNN/CNN_SoftMax_SQ8.c | 243 + .../generators/FFT2D/FFT2DGenerator.c | 10 +- .../generators/Fir/FirGenerator.c | 2 +- .../IntegralImg/IntegralImgGenerator.c | 4 +- .../generators/MatAdd/MatAddGenerator.c | 2 +- .../generators/MatMult/MatMultGenerator.c | 4 +- .../generators/Resize/ResizeGenerator.c | 2 +- tools/autotiler_v3/include/AutoTilerLib.h | 166 +- .../autotiler_v3/include/AutoTilerLibTypes.h | 126 +- tools/autotiler_v3/include/GapBuiltins.h | 2 + tools/autotiler_v3/include/at_api_emul.h | 4 +- tools/autotiler_v3/include/at_api_pmsis.h | 24 +- .../configs/chips/gap9_v2/gap9_v2.json | 2 +- .../configs/chips/gap9_v2/gap9_v2_rtl.json | 192 +- .../configs/devices/testbench.json | 117 + ...i5ky_v2_6_sfloat_single_regfile_int64.json | 4 + .../ri5ky_v2_sfloat_single_regfile_sec.json | 3 +- tools/gap8-openocd-tools/tcl/fuser.tcl | 61 + tools/gapy/runner/board/board_runner.py | 13 +- tools/nntool/.vscode/launch.json | 51 +- tools/nntool/Makefile | 2 +- tools/nntool/README.md | 88 +- tools/nntool/_version.py | 2 +- .../generators/nntool_extra_generators.c | 280 + .../generators/nntool_extra_generators.h | 24 + .../autotiler/kernels/nntool_extra_kernels.h | 74 + .../nntool/autotiler/kernels/norm_transpose.c | 204 + tools/nntool/autotiler/tests/Mk | 78 + tools/nntool/autotiler/tests/MkGap8 | 20 + tools/nntool/autotiler/tests/testModel.c | 48 + 
tools/nntool/autotiler/tests/testRun.c | 381 + tools/nntool/execution/execution_progress.py | 54 + tools/nntool/execution/graph_executer.py | 334 + tools/nntool/execution/quantization_mode.py | 32 +- .../generation/at_generators/__init__.py | 21 + .../at_generators/cnn_3d_tensor_permute.py | 47 + .../cnn_convolution_mulbias_pool_relu.py | 46 + .../cnn_convolution_pool_relu.py | 214 + .../at_generators/cnn_global_pool.py | 70 + ...n_grouped_convolution_mulbias_pool_relu.py | 48 + .../cnn_grouped_convolution_pool_relu.py | 126 + .../at_generators/cnn_linear_relu.py | 78 + .../generation/at_generators/cnn_matrix.py | 109 + .../generation/at_generators/cnn_pool_relu.py | 85 + .../generation/at_generators/cnn_softmax.py | 53 + .../nntool/generation/at_generators/utils.py | 36 + tools/nntool/generation/at_types/__init__.py | 0 tools/nntool/generation/at_types/at_params.py | 214 + .../generation/at_types/constant_info.py | 63 + tools/nntool/generation/at_types/gen_ctrl.py | 105 + .../nntool/generation/at_types/tc_arg_info.py | 148 + .../generation/at_types/tensor_stack.py | 38 + tools/nntool/generation/autotiler_options.py | 118 + tools/nntool/generation/bindings.py | 45 +- tools/nntool/generation/code_generator.py | 702 +- tools/nntool/generation/default_template.py | 18 +- .../nntool/generation/generators/__init__.py | 69 + .../generators/bindings/__init__.py | 0 .../generators/bindings/mult8/__init__.py | 0 .../bindings/mult8/conv_bindings_generator.py | 79 + .../bindings/mult8/fc_binding_generator.py | 57 + .../mult8/inout_bindings_generator.py | 40 + .../mult8/inout_infos_bindings_generator.py | 47 + .../mult8/mat_vect_mult_bindings_generator.py | 50 + .../mult8/matadd_bindings_generator.py | 52 + .../mult8/softmax_bindings_generator.py | 39 + .../generators/bindings/pow2/__init__.py | 0 .../bindings/pow2/conv_bindings_generator.py | 68 + .../bindings/pow2/fc_bindings_generator.py | 50 + .../bindings/pow2/inout_bindings_generator.py | 49 + .../pow2/matadd_bindings_generator.py | 35 + .../pow2/matscale_bindings_generator.py | 46 + .../pow2/softmax_bindings_generator.py | 35 + .../pow2/transpose_bindings_generator.py | 24 + .../generators/generator_decorators.py | 102 + .../generation/generators/globals/__init__.py | 0 .../globals/constant_input_generator.py | 41 + .../generators/globals/filter_generator.py | 93 + .../generators/globals/global_names.py | 21 + .../generators/globals/input_generator.py | 26 + .../globals/mult8_filter_generator.py | 94 + .../globals/mult8_infos_generator.py | 182 + .../generators/globals/output_generator.py | 27 + .../generation/generators/kernels/__init__.py | 0 .../generators/kernels/autotiler_kernel.py | 21 + .../generators/kernels/general/__init__.py | 0 .../general/imageformat_kernels_generator.py | 60 + .../generators/kernels/mult8/__init__.py | 0 .../mult8/conv_pool_relu_kernels_generator.py | 257 + .../mult8/global_pool_kernels_generator.py | 85 + .../mult8/linear_relu_kernels_generator.py | 121 + .../mult8/mat_vect_mult_kernels_generator.py | 86 + .../kernels/mult8/matadd_kernels_generator.py | 86 + .../mult8/pool_relu_kernels_generator.py | 93 + .../mult8/softmax_kernels_generator.py | 79 + .../three_d_transpose_kernels_generator.py | 152 + .../generators/kernels/pow2/__init__.py | 0 .../pow2/conv_pool_relu_kernels_generator.py | 214 + .../pow2/global_pool_kernels_generator.py | 66 + .../pow2/linear_relu_kernels_generator.py | 103 + .../kernels/pow2/matadd_kernels_generator.py | 147 + .../pow2/matscale_kernels_generator.py | 85 + 
.../pow2/pool_relu_kernels_generator.py | 128 + .../kernels/pow2/softmax_kernels_generator.py | 63 + .../three_d_transpose_kernels_generator.py | 144 + tools/nntool/generation/name_cache.py | 41 + tools/nntool/generation/naming_convension.py | 33 +- tools/nntool/generation/write_constants.py | 53 +- tools/nntool/graph/dim.py | 158 +- tools/nntool/graph/graph_identity.py | 19 + .../graph/manipulations/adjust_order.py | 71 +- .../manipulations/eliminate_transposes.py | 395 +- .../matches/equalize_sym_mult_concats.py | 76 + .../nntool/graph/matches/expand_transposes.py | 19 +- .../matches/find_asymmetric_quantization.py | 203 + tools/nntool/graph/matches/find_hsigmoid.py | 208 + .../matches/find_missing_quantization.py | 98 + tools/nntool/graph/matches/fuse_pad.py | 3 +- .../graph/matches/match_external_bias.py | 62 + tools/nntool/graph/matches/match_gap_conv.py | 35 +- .../nntool/graph/matches/match_gap_linear.py | 31 +- tools/nntool/graph/matches/match_gap_pool.py | 28 +- .../graph/matches/match_op_activation.py | 89 + tools/nntool/graph/matches/matches.py | 60 +- tools/nntool/graph/matches/move_activation.py | 59 +- .../propagate_softmax_sym_mult_qrec.py | 45 + tools/nntool/graph/matches/remove_noops.py | 33 + tools/nntool/graph/matches/remove_relus.py | 121 + tools/nntool/graph/nngraph.py | 86 +- tools/nntool/graph/types/__init__.py | 46 +- tools/nntool/graph/types/activations.py | 156 + tools/nntool/graph/types/base.py | 10 +- tools/nntool/graph/types/conv2d.py | 6 +- tools/nntool/graph/types/fusions.py | 116 + tools/nntool/graph/types/linear.py | 11 +- tools/nntool/graph/types/others.py | 334 +- .../importer/tflite/new_tflite_graph_all.py | 489 +- .../nntool/importer/tflite/propagate_hints.py | 14 +- tools/nntool/interpreter/commands/__init__.py | 0 tools/nntool/interpreter/commands/adjust.py | 33 + tools/nntool/interpreter/commands/aquant.py | 92 + tools/nntool/interpreter/commands/astats.py | 69 + tools/nntool/interpreter/commands/dump.py | 229 + tools/nntool/interpreter/commands/extract.py | 34 + tools/nntool/interpreter/commands/fquant.py | 55 + tools/nntool/interpreter/commands/freeze.py | 59 + tools/nntool/interpreter/commands/fusions.py | 68 + tools/nntool/interpreter/commands/gen.py | 97 + tools/nntool/interpreter/commands/graph.py | 77 + .../interpreter/commands/imageformat.py | 141 + .../nntool/interpreter/commands/nodeoption.py | 89 + tools/nntool/interpreter/commands/open.py | 100 + tools/nntool/interpreter/commands/qerror.py | 69 + tools/nntool/interpreter/commands/qshow.py | 44 + tools/nntool/interpreter/commands/qtune.py | 53 + .../commands/range_equalization.py | 81 + .../nntool/interpreter/commands/save_state.py | 47 + tools/nntool/interpreter/commands/stats.py | 59 + tools/nntool/interpreter/commands/temps.py | 36 + tools/nntool/interpreter/commands/tensors.py | 197 + .../nntool/interpreter/commands/validation.py | 144 + tools/nntool/interpreter/generator.py | 5 +- tools/nntool/interpreter/nntool_shell.py | 1733 +--- tools/nntool/interpreter/nntool_shell_base.py | 256 + tools/nntool/interpreter/settings.py | 278 + tools/nntool/interpreter/shell_utils.py | 65 +- tools/nntool/nntool | 7 +- .../quantization/cross_layer_range_eq.py | 78 +- tools/nntool/quantization/float32/__init__.py | 0 .../float32/float32_quantization.py | 112 + .../quantization/float32/float_kernet_set.py | 128 + .../quantization/float32/kernels/__init__.py | 0 .../float32/kernels/activations.py | 80 + .../quantization/float32/kernels/fast_conv.py | 130 + .../float32/kernels/image_format.py | 25 + 
.../quantization/float32/kernels/linear.py | 75 + .../float32/kernels/matrix_operations.py | 54 + .../quantization/float32/kernels/pad.py | 29 + .../quantization/float32/kernels/pool.py | 153 + .../quantization/float32/kernels/readme.md | 7 + .../quantization/float32/kernels/softmax.py | 34 + .../float32/kernels/tensor_functions.py | 104 + tools/nntool/quantization/kernels/__init__.py | 0 .../quantization/kernels/kernel_function.py | 129 + .../quantization/kernels/kernel_switch.py | 99 + .../quantization/multiplicative/__init__.py | 0 .../multiplicative/asymmetric/__init__.py | 0 .../asymmetric/asymmetric_mult_qtype.py | 196 + .../multiplicative/mult_mulbias_qtype.py | 126 + .../multiplicative/mult_qtype_base.py | 249 + .../multiplicative/mult_quantization.py | 388 + .../multiplicative/mult_quantizer.py | 212 + .../quantization/multiplicative/mult_utils.py | 38 + .../multiplicative/symmetric/__init__.py | 0 .../symmetric/mult_mulbias_qtype_new.py | 265 + .../symmetric/symmetric_mult_biases_qtype.py | 58 + .../symmetric/symmetric_mult_qtype.py | 232 + .../symmetric/symmetric_mult_qtype_wrapper.py | 123 + tools/nntool/quantization/qtype.py | 63 +- tools/nntool/quantization/qtype_base.py | 36 +- .../quantization/quantization_record_base.py | 202 + tools/nntool/quantization/quantization_set.py | 130 + tools/nntool/quantization/quantizer.py | 8 +- .../nntool/quantization/symmetric/__init__.py | 0 .../symmetric/kernels/__init__.py | 0 .../symmetric/kernels/activations.py | 176 + .../symmetric/kernels/fast_conv.py | 139 + .../symmetric/kernels/image_format.py | 25 + .../quantization/symmetric/kernels/linear.py | 87 + .../symmetric/kernels/matrix_operations.py | 131 + .../quantization/symmetric/kernels/pad.py | 29 + .../quantization/symmetric/kernels/pool.py | 188 + .../quantization/symmetric/kernels/softmax.py | 58 + .../symmetric/kernels/tensor_functions.py | 98 + .../symmetric/symmetric_kernet_set.py | 130 + .../symmetric/symmetric_quantization.py | 208 + .../symmetric/symmetric_quantizer.py | 607 ++ tools/nntool/quantization/tuneq.py | 17 +- tools/nntool/reports/error_reporter.py | 13 +- tools/nntool/reports/filter_reporter.py | 2 +- tools/nntool/reports/graph_reporter.py | 2 +- tools/nntool/reports/quantization_reporter.py | 94 +- tools/nntool/requirements.txt | 6 +- .../stats/activation_stats_collector.py | 26 +- tools/nntool/stats/error_stats_collector.py | 83 +- tools/nntool/stats/filter_stats_collector.py | 48 +- .../stats/step_error_stats_collector.py | 50 +- tools/nntool/tests/conftest.py | 106 +- tools/nntool/tests/graph/mobv1_quant.tflite | Bin 4657216 -> 4657216 bytes tools/nntool/tests/test_cmds.py | 2 - .../nntool/tests/test_cross_layer_range_eq.py | 23 +- tools/nntool/tests/test_execution.py | 125 +- tools/nntool/tests/test_fusions.py | 29 +- tools/nntool/tests/test_generator.py | 189 +- tools/nntool/tests/test_matcher.py | 5 +- tools/nntool/tests/test_new_paramstate.py | 4 +- tools/nntool/tests/test_nngraph.py | 234 +- tools/nntool/tests/test_quantize.py | 28 +- tools/nntool/tests/test_reports.py | 12 +- tools/nntool/tests/test_sparse_list.py | 28 +- tools/nntool/utils/add_sys_path.py | 16 + tools/nntool/utils/at_norm.py | 43 + tools/nntool/utils/at_tensor_loader.py | 190 + tools/nntool/utils/data_importer.py | 79 +- tools/nntool/utils/exp_17_15.py | 95 + tools/nntool/utils/formatters.py | 37 + tools/nntool/utils/fuzzy.py | 61 + tools/nntool/utils/graph.py | 22 +- tools/nntool/utils/json_serializable.py | 8 + tools/nntool/utils/new_param_state.py | 17 +- 
tools/nntool/utils/node_id.py | 4 +- tools/nntool/utils/option_list.py | 3 +- tools/nntool/utils/validation_utils.py | 102 +- tools/rules/pulp_rules.mk | 4 + 370 files changed, 49200 insertions(+), 9963 deletions(-) create mode 100644 examples/nntool/kws/images/features_0_1.pgm create mode 100644 examples/nntool/kws/images/features_1_3.pgm create mode 100644 examples/nntool/kws/images/features_2_4.pgm create mode 100644 examples/nntool/kws/images/features_3_4.pgm create mode 100644 examples/nntool/kws/images/features_4_2.pgm create mode 100644 examples/nntool/kws/max.log create mode 100644 examples/nntool/mnist/mnist_emul.c create mode 100644 examples/nntool/mnist/model/mnist.h5 create mode 100644 examples/nntool/mnist/model/mnist.tflite create mode 100644 examples/nntool/mnist/model/nntool_script create mode 100644 examples/nntool/mnist/model/nntool_script_emul create mode 100644 examples/nntool/mnist/train_model.mk create mode 100644 examples/nntool/visual_wake/model/nntool_script_emul create mode 100644 examples/nntool/visual_wake/model/visual_wake_quant.tflite create mode 100644 examples/nntool/visual_wake/vww_emul.c create mode 100644 examples/nntool/visual_wake/vww_emul.h create mode 100644 examples/pmsis/test_periph/i2s/output/Makefile create mode 100644 examples/pmsis/test_periph/i2s/output/test.c create mode 100755 gvsoc/gvsoc/bin/gvsoc_analyze_insn create mode 100644 gvsoc/gvsoc/models/cpu/iss/include/rvXint64.hpp create mode 100644 gvsoc/gvsoc/models/devices/testbench/Makefile create mode 100644 gvsoc/gvsoc/models/devices/testbench/testbench.cpp create mode 100644 libs/gap_lib/include/gaplib/fs_switch.h create mode 100644 rtos/pulp/pulp-os/drivers/gpio/gpio-v3.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_AT_Misc.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Activation_SQ8.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_BasicKernels_SQ8.h create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Bias_Linear_SQ8.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Conv_DW_SQ8.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Conv_SQ8.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Generator_Util.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Generator_Util.h create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.h create mode 100644 tools/autotiler_v3/generators/CNN/CNN_MatAlgebra_SQ8.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_Pooling_SQ8.c create mode 100644 tools/autotiler_v3/generators/CNN/CNN_SoftMax_SQ8.c create mode 100644 tools/gap-configs/configs/devices/testbench.json create mode 100644 tools/gap-configs/configs/ips/riscv/ri5ky_v2_6_sfloat_single_regfile_int64.json create mode 100644 tools/nntool/autotiler/generators/nntool_extra_generators.c create mode 100644 tools/nntool/autotiler/generators/nntool_extra_generators.h create mode 100644 tools/nntool/autotiler/kernels/nntool_extra_kernels.h create mode 100644 tools/nntool/autotiler/kernels/norm_transpose.c create mode 100644 tools/nntool/autotiler/tests/Mk create mode 100644 tools/nntool/autotiler/tests/MkGap8 create mode 100644 tools/nntool/autotiler/tests/testModel.c create mode 100644 tools/nntool/autotiler/tests/testRun.c create mode 100644 tools/nntool/execution/execution_progress.py create mode 100644 tools/nntool/execution/graph_executer.py create mode 100644 tools/nntool/generation/at_generators/__init__.py create mode 100644 
tools/nntool/generation/at_generators/cnn_3d_tensor_permute.py create mode 100644 tools/nntool/generation/at_generators/cnn_convolution_mulbias_pool_relu.py create mode 100644 tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py create mode 100644 tools/nntool/generation/at_generators/cnn_global_pool.py create mode 100644 tools/nntool/generation/at_generators/cnn_grouped_convolution_mulbias_pool_relu.py create mode 100644 tools/nntool/generation/at_generators/cnn_grouped_convolution_pool_relu.py create mode 100644 tools/nntool/generation/at_generators/cnn_linear_relu.py create mode 100644 tools/nntool/generation/at_generators/cnn_matrix.py create mode 100644 tools/nntool/generation/at_generators/cnn_pool_relu.py create mode 100644 tools/nntool/generation/at_generators/cnn_softmax.py create mode 100644 tools/nntool/generation/at_generators/utils.py create mode 100644 tools/nntool/generation/at_types/__init__.py create mode 100644 tools/nntool/generation/at_types/at_params.py create mode 100644 tools/nntool/generation/at_types/constant_info.py create mode 100644 tools/nntool/generation/at_types/gen_ctrl.py create mode 100644 tools/nntool/generation/at_types/tc_arg_info.py create mode 100644 tools/nntool/generation/at_types/tensor_stack.py create mode 100644 tools/nntool/generation/autotiler_options.py create mode 100644 tools/nntool/generation/generators/__init__.py create mode 100644 tools/nntool/generation/generators/bindings/__init__.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/__init__.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/conv_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/fc_binding_generator.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/inout_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/inout_infos_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/mat_vect_mult_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/matadd_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/mult8/softmax_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/__init__.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/conv_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/fc_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/inout_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/matadd_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/matscale_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/softmax_bindings_generator.py create mode 100644 tools/nntool/generation/generators/bindings/pow2/transpose_bindings_generator.py create mode 100644 tools/nntool/generation/generators/generator_decorators.py create mode 100644 tools/nntool/generation/generators/globals/__init__.py create mode 100644 tools/nntool/generation/generators/globals/constant_input_generator.py create mode 100644 tools/nntool/generation/generators/globals/filter_generator.py create mode 100644 tools/nntool/generation/generators/globals/global_names.py create mode 100644 tools/nntool/generation/generators/globals/input_generator.py create mode 100644 
tools/nntool/generation/generators/globals/mult8_filter_generator.py create mode 100644 tools/nntool/generation/generators/globals/mult8_infos_generator.py create mode 100644 tools/nntool/generation/generators/globals/output_generator.py create mode 100644 tools/nntool/generation/generators/kernels/__init__.py create mode 100644 tools/nntool/generation/generators/kernels/autotiler_kernel.py create mode 100644 tools/nntool/generation/generators/kernels/general/__init__.py create mode 100644 tools/nntool/generation/generators/kernels/general/imageformat_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/__init__.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/conv_pool_relu_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/global_pool_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/linear_relu_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/mat_vect_mult_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/matadd_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/pool_relu_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/softmax_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/mult8/three_d_transpose_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/__init__.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/global_pool_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/linear_relu_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/matadd_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/matscale_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/pool_relu_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/softmax_kernels_generator.py create mode 100644 tools/nntool/generation/generators/kernels/pow2/three_d_transpose_kernels_generator.py create mode 100644 tools/nntool/generation/name_cache.py create mode 100644 tools/nntool/graph/matches/equalize_sym_mult_concats.py create mode 100644 tools/nntool/graph/matches/find_asymmetric_quantization.py create mode 100644 tools/nntool/graph/matches/find_hsigmoid.py create mode 100644 tools/nntool/graph/matches/find_missing_quantization.py create mode 100644 tools/nntool/graph/matches/match_op_activation.py create mode 100644 tools/nntool/graph/matches/propagate_softmax_sym_mult_qrec.py create mode 100644 tools/nntool/graph/matches/remove_noops.py create mode 100644 tools/nntool/graph/matches/remove_relus.py create mode 100644 tools/nntool/graph/types/activations.py create mode 100644 tools/nntool/graph/types/fusions.py create mode 100644 tools/nntool/interpreter/commands/__init__.py create mode 100644 tools/nntool/interpreter/commands/adjust.py create mode 100644 tools/nntool/interpreter/commands/aquant.py create mode 100644 tools/nntool/interpreter/commands/astats.py create mode 100644 tools/nntool/interpreter/commands/dump.py create mode 100644 tools/nntool/interpreter/commands/extract.py create mode 100644 tools/nntool/interpreter/commands/fquant.py create mode 100644 
tools/nntool/interpreter/commands/freeze.py create mode 100644 tools/nntool/interpreter/commands/fusions.py create mode 100644 tools/nntool/interpreter/commands/gen.py create mode 100644 tools/nntool/interpreter/commands/graph.py create mode 100644 tools/nntool/interpreter/commands/imageformat.py create mode 100644 tools/nntool/interpreter/commands/nodeoption.py create mode 100644 tools/nntool/interpreter/commands/open.py create mode 100644 tools/nntool/interpreter/commands/qerror.py create mode 100644 tools/nntool/interpreter/commands/qshow.py create mode 100644 tools/nntool/interpreter/commands/qtune.py create mode 100644 tools/nntool/interpreter/commands/range_equalization.py create mode 100644 tools/nntool/interpreter/commands/save_state.py create mode 100644 tools/nntool/interpreter/commands/stats.py create mode 100644 tools/nntool/interpreter/commands/temps.py create mode 100644 tools/nntool/interpreter/commands/tensors.py create mode 100644 tools/nntool/interpreter/commands/validation.py create mode 100644 tools/nntool/interpreter/nntool_shell_base.py create mode 100644 tools/nntool/interpreter/settings.py create mode 100644 tools/nntool/quantization/float32/__init__.py create mode 100644 tools/nntool/quantization/float32/float32_quantization.py create mode 100644 tools/nntool/quantization/float32/float_kernet_set.py create mode 100644 tools/nntool/quantization/float32/kernels/__init__.py create mode 100644 tools/nntool/quantization/float32/kernels/activations.py create mode 100644 tools/nntool/quantization/float32/kernels/fast_conv.py create mode 100644 tools/nntool/quantization/float32/kernels/image_format.py create mode 100644 tools/nntool/quantization/float32/kernels/linear.py create mode 100644 tools/nntool/quantization/float32/kernels/matrix_operations.py create mode 100644 tools/nntool/quantization/float32/kernels/pad.py create mode 100644 tools/nntool/quantization/float32/kernels/pool.py create mode 100644 tools/nntool/quantization/float32/kernels/readme.md create mode 100644 tools/nntool/quantization/float32/kernels/softmax.py create mode 100644 tools/nntool/quantization/float32/kernels/tensor_functions.py create mode 100644 tools/nntool/quantization/kernels/__init__.py create mode 100644 tools/nntool/quantization/kernels/kernel_function.py create mode 100644 tools/nntool/quantization/kernels/kernel_switch.py create mode 100644 tools/nntool/quantization/multiplicative/__init__.py create mode 100644 tools/nntool/quantization/multiplicative/asymmetric/__init__.py create mode 100644 tools/nntool/quantization/multiplicative/asymmetric/asymmetric_mult_qtype.py create mode 100644 tools/nntool/quantization/multiplicative/mult_mulbias_qtype.py create mode 100644 tools/nntool/quantization/multiplicative/mult_qtype_base.py create mode 100644 tools/nntool/quantization/multiplicative/mult_quantization.py create mode 100644 tools/nntool/quantization/multiplicative/mult_quantizer.py create mode 100644 tools/nntool/quantization/multiplicative/mult_utils.py create mode 100644 tools/nntool/quantization/multiplicative/symmetric/__init__.py create mode 100644 tools/nntool/quantization/multiplicative/symmetric/mult_mulbias_qtype_new.py create mode 100644 tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_biases_qtype.py create mode 100644 tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype.py create mode 100644 tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype_wrapper.py create mode 100644 
tools/nntool/quantization/quantization_record_base.py
 create mode 100644 tools/nntool/quantization/quantization_set.py
 create mode 100644 tools/nntool/quantization/symmetric/__init__.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/__init__.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/activations.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/fast_conv.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/image_format.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/linear.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/matrix_operations.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/pad.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/pool.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/softmax.py
 create mode 100644 tools/nntool/quantization/symmetric/kernels/tensor_functions.py
 create mode 100644 tools/nntool/quantization/symmetric/symmetric_kernet_set.py
 create mode 100644 tools/nntool/quantization/symmetric/symmetric_quantization.py
 create mode 100644 tools/nntool/quantization/symmetric/symmetric_quantizer.py
 create mode 100644 tools/nntool/utils/add_sys_path.py
 create mode 100644 tools/nntool/utils/at_norm.py
 create mode 100644 tools/nntool/utils/at_tensor_loader.py
 create mode 100644 tools/nntool/utils/exp_17_15.py
 create mode 100644 tools/nntool/utils/formatters.py
 create mode 100644 tools/nntool/utils/fuzzy.py

diff --git a/CHANGELOG b/CHANGELOG
index c0a3785f7..8c3b3d130 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,22 @@
+3.5
+
+Highlights of this release:
+
+- NNTool
+  - Added a new quantization scheme with scaling factors (similar to TFLite quantization)
+  - Support for importing TFLite (1 or 2) quantized graphs (uint8)
+  - New adjust algorithm supporting more general reshape/transpose layers
+  - Added an AutoTiler layer for input image formatting (HWC uint8 -> CHW int8/int16)
+  - Support for multi-input network execution
+  - Support for the new tensor dumping routines in the AutoTiler
+  - Support for model generation using the SQ8 AutoTiler generators
+  - Bug fixes
+
+- AutoTiler
+  - New SQ8 kernel set supporting scaled quantization
+  - Tensor dumping from any memory area
+  - Several bug fixes
+
 3.4
 
 Highlights of this release:
diff --git a/applications/CannyEdgeDetection/CannyDetect.c b/applications/CannyEdgeDetection/CannyDetect.c
index afa84567e..15a38e0f7 100644
--- a/applications/CannyEdgeDetection/CannyDetect.c
+++ b/applications/CannyEdgeDetection/CannyDetect.c
@@ -667,11 +667,9 @@ void canny_edge_detector()
     sprintf(imageName, "../../../%s", Imagefile);
 
     ImageIn_L2 = (unsigned char *) pi_l2_malloc( COL*LINE*sizeof(unsigned char));
 
-    unsigned int Wi, Hi;
-
-    if ( (ReadImageFromFile(imageName, &Wi, &Hi, ImageIn_L2, LINE*COL*sizeof(unsigned char))==0) || (Wi!=COL) || (Hi!=LINE))
+    if (ReadImageFromFile(imageName, COL,LINE, 1, ImageIn_L2, LINE*COL*sizeof(unsigned char), 0, 0))
     {
-        printf("Failed to load image %s or dimension mismatch Expects [%dx%d], Got [%dx%d]\n", imageName, COL, LINE, Wi, Hi);
+        printf("Failed to load image %s\n", imageName);
         pmsis_exit(-1);
     }
diff --git a/applications/CannyEdgeDetection/Makefile b/applications/CannyEdgeDetection/Makefile
index e8ed386fb..464343acd 100644
--- a/applications/CannyEdgeDetection/Makefile
+++ b/applications/CannyEdgeDetection/Makefile
@@ -26,10 +26,6 @@ APP_LDFLAGS += -flto -Wl,--gc-sections
 # in a clean way.
APP_CFLAGS += -Wall -Werror -Wno-maybe-uninitialized -Wno-unused-but-set-variable -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wundef -# Should be removed once openocd is the default bridge -export GAP_USE_OPENOCD=1 -io=host - clean:: rm -rf img_OUT.ppm diff --git a/applications/FaceDetection/FaceDetGenerator.c b/applications/FaceDetection/FaceDetGenerator.c index ba40b4e92..4d7b9f434 100644 --- a/applications/FaceDetection/FaceDetGenerator.c +++ b/applications/FaceDetection/FaceDetGenerator.c @@ -92,7 +92,7 @@ void GenerateResize(char *Name, int Wi, int Hi, int Wo, int Ho) KernelIterSpace(1, IterTiledSpace(KER_ITER_TILE0)), TILE_HOR, CArgs(2, TCArg("unsigned char *", "In"), TCArg("unsigned char *", "Out")), - Calls(1, Call("KerResizeBilinear", LOC_INNER_LOOP, + Calls(1, Call("KerResizeBilinear", LOC_LOOP, Bindings(8, K_Arg("In", KER_ARG_TILE), K_Arg("In", KER_ARG_W), K_Arg("In", KER_ARG_H), @@ -123,13 +123,13 @@ void GenerateIntegralImage(char *Name, TCArg("unsigned int * __restrict__", "IntegralImage") ), Calls(2, - Call("KerIntegralImagePrime", LOC_INNER_LOOP_PROLOG, + Call("KerIntegralImagePrime", LOC_LOOP_PROLOG, Bindings(2, K_Arg("KerBuffer",KER_ARG), K_Arg("KerIn", KER_ARG_TILE_W) ) ), - Call("KerIntegralImageProcess", LOC_INNER_LOOP, + Call("KerIntegralImageProcess", LOC_LOOP, Bindings(5, K_Arg("KerIn", KER_ARG_TILE), K_Arg("KerIn", KER_ARG_TILE_W), @@ -162,13 +162,13 @@ void GenerateSquaredIntegralImage(char *Name, TCArg("unsigned int * __restrict__", "IntegralImage") ), Calls(2, - Call("KerIntegralImagePrime", LOC_INNER_LOOP_PROLOG, + Call("KerIntegralImagePrime", LOC_LOOP_PROLOG, Bindings(2, K_Arg("KerBuffer",KER_ARG), K_Arg("KerIn", KER_ARG_TILE_W) ) ), - Call("KerSquaredIntegralImageProcess", LOC_INNER_LOOP, + Call("KerSquaredIntegralImageProcess", LOC_LOOP, Bindings(5, K_Arg("KerIn", KER_ARG_TILE), K_Arg("KerIn", KER_ARG_TILE_W), @@ -208,7 +208,7 @@ void GenerateCascadeClassifier(char *Name, ), Calls(1, - Call("KerEvaluateCascade", LOC_INNER_LOOP, + Call("KerEvaluateCascade", LOC_LOOP, Bindings(8, K_Arg("KerII", KER_ARG_TILE), K_Arg("KerIISQ", KER_ARG_TILE), diff --git a/applications/FaceDetection/Makefile b/applications/FaceDetection/Makefile index 89980492b..e5594bf38 100644 --- a/applications/FaceDetection/Makefile +++ b/applications/FaceDetection/Makefile @@ -28,7 +28,12 @@ override config_args += --config-opt=camera/image-stream=$(CURDIR)/imgTest0.pgm APP_SRCS += $(FACE_DET_SRCS) APP_INC += $(TILER_INC) -APP_CFLAGS += -O3 -g -D__PMSIS__ -DUSE_CAMERA -DUSE_DISPLAY -DNB_FRAMES=$(NB_FRAMES) +APP_CFLAGS += -O3 -g -D__PMSIS__ -DUSE_CAMERA -DNB_FRAMES=$(NB_FRAMES) + +display ?= 1 +ifeq ($(display), 1) +APP_CFLAGS += -DUSE_DISPLAY +endif BOARD_NAME ?= gapoc_a PMSIS_OS ?= freertos diff --git a/applications/FaceDetection/main.c b/applications/FaceDetection/main.c index ae1382feb..13ff6609e 100644 --- a/applications/FaceDetection/main.c +++ b/applications/FaceDetection/main.c @@ -51,9 +51,11 @@ #define LCD_HEIGHT 240 static unsigned char *imgBuff0; +#if defined(USE_DISPLAY) static struct pi_device ili; static pi_buffer_t buffer; static pi_buffer_t buffer_out; +#endif /* USE_DISPLAY */ static struct pi_device cam; L2_MEM unsigned char *ImageOut; @@ -70,11 +72,9 @@ ArgCluster_T ClusterCall; void setCursor(struct pi_device *device,signed short x, signed short y); void writeFillRect(struct pi_device *device, unsigned short x, unsigned short y, unsigned short w, unsigned short h, unsigned short color); void writeText(struct pi_device *device,char* str,int fontsize); 
-#endif /* USE_DISPLAY */ static int open_display(struct pi_device *device) { -#if defined(USE_DISPLAY) struct pi_ili9341_conf ili_conf; pi_ili9341_conf_init(&ili_conf); @@ -85,9 +85,9 @@ static int open_display(struct pi_device *device) { return -1; } -#endif return 0; } +#endif /* USE_DISPLAY */ #if defined(USE_CAMERA) #if defined(HIMAX) @@ -166,15 +166,17 @@ void test_facedetection(void) { printf("Failed to allocate Memory for one or both Integral Images (%d bytes)\n", ImgSize*sizeof(unsigned int)); pmsis_exit(-3); - } + } printf("malloc done\n"); + #if defined(USE_DISPLAY) if (open_display(&ili)) { printf("Failed to open display\n"); pmsis_exit(-4); } printf("display done\n"); + #endif /* USE_DISPLAY */ if (open_camera(&cam)) { @@ -183,6 +185,7 @@ void test_facedetection(void) } printf("Camera open success\n"); + #if defined(USE_DISPLAY) #if defined(HIMAX) buffer.data = imgBuff0+CAM_WIDTH*2+2; buffer.stride = 4; @@ -195,14 +198,13 @@ void test_facedetection(void) pi_buffer_init(&buffer, PI_BUFFER_TYPE_L2, imgBuff0); #endif /* HIMAX */ - #if defined(USE_DISPLAY) buffer_out.data = ImageOut; buffer_out.stride = 0; pi_buffer_init(&buffer_out, PI_BUFFER_TYPE_L2, ImageOut); pi_buffer_set_stride(&buffer_out, 0); - #endif /* USE_DISPLAY */ pi_buffer_set_format(&buffer, CAM_WIDTH, CAM_HEIGHT, 1, PI_BUFFER_FORMAT_GRAY); + #endif /* USE_DISPLAY */ ClusterCall.ImageIn = imgBuff0; ClusterCall.Win = W; diff --git a/applications/FaceDetection/testset.cfg b/applications/FaceDetection/testset.cfg index 386188928..e4d988786 100644 --- a/applications/FaceDetection/testset.cfg +++ b/applications/FaceDetection/testset.cfg @@ -6,10 +6,10 @@ test = Test( name = 'face_detection', commands = [ Shell('clean', 'make clean'), - Shell('build', 'make all NB_FRAMES=5'), + Shell('build', 'make all NB_FRAMES=5 display=0'), Shell('run', 'make run') ], timeout=1000000, ) - + c['tests'] = [ test ] diff --git a/applications/jpeg_encoder/Makefile b/applications/jpeg_encoder/Makefile index a1bcbe835..308788bd9 100644 --- a/applications/jpeg_encoder/Makefile +++ b/applications/jpeg_encoder/Makefile @@ -1,3 +1,6 @@ +# Only Pulp-Os for now. 
+PMSIS_OS=pulpos + APP = test APP_SRCS += test.c ImgIO.c APP_CFLAGS += -O3 -g -Werror diff --git a/configs/common.sh b/configs/common.sh index a1443c9b4..a8195be81 100644 --- a/configs/common.sh +++ b/configs/common.sh @@ -14,6 +14,9 @@ export DEP_DIRS=$INSTALL_DIR export RULES_DIR=$GAP_SDK_HOME/tools/rules export NNTOOL_DIR=$GAP_SDK_HOME/tools/nntool +export NNTOOL_PATH=$GAP_SDK_HOME/tools/nntool +export NNTOOL_KERNELS_PATH=$NNTOOL_DIR/autotiler/kernels +export NNTOOL_GENERATOR_PATH=$NNTOOL_DIR/autotiler/generators export PATH="$NNTOOL_DIR":$PATH # PulpOS 2 @@ -55,8 +58,8 @@ export TILER_EMU_INC=$TILER_PATH/include export TILER_GENERATOR_PATH=$TILER_PATH/generators export TILER_CNN_KERNEL_PATH=$TILER_PATH/generators/CNN export TILER_CNN_GENERATOR_PATH=$TILER_PATH/generators/CNN -export TILER_CNN_KERNEL_PATH_SQ8=$TILER_PATH/CNN_Libraries_SQ8 -export TILER_CNN_GENERATOR_PATH_SQ8=$TILER_PATH/CNN_Generators_SQ8 +export TILER_CNN_KERNEL_PATH_SQ8=$TILER_PATH/generators/CNN +export TILER_CNN_GENERATOR_PATH_SQ8=$TILER_PATH/generators/CNN # OpenOCD diff --git a/examples/autotiler/Cifar10/Makefile b/examples/autotiler/Cifar10/Makefile index 18d0185c8..7243dff85 100644 --- a/examples/autotiler/Cifar10/Makefile +++ b/examples/autotiler/Cifar10/Makefile @@ -8,10 +8,26 @@ RM=rm -f CNN_AT_PATH = $(TILER_GENERATOR_PATH)/CNN APP = Cifar10 + +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_BiasReLULinear_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Pooling_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_MatAlgebra.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_SoftMax.c +MODEL_LIB_POW2 += $(NNTOOL_KERNELS_PATH)/norm_transpose.c +MODEL_LIB_INCLUDE_POW2 = -I$(NNTOOL_KERNELS_PATH) -I$(TILER_CNN_KERNEL_PATH) -I$(NNTOOL_KERNELS_PATH) +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generator_Util.c +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generators.c +MODEL_GEN_POW2 += $(NNTOOL_GENERATOR_PATH)/nntool_extra_generators.c +MODEL_GEN_INCLUDE_POW2 = -I$(TILER_CNN_GENERATOR_PATH) -I$(NNTOOL_GENERATOR_PATH) + + APP_SRCS += Cifar10.c Cifar10Kernels.c \ - $(CNN_AT_PATH)/CNN_BiasReLULinear_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_Conv_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_Pooling_BasicKernels.c + $(MODEL_LIB_POW2) + APP_INC += $(TILER_INC) $(CNN_AT_PATH) COEF_FLAG = -DCOEF_L2 @@ -57,7 +73,7 @@ all:: model # Build the code generator GenTile: - gcc -o GenCifar10 $(COEF_FLAG) -I$(TILER_INC) -I$(CNN_AT_PATH) Cifar10Model.c $(CNN_AT_PATH)/CNN_Generators.c $(TILER_LIB) + gcc -o GenCifar10 $(COEF_FLAG) -I$(TILER_INC) -I$(CNN_AT_PATH) Cifar10Model.c $(MODEL_GEN_POW2) $(TILER_LIB) # Run the code generator Cifar10Kernels.c: GenTile diff --git a/examples/autotiler/IntegralImage/Makefile b/examples/autotiler/IntegralImage/Makefile index ee8e3891d..400a0873e 100644 --- a/examples/autotiler/IntegralImage/Makefile +++ b/examples/autotiler/IntegralImage/Makefile @@ -14,16 +14,12 @@ MODEL_GEN_C = $(addsuffix .c, $(MODEL_GEN)) MODEL_GEN_CLEAN = $(MODEL_GEN_C) $(addsuffix .h, $(MODEL_GEN)) RM=rm -f -APP_SRCS += main.c $(GAP_LIB_PATH)/img_io/ImgIO.c $(MODEL_GEN_C) $(GEN_KER_PATH)/IntegralImgBasicKernels.c +APP_SRCS += main.c $(MODEL_GEN_C) $(GEN_KER_PATH)/IntegralImgBasicKernels.c $(GAP_LIB_PATH)/img_io/ImgIO.c 
APP_INC = $(GAP_LIB_PATH)/include APP_CFLAGS += -O3 -mno-memcpy -fno-tree-loop-distribute-patterns $(JENKINS_FLAGS) APP_CFLAGS += -I$(TILER_EMU_INC) -I$(TILER_INC) -I$(GEN_KER_PATH) -#moved to new openocd bridge -export GAP_USE_OPENOCD=1 -io=host - all:: model # Build the code generator diff --git a/examples/autotiler/IntegralImage/main.c b/examples/autotiler/IntegralImage/main.c index 4339f9fe3..755521a45 100644 --- a/examples/autotiler/IntegralImage/main.c +++ b/examples/autotiler/IntegralImage/main.c @@ -97,7 +97,7 @@ void integral_image(int argc, char *argv[]) #ifndef NO_BRIDGE //Reading Image from Hyperflash - if ((ReadImageFromFile(ImageName, &Wi, &Hi, ImageIn, W*H*sizeof(unsigned char))==0) || (Wi!=W) || (Hi!=H)) { + if (ReadImageFromFile(ImageName, W, H, 1, ImageIn, W*H*sizeof(unsigned char), 0, 0)) { printf("Failed to load image %s or dimension mismatch Expects [%dx%d], Got [%dx%d]\n", ImageName, W, H, Wi, Hi); pmsis_exit(-6); } diff --git a/examples/autotiler/Mnist/Makefile b/examples/autotiler/Mnist/Makefile index e427a7962..18f60d01b 100644 --- a/examples/autotiler/Mnist/Makefile +++ b/examples/autotiler/Mnist/Makefile @@ -23,10 +23,23 @@ endif CNN_AT_PATH = $(TILER_GENERATOR_PATH)/CNN +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_BiasReLULinear_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Pooling_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_MatAlgebra.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_SoftMax.c +MODEL_LIB_POW2 += $(NNTOOL_KERNELS_PATH)/norm_transpose.c +MODEL_LIB_INCLUDE_POW2 = -I$(NNTOOL_KERNELS_PATH) -I$(TILER_CNN_KERNEL_PATH) -I$(NNTOOL_KERNELS_PATH) +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generator_Util.c +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generators.c +MODEL_GEN_POW2 += $(NNTOOL_GENERATOR_PATH)/nntool_extra_generators.c +MODEL_GEN_INCLUDE_POW2 = -I$(TILER_CNN_GENERATOR_PATH) -I$(NNTOOL_GENERATOR_PATH) + APP_SRCS += Mnist.c MnistKernels.c \ - $(CNN_AT_PATH)/CNN_BiasReLULinear_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_Conv_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_Pooling_BasicKernels.c \ + $(MODEL_LIB_POW2) \ ImgIO.c APP_INC += $(TILER_INC) $(CNN_AT_PATH) @@ -46,7 +59,7 @@ all:: model # Build the code generator GenTile: - gcc -o GenMnist -I$(TILER_INC) -I$(CNN_AT_PATH) MnistModel.c $(CNN_AT_PATH)/CNN_Generators.c $(TILER_LIB) + gcc -o GenMnist -I$(TILER_INC) -I$(CNN_AT_PATH) MnistModel.c $(MODEL_GEN_POW2) $(TILER_LIB) # Run the code generator MnistKernels.c: GenTile diff --git a/examples/autotiler/MnistGraph/Makefile b/examples/autotiler/MnistGraph/Makefile index 174ab2b59..b9d3df91a 100644 --- a/examples/autotiler/MnistGraph/Makefile +++ b/examples/autotiler/MnistGraph/Makefile @@ -24,17 +24,29 @@ io=host CNN_AT_PATH = $(TILER_GENERATOR_PATH)/CNN +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_BiasReLULinear_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Pooling_BasicKernels.c +MODEL_LIB_POW2 += 
$(TILER_CNN_KERNEL_PATH)/CNN_MatAlgebra.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_SoftMax.c +MODEL_LIB_POW2 += $(NNTOOL_KERNELS_PATH)/norm_transpose.c +MODEL_LIB_INCLUDE_POW2 = -I$(NNTOOL_KERNELS_PATH) -I$(TILER_CNN_KERNEL_PATH) -I$(NNTOOL_KERNELS_PATH) +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generator_Util.c +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generators.c +MODEL_GEN_POW2 += $(NNTOOL_GENERATOR_PATH)/nntool_extra_generators.c +MODEL_GEN_INCLUDE_POW2 = -I$(TILER_CNN_GENERATOR_PATH) -I$(NNTOOL_GENERATOR_PATH) + + APP_SRCS += Mnist.c MnistKernels.c \ - $(CNN_AT_PATH)/CNN_BiasReLULinear_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_Conv_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_Conv_DP_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_Pooling_BasicKernels.c \ - $(CNN_AT_PATH)/CNN_SoftMax.c \ + $(MODEL_LIB_POW2) \ ImgIO.c APP_INC += $(TILER_INC) $(CNN_AT_PATH) -APP_CFLAGS += -w -O3 -mno-memcpy -fno-tree-loop-distribute-patterns -fdata-sections -ffunction-sections +APP_CFLAGS += -w -g3 -O3 -mno-memcpy -fno-tree-loop-distribute-patterns -fdata-sections -ffunction-sections APP_CFLAGS += -Wno-maybe-uninitialized -Wno-unused-but-set-variable LDFLAGS += -flto -Wl,--gc-sections @@ -48,7 +60,7 @@ all:: model # Build the code generator GenTile: - gcc -o GenMnist -I$(TILER_INC) -I$(CNN_AT_PATH) MnistModel.c $(CNN_AT_PATH)/CNN_Generators.c $(TILER_LIB) + gcc -o GenMnist -I$(TILER_INC) -I$(CNN_AT_PATH) MnistModel.c $(MODEL_GEN_POW2) $(TILER_LIB) # Run the code generator MnistKernels.c: GenTile diff --git a/examples/native/freeRTOS/periph/timer/test_timer.c b/examples/native/freeRTOS/periph/timer/test_timer.c index 0b1461261..1589b6683 100644 --- a/examples/native/freeRTOS/periph/timer/test_timer.c +++ b/examples/native/freeRTOS/periph/timer/test_timer.c @@ -1,8 +1,6 @@ /* PMSIS includes */ #include "pmsis.h" -#if defined(PMSIS_DRIVERS) -#include "pmsis_driver/pmsis_it.h" -#endif /* PMSIS_DRIVERS */ +#include "pmsis/implem/drivers/pmsis_it.h" #define NB_ITER ( 5 ) diff --git a/examples/nntool/common/model_decl.mk b/examples/nntool/common/model_decl.mk index 2e68e7908..376a4ef2c 100644 --- a/examples/nntool/common/model_decl.mk +++ b/examples/nntool/common/model_decl.mk @@ -6,7 +6,7 @@ MODEL_SUFFIX?= -MODEL_PREFIX?=GapFlow +MODEL_PREFIX?= # The training of the model is slightly different depending on # the quantization. 
This is because in 8 bit mode we used signed @@ -21,19 +21,15 @@ else endif endif -MODEL_PYTHON=python +MODEL_PYTHON=python3 -# Increase this to improve accuracy -TRAINING_EPOCHS?=1 -MODEL_COMMON ?= ../common -MODEL_COMMON_INC ?= $(MODEL_COMMON)/src -MODEL_COMMON_SRC ?= $(MODEL_COMMON)/src -MODEL_COMMON_SRC_FILES ?= ImgIO.c helpers.c +TRAINED_TFLITE_MODEL=model/$(MODEL_PREFIX).tflite +MODEL_COMMON ?= common +MODEL_COMMON_INC ?= $(GAP_SDK_HOME)/libs/gap_lib/include +MODEL_COMMON_SRC ?= $(GAP_SDK_HOME)/libs/gap_lib/img_io +MODEL_COMMON_SRC_FILES ?= ImgIO.c MODEL_COMMON_SRCS = $(realpath $(addprefix $(MODEL_COMMON_SRC)/,$(MODEL_COMMON_SRC_FILES))) -MODEL_TRAIN = model/train.py MODEL_BUILD = BUILD_MODEL$(MODEL_SUFFIX) -MODEL_TRAIN_BUILD = BUILD_TRAIN$(TRAIN_SUFFIX) -MODEL_H5 = $(MODEL_TRAIN_BUILD)/$(MODEL_PREFIX).h5 MODEL_TFLITE = $(MODEL_BUILD)/$(MODEL_PREFIX).tflite @@ -42,11 +38,16 @@ MODEL_TENSORS = $(MODEL_BUILD)/$(MODEL_PREFIX)_L3_Flash_Const.dat MODEL_STATE = $(MODEL_BUILD)/$(MODEL_PREFIX).json MODEL_SRC = $(MODEL_PREFIX)Model.c +MODEL_HEADER = $(MODEL_PREFIX)Info.h MODEL_GEN = $(MODEL_BUILD)/$(MODEL_PREFIX)Kernels MODEL_GEN_C = $(addsuffix .c, $(MODEL_GEN)) MODEL_GEN_CLEAN = $(MODEL_GEN_C) $(addsuffix .h, $(MODEL_GEN)) MODEL_GEN_EXE = $(MODEL_BUILD)/GenTile +ifdef MODEL_QUANTIZED + NNTOOL_EXTRA_FLAGS = -q +endif + MODEL_GENFLAGS_EXTRA = EXTRA_GENERATOR_SRC = @@ -60,6 +61,9 @@ RM=rm -f NNTOOL=nntool +NNTOOL_PATH = $(GAP_SDK_HOME)/tools/nntool +NNTOOL_KERNEL_PATH = $(NNTOOL_PATH)/autotiler/kernels +NNTOOL_GENERATOR_PATH = $(NNTOOL_PATH)/autotiler/generators # Here we set the memory allocation for the generated kernels # REMEMBER THAT THE L1 MEMORY ALLOCATION MUST INCLUDE SPACE # FOR ALLOCATED STACKS! @@ -67,20 +71,54 @@ MODEL_L1_MEMORY=52000 MODEL_L2_MEMORY=307200 MODEL_L3_MEMORY=8388608 # hram - HyperBus RAM -# qspiram - Quad SPI RAM +# qspiram - Quad SPI RA MODEL_L3_EXEC=hram # hflash - HyperBus Flash # qpsiflash - Quad SPI Flash MODEL_L3_CONST=hflash -MODEL_SRCS += $(MODEL_GEN_C) -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_BiasReLULinear_BasicKernels.c -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_BasicKernels.c -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DP_BasicKernels.c -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_BasicKernels.c -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_DP_BasicKernels.c -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_Pooling_BasicKernels.c -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_MatAlgebra.c -MODEL_SRCS += $(TILER_CNN_KERNEL_PATH)/CNN_SoftMax.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_BiasReLULinear_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Pooling_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_MatAlgebra.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_SoftMax.c +MODEL_LIB_POW2 += $(NNTOOL_KERNEL_PATH)/norm_transpose.c +MODEL_LIB_INCLUDE_POW2 = -I$(TILER_CNN_KERNEL_PATH) -I$(NNTOOL_KERNEL_PATH) +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generator_Util.c +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generators.c +MODEL_GEN_POW2 += $(NNTOOL_GENERATOR_PATH)/nntool_extra_generators.c +MODEL_GEN_INCLUDE_POW2 = -I$(TILER_CNN_GENERATOR_PATH) -I$(NNTOOL_GENERATOR_PATH) + +MODEL_LIB_SQ8 += 
$(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Activation_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Bias_Linear_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Conv_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Pooling_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Conv_DW_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_MatAlgebra_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_SoftMax_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_AT_Misc.c +MODEL_LIB_SQ8 += $(NNTOOL_KERNEL_PATH)/norm_transpose.c +MODEL_LIB_INCLUDE_SQ8 = -I$(TILER_CNN_KERNEL_PATH) -I$(TILER_CNN_KERNEL_PATH_SQ8) -I$(NNTOOL_KERNEL_PATH) +MODEL_GEN_SQ8 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generator_Util.c +MODEL_GEN_SQ8 += $(TILER_CNN_GENERATOR_PATH_SQ8)/CNN_Generators_SQ8.c +MODEL_GEN_SQ8 += $(NNTOOL_GENERATOR_PATH)/nntool_extra_generators.c +MODEL_GEN_INCLUDE_SQ8 = -I$(TILER_CNN_GENERATOR_PATH) -I$(TILER_CNN_GENERATOR_PATH_SQ8) -I$(NNTOOL_GENERATOR_PATH) + MODEL_SIZE_CFLAGS = -DAT_INPUT_HEIGHT=$(AT_INPUT_HEIGHT) -DAT_INPUT_WIDTH=$(AT_INPUT_WIDTH) -DAT_INPUT_COLORS=$(AT_INPUT_COLORS) + +ifdef MODEL_SQ8 + CNN_GEN = $(MODEL_GEN_SQ8) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_SQ8) + CNN_LIB = $(MODEL_LIB_SQ8) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_SQ8) +else + CNN_GEN = $(MODEL_GEN_POW2) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_POW2) + CNN_LIB = $(MODEL_LIB_POW2) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_POW2) +endif +$(info GEN ... $(CNN_GEN)) diff --git a/examples/nntool/common/model_rules.mk b/examples/nntool/common/model_rules.mk index ed86dd3bc..b96cf3560 100644 --- a/examples/nntool/common/model_rules.mk +++ b/examples/nntool/common/model_rules.mk @@ -13,7 +13,20 @@ else MODEL_TRAIN_FLAGS = endif +ifdef MODEL_SQ8 + CNN_GEN = $(MODEL_GEN_SQ8) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_SQ8) + CNN_LIB = $(MODEL_LIB_SQ8) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_SQ8) +else + CNN_GEN = $(MODEL_GEN_POW2) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_POW2) + CNN_LIB = $(MODEL_LIB_POW2) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_POW2) +endif + USE_DISP=1 + ifdef USE_DISP SDL_FLAGS= -lSDL2 -lSDL2_ttf else @@ -32,59 +45,38 @@ ifdef MODEL_L3_MEMORY MODEL_GEN_EXTRA_FLAGS += --L3 $(MODEL_L3_MEMORY) endif -$(MODEL_TRAIN_BUILD): - mkdir $(MODEL_TRAIN_BUILD) $(MODEL_BUILD): - mkdir $(MODEL_BUILD) - -ifneq ("$(wildcard $(MODEL_TRAIN))","") -# Runs the Keras script to create and train the model -# Exports the graph and trained tensors in H5 format -$(MODEL_H5): $(MODEL_TRAIN) | $(MODEL_TRAIN_BUILD) - echo "CREATING AND TRAINING KERAS MODEL" - $(MODEL_PYTHON) $(MODEL_TRAIN) $(MODEL_TRAIN_FLAGS) -e $(TRAINING_EPOCHS) $@ + mkdir $(MODEL_BUILD) -# PHONY targets defined for each step so that you can execute in sequence to -# demonstrate the flow -train: $(MODEL_H5) - -# Converts the H5 file to TFLITE format -$(MODEL_TFLITE): $(MODEL_H5) | $(MODEL_BUILD) - echo "CONVERTING KERAS H5 TO TENSORFLOW LITE FLATBUFFER" - $(MODEL_PYTHON) model/h5_to_tflite.py $< $@ - -tflite: $(MODEL_TFLITE) -else $(MODEL_TFLITE): $(TRAINED_TFLITE_MODEL) | $(MODEL_BUILD) cp $< $@ -endif # Creates an NNTOOL state file by running the commands in the script # These commands could be run interactively # The commands: # Adjust the model to match AutoTiler tensor order # Fuse nodes together to match fused AutoTiler generators -# Auto quantify the graph +# Quantize the graph if not already done with tflite quantization # Save the graph state files $(MODEL_STATE): $(MODEL_TFLITE) $(IMAGES) $(NNTOOL_SCRIPT) | $(MODEL_BUILD) echo "GENERATING 
NNTOOL STATE FILE" - $(NNTOOL) -s $(NNTOOL_SCRIPT) $< + $(NNTOOL) -s $(NNTOOL_SCRIPT) $< $(NNTOOL_EXTRA_FLAGS) nntool_state: $(MODEL_STATE) # Runs NNTOOL with its state file to generate the autotiler model code $(MODEL_BUILD)/$(MODEL_SRC): $(MODEL_STATE) $(MODEL_TFLITE) | $(MODEL_BUILD) echo "GENERATING AUTOTILER MODEL" - $(NNTOOL) -g -M $(MODEL_BUILD) -m $(MODEL_SRC) -T $(TENSORS_DIR) $(MODEL_GENFLAGS_EXTRA) $< + $(NNTOOL) -g -M $(MODEL_BUILD) -m $(MODEL_SRC) -T $(TENSORS_DIR) -H $(MODEL_HEADER) $(MODEL_GENFLAGS_EXTRA) $< nntool_gen: $(MODEL_BUILD)/$(MODEL_SRC) # Build the code generator from the model code -$(MODEL_GEN_EXE): $(MODEL_BUILD)/$(MODEL_SRC) $(EXTRA_GENERATOR_SRC) | $(MODEL_BUILD) +$(MODEL_GEN_EXE): $(CNN_GEN) $(MODEL_BUILD)/$(MODEL_SRC) $(EXTRA_GENERATOR_SRC) | $(MODEL_BUILD) echo "COMPILING AUTOTILER MODEL" - gcc -g -o $(MODEL_GEN_EXE) -I. -I$(TILER_INC) -I$(TILER_EMU_INC) -I$(TILER_CNN_GENERATOR_PATH) $(MODEL_BUILD)/$(MODEL_SRC) $(TILER_CNN_GENERATOR_PATH)/CNN_Generators.c $(EXTRA_GENERATOR_SRC) $(TILER_LIB) $(SDL_FLAGS) + gcc -g -o $(MODEL_GEN_EXE) -I. -I$(TILER_INC) -I$(TILER_EMU_INC) $(CNN_GEN_INCLUDE) $(CNN_LIB_INCLUDE) $? $(TILER_LIB) compile_model: $(MODEL_GEN_EXE) @@ -99,7 +91,7 @@ model: $(MODEL_GEN_C) clean_model: $(RM) $(MODEL_GEN_EXE) $(RM) -rf $(MODEL_BUILD) - $(RM) *.dat + $(RM) $(MODEL_BUILD)/*.dat clean_train: $(RM) -rf $(MODEL_TRAIN_BUILD) diff --git a/examples/nntool/kws/Makefile b/examples/nntool/kws/Makefile index 3d41230c0..6512b3983 100644 --- a/examples/nntool/kws/Makefile +++ b/examples/nntool/kws/Makefile @@ -29,11 +29,14 @@ $(info Building GAP8 mode with $(KWS_BITS) bit quantization) # 8 bit so the input to the model needs to be shifted 1 bit ifeq ($(KWS_BITS),8) $(info Configure 8 bit model) + MODEL_SQ8 = 1 APP_CFLAGS += -DKWS_8BIT -DPRINT_IMAGE NNTOOL_SCRIPT=model/nntool_script8 MODEL_SUFFIX = _8BIT else ifeq ($(KWS_BITS),16) + # use a custom template to switch on the performance checking + MODEL_GENFLAGS_EXTRA= -c "model/code_template.c" $(info Configure 16 bit model) APP_CFLAGS += -DKWS_16BIT NNTOOL_SCRIPT=model/nntool_script16 @@ -66,18 +69,15 @@ MODEL_L3_EXEC=hram # qpsiflash - Quad SPI Flash MODEL_L3_CONST=hflash -# use a custom template to switch on the performance checking -MODEL_GENFLAGS_EXTRA= -c "model/code_template.c" - pulpChip = GAP PULP_APP = kws2 USE_PMSIS_BSP=1 -APP_SRCS += kws.c ImgIO.c $(MODEL_SRCS) MFCC_Dump.c #./model/layers.c +APP_SRCS += kws.c ImgIO.c $(MODEL_GEN_C) $(CNN_LIB) #MFCC_Dump.c ./model/layers.c APP_CFLAGS += -O3 -s -mno-memcpy -fno-tree-loop-distribute-patterns -Wno-sign-compare -Wno-maybe-uninitialized -Wno-type-limits -APP_CFLAGS += -I. -I./helpers -I$(TILER_EMU_INC) -I$(TILER_INC) -I$(GEN_PATH) -I$(MODEL_BUILD) -APP_CFLAGS += -DPERF +APP_CFLAGS += -I. -I./helpers -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD) +APP_CFLAGS += #-DPERF ifneq ($(platform),gvsoc) $(info Platform is GAPUINO) @@ -113,7 +113,7 @@ clean_all: clean clean_train .PHONY: clean_all -PMSIS_OS = freertos +#PMSIS_OS = freertos io = host include model_rules.mk include $(RULES_DIR)/pmsis_rules.mk diff --git a/examples/nntool/kws/emul.mk b/examples/nntool/kws/emul.mk index 257b85d87..052b6d4f4 100644 --- a/examples/nntool/kws/emul.mk +++ b/examples/nntool/kws/emul.mk @@ -43,10 +43,10 @@ endif MODEL_GEN_EXTRA_FLAGS= -f $(MODEL_BUILD) CC = gcc CFLAGS += -g -O0 -D__EMUL__ -INCLUDES = -I. -Ii./helpers -I$(TILER_EMU_INC) -I$(TILER_INC) -I$(GEN_PATH) -I$(MODEL_BUILD) +INCLUDES = -I. 
-I./helpers -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD) LFLAGS = LIBS = -SRCS = kws.c ImgIO.c helpers/helpers.c $(MODEL_SRCS) ./model/layers.c +SRCS = kws.c ImgIO.c $(MODEL_GEN_C) $(CNN_LIB) #./model/layers.c BUILD_DIR = BUILD_EMUL diff --git a/examples/nntool/kws/images/features_0_1.pgm b/examples/nntool/kws/images/features_0_1.pgm new file mode 100644 index 0000000000000000000000000000000000000000..9628a4a77ce8a8b83246e6f6c5b247d4c3844984 GIT binary patch literal 7857 zcmWNWg?|*s8^&jLW_HhWu?y}}q);q4w50?oR-lBSKdiV*f#M#577MNo!70+>Rw(XP zxB#)c1mbdcd%GjQ{R8&>jJ?nEeV^^s%+mCmDpgvxs8S`mS(7HsEHCM_l*v(@ZPDJH zW@p&Gf^hs@Sij;aS}Vss^*eQ*C6)#J%kW#6MD1_`m<#5BZhV^*EgfK;@dP|se+AFW zyVzP#M|mz*@tx=|`ki~LYk=*(_aAe+(3bhp=vvsllJoiyM@V3`&tX2zs|A;n@lsuO z8#Dmh!4IGok42j#Kdpfy$V5E_j3o-a$ZwKy;s!s5F4AAzKe^soCwb4C7l!W6e}Ha> z2TQNh%^{=J)&5uJCE`=CjQmQP#lqkKumV&9Ue*|eqbn>PFCt8T2oB*c`as@}++=Zd zhjflMbRUkKVY!g`TA_{sxxLWnh(5j=>|5tuwZy-|vPX0X&LNGZJMZSxYLI+)@c1*2a=guqx{SH-b@sfdBEw=zEmO3bC8arAP`> z{QMi~rP5#Iunp*6I@R6I6>DAMnQlr5nVPFfso~X24yX-m7yO?~EvBFN3Kil4u!?G6 zDHsnr0E<|F6HyG`NEG?AehBVVN`(bYH_a6zSRHhQ-gozoT5ofDFDmUFf!sv=cUW8@ zRmX?;gDHV{OSaI1%}6i!nEn7~fCMlGOyQkTC8-YUL88f?;4qO&?z0qNRW9?gY=`tG z9pS#_YHY2a`HU>Kj{mH~$Dw}}R?s>+>uZhF)s{TIFxXmNA;qv!a0!?X@k(n{IUbBG+0Ed2gEk z2(4L|M{b3mD#{AvS>poN{k6;kxl0?W%tS5O8>8MDFb*u^%g`R_A)SY2;?4R4cwe5) z{t*@Bnqnq5;j{EB_kzgPmK~no%@Lt;{vT+1n6I#%y3RgVUFuIUE#{AdW8}%w6`BTy zgRS5uIL7m&%diEVgCz1$KLIq{p0*KBh?iIA$;N(Lxt~V{Ec-k?O%ol9a+XPT!dsX6 z=rpIM)(9N13F#~V?7 zHWdxTf(exhbNE%Q9ugLMN22f->uf*yZda&HR(8Vq8%=@NH z=fk{Z=ta1x^t@Kd5g)u8%&|JfVKo=mhPnDGFvNH-9l#R41jR^h>cO*QN;gYYlk-M^JrPrUP)rd#9)(x(6 z+zzZOJ*%AI#lbJ6n-swqm<%QW3;4p{;ep1vzd#4bYW5E0%A52F+$C2MBX|#dg|2iD zjjCaf$m(n!?mU&BhDU^LDZH(Yv@Z=*2)wcUE)E2{$R@Oa*$w^s4S2+Jrb$QPXZ;Xh`osPHpUCCb;oer}>dp&!1m}d^Eb0+_Z*QWW z33RntrE>ZM@)ulBbHHq4eQQ7tpK9pA0D1?#Ame!o?rkc={{&~`JpK(okIv9}@B2nB zu+H~3H}`Sc^Scs%QkRj+#27AyUsb8-S^ z(|&NB;lI7XYQ7D1lgiSus1u=j1K2^{$li$|vI*Scd!H1`Epv30HInGzfFKJOlmiC9#!Qfunyt^N|IX`Ki&^}cdTIGnbE%RxN&6^!On z&=zSd1*klJqa6bk+R|(!1}yk=Lx( zJZ@!W$n?Ay(!Q{W(u&mV+^&b}S=Noblez`J2b**ksA>3l3(%NLs4;x5B}tihf&LPV zk>9eVq7w-ddHkN?^RheDI@~T5m&aF32mB6Ym33-K5BkMCNDmj?ERU%k+)d2VGpzeG)Pcw94V@e0dDAUK>p=O7r zOnx0@X{aaffN!n6p+3alMt;XfYBg|u7|m{hZeS?5DF*XS=r1^wZbIwvCcOkKB{yjS zuSPENzu8f#53ig#)>IoT$gg5~jn{vc!D!qm{X@ZJyCPj=$K{6l+`wQsi5KXrK__E< z9`TCLLwTV1|<3!=%ywb@E3?a^p~JnL>YXVX;61OI*9IyB;3a6vD4CNvWuM+4pJBF7K@}= zJT|*U_<6G1!`TQi&NGgEvaK$hTf7FvX*S_C@6e|O?!o$OO7KS!W6=E`@iiYTO@oXz zCjH3_dKQG?cv?lY$2KuUe3s_%k=a+nH{cN&@ocSFnYomuJ1UfHEgg=g1-8>}zQew`6ySOp~jm_tlny7vA4#fpfBUMV*K~s&lA9S{W`9FzEha@UZyNm=-#N zi+mqU5+N*1j*(l^NLX2(#Gi;7xC}pJ@Y*sCvZGw{mB#5Sf_-_mX93ef^3?1gL+b*g z>3ZDH-&XbF7zTrRBF^BK&Y%--E^Pp_X&2H>Zb~adEMH^GMGtJ|Rxu0z##?7~bj>uu zj5IY@FQ2)QVP`w_k(v)T`QNg+cxg$cK$LuhC95CA42XGj8Pc|ak7h=7eO;HQEqqCWWD zSYKB@MS3P=J&C+hEcA--;z_h8x8oo9WZ}m>`Dt&WYqN4IqdIr-t)EJ@|D1Py<$cp( ztbYz0BM&K6OS_Rc@q2KqsBQ4>T#(G8kPhPLPjZ3$Fu2qpA;cH)(rhWUzvv2Aho@dCk(KT9TPo#o2OA zx>%eg4fq5ei5BzTpd0DL8+k#Qv6e>O1TjWDNsFfrr_HzCKLJe%jz^QNGks29BALx2 zgFl0r#`#qNN9X`3!$G&I8O~m_@U&d*WJs03UVn<{=}(34%$rNYd;z?by;JKOwf-~|bu>FH z4Kq047~V`)YX1Vaa)FItHSs^v!TuX>5v@hu)T(rljN z@v-OJn%+-aWgq5C@Gpfc)%8-Esbc9hUk8#;>lyVXfOUY12==YC2CDigTw9)_eG(Vs zg>)x3F;P6CmcT{_Fnk_$aLvJYsZP zqTwH}Xol2W8mX^Gb4Y7L71t^ISy@($?ub|7Yut;^@$PjEGR1nL!AI~_dMmBE-Qs&3 z;dORfIgLMQagGY zA143P7Jv#$DgBv^!$DCE48&w>BOoaLqjRJ=@Q!B1gOnBe6W-jI8t$?M zD4yqvXro^LtjH)TSI+oNd;=$?U)A-H1V2}olOyVQQPKP`uq-efuclRkxd#31G1eH$ z%1hf}C@oJqD!R6a&m;e_k^Csyz)uT?hw}pOG1nXAeEJ(bpTmp+TAIDK`dB-Ll6{wG zp*&n&7A%i@=|h5p;Z$$|^Z>P)UD^e@>MzJYNH#gDi 
[base85 GIT binary patch data omitted — literals for the new sample feature images added as new files under examples/nntool/kws/images/ (features_1_3.pgm and the remaining features_*.pgm files introduced by this patch).]
zO!Hfe@ZfM+$e_pAGr2+ACDw=r^o=OzMUhL%mJGjz+1BTdAxX{+ffp?32!_rXqwQ@W z#cZuF#79JD+z*Xc`+S3_O&&vmd5=!lV$2mVm*$vtV5y!UPV>?vUHss!61kRa&A4P) zWhC6orFk3se8ui#-+fME;+KX$zWd2fZ%cUp= zCCaI!G^)&ck}t_gV;U;b9{a+1V1Jp+ZJ82thF=l$tAnqelR z%{0-RuKWg)uf%C8#9MEJVsZ3n##L)YXHIS`JhG!BHDveno5gapp;^h+Q}%^9y<{(o z$ebIvM6HGKMNKSS#gotw+zHx>HFz&-iZqB2tMT_R*BA`@;YcV`R3xd!UNKzzQ`8pA zNwn;izCxY1J!`71z*#51EgZ8H%h_L8F(T^Y-lA_UDZ&ZTe*sv0YqL-0gu% z!f>$_`tmDA9GWgV7U>EvRmEhOs$WBYqY4^=+>AHY7g{*nKk^0c_O5z3TIs=r{1SMY z*)SB1ljSV-L>?5KB*IR0UNcxChQbBbGkjR-=jWmC;F>-beM?5^Zy+7*(qiF4)@(~R z+&Wyl_(z=g+#jgqI$k(BKaou4#X^J0OuU}`Bn%NRXQ-_ICHfL##4Gs^UuKjSH`$G% zZ^c+`BYhhYvJ~L>h~skYsDd6g`Sn1Ew(AMFlY%Rdi4u`7jb@g-wv#+w2s ze_A+;zeky&pgdxU6@l=#vYmLyFT*96h6HSplhE&?y-|jbV9t<(mxtc$fa+tX;K?|R zrot}&bGT$qF1$s`GoNp2L7vpuAo@=C8ztbLC00%{-$FOpPdtOm5RX!!zl=pfyfd1J z^?YI|nl%0VHXKQVRRx&k}e$=_k+n7HEA4`Y%Ro1P- z*Qf&IX=&OcpJWRjBs6#}ju`j(C=n6fWqRa8{!(_J ze?aZPW<0|DGe1q7F>!W$(QpK8uD->ZZHzN7qaEQ3p>t3lTFc$2hC*~txF!#(sjuOy z+pcgdN`gBP1{#cUHcH;gljo!@Y7}{UTE|& zs*n#5BUJ^U^2`{>6jceuJNZZ?WQLK3cB#Jj79WhuvgIrh9fCFfdvv~WH}?h_VG{5C zk6r0D!%50m6OB>q7?~4p5bjRaqVjSU%7-}AQT2b%@l_lUKcGSKXJaW|r1j>z|>8IYa7+$zJrXZ)srjnBO;6+CM%FlU>GY&j%>zZKKM ziRMK-4t7Z|N`p}-8a$#KX@ajRR}@A^SP!~Kzt1z^qIOGsEq)+VU`c+!MvThot+bK& z{Tmc$WP1`~p*Y(?vW=A%ON*w3Zh;FX$}=kK4?&$_m};$)@H&-3D&cDUHKF=w@eO)I z3;8B-iZq5nc^w>>PtDk)177fEu78p3TBu`D3G3(Dd4WWaqNm~g=$f1`DJY|M6$!vAg&UiH~#qt;?NpeVbfzt|K z_|(?|n(DO`ZPL30dKbNq=)ksyYC^a0-H>u=va#rb9>Gl2XLpu^=r;0B6yf@~1D{NB z#CG`|K1WZo6Wot?%Yw|&_DnJ`cOQ<{O6GlQ{-)P2y2Em<&*THv4mV|w%r&UBtS2+k zOPG#^LnB#-HXv72Yg~pW3XbB5VvZ!FNd7PV zM?FzL;or`WYsZV;n?KVuR$Y|Vdc#$oAeNaqd>DEy;^b`QUD}{1)hA{sKY5!!$HnP$ z(G7JV*&?9+Ey!=n``8IByjJady+a_G#cCVEgBYMfqnY@NQZ<9wGTDq5UQ|YUr44_8 zo=CY`X`xm!PG$Qy_zr)IrqE%s3YfSc$9^hR}{E*zyEjASluh%YfOjH3>i0w2%{@m5BWo)GlhrbU** zg7d7oo?u#3&#*}xm3_2Qgu^h^JI|9dWO05`?W1Q9gQ`KUd_e+efY^>};Xm1XXh&Ph zzOoFyCR@NN{29FREY*Io+6(_wvnHJpGWp6c)!_u5pmjs# zP(6|iNpJ<%f=!;rI=9{qer}qymGO`@)turx_K~!s5u`Xd2OO%REU_BuD3{g_O;g&? zo;dJec>~AcbkUNmrrpFH(E-PT18pbO;Do2C?zQ~oKW3cL`P$gUd1cW9b?jRkhf_2>@ijy~cE zY7^~+#rUnP=&5P>)Os>Mny<2MHPiV&`a#yf5cn`|h3~57v>fz6pFtFg#zAP07JwlO z@gn7ua$q9VM4| z2sX)8uwOocN?23<_%#> highest) { highest = ResOut[i]; rec_digit = i; } + printf("class %d: %d\n", i, ResOut[i]); } printf("\n"); @@ -119,7 +117,6 @@ void test_kws(void) #endif /* NO_IMAGE && LINK_IMAGE_HEADER */ #endif /* __EMUL__ */ -#if 0 unsigned char *ImageInChar = (unsigned char *) pi_l2_malloc(sizeof(KWS_IMAGE_IN_T) * W * H); printf("=====>imageinchar %p\n",ImageInChar); if (ImageInChar == NULL) @@ -128,7 +125,6 @@ void test_kws(void) pmsis_exit(-1); } - #if !defined(NO_IMAGE) printf("Reading image\n"); //Reading Image from Bridge if ((ReadImageFromFile(ImageName, &Wi, &Hi, ImageInChar, W*H*sizeof(short int))==0) || (Wi!=W) || (Hi!=H)) @@ -137,25 +133,21 @@ void test_kws(void) pmsis_exit(-2); } printf("Finished reading image\n"); - #endif /* NO_IMAGE */ - //#if defined(PRINT_IMAGE) + #if defined(PRINT_IMAGE) for (int i=0; i=0; i--) @@ -182,7 +173,6 @@ void test_kws(void) #error No bit size selected #endif /* KWS_16BIT */ #endif /* KWS_8BIT */ -#endif #if !defined(__EMUL__) /* Configure And open cluster. 
*/ @@ -220,7 +210,7 @@ void test_kws(void) kwsCNN_Destruct(); pi_l2_free(ResOut,12*sizeof(short int)); - } + #if defined(PERF) { unsigned int TotalCycles = 0, TotalOper = 0; @@ -238,12 +228,11 @@ void test_kws(void) #ifndef __EMUL__ // Close the cluster - struct pi_device cluster_dev; pi_cluster_close(&cluster_dev); #endif #if defined(__EMUL__) - dt_close_dump_file(); + #else #if !defined(NO_IMAGE) && !defined(LINK_IMAGE_HEADER) BRIDGE_Disconnect(NULL); @@ -265,11 +254,6 @@ int main(int argc, char *argv[]) exit(-1); } ImageName = argv[1]; - if (dt_open_dump_file(TENSOR_DUMP_FILE)) - { - printf("Failed to open tensor dump file %s.\n", TENSOR_DUMP_FILE); - exit(-2); - } printf("\n\n\t *** NNTOOL KWS Example ***\n\n"); test_kws(); } @@ -281,7 +265,7 @@ int main() // #define __STRING(__s) __STRING1(__s) ImageName = __STRING(LINK_IMAGE_NAME); #else - ImageName = "../../../images/feature_0_1.pgm"; + ImageName = "./images/feature_0_1.pgm"; #endif /* LINK_IMAGE_NAME */ printf("\n\n\t *** NNTOOL KWS Example ***\n\n"); return pmsis_kickoff((void *) test_kws); diff --git a/examples/nntool/kws/max.log b/examples/nntool/kws/max.log new file mode 100644 index 000000000..3c62f360b --- /dev/null +++ b/examples/nntool/kws/max.log @@ -0,0 +1,8 @@ +8.5547465e-05 +0.020743534 +0.00016057519 +0.020964758 +0.00013261518 +0.02027892 +60.138706 +35.190804 diff --git a/examples/nntool/kws/model/nntool_script16 b/examples/nntool/kws/model/nntool_script16 index 242b7f485..248d0cb89 100644 --- a/examples/nntool/kws/model/nntool_script16 +++ b/examples/nntool/kws/model/nntool_script16 @@ -1,4 +1,5 @@ adjust -fusions -aquant -f 16 images/* -D 1 -O 0 -F int16 +fusions --pow2 +aquant -s POW2 -f 16 images/* -F int16 +set graph_dump_tensor 7 save_state diff --git a/examples/nntool/kws/model/nntool_script8 b/examples/nntool/kws/model/nntool_script8 index 11f720155..f701652da 100644 --- a/examples/nntool/kws/model/nntool_script8 +++ b/examples/nntool/kws/model/nntool_script8 @@ -1,5 +1,4 @@ adjust -fusions -aquant -f 16 images/* -D 128 -O0 -qtune 2 dp +fusions --scale +aquant -s SQ8 images/* -D 128 -O0 save_state diff --git a/examples/nntool/kws/model/nntool_script_emul16 b/examples/nntool/kws/model/nntool_script_emul16 index acdd3ea47..248d0cb89 100644 --- a/examples/nntool/kws/model/nntool_script_emul16 +++ b/examples/nntool/kws/model/nntool_script_emul16 @@ -1,4 +1,5 @@ adjust -aquant -f 16 images/* -D 255 -O 0 -set dump_tensors 1 +fusions --pow2 +aquant -s POW2 -f 16 images/* -F int16 +set graph_dump_tensor 7 save_state diff --git a/examples/nntool/kws/model_decl.mk b/examples/nntool/kws/model_decl.mk index 5f0a99fd4..7dddc6b94 100644 --- a/examples/nntool/kws/model_decl.mk +++ b/examples/nntool/kws/model_decl.mk @@ -4,15 +4,9 @@ # This software may be modified and distributed under the terms # of the BSD license. See the LICENSE file for details. -GEN_PATH = $(TILER_GENERATOR_PATH)/CNN +MODEL_SUFFIX?= -ifndef MODEL_SUFFIX - MODEL_SUFFIX= -endif - -ifndef MODEL_PREFIX - MODEL_PREFIX=GapFlow -endif +MODEL_PREFIX?= # The training of the model is slightly different depending on # the quantization. 
This is because in 8 bit mode we used signed @@ -32,10 +26,9 @@ MODEL_PYTHON=python # Increase this to improve accuracy TRAINING_EPOCHS=10 MODEL_TRAIN = model/train.py -#MODEL_TRAIN = /home/joel/ARCHIVE_SDK/gap_sdk_old/tf2gap8/examples/kws/train.py MODEL_FREEZE = model/freeze.py MODEL_BUILD = BUILD_MODEL$(MODEL_SUFFIX) -MODEL_TRAIN_BUILD = BUILD_TRAIN$(MODEL_SUFFIX) +MODEL_TRAIN_BUILD = BUILD_TRAIN$(TRAIN_SUFFIX) MODEL_TF = $(MODEL_TRAIN_BUILD)/conv.pbtxt MODEL_TFLITE = $(MODEL_BUILD)/$(MODEL_PREFIX).tflite @@ -44,13 +37,20 @@ MODEL_TENSORS = $(MODEL_BUILD)/$(MODEL_PREFIX)_L3_Flash_Const.dat MODEL_STATE = $(MODEL_BUILD)/$(MODEL_PREFIX).json MODEL_SRC = $(MODEL_PREFIX)Model.c +MODEL_HEADER = $(MODEL_PREFIX)Info.h MODEL_GEN = $(MODEL_BUILD)/$(MODEL_PREFIX)Kernels MODEL_GEN_C = $(addsuffix .c, $(MODEL_GEN)) MODEL_GEN_CLEAN = $(MODEL_GEN_C) $(addsuffix .h, $(MODEL_GEN)) MODEL_GEN_EXE = $(MODEL_BUILD)/GenTile +ifdef MODEL_QUANTIZED + NNTOOL_EXTRA_FLAGS = -q +endif + MODEL_GENFLAGS_EXTRA = +EXTRA_GENERATOR_SRC = + $(info script $(NNTOOL_SCRIPT)) ifndef NNTOOL_SCRIPT NNTOOL_SCRIPT=model/nntool_script @@ -60,6 +60,9 @@ RM=rm -f NNTOOL=nntool +NNTOOL_PATH = $(GAP_SDK_HOME)/tools/nntool +NNTOOL_KERNEL_PATH = $(NNTOOL_PATH)/autotiler/kernels +NNTOOL_GENERATOR_PATH = $(NNTOOL_PATH)/autotiler/generators # Here we set the memory allocation for the generated kernels # REMEMBER THAT THE L1 MEMORY ALLOCATION MUST INCLUDE SPACE # FOR ALLOCATED STACKS! @@ -73,10 +76,48 @@ MODEL_L3_EXEC=hram # qpsiflash - Quad SPI Flash MODEL_L3_CONST=hflash -MODEL_SRCS += $(MODEL_GEN_C) -MODEL_SRCS += $(GEN_PATH)/CNN_BiasReLULinear_BasicKernels.c -MODEL_SRCS += $(GEN_PATH)/CNN_Conv_BasicKernels.c -MODEL_SRCS += $(GEN_PATH)/CNN_Conv_DP_BasicKernels.c -MODEL_SRCS += $(GEN_PATH)/CNN_Pooling_BasicKernels.c -MODEL_SRCS += $(GEN_PATH)/CNN_MatAlgebra.c -MODEL_SRCS += $(GEN_PATH)/CNN_SoftMax.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_BiasReLULinear_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Conv_DW_DP_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_Pooling_BasicKernels.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_MatAlgebra.c +MODEL_LIB_POW2 += $(TILER_CNN_KERNEL_PATH)/CNN_SoftMax.c +MODEL_LIB_POW2 += $(NNTOOL_KERNEL_PATH)/norm_transpose.c +MODEL_LIB_INCLUDE_POW2 = -I$(TILER_CNN_KERNEL_PATH) -I$(NNTOOL_KERNEL_PATH) +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generator_Util.c +MODEL_GEN_POW2 += $(TILER_CNN_GENERATOR_PATH)/CNN_Generators.c +MODEL_GEN_POW2 += $(NNTOOL_GENERATOR_PATH)/nntool_extra_generators.c +MODEL_GEN_INCLUDE_POW2 = -I$(TILER_CNN_GENERATOR_PATH) -I$(NNTOOL_GENERATOR_PATH) + +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Activation_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Bias_Linear_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Conv_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Pooling_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_Conv_DW_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_MatAlgebra_SQ8.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_SoftMax.c +MODEL_LIB_SQ8 += $(TILER_CNN_KERNEL_PATH_SQ8)/CNN_AT_Misc.c +MODEL_LIB_SQ8 += $(NNTOOL_KERNEL_PATH)/norm_transpose.c +MODEL_LIB_INCLUDE_SQ8 = -I$(TILER_CNN_KERNEL_PATH) -I$(TILER_CNN_KERNEL_PATH_SQ8) -I$(NNTOOL_KERNEL_PATH) +MODEL_GEN_SQ8 
+= $(TILER_CNN_GENERATOR_PATH)/CNN_Generator_Util.c +MODEL_GEN_SQ8 += $(TILER_CNN_GENERATOR_PATH_SQ8)/CNN_Generators_SQ8.c +MODEL_GEN_SQ8 += $(NNTOOL_GENERATOR_PATH)/nntool_extra_generators.c +MODEL_GEN_INCLUDE_SQ8 = -I$(TILER_CNN_GENERATOR_PATH) -I$(TILER_CNN_GENERATOR_PATH_SQ8) -I$(NNTOOL_GENERATOR_PATH) + + +MODEL_SIZE_CFLAGS = -DAT_INPUT_HEIGHT=$(AT_INPUT_HEIGHT) -DAT_INPUT_WIDTH=$(AT_INPUT_WIDTH) -DAT_INPUT_COLORS=$(AT_INPUT_COLORS) + +ifdef MODEL_SQ8 + CNN_GEN = $(MODEL_GEN_SQ8) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_SQ8) + CNN_LIB = $(MODEL_LIB_SQ8) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_SQ8) +else + CNN_GEN = $(MODEL_GEN_POW2) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_POW2) + CNN_LIB = $(MODEL_LIB_POW2) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_POW2) +endif +$(info GEN ... $(CNN_GEN)) diff --git a/examples/nntool/kws/model_rules.mk b/examples/nntool/kws/model_rules.mk index fde767090..e8346980f 100644 --- a/examples/nntool/kws/model_rules.mk +++ b/examples/nntool/kws/model_rules.mk @@ -13,6 +13,20 @@ else MODEL_TRAIN_FLAGS = endif +ifdef MODEL_SQ8 + CNN_GEN = $(MODEL_GEN_SQ8) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_SQ8) + CNN_LIB = $(MODEL_LIB_SQ8) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_SQ8) +else + CNN_GEN = $(MODEL_GEN_POW2) + CNN_GEN_INCLUDE = $(MODEL_GEN_INCLUDE_POW2) + CNN_LIB = $(MODEL_LIB_POW2) + CNN_LIB_INCLUDE = $(MODEL_LIB_INCLUDE_POW2) +endif + +USE_DISP=1 + ifdef USE_DISP SDL_FLAGS= -lSDL2 -lSDL2_ttf else @@ -38,8 +52,8 @@ train: $(MODEL_TF) # Converts the TF file to TFLITE format, generate feature files and corresponding CNN outputs $(MODEL_TFLITE): $(MODEL_TF) | $(MODEL_BUILD) echo "CONVERTING TENSORFLOW TO TENSORFLOW LITE FLATBUFFER" - $(MODEL_PYTHON) $(MODEL_FREEZE) --start_checkpoint=$(MODEL_TRAIN_BUILD)/conv.ckpt-18000 --output_file=$(MODEL_TRAIN_BUILD)/kws_frozen.pb -# $(MODEL_PYTHON) $(MODEL_FREEZE) --start_checkpoint=$(MODEL_TRAIN_BUILD)/conv.ckpt-10 --output_file=$(MODEL_TRAIN_BUILD)/kws_frozen.pb +# $(MODEL_PYTHON) $(MODEL_FREEZE) --start_checkpoint=$(MODEL_TRAIN_BUILD)/conv.ckpt-18000 --output_file=$(MODEL_TRAIN_BUILD)/kws_frozen.pb + $(MODEL_PYTHON) $(MODEL_FREEZE) --start_checkpoint=$(MODEL_TRAIN_BUILD)/conv.ckpt-10 --output_file=$(MODEL_TRAIN_BUILD)/kws_frozen.pb tflite_convert --graph_def_file=$(MODEL_TRAIN_BUILD)/kws_frozen.pb --output_file=$(MODEL_BUILD)/kws.tflite --input_format=TENSORFLOW_GRAPHDEF   --output_format=TFLITE --input_arrays=Reshape --output_arrays=add_2 tflite: $(MODEL_TFLITE) @@ -54,7 +68,7 @@ tflite: $(MODEL_TFLITE) $(IMAGES): echo "GENERATING INPUT IMAGES" - (mkdir -p $(IMAGES); $(MODEL_PYTHON) ./model/save_samples.py --batch_size 5 --start_checkpoint $(MODEL_TRAIN_BUILD)/conv.ckpt-18000) + (mkdir -p $(IMAGES); $(MODEL_PYTHON) ./model/save_samples.py --batch_size 5 --start_checkpoint $(MODEL_TRAIN_BUILD)/conv.ckpt-10) $(MODEL_STATE): $(MODEL_TFLITE) $(IMAGES) echo "GENERATING NNTOOL STATE FILE" @@ -63,16 +77,16 @@ $(MODEL_STATE): $(MODEL_TFLITE) $(IMAGES) nntool_state: $(MODEL_STATE) # Runs NNTOOL with its state file to generate the autotiler model code -$(MODEL_BUILD)/$(MODEL_SRC): $(MODEL_STATE) $(MODEL_TFLITE) +$(MODEL_BUILD)/$(MODEL_SRC): $(MODEL_STATE) $(MODEL_TFLITE) | $(MODEL_BUILD) echo "GENERATING AUTOTILER MODEL" $(NNTOOL) -g -M $(MODEL_BUILD) -m $(MODEL_SRC) -T $(TENSORS_DIR) $(MODEL_GENFLAGS_EXTRA) $< nntool_gen: $(MODEL_BUILD)/$(MODEL_SRC) # Build the code generator from the model code -$(MODEL_GEN_EXE): $(MODEL_BUILD)/$(MODEL_SRC) +$(MODEL_GEN_EXE): $(CNN_GEN) $(MODEL_BUILD)/$(MODEL_SRC) $(EXTRA_GENERATOR_SRC) | 
$(MODEL_BUILD) echo "COMPILING AUTOTILER MODEL" - gcc -g -o $(MODEL_GEN_EXE) -I$(TILER_INC) -I$(GEN_PATH) $(MODEL_BUILD)/$(MODEL_SRC) $(GEN_PATH)/CNN_Generators.c $(TILER_LIB) $(SDL_FLAGS) + gcc -g -o $(MODEL_GEN_EXE) -I. -I$(TILER_INC) -I$(TILER_EMU_INC) $(CNN_GEN_INCLUDE) $(CNN_LIB_INCLUDE) $? $(TILER_LIB) compile_model: $(MODEL_GEN_EXE) @@ -87,7 +101,7 @@ model: $(MODEL_GEN_C) clean_model: $(RM) $(MODEL_GEN_EXE) $(RM) -rf $(MODEL_BUILD) - $(RM) *.dat + $(RM) $(MODEL_BUILD)/*.dat clean_train: $(RM) -rf $(MODEL_TRAIN_BUILD) diff --git a/examples/nntool/mnist/Makefile b/examples/nntool/mnist/Makefile index f969041a0..820c507d9 100644 --- a/examples/nntool/mnist/Makefile +++ b/examples/nntool/mnist/Makefile @@ -8,30 +8,21 @@ ifndef GAP_SDK_HOME $(error Source sourceme in gap_sdk first) endif -io=host - include common.mk -QUANT_BITS ?= 8 +LOAD_QUANTIZATION= #-q #to load a tflite quantized model +IMAGE=$(CURDIR)/samples/5223_5.pgm -$(info Building GAP8 mode with $(QUANT_BITS) bit quantization) +io=host -# For debugging don't load an image -# Run the network with zeros -# NO_IMAGE=1 - -# The training of the model is slightly different depending on -# the quantization. This is because in 8 bit mode we used signed -# 8 bit so the input to the model needs to be shifted 1 bit -ifeq ($(QUANT_BITS),8) - $(info Configure 8 bit model) - APP_CFLAGS += -DQUANT_8BIT - NNTOOL_SCRIPT=model/nntool_script8 - MODEL_SUFFIX = _8BIT +QUANT_BITS?=8 +BUILD_DIR=BUILD +ifeq ($(QUANT_BITS), 8) + MODEL_SQ8=1 + NNTOOL_SCRIPT=model/nntool_script + MODEL_SUFFIX = _SQ8BIT else ifeq ($(QUANT_BITS),16) - $(info Configure 16 bit model) - APP_CFLAGS += -DQUANT_16BIT NNTOOL_SCRIPT=model/nntool_script16 MODEL_SUFFIX = _16BIT else @@ -39,19 +30,21 @@ else endif endif +$(info Building GAP8 mode with $(QUANT_BITS) bit quantization) + +TRAINED_TFLITE_MODEL=model/$(MODEL_PREFIX).tflite + include ../common/model_decl.mk -include common_rules.mk # Here we set the memory allocation for the generated kernels # REMEMBER THAT THE L1 MEMORY ALLOCATION MUST INCLUDE SPACE # FOR ALLOCATED STACKS! -CLUSTER_STACK_SIZE=2048 +CLUSTER_STACK_SIZE=4028 CLUSTER_SLAVE_STACK_SIZE=1024 TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* 7) MODEL_L1_MEMORY=$(shell expr 60000 \- $(TOTAL_STACK_SIZE)) -MODEL_L2_MEMORY=370000 +MODEL_L2_MEMORY=250000 MODEL_L3_MEMORY=8388608 - # hram - HyperBus RAM # qspiram - Quad SPI RAM MODEL_L3_EXEC=hram @@ -59,37 +52,30 @@ MODEL_L3_EXEC=hram # qpsiflash - Quad SPI Flash MODEL_L3_CONST=hflash -# use a custom template to switch on the performance checking -MODEL_GENFLAGS_EXTRA= - pulpChip = GAP -APP = mnist2 +PULP_APP = mnist USE_PMSIS_BSP=1 -APP_SRCS += $(MODEL_PREFIX).c $(MODEL_COMMON_SRCS) $(MODEL_SRCS) +APP = mnist +APP_SRCS += $(MODEL_PREFIX).c $(MODEL_GEN_C) $(MODEL_COMMON_SRCS) $(CNN_LIB) -APP_CFLAGS += -g -O1 -mno-memcpy -fno-tree-loop-distribute-patterns -DDONT_DUMP -APP_CFLAGS += -I. -I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) -I$(TILER_CNN_KERNEL_PATH) -I$(MODEL_BUILD) -APP_CFLAGS += -DPERF $(MODEL_SIZE_CFLAGS) -DSLAVE_STACK_SIZE=$(CLUSTER_SLAVE_STACK_SIZE) -DSTACK_SIZE=$(CLUSTER_STACK_SIZE) +APP_CFLAGS += -g -O1 -mno-memcpy -fno-tree-loop-distribute-patterns +APP_CFLAGS += -I. 
-I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD) +APP_CFLAGS += -DPERF -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS) +APP_CFLAGS += -DSTACK_SIZE=$(CLUSTER_STACK_SIZE) -DSLAVE_STACK_SIZE=$(CLUSTER_SLAVE_STACK_SIZE) +APP_CFLAGS += -DAT_IMAGE=$(IMAGE) READFS_FILES=$(abspath $(MODEL_TENSORS)) PLPBRIDGE_FLAGS += -f -ifdef NO_IMAGE - APP_CFLAGS += -DNO_IMAGE -endif - # all depends on the model all:: model clean:: clean_model -clean_all: clean clean_train - rm -rf BUILD* - rm mnist_emul - -.PHONY: clean_all - +include train_model.mk include ../common/model_rules.mk +$(info APP_SRCS... $(APP_SRCS)) +$(info APP_CFLAGS... $(APP_CFLAGS)) include $(RULES_DIR)/pmsis_rules.mk diff --git a/examples/nntool/mnist/README.md b/examples/nntool/mnist/README.md index 63074dd08..4f42a7374 100644 --- a/examples/nntool/mnist/README.md +++ b/examples/nntool/mnist/README.md @@ -6,7 +6,7 @@ It goes from training right through to working code on GAP8 or the same code run debugging purposes. * It first trains the network using keras -* It the then exports the network to H5 format +* It then exports the network to H5 format * It then converts the H5 file to a TFLITE file using TensorFlow's TFLITE converter * It then generates an nntool state file by running an nntool script with commands to adjust the tensor and activation order, fuse certain operations together and automatically quantify the graph * It then uses this state file to generate an AutoTiler graph model @@ -19,34 +19,33 @@ The process can be run to quantize the model in 16 or 8 bits weights and activat To build and run on GAP8: -make all run + make all run To build and run on GVSOC -make all run platform=gvsoc + make all run platform=gvsoc -The image loaded is included in a header file. THis can be modified in the Makefile. There are also make options -to load the file via the bridge. This mode is not supported for GVSOC. +The input image is specified in the Makefile and loaded with the functions defined in ${GAP_SDK_HOME}/libs/gap_lib/img_io/ImgIO.c To clean the generated model and code but not the trained network type -make clean + make clean To clean the trained keras save file type -make clean_train + make clean_train To build and run the network compiled on the pc -make -f emul.mk all + make -f emul.mk all This will produce an executable, mnist_emul, that can be used to evaluate files -e.g. ./mnist_emul images/5558_6.pgm + e.g. ./mnist_emul images/5558_6.pgm This mode allows the application to be run with PC tools like valgrind which is very interesting for debugging. The cluster only has one core in this mode. -The build defaults to 8 bit quantization. 16 bit quantization can be selected by preceeding the build lines above with MODEL_BITS=16. +The build defaults to 8 bit quantization. 16 bit quantization can be selected by preceeding the build lines above with QUANT_BITS=16. -e.g. MODEL_BITS=16 make -f emul.mk all + e.g. QUANT_BITS=16 make -f emul.mk all diff --git a/examples/nntool/mnist/emul.mk b/examples/nntool/mnist/emul.mk index 4c3ba4bc2..4320a6450 100644 --- a/examples/nntool/mnist/emul.mk +++ b/examples/nntool/mnist/emul.mk @@ -1,4 +1,4 @@ -# Copyright (C) 2017 GreenWaves Technologies +# Copyright (C) 2020 GreenWaves Technologies # All rights reserved. 
# This software may be modified and distributed under the terms @@ -7,50 +7,48 @@ include common.mk QUANT_BITS?=8 -MODEL_SUFFIX=_$(QUANT_BITS)BIT_EMUL - -# LINK_IMAGE=samples/5223_5.pgm - -$(info Building emulation mode with $(QUANT_BITS) bit quantization) - -# The training of the model is slightly different depending on -# the quantization. This is because in 8 bit mode we used signed -# 8 bit so the input to the model needs to be shifted 1 bit -ifeq ($(QUANT_BITS),8) - CFLAGS += -DQUANT_8BIT - NNTOOL_SCRIPT=model/nntool_script_emul8 +BUILD_DIR=BUILD +ifeq ($(QUANT_BITS), 8) + MODEL_SQ8=1 + NNTOOL_SCRIPT=model/nntool_script_emul + MODEL_SUFFIX = _SQ8BIT_EMUL else ifeq ($(QUANT_BITS),16) - CFLAGS += -DQUANT_16BIT NNTOOL_SCRIPT=model/nntool_script_emul16 + MODEL_SUFFIX = _16BIT_EMUL else - $(error Dont know how to build with this bit width) + $(error Don\'t know how to build with this bit width) endif endif +TRAINED_TFLITE_MODEL=model/$(MODEL_PREFIX).tflite include ../common/model_decl.mk -include common_rules.mk MODEL_GEN_EXTRA_FLAGS= -f $(MODEL_BUILD) +MODEL_GENFLAGS_EXTRA+= CC = gcc -CFLAGS += -g -O0 -D__EMUL__ $(MODEL_SIZE_CFLAGS) -DPERF -INCLUDES = -I. -I./helpers -I$(TILER_EMU_INC) -I$(TILER_INC) -I$(TILER_CNN_GENERATOR_PATH) -I$(TILER_CNN_KERNEL_PATH) -I$(MODEL_BUILD) -I$(MODEL_COMMON_INC) +CFLAGS += -g -m32 -O0 -D__EMUL__ -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS) -DPERF +INCLUDES = -I. -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD) -I$(MODEL_COMMON_INC) LFLAGS = LIBS = -SRCS = $(MODEL_PREFIX).c $(MODEL_COMMON_SRCS) $(MODEL_SRCS) - +SRCS = $(MODEL_PREFIX)_emul.c $(MODEL_GEN_C) $(MODEL_COMMON_SRCS) $(CNN_LIB) +$(info CNN_LIB++ $(CNN_LIB)) +$(info SRCS++ $(SRCS)) BUILD_DIR = BUILD_EMUL OBJS = $(patsubst %.c, $(BUILD_DIR)/%.o, $(SRCS)) MAIN = $(MODEL_PREFIX)_emul - # Here we set the memory allocation for the generated kernels # REMEMBER THAT THE L1 MEMORY ALLOCATION MUST INCLUDE SPACE # FOR ALLOCATED STACKS! -MODEL_L1_MEMORY=52000 -MODEL_L2_MEMORY=307200 +CLUSTER_STACK_SIZE=2048 +CLUSTER_SLAVE_STACK_SIZE=1024 +TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* 7) +MODEL_L1_MEMORY=$(shell expr 60000 \- $(TOTAL_STACK_SIZE)) +MODEL_L2_MEMORY=200000 MODEL_L3_MEMORY=8388608 + # hram - HyperBus RAM # qspiram - Quad SPI RAM MODEL_L3_EXEC=hram diff --git a/examples/nntool/mnist/mnist.c b/examples/nntool/mnist/mnist.c index e346954dd..24b0ff41c 100644 --- a/examples/nntool/mnist/mnist.c +++ b/examples/nntool/mnist/mnist.c @@ -7,15 +7,17 @@ * */ -#ifndef __EMUL__ + +#define __XSTR(__s) __STR(__s) +#define __STR(__s) #__s /* PMSIS includes. */ #include "pmsis.h" -#endif /* __EMUL__ */ /* Autotiler includes. 
*/ #include "mnist.h" #include "mnistKernels.h" -#include "ImgIO.h" +#include "gaplib/ImgIO.h" + #define pmsis_exit(n) exit(n) @@ -27,22 +29,13 @@ AT_HYPERFLASH_FS_EXT_ADDR_TYPE mnist_L3_Flash = 0; // Softmax always outputs Q15 short int even from 8 bit input L2_MEM short int *ResOut; -#ifdef QUANT_16BIT - typedef short int image_in_t; -#else - #ifdef QUANT_8BIT - typedef signed char image_in_t; - #endif -#endif - -#ifdef __EMUL__ -#undef PERF -#endif +//Image in is unsigned but the model is trained with -1:1 inputs +//The preprocessing to scale the image is done in the CNN AT graph +L2_MEM unsigned char *Img_In; #define AT_INPUT_SIZE (AT_INPUT_WIDTH*AT_INPUT_HEIGHT*AT_INPUT_COLORS) -#define AT_INPUT_SIZE_BYTES (AT_INPUT_SIZE*sizeof(image_in_t)) - -L2_MEM image_in_t *ImageIn; +#define AT_INPUT_SIZE_BYTES (AT_INPUT_SIZE*sizeof(char)) +//#define PRINT_IMAGE char *ImageName = NULL; @@ -54,14 +47,14 @@ static void cluster() gap_cl_starttimer(); gap_cl_resethwtimer(); #endif - mnistCNN(ImageIn, ResOut); + mnistCNN(Img_In, ResOut); printf("Runner completed\n"); -#ifndef NO_IMAGE //Checki Results int rec_digit = 0; short int highest = ResOut[0]; - for(int i = 1; i < 10; i++) { + for(int i = 0; i < 10; i++) { + printf("class %d: %d \n", i, ResOut[i]); if(ResOut[i] > highest) { highest = ResOut[i]; rec_digit = i; @@ -70,51 +63,35 @@ static void cluster() printf("\n"); printf("Recognized: %d\n", rec_digit); -#else - printf("image loading disabled so no sensible result\n"); -#endif } int test_mnist(void) { printf("Entering main controller\n"); -#ifndef DONT_DUMP - if (dt_open_dump_file(TENSOR_DUMP_FILE)) - { - printf("Failed to open tensor dump file %s.\n", TENSOR_DUMP_FILE); - exit(-2); - } -#endif - -#if !defined(NO_IMAGE) printf("Reading image\n"); //Reading Image from Bridge -#ifdef QUANT_8BIT - #define SHIFT 1 - #define SHORT 0 -#else - #define SHORT 1 - #define SHIFT 0 -#endif - if (!(ImageIn = (image_in_t *) AT_L2_ALLOC(0, AT_INPUT_SIZE_BYTES))) { - printf("Failed to allocate %ld bytes for %s\n", AT_INPUT_SIZE_BYTES, ImageName); - pmsis_exit(-1); - } + /*------------------- Allocate Image Buffer ------------------------*/ + printf("Going to alloc the image buffer!\n"); + Img_In = (unsigned char *) AT_L2_ALLOC(0, AT_INPUT_SIZE_BYTES); + if(Img_In==NULL) { + printf("Image buffer alloc Error!\n"); + pmsis_exit(-1); + } + + char *ImageName = __XSTR(AT_IMAGE); - if (ReadImageFromFile(ImageName, AT_INPUT_WIDTH, AT_INPUT_HEIGHT, AT_INPUT_COLORS, ImageIn, AT_INPUT_SIZE_BYTES, SHIFT, SHORT)) + if (ReadImageFromFile(ImageName, AT_INPUT_WIDTH, AT_INPUT_HEIGHT, AT_INPUT_COLORS, Img_In, AT_INPUT_SIZE_BYTES, IMGIO_OUTPUT_CHAR, 0)) { printf("Failed to load image %s\n", ImageName); pmsis_exit(-2); } - printf("Finished reading image\n"); -#endif /* NO_IMAGE */ #if defined(PRINT_IMAGE) - for (int i=0; i #endif -#ifndef DONT_DUMP -#ifndef TENSOR_DUMP_FILE - #define TENSOR_DUMP_FILE "tensor_dump_file.dat" -#endif -#include "helpers.h" -#endif - extern AT_HYPERFLASH_FS_EXT_ADDR_TYPE mnist_L3_Flash; #endif diff --git a/examples/nntool/mnist/mnist_emul.c b/examples/nntool/mnist/mnist_emul.c new file mode 100644 index 000000000..cf11289e5 --- /dev/null +++ b/examples/nntool/mnist/mnist_emul.c @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. + * + */ + +/* Autotiler includes. 
*/ +#include "mnist.h" +#include "mnistKernels.h" +#include "gaplib/ImgIO.h" + + +#define pmsis_exit(n) exit(n) + +#ifndef STACK_SIZE +#define STACK_SIZE 1024 +#endif + +AT_HYPERFLASH_FS_EXT_ADDR_TYPE mnist_L3_Flash = 0; + +// Softmax always outputs Q15 short int even from 8 bit input +L2_MEM short int *ResOut; +//Image in is unsigned but the model is trained with -1:1 inputs +L2_MEM unsigned char *Img_In; + +#define AT_INPUT_SIZE (AT_INPUT_WIDTH*AT_INPUT_HEIGHT*AT_INPUT_COLORS) +#define AT_INPUT_SIZE_BYTES (AT_INPUT_SIZE*sizeof(char)) +//#define PRINT_IMAGE + +char *ImageName = NULL; + + +static void cluster() +{ + mnistCNN(Img_In, ResOut); + printf("Runner completed\n"); + + //Checki Results + int rec_digit = 0; + short int highest = ResOut[0]; + for(int i = 0; i < 10; i++) { + printf("class %d: %d \n", i, ResOut[i]); + if(ResOut[i] > highest) { + highest = ResOut[i]; + rec_digit = i; + } + } + printf("\n"); + + printf("Recognized: %d\n", rec_digit); +} + +int test_mnist(void) +{ + printf("Entering main controller\n"); + printf("Reading image\n"); + //Reading Image from Bridge + /*------------------- Allocate Image Buffer ------------------------*/ + printf("Going to alloc the image buffer!\n"); + Img_In = (unsigned char *) AT_L2_ALLOC(0, AT_INPUT_SIZE_BYTES); + if(Img_In==NULL) { + printf("Image buffer alloc Error!\n"); + pmsis_exit(-1); + } + + if (ReadImageFromFile(ImageName, AT_INPUT_WIDTH, AT_INPUT_HEIGHT, AT_INPUT_COLORS, Img_In, AT_INPUT_SIZE_BYTES, IMGIO_OUTPUT_CHAR, 0)) + { + printf("Failed to load image %s\n", ImageName); + pmsis_exit(-2); + } + + #if defined(PRINT_IMAGE) + for (int i=0; iV_nHvOa3=r7Oo>sf!5XRWu3 ztEc07SC3Wh{(fAbw~xQaT91vczJCSl#U0RB)qxlPpp@v>*I)H|2#*=pR(k0Sr0sXH1N#BGpM)H2m0AQglBp`J>j49RysMa zarJWfhje}#Zau4{*etNwoI$*E0xqYK>?{rG=`r{UR$H$b-^ooB$VSM)7wJi+~XJA|jc-wuax z%|M37( zC;SAhb=u(Q|`zbjkLj|RUuImC^z5G3#Je4LZDE0XxE_{rckHSX#XxA14J>lmeYRJzRQ;xj6Yd zIl6iKu66S7g*&@Dd2y-vIdX;O|FhkLJKeo%Lugn>gvJ;^$K^(7H++lIj-dY zBWCPeeZBpD&%)1F`$yvIx+=hv%WvOE`ev<!`Y&4j+idk|_1{s+zo%-yz1AyKUsulnrN7s`h0_MBesA(e z_V9Zr^3$^ZNgjIr)L*pwKT_qtqw)WtcK^MR%={;8`lk{8lg$5StGe1;-TTM3I$&OZ z+UlRS>gD*asM>#*t^O-K{Eyn|tiC|x4?Nyq!j#_jf6`k2Wp`oH8~pqM1O0)~ztBB* z(em$IH#K_~x!+IcpX=+-W&7{0pg$$`f6w|l?O&Mj-%qlix0^qAo#ZaD4qLYLg|k2R z0KJcY6z~4rzjNCM%7Hk!e;dIg+#g79Qu-|e@mGP>?}OtRE>7I6g}>7;8tusq?Yy`_ z*+2}~7lrw{dvkSFV~T+eS6f|Not=VyR(o~V)yL1n^JgHY>8h@w_cM0#^!Dr3Z6#-J z9J$Kd*Ml3;IXZL0EI)2E?(6Ln)Z6D;S8fpJ{4-|d>c4YKgyT!jOjy6>&O2`=Q8d}eTW14^Gi2>R{tNDz(D?XAYVV=A6E3|^#_K} zjeY()FZwg5+uJsM0O4OO`~23RzvP95r}PmR@ID&-bt{cNxLH4aE63matquM4G^ajz za6g_#=I`-r`s*9|&nE`-y}LiZI?%6zTpxkyznp7CYxlti@DBFZFY5Mp_`g&PqXXLj2 z_XlxD-TMw7AK%Ycd(Q%TKm5Gi<9|Nz1oy9$eYN5JD`+pWMFnR$Pd~qXt^fINUM@Gi zvOVAKrle_SOFU{&$hUul`!0f8gT%>-c~i2F!M#K47&! 
zje!f)A87a|%RYbIdmE(hQ_`=u1N45?VVsn!vYabPnYd?+AZLrd z!jNNbr0wn_yti^S^YpDHXZh(0CT8(0#!RLLH&1H8ToZLBKi-|Wn$^tt9uo%2_d9`K zF`X%%wgg|Uh~$J9?Pmgxr!dMwDV*(2(#)vK5}Z?y?&DWFk(r@%79Dr68T**UjF9a< z4x1H6Br~Kr_LmizxnRL`;CnprS`AYU@FHF*;OyAE59BjHu{dr#hI6+QOH)#D!)*t2erLdhm}fJZbz$(KWj7qh!HhW*%NcX8nAz_!j&rQO z0O4{L)^48!vFByZ|_JxwMb55hA@JTq>w2X7`$_qN{!UEOl-g*Cp3oX*p`>A8>E9Hk$zoY>->oFk)lbB;C7V?H_uF-;z9jJ(T1 zs~0t#XSTA;nwS(Cr%0HUN3(F{K3m4P@DMZGeKRNJas-pl7G#PfhjNnGIgl&TPTd!} zGD250ITmg~9LMJO5PnsU^Y)S=qbgKI%FTu0`*GluoS4ROoRfyR*G$P5&qu`a&_&!W z?#vmZsm$#8xRa>}Ks61=E7X9A;lycEYM+cIN9Pjlq8 zv{1R)mfn7=&nQ}FFpFj$W8#OYbKDmqxSH+-X^{!cg#|r0cUb{iu6@I-YoeHCXU@dS zuVLQHEdntuUdDXO8curGZjQ%;Kqmfq1he^?HHXL14Kp(I@nqu!PF|S@$HPaBV_vd= z8M51t*)>4`Zw(&L?2&Y3)*Ht|{kRNzIZuNff`_2SKOH(Nt`gJrm*MfU4`3cJj&5C; zP8%jnU>?S3FomxUv1(d_VEmG3X#Fh8=$;$H&?s*x3)v2Z6358yU|uG_U50I6WDHY|#`!V;F)$zJUGku7ApjC`3{ z%6f7xiPb%P0h?nyi7N65(!;6@tLX|4^*(*1Ze%8*aviR$bTfXsX{iytW1wsh(DH

Kl7Z-m>o+S zjdh7^oDIGHVlCx8nNPon7Ls`%B8XmY0PXD1X6f7xr7Kwz$-Sr{WbGIW)^i6Q_C(DW z2G1K)h{Y%+n(iu04s^tm6+854Mu`HQ5Id56PjV&uwN9Mjs=&46{>dJ;u*g`$*8TwH zap^Vvc%KpalFp#N`T9bGT?^M6Slw|lIOc9-a4Sc_!03jC;eyx=`eGK%`W>o;2A^XF z8QcxvH!N&aHTYO?!(ic#Df(-})C@K}c%Z*hdy;{yRGWckx~aj$PI<%R5po8T>%|Rv z?tRezuC>x2`*OK~@O@pw2lc9k@j53AW@VS_vv(fQV}_{ckDD`0U&xnNKj9fNZV)tx-mRPUa?h+b=Dh92kg zb3G}(X1$9UB6^34#_G#CN9f;bFVQR1ZP)7>Ia2?EH>3ZdVXa=`As&6cRUZ026Q1jF zq_XthcNgeA%5l+G{B&H;&10tChp~6{(pIJGefE2$x0MR(o0>h*laczYKX2kz{o;rj z`q{QV`i0v>^k>)Q>L-Zi>qlF*=`YO_)cKqzq$9kkN{27zJ^J^YULcyqiPdc4PTa-upM}NX-zz_T_dw-O#`uq1(h5Pe-!Tp5( zw4c9~=%0|^B=AS}bMWV_-rw;4yX?oO|K@T+e}Dhm_G6~kH|Dhdq5s4DCaVu|!0-Gf zu|J>w9VGCp{S4&Ex_&ul=|7GbxO)GW^=SJ4p^ygFpI^^){)^60w0=ED(V9AtksV<8 zkJhDM<6N_!FZ|Itib6lR>HX!v`N=@}x9$I@aq)DfzYB!{U)Edy+qk%o-+tWxbn!>$ zC^80pr1ks#$3OBj-J}oyPe1dYo|6Ax68P26DD;oQreA*s2KC>4j?$&y*VBLv{i??U zeEeDdUIM?KqfB2uAi>|`4b%qgysbYT8G!%0m*bR(P-j9@g&4hoE$GT&gAn6)n0j)O9|w(Af!-n(a}~-j-cBuZcY>+6+&AtEQW0 zbrAQR&xp8nAZF_BB3kEDXxkVTy}&~-J1PbuL`Gr$s~jrWxCHqs>_d;;Fo(qzr zGN4*!28i8Pg}GnKQFT=`gk_tMI`zZ&!dn8)wLCFgn%@rPU!ve_K`}l(b^|v*j{}|3 zLWnqXntaVP!RJpm5V`f|*y`Uz8S|$~xZvU@+SKU^F}LM$@aqUTvR)Eot}Q_xI|WSg zEGAcrQ}GMz0-X~aa@jSKUcEL7zf7n_soO>v+GEFhqf&_tO0!@o^BVPje87@0ebyp5 z2Vy#^gH>&k4@sOD(lBlkJKVXI1lL$%^R5t2rc&#L! zt+>$;lBG^yPrDE^cJ?kJx#=caU-UxzA4_0oBro~u$cKvBu5?GUCnT;D0@1g6V9k?4 zJ%rP+!bpc)<>I~TW|Jtt8{~u3Mw}!Tf;VrSf|j-6cxQqEE;NY7mI)WBh?fDNOauJb zU;wsH+|fY&6&;Cy+g%6}De3gE4;N z;p*G_^pUs~Ub65(pQ<3zy1bF(vJG+QR}0)|eS~Bmi-dr?lkn#I(|BK&kM7GG2RVzX zV6=xWx^It09-Vw#@&o9{219)O#T`aozfPoP-KAjCg|^Qv;EHrHJcSzib$Jd>&6lDE zqD8>>MU?Wo-yrExg?Q~P1M6RelCo)PDEQ_Uy|YV>CcZjNwxY-pgbqEArHK5fqR*|F)NpRLO8>g+PgX1a$U!9vz zx1C*zI_Hv+nO{isEb1V+ogZb#kA==5r_e{Ko@IY%ADLU^1@BrG!KExY7}ZdWpSL(b z_?3xtm30V-tr?5F>bl5zG6inbTEn{8YD6e$CeFy;ho;ftP_ShgZdM#Zc|yvdT$&AL z+Dq`&^M};%M;`VJnud$@MnYBYG>lY!#*(;}kH-X4amm6UT;D2#%6B=WYjrq`nJ@`% zPfUXB*;O>k=>+>h^$1Mp4yNwX$ety2fm%yu(2@H9gGbrn`lU0#c1IgIxi}b1+^53! zjw|@)Mi$tFT0#5bO48=K0r#`jq5g#gHC<3f+Rf{!h$^KK@dog0JTF!x1G#ch2By@7 z;Ny|@c+f{0!^=w0!(%b#UjD%)vq*%xk^QhZSXL4M;YH{wnNW){4T;^W}z|gXqtol zwhMH0x(_DZNTPmog8&w%qfB*MU4fwsif%8UJFQQU!&z%!=(rDX;maXXG(#JPopmC2 z>O%|#l=g$2R1DFreN6K0r^0yF7Q@N2^XSZSW7?d+gJp(kD0J~GT+JMgyFAoE{q#}T zI;xgFn<9(yac#IPX9P2Qtq!)l2}7Uu3fxjJ0G_w=(Ja9l8~M{vF7^d%yQBn9zQ*CZ zMWbMw{bY`Hm_N=OIu0W{$8f5%|RO9GQ`sAuUw7TUJb6FlJ3O;~31^T$(DV_D&emewp+o9V1DP-{)PgLoN!}f{8 z@$mu|P_D5-`A)nnx{>&3{!+ZqbPj6BL_o#YR6<#X zIm?^^Yk_W3{&gX|5>mo~uxBjEO?>Fotb<``YqU>6!(#&=0&k#*Cm&U(=%N0y+eGzFHVM7R0!(Z}*NzJ) zV)K@mmRLdXnS&_DP6e~pFJ#m=9_CeqVR|IZ3>Aeu<7G&(3P8ncbj%%?X+sJG~(sx3eSW=S56SE9#M9SbRyk%nPd1t zKY`Xn$zi)^G|j%Ylyr;;XI0ya!3)_LpjM~^ts>ndH^>m$r)=7e{XUj2#J%b!FQzaVN4cHN~ zk!_unMEv~fSq~0`!EkvavM25#4A#qq>(ycq;yN1~@=w!>`<5)<_9x)gr49S0wa}{; zL5BJ(4e6qbF=S+|5WZQHgVd&hlD)3}DcD0BH5?oeoy$USzT$?^`I#70xU2`WHDa2_7Kj@kAnQY9qi9qtI_|&0uUd5 z2m{CQ!iG_A9O*g~rM}eri?l8@2=z_!~ zSd}gG19cb8ig{#h{YLE2r}Ho7JhXJogdV*o+WLC$@X!iF)J8u zm?61*SOJ=Tq{GdYoiOr6GRk{y#nl4mi7Rg=)-Tu%hnrRs>(|MUxZVi2otcfss0SS@ zlJUaTi>ymCD#1n02}cZ9fZ>8d^i!uRZvC+VJhq0A6C|5#bo)ejSIoo^Zrohno=F8G z(=h9t5@_*zfwq%4E-RM7``dcRgunnuoWKhn0x76A$QPG4xuG9sftgzod+n@!?Bon1 zl(%<4^~@xE>Ba}|C)Kif5BXu9R|c>)7T`Y5MeM;Rju80|+hE~BZ5Z$2K&H=K#OjvY zfiG?jB`|dlZW0&70~%}?CFp`h&h|joyBYZ18biESy|2?&_QR`z6F^6+6wjAm0H=;s zbun|d!@00@c&CwxT7%rkB+nF_QFER0DG^#YcNa`L=L(nBN@6R|ExLR}200NI2<1|d zIIj62sT-e$2Jt1hq2vlY5*iOC;x0sggC8uqRZXfM?5E>SpT?T7?O^Tpi501~1i$t? zB*L4%QK?b$VPZxQ8UJlIu9-L&^)1%nXwz~!c~CRifAlF{R#}8T`U!;W*h+*S4dpCR zO$O)NWoSBC82lf0Q(^n_aO)u-r(oAOdI;wNdu=)$Y&i(MmEE|p2_MHs-3RBCSyRQ! 
z33Pv%5~^Oj&dL%z1;fURW2xs=IJ*)Y9;rSpAEomrf2oq$XO?U`YXBA*0MJusH0 z7v3PZ< z!@;U#u#Xs7Jj{o^`=dK{mxRLx{WxeX{6HR`Ux1d4KZu#fY;Z63$6GC#AT%bPK;?Oy zH*`EOc#i#i>T@!FtvGV7egMzhQiCbuBVntzB*mZjU`OXFtBD?Wyq z#v1}R?;68xrPB~Lej<_rBg*glhRp9)gG-STOul&xw5lD29aCgs(oz|?P^1pB3-{uN zr;G7qc>z2fI~COU1(*psd+2h$-FQ)ufgv|u!R2eW(Zp*d+SctPRvUG3-tcvJb6P1r zR=1<84U=G)Q6yfOr;ir*Z<19D{(fzu zxj>Q__pBkKm#?Dy?W@T8*TKMkxdRq269uD{W~3=#3Ixp?3OXSy*lKu~&eOLAQOaPW z`etZ85lWtJ7s7+S5x6eo4C~#7LC_Pyi{Wh?W)s=%l^oA+8 zWk-R^y~8;8@IqL#dKuo89!&!u$`I*s<;3YzE8yJUaizale9}R^CPJj=3~M@g!?vm?}PKRY2d5X?5R=Oj$EpV&J@P0PcBm4PRyP!MaE} z+_BLchs|A!FRzPYtMObcwjawnTbIk;_h=IA%AO3q8y~Ru>wabx=?n$o*tvM~!g_EC zPC@nEUumMwTXtL01QO-+0*W_QLeOasJn3Z*4wtvl3*SfM{I_b5S}%+uyL8C&6NqQ^ zzCiaBbDCXlLWL^4alK*|9@1Nbnfqpgm2+9$n{U#D87sxzmz&TZ>-WM5Z#9@K;0hm= zCxKC*Fk`($3nuXfQ;~#IWVP2>ax}RGBpmf2s!AA)9-KkVJBR5t`6rY!VFc>z{y-OA zTn<~Fw4#@J2~6Cd1|K|6&~C#c#47U~UEMi|iHLWD7w@`=W03~YHFSWiHEH-{qYL<( z6`}gH!F28GgE%!f93J{UrbhcK@%yv)G;Cok`s4=@tH^AKoxYL0auH)BwQs|rb2ZR# zWgLpV*hvZxmyxrNlVFjhEW6siIqB;-WXNzwV4PF&{>c!K8oq-#K6{D$ zRuOn_a2SmKyqi86KOQ<}AAxmwrSMdA87c+b2mATi>}mEU@H%P_4qw}V)3R!@=X)?c za7mG6vhfAU?CFHeSFiD_?L0iHp@p&Mp2ElMTkLZ~Z18jhgV*kKk$B;+EZ4-TFj6^- zI9GCL&~qKqR?<|PCo2fNQnkcmlMCp* z-1W!+Rvjy&oR1|)Lw2$1rpCa7x+mnu;TUSClndvN>tkAo8P2+8Osw~_SmCG6gT*o} zR+TM>ju1JA8xC*ByxD_M&cud}v^hhKKOChUEeEMYR|Hs^>!XOLwAU;3RmNWWbn(# z%dp|NFx&>D&sV~Fbxmx0u!}@w2r=u$cEh-tfzYjXftD#%0$=`U;E-AGn!6yi*u9}ni%X)tT^1w1%H9F3ahA*Y*%`CO4n+Fb`T zBSp1vi`x(5Q_c=7f{^97>1Z-k=US5bjt7+B0oz8L#A$m zB^wjKdz&+MzUhR(*kN=oHHQ$FNHS;s5|rg1#!)>s4i>5kF~ZNpu+w2NeuP?TE$WKw zh7c?1 z)!|qg>jPUI=d2TAgb>nu^-LNtCHjSky`@%yqNa6F|6hIx8{@wUV0^W_e- z&E@Xh);|OnrDm3M>SOSHa*CXOI|kj3-lst$55UWtNicZWJ;S4Z=gFYWd$F)-Erd?n zMZTIHfZI$9H@Y2);5F9oGpN=_zg0j-gSD93EI} zQIvGgpk;5f@#f*P>;|*RIEdMctL+aP*16}SYh47aOq78kgAPNJiW<(XjfY{8v82wU z8XsQ`X1OhRM7-V$a+aU0tXouHOlMgpqsDYsj3GKuzx5liF8RUO#z;_J?S@zCj9`{a z8O;#Ahl5=WaNaT<7Q1O1raXMdZqC>Td;*TB+~`IjAc!u=iH0V_Lue`dhWK#TZg;Iv z6m2o32jA?VSr>$uDbP-*dS$^|S$$X}qz(~F27`gYAX1V4jx=sx4yS~A$j#zPXzVJ0 zb4S%-Ld$lR!R?!*N%1kJnCqZhv^JBV8Oz=scOBAE9-55y(Yg);EL3*Ib+4v_kH|Ih z(6|*ob{xWPYcaIee9Rh?v;e-`E+KOkSJ1tSjX`Tm5_GzDkQc|tl6{TJ(Dbnd-2(Q( zlRG;}!?i6~7Pt<-yQ-7+@W*H;7m3fSIhePm3)G#%>4*wNWV2lG(_^W|A*&PapY+YEB}K{dHE!0ZQX^{GGX|9e|*tf43tJs!UyDFP?{PHZAe6^eyGVNpwYZp;bw0g2>i zx1Nk4%^E;e)F09lw;c!``(*g0Nr0%!m%ud5;mCHEC(GZsLjK}7^gD0?(52lAes}B#8!Cf(AZjCwB-OEdb%FdI|lDXJ(NQ6n#E`m+U;#50$232`J z5na+pfXwo5v|-yp;-6uK?Aub{rdb0K&Kp2|*lesj!Ov*%E&*4M&2TGaG_jUfL3WHL zd+XOdP`lR}lgVi8Vn?BLo(YyNQ^!Z;yIIOAIqr z6Er^7;hFCf@j%BgPBrL*S=J7e;~PrHo}Yk&uk1qhR2wXnP{FDs3f)y1)amIZNU1zb zjjvpk;kIrNH*KdP!*W~J2{BN+LqIc5E>?0V`pvZ)EpM_h?)$l-R zdfof6nIN;S3v5Su(2d4B&|0~GCEw7+ZXXg38+Bso6LU(s&1{GZ?-R;;ri^p=T;Xl3 z3VAAMO~r%*!T3reI(?JG0x2y>f0hHIM~C6}^fvlVQVOdpvuVg0IjX|_QHI^h!n#p~ z&#{{m3Ox^lQHmLft28vZ@sKr3dqO#$lbp?(KYs&M?joqvJQm*H_eAlL!cfzFjTKH` z8y4AcX^R^JU3-^G^=PyC`Ku|(9M25p+FI*!VH8?<2!7mA1pe%cAT@aqe4Ac~-^8Lw z?rDD5WImX_(%1!Weq^Bdv0R`>=D^u0VjQ!sP@EOF5MsrZMbJN6W7v~jF*t* zHJEvO{XQ9MGX{rNl#nOZMKGAZ5)SGh9#O)DS6Tc_Pdj~=an|3VS}yVr)Q$fINe)d zx-J|Sy!%4^UaP=c4O0?lE(tRtit+v0L3r)M0Z77y z1RT>(r2C_}IlXu}oSBk{N3V4tpG_9Ec$n067#{`I0k~*AL(ZKE*gJN z22Z_-#G?2DSoxKYd0xZ8*XtLsP4}0A&J{k87@tWL%sWwk8gA;e)>1l>NR9AoBAgas=H}Md zr*<~rq;uO@_lAg}#_rJ^m1D*jyS1B+^+{%b^G?HyV|Jk9(hxFlSq;>Y?J#_cE&9KG zLAyuBvUX>0!C==scrnxoH;)dY!%o`aT7_*mHf#w@ZC{Na76rkaOYQVzeGi>gwjC>7 z&yla^buq(}d&eOo6Aq=SV@bCxeAL-Sl5ZTS4Sh$UIB7Hc!-wZAvSuz69FxHZ^-Ebs z-qui6aRR5uAEtt*hM`a%2Txf@cOIZGjB!4Jj}sx#715`o*o8sHXH<)`VD>U z*1~RIWUoH=5Z2FFk9R+Z;r7!xxLvZ7-du2#RUy3@LN@@I`j~?3!BuqLhMDNNd;+OA zb--y-(L^Ox7GA5~fOGK?;2fC@$t}7l@g@-bvcfSf$ry~ziNN605{xW=Le)CM;EoC} 
zYMvJ16fi}QzdH`-=<=Kk~T59rijBPmBUa435XCqLENRynOn9_ zXn3KCwV|SuMp}oF+vyLv_aDY!zF`ejiH|~gkuoyNBNJ~n%A???3cQlS-D`PFC9TJl zA%0XOY2%e9jW)8V6T1aBja~x8WeeUj{*Lc9bJ*+m?m!Fs1tfiZ4;9lCrV}@{z(n5Z zFfC>`$TzN_KG{d{jGGtCXXT;d4-M3oiK6e8g#)q3!CiIxq5O_BIVE}$Wfy0_EQ>bq z>nJC6R|Ie*_nwAL$PC7EX&N3+YNzHpvytQurCW8cAv4$u1m`zkv*b6()7k|clMd5c zT0w9-W*%zCOd^*KMnn2)HrY152L{d2A?X*tW7@VK}=unTP;Ok#U$8+aWRAd?&Mf+pv^CaRlW;?K& z6d=7@3__=$pmMsppjbB_Z+tmR@r(|3w0^?LT~pBD?H=l=qe!-Xk%M+meOUF(6;G(P zu#OB9gqz&ENv-4ZX^%uDp1D-Sl6pUanu(a>L(_>EQ?wB;9UG0i62{}kv-^mZ_CjoW z9gemvKG0Q5#yzoVEb?I!l{t_CFW*zlwa+C+6QhaQ4sC3|KLu|&H{sbOBmu^3QUIRA(ztwBHY9C~Luc_qJW|&IE5}Tw zzP?L|^7I1CTUAf^vnA=asxSDyx`Q^wuD}Ow>Ja$o0ST5~fn9vh$?4SfP%W-*v0W9dDcP*4r_j3bW8Pf(eYSHL7DvF-z_NJYpdQ{Cy znu)Ft0Uzg9R2XwCes>)^qd8jB#oQ$*wF#ZIuih^kdM`j@Qee zTwaSgnzI{YvWLQveGRlEr4R?JPeA=p3wF{?Zfsk*i}LKbs;@U#3bVUAQEXE=JQ|e> z)4QM49Xr)bwT?J|*@Nrw*nB&*)ZHOjg9D&4GXM-t3Ar|bA7WbWgZGK?)XgRZUNpWU z#g5UWnfqH?4OatJ$#8G_CZdL3R@#9t0|&tv#Rgn1*n{+T8ya6a zO1HEehP?N}I6F2D(u)&-?`kj{Vnt#1;9Kx*n=i>17i8*fm!O&LXm(5C399~a9p>lD zGnwz+lUM}_&fUlWR^)-l@H$!;*VymERD)DVJg0!SM|9xeciB+Zq=FnxtTIE&c6ePXpLg_Pdkq74m}`tc^;fsUqi+!a5p^LHWJ&# zrT9ZM3%(ds?V}rR38^o zcrz5j?z523GzX%ue*kHVDpFhHkI#I2U_3d8Rq6)JMOQJrchv=7m^>pBx7>u+W7EN} zVkjhq*rL^p9k6}*Nb;t$8fzLq5&h0Ya!W%5B}9iXX)-OO#9xQADVI&UO#WIs;XH@-%HewYA z?()deg|iRgE42#ByYB(b>=1^Xfe(oFabK*nR)AeMJc(B2L2~1%D=ci(14*BS;4q1w zBf)hGfp)hcW5hB%ejo=dhX&({L)!4er2wv3J|O#hxIXRhUUX|IhUMHdo9VWRWb1<{ zn8X|{*aQ@&h>NZaa$e=rG;SBG663yY9P|@wiz0RiL*880QP?=gzqOcSkol;pwzabFg#ik?4}f9 zi^@rs>xKmobKZ%4Ch ziS09ctm4T7z43{V#%ZSSFYhGrtnK(nS%9oi5X2V^8Sw3-J5{ z@K!e!NOv756ulzi52fL&krqARBTF~T2u5-7db)=s;=$u~;JkQDol$ZuVZR6hi5rWF z-Nhy3mC7FKf4T^iH{67IuG#Q?avG^m>0;}QHHZDP6!7VS$y73aJNR;cBQKW_kNfk+ zZr|S_3@f64_e8SqPk$^diK9Jml!Ektu z5wh!SA;!fDOv{gO^T$s3{mw1+<6>bl^cKPQn%*$yd=1k47}nnU7`izm5)a2r!=CJP zQf$S*n+rea-VI@fQ>@%!wcS!f{`v=oA1+#fK~-v9^QlPC=JCZEaUmjgY!7}p=|g)Q zBH3*}PJz~~Ff_CZW9jA=(#)|_QPM>Kf-X1Gu&w)Gy4YTtBY7E=N954MX>ZBbHetNI zOA2qk5r%1%6{O_!PFVfWj(gW7gpQ7z0$Y5|?V-S6LtQG|DIci_d5MbyPdhK6h^!Z&P7n9^Q`9hV!h$i@~0L;}b- z-bOrg-;(I#C}zm3N;?1aNM=K0C-G=L3M)FR;il9M+I9b(p;v?^bboq_TFtubSRO@A zWXv@jR3(cAAK5U!>=Z4sy8@A#+n_`=nTC(72Hld~Yz5~GGTe794!V%VUe2=`MB)&4 z=1!q=_A20%txxa--!1yZGKWUbtj5^2Ga36^IrvCZiL_a*z?<(Jq24lu-1R;UMOPd^ zeFPtqD-=#6Ne9mIP9T%!Oa(8sU|1{5fI?e53b+R0<`WD(cVsa;I8ThD)X{=lEV{_D z4W77Z#$HNH9?-n2@|;y(AE>QXH!T>h4*n_`WMM=j2ph`K_+}aCJjurxo96>3U=G+z z?T6SMs?4TiT2$d>68D~l2u?3LOHQ;@(-*}~SSENG4Z_Fb9Lv`*==lMZe6)fQZ#;}c z9XxTBzZ4Tas0ubUnW9SAdio(?Gx!?$Fh8CcqV{kz+-$u9Sjjr{zTIJ>^l2hVkKKs} zxn~~+ks07D+d)U|o{_e>#^SBl{t_2b}f z9|C#4^&q=sxjV*xKtuk*IARV&e0gCWJkie_Oi`s78NQ?WivJ$C@r6k zW``f6llBf|35NiCc{)y=6a~+m?vcT1qluO22ezMwA+bK&fcHmm^Rd2f=qi<^=;9YZ z?tKWMLP=KmF@*v1sM&a5jg6tl?NLs^3FkZxC5@MM(Ay?kz^U^s>pPW%(Jex#$SVeS znQFGiDTJ;vhTf`IMg8sJcgN?#F{p5g+W;ujgLPw z=yoWUwuFg;!yvBQ=d?g|f+nuM@P<8Tge7P{WkIOp5tM6oCz8wd!o!h`!0=21;p753 zC5ML=B_3njaqpz~UQfo(;+^QB97^wqoQ1`nmq6$Q3yRnBG0P=&V7$l>=E{gP_K3J3 zP_MWPX>UWJK8_77kNjBHWuM5g!?|^B#VVNUeV&eYHDjytT_P#d_d!VyFXwUnEEeB! 
zUSxHzhUHSLu|!P*A0|hWuU~diAMXE|S@Z5B-T%QKB3C)V?!(@!GrExedAD(8=&faw_o0h~hQ4M;kR% z@a5e&HvimM*0M=_#9MhEq?FCXs`ydNwm3~VTKA9@`FSc?ah?STE!oLh{9`9v zF?)$KSJgoaCkk_F-jGeJs&K4>EBkTLd=xI2j0eM=;97YgY1{jO1ddChm+RZv!*@Ty z(hZStaM2A?nm|ChZ74onFO3f0&x2d}NSvm>8?skcp=Qrf*jHIkhs2F%owBCic2fw(T};7-&_uSz;>$2gG@qL@O~vrI)tFgyh%Hy;jebi;qMO8N$ffC2 z+)V_N&hS9o@KNa0R7+B{xG_`sbrN%b2gaQ~LwuU9gXNb{P*n*Beql9`klqgR+<0zh z$URUh3Nt+Odb$DA8Gv8TWl-YI zP|HCaZxl-xyD3Atq6y>v{0ad(C_p63+{c~dq3x%b6(Ha>li!YhxGS^Of99$$u-hYC&0-`y2Id|66o_C zrV|DR(9;{wl<^(m{YN#pQxXfKRX(KQ-%~ghJOgSaB{|>D29X1sA5pt|RhT!@Li_1F z-gm<&>?nBz6ThQL&Dv)C=N$zV&eQo{PJN@@N#Y!}f>*?|3I^PfrJr!^C9 z9|)uJ)&h)_T^Px~FGzAV-r>xiN4#l1sc2-EON|%j<5Cht4R+~(NMa6*dIrF|L;pyf z<~7FDe-_*lxX$+X94NY{iFeoQa#Rl}n%a7>+VXrS`g_YmqUaWbc5s2XoHawmgfnR5 z&U!s28i~2?S-g7UJQ~MW(w27*NLug$9C`i=Ljn_E$Ms;m5wefWT|JiyBqU+oI&=IR zLBPq;2+P);GgTk+g-I5i)W%7rw&~-^yY&-%|uVJHLRy8ke=C| zNtdc#&+TH^m`;S`5f$~KXB(O z6TCia1$=eA%rok`gpY=sQR+($tc!X^l;srRW4QsijhleU>|1PI{0}6os-YpNklnvX zb4=9Mz;&Zl5TAJtyX+T&>iNmAbFUTb(X)dGu_s8&4H`#hv_nsoc_xXOeeOr z!}rM9G_g<#4@l{PxNkH$?Jx~ulcsTe?sVdv!5t9z!~uwTC>#ko0`lKqLW1Ksv8wKb zu)*Clp>s9~NHK;&W`JCntBJiM+35S)h{K&+$MbM0Loust*llnO-~Nz;pSltpxi}MW z*V2aH$vea2gS80dB=3FbcJJ0Rxne=ADkCa(44|httTsC(;Q2vxmJnBLjJ%T{Xl-KnGQW? zS>4sG1)8O8X>f!VoL{q;`=Rw2sDBfn+r0W{y@woWc6f?+R>eX_fgm521(5s`m8e|P z&a0_iO&S(t!;X_dq<`uhcv(CdTJ%!kV!H(FV>#eH*KFMSD;+ac#&J{(%#+OCpW`qK|&t8MWb?fkfVgo5Wvj<&|ond`BE!aG~gr;%a zX!hzzD!nj_DxSS=I&;Q8ly;B6lBnhQzRw8#^;(sys!!)JQRN<2ZE!jK2~mRm*7;<~N|RheRwEJ4g(XEZ zaZ!dYVGevCD;CG1hUj`MEw}-m-xuL8j4}>ce3?afG?3ev@Cxt9UpvkC*9{I zY24I8T)*NjO8-|2%^NIWJn;aIPn(JYa{Hldh5{ZEzX*#tLj0zKQDlrancmPAgog1> z(~*iH(}`bKus7=#Il{7v4exkVot;As_ar}H!+m3N6ZeDe~fEl7oTkM&r#-VCBOLr@L6sC%y@x|Rgt z@(xodb#1~MMds+%GY|I+M1!JhEbSkdLbmPdqLb|1N&G$mY_?GW)hCB|oh}Q}bGZ?E zEt!M{K?gxfa4(PZF$RzRutIjT4po2q$dsHScy-}94T!l$+>X2?oy#7;nrcHLdU*@k zjQe1?<{>0AhWMgAjuE?;MOJPKCf+v6kmEB!-tcOm@Zo8b?MupehhD5ex6Xs;FK3TA z4_=|SHPC}=o^i`R3h^fPoP@TBUT*)mApRXTC2?!ZQQ5eMamX1#r@|ee*it|v1#d$2 z<>%G3AM`xCq|V4>&{Dp=U!BraFkhnXS?2meRybzUYg#;_JzHRsbAWR!HQ# z+EF(8GWmAflIOBJ8pbrvBX4&q&YGCP6O-SM+IL>D9)x1(bBV$Q?pEYp{$$Rjx(%2% z`5xp23Xm5ra-2z_%i-FL#pH9oCTf3LOw8~7z|MkMP`zdV&B`0_QvV>G{c9zJdVa@S zL-Ps0sDZXmN}`|Ip3u(@gP>6M8h7*?!^1ZzT(37D$xKw|rRzI;Q2k9pCfG zmqZV^xnl~x()I#V7B^Au4#02gj=|8s8JxlVzr35=T<~^}q?qOgZ}Y0an|~3%g&(K< z=55S}sX=fc<1-BJy@h?%dKh$Q7k08WZ>QckUc3IDRM_}qvP1<>Wgrlmrv8VvI^yt%CIJ6aeE8 zN17pHN;g#~fWPK){5BL!hyJT#K4zCfenAu#g-M{3Y$G}^&cPFBeSj4B@}9Btd|R;> zT3)xMBR2IA=E8dPB&uop`xKLuCqdXiwwsnbErnS}q%fmOjHBAGj|%N|U7@RI2znF4W$M;xV<&LIJNX4jMdh`G43#)%2)ui4$cAnb2p&N z+J+t+6~aM>VQRgt5koI7#M&!gc?<5GMzMvyu+7Z|w(AGs-${X>?{FXWOvJ%+TOxOK zLnh-X7DrrLvoV2lmKJW4;v{!x?v$VU4lF))n*c>32=k^$mxV6)Lb}ODG1)DS$?$ zG2AZo!v`))p;0pfR?8ZKVxSrNy_n89)?E*~r6TFJKdg^UA|4OeT*Y=h1gZLQ{NcN}Ag9AIum35nf$G|Pv+!9K9jjlr1rxqG{{svg|iLjr`7zwrX!8US(o)ScuDd&ey z@poX{WEu`DNMOgoa8xgdr;4JjP#9N@dL#zU;Az_QoQpF{^dX5gy+`Z{2Ti-PFjoJT zo;wo{%+6EzFLMLAUor`GgVV^DgcRZ)|EcC*V*_mwnu>28nZR{FVWw}O5I!pxqE=W8 zluRvzDp_q1cRYurpbXxAjz!aVE^u}8LU5_Sfz9QjL~>CFjlVQX^FMxIx;Ey4$$r+i zU}b@sAJS1VT?)3vg`=K_JX-Gik9XjR9?sr*9PU|gA!odd*04T)zh+&;lr><}7z%7R z2;bIf0m}R!2W$IL$tj9a$kf1TtmjLg^`j1~)Q4$z9Z)Vb937Vahg(D829Opuv5tOSb(jKxJ3XlIH+b|CD z*QAlYSCO#uq$LPiH{u5UK3ZgKik(XTkxR#|Fnp*Ga+Vgr-L3(&b-fLYE9>ceeFgoN zJzy?0y1=)C*~B(zjEsxtqe}P)S-wjNrj;Ln3yB3Fa$<-WZ?yraF2SK> z0qBoQM%{RScDE4*V&-LJ>Aij&O0K{#QDJ_kPXzv&-i()aT_qih{mHmfJLnyBqRWr( z2lL+(n8mXBtrj(~apNe|{2Rt@>w3ENL=dkr+=;sG+C@72+IZ?m>+n>5Bt7eyN#pfX z;Hl|0ob`b9Kxn>UHaMwc%%pj+&b<)#A521S@FyZYZyqOX*)wQ9(~R#QzCnY~?WQ>= 
zCgX0mNxaL|<~aQsk^_gg!`&}4L2s3hX<_as^!+RgiaCkobmeoDs}cgC@+Y{LU!-|PY(_jB$K~T#Vgt>7v_a0K^OYZov{x4Ns`wqILMKH2c z3U6Ex!3g6S&^K0uCW8{31#Atpth_^Sj>>`I=?Lx*vYvn4N}3}U%7Lj(LA?J~twRqH zTaXFM0?E%xg!(3t`PH?Up%)2nmi3_NvE^Wy>1S%cS%4lHZ-Iej8(?Mg1PR_>Kpqc& zq-V1uiLb<2n36vQmnimtqeTSnXbNG>70q#0E<^XlWRocilj(7P9*nG#gy=#)P`af{ zg_w;U`tSH%@4DaM>wM`ThKLh3SQn20sUf2(6? z+4wp@I1HXLXHoI^OXgfOmrfDRBLBMm@T;>Zf4inT6|fAWF+Yl#cn?pi9JGZOx|r26 zU6NqQfFR!8(8znqT~E3Uc3{eXb=Y^Yh14ATRI7X;1g}jTfrdHZP^4K(swP6o_Rs#H z{6HQh57wePWP_XGB2@FZ1;f#`Sj4?R_s4|O6V7^gsbK|%9!i73h0PHEzLfL|Q}}u@ z7@IiTv0ZpKeu~(DR(tNC-C-^C?p;kKbj#?F2J6w%Fo5%=Pl?=PCA<)sO^do3ct09u zfcT?0V(U@>bSRv>(97ix-kXQ}S9J3xjou@MkB?xxZw)jan9cQT7sX)KW8>B`1@qnV z$*CKWa5GL6W<^{?0l$4%*>;pJ4!;Wf<@!m7=^)XN(!>$zAlh$u6ro!X*GSF6hk6@O zw^nQRCiA0lrx9BAg^E3B@sgwH#;^a1;OJ^_E|g30G##Mc9F^c{x5 zl2vGZLf2KI=*1lB@wbD{F1QTy zw~FAXGzV4Q=zvz;U2^q7F(z*oqC)K^5LGiwV^)=k{%Y#SC1+;{>7Q(W>kzTVTNSM8px_L6-aU=-W zADSSW6szeMhTZ+oHGtw%4DelLvCQ#0sHMomG*?+_aSt(a=_7b3+d`F(*Dx`PeBAO? zj4xura;k=2n8GrkIeXTb1pEnu#DYm=s926P{;fm*&Qg5Ysm8fnFTe?>f}F3{4f*|1 z(;)K+>mwihLDc2%lOg@Z{9f%VPzaWIz~2xix4$M>kdE168t}2YjVwFz42%YTlR~a9 zOeCL%P?IAtd~GFk{D_43Jqyua>l^w#zlV;`B{?b(l(Sx$HvpB|TKA`>%Pfb4XF`cN}qL{#~Ag~!qHqAp)!b{gFqw~~guyA9`hBk<0{ zAiR0o4CAE;N_=R=TN(jWRJqZ-xh6w@}W}9)|?P zV2@8aZ{J`$oV7@y7CP;8(r5t2^&KH_CX2NFl*Aq0E8rjNMQNKo3YI5)*?!0hRSW;2 z;`~y`xbPEiWSQaZ+FmI6)O=9L$Q`WNL7zO@WW8FIVlPsugu5JJTs8XP=XGNLNq$oO_b%0Vd=0cSvA~- z^R6cVRc6bIwn7l$7y>DwrlgS)YO1_Pd3GtX(F7E zEMZ)}?IdVlYasiy9ue^)ggFxM3kM&w?8+*05^=AUbn8XKnVUv%FT>XKX!B`&FaCx^ zs9MoOrI#VXDwooPrS!IK4Y}_8oJ?AmiVqV1z>BJ>c=k;fCNEfn^FQgsPeTzRd2@hQ zKZW&?H?Xrxxe{*mXJC$Z0{8s>s#=qOf;cD>4*i>^!;-reP}cYs!vfS$;CC5-T_bpe zNO1PuyNajnThQE7fKHdMLGx=TsMYQoP)=U}pEh(*^UcYybH4<%>2;8O(xqf%M2r)c zB*fu*mg3t5C(%^B3O>KO%ly-qqDf38Z2D9K^XuoMXSfI+n8$&AUdgPNYZavQJtnU| zWU>DD3T||DH9c*69KBxGk*~);kkv=#p#WpSTY4-49MyFpUho6k<4p!#p|d1*_#Bnq z8H)kW2Ta}HtRi8;xkOXJ7_C7GvUk+dJwAJ3>#MzV$Kg_l6qiGTruAS~ycs{S+;Cvn z9^i=BP_4PP;JPga7alprIJhl@ukGyjU_S%af87VkYWs02wBl!L4sm@Mf#!>w(Dt`1 z899*5wC3ef-h(RUj;tXjKYWVHwV$~kUj*@r7e$ek+}$W{b&$ICdcn$&PI|s36dpTE z(M1;uvGIB|sysXjuU9dc)@g$4jRWbeMc3(XubFTzbtW?{DT-!Cp2w;iRk%^r9JJp_ zF>cEed5Q7W^aqcDD-KIh-l>2|xLCnFw@oA)BML}w^*(0b=mtD3n?lt_j3D&xX;Ra_ zkO*aXLG-E5Jc*1ss9KbS(b?-UEg}sUD;dt_IkUHwD`+%44e51erWYXh*R+0_(HFyas9w65y zg3H~0prFHZJT|@trgmE4`bEj`?ZFt;+<^GuzhGR^<$yIG6ES|t38KTS;B;=UMW5M( zbK10y>*SpY0-C~{IbYQHJhNr=U`RR^M2L`cI|rc3bu$0+#w}=EHG}`VEsTR%&!okp8xh(59v92^7|s>=LXZ<;aj{VFEWth@(mhqzJzU`zR+a#EA-%PVO)0qIo_A|Co8Y@ z6FZ+7e4UahoGZd7i39P&6IT8xnkff0GNOF-{m0SD^DsHM=`PNC{D$gAEdzsTfhagG zOBBDZhk2LAa7nl%TwSS&i$)sh+zV3VR?0TGu|8oI9+E6X zz)!lCNCz!~9otjEd&Up+7F>o0N6Ls)s}KyWR)U>dS^cll1F!$fgyYldY0T1hC>+&G z?If1*Hkw4B$K1ylxZo$MAKiu50_Woo$t2=$)Bv6PBjIB~1{6mLV)ME@QemD0P5Qn} z+{1EQ>^A_Hk7vNDd+qFN2xDFEPkeXdFOHs0z!eQ!LFtSdXQpZ?Z>fww9DaBZW?!m+ zKjYK*GgMM|XC5^`=43XvBT^N)5+$a^)%!tG>k>|x9{~HLZ<#Jo`%06h?}M(2PDVjF z4mYn_2Y-`l(0|u5IPiQXWzJ}WY{W2c^CAbB*dzwlO_k)GUn(45;D-+%iGqWqKE!kL z@UfdTHf}kHY7s?@&BB8u)M69L)?H*QcV8nHbu6fKpD43|AIwYpF9p0;gdKh+9shYP|wDmj8#`nb3eG>WlDXu^QnPEk?9qJ!mmY zVDXP5IQw28v3|XkjEA&>_?!sXTWSkWQzN-{YGQO>^aXT$(?h4kw~+WqWKz5-nR`zT zU-@U^uy{1eCSM_<>onn4-6A%+LA)3GhvCfC1dKRP0QxMa^+56?aViu9smxpS zl6X7IIk(bDmr_8F?fJH8Ra3{D2zcf*6YrZFG8IPON!FSG?CzOIcmJ7Swh1vvTi;P1 zCs`CeJ{_IR&cepoi|CqL;#m8>7Q?TtN6zUVWRANHO8>IPK23MneEbH_UgtD$=bpr{ zfjB4{UJj}If6>v1ACNNaM#A{NLATC=_B`q(K6Z~VxZ@7y-o5}8Z{zv7-&}rD= zqXuT0N>C;Dg8mD-Nsew32HiXdQ@Hpz>^0nQ8Bdx=3eGQoOKs`I@h@2RDA3a5t%6MvF6X&|{N~JIO!Hc{BwsUObPUtnQq@rit9Qw8z>A zW%#&#kdFQO%*(c_ApDqkP#2SjCtWOaxZ@Li-%?>3;+RJa)Iw0`{Zr_h#D_y2;+$(w z5KL(oOYlS5*Ld@{>W;KaaF`M7cH<+S(q-_pvPIPCdE<6p2MkJ 
zoJt4y7ny>9J48K^?ZqmmL-kA<{<$&(CUST$?67bHRgKSJ=qJICVY&K<>Z!QK3rJLK zGpNkjibG%9Flk{BPS^~C?aDexUh@(U%rW90ZIi<8Dca~8`wV~E^YN#kHG6Z*3rs$A zV%tf7xKkI7ZY5`-C}juexTvAU)D&Xrv6JqMQv{{gT$;O3n(zMcEt*$KHS_psh)$_x~&{NzHy#*GP#-XXDInxE`8~ z%~JAI{Xi17+`NjnE#j$W_EB``n*?XIHo?redL%ncF{VG1SMQPze>N+@c~IqNSv&+a z#V8ESw}V96N+x@x5*gj+WV(eut-iy>V^w~jJtYX{=mhg-joaX>?}i{7sDtBfXNcNA zEl}NYmYBEw#3LmiFi|iYo6KgRxzq)malV$WmE6hZ#_fe=bMr~Ael_~9_vb8JwiI5k z2?PbBT{!>gG?ESh~*Hue4V6KW4k;a}9?pxw$Fc&k={AL_CO)-;Xd=V}KC zejNi#?*9f~7fGUhWC~|){y{LKd-1okI!yVthC1%uL(VLzfQ2mcv2Z7^R-g6F;g)ie zb8r**84dClvf2rYz+;-23htTXfcvH&Mhj63hzRzlb@VYWu0#t3s||S;iR|tvBnw+o zzQc>arJRK7YEqEdi9bCEa{kGZdj@|@#f#f$)cZhkcl%AW^cLlBut_E>=g+`D0S|Du z^9b)<%szOnUk5gu^+Ec@1w1p^0tYYkgFuTOT39c}R@ToMRv(6!R@^7+ZIp53&{VLW zl7{-r`e5F}e3Z$E#9hA>Fk@7MzdGhQx=+=h)~)TZqdN=IN_0`hY(1QH8$`uDy+GH* z!vlx{?vih~uF(P$Iwo_@weH2KnTyHu4>4fATm*hlSx!R2Mo9goLAx|gVW+kjZ)0KG zvd`7Sw7vo`Qo0v}uB@XrXV;L4OS*8@#fiMKe1S*OU-NpU%YhdgM`Lye;vA(pn71br z&vouaoAm}T&P+#Y@(ABMsxxEZjpUg?Hob9W6XZHxLEBbo_&9QcNE?2nZ?vrNP(4!N z1M8qM+YmYw?|?=_EHpkT#V=oSh-Gmc$Y{PJWLhY!_ha{ep>i0W?FWL92f0$){?Owp z@9|eZ2?YAk|?S>$4@`1c4s<|mS!2a<8q;2>F|Ylhr`X>?OyD*7sv!->wB ztdC_Q+-pb%(z8XJK6%%+bG_rSM%espjDLL8M!qqTLp4r!m?=8&T+@5aW_LA7$w$UXHdbbZzy+ zE)7@6OVz=e_FVWi<_Tuq|52-Dnh@1}3C~q!qt@98^2IQM>&9jm4>fIo)A6^-VNo_W zL;WPYDxCuM!&$IkW)>rBW(N(E>=^Z~SmO6b7QY$fL%mTFJKP)rb@5-M)69qn>qddz zuUyn*B+*H9n4FNAORkRx@NTM(5qJMfJJ0i>P-DOA{ zTJ>Sl$wk;>3&hJ$gs#4mi#5vHwB5It7zC$c#D8<4qV*U#zmg04!?ienK2#EQjvUVR z6XuL6In%pOL*d-XWSE*ThQHzfmRY_;`8k!i{9QOX#m;RKHpakHe1)qgui%%hzlL8b zD=|a%B&jGHhed^95GN~+$I@#d=f_`+zH^7I@fLW}O&QLd-Nc>G`A&;osDMFM2v+5) zf#5}Lw3p~ZjzuswYfdM^XA7y_$+@(&<{b*GswY++4@l)q279_+6Yduih&z57rtiIK zdTY{4(4G+w(a(+H+0j}i>(C0g#rlk%_HmFO&Ba0t0}=jP@?c3Ud`-RpCWc(5&7z1j zP7g-GIvuiWMGRx?c?mQhRq<}0569T{4zzlC)^t(XIXtwo7G0CwFvr**a^{?;(OaCz zkAiHvvwR&|`yHmEbJ+Uz@f7n;x(cS&Y@`|au_(!uW7*=bM8JIkT$%2~^;;tV>IqHE zqdG2LbeW4i;|@3&5(Gh;_TjYsiy(Tz1q=&SC45#RdXb*N%dpuBTBi_Ff*H-uOR>RV6RSPHz{ia`_~K^_`pSHP&VP!Wv97DQe~Acxl{xpmDSZ&Sz_l;5qK1uiri!IsP?fL4u!CK|Fbc4cMqp&xkdDNWGf6;{KL%0 z7eG})4%TY_(7H_Wb9#S*QX}?P|cHV>j`u4SUzo;{$hw z=}-FYJDoUjw?QK`U;rr-4R(s-(rm`q%~jLF%yW2P|;LvYPc zn)BzV48J9*9v1U=Vco7_j2x=Onax)~*7X;e_j(m-sVv5%gqqT;_yU7Fpo7_X85)FTj8W4qD(r9127AAkOX6|i%MZ_ydaq6Z5 zrgeWZ96rOvADjqK^0|(te*bvqUOZup^$*j8r?MQI#R<$J`Nu?0D+3I_Y=fH59n|Fh za>!RlW|Q+-?2QsbTi-r><{%4o|1H8bqgRP#`a0OP$N(K5+tMRmEnG5d)jyS*d>*gx47bm0u?e4euF3<>xI4R6~W3@3T=a3 z$cG?o|+ z7>VI5Dzd86n3gPAP1bKIK+_?DjZa>qO1Cr3H#b%LZw!|(NEFyGpC+*%Bfc^37 zNWg!B%+}R``1ootemHX-29pkBf%1M(eSIB%i+`k#!at*2%Vd@lq3ekBf_4}`BEjxxdvS!s@R+JXsF*Xtn`|6MCfD6&KQuE;Pg{@9 zC2?qe>nLfN&c%0Kx9GTb5(WkYLio;R*qUs~c}EcB4%>s?cp}_cU4~Z_ze4E32P9$m zJBm$N4c_IIG*sd`weh%$1O2yQlcgev^xsF1?h3SNS_RHNY3Q&05UweIM|HNBdCYoB z*QA!v1E(4>wlWDDHb&u-TRphaCIVdtc0ljLQ?R|v4)5r_=cegi#PV0|%n}zb=-ypM zOv3(>&$>-GX<`S8rp-aiCw{e#%l5;jjQ7kd`FE&%ED=Ab#KVVR17fYQ2Ys%y*@prH zyn-joz^Ww#`t#k{+tgdYe$){Zk7)wu>Q$O|d@^;*dq=L9g<#W{-yrp*2c`y{=f&;& z36qXg(en{^$c>@`W@dLAM0^mS_I^X8rK+C0cCj4F8mJMa&{DYj`xsts&SQF~WP`zi z8Q`>XKbG%($6P+xN2>=)@%eBEFaK6Cw%&dY=S*rS5xsydW%-QzP6_^j2Vry~Y8@GK zVOg6yE_f?rDrdh-6AI1zMh#a_MQztSdc&arjQkqtRpuJy&I9b=ZN=htkmT3&7hu-$`Gs1%{VLKu()7Z-qz~ zy0E&qLbDF+-m<1Pca9#6gm@ACTtAvC%l7X0ncb|_Nnm_v_BF+>0>3Hk!dzaM`TgK zM=|hcZz|+1%7yXh5BNuq<)3(cxOzee6o$p<uSZ?(Jvw1?mY+T^~#jb?$-6A~mSunX;K~^|ZH|L$fw}<642A zG=apDDm@Q6+Bj%>?7|uN@4$5wFOf&79kKYJO$7^oS;K`D#?0+A;beJ7Gw6T6g$HxL zaCi1x#Xcbekh?02xm!NKjOSaif2|U%9*o7f|9qe+w3PZK#ZteXC=_zaM8OyLm_vU= zu;|e(G%n}k!}BY#QA-SDvXQ>}B!#l(S7=g1DIQ?W)HTahamu0zQk==oIAbsIWCfe| 
zT#`-`Y%PfAO?LmWGXi_hzM~^XYPF9-!+}j_C#v&J$%>SE1W^fW5HG+B4`0)*PaAJ`VZ%pr+O(;Plo}v4MwVmfmml4P;#egb60Dnf6-G~6;x6E3cdM9VV| z;MJ;bqN8x1cDk=;Zk5l1ACk&^i!al$Um*~WTx6hLvj`6x^}ttO9^?D95JTb~!uvLL zzWwuh!ez5FR?if^n^1K8iU9!=E@CJh5m!)PWf1p2BLh5;4KgZqM+Y{a|bZ`dPJU0RdBL)0# z!w@a}rbQl!YH-|)ilAo943Ad|a2%yBEXkp3!@toSpP&Af?%Szp|+C88JaUyG)q zc3WZWKp&GaWhp$G*aog0wrKwIJe_eV0Kx^oVlJy!#cB34vTpal)t(PW)mEU6RvtAK zjN!#>871%5%;HGReoU9xAHqSN9V}e`1kGn`!ECNAsrae}r2Q2s?rO&Kss?EJiM=}^ zmkvuF{xfyPweaz&1pixg2nlYkrx&NQzM^xPMEvMR)+^s{8rh^vzZ7nT?SGT$wdqGt zqEQmQ`)87w@q;Ai`$yuoXa}|xJ*2{^voJv36~YU|u&G6nmTQLM%eQ@Gq%Ri&c#b4Y z(jWhIB;twgG3dy64E8n(yyjPvIK`f8>Bmb~sg9d5=aJI{2+cSF2jv^^+ItC%lU|Lx z&jf;Oc{BYiz7n`XhnVN@96_mYIa6jl51tPl1LrBe;NgFg%wByLBjG3p&O94mvhlVx!7m<~m70_!-Y;yO-g^ni5DAy$aoupXeR$V$59V zhksw1VP~c@s2u0gnnSB-o9u2FS@4mn)&-*28wHcf&9hOg^DgvRE`;jHco-Kv28qw6 zvfkk_uz7Lcw7767wuVi>FQb?Ed|xKMAM+<4G!s#CS138T>n>iuZ_oQWvxM3Xy@Xw2 z$(VRbgw{UIB~^38!6V<3{R8Z zO$v4gp^`@+Y<~Bhp3Bq0Ud>m~v0V|K8y1n6bD?+kx)27_9nw45a>w zz@}&6WP`IX|N6P(G%GC(PA1Esn|l_$%4USD7b7sh)+tuC!5@%@-EU}Ri|malN!F`t%1`* zYD`oABPiZ^osP`kN)22WBYl5 zkxj%`V2SAu8AEs*#&W!~t9U=3ijtRM!MN8ho_?NYkLA+RkXoHX9ZKh;x2Fx2KN*S1 zmBM)L?+n=Swv{^`ki?T8$OAp=7?x2Jr8h3lg34b`Ks=Y?{D1rL#)2Z0FMG=COPK^a z-(I2enZxA%Y(?y2Gg8JGN4O!=#oNUCIhqm@iL`ev+~MzrmO+O8J-ZVG-{f&?4*fED z!afr}3Y=la)mx;|LC>=Sgs^gN(|J!aM0DaM0No?S}JVm(^sN>RL*^ zU81nXnu3w12+Yo_fc*zNp)qbR@LSh`s>@7jbapa`|Go*&AFie14mW78ULj_c4B+uK zpUA6~a&YpY7`4`242$IBsP}bcSSw_SGrW(1dvX{4t8NARTXFbMu!1?38OtQB@_{}3 z7QoDeOx#eHLLNC~;o2SI_|Jh$w$99lC7qk_=ch-sa*+fs`0PVx`Z(g7Y%lN~{X}Yi zg`q**aU5@H1A~!n(m5f5hjQ&fD6SFipB};qZahg8TMT0k8}LJ_4_fOWwpOOoo_pW% zYC-|3>qerd?rQW1@WR-#we;YZi@4eEG_|&pHqw z-e%+3Jn40pSTCg|4E%cx4j<-l{`z-;bJ}#2O|K^x3YO#iIWffKSvt)0zDho?4#5%m zY8d_woIh3h*DY5q;d}Cxut<=WC|J7TZ$|Hd?$}=h4@SF zi@@$7R_l+j!Pwy`P--}zcJ4S!)3@7EThlo_y=55 zATP`A3?4Xl$kh2K%M2Omkm>OrL^flTwl*tqrh433mfrCY^;S5e(}X-edmDwn@1@ai zz=Oh5GdSnBK7(MhZsN%VlKrw>beD1`yOLo&?5AI%{s{rT^-Md+4x(G-C^vU4DiD&K*Ld_m4=*^)JNau^0NemmvI? 
z3%Lgm)0cGTTw%o*f1$tOBz|8*sKjDQrJ4M&fEZdH+uBq&cdr9ywY^uLM}rj)6d8 zs4WDga(f||^?`;?WD;S;Ix6|iA9pvtCCW1Y;VhR)=(%$)FqW6OvwpuP)7XTT6=m%F z{Cx(7YBw_8PiKLCry<@Rt_QadqA1(ni|79Xteq^+u^63z=U0!Ac(*CoAf=0g?H)WC zvwx7i@Blm#nhG~vgE2bcB<`EFg;(`+5FMLU930{FH$4ZkhXN{ zCiaqS{Vxi{dnVD~!C7cpHo@gcCX&mE{^;wW2(MP^LHyT0WWQYnlRrO&8nao1{gTTu zV_6*>YW)b02drS)9v4`5r5aYPAH?PeHg&*Ji8r2hkj;?Tg~eB|lQTurG3sP4UjNlf z-m$F8Zk=x!JGYArSv?>N9z^k4MqlEa9(y)#znhMqE5yiavY@HeNgVeY!QT~Es9u*c z*8FF~2%d-{hDu?))$7>|xP_VY)&6PJ$wm!DmQSZK=QZKxBU1=4*#)RC4oXk`q1t9I zT`qhF#{$2Se*Q7WrR53PpzZ+jnZ+1?B!&5M~;%Z<*n>PrhnnD^!g9(aabM!eb^vcCRRN7e%A4S&!{jUTDe_n(+R2UMOI$Zr$ z2p109!JXZvH2K+5h;`VF1Eb=gAnA^xqRYYdhzw3WJ%;D(>##;+5=zX_0p~n-l%f%o z{_Q4K3V|f`;S*T%NeSHYXVQ;{y>U2A7I)OH!KEzoQMh#-*EXabk9^i(`NzLBK&uiB z3nQST{xzDJZ^WbZiIAiD34gff;m=(eydUSY$);lwq)a;quAE`-`^=k(>1E|mv2rp; zHBuAKJRBjfw9PfiAJ~fz-%6jP!g7875N!myd@ojr*hmuWXT)&B@_&i@Bw`u zg^=BO)1k_0DQR&I#P95l2CpBV$Ts^ZkPeo?kb&RSSBPa(*X0v6<5sfdus*c&DenIo zLO(l-;j@W&3}qP@c~(i1ds&N8j&I@H{F88RYX`m9um>0Zb0ro%i#V@;S0vQol0=G0o zz&j`#U*GXzW_!kCidPmc74(EtE7rjdySo@Y$rWxspN-W!_;kd@oW?8Mp>0QR(%Pg_ z3@h`2VP{twaq}o-RPd?6=~QgqXG{*1o8x|^*|c1jXDXJnl9$}c@?V!P!`LJx}Ox``lVvpoOD z(0TY{)rN6gNH*CNDY6=5h3CFbC2vKFN<*QNN`nTXY41@8*(1sdA>p~Nhm=T5WmS^W zvWh0n_xuCN=X0F<+}HK{eZOw}ym6T;c=SpQ3dJN~S4tv03X$R&2o-SjLn~SSZ7nQz zd}<+R!GYxyc^0;FMj@Y+Pyrn_3sP~Ly7rtR?MhLspAcZY{s7s3wFXqlSt8#xmACAB zCAMsp!)(D)j{f0F6f>7Wxfz{Yp3&|4-Q(xrb3+GKul)^PGJPnvBp=vJ0^HcH2sb%8 zsI3*rc8vz<>f{s3BxnV}Ml8cWwg^frd<+;uKHr+BTvQ*~RveOF+k91)do(rYlEVXotN3#-Dru z2V$Oq%yTEQM4}O=oE)q-usdmPvG@a#GIGEjhOD1u+7+0h_Xt!fif92=(K+2NQ2Xu) zb@YsYMHeg3Fh?6NefvzpE=SVN<6ZPbUo43{;sd*O7QvK;>&Sne(xnI5;f(wv81yV7syk)*l{{&r2(GZH5fbgk&r{BoEMwF1FY}|e`gnN z)XZ(T`f~;>4GP6<@j#r}=uO{i>hkQXgQ4`)T38oyl}^s#V8XBu)nl_Q3Mn3>(=?QZ zsy@N$^C9p^dne>AGXp6NLxfF{Y)3K?WJ1}jFUzMbxyAM%T%Kc1s}3D`6KpJLfv0u0Kf<3&b&S_9f82z6NR|KCrW+ep>MNIviMV0Uf2T!_x*o6lVF!FE3Q1 zTU!dm{uss`Yuae$nqMe1GZ%LcedP|Or_v2sM)ZAw3r)Q34riBh>FZt|ZJTGw-KBCH zoj#>-{FcSyOtWS64{qq=lrI;+qeP5ZWmG}}GAgkA$~ZdO)N}XMHp8+N8(?+39rd{B z1SeHCp&KDeTJ_CBc!8anUSO=suPX zqls(Cz+Pn#W;3g*x)E3z&*pGG8o)_63y58In3|o^qYuoYVNpT?Ui>efv+r#hh~86z z<9AoXjGf2e?;#9e~}}%Ed|W` z%jiN)HDa=N9#|J>P&tnbNEV#MFpJ+p0k4QYzEOnnb8U$mk!E`1dx?7JO129$h0XMd z@t)~tk@Y8H(C*hTCh<4nX|q*Ss4au+{XL13t62^PO-FtHaq=hm4@;wGd0{KsX?eFN zQF7QskL=rt>t%;vG+cleA6SMR8RyVNWhpR!I@hnks*C8G50g|Sh(N-Y`S?w0GqD?PqG#r3lJwTgu$4|BBeOE0%Riq6&fUr0v(94v zuOR#>l8@e^Q(Q$8_)Z-#d9Ah1Ve4$+XbQ?(T zWtoWDvG{iQ5v+P72*MrWxbg5bcr)`5Hl2M53l0gxkI2KE5uXJR;~EXU_bBN(v_TO~D(d+QyTPB=* z7l=}k;pETJHgM`XPIs((Tzg*a6*>EK1j1}RFg`&EC8Mi2hrOrr_#{(_RrCc^V_Dew z_0!nQ#4P4z?-X8dDuL-cTi}^>2-trMCZ!YnXekv-s;6B+lN2$0o@jw#g9#RN?mN;Q z)<)uTQ(-6TI=YfAz`V3iq<3BYP^a}cj3rqDoECsw=RxolRD!IB%W;R5B(qQF7)*Iw z4$pm5!Mbc1&W$l_$7K@!b_CMiPdezZVxRe$Cq~ScE6%X4C4gobR)bQ@d&=09(Yo_a z++-7P4o)2=SF(zzj_nBHv%Ekj8|PwI(-R1YR|IvHQR;qtEg9Atr#ohFbL~{S z`Syemv}(6y`vWa_=hQ9yT^E39yT21(+YS=9sT3Z^--Tzgaoml;YvFI37-SU@j(yQ0 z{NB6Dy7gSZ2bxQ7trk67-@TTW4 z`gBwP5}fD3ORH9RpK%v;=l5|RCbV$#pUo$?ywxD-h6-daVducBCBQo-geEq~g4Lw~ zvVNopgWAnuo^d7Yt#d}Z+do05tB~5CxoRF(VT464?@?GP2o(H9IAMZrkTZWKH&Q@= zS8D`#r8fiS={>=(+cqH0T1Kd<4E8*#CwFd5!97!UaWtKUae!sgea%(DQoA6KcfUZF z!E=&y*A3j9J_EnDGkjBX!ii<6kR~j`OZrzb{|pMH~AzjA>4MNg=mL>L*EJ==rb*k}zLuEih{y=EkMJhHTho3Er$i6@W z664Z{+;er{%>R`NTfW7^G#m<7tC9raNSs@tMSk-lP(f!9GgbAmv+OXMFT4->CYy*& zNjd%cRE{y#n4nMc6==YsBz%2oHF2+cK^4k6iGR*|yf3?te7mH?x)#DP=3OVzSCnUa zS81H}SCg=nooh}_Ng#_Z&cG~()lfb22K5(cA#x`)m{>tS&R&6);P6KbDpQ{C>li%gdqv-6^K)*R%A;#_23*&ZKuN=oaI>_E zsQ&GvV#`A~)t| zv=^O3BcCSP^+1))41MB8zTSv^-7!@2FJeZ_4LFuvhElUr-AVIXq|7EIho_0cs)foBR=phMt{B15 
zm(^IDV-Y!nX^eNpya?^T$|tz{Da$Zi)SsU8@!^9*A}2;bun-P--CeJ3{?1e2;ZO5 z07afAIo6|w3Ex|>&iAXuoDVzLyqh9paA=fDdR7DZl?P>Vn_zf#J_IgLrL#9x!7kc| zg=zKh!~G?A$b4qI!TI%4totiami5&BjiIymkC3b@t~esZI!zXP;;!JGH1V|o-Q05v z)P4hj^VOWet)d{-upOT!O2emkA?D4FUo4l$o~s!UNrY}5#-5XBIQR4(p~_Epl!^1g z-HCSSr_1^&6}<7kfC;E7JWI^q_>+|`vG{jq1l^WUL(&ATJDMFN40`QK^hRzH(bL_;I#Jo&e1{nZ9Bm~9 z8p?3y@N2r`K^ga_ZZmj1=qFk(z2urmF>y9rLamS3;;||dlonOMKMr%?^dBDDScX#w zW8d%9Y@RHr5tmhbruH5Mq^qYCjn2p7gFBUYR{9Kj9A=$~yTYN>^%X}i;25^t+yEgb zT2WPdCmyZSM(tUBWL?g2dj4`GW`-TW!n>ZdM@A7oT*_kE20}ddqp94GvJ0d&)0Z5Y z;er2NjnM_s^FZH92fFuJ;*RouqO;C|L`KI#W$yw=4N${btvR$JI|}J&CZ}`vF))>` zWp~a681>bW+>1+xbC!p>O9jVhr|t+{CcO~$*l#f}niS!g3`VhMgasg0+% zXQPNrKOTY;@aw;;q+(_)96P!gZ%->?yAn^yU9THtA)9fz5y5^h?rDLRjOXLMm~9WTSD&HpyC4`B>V+IN zg7)2Z@IEh)dVMJ1_64>A&5FdhmGbnB!3ZbG%p5K6b(_mKITC}@SI{qKBOZ{`pnLs1 zF(|>F>|9t#^y~<{cv#3$EN?=A<&w{Bp8I0cCiZSqR zF}$)L(iBU%S^8Q`_Ym7b>9+)j#`&P{EzZ1>`~VSBDclF<`S5wpO#CT46TZeAK|zhv zc>do`>RXaZA1z+T{P|Cc*;RY7&i~sS=CL@-d=F^HuOdYd{6PY)nBNES=GWB7P>ETA z`t*)#Ep~Fh0sppSI%jT;r4MynR1Y>+{c{ez)Mk?lY3n?iQI^q|SlS2IID ze`Bv}J}&Uc;(nVfq;u82lP=9b&h#DXaFv}!OjIcI?(ArV#2#7Z_*!FT{I?UY>eL9% zQ4>O?2zwMd_yBCxOqik96?7oG8wBrv1WDBuxK=v}VwcPV`7d*Ee25=DyOxr9GtZ)J zXCQVDB;wJAet4{PkjmfLO^Lq>-QB@DJq+`3?Y&@Z+%gH_4J@B#(NplrSxc|v=%V(& za*pzjb>O`EGx=}pTAI91nVPs>#PV(3)bx!uI_bcay&c zQDm?|7b?p0A;R;oc?7>7**ua8XV}@$#r$_{4zr7NJ}AYoA3ofb+U*#|osRXJa#7Fd z3(Bn+qPrxIvo0-n(td!S7rrG9_&u(0!=HRX1#1!Jq-ZlWI}u55CUufyb4B6y9c$7) z8Nv3`%1O!14CHKaK#gaY;rc9bdivHaP?tdZ&%}gk%u{4LLEAy@QUH#|?19994%(x+ z4PD|(=&i*o>Ft?_%1N6^2>a}=yH*7em#!1Z#=Gcxc@~-Pa}LZZx%hJ3G_s~47;LTI zbJgwFfR2qk@8nfA#&@?1dk5bPjn6JX+G8rIMx~c;{FU$qSoO;-M{5I9TB~0`9 zKaodAQ}Bf>;;lt5h?sf;YSTZ6|dtguie8TJ&3 zk@qLwarRYD(mHN6=_ChSWqvG)>%C2#PFM zS6L;AK3TI7WR3%_vpf$&&O;;}|ByAwah$K}MR4P4G%CzJgUhyG!DE+wfOqRO-R6D| zK8HL*uaNg_hHM$0o4%Gl?I^~q^(;fXRtm4F=7EgIai|uoghiQLICAAKer%e;OtL&* zMX6AZM$jtA?&X1zvjfPr`Qk;{ZdgC(916<)R~Pa(0V1uv$jr57;4`-f_)6=DRlGM| ztoj7Ej{gTU&PT!~>$CXf*bSVJ4M(r)WY&cqg_939(o?51$cx+TxB7$+#x5$vwPSIV zKHtJ^w*}l9UjysaHSoXk7cf8HjV!&94l+;Ap#RJQ^uN?Z#EQ?t>jF)Lj|zyf%JBQX zG%Q^^h*}A(Pj5M!SDKjtvI7DjE~^XA`xUY2?k}#X!v*@t-W10)&1vCr7ji+hn2xQ8 zhw70XpxTxJ2VMK&${z{Pt+_@+UurRJ{)@SBjn8T2nVTr2W6GpTti=tN{YXINPHscs zKN=qLh73+wLaXZ|QG5$K$66E!?~~H8%ufx{rHY_Jvlw5O}sw1_yrGq2rPj z%;`51ToGshPi+ZQ$f0CuAj_!Y;|FysL&ifd9~boYLF=^--z38GL@Fulw6GYi(TeU}*$jH;|RNL<~^vhM7laEmb( z`u~6x*X_|(W_ zgW8ATfy8Fr7g!ygxy_J8f~$h|4io$t89VP1xRZ zC@wGyhn@~wx^Qb19ymV9QIu~4jg~%iTRDSp#b41*vt6LZcMM~{YGK;Q9^&%75NeOK zVUzVa)}QqPe_0u`<--|RS!xCgV!xp6?#tYr@haF@cZQ_41Y?$B z0X8gnL09Gou>FTsaLKxmK4={Q@C=2`D#0Kqpa<^+73!z#mgass-HV|*r(pJFF3#rM zhVUgUbIvaX^S9WuytW8Boz1`%sOO>h34r=%61*ePUC@**!27cOARRs7j(t1Bsph*C zC<=P08CSyLey``g+Fu4{XWYSJ=U28BZ3qiS0;%`(W2h2gh%YM6!c|-L`F{3|le0?{ z8_x)mea)xYOxqBB^71$uDYDGlNd+d9%^g+Ga6z5hQFyMo0gj25gOr2{INJ=My;}qP z8W9FtzRxr}ir`p{78ASkCM@1G2o1%oZ*J;lJd?(w`jampY>qOs;_63uxvdPmm^Nry z^%`yolv0D9!?^EeKmI#84Kt0{ds(F^T9i(J%G@|m(JbX^#Low%_-gv9!I7MB4uJr* z3HYM;0re7QLXXu;j!{3mua;0IHJ@2j`+zT%D=MKCT19lA`#$IW?WGVB=Zdygmf)1< zg&hM)U?R(-foZvL_nZ}*6Ltsjt-8F@<*eW83VY@&+u)o0Y4o1tFgN09E-9590LQB{ z(eCM6TG*(Ldn^EbEmP^dX-UNM*mRy&qAbWi(Z`_KQn+Jb8yUam%hU)NlU*xs!c~Lc zRBA&G{*+uyPi$3US{+-6t|xn^T*59+%#D^TD`I-PfED$i)x1oTYj z@Pfx*QtrE3^uXF!da&d?{C*onUaMxX-nD+R-cyQUAw)cd*wr97NKx?|)X&rsXLihs z!p{%du;|C2g=^gu7&f1QRZ0ygx5R*Ts5oPRy*%!-HN~*#4`e%Rf@f|DkXvyIUHQZD zwebRWhciOf)9J`>)Qf*S-+yzH^_}pj!iIDRsQZqb zS6a*=bwlz;cLT0^y^2gO55)1@2WZ(}0;8*1Flq04?AmFII**ay-8N{s{UbYjGiGd$)!=u@lbAn#k0bRtntO0w9Zk%7N*)`e z(Yroh0a=!wmYEHayEFzv|LI!{IXI)+ihXsT##?YV%Rir?Tnf{OH>mbL!iMhe5WTDj 
zQYMlhal(RW{6T5yo;B1<>@ZcIa}w8Uyd!7NDv&?7R`aG;s57^(WTW_f!e(_3kfW8g zm>*G%Ruc*2c%K4u;PzJDRh$Jqt^&NZbJ=BCUSECf)ocn?A@qq?I|q+yVv3LQcM$3~jBpo^sDajMKHlQ`BXHmI2ZXoZ!JDppu+QlU zitW?`l~Xt2Bl})6mta}R7bDS>KZ^MM6JX}5w7_ucEHo6Tg0!Rg;C=B1u77`vX8y8c zF6jHC#?OaT)=B`t$PL~0Kc@F(WuQoXC$?3~p~;g#FgqcFQ=`fd4cefr=RJ%sPAB%x z3GBDTUt;TW0gLns@b4W3c;tAWb(3C4BC85ApUptF`wb?<&W8m57$PFm1#@KiG2Wg_ zX1DP%KVt15>XTr-_w%0+8?%zMy#4|)yYe}GyT@T>njXEQ_L2G>3!|HKLV%<54unq4 z1=%_adg`Vr^T}Wo&uePX9d3)5M(!25l;?@KroV763l4!EX-4rWxh_9 z(0xvsps>-A<`rw8jgB2@SfGUBsS!}?aURPb7n9v{cjA{<(WKo*fhj)QL%UAyBK~45 zPe6kUTWY(ZWltt8x6Xr}k8?rMx&lw#Rv}yDlz2l^ZV}u{q+Eju6{{HUO2(0Za!Xk=Q(odQVvZI)I!fCMfiQt7%%TA!v6Ry ztd59*?Xz9+w@WdQtUAzm-cCle6<~(MSI+YLNhAuxuw~0a-sI#Q__6i|+-L7ehIs{O zeXj_MwAxAM)l86!Ucx(L-b~}9gSdYUx=6~=Fud^QI*581p`f|6g_hwuScZ;t_PjAd zrm8U3?94(jcOjGJe-imee^c{2*=T&=92^^bLtbChIQeW zotIEc{vg=xFvfnrN#d&BO~mX~aL3;3X0pbnV0W<$HQ!#q==;}+$L8s<)8iUSteH*K zcj=K;6NfOXJ&IJ(Tqqg&Ol7A3K>Zqj@DaL;;uQpPZzphF9@%q+RDej+I#TcV8(dc# zGCAjF5w~B>Wf`6b&=Zl=pU+c^j56dXA#PVKcn z!JcJ}T#N7X&|AbFn-)&tt!AHPzD5N$@4KH1V-igF+3?SMre zhseD65>h036N2(Z>E7&af^GKnSY8b0&#EF=_HQ>D+;j&Xa~&P;Rb$~B3GTx(8{{8K zAdl8f(B>~J|7tj|UU@+i843Q)*?mlz4u8K6(VMmCwt0M595+tx?l8mtR6*WJ*&LD- z&m&mu4$Q+~?!r!GsK0WLJi5_N^%NuN@1GScBg!7r6f%jLttKeO7eR?tJ_sastUC69LdX&BVz-f!Bc1_*#FCe-!_@h zv+W8M>{SILeRW(f+6Z-H+Pq3#b>`RU|IoKI2xmJH%;C9nWX~Gl!gse&Q`!wyeHO(s zxoXf;Q{#;bE~QPWJ8;dZbZ{QKgA*xIjOEEW5PJIu9o*3YPufagaY8Am{MQ6!3Bzz< zY%9E4LSfr&A6y@!4N5{k(V$zBH)ZN2DDm_KufESjsa}!ySTqXXU4Fs-zt`|qVKXFu z?4;`Y2k?&XayY(71A2%nQujzw!#e?fNfEe!vy+&18Z!e6qhVWP4yc6bldqEhNaaJ8 z%lYjoJj>gU)0WBcw#`yQ0b6S-TRjiOvs|F;oC-eLwH_Aty#@=G?U~FnLb?vwpi@K! znY37jvnn%5)j~OJ4}VVVBLqOHz6rxtHvvbv=teq|le{)7jxn)1?70Iagi)6ibc_*p2@J;MB<6}&OrFnT(Pq~W& zSYKkt6yDZb1Gu!g6rxzB_)1UKi342LxttHmEJvkwiZriaVt~}M=d;I{9;Ix)ZsAIX zep*@#PHWrYg>)ord~+8n7EfoguPgwKXbz*_ssK|&MakQ*4DnasXQt5?_+qUZx?CKE zgWr}A2?qk)CF{^NYB~P7^n)mrzJQ>Ao5;qtA{wawgSKydNG=NOp|0%FsLtkU<{#FA z4X36v79Wma)akt-emI1htXs+)Fe)SErcp$qOqlt-;y5+wYohOjXEARZ5$E1Ni)P1P z!kW<=q(-BO`g|`Z&Emtbxn(ac7-xz3Y~CU>^dT%ed>CS_X7bEetDxQ0*Jxz=n1)WB z%Pd}53vrAV&W&XEL3y%Vsdt7T8)<WE? zw{6V*TqXfY?V3uUVIThEStyn z%J0Lazhy8HCJe9mEMYuEA6kl5px|XAy7a~$PUK`2Se%F;A1<%NyApg*pk0p>|2Qx| zw}&>K_J=+jHgllC&U)RSpk>};nBOZ+$FZCwn@f@ct2jt|cbjA!?8mhiSuWJ|1k|qS z^JXhsZFdISI?A;S!4Cg2+hVR%3A798r{kWlN3(7!GTz1Vzk@n9WytWJ+NebWJ~ zcDV|!udn0RwOUMo?_zTBekRTGHRN=yp8_k=k3gJe0e*U^!Sr08!F%*Nj~op8!YyCq z1y1XxF(v9XkhZK1q*b?)EuqyQUUUna_m5Ng@E|Zb_MTI!YXLQ1`FNQd7Ls?}>%i2U zi|>=FAnV?FvSvJ${0a= z!B9?Bz49et;MaFS%jOjPtF24Qmal+ZwgVT#K5sovrg(6DA$$;zhrYw1@Ff2(CY^{w zq0+5zrL-94#uTB5!a=wsntsbC2ePtb?a}Ct8@E-?-I*cjXr}C zY7uY_Ot}paY|r$VIMIrk&a66}1%l!UREN#)%r<@u?e|o0zs^cXv5X;uzy5=}vfr@h z+Btat;U5uJA16+Z48D#rpdfq)B=n7-;NLCSHOB6)s@}p|mJKg6=m%xuKcTWh5MrOm zkvM@DRQ%>4c=?7)zPUDX)Er&uYmaEo>(iI0y6-SI=H*_n>wL)BJad9>&@_UrNk2F? 
z$F=}Pbln6!<6D`G@mMie*%i2Ns{%FAKHaqmMC*RhzW8WRM~yeu@bHs zF9;vZ`vgLB1c4Y zEXx->$gz)I$ewi$bSQB@Ja^_}R9h$M`Iulr_`|sAWF}6zU=6R2WWjUh5k9}1#B#U7 zae4A=M*m1X$#XEGezi^Td>#iTmVP5!z768un0Qcj)TCJ~H^da#dGD%!wC{L5M#!8a z8@bmYH)A~`_x2u@D$7AX1umVO;sPhw9l6f8Z}j>15WIa?3p6jblF*V|ygFA0huQnY zoPI@;eAR_cckBU&+fTUMSsqvei72x&2sUfFg4n(_u(36td)DP45sQ+7^K&YpHa8Sy zbhe{V_;PT`ZNmBbo%oiI#U6Y)2V$rBc;{Yy0DfUp=;Fpe>zcc$pKJ$fYpv1nh8p9> zAAowh=QBNZ*KwA-6f?CooxYME0{JE}?wyZibo}ow)ERHZ9ZT2@&$cWwnN`SoN?3ks z(IuQ2%m-<@eW2bv2P4^hl=;A7EOWVmr-Pi(Z-E&cZMcSt+q0-woE@62SELI*hma?7 z`*3DP0m_z410U{XvaoF(Sg6k+&B=XKlX$?1u|8bs6+=$nZlPBeZ^8)KBT#?qGxU7X z1fh@jiOgYnSO6C=Igh>0CdJbl-MuVJN|}-R-cMVaLg=F?e++Xy0w#wwaL)=MnAwm@ zk9*A`nv3}H?CnU@K2#2NmZpqqiUaIumg5B4b$r-I!z<@|=$RBm7mHm+$-lv%9af4b{<(1r zyB4y$;cOBW@DpK=6P=kJOW^1>uuGaGH3h{aHQWRp$6s^)Cd~!6-CSB9Qi_e51$1P@ z5BMzFq3-)5vipe&@NIC0%N%8nNT4D#ADPBkx??KVxYe?r*G|+RU1Z>d7RO90f|^*d zvkQv~?)y38==YKBZP|pNujm4fY9t50N$p)t3}lF+T}&$|nb zcLGVX#4chf?~dP>g^_{l>xpNfFscT4le#=Vn9u%u-~2Sl>2MBu`edWG<&n-Gy4W3W`gJ}YyWX1+#@MT#`S5G8U*U$H%t0S43M5<%L-ed63 zKL|JQN@=aU7qPk?4ku*qquJe1EY%d}g=QQA3#~I4yko`3MJJG-d;uCEY^Yd z@fN&NI~SsT=^1(owZT0PL+I8j!7T3)xRCY{vTu&lwbzXC_u9AM+i(P*d&R=L&UK)9 z&j}MA*TM1;3EugD5eQFK$J}Y9xUK6IaSGZ8Q;Q<-+GBgzVq-!Q`fre)j||s1&5=Bg zdCHAA5=C!35r?dcjr8}xBbso40hxd1ux?2M-f_Qox-voPe`#~0L*^6 z6v`KUr0Z7c^2D{q!EjL&wl*|DCCiDLRE|U&yD}^b5dmLnxqL1-cc}zojq>!b!D0HVsv2t1owr^a#Y_i16JIeN4U2v zACE^4ZsAjZDH?hGG>M+M8k!co zfPMdxES~l2z}_-NbnP^vZ=QT3c?N&Urj@ZUK4wl_-cF;Rvl8i_cDBp$wjbu)^usz$ z_PH}r1>R3(EL~CnnDWvZ{K3JiWf7q6U4^sSzma7wqP!mhbuhfeo_rhb=5`&( zrs39SpwBs*xLiDk-*qc6=yp9m;Vq!wKMb<$f~6oZ>puST`-)43Q*kBhY5A$6kFhDw z@QNu|%#^mFhII)R1LcLd!-XH#^XsC(?ize5HWlx%=Sq^+D~uM`L;3cZn7UyJ6Z)`~ zb=DoA_ePyL3il*%{DA^oc$`6gh%dL;+VKcycZEP+)_*i{>Q8(iwFMPsEy35P9iYfd z2{gj0@cWs5sA1$5f?hO*5$|x9(yM?i?%CL9n2R-~I9f`tJURd)hJgs>X_Bf2- ziBD@_u9FvLCpS}p*U&K1;Du90n&8Tvau5Jd?7z37PA4~*Q?Secd7WjjH0E0E zn&CGneU@dgpEKYF93G*<6-w~7(U+Uv;03SbFOc4AK@b>q8n$pVsi!T6lX`eAERIUS z;9dE|QMQ6Z{>?($sVeyMRV?@#mw_r{4Bv(~;$k&F$hh!{2!_tX{Z$X)XYxYm+xm&V zYd%97=e5(-OMF=8o^+j3aVhxr2f*d2WAL=S3bkkOlkXydXsPv#j9G@jQd`JTQE+A^*Zv3GVjCOAZ9?#@L1C?72UmjpsJf8lHSZOqHb zL|6APR9PheAGfBFKB*sYLBI=h5(Ka-;03Pwm5H_Khu~wA22R&-LEi4|7++M%2}+B^ ztG7GoTlQ|U{ZcNd`Ds8l`#tk?;|DTtH;*QDU9_-K2!~GDdE~a~ZF1xNRCu;1vwl`a z4Ttw59ABAF2k$#Sh%%2uXTG^X9b3aO#pylWru~jqHQwR8(wyKJnS^k1WOK2x;}UuJ zA_e0UivbVT&{^CAMDt9!MZR(_89JT@{=YTx`^_XA*ZE9eKDbDK_iiQzecdEK#sOCJ z`QXQYMG#S;3%5#H=ieFr`gcBu@E=bTSDNHtz4MrP^MN2NdgTGm8}`6z_vw(g)C-31 zzQ+vtN@{V?8xxBC!0Y~OU}32^$&vi!MfOKD$V>fRi;JhLg?q02c5d{upBBykNXmr0fq&>>R=KGyAOQZyT1hHL{P5mG% zYzGddAL;5>acC;8PYRx`;uOEHh2AVSj~Sl^@84d8=72Sz`kZ}Na(+Q^$~)+6I1RJY z^y$Pf%QM#RL9e>`%rW`jtiR+BjlF!4)_L(DY4HdQq{L#*U@liX=P&ol6fx*j{7bw2 z?_u5xBhJhN(|Fqg#py$-DZC@9!8q71$ny!GkM5;!Nzs3c5pvgH@ZU$c?*fuE%TG8o zN+I^+S6CB#23}a*qxTOxvGa|1Zou`KJc-*%RAAjPDBJCTAH0`f;DwE_aM~bEnjVNd z>O~mePc?K?WgfcED@Viih1esl#cSJ|n8vRp!K(g5%|n!_j4OdF1?J4i z>JwaN-)o?D_6_!YP@|8#UP0vHco^AW1Q$2Q(Vf>%&>ec;sHD~~XO3_hXJDZdw9P@( z3^T*lou!ae;X}pJtT6C^6>JflhP788;Ylq@bLLKAf)nS$V8~rMzOovaQ*VHo-A>G$ z9N?j{KUuHeMCOO?M_cVw{J3fp%=P|D?d5cEW$|Hfl=X*@j$Y)XEheuHdcoD$C&Zqe zecyj{604KU;mMIal$t(_w$HA^IlEdqZ+Mih``$n}uT1L28qBcX*V24JjWc>*kc9mk zBjM|n7%t!HNosjc1|D)!z*_zt*?5QLM~XkEgOxR8-+>Zz^wa!m6{M^vySfmRYvW;ic$WoO0L+y434t93TM`y zUtGJWhh#>=X2^>UB*#RKb5AATrsKZ>#Cp6bo0Ls**c}ttFa;x}F>8Gf6`qfYf{yhmN0h`3JQvEVl(>Izb9lH(j z`9=6*M3i>?cL~mCBogg)4?t!l0#EMsz~ZLgbim0NXOu;Amu@Ua+iodFa=#AvL>>W) z-c-75+A4h6{F4^h>>|EVuOh^V|-MEH-GvqSen{GkLS;ZIfDW4hV>pg`0!)EtX+7p`Zs-kc^w>% zVs|trBtd!l5xkqf0d{#VNAFi(>3!$>Griart#vBR*ZiAyf!yJPX67MS4M%5k5+ 
z5UxF)P26{N#lUp;1*rp)^-h+8+V;6KTfq$lx4&`ra(sFC>RKugV`m1?!wjQVCLx|T6J85XSJXG8+qwM z+M=UyB9fg6BsQbb_X6Db!;q(E_W~Ep>%^+2DWJXlF+7;-!QI2o=Gw2n!~UcDNk86! z>Wyz$m$WW5jB~@{<00^0NC?)mJ>Lz(?x=UDf_Cpsh4MBX*lfHXcCk6co_%a4L9-O< zU&`{#4lBa=@*w&U12I}#4&Pjlf%wr2;C3Pqrrs6jDZI?1YWlahy&IQ+xWYDKGjtFw zj4Ltm(i!U3I7a>{Mq$fKcCMw<4UcQupsj{OT68UNFb2rput3rpCrOy?Z|j%Xex}n( zdf`!vDc6y85WWr1B>RH*Ga>==7`~Egc#hKxU+ZMRnf*>mE*$~WlJ`{5r-%qif5gJQ z5zNm2u5v9D0$_BzHBGz}j;10P$!jMk_Ss?gD9kUi$J!b?kIo>&H!P^_hiWw4`kmtx zdk9BYt%33bhhh1%64>_YFz)Qz#@TYBp1{Fn&@g8e(?lJ(k2c0bw)|x{y=Z_OqO0+$ zem-r@nV=`%`=QSvB{+JB&qA~yf-Kh!q>f!-tcUIk93_#Yx#Jd!80f*f83OMp7uHzc{a!%faISJY2N|w26;?n;D7p9_C<7lRgM+4P!P!4Jrz}2uhVKu%ctX;RCbT@v53By z`4V?Z1;EMgHQ3^sYhVv%5SZjNp=nSpm^*j+|cy1z?;mm&1|B0s&KkslW$1YeDl(MdGcOf1p z$_vKrgXyk_M&hiT0504^5bFP(O#C+l_Y|JOUbVAaySy^mVziZdNd=?&f=Ey>R{}on zY^Lh`B{Z<>=WvSa>FVpoJg3H;kkdGmd8J)^(5aH)NX716IK0oQ=$3rkHY7m_g zQ*fWb6H@%#imnV6CkjWFq13%};xnrYz9a-fmv$z)?|Maa&0IL*I*;IGl?kWXtrgoj z&v1_ae_-Y*!+AbZN%vVfqnfDKPu5)l&p%Dzre=X#y+nB$pU;qE5gNEodl)@`^3lMt;)c8(S4r~sWNIgU1Fn*@ z;5Jyw-j~P8idA<|FYO^2SS@4n_~9T4S2n_g>EaEttuTq9B$zhFa4C(Lh9f*(bKytFxebhYI=CODJLQpPRDm(Cd&-yeZv86{9*V2P$9 zvp6cBGN_{DezND}aqfqYvfwItA2f1=!J}dkf=nY(a{9{1HJ@i$K7KSb#Q`k8@TmWa zAP5_LPCS>aL9K~VvS+6|4nI9ZCsW^(Ey2ck@?J2bI3Z5Ho=L!&&r7J#EiGI(Qis_j zgV^|XGjDuP!I@K5@GJBc3~frq>m_$#)hRKairQ32R*3>DzB0@b+<>aeIgqyEFnU~( zLdnz8sAZ524>SzX{Hg(%TAiipdoGfMcS*SS*>Sv*7X`+-J?K8K4{y|!;r*E(;I&3> z!>5p8s(dgE&KLB;!3Os1`Ar4J_tsGF$1Tj3z^f=Sb2g;MD&RVnzmp{}3t!CG2s=KM zVX8wb*NPWLe>JTG%Q0`zT2%x1~7DX&%x^Pu*D(uX@M%%>Z!*G8e*>Ex&gUrJ5 zdDC9#&b$gUjK}Dx^(!(We2mH35r#S^0Vg5O5C{i~0wsk)1# z&Y2)R-8s0aeFt8;cM)ydBsdAIXS-=o5$mVz<+KR>A$NmAVDszl4#U$}MbHkcZzwq^hCeR*W6G@EbaM3#6lJqB(MBl{mp_BPJd#3h zZ_4E!a#JFiXYYcbekVwzs-egAx!|4S2X{SxF+uhGoEVz~ICC@$R?NtUmS=A0=K*NX z!e1WdH{!c1i+JllrIJF8Qw$ySfZZbc7!h_AjCXs`e<#`?>XIqT^RVEY*Yl<}0}70u zO#`tD)&PYMpD}9lBF^l~>STwZ4|h}icZPGJkJ*?$jnmkf%Y^Bh(N8<~5#MTYTp=OE zInPz)IeZY}d_9%LxOlcu^_%NKn9caBM+)%_gLUYpsq4UDvJ;A3%dqLB4%gz;3fL7} zhS@&Gs5Ewr{&v3!0{5q~4B20>^qeqJ`jmsl_2DK4EH6q;APgD=ljyE6Z*oa*J*u#0 zzrP&@aI*CtRlHxuof%QX-QW@d8`+#$pXCNBIZFZCb0Sg0RGD1aEzk4cbBgDao#fGuU`X#4#y4=EjL5Q?8TltHC%Kfn)$IrH2J7+A11!ZMxg7kP|T&hvRu_2L{qK5=fBwjKmcQgAJZ z$BfP~+@b1$2TyK-qMPxIHM@m79LjE1)UN?cl|7J#ugUO%izIsB8Mq&F!BN%+=uqie zE*?LXSN6pjjN^SxmS1+pj`Xi&L$x>HWES3;d`W(^rLx+X6*P7&hPYqLajowgl&PDF z1Jl`^Bv)%R_N@l#s&YoO|2qD@@dl19bEO-CjW`)QKH zUjAZl|CJ;srNkaSblTy&I~C||)Pv90ETFf1YA|}Y5O43t zE#Ty$2+1PKXptC)q7SBm)VL+;^!>#fR`b|B?;O!$?`DNfSxqj#g8_hSdV2i5|A|!1$-~{@cGw8$Z_}pR)w?Br@EiJ zuOl30AIc(|_->i}`o0L>UVKi24HKy%uaF!ztsp}c6<9G4Nu9PTV&n!>+}f!~a?k98 zt&)ALMp8j`pK2wC{%yq29w`!1TF1To;1XAQ^c}Gqy9Z7texQMTFfi;3Lyu-N|BMvi zVtW81TQ$Jy!k&1`q!250@|YLtBix{IUo7}>hwKe;!&L@HpsRK@(OX35BQ~2OE)Wh9 zMxroL7{XmLVLqu{28W|~Sw9bo zn^#c(7kr$zU!9)NrSg0+Y|Od z!F3)n{TU9~iwnuOt?b_XeS2IPno3T|5{|8OFK){Xq`sBftS)g7zt0IKe^)F;V@RMm zHzSDj+p|EUDpBjiPPlepJEVTf0UNdxG?MN>58qpkg_jo)o7sAp*BFk6li4%PrC<2s zu?sf)YofGK0azZ|z&u)i9#)-U3vSt_@SoZUv%KjS`BI<xXta|yL>-C)Zv6)E`m+@$a?|PCwXNK$rAKJX zA#vW(L%O)~!bhrky#Vs^ld<{sQTp$dJM7ta4U%3|Ly2xTF%AO8XW$=M?`wo>O-*s? 
z#g(Ag-+=j>B*C|r4}>+!xPzNFg3|DHs`sfC-qc$`!W{=}4q8gS9n~N{N*_q}k}qWQ z#t=NB&SM^JE<)?`V^sPBV$;o!(0@7;+*e1#KcjkB^lJgEIh+FW3l9;yh9lfV#yq3~ ziR4A!e>g|yJo#ufNN+YRh6`SyP_r#wxhfp8!YlL_^AL^uDeK!me!$}PXT=Tbp-?M9uod7fQ~gT zT$|Jyw)^G^48m$Do8*X@bdY*$p2^F)}Uhxfu=q43uQ>g^x{ z53{b&vg1Zf_5@qw+w_xE{nv~p#a8g7F9Gc=MPQlxJWQ5b4xfhw@aM!h<(I!jL0Wf z3c9Mf$2!Nk`CWGCz0{WYKJMfW*c~IUlboqhND;I@@x!#$m83OCnQ#LB!1TUl4Qg)d zq3yQ~n5)W?U5bSm;V6e0FLf#TJWBLCZqrT2W8nAARLnRji`k!dlEzPBq|Isr6TW*3 z9d#cf9$Cl1>|-lVy z_-u~{GuO)oHmJSBj}D^{-LL?zsHpII?u)^w!57-KJs$UFNbnYFh`{dSYSit*1hI|m zfkELF_;*S;x-^Pl`@Q9~Vro3}Req=E>o$X~v-!~Hh@HEhVS#(@og$f~An=1aj=Ncb~^Yt;V10X{F9Q2G^Q_Iq+}ic+-Nd6hP1 z-(r35btGe>FddQ?6kzLWVl?yIzTjd-@<`>S^(D<%$|&0@e} zN-fKueMcfr{Q-k^V-$4}#G`hl*t6Xj9~51oi@%tVrwV0wsd)ee|6ayC%~|9?wIr1O z`46@^?uFo-qu5t$4LnnKbXX~gQE!Ftu0{iFTDcQPBDcZ*SNk!gJb>u^5&@ypKcIYS z4vw9U#I-*~p}0K>re8`R^I89CpW-4KM$Wk)FyYR7Ld$?HW@dB~syF{Q5(kouvcKd1FpxbUf&So|$+) zZIG*BmjY$cYeDGQRL=TF3)pd;OS`w62dU~(y!ZGkwdJm-77b!Np@1B2nX)v`x^E%w zab$M@Je*J~R}_2GPEfio5i=5F@b!#U_;zX|V>L@0>Z~RpHXxQP9#91L{#2AY69RD} zmhAW5o<4c@lq^>0goG>Z;BI$^mIkYX>UbcoSu_gFSRX#FI7Xwby{X=*6ddlG1$Fy^ zxI$%uY`=dg1e8Z}&CIsK#Ljbc@%}W}^Zp@-T+JZsP9DI-)p9)9d=4Gg{K|cmm_+Sv zW?~zD$MlFnGQM~aTdx}7-?(U$yp@iI?uszXW%DFQD`BgQ94tL<4t1*wkSo(|GT1MP zc6H@+o_!^XnB|hSE%o%BdKxHQ_)2!J2qRFh2Gd{(#6 zgvmaO)!!0f`GK2oT8Rh!bGSGb^bEYD=HTxref-hk0#+-G=(5yMG=KIP_8zo=25(7@ zxE8x5r6xgCZN+$-LJhFiF$E60ogW7+1PTL(DLB@PKE_R!Z!|Ed@(~NT2 zyT%g`QjTCZSjW)UB^<*^0v?JEN8_iW5TGZ<>#VyDd*-?VIUh}S3znm?X(3Xr2Sn}d z0o-%A9((-R`*VgObM37u+*Rh`+{p946h-j|sdB z#F3afsLR?5rJ|i=+(-g$E{=epRqV!gVF4z4>f-4|`gBso1UKXrbLHl`Qpv58*_;>k_EX1ax5%f;b+CqW8RKhmQDi^>-1e*k zzMn^M=G$WKv-%XOBg-;<6$W5*p*M6U?1qK}aXgoQ1eAwNu+&h6-qu?N+3a?_x6Cej zwvF`+(-hhLmuJkuvxczy&QrA8ataSuvfsz8TPdeCk2&h0iA$Empr!h0e4Oh8HD3*2 z)njAO&b|N!X2z)T{xs-nv)p3j5|BJ!SQ9X@3 zIYRG%CUEUWnNB@^6jhyv{-&2msQ{PCs~UmXDGylglSiGSWiZ`nj0#*$A?14EI1=~* zUvFmpu88%-L`)W5{8@|MHiwYE--Gm9tb^^pGvVmPFm&UvTS9kl(Y-10C<2lr-F8 z+DhCe@4(-8t>`2729085u~VfIcFq<=&%HbG>$c0#yT63hd>o0u&`xTQxquc}M?#y> zX|PP4!RBNEUfXMfvX&&)TdHHi+&bzuDF*AcmytX7q&WArD!8sioQ8XG1`xGOn%zp0 z!T_T|JZ$U+_D`cBO;3qzzqf~FC>q0v4aM%xPtfCV6?Ym6a-%-nCSANd0QuWs{P7IL zy=Ko-VH-(qQWh=pT#0Azx-t<-fACLb4o;Ev0|UbxhW`T}j{lA%%50v>re=gLmk9*> z{U^w}kGU8pAjJ6EbICFp8J;=U356Fc@N8VhNnvjsmM;$?yYyHV;NHhn^obV!iV*;- zpzmazWdYC`TCj4NKkj)=$x4YIRK%M=gU4LXPvd+n>}PjuuDZaE=s?Uf48z5yaSg(f zo8hEP5>>M6fUxRBV%E(C%J+cQ%ZS2zIT7AXt5w{@lrQXkVO#yLYiZ1(&^D&F;y(Jf z&F0jn^^u-J8K^Ff#_OlIp>}i~%$atcD3~6HPUi}I#rFg_}$5*y%pN)rAY zZ^Hb^)ud>JJ6vjN#p_}%C>U@KQ#P)rBN4M<@X9UZTE>Exf+VPHJ%e_gYP@*Mwe+ph z7W{tc1&R0^3_iB2IgTyAVAJ?#40@i2X(QG6XrB(~o|OZSpe@kkngt@ELAbr88B+5_ zFn5|Py6q_>0Sym9b@?|k5L*EWo8m!nlNzeDEyFo>=b_Q~3st>eL5k1YgTgK`@+rXw z6|;OnL8qTC(&OT6b2hu+ruVlXO*c8y;L?&%SSV(RZ^A$e7(c;<-@`*9|Lj zE&9C}&-wq6+N;NK`W4URU$$MPF?muTdPxeE$%SK0+cOvW&-OBMDeNiY8Y#l##1(9F$#@fJTZZuw~9dyf7F{Tl-o_ zccv_sR!CsNRtBcz&jx|#)hOMSO(zbroX9J}wDNKo6jZH7-S9%Jnv+1M^|sRJnsiY7 z&3Zx&W_T@S4yHs4!;H)km}2P1oKCF6a5@(yMP)(ZP$=Y0H^$1PMpW@yBG?W;2Cv#p ztOr1$b=44gekY1-=~CiF^?k$pH+!+RDwLKLE8-%PpO}2|V#Ay*exNh2fcsZ6hifj| z4I4tmh^Z*xcZWH6y-pAO7R}`;A3Vb}jTYq!&KQQdTFvy_D}A)@dkWSa3!gAlvX!{nQH8QC@7UGKjp97Ww~NZF%d?vien6?#gAw`taXykNW; zub=Tlzr(61x%xb|ms6S~YX}AHL)51?494Wn;eDI=n57$oS;GcU!@l=6C-22?x6dFa zCm9Ob9AIt2JXk3ZgG-06vAm1Vqz0m?v9tnoPDY^Ul-KajPZB>ZPXm!M0o)j&46{4t z!m$Ir)G%)v%()T+!)7y>{)xHhQg@r{Zg`94uRBDi{bs$w6lXd=pb~e!w1$48R+^m~ zfUl*Na8bGr zl#VK6Ub8S-|1c+8mFgJVKYuaT<10DZyaJ6mPRxQ`@o;`hHdZa$hj#>(pxwO*XM_ef zl$hRR`o(|II-!NcFGZQRKx`Fz@3;tB;tgPCr3D@=L-n+@2^hSsXN27n8S8IKXf860 z`y>#i@(DwDO(#RiQ>yiJh!ht~a(Db@?~+#c=;6e(+GSk+9 
z$Ade#>ZTC&>{vs3zxbl!mKHeuppougnuJMHPtl!^Ssh={0QQL(kVD6gfo}Lh-g3+B zL|G*alh)R-`P~boy5J)Y_N|7{xhY`j+l&jh0?5sHgJQkGSfSqsriM=dK_7)1|Ka=~ zPlnvE!t#*&z#lu6H!EJ8{3jhxGz_G94`yj&Z<{5qYrc=;A?pR?eko8EcZuFOKp>Q_%m17CLV=fTULm7?wIhi(c~Ja&aV_Nj*ZU z9bVI*3`h9zumX3;UBK{?AXwjd4y0Zw^7hIZz*B`#I8!7I`Ne|ZcRGi-`yG{f5{bTs=buQ>- zUVv3Str#MDhxAXQ_*Q*{oO;z(UwTazKJ3YZ-=vbw%bcMNcfR8%cTFt&8bSBuv6~}! z7F&~)m|DfN#Oxj0Y3olULUKIJlkFvGJLhqpO91`rxEpjw=g{<@vFOzM7+dCBz;`cW z>;PF5_?nB~C+bMmD?WG-!^6=mPyDfP79^ZI$ntm0!MI`p@~`y7nLDPDP2Y_+~`hsY{PYqUlTI zo$P~3t2w-f(zUqMY&Y%-KZG*UZ?Q#pKDjbtf`7Ps@zRuY^z2hHRPeimXF1`x@MAua z`Cf~qUy+BgoURB4_4^7OO?r#;=PO}VKqv`3WrL#eYT!4Vgtm@%sdJz< zoZY8EquC9?_v`A*USmd5?Nm7Ls--!WED7p-|814_1DgVQtP(4j0DE~UR?ci|o22kpX6zgWiOM;T-s zp9-_yzoD->lOSH8mRotI1lt?-(1#x)S+=7Gm=vF;i~dT&(_ug2>2CxNiW}%1-IciY z(QgYK*0FMpFp};tUd1gO_r~4rmxj8Qwo7wN7@wgpkmAS&(PFpZf z<`80Z`1@2Rtpjc3&s1=Uigko;`Pgq z=*w;kn0`%>zB4t!*M^&5%I2%s9=QfbmrUm!iS=yA9`9iKlPi%EcN%4{nqchb=U6_J zMI+}#P&MIM=q&w~`937U%&lvH-@ESNK)pHW{jLC8KM`DVM1=d@N*?!n-fn?x(6DCObD$-`KkiLO-Xl-XCCk$P7HVvjO+JWHPobWuSTFJg!h# z3m~7+Lm2h__;E z7BPxBgdH*rx$`W6WGuDD+WI-rN6b;y<6%QfiYXPlF4C|D*jA`&mW|Dep;-LzMhN@@1oD2Pe(VtX1aMO27NNyP>Izz#w4_OX~NCK zJNPbyaAv~gd&}4yT^N`qE&{c0hw+EiMfwxhqS|I5PCyU4FRPn_X~n98DQbz-rbH3c1ucNcA2Eg?ZQ#%NZ?~L1|^Qd*zr^W?K!vMNwYS2`k{@kFipeqMbq)K zdnt~x{+%z|ol$sx3OgO9gX1DYxNVTbmDxB0f`-mAO(*`*UmFt9^!+1Ry)lzMm2;xW z2kmeftCN(?PC>&P$B1iFD(cR*!F{*lKtkXPHj9eUtX`H^HH*EQE^C8%bFNTNSAA?O z*CaN-FTf!G3*vLI9qKbzK&6Q$rbh0-MGDd+?aVa1JIfz)_2nT&el0FsI>G3$I}U2e zll1obc<$b9*T}<{$#`NxDfRyR0ADp`6013GD6=)LA^M33YA@-8rTnR|_4*_+b9TXp z=nK)3g|KmQBag6|LAr{i}H(|^}hvHE&34SMp9cJoB=aF`mWt@j$7jm*c&n)&eG z(^hh&cn&s(3-V6%#KXtc92))bB#bPGXZMp??McQKUOb3}gMY@b@s$ilzF5j`gWP2L zc$Rd1`Yx*IVT4DHe&x#PcYy1%6R`VM9#OCs=N%J{Bf+q3`IV>-aJ=<4k?Xw+{+;7Y z82=Q|FwBFKWen;cz04f`;R1(G$U(k%8uz#EO}r7sa+58c!OcUQmvt=>{eqonmFo$x zJ|PKqR}Vtx*JErBivvngA*ge&8)4N7G`vdDP~ar)cTgk0D~=G~*EOWO;W!afw}vn= zO(?y3l@b5=n0s?`5T1N#gk!stL8^8HtxneBM}um(-84cEyVTKDM+ad4U1j*D`UYh3 zC}g1$%$&Xk@|MoQEBtk6C)0_}Ird<-=@4!$NCH34jri;DBaF(iM1mGfmO&`Eatv7yM@N@qBl*F_j*ew`X=Y=Ey)PS{pI%$-P3h6%qCqEi2l)|Eeol+HR-6U+m1 zy&T9BIK@2bW*KfX6?p5X7Qwh#FZ`*Ahvct+n=K*X0FO$D}z2U$#(w zp$pI)v;af&{-baHg%Imyp6K=W5A=^}^G@hICIg#-p{Fwl+GhNKgp@GU@>PV47lm|c zq#BN&If|ZXQ^9#@9r2Z9aI4!D;9ac4mt!^f=UXmxvzx}w*->;=i5Yrba--7y^H5?} zFm8Wz0@{3X2lU+U@M$G?gp=-#^kc8Uf}ad#TWd6mS~g z(c7|r=w3(1S<_UaU#;5&eKPRzuHU%$CFf0q#nPd)TapAR!tem6`= z+QQZ4u^8CCnHrcEbJaD*8GeUyD6lGJEMBsk8M_xP4~^;dyv&P%+@6i#KiQREqiFV&mgDD>*aP)UG%gZ>*m006Qx2>h@-qS8-t-KJp z{q%zsi7&}4Pk9phO@MQ8r6;YfSPs?-C-8##HWK08iQjLzVY{0jeiM6y^)nU{DHVQ> z>(Ub3s1QJg`LCfnEygvTEDPQ15m`U26S`6saK28x1~$VRaG}-%;7q9{qkC4;{q3w? 
zHQ$sBKP$n><}TtW!)~r$QRk?*dg5a7MWFJ2fLsxHh!@&BA*d$~y&tk!k^mtNzehaO z?=^+nTE{8O>7rXS#j$RmCIpy7kxE-TlKZ);c6hxW-nsOPuBu;&vVUg+Pty|yG$$aj z${I7`_t77`YJ5MngDb0}4o)+oQRV~E4H1`dLAn_%QMpL=3}(?ClAj5^(#>%9m4S0o zHdlSp7NmsUjseLx62QB0^9_TTtW!X3do?UQX@ea5p zUxQim`Jw279X?C?#QyHJ$RCvn8N=~d);127=nC(7&!FSyH+(E{kqBl~lDo;m;QlcI zMazZAP%1yqp3T~wVeeS7%{q9wc9d&7=Lu=vqfCq?#ZlOO1Ly?vu&X8nVv`g>r(rH0 zJn@1aAWz_rKq_om=mgc4;i!3_4GKR^aK-m!u^yHo4FA={Yy*DIl{>)w^<-?Q_CB zd5a<%;d>MK$|X=%TMi5H9sHSb467SUk^iYYt&Sd}Ovqy{skdPH|2hndd%@Thn{e76 ze{jB0k2UOW@;u{F`mt9Rh5n z;Ks8U@}HwR6}pp;?QvR+dDS>ETpI`Xa!PUQz$0pP(uho~{lPTHTqezl$~a5#BATkY z;FOL6e1F&vr(-WEF#bczA6|zCCGp(*&(1Ltqpo1nI-R-tdLjAPS^;CO9IW{?hsH?G z;E6nCvq3d~8?55?vL0Lr1ut1R6fnVk-1wIkd|iTzFRvv%7Xiho-T< zefk2NkPgN_#p}kQg_lq^CMgMx3ZuJvu z%wiZdi$avP_kptcEZ1PcZqjfd4X=vN##Q+qWa`r_^0E0%jvX?7#LjS=QN znWYV*t?xnjdnovqR5SJu{?Ob!8M>@hnB16t33bcj@lP{>4WE6ffv6AZZw_W&Dg@wz z?H5ry#1E_;qQR?89b@ahqO)Q$u}~4=9C)>mGzotu+g5DEFr^eY?p{p({2u2n-|CFL zHD>T%i!7=9B+Tpnb`q1Wc)>+kYq)VY2CcT;L5oO42Qht4^MoL;|19eXu9?ktEWa>b zEaOgH_#!j1CV=>!UrjvZ9DsLTo)@o>ji+DF2kkXQs6D-z*=D^OqE6+2;Yx9ZDkRGsStUO0t+E zY8%kI|0lPiR2KP*f57qntJp8&N*W6<)w&)lL@~21ARv60c9aT0va=3Md?L)lZ*jCx zjD1ddY7ya$5olsMN!==*!@VLyxH0$uyex&WPq2c}{!Z?K1NUjwKh}%(smBb^=X_+( zjtv6dsPiBcR5zOA-ojYid@GY>)7g^h^+{;Tm z-+m)no_L$PVzGa}N~D{5iKL)3k9&eg#f-br z%_Z8zbBYr#Ej-37e5FB-zP$tvf@aXj^~4u9uF-NWG0x}?F88Hg4*FPIL(A$2xaJW7 z0gczG@be*L!YU0 zP!BF2l;fT0)~9mqsxZW|GCF_dql!ryjvGb8uUl5Q|FRk z+d_Z${v&>Q^;9(3nHaraP8T~Yr=)&6KJ*TQ-b0r`PB4n9$2!1>_G1zdrj41LNEk?E z_w5X7!01~m#8e-GqW@NM9LwzJ^mkeiB7X_>d(JZl`4w@j_bAsfLx!cC&p2 zS=cA{i6q}yNT)62!0%KWnBgkVv459Ey$;oZaz#Gow(o%2RqpV;SOT`+zKh#f&*zC` zas86hZj8?+KA3!W4n)-Kse0-lzEy|;b=B#-nkq{7KJTYt+Jdk~#2!NH@6+uPYj8-y z6jT$<(4yNGzbB@m{F6S?p0Nc@52{ne!XV5#pwF{4xJ*iGpL5r?ZDKpZYe8vxBi%UE z0ROf` zQ=i@dleY!%AbSaZs?Nh(LTXSwo&5~bj3H0AkAC&fMo$wj^cxHUCq7Hie9va!SK8oR z(QWkGba`5)*}?Vu@{iSppAe%LN@QN`9o3ga6`S zi|Q?^v}ZA{?@=KQ<(6omse(Vf&Z5}L>5#cZmDR6>QQy9pkt<(IOy+GOnjt`|-(=IR ztS=w!7@b9<#&>i88QjP&c*4!4eU#D=XP9bq?@&%>Ur6gAA z0IdI-LUZ2V!)1e}FgZ#I7)im@<3&_Ye-yv@EWx0WNuvLwlI6lI$KLc_9MNduN*>w< zPkxus$3rX0PWxe++3H9v4QBzzGzQZf)}nvF9F~<41+C8t=y;(C9%y|*8#C>3ucReR z8J4Hpx_8q=z6`4U^bTEQ6^(J(2Z+hc!Rb`9qfwkCqZ8~WY{DPCOAF&>JA*`80 z;ia5E82QMPhgC{gwq%S`;UscDGl42^;jDfz>{@65D~iX^Du?YNbZVe4zY@-pk>;FT z>j4?gvYhg3W)PnzOd6ubu;^_(c+<8E_TP}>nMI!@ak~lZ2))1z$mwzd*ZHCB40(>qXCYed_lNp;wZb-i zbNudFhrY)xK>GsQZCdODJMwDLM79{KIXT>GLWY=>XNg;d<$0f6q=|657WpJVNnnjh z!^78Mv_W2r)BfNh{_*AK{Epq6=wphCHO)pp^AA*M7D{%H_B9u;EGLf-uqrPg3m>%B*ls&|rZFBT! 
z>7QrNxMdN{mTx8%3&&u5+Z-sXTt_?8kFd&EDAtte!jXVpLSC9d*XnWF(R>;j{dBO1 zWyHq%&u3RBYhnDUIH_OgL>iR0fpCo&=(xQ^7qvpvJJ$_1mS@P(ZJkteQz`u>ScAHg zqMV|9Avpd?4iRg}*{9xMw8PgV*3bo=4t#|_{Ypqidcf822AH3BgW%h5;B4oHhBwD< zbM3f?$@T3aEO-7SllVS?@MNQ$<-RcDK!B()SdpsxOm%&Jw#!ub!8yU+)BKngln^TT;-S-hM%8&7;V%7`2GGm1wxf^(}9 zSf}Z7;`F`|&!sllGXv3@V}LWP#i;Pl1ng8EW4rn4eApSVL5H?KIqwpGE`m8;S+BM|DhKez~_GS)rSasmi;smOnP!BWL zP3J7jX75Q|T2$hA4X7IB;QTQs_;SM&LQcn(=9 zjvUOh+X>xsGzmcVI1CHhu3GnWR~*gax`W|z{M4+oUDtAIJfRU zkn4;AkyA$?jLn&g4V@+2Dh>Ltb2p@~3T6@*ajv}VCgN%yjzZgdXz`{A`in^dJ?%RD zw8973k$Gr6+6?n|Um)=I#?{juBHGC8Ev3KB9#!(REiN2ELW^b4!)|6 zl3BaAppf(-TzTURsAVakx1s?AXK7;Nva{U3k-6X%e+DHz3sB+hJ$f`gjJ)3Zol1ST zB6}@oH5sN^6t+m@E_4d zzr1F0wPpvX_JzWd_)4PGD28?KQ{k`71UD*o8=n170p69G!!mOx(B77eO@j_FG*gGp zuT+Ebzt_$3Ou!&Cd}|B*o4kSKOnF1CW~2j`qlBRg z)i}5MLqO*AHYn32*wqt4BlI)D@R&0?N|lgDQ$(oR%X0j?Zx*>NwG^-2O9True%{rA zH!$!=3DyN9!3`?Okyw_6LH0(pal=PoEJ~@wFsn)K-;KfLN*Jb@3i0{cWSR2^y!<$V z-Qf(wkFTphqgDbNE7Sq6PQa@pH_6K*MR09<98|xX&e{F4j;IX9&=)bY(aEU_W4rHD zgJ-`1vlThIPcku`?;RN3j-W|u6J&k+Oqi2=8>Ko<;zh6TM6AVzUXQFq#R~SlZ{rLJ zPg-DQff-K11M=3wgwvAR!wslUha=mCoyw1^TE{>XNn%e?ny~bo2?@lXY=!Z z7U_a(`Ax_=!G&*Iu7hm26z8dGJ2q=8bj>$j zP@e*(LgCDyToWj^UxBudFX5!C89it^lcOTwgbN+)vGVSIHv1e7ei6%YEPfNJCFpP* zBeGH5EC7d=eI-+kpQCpgAC3e}<8AOvr+soAjM!IQj^m^on(bF5&ul(0tGwmGW#cAz zW%>|L`p3fSEi6lQc!;caZ-c6eyU54OC)QOTxQ_-EC{lIKj>k#R`pJdodaoU%b(Jyu zMLBhT?+2T%mf-w9p{Rc-jy%s%GPse4fhoO9gR}n<`Ncy+SrGm7$}ntBK`_<6vrc4!51FYB+IwCw^`gc!uf^|6Wp)+Jnd4g^G>>+#^`#;z!%6n@Q3Ss$^ zG&$ZCq(@Tl`EGU}>_QpbV*a9r77tFyIbe9mQK;0EhPVEiCTg>ldENC&sp@tMLsb7#OlT~q(Sf2XK0$a7Rs~l!LZ^? zuzkrcP_cJQ*EUsRFt~`@{^C2G7VFM#p0O=~9UDNb?-LffT*N68+fXV_59IcpB>Jjn zQ07x0gsY^1s%j_f9eu!jQ(giSUqf))mt;88)WO_&z7Na1LRdbZDSFHdg;(cTFX)IZ zt6ij!3y&VtE)^->5kX*yNp+t6*^Q6 zfxEgZ+aX{%+dtT6O~-Q9v*9D*>>Y3XI|m%?lsLcyVZV16bLqPS=f#?w@%=-haDaU)68} zDr`fDc4`kE@!1D6hgeUhZ9P>L3L&?|{(nZBOH>|)v3}hWc;-<8o(564WrPoR&M2in zA0MEp110cYzz6;oYqOfdU0l{-!?FO|==w$;s;nKQdduzE`|tyDUbmC1X?X&wQCj#h zD-G|-nPZ>qPFSvEh^~=VR4d5?l~Ye+=7w3MX|WO95x0lm+hpnGJq?VtgbNPk9m6-H z77!qo4CNNA=3FF;U&4$aOd%aBoGNL0O%z#6mZ7&sB24|82^Z7+h|bY77#uT5ZmHUu zWCb5Y3sE;PR4aiU2fq;SEtydEuophKo8xHGO|TSEhAxs+(Wj6#_U1eDTp4wpnKOa^@x`Hg{+C4Ty zesCIRTgy$d=bafAu0MxK$7b{H6)fdlO8m#oEFYk|UwXk}o(0}ajDrV*Ge}ICFkbUC z3)^)A>y4Jn!HLvshe1-M1 z3tQl^?MX=V4ua87+tGAYIkC7s0BSy)rS zITqs=*5l;KQ6fcS9f57x2nSW-An(#t=H!tf*c$hT1glfx&S!o&A-b78)(9gj&C?jV zrIWId#pL9kG_uX_prDWYf;69yAxn11qO*#?rS3?E)-*lwg4h_?GR_+JU66w{tx|Yt zY9!RoG{KsN1@Jg=5ROt4G(=~KaNE>%q@I!uvet!a^Dro<`On&J&A1rqRGuoX!d zR}5r!K0Xv#!gszJ_QnW(M(`NcY;hdsg|8yd=D%eZ`U(5GhTc%6{0L1(pTMB+=5QKw zK%*DJ@iq-ieXYo5f82ulvF0FM`2n3?6vDg87iC$d|56<&Ya z6Z^bAjRu0v*??dA7(3Ax;x^93S@+ejsCOzFYmdY3g3U$J=RDNZB`~fe9qi?Ii_dS6 zC5cZ>p)NKa)Z_9=5B;yAM|U#7r8*w$?!*dvGbaW26?0bn)C``*ZiChxndss?8&)^m zCL+HEOseVw0gGRd@msgShNr(+B%dSLIF+IQaTyxfp+qu;`j@cm4w*PQ7H;M~0u#Cb z#u_W3)9{s;Sv?6Hh6wf&_Du9s)(V;h|3x2R3>y@^6)&u@-_U)g30yj)f;%r2*}GP#uK`vcgkC;OgIFq;w2D zaJU196ewZkArq`>o6F|x&9!A`vt3aWP;+EY&aX%Np@*ngAZCuVCbI1tm{xO z>iy{mk_=5Ky%!G;6Eu0Sn++bySAoMDP6{@;;i&#<47htqlFeyu_)g|I)Ytcd3vH9Y zR1pon&{n*0_W$W8FPP0=I@=xW@SaN z*sXY#cf;7@jv|vAGZX!FP|F@ch?D z;=MSOl_)SMjOhikR$~!TT*2w%Y0|B31e!PsI^~H0IDB_9kbNt`;EXCc^3DuaT;Bo~ zvbN}=FbJf(>d8=c4%R>ZLX4k>;DBqjaO#0Ju6;cR%O)G)OqW35t$7gBwG8#5v+&i0 z2)2K54-Eae7$a|$6PacCWK8*c=IgMCG_z>DJ>(nlORZ)iwvY{=jA|rK=zJy0U3mm_5^fRi6+=ZM z4ObEK2sfhE{Sk(*^8pLn4Z?iJh!>5Q;fBJRNvUBf_UkZ&Q{zU!65G|_zSafT2tF*` zUL0dtWr-kdrVr=Z_m`Rr+*9YQ18^&QEsQF4!QnPTAY!>JSgdzqB@N$D?O7gt*z}SZ z%*ep`QZe9oY)t9c*%7c*Za599afBoB@%YF@QgHA0N0T3d{lw8IcxN5NuSIF{C% zH>IYK1v{ir?ZGj><;6NBaEJtPrBR`XIa!);d!vi$&gF5<~?GU1U>LWATU^J(-1O8hNiJ 
z=;&esPVb*iOdiIw7fMSpY?&6C*PO+*>&BGk-(;n?lDyEaV6R}C9|~6jjli&C7W=%n z1p21FC5Hue)%fLFaPn~o*0-CGsy(SEkKFkL&O7YQ{mF)+i(-I*v>hl@neQ8^*b{QZ(TToJIXa-=BXr+FhgTa zO(D2cAE?8y4LEOV1Q-`(vt+4a@l`_!bP4?__)I<~%14dyz=JQK1JeLlZCSI z%BR8V@az{%Iy@8ZYn~M?xR!(@poqo)6zt#)D&USvqS%pb#C(@%QfTl1tS01Q|Ht8I ztx<^kRThXY8vB#b9W|)9B^d&zS%AXc2cmmDD9l@ygq5$(ie_joA+6V6h_;>INA6YJ zXBR(yW0@W6AZOqXe7!uGDT=Kzx2hi)2VNlU!W_}_y9K&ld?#qHdIMYcoCHu{ZUdKt ztCX%)%_BYRA-xLC&kn}y+s`1`w*lR2N+3b=EU|x*h!Q2{@MilFeERhS=tV?BZB;yK zdu5?U(r74Z*ag`S!!V(FG?vkM@c!pQa__kx6ufdJ7pH{4ww&vr`OFWbcE;kqeY5ah zu?w`FQv?;~0`b(x<8e_)GWwoDJWv#b*%QXIh&0?so@t&zm|!*?uM zlLQ+}b>Z03SV51c3hm}b!IUQ+PdP)J?9xR}$Zx|re#cDg%}(FtHbBPKHol}vooQbOVn2^x1vVj1o9NvJ%Po zxTz2&9HgMxe+91WRfQIU)1$4-Y&PwiG%8#WXKR1q?K`dbI{hq3c`0HW4h%xI(c5s-_#(V8>HsXAJ^BenPv_pZ%zLn&YEwfpsf)bTHIu*aS#lZl7FVIUW#_&R0k&i$Y>P{l9w1qmgpM(eO#9|W z$nI^5qg&h9<<+audi5mSq8kc_GS6BqFrJKhNoQGwWr;Xjl9Pr7i@~9EHuFsw%xfb; zq4%*;438^gyWI5!TOrOS@}Hz`;7VB7+bWwTseoyGLP3u_Ki43c({|0iO09^Bz*M5^Ji=! z9>ZUVS4*rDFO(XLkJ63sz-}4(wwA!lZ=b}v-3QU{H+Qm2Tco&s-3d0U`Yfbfx{q1| zABwBZ1%`LF!lbsl%ssjeb`70?aj%1k=8DN8aa|tFFKrk1SLsh`WCTvj zv^iM%Djz(JW`nNUAe=qj8QpD&6H8?4~lR#T|n)s(QHMS5M|I zbqt2y%7J;G8^z(C+7Os@vLrhYV0y19_&K;AG}XSvCm)p2qJ2O7jOfmeXKyI+p0EvP z`Gzsm%Z6l=VPCrUV-FscIS03nk^tZ2Iikv;QP5!xxJTMw>>3v%KJPUWi+hYA6SqBp zO{QeD1dnJL2`( z`{0c%ef47qHEM;Zu1DhCch2GUZoO!mP?u9Z>af_)2LnF>^m~zsJL2s`B?IO7+`t4J z^5zk|&CA4%fnwr+Fb85;?O-?u2-->?^FJTNSsKWREP?!HW>ya)rwqXn>2dji{JmIL`3&l!K}h377O z7G3K|f@%UyLVmBlXw-@Gsy!#QRStoEf)JL-+TttEo&j8uBCbmI*6bDQggCC;qGY_H7 z8>147>n!%dsX4Z+j5%O!?_BKTs{x*)#xvO&(JW4X0y+(l!wrSQK=l42zL;^A&EB<( zH6HSUaD6?}Lr~FO6%kE(9O%U8CFQU`(S^xR?hoQyN-U(QC$8601&4#ea(ClC45}E3 zp4GH2A8KjJFE<0~5goX* z8G=4VkmTJC;^_RV(B-c!_#H08c)b?U!K8lVl9B;G68KQicwWHz{ul^b(%uV<78zVQ z<(Al3p%-0YD2dYJ-eSDB6W9e`6|^)8aL3Oud^Fhy=1Sfq8#E(OKR*}aXH|jRjTZ6d zyi!9_oNnXl-*z($A{p+ zW!tUv`}L$;AsX_h`NQ_SLwGlHxL}jtiYkli$;Gwb*nqoB!K&&2*{$^EUY_dL?io$qDJC*RBB4cXWs_l7rVo-!ssr} zoYI${8Jhwj?T4V*u`jqx*JegDHMMd$r(y!!7M*9agtZ)5i{ZTMjvbv9upcv5??Hz+8CvRfmTmM>p*GgVbN`h5~M7*bCpjyea4k zXTY>Wt4Z~lQnXuo6fBe%;Qm_zGpc(iuCnn#^FwZM`p#X_f7c`&H9N0lf>wl}2Xp{* z8n@s&S10Bd9fu=@{ki6wPLQpUhrNf^k}=!kMN`C{qFpIcaCDf!9NTlo>c^%ui1T<& z3|^(NcfAYPn*FoU@8?`H=FSz&PThe+^>d)A{VIWBMo1-MV9ezSrKuM$qv(Si7S7ql zmT4=)O!Zz+cLBifIl-6Ft1+woov3QSdouQXJVf6)4Dqs;aNU{R%p}Mk8_&$c;WKVn z9XHOv#ZKW^YyTO4=B;KU)ek|%*EHySU=6Kt(gH&xN-!CUf`D_Q$j3(-T*+4!Ck2UM zY0rAmWGR0*UuXl*`+369m@t+l@X7PGhTc|T{-mHiP! 
z+E`8xmp794^EPToJVMS>Pf}f}6r1|bn_2l5VZuUNtQdWa?(TIa`4cIq`R5NVaJF^{=XIGJat(UD|T3LkSFFE|UBOF%dU%}XOQ0Y7`}-iHgfqhEWcqz)L8cT!@Rh)$o@j&uKVON3Cyp=ApI~xi}%cIX|2>4=Az1&J=p^VKI3Z#*nMN zHVu6$rzpuL8rCy{jq$6)rIYl?{Ki~%``aDrE7Bpc(pjXrC<_xGs`ym*buZy%>!Mc1CviC$%>%d4h zMP-0;r^>UxGD7xlZWXDN-=~La>zUHU8g^#t9MqoOg%8`C*cT%sHstpR8>V>E!b&sH zK2^{DOsJwR)qL{bKVC3f`W|N(!uylsPQepJ56(hoBK%I9DmZ?A2(}9L@yymJ?h}q7 z&4!)mNb8yND^Dh8XHR8W!bLav>Lbdr&87&}ONtBD zGyRHs?7j3H*ZjCox&~P^EXstwCAm_nHM|!fg|3Ssj7zYf!0{TvtAn9`sw@)Az|1c4bQ(!`whR7^fYl+ zG*+AmRMP0YBne?#00t?zLXUDj?)3FGy*}?kUNRcVU0MW7t%cWo+BmiEM6o zxG*$pxbRI`0ox^?%Vy6sWjkkU2|r#PA$&4@tuU-)3JbkmN;$IrLhWk`!ZCVbEW0g= zU2>OW^X`6R)&?@d51+J!Yc*nN^oUA&GiC+575$h>-uGBvgkTMcMH9!`a}<(?J*^{#0t8YFU#8ht)>^>7t`k*X2PYe z?QFfEkZ8kr_G@DxWlz1##PUj6hlVeEpd3bLYlS#tXC&K~BE@_)hND&CZ7Q7k8_RZ! zvOha6(3XJnG}$MC&NQsT1Ly9tu`-2h$?g!$-Wkq@eLg|Eteu&Kd^XcOEXK5ZzVJDw zne?V8lCld&&?O09nzHQ(9cap<6Kg~1yKW{8)Lftyp_R0L)L{(#=Eru6jc07;Eo_$c zVcD;z)3Ya6C_c@aF{=@*aghb(4h`Yz(oO8d=MgOJXE`NYcub~qH&cXJ11%6+#IjDg zvO5wVsO-%>`et*BS*={aPKGKm7qVuB)x+7BDG^LVE}n_sILd-dHqg5VBj{NCTvk0V znYa_T@!HdVdeOdsOxAoM2em9xiM~a@q${nGm)ePf4NAKA_u^+Suxl?G9J(y`R zVu^umR!Fr?YVS_lkXOC*Z{G{rrzBv$8_!e3S`}8LB4BwR7;88y$DYS}v(KK3n3KE= zX78I1rlO+cV|NlN=X{35=@sD6EQ%2>YBU&C3q1#q!u6j)boFr(DvemntQOzrc^-8r zIzyRFdLl)SPXPP!<{X_{s>Z%;xkp+{J87^nhQ&Jcl2mRQ%P6WL_0(#*@N5b*24nW4 zzk?-R?8L-rhv~2RZ<-RjiOkh9iRUkn`~)Y8x&Me_+nZ_0?kBWn<7dj+xriP}HB;GEhEnz^4dVSaKgv?_D~ z^U_VDad(z8>wA0Ir=mg@9vwz^rubuJaVE=26k$VWg7Loq{tQ!i93z%luo363Q^J8B z%Dg2?wR!wID03mp6iK0n7e3%2%RpL@n}KtO?vR{82U)~CrAZ$}SeUXU4fk2fGK`ei zuh6g5y4sPMG=@+>@E&&V`&wqb=>{mu%`#xcij z-^sf!nZ3HuPG_!lu~pZPv6=9Vm5lyQeo@oORed(QYp%tV=FDPoL3-?i@?F~erH4|# z`m$}?@30*mY0i{aNRW0*dl>ga-Ko7|3_A zc~Svu_E?AKoJLa0_5j-8^&FiyE@Weejb@ILt005Fj*mXk6ekZ-E4ohp{NHa~CdXW_nX+Mn-$)prFSH%q$=n0qQg8BI zTE>6x+`i}wL;a4kKio^!8R$dY+G=uf94Xv+b2%H7Jx32$1W;kP4sFRj$Z{WVVpV%b zvgY6-T6O*zMSP8AM$SSek)q5#Q3yM^!G*QN>}6IbkFd{s!&u8Y0c$p0Oj_+Dg)>~M zSiH;-MY)wRnMJ=T|8^cbyQ!NUoz=strt6U1*L&Fa@H_nq8cS7|)rpNTAoG7;Q7-Nx z)jxHoHRppV7`><{I}_F&PholG8f=eJB;K0zg>KZykh|9k=6LHNwS;8Tb&t)oOx2zG z?#c@5m*+8qXEAho=@7-u=tI$nOy;c{!HNdbna-ao`cRXOA+fiZ@_`a|I&T;a+p&W! 
z{_hZNlW=8I8p~M5T{|}XuOcdVUBk^fLDZNdLSKSDV{>ae?($fSntG`;;4y|ED3Kgw zENI2nKFZJ!U^_nDrjwpu>1o*&(hQtJd&?Er$p{@5+@i~N?cGSn!p71uwHDUF-`vn-EK1B?H;(v)9Ny@DE7-<&r4|41J)yQU7*n*ARPPJemlJmM&2S-!=Dx@1Wm#BtsGmN#?ItPkrRgiA znbEu{Y{Ia3;qDFc!cBGIELZueyY_6OOYb+eDe< z5)G_Vx<(6Lmf}FaGuy3DKwAnHQJ}XjZRu>o8}mh3q(CW_oOqaPO;m!i^bOLYu4C=+oh4 zJTI)098Xtc{ellT_tawAF(;B@-{qtFqg?8UolVXM)Y#r(%kaQ>WVck}Np4mj9WAb< zVwW*Y_l75l3!l-qY%Ml<=MWvReNTgvM1^5)vCO$}H9I`PoLNiBu-cF~Hv3K^8}xp| za`$VpOBF}(&H72q@!A%0G%Kbh^p=iqQD?JUXVA-uYV6irJr=N%-$knU;BA|13akB& zu0Nk*H78E@`TTH#^=aO@97)bw!pN}k3w_x5gjPA}2$z4k!rr`D!rn+HFr$P2QEfsi zJASO0H9r#-Ze9Krr+=J))qz*Y<7N|CTH3Nvr3Xl`V<$fYexue?_2ls2DBBz|n$39g zmUgav$@Uz2!?dIF*u;YS%;S(T^OL#8UV03%LziX?FRKTmf$C*^m)bxY>XDRT@gD0P zjA%mgGgy;RLF&HJWDpoly94jj!f8{eE;x_f@JMBWZ}XYq>y1pR(vPWI!i6x1QWY3hPnX6G8yIdq-e>eD1scS7QyB)~lUBa06^h)-?AdyvXenPHZ z;jBHHKmQ6A3bk^3EN_HF=t~WkGd=FL!J>7{0o}`g-dgUd&o+8{DbQ#*&NVcV{5pPD zQ9Z$=VDeL%&8Fy$VRGYt(fj$TEaclRiVo3duKK^otUjIfL_LGy6LsjtEemGS_m1|S zwWY!`Te>;_8!Qw!uw7J1$t%v&w9j2QPva-v>U>Q~z2fZdhUskjjyDwQ5<+20ifnOu zBOP_T#C|VwVsl%C>PSrs34g z3Ni$qTNrVT5~Vc76Rhdhakfec=$En^GcT2) z3EQRFovsnA^8QG+jbtcjbRE9ec3_JgLddn&A9q!2QTO~t?3olsgMkz1Vv+&Qer``` zRrS;oewt0cL(H@>mAUUwV4EM9lE?ikEPH4b(-t9N_vV+l(q;;+@}J5o5)Ej5f(RK{ zzs6x6rC4n6gHEn2r3L>Y$bY3G(`=tgVjTr+;k%!-thSlFl2sT_j-qw&%_fJ@_wmkrefEN90IIp^lKnG&pEdt9`A5WKC-bD2Uk~A%9S><% z;3!H;wZp7xqw&h4cblBi(K1Xu9$|Vla+@Z6Z4&c$6ESjpm9W}*Il7H7G8n3pKD#neX)K#nS-P1x^Y?n;`;x(9q zVgn8Pr$)_?j-$X5BIHOH!TM#)Haa4e{IX6d8B=)mpgOd z?AZJ1%SmKxFJ_O5WLh~TWX|t3Vpf}xRp~2~nb}SY7Ote?E$Q^%N=m+`UGGc8p zN6BrvF&St(QTIijU2tqEDMonH1Gxi--S`pRj!xVIlC zU3y33lym7(sS@o78ld4vPf$jvfVOUsWPWAEWW3mdd44-d%{`x3*!1=6)$v&NZ&xl$ zpR%6awm-%q3^SObgOYIj{#ZJ1djaK!Zd2^HcG6P{BtHtF?>H9rIG44RjS=!%It>2P1=qF?V!W>y^*amb&s+tXG{+iWFOVc% z-?5Y%@r+I`Izl;8TVR{nM%o-)PN(V`=ubchE&8EH6@pFlC#Qf`Rc&OMCL?&3g(9i< zOS7~8MN;ZmS<3T~r}eHcU{A6H+f==g(m$3_aQku6RL-X1$JervHTJCH>M-`6GiPlP zg>+}EDXZ4^Ws2sh?7^dIvif~}M&QFB3UpdI2Ad($o{O6W%ka`tvux9>T>4z zcAJyt@(!%ptxmlG8yJg^B-`)Lal|8QTGW3ZU+z_6JKrVIs@NEcU3Qm*qeoD}!8z=t z@HgrH;Agt(4zj-5Ox2eanM{2kU3lcgvds>_>}S_)Bn}jEmt#2$%P#`Qt$c1ZcP^}f z3h*0j0`-1<%#8g*JCaw^k6$G$uzf!>43K3fBG<9|q|Ao7Te7n1bmn1K%sA5|FswQM zKla`_s;cj67pJAWySrHrXYWIINF&`yr%H(hQX--tB`Q*4w}Oe?-6#fPV`3|caQF8U zcf9w0?)x3%_ulav??0D6&gQJW#@cJIv({X5KF?fpvf4ThvG}5RsN7@|DpdU_EAg?e zHE)as)xqJ<+InjP)xP68b==vUIuTq=M{H#FW0!jr?~`Lxw(u|&T|Pkx9@M8Tq=M-1 zDQVQsVsEPafhs*a=OlAJXoTu;xJ;$6Td8&Le^9SZ7|}fnHT32{6H0%QqMQtb>B+U; z^vQ`&lw+YCy*;#o+S@WrneOJJ8;&faLywA3-PagD+Lqr`rENYX<9C^Auv4L?KPOb8 z%5!SkbT+j(c9iPEByIAmmA)!p$4*UHO&d2nrLB(LqWkYku;m>D*D$^h^sajg>F9M|ST7fsTU|fC*_xM$VHak5k@d?sgteK; zOZsS{m}Pw}nDxt9gmU#Pp-%O6vvyCjr#t6^b_|$Ho6dYqojgBAVId>)lw@gXzDM+s z7@I^+W)wst|R@2j{ ztGE4Vq4V+YWzV~n&35?khE?*ek6PGsgBE7` zv--_JYQsH8N+j8g68l*~k1=?$`UW*RCjTKzx916~cUX+BwvVLFF7%)V&BP*zoi?;bTohsYqNx6j|rPRg?si09Ax_3c6^|HBxBJbnqgRdj0)a&E)udokv z)LePG$m1Pd(^N_Kxb@O2@7<=?D2&p_b}eHF(}&cATpqps*jZ}B109-I^9WVFJe-pE zV$++Bhtsp%YUs=S?zF?LOe)Ct34O0{27T?m2A#J*kpA}8la_b6Lib3np!FVh((Laa zD4utERBJ*rExKt1EhlD8IlZi)^W7g)PhacNo(APKuPPs1f5e62IMYXsxQVb|KJ}uP zJsP51)R$A|`FyD&=l%5KK_8mybSd4<96)$=!HU}Xej$zWchrhLd3qU}n~ve$NnQIq zlOFXpr}tm%rq!PllY;rn4HOQ%Hh#H#(&(USsNs*+F05+=Eyj~KqXR&REv^r+Fw zwy)GY*PqlYb#XSAPZ&KuJC^pC&PVf-hg9H$PWl{OLwAjzqAxiuX6;@x)p~J$7VF&l zT9#dzIcu*zi#ir1$uga@gVk7gopmqq2my1q$q{DO%#_8i@xqxPTgS1vQKf| zr7LX9s5yI&(FYvjX$7@1+P!K&-M_?~9yC*;!h20AyCwCswT&DlzIK51_53oHw5b7A z{aTw2XkE#~A^X9q5K^ZVLYGlx8+Xz2`-kd&S&#GwSoqY82d3E$gQbAvwy_V*R zYG+jhr&;$mFQeAI7-4C9%wy>}H?qtwFu8X2lu)92jZQB3S~$}(yPi$BMcJ{dBFu5I9=jc&fDk{Qf@ z%Nlq3Q?MgVf;ZBGrA^fPJz12}ST-H4qCn5StVv77YtzdPby06QBIvT8dGwFeJ#@{{ 
zt*ja*eq|@;acZ$eEo<9(3)awu)vSz&{jAVhLal}q%T+>>73a~T{ch@AEx9K=dOd;{ zbSJ#uYfiGm4W>#E?HxH?x)N%Q`lsIB*5#2zLkD#_p*6@SZy zDn4+ZYS$X4=A9d$CW$;f_Vy&z-|&f6bGu3x#0Sw)e*0*dVoBO`_ij4d;2yoPPm*1> zaWQrD^eReC{RaK+s0>}`y`H+V`V{s2Nj;?Ql&0ioHmJ(ot=o)8kH37hktg0pnd%&WDZE zEge~!+oP9ak6xfQNxfq6q+Mhg%KFeM^lB<>K_5#c^FGU|TbwfcevNVxGNRm`UuG@N zKhLskv|@>lB~vnS^^}$6U5b;crnt2(uwv3HD4!!0)C{pO+Iy!km6ypuQTL ztm}nTA&b!;(<&%0YjIlIcYta=#795j&ZKTEl%bqz*U^HDIp{|^YpK>*9#r8@H%i*6 znhu)1ikkU~j~?fpLf@Y6Nl8xar@bu8XmK3@+U9f(9a&~Yn@tI$Pe(M;d{(`5_s$)Z z_><$5YQzTmF-wb17oDVp#RF-JJW;xMS2iuD(Lx`qilDzd+)Vj7ouDGQH`6BTXVBlw z2WXARVESG!2Q84fgl{IL^c^iW#l%0T z{^-g&*7cONh&soTIJAUycB=_xbt{H-z2pLGB9-B>)3~Uk@jP@_s1)^L&Kg>pN~X^- z9ICQLJYC7lNe3`CC(n_^w4c!udRn&(%{^@g6`yv7zBK1L9dL6E%`vcsem>(o<>7Ic zKDM!qruYf_Ot&yiSB_9m&HUM=Tnp)v)pP0EOR{wJoF&xfb3bYI@LD?kmN?t5*@Q0F zji!hCOxX8|TWHPf4*KEx&2+;%2E%QBLqDA_z*f5_$+i_wr^fd&cG!&FblD?ey7qP< zWyWY%K8BB}I{ifYsImxMEVO|-o087jd-N!4{mf9xGbV|8^YtsszRR4dYnEpvjL0dl#Okq;o3is-yAr@`*m${;(~r_bQZ%{qU4FD(j|s>w;*@ zBirau@!9mE!9H5?`4M`7iZFXl36n>O2>_ax{FZh45tCz>Inzvrp0dn0rLm%p4Y3?2 zH5orH5o)s(s2J0WltDuc?S1_iwOT`#eq@tJoxVPmI;)UP*PT?SRYKb7tl9=@Ff^1J zUYbC!-0Vkx%9N!KW@yqH+SSyp1&oY16-GN8+e)VfucT~d&S&kHHl#0EMA1HBzEr1* z6aC=8S}KJSruSu<(<({Q^m!pdDYtr40&?jz^*w@4?vtUZt-kasA#S?wN*G-{HlLP? zY@ka#PgAqr+0sAPE@Br7w9ymB9dwX!9X0ReEb7KiEB50nnKb8iP4;!+LDm?PN4PrN zky^8LEi012y;C!7S)0V$SfSeksrd0@tixvyQ*AG;sJazGlyPG=9bk5cI)CI7^|`#0 z6*cDv6Vud%)?DC3aU86qKN(%44tqw>?|9AFiym{btGI8{x96OrpKP_K$*T{vqK*oC zQagd&zhXKSE1kijwy4u5!uV-U<{e|KZ&LlM-cT+1@$|f(LzKhGaQY0dEA>$36sy{K zgzD1VLTxQO#d;}MN7a2?M9KOuq%QBPp$yc-DEEj>v<6j8Z#vt+uG#H!sR$H*a0Z&JCynsC?;e3~k7>6HxU9~iRO4mc;vI*u5Wgs(;AN%z4p%rF? zS3a4T-R=jeK3zz2tbqE`GA!X`Vf!2rJZrH;Ykn9S#4~ZXp$PA%reOZ}e2fjY;O6y6 zY^aRDyXPy=*`JTJx)~5&A_fk_QRo_5p#RY{cm~VkT*w@}nrn|e4y#bp>V|VdDrk## zMCZ62bk^nK@|p!`FVV%_$8vZkSAnuM709*p#}l_Q49(5Q!X{6+gwMwnjuKReOv8X& zDfC|~L5_qEO8Cv-c_Id1K00EWy9A0UHR$g;0-H;-v0UB?eYW|?bIwPcpC?>-b)bAF z43Cs*F-<}gvqWnVqGf=0@)?N!77VMa{Vp9IN@j!KQu$QtovE+*l@J~zeQ(55qU5> zpa$=UMd+B+fPu3UB$YLgN0y=aqX*Rat$^9uNSPju`S-mLezz9q*6N_TF9Ibk5jb%? z6FXPNV&S?}taD6)t5PiVnVbS1o7_>La-ZbaXQS)tG@J>q!BqicO#TwEmN=x1$qgx3 ze!d;+Zlv<_;s@yKA-c! zUF3=-*UO>#F$)U!tFe*ghgkGC( zNBrV?C{5wVkM+sex^4yd>h1BVqzZA{Eio*-4Gs%UA#}F@`R)raxcdpQ{l3@9u>mvV0UpP7cEKvXB8wbbVS8nG1zwSt=t8wuBaZjB?j-NxWiScf4|$m9{JDt`DcIs>dyc5CD_m3m6KTmX@9kJ?)ZOv`ahd5{j2EiiGN?O^{;v( zC;xrDoqyMp`FlkFD)`?n=l=R0F8}ZA4gXbdr{ce__v5d6$CUnkJ@LP~e@*${*Zb$Z zlKL z0IP%Q@Hp#ivA3fdZtX6xu?fH%TmHNthlH!eJG2+%hglhtvqkbymjdx=Z9? 
z%VVN`hzs)-T(K};54Vn+;Q3AoEZk#<%E$#clk0>k=^teG4_yQ~sbHt$79xCfI=)`c zK;#uYJhxHC8%qH!zZ`+7iT0QuCWnQqxbQk%1JBi?L7Eg`zA6kluf(By;0+n>>n2%K zfg2{ZIDJA1niq5M!JJ^1QyheDm_p;68tR{wSr#OZ5QqNDBu`oy`%ajnIQu;*$tlC< z5<~11S&xg`4iT2JIMSRIp!C`p7b4XlCvR=)yF3yvM`tl+u{SPl^}$x_cjT^?G1~Za zp%n0eY@fVI3Z{J}j~69iLbn)$2eo0WAdkd+RZNS1OIF1iVx@HeOr&SRYA7DH(dL+< zF9i3JN@UEmhQqKq*o#@XE$xg(e-@&LgrO6tiuMtIoXLAg_K%K}nd=Xdfm@Hr=}pE6 zVsMNTS~Tw3hhpsgw$#HyS{w5s1mN8Bhz#eej}R_Uzj(Ut3ve1Ajx`?f@8Z1adCo$ zL;ZlRwID*)`(aC|BT#u5+s{vsqKVJsm6t4(BX}*YmL|hriiy$GVgvC*l?XakjnO4M zDEQ5RlTVGoGqW58F3(B7tSqj`J|$!KpAjCeCnU{-0$;K$W`*iukvucjk}ilAtw1@4 zF=RA(@!^y{=1a@tOz=kXi#H0X+WVXA^T{8VJHyo?_|ebk_} zej27+dj zdV_JpiXU_RRv}cg1|e6&F|axi3-}aJ*w%!@cl}{`<|e{FsNuD*K8BwYj6YJd5Iot8 z2)X%q>RySqj8<6Kx5Ca6IBWQe#1Bw7X}lbM^;!u3>4nK3>G-i)6G5}XVA$i1n=_=a zvDgOTvqRxoScDn8TTv3@48>mtSTGunBqsml_4qsF%=~$Xx)YD3mI~nGmV(TiFU0S2 zD6TvOg6w?I$;1Gp8(m?L=xo9rZL$A?Za<)w#9$rRR8_|d%QBSP+n1}j+ zJe-`HjUO{)Q8$f1XwO_r)!upNzRm~D;ux5VwqstMKGw9kVl6Y~fv0xDYf}f=t=fqc zBO5qyIl=}m&`nJ-S!}T$zqME6a7HhNj?Y6xH-&S;)tJNJwFzG}v82WW&l0@wRkRnI zhO-bdeHONyDaDOvW(YAi#%*C8w2eC>n6DVGZ*y4RT1QaP9*KDQbZGje;;bx%p%04@ zxiJv!5=-E-JREuBE{Jc7huvWb{4B9U+yzlQF`Xn^1hNq+h0ge6vYxx0 z^yE<3^>HyW9Sy)fn}#}bz%r`W>9Xb2-EXi8>mWUq9M)j&HjGj;j7oQp4?bn5dy%;J4tT~}LFF?y;`tWh*Be_Tn`a~Hz(Zp3xDZ7RspxF-fW){1q#y8ErA?_I zI_xPpNH{^FO<|Cripv@;kQ?(xfyOS(d!U0UxzV_>0UZ-TVDCBZ#yEtUtZ zC3{t*5W{{(fFRCECkl_X{ znBl;0QPkY&t_Y?*k2>Yw%S zWIz=dGCCbB!RvILL^V&4ZT@V0^?6K`-|0c&u^P7S_&^>Xcu6+B z-A@)=uZM7vDl}$9V9R1@q??QpjnyXLr#f_LQaw7_rzQkc=1IZh`M{Y3B)?XS! z?i_F-fCtAvUW(%mSG$}n;_GZb~b zyg1pJgAIQA*k&z{%lrPAu&egz|g^EaFqM&?;Yq_kyoIIK>4@ylMJ1sVl@$~74yphc4S8|AvR7AioW={D8qxqgStXE0Gi;o*-r(}?3 zn1F~DCA{3T1ke3MQ6j0vtf!A;)P5#7SeX#s907$U0jOF=Vc>8CGgn>lQpz4%YZR?w zgSU{3MsBS1|4B@kJ@IWYKOVfV1m-zmTK#$S{E%nfBN{7{Ik0om85%u&NXoV~(PcMb zSs%{xT<)AytCRK3Z3Jbei&9R~( z0$1**L6g@8c^9KF-#Y|5Rv92O)C;#81E6Y?fGry*$-YiTCsHp(-lm249?6N#Q$0}o zwF4^L3rU)}A{56Tl2yNckhq-@Siac=($T&+J(I%PZE9GkC=IorfV#qUAe1n=ViTiN zCu8?JH{?B?iN$Xm@h!p5YQwfr_%k|N#Y%?1I#h~1TR5=JcP5mTdBC-|2QMu-v7gz4 z%@wtw^Ee58WBkbOWSQ=6PQa@6tvFSm3T{3|?oHvu?ot(4+L>VYE*k{iHN%PvdbqrE z4K{jbL2bwfvl3z*>5UogaDCDh(@)6^Oqy$tz>MA(>}PkoCmx6W@A)I6#dLTt1O?3Y2G!E zv{S?1N3?5%}(M0LvN@oB9%pY6q&)SA-vJ5}zJOfuEc0zD^2TF}S zu~`$itQCpCIjf*+BM$e=?pQV`j`>56$$+H_Jg*l(LLRs)bW{(Q|VAyu7}MWX3QEQxUeF|a***2 zu;e?)#5S*htLG_1%Kj$RmoxGD#TN3RY8RTXZ70k6GqAjnv8%pK#j1{%cOI{s5NrvL??S)8bS^&Olze#kj z2qYMtYQ<+OWGr2Q{^B%)#6oTXKen`5L ziWB;>=(aXT(pg}2?MLE}Rf*^OWsqyVgvs@#kLJ>TGUGHqBn0&l(zOizKb^5Lq7vr| za!}M830h1X?Lp!&X*ghEjx>l1T_d^rA^6GUR1umd!f>NOxU-lC64_T_eW{*=w{3yS z85-#V&M@2wD3r}HarxDPJvwz5-FSqtU&9c+KnGg!_E6oaidSQ7_?}^S;-i7sK4S-l zr~05{BoPgaowB7*7Jf_2LF=2NYtR;*cXJWr_8Kt*rWn$ig`b@naA4$v-fna7Tr9xF zLVswDrC>*Q2EIM_L}q{^X2edxOs5e1E>lHJ=x5TH5d$wb9-PzDfq0!AW-U5Qx{L=% zanWrub6W#`ZxBV&83s4GL7{MuFci2&A-zHarV*u<7q5RLf?qC^DrpMbvIW>VD2KsI z^O3RcB{^wV2e~h6$w+4m*a;Hww)8>FRDR@hyBHRoF~V!H8Te$Dh3}je$XEVFEI<2U zzKz`{jkE(@??vCA{?*}t6 z&1_LQk^up84Y*$ogu0_XR2jXJIUI+rg5ubtW(3P)G1wI?4bd`lEV(NP5pHfsi96$v z{|L$3WDDxB4s6y5VTtfJQgG-S2{T-bjiU@^rB)2_Rhsznl@|k5PAF4S#NC<6mQQp9 z5i|V(`IwZ0)3zyCyZas~7W6~c(GgNOqa0~ppA*eFvth#_4(EP%#xA~3zHrN!a58(* z(q*}jdDMWFY+m$zK0>xkv&GFDOpZ4h#@)U}4G}3Xh{ExDw4M#bgn%sM1WnPO$cfQ5 z5wvF*BeBsPK?C_{ms^Rq({vCpF-}Ui7a(}48PvkP5g93oU5}l>kz|hJ!EE@Q^2OBx z#%III6z7O0Ca3yi_}vJ>=UD8Y!i|=XX-HQWfH;HEU8))*;x=*c2vdaVXat-uJ|2WJSD4Q8 z#l788*vahSA#d1Ny_yq8&e6Euyc9MasZf`x#KH56k>Jj}uZ$Vac&lN?z7nh~UyhmH zFR@}lCkCHRV=$CZMs zmT0>j4euBO*k1L7o}nZ9Ow2J_VuT~Nmg1!TEyP$bc$Zfsn)cg6CBzk}zm`LDb{v*n z|7N)-wF^7$zaz7>_d#!eD?Enok|B2=7#cq(X9sqonqLRcvMO+EV+eBRMME?7EjbZB 
z-?a5z1o%hqLNaA9#&y)8UgZlWha?VD@)$JJ#m0;f_XEn-3*M_p*C^jdTymATiJmwR_gU<+3sI zj?cl4JJFcX&H+aU8<%8UFjTt3(*I}&IPQKU-ujhzy?r+v=NseVq$gxto{$x%k701- zLGmG{8v{Y^cmOBd;nqf?tdgnhy$*OWzD5S+8xWgSjCT$+PVTEk?+arze3Qe2k|LPf zM8WT3Go0k7BbM=5E8N|V&1)UNw?+UytK~7AlYs7_#kjNeBWf2Xoo_n;cdu20?k-3$o~5m{F`g7lTvyGFW~aS`N@? z9sEiT4+Nv`yB<=W`aw+N3_0%ko5Ve-$3&?gtS^e;?hg-UtR(QzLmamfRj@?O3JZMP zp_0~w8H|5gTKyO?V%E+yO>-%a#usb;G<8%v*uv^E zA7avk{kNy5XNYxWfbiWpc0e3&yq7V${_EaRvC_`|r z1fE&&-~!`Er?u4vw|wNV^(HU)LI`>$W-zkE886>!E<5~e_zbrgFwiHX%D&f~>j-{XF!F^W`ZnqhK#u*Jr_Phwiiacx$2*u!%1bjbQ zf=Lw**zQ`ya39Wi%oPK@Uv?NbWY);H8L&QT2z@7a2;b1gt(VfM;Vs72%hEV|Wf`he zi;$+)1Oovr25acSvM&YLOxqzs&;&lTDqe&sql4=qv3DczUPxIUxa@_>E6SMRwhCw7 zb~C(G3I+m_FngI61Q%6cOgbC&=|AvcyD zW41wZUlO)5a=?9C9$HUr!nCbQSTQ9G4l_ORbw3AO>TU7t=_1HKv_(pDBHFX3;+{w~ z%(~h!v2P}nOwKIW*A$0gJiB=hO-AspzR)ngD)v)4h!SO#&T%9NW;`c)1a=S zi)GJJ5w(30s&(Z-hgHHyM-ew9bwM-uzBXNo&P*?~Uz>)l%_Bs!E(48MBN;o0;TeYA zv7|uU>UdrwQLf>GpZWy3eV0k;{apfGzsr%3Y=xa&z1Z9Yu&vb$| zZF7X{c;Q>oOa`L>gLS9k{iHXJ$D85D+FM%CkNdh(=CP&^*kmG~b2@j($+87z)jo5Tpq>JGJ*ADU} z%o*>`%0r#LN2VM;NBZSOaY61mnbUNeJnWLe=3reE&fh===L+Ku_gXUhc{JHmpowIA z5hg)j0OK=k3rxF1W~7}Vub>O#hGHDPrGQPCi~H~0pcG+&bt9gLE7wBn*dEJWwPMiz zIY{^7pUgShfYS z~^xj8DZO7TVfCQ1CRxgGK3MP;muHig^&Lyb|j@){z~z4WNER z64n9?{y&!wPc5xXOx9}=&Q@Zvk!bU_M+{wA=NYJ{-xdTd>0i*_XeSd?a?&%*}o4iQM07J@4= zR*0KNVa+*X^c+;cEdJ>z+@;R&eVP~@JxL0$z9k2fT_85eV8N}^;5^q8l3!9GJjMl^ zv-2^dzyey#p1xd*A8o#S3D@quWWBBBLXlS(@o$7MMJX9vmMPfA@aHn# z4p6inCleyxIPuL9%KLk9ZGLkVzN(%-KTUFA9yn)$seb6}|?{L2_gnIx1DL z<+~V6L^KfiDh{^S?~#BY6{PJPC->}slAk>fNZ}L~TIkorlfiXud%hEo4~F2&%*VGa zf$(3(g%csVm>#Et(GTy)GVdT3%}6QGm3jzgiW3Zh}2?+@!ci*dj~1|uD_QEQNaL&l8nb%-HzHX+e+ z)8YVh&K3sO8)MWOc3{shd*rItW5W_<#+SSaX}Ri1WqiCZbn~Idj~9z}KP5_utMu25 zR^Wqg9%8OCzuid}te$O(YdmHsZ#6(FXF7IRMIa2GFcC|FBVl~Y{n+4$w8zg(7K(&y zVT%Y1+$qB$zIj->w+Xk`Mq&3T1N~oAKw=AlHp4%C=3N1{MG|gT1>-a0cP>AWjGrkT zxVcXVJ+wS5mYpSo4$|;A>WY9y83-LdfT2ihoY1}xp>g{Cqd!t5<>F z%L#5}=9mohM3BF+i6~<$(DsX9G`t3@Z`8qA%8TJ;d_nUE!Fo^==GU^2dN&_EUY!V) zWcI(K?#Ml1hH2_iNIwyRF#`ez>sZL`X#?N)_xKdSa11)J*e2zM3+7HJ$uK}`^Fp*% zrsL+40{A-pYmOA-LvaH(>>Vej7c#ISYYMcqB_O+CF-(N{Agou#oOQ9moDHE!XJmb< z@(hGoYk|k17avyLApWOjAe3({9)5|&Z^wm*ESxl_Q#4>wvk2F7g7F)nNQs+^fXE$? zziR@+x3y?+OomDAO1Mb`qFhiH%Cr_{cBSgyvuT48yAV$|?7*qCVsL2%L9{a(Pbb_^ z_B95M*H@tas4bM%sbaQiF6PzS;m9!=SoQ#Z$7eEQ;|j5(S?GVkaAE8jxbQm`P7B$1 z#KD2X!!vO)UKDOhjd;4I0sX@Bu%qBN(QBxI?~N90xvK!*HYv++Ty5V%9?H&j`aWQ$@v zaEPRA4~4cs0+gCm5Wx6xcnfjk_6~dKh`6J8|5oU9zaqEakCQh~jG++AYw=0a5=Tar zVNs-xe!qnnT3-rxW)E7+V5vp6GT5{&6^2{~$U{R;Y!LiTT$~>gb@4$$4I1LUH3za< z8LlwrEfIQVf>&W#=qPYvaxU`YRuf|zc8Oxu`!^(dr!P*uo({IBAg0zxSn^fk~e{ z<{jcjZ^e6Zy@VgTKUZ7u#89~P;}_XDZ3zYhmf|Y>v5q4cK^w&JF~<{L+jHQoatB?H zxgd8%9Z$8Ra4kyB!fH_|S_5LBz;MsK*3GzKx&e-g<|se!gG!HJgzI*|=;S?8m#fTh z32#V=n;f36eJh&ReR{!mEKsAyKSVW@G2-QdIx)X3j`4GHEyfkDt%O*3&aTFs)pIt0g`(O8Ih;WfdoEEoy}0gCtly zt%Bq+0>uDt41GIK7T9H?tZX$BdrTqamxK;ZM!#7X55X}LI6tQGX%oSkd?y@umx9aZ ze6Tle1(ebaV3@KJhJ1Q>^fnT?#}iQ5-VN(+DO`J5h2zFG=sv}8>H;?4QctkFR}hNm z*Fw0%=)=Z$*27cQ4J|U8aYNJ$!Tfbl%`t~F$6>S=3t-AhRfq`<6J@CDPs)@-S+)!} zzTQNwW(de{JFH-6BeB~S3+h#I2Kt-$XC!eHXE;q=p` zuv$L{W?nQRBrGhxh4MqvH6Kms*6`^M$B-h8;Bz||eVPZaTsPx#l|6(m`=N5c2d)}? 
z=u*-^=v`NX6PYV;K&B2o{sp+C8wr<@JRBGcMa>sACNDz}l!lAZ%E)+&?l>d{*y8R> z7c9?WF#0)e5G{|uO&%fGGIEl2XeK;nr@_%V0l|kp6K}(xB-o-FI@%gYpA(7$VkxNM znvIk985_m28RtWbaV?G$PY0@54GAotwiIdA{-{h@f`i))!De{C&yECzA}v_9 z#~mLz6VVoxjD!k%ly9@aZ^nM6-y1@w^@%xG*>rH8{7w$nGSSl+{NM3fEY`kU2_I!A zEOB3puW#0pM-zj1?s}7q8#DT=xi}*e{EfacehFT|wRoF*3Ntg)P{hnNn`evAtr3S$ z_p|ZvRX#E|`d~OUAIc1;%5lPoS+@yjXZWi1$`q=KY%$k=Iv&?#JR(VP8b+r&KfFIFgBno!g5P-x78T|OrO|U-Y8;tKiMfuEF~>H5j%X5 zWa`cZlY9&1TKRBaoCWEtl6Zk0EDSzGB1`Krw3`cR$&CM*9l=R%Pg9A!`Iwns2bKJF zunT zm~(J-5d3@(B|8>@b8WFBG)NqJ)3+;k;rW;4U++~isv(FDqchSnz^yyUM=b63E%UTjLJ-sm-QUjGT3ghQA-G)E15IC zGBFrE=8l()ZDi?S0df6r#L$3^`+RJ8Yjq(hmK(w!c=1J44>yUF_k`c_G{UV~PRT=*Q zUL+3aKxQZfO{4C3b(jNwMmqR#T^*9$kBHbE3jT7@SYIuIYKE7(;QNU59QQ`~vry=y zjF3;pBBf_SVNi+CA#%#mdLB{(Tu>s^VkYJTjv){IF*#%IeS8LG;**l5Xt zkwz`dQSvo7v|u@ImQ_G9xC%aJQ}J9m3E!f^u`eUvSHeEH8z*)x1=nacmY5d9aoZAPJ&A!? zRT%nB60ywQ6&D|^z+)k8xJWTJTTv_IW-)$HrLq^tFtXflzdDb3y zdvq}>-HRYIrUEI&3OFvTg-TR7mWf0_r!5kkU8_N#amQG!2j-q&xVvBp3?~)hw}cQP ztSTWtJpsI9)tFVIfOQ|6@TfWrFPyb8IO2kdlV8b&o3F`grzv13!32@R#D_STqzPi*#t4j9 z>BCH)k=26@_#sz>4YxJWnimL}IAv(c^5SXd95~Ze@HR3)xitlkmY0P0TnaXEFNMF3 z0h~T-KxTm-vcArRQ;QqpgSi}L9X>E%u<3X^H@thw=w-Ii5DBY5(Qz~My2>HgW+`)~ z+zRaRJiLqGFt=qm&d)8Pj9=m&?8)DbSLf1k_4aD8ssj)=zXP3RpNQ-C*YIxnM1~@C z@iRmMC1#KHey!=n>Iz1mYI%uOb0grop%cgX10lCg9}2c*c(}Qp@sD%HmzX@%?PBo2 zhsJ2Kn2M}|Q^drz2=_I;v4`>XTi95L!5`}&n8(NL3{Yo4Qtira&5Zu^GAwG(cL#@&%nW_&1zYGX=PQ#9`&CuhRiUi}OSQV@Y^MMxJ zJ7tRM{YznP$lzNiopCR~8s#gQv2qEAiso{&Z#)S|{ON#`p*zsHy%o($RXD;K9&GnhglaIuV$ie?X(OHIN*)366B@_e!>F#dWi;@N@>6Gq} z4nabtL1~l_FhBvZ^8-;-?8NTE?!*@3%=zIjf?~%&a5iL8EpZkByGiS5azsv#3Jm&PAm+-2+-JN{9BGG4Km(#kl5FGC zL%vZT*01(M-j9Xob92H=xgbcqJHzG#{Xao7!CIvQ<#ngVAE`K zyv{5&O8jbv<`DtnZ5)yBSclQ-`51kW2g?F(IKJP2{@iZ%rmP*>mv1uhYGLdvu)*qE z4Z3x!YjJI|6yg63BD**ooA!`CNB#4Q)oS49)WK$c3tX^Iz{SfsSVP|fvG zu7Kb3^l-3G9C4*F_+(#;Lvm0w}n($Hv}ge#jC>A9V(fa;sQSm}Hm-W@gI-fN8?r^4WK&<1ILjBqJB6UtZl zuxM2|E@{N$^xR~8v0Q>O3lFe1Pb760!RplI^SKt;nEFGzogvezLkXFd+brs1%G7qr$(BYbgf1~=(AYA9R2QEaNtn}F>&<$bO5PWxv|>Y9Gk@raJK3lt2Z}5_*E}l4#-3_ z{U7(fkcN{s1!Mobar&7owC%_*@tqj*l0a-r7RT$u(Fl+6#CQ=glRNF8oGOig>!RrY zQiLxHL*eIm7`Aabus>Lfd;ve)72ODx%@wFM+J>o~{lwx{p>DA&uB;Bla(**}x-K>h zcvX%a)T6g3l|qE?F&0|YqS2%a-AYsgNjyWC-x%}Fe*sl9^1bxY+3sM0Ei;c&l*0;Q zUxsj9>KwGG58AImz3IkM?3*YcMu|A~58>!h-i7I_YG_Zl2JU?hLSN)OnCS?kbYBv@ zsjf1T&H~?)HKe0*o7~!-1mTHdEdEsjHC`(i@X6x2Nioj-Ovi!$Tre}N1TTMOLp8M) z&?9fc#T_q-A9lijJS+B+0ncgTdJ^&@hP53pv3jsxSNEkE4 z(cBTddX-9^9_sPSI*s_|??6V7EcjD*W7EGYaDUbZJ+8|L5b(vkk<(Dg2*x$v>j*lp zkDY@hG>=z;?4>ulf&cAA)4v{+CTxJ+5+iK%AH?yzRPcSbLHU6?eC%k#t4qOn#^(ot zHbeOD$U$j&NTG$isOroHD zbv^DKwMEn?3ltFJy@hK9t{+N9MWi%p)({s@J%nIRDc0(k!ui~CNND*YO{*1?3M!C! 
zwH9Tfj?h@y3CC}9Au((XzlNpoQjx~f^D*H2Z`fe%b2>YY0);O|Fvocme;jI&7g7R4 z(`0NW*5p(Td8$5fn@sgl1UsSzx2(m~CzWbQ#O;CGwF<0@qb!e}Mwp-3ga_&y;Fe}i zbxjS1J|2UMog(`BJdm(a0kNvaaNBPUwp^E2zg7J_*nH zuc0kpOIorpj{i}FGtDU9{HKh^tvE-N@@+wL&xGx6Vo_fGjdJ3dJ z(^=VK0tY`C@>&&u^}l3KPv*k3)(%7K7NB@57ku&;LAT2q{&Ku{s49XG`=hMx^&Zx% zb)7Ag;6$Yzj}hlCl&#(H{}I8=hsw8ql#`c%q|N|*`W=;KIuJ%YZ^nOHNw9nrN8 zI5zJEti^aS8Tc8Yn$uAuH3ys8$&>MGT*vbC23SYepr+|Qx*M`!)Y1i3vp!pJ9FnC?TqyFKbw5uu3L=w&;dL*wa2}xG;F$& zhZvPad^snCn+J37^TPs+%lYFNaRT-l9$2_yK2mNoaNhC**Fr%oR5Cye-RAFgi&vP5bgtKI#{mh=BWvJ#!OhJX>zCGGM>!AT=lj9?b{sa#sDW2ai`f1W^UVz?T%v-`w#2UuESKYjN8O3#=hPazD<#Idub2I-;G!E0HFo|bC7u8$!d zh9UTztRY_}`Qcm^pq|4Bkx>qq-fxJzCF{|Aatdy$u0)lR7XAv_;>k$|q;qrOO1KPO zsK{%FH?M@-jRJ7YNu=6>=3LFO$SClIO`Z(gWf?w5l)*Se2O*&eh@d$|19>$9GK8V` zz!IsS{E@PBIyg3Zp!{SAM&~WU-UoEY>rAqM`5)OK^H7VT1VPH&|kS1vq?o5leOI zpC1B9F*ic%Q+1jjzhJM|^fJeMCxc|-c`M{jFhgl~jEvdgklYn^?g!m-N5)v9LJRy3 z?q!F$m*YzRJ$6*`2wNP~%^oM~>%5-gj9vGUBnrU)FR^9la`Y zFkdW*Uq)J(d-XAUx9u@&lNw_J8VA@}F-at^f5OVQiV|y_gKxj$ad%>z4O=ULM?nT( zm#N|S&%-Qii1N+^Y_V5Q8Xav{nM0W%ERI;gD0q~;{rQ%KpF78zy@y!uaq4**ePLT! zMEPDOxFuEry*_Cq`zImpRRy+=C&B1`FwA6Z@o9jbb3W1nX*T|J)YNEmWCT9GCv9fH z1TLg4_dYPje#y1SdrkjmM-`Svap3l73Dr*DSira}u3k9HNEh{D~l7b=EPbY zCd*RdxDg~&lW=#rGWLH7fa*>Sd?Q|MV|Om#p$Wmua+tV7{x5S&L>+6ul^j2O*uD_Q zSFAwtxm9SI;|f<}>K#JoU~c_<3`EU<&eH`rwb)=FOBg+O`3`U*U3=h85I$}wfsso*1k;_7IEU&5Ib(YF%<+7w z2i*DP!0YXZ@&|@+rT52otN?Fz_n>QgC^qegf!IzP%GT4vtP@h$FuWSOL}Q6vw}E+e zBm}6hynefqW{@W6dvAi~_s7_Ax<@yiae#B?Q)X>bjwvH*ibE{xOvHNo*oaT{S<`RA}etJjYaP0bUd;(!%5RAsI0KU z{sJkuEZd5Pa$!8tnohm30XELFGhC2l3!707eAZ$Ry5EZayOA`v;K%I6GVo}A!(NX0 zVuaqA^Npg&be>=m);x&Y`hjKD8-r)%6Q;P|633OM;cYl^@pcs`SnNf*xd^TvVMuQJ z!K5`#G2T!sOgfUj^hyMldt@N3cAfpS$i)K=Te{=L(K`H-4XV$AS8q1tctjCWaGeF; zb%f34sW4u0gALLwYlTk?x}=iPEB%bQG&_@@$q9eDx$x_?0K8>O4IVsG#(4!N1WK=k zZd(qf&6C2+NSXt4eq>csS!3d@4ZJj(==2YbTyI#2_?yJ>1TIWv_;#@t(30XA^g2 zF0O)U9BT}`kK3WBE)d`PhM@PW6W2l(V_0?(xNqD^uP+fiZ5AZ&&%$TY z|2V%E=qDsxfTWTyzRNyEj8!;vD$b#|vUk6qULap`jic&C=Z;Yv9IP2`XlWr*JvSAz* zG@76T8L(TF0>#Qw)R3RFo>=mu*Wyr`+K6ME(}=+nLF{ZpoI9q4GhYre^Yg**OE-oZ zaWD#-MeweP`t3Iwh~rd1{zq{fNeMStx`Bs$TS2J3S%l=*Iq1-S#kvMvaZzcKmAq=f ziZW$*i>`*z1{JIny2CD?RtKlo6}=GiJj}d&jZIN5#lv0k*z39mo#{)lP$M1Q#AuHC zM!@!@DKf`N>nasR;R5>Xr*qKSXbqe!iG$3!Ip`v8E3uq-aQQ#1E7K2gg+j3SR?Rq$ zS!2JSD13kB;7+(7rirUV&3Gz~otXlorOWVOKY3d&US%;iETL8}j9`;Y_=LP>DbW=` zsXel2PV?o49CXhF!TF3SWD-bEpO}KO^)tb5vw@u_{!#YiOuW3(X!vl+FDAO^J~N}&C?diOGFSP z)P!C~cjS2gViW0N@ZvkbPUPy~>#j>oS6B)f+V5ETGafv-_MV*~t~XZk9h0s!$HtS- z*wv4IV0l&KiK8si%e*jEG{v#W8?3kBDjU9SgxK2^kayyTpNbAoDky8I{GU#$YCLM1qY z6Qg%2hjQ)~!=?e6k$okvZ+0n8JX?jc%j}UtTGja)L8PfI!pKHv^1gn@%rbr~a8Sb< z@((%sC>YKisUh8r@ZV8F!JFCW-RXvc_OHyoxAsxv&i*T_{ z7XvpwF#;M`cyuDP`l5>+_~dqu_@z@=8nav%C=>oAcp#TpKd2(x~0B z1gXLP*y}E8a#of!2MY4o`JJ6ogi0;-wEE^lZE~O^y4GTc; zvQRX(7{F~{Dfk|Qq9;KZ*=@6FL$w`vZ=|5_f-kscDnm>_9!IxX;A~$m#ss8LUblx$ zP@jEo)eF|GS&bJQE?D!QKenlw!;N<)ympsBwsk(dQzn^;yecuX%kc8-7AVDNK`H7t z^Y-&aeduSV9`cU8^IQq%mAS|+NQ0itU3TV#FBXcCrl>24=gBH?UR7@3pr%CMuOD7T zZbI><0@PJWLZ0f5Qz1Xu*4A=VZ;VI8{S>M%Oz_F+Fxxy|4Dnwh^e)-<;e-VFve)Ee zxrY(nMXtr|$}$+PiN}?l>ck214J^>~#g4W9_?J(8%05&4kV(R}Z7E@{l%b&jp1IhE>LO&C3k0sqTv zXb}_mrAHiZ$Y1>OybJXof~Y%Yg9Q&waBsB`LLAEBIA0MHK~#U-@+BFa%y3sD90Tv{@$;BFRyWj z`-O0zDipJ>SRib9EH*s(%ch)Ag2Q@0m{rMwPhtTA z|5?INEeT)k?65{(02|l_dM*{ws~3Y*!wxnv&lcZJ6NnS%N7x;12o8wpFMtrZ+j!t( zG8ciR>%c7&jhhu+aNMVb19ruTI7~JDfiYYzr{43-SN1scCp&T3OOG?`HXhw|h1B*j zylqiM?&nq5W8npk51zQ;SBe`l8*pbvI2wZ-$g3lVsE+AaXylA(q=i{o%*RnTC4@{R z4|*&=d54uTcyt3>K^oJv!C6?86o9grkL;~%BU)m;alkSZ&(11B{#rFO|JL9~mMlz( zE!DRD!=`-kgT}!M48^6h5Br80U)}`E^GIY;t36okDh))ME@RHl+u5&!7mW`%xUsv{ 
zIV{{}Idh4e$J)$P^o+E9825HB*7ZrrFwLuzEax!qu3Ic`K_V-Z2?ftI zYxd~n6XVub@vOjanr_POHWn10#%!*?VP#U?OfCBboA-M=6PRPdRM%884W|R_@F`ms zUg*NMjPbE6t6iAPqhaG>6ooU~Vr>r{6 z^b*e*YgBAxD$g#lh|w(akyo-ZyB>B?bsH;osAnc;GL6k$Hn2zPaje^{f-SIo$6QGx zD{obX-J2Mk*da*XYENu5bwuW0L9}~|!&6uY6I>%~Yv@#%Ka)mR-d;m>Jr0EQ2w;7c zHtJq@VNK{?#=n?!9)oXe-%VqvmWtqY)K(l&;lLiLPwacNB1VPe3b18GrJN-cQ{c$RUCn{4zpIpXaqH@ zA@zd*oKC!9Bjz)a%((z{f&7@6OrD4u0i>qIpu6J`i*rc>M;o1g+@+MAa)Sx1aYApq zA4VvDMeUz1Zco-5r2B~AeCrbIcvp?6L5 zu=45&cL=Ev-mKqqvU&BKY%T{tA)ir3}wU{CB3{c{mgCJk|HXCmEs z&)MGJu4pTrj=!?P@VH@(xHoA~7gt1y^ddZKkH^5$|MN^IuOOG*JxNe5Xl zGZUc(k}%pYkIB!{n3fq1g#t-z_|5R(J?Y_jqS(4W8>uuC9&w|rw+qWKIAMvhFCW>i zE4}dko`~23!Qd&FLS7;-G&Uc05SIBOZ z9B@HYQ46x^Ztn~lMzAn>abL_vYLp3971rtLMocqV`f?W5Yi@w@(66|g!{ zf`|I6;c>qSzXHgY?YSKS!eL0-Zc6%?Gv$f6Ak4WK3#KceJvI)9e5hx3KFRoHxgj=R z683jR$;V0aS3W7Y&+0&HdNBUpFTwIuQ}7G)(Oh8wi|HBK`WT2b182T8cJ7SAr;^z?JCKQm z`i}U0KOf1K)H}S$!eXa6IAI$EZNon-NMQ=ZT#8U_y&8rcdRV_6l*>W>AiG-p3yMVg z3v1GP__24w2ELtjDA(}A!?ijvyGgysU2{Bo`Q5m1L&40#^1Z0sQZ5ptDuIq+D*8;KNQQVBu!Sml|a1Z zYzz*EW6#YLy(9EHPMj}KwRIuXCMj2+tJw(g@ZIB*=^|uavXu{T@|kVBu|C6 zG%WipA?qRn>F{(&r&4xUlQDX=pRrYoB*8b6hT3y@k-whQDadK<}kr?D$9{ik7ruR!B3f#do5{IUSnI zgOE*k;nxZY+~XlG{E#amy?--@7)~I?06I0r*t$9a>;9@j>4X4GFEU8%O~ax-KHRMC zVc+hk;d9bMHo3f(as#Qy+N6kgoZ93InT{zPISAfVh`*H;>=1b-=kAM!(2TX{qTLSn zMZ}2Z^2MRYcCdOf1-}&6K+#zZM>a1)LfUP1T*?;wYT+0YoJ~1qs@PW4sc*JL3P0a) zA~>%S8+tmS(w++c=w^ss(MI8n5bO_2z(y@JBBgo`UmW ze8wIZ9Yi2AtcK91<1BuwETaGY#|#28VDk1HD>^{lg2PT2xflph7XjQ&O2u7uk2T43riO0#RT8RZehLHC@7Dt~4A3AhPz25d%VE>rPDJio zia4)uyb<-mwur6Rq}_>`tLNgb{w#cdG!yNf2~gr~!buUzuvKI)`%*z_jrQHp<>N)5gipB{(@(gXTf%hQg03^HO zLOds$dN*Kk!DZGqeJ>=)lXmyrTgH~{Vz;?h>m62Jhmf zI~VL?cYQ3@V6E=M9Aa8MM)@5=p$f60*wW8QADg;#&khEx=;k% z@qpuLO9;*_g^ocg@_%we@y#=wIq!q<`9lcizRl)!yCL(o6Q&YxJlSfGS-U*-|2Z{b zcQeD^-dn_R9EBsz;GK5VVvu7#R!1)czg0E*vc6&M-%spQs|?l*8A9flmHw7@Hz<9kD$+MS}E>dX~Zt+6*zf}diOPJ@bL6p967}cmL7rM>OrV9Yr_Gx z9z8?j*_&0#u4spsE{bNmg&kQbq_m;C4bu8->B z`Iz48jv<#e?cVNj=#0-nWxfGk-lh!9&8>L5ZYS1TdqMVI9p1P3LGm1NNp*X$;kyRJ zug$@@nib{^yCI2wKlR25xH)2hdpq6HD8h|RBL66Z^8%~8XG>gF1^zp#fzX46@OWE< zV_q9^Ptphie6gs`UyX(mn)m7QqU^6Bu5Zu4UZ-|^ogE8asxch83*bCqj42O4Fui{R zI803SB<02S`0>JhpA3GVEX46!g$R5@9{sEo{Y~rmuxIBu+g85|M#q|vkVv(nq&4Zq z#xUGN&!}iT8jQ?MBKC>H`7^`XoAt;Gt<^t#<}|K~2w>wtF8a)ggV*fE5!GJIyp)bZ zskx}hbb`0P8tl5d;P%!G7CUudMOvI0^`joru_(DVAA3_Mv*#}-UWV90s?r1c5k-Scina)FL7)Mk?jBP~tst^qD z)`OF)8cqAwW8QPhvG{od+@w9{)vv-wnklaTP>6=(DbUgLM&55BB<)_Or(Y5P?gQ2c z-(Cn2*=D?#Z$MDeUi3VR#BdJH4d*4o&`i?g(rf|D`YVo8e`WE7zV3+4WLzKYxA>EfMfg;7W9KbEUtzR@M9A727ldd_-N3c>9)Og@etw0+hv4q7o|)RY0UUPQ*+}n%d?V( zcd-n%4e=ZO%I5)Su66Jxv2-H~AxQd(=W}!H-^~k!>kddH?#OENBxUsoBA#a9Tjo2$ zI-=AtrZNc;oF1?~?vIJxYrvV1k1sV=aF64MHSLD6BW|yL<8`)eh9;ID=fci03E1*} zWLFN@5zoU7safuLwMQ4iroN~NZ6uGEKP0zFVEaR3tnv89^p{^`_jiZWJ-Q4I3B+P- z7so+j>~4)lBbJ{u3(~MxX1=AmSrt9=vT^i>3zpJ+^Hxm*(mK45y7MKo9y)|uUd4zN z%17GwpX^Q!^#wGCN+6ATq?UBhA}#~LS{F>$yUS*YJi#Wf0i3+N5-TQ>(SJ=F{SjSo z=S@eso0>_@n!od|q*v=URt z!%(ed3h50ske8IE>4qv;y;K3pZjyNAAWO_$ z9SVJ$(Qggi_yiy-T7XEe8F9n@5o(r&Cc3@YEp zQGep!?ZUwM(i(whDbJ5D8C5&Qpd%NIm}A5*3KElfRR=n}nQ;24jDXWi;M8G-e5*2C zHR;8-f2D{#?*;XeXvCeN9yX2-V)oUjFdD#yItS<%QBLkn5uEwhhUY^?w6BVCK;NjK z$0eBZ*P5Zb>mRewB%b2zZ}y`m0JEZZv6uTjai8>Ob|4om5mITVvWMBx zcs;#M=ks9q?KCr_nz3MtA>xhO5G7NGta*M^~11ZIrWvNF?tK#s_-eyB%j@ zb0a6>4D6AtrHH}b+IaU*AA7p^a9X$+_vU_KXGPWVCEf@(n=J8S>pgaAY%y_68knGr zjwkb||EZ8i-nx%$OWr^B_4^TKvcOsYJFft^)Wj$gcr7Br!f@dc2UgO2wJiM!>-|rc zYPG*CB5^CqJ9n|x%qOgGKFvy$RdsiW)<8*bHttH-;2O;aTGB(1mlcS(wYS++V=a8K zH_W?Q3w0(_@t|%cd!BcerR&S0Q9})zs~k~Dd3()?({N?z0sCxrik0v^ zWHTu5>OZMzP_FRCEq$63uamI@JYp+>D%!VOJ|I49x7NOPaZQx 
zK3v_NiLnMXC}-Vg{mYi)MJKTy@8>{Fbbwh@Q@&n~EAgH#;HM0de5WG@4|T~`y=WdJ zb2frsbRqIzOA)7)g!}$nklq@F8>f?yo}PpK!+PlD|IQ3rxiD*wir$Ph*Ray#6O)#! zfw-k9)|wUI{ike%=m((sMIQdrxiP?%fX2mXFeE?P$NSW8iuq&dQfXYg=?kgeDJU_F zKq7HZ3WMWJ>DMi``jH@flV37N@>X&0mIn9HRNUNPh^j^t{GGy$=kEDfpPYp`gCYoc zu%G=WWr!BaI}7dbz;=EUn4fyY7Mt0S&r2RtZ{A`dyH^v}<%v}^7kz(-1Esqpi4O|I z^{+X)cvKz}c8jnBqOnP#voSDd*;9)^>LS$=>b)a5+%&lVJ8&ASI| zU~4^Cz4@|YaJMgsiWDLe%^*y9iv}An9Bn>++DgF zeI3u)EIMP-r6Lh~T8w;bs?ekPN6iY#z}&rwW>tkSA#LXIlx#Sa9AObd#42$Miea@ZF_4 zVZI0-upVp8R9cHI7qw6Z#e~}+dhZ7gylMmUp2Nu)4u!or8Pl!WffC#?p z27pgP8m^DYYveZ{R;Ho&s%VHr^$a9%xWlMyE>7)k$7xkvoPH-q{(B#Y)z;#Y&Q`1> zmf>(`4vKtIV0bQ!>H=kG$wgz~+Iku zT@4Qw_k%;~9SayT#;oZ=a2mGP+p}&SK20ai#&R9pY{DqJfWFSw4x_nd7|@EO%(N=Z zJ>ids9+WAk9s!wR3BS_sF7q6S0FA$4NhQAO_MJhG&}599#}p z@8x)#mWzm$iAX>BhP|4o0RL3VY*KPTanyWFP#)+RIYab3k%I51c{nielLeT^U@?aq zzBi`gYTyLBv~vlL98gDcG=q3L`Dvf>AZK7JJ2dYU%ULkUcHF5kY^UF?>-G0cJjod` zbE6?UMGh~l?chzB#8JfG9kSQL=8U~K^J)((CkCr`QUss9xD0djs$pPZjOsaoP`KBI zMAG!GY*R+PtTFh%IpgxyG~!baF+Fo$yy-r{n*UvAi3a=FmYYB{w=}jFd&9Z?HoIm+ zIk0tM5IblJu_#VB&*ntcs3bCi_puV|43sBF!JY2XGBpmYTQ{5hF(n9bvxG|TUzR;( zI_!^0LA<5{vj%ygb*>gK<5F>Hs2+kKZhK2@n)nO}=$8e4W|@F;zjxmF1v9B@XtH9h51z3_r8& zA$uqnVKJ0ZDBzE5t|TmJN~c|{mPRMMtI*?nfgyeac=@)$;!r(y&@=N*MF%HZdLa?? zo(;!dMSX}2xU$LHeOn$on3VqULCWJStAp#wK6Jg0$D*%wFsrS{mVdPW)qy;K7pn0n zKNh~NyI_041+wMMc=vNT3^%JmB0dbyOehn4b~reKPTmqjrKtu6N3{^=nh2p}4bn)QVcn~R9v&~s3)+llX-=r3 z8vjR!33?jHmvDO&*K;YCe-&kT{`EsEw+1d=`OKu|*&!!zg?@}|EyjkvvW2arla$n9 zy-^@|%WZM=&Mb6$1!C?k8>oJ2#;t63xEm;dXMniIfR#EPHqV(xury@}w_*0Sf6Ugs z3_=B&DA^{1SAU$bPrXDGQ>uEx;-tUX4#Evi8 zaR%J$rei6waue-=s5n)FQ7#`m{IwWI56p(LmlU{77h`6YBDg;?m=OnWz+r%ipX%uF z{mPCyk*_pZ7t$X((aY_Mxmlv5J4iz1LnLS^HO&2O5XVOv$~W5e%2YA6^fg-^l#c;L zKSRDvrubGf!j`L0onq6B7r%(hITnnt*38ZL_6FWW^I_DIyjWU?b>bmG|TZqH7FUB<}9-YU8a8D!yig|?? 
zOQK#^HiKA19|&>TVgtWDmXS|tCyxNWr41m#mFA6US+Gp1!kJdOpO@ET#iv=YNGKyd zocy)j5jgqhJd3+#ME6CgzWV%1G<@u3H+yQ)RoH{-Y6tS>o5TH@09wVC!ReJ~XrG`wSAp!mt#K=~Q7C`LAMAXdmpfW!OczO)I-v zkuSCpPNX|co-c&7*Kro-sE;Rt^0>M+1iIekIKA~CMD<><$wBf8I3__{q!1%?hjA>< zrmRF~Z17XVsX4T7WyfW9B%Wpg@|yU&b`H9gKe1aqVwgtf;kmbWnKF5(Tpa07`8YdG zb@PN;1Ok59wn-gk{3VEo;kA-1S=5iUoYQA1>bWepvyjDg zZ`CXI+{sK&SQuw+I>kJFiRXLW$y7%N*-y7?l)ZM0HTx^G%YJ#k*q zTQphc&_3gH<9pZ{>mBU9XdJuy!M$VX|6Sf+(_{mNL%D+&vx7i@IT4R&eqW#+NN z@)%=9HwWrP;xH>)3?XL@L-fUQ=DAx0UKNsvRF^m8+u8!x89<2yWe|6+gH1Ng58fFe zCR>y8O>EHnkKQwnA?CGF62l!&nZ4Z|HuCKw3u;#}S+j0gi z5Ma%R$DHy=yeEzz&RuNeO)6x1qp<%h<%Z~dW;TJcloyqY1+T23c5XUS>u2EKF3PE} zs>Mc69m*zeBM!j=4&;rwU8IVjJ>ghke++sB4BiTq&q;TR)GH0hgiXVj!x2a}Dlz={ z!4YLP``Ik9HXJ+JiZFiaWewe-cWWjV|Eq@AT2*M9FGGD1^%}{>cuZ%C=SYj5-eV6O zG|qw1ml1rk(S_JA;w&O5lP$y?9NIFd-B5)zabq-WZbGtqHZl(Q!XeuQt9FdB8Q&(@ zv2@yHaZnk)hFh^V`zoHUwnA9C2zvH5!Szl9uJ4bbeIwMXk5N|4cTVupe9r8C9Q2N- zVenQI?roThAyEbBR!KwiP67rU3`qyr0lOD|Skn=Yplzv$cwL5)>*45#@WMr6IXTkY z;C;oI_*?^QPOV47HVMP#pcFhIZTCA1#*wfs2y*tLvw0DixG92fM8kzQ3whNWz~ARZ zxr{Ww=heUuKUYKJNp5IHRzZ4Q6WG)ujLsOtim@KtJ)wddRI@$j>A;+zC@9`vgMi2N zxV4#PnJ>EW?($~VA*hZ=-<`1a!5ZYM({6+0dyuqy#_anf;5cm!M&(MuTWg54ddico z3!)uF@~9n)gi@X+8Z~W@oKk{;3#w=&b~|CF6EO#c@T2*$q;C%GZCilvj@9@bJp->+ zG*O0CfrHb zuPje~UuaL_JHMuw_zZf_&9${2z*nAsev50=QqLh-v9R!PzL@bNyJOr;(R0R9$2P{ zH$uC}4?M~K9Lj^kJj#@!p0>iI3d0X6i`62N^ksD{SvCh3DK{f`+z_MOn4z&nk*VN0p(?o<|e|06k z(mZ;3hAQb&Tk*3j8OBr-olT|PEo)oQ5TlQZ;yy?PS3~&M8obF5#h<`atcNm4qv&&+ z*qQ{rkpS#$IfM&4``P#Fsa+=VlO%P`()j1mn?wD1pOto%GvcrygPcFGVA zGo)IZ@@+bg>TSOyfg61duoj z7B3FjU`7gMb1P7uxlB3fOa92+yau+puNkMF7=+}%u}f`~yD%dkSL6@ko7gATe&82- zRPRe13b9eSnlKr)h0|(c=buPm=VmMHmjB20oOD22q$zqNEg-s(1Aa}K7#|kJ6T9bZ zzE=!3M>@m5^9NJQF~-N=@rarOcn`Ut|Km*D*Z0K<%Af0g!iga2n=~6{84Z4&gBeW^ zS^sl&h+k-@(5i-@T~coXge-u1-rESPGPyqgatACllO zeKRI+mE)vN6Z&MbF;VoLCH$fslUw#EBhFLHGlOm8O>CM1s6X8 zlVXr!M*3;OD9s%da5_sI?L1|O9<#zml15&t(UBqSfESdbTUFB(bhl*5JMG28$Rm zth(lfmfi7G`+LCly(v0&(Oygg$^f2l#ijgc^1I37_U1myAzlDw(?VR0Y#}bf8)-(n zkgTXhnfUd1&~665C4RU*{*qndEWo`$0e$-@%KG%@#>LScP+5Bjxkm($G**hU+{Ao( z3`1y|5SpqNU}P5Y7(87Vl4{2flgs+0q*He)ZG`ZZ<*-}@a4S*1m3tG+N0*}1Q4fP+ z^=MUZ#-5TJq!&~pNHv-6)HPUQJCkxposgBUgFFKtM7tiwy`ERFImi#2C`aO%YbYaq z70NY&!JpFC-?^XR?^z{R~;2z{Bs4 zLMJ3|*QtDwTuk1DHCCV%8QMY>?mXj{nvVIx3yZLeSl|Jk>Ghk|DhU>hP z;k&^T4}Nck(SM3K)<>S?8ezPjUkXd|4gXD9kEwj?k$94FdCW4Ab;1Cal5P-4@kXyh zB_4{V7)2qO-nFfi-PMVMX{&MArwi6MY%zNg?XtO34e99bOiXttGz#e~tu=x4Ep2St z!l~~{S;&9Rb|S)TC9dtGp77;b3{-4_prrvE-g;rIvksoacK9c`1CvcF;OV^zZHsDg z^{6W1ex>08=|1DtwbT!WVf;il=FQ+izyg09e3Ol0F&A`-8pCX54t#Gmpoi)PkD4M} zjkv;g)mhQ5CvDjNUW}7#MRBB+_Tg-gfX@z2+-aWzVVW|HPQ z5pjujza)u5|A8G;m%e26JPZZrCJldD>*CDiPi$dj3@To(#2&4mEYhF_CN|^DKmHJ+ zuKs3C-0JxLK?_q-U7(qBmkI11GyHn&JsTmeeER(Vadg&ES*BeVmy!@9lrAL%>9{tH zbP6IJ(y1URD4=vBk`fAnAO>JzcVc&7Vq%MkiVBYLo%dVw$E-DLWOSbUxv%S-z4vb) z!`IE!4P_vT;@<_h<13H)XX>zft%9iZWZa7=z_;(#_*b8X=zD9hx$rZKsg}T**9Pdk zp@EU1T1dAgz=*!rr{7X1oYHdQ?h^|$(H}_zybwI*hgHq02so^d-V6l{zW2lLHAdL6 zjLrl#E#BIpdjw8KOHhi+gr;pQ->!gYX18WqS)B9tAAjZ4p5VDRu$&c=_!2jals1b>o zhg6`P%YnQy>Q}DJB(F|5R3_+sAt{2*SJcs4REw1-w6Ie21Y#RyV8>Gj(|~Lgww6QV zU>yz-hlxYg8_Q~EK;dWo~Doa1&}?siQ)2ISvsYty11bKk~Q<4D}CS73GWfMma#Grvp`$t6}ItytG6k zOjX{F?+b#VaH#SdFutN0_jF`n zMLdq=?UW_2sDnLSptx7o!)e41g{NYXZbeV2N4b6CF`$x%upG#hcfOhm_Aa2@VPojf8$79H^izcD@5JzSy)FC9mHs+9{$woQJ1AM_%55`X%4X}ewCuB&;|33SK?!%KkX`g z>G}1?$yb+{fAl}5dd&gK^9X^P+3msuPAthb8cY~@!|gnX>lE9*^x@hVT2e5V#4PDGJ6{~nvsuZt;O z!k9tlSBF_Yn+P_>-`5Rn+qtO_Kc;|x4TbuO90M$v{`_`PV=UMHB2!lR&4NyI;^VAO ztmV`wb6A@R4`~ThY}Lcd-np1=ql7B*lx@*>V=;Lm7!Oy%g!lxD6W)%JHEUsgyq?9q 
ze??jIGV)`XVV$ZlK7W18el;c_+x9fOqGXJ>$(z{jH3FC@7REmDkId%5JGSW6D4SBP z1e4E%8UE1()j|!(czlCO`0W_j{yKhj=+_F7G>wUr=0zW|1`Nz=zVCeMY?KlHa0rD(cInu&f+rW?(mJh zrSIE?Y9Fk(sKkA_jZn9}&Mq2rz)q=v?jcEtDU`w1m(-sTX$7arDNsE{o)$}bcYn~v z0m|3PS_$LfB_+(t;l+H~S<8v4!l%6x=bB~Ek>!uYqD%0QcKv*P4S2Uw3>$NrP*F|& z@8m^JN;tuiSE)nebg1s@4S6_w`!EyRQ%VfpjSz1lPtHn)Qxa2g_Gui>TF=IfZKT00 z)WD<1_AvI*fkj-R)~~>iZ1X(_djFT<%TZ3mC6*v|OFDMaKJ`_GHBSE~?&>7*7XRA< z!JJJv_a+SQKCHo%mqV=dh%PP{i9+j!H|(SGV8L$VSpsRoX}Y*RN?wMRY@`O7qtSxC zf6?TPZqvhFJAb^D`O55m=%Yo^4&&_$$PdDgh>5pscl}(v47|mZu6km3rXey-7|e#g zGlx|U;Or;XoBT}3lUCl?dX9bV62$ohFWJXB8Dc?p>-%h##XZNr?5ms=OdL~*<1h{9 zCH-+Wli0E^y^-vwiz=-g)X{TxSmHe!Pc%Rw@&DVUR-?p9igZC@B3-INsC+oOjyb`6 znhkbG$)GF65&Il|v*$0!$3^4c(AZo0)`Fq@O#n=TVB(Q zHZc=+A+p#*J|CTrU)i=M^0XAW;?;H~IKLTWw&MIqF)7A)s{~}L=$;-;&zaBy=x>SR1JtKcM_#%Qs*+_P=SR%4=pH!mh+x-F@*{3ixUoaat?~CcD(|xVHq-OU!VXJ`)M6m#n9u*uUEruTAi_oOSkJ0q&gPKv2>SzBLG3OAGj#0 zN3%2es*UI4z{T&{p{vXBUg11qw^!k)3vriR8?nT=fjEjbn4wq;wUHyZkYI-sjtNkb zUIl}{`IvaT5#c)AIPK>TF0qZ!n;(n7i8jc09>+vhCZ0=MWB1-V*fx}6{vF~om{p>w zq91c8Ke!;&8v43)7ql?M<#*)}%JQII&FN_UBu)C-I=m#d-CVjCRV_3{fq)xzZ;+>i zHK28{18qI)iS-zbLpjl~EGb3IU3mF=cAR> z(d-PJ=v%1DCLhGpNE8wmaqyNe-U-o+Ir*ENFm=Ov;aco8wWmIX`H23z9=Ct4ApV~c zy7ekCYehP`d~C3fX7aee&DeI87Y)9q822*5;$?|AX6A`C#6Zq2l|kXgdW@viA$P|h zYjhYdb58{6<`40~nP*Wx4x) zvb?>2+1iKtP?3nkL#_FkX8(gJ9~DJ(03R}+6L<8OJABhca6YCMv&efedTA9dJ5aaQ@G3ZU2f*>y2@Iz5 zqfhcH+fFg1u^>hLyLG$Z*W!&(S|o{aY=Gdu)eySig@-2Ln2_>=UxyLaexp5zjwY;T za6>B2D5Pb_{1e?(mOz|3E1Q2e_Z0-u*-WI+hxOUeTdPun`sbZLZTQKL%kyDt{LU-BPJj5=K$F%3|lon)_W=93p$N))+aMWDO#3t)#5AtNi`B&lzDk`c6LJ_HRfNnD>imtWM^dmq zvffknqk|8B%>&_Oy%5f!tFew|+J90t#GQ}8B5Cr;j+kNf82L$aQn6oTi~h~^rif|z zL}&Rq@?BG2Y|$1vN7QM5ppFDVOE~QfhRCW*h{)%_G3R2KxrAc<+(YV0+q&^)Y7#PY zGT=5p2|~VJXbg2jQzmKKk3;Zb_ZCPl=ElYgR@5^#6}3l=Q7~B#>(--8Z<4Z&9MrAg zmkO2iY%Kq`0WIN6v9eT`dZ!w2hxjlT{^P_?rz*6Gr(uTwYnK1N&)Pc=>+Ah+R3{aR zKEyJcLE7{YHPYW2aiK5>V{0;S??Vu-FNwgJGxVKUQ-fvy?I#{qIduL}?~Rl?Vm59@ z{zgmO%2|fSkB<7btJ4vhGs=9~0Ctx4L3&CdPQGq{*&-Xbm(`%f{5%tl>4ICcAaOj3 zQSgFvPx+P1j3t{GKM zGI+W{53{DL!mG~&kJ6VA3(o|ai}Z1ozZ74|bJl%tF;af4fRvaK*bmA;jCta&!Z8FM zB2Pcfgl3n#aUrxBN#<$@rX9cuiz?JE;4&Ehp#`5`r%6Yi+$xOtx(*&<JzJrg*jZl`3=r_o0}@&H^#vX*KwQEwb5o{% zaIPjS{85E#{)+IKAs0BS-P;50DJ6DoCT!`vZ~R}8*+l#|(B zi1Ti%v0xQ3&Zj+KQv+6D^7|T?_6XvMV?I8T9$_16igQ+SII~a_?_XO&`*0ol1nnS` zLOnSiyAb_33Sp|c_@mScHB&9PJeR|+a6imj7l49r9$b;|#fkOAM`;nHJfS4s4lAI0 z-7^+{W*%;@r(NPtTc+@1oE;4!kM0*Pd_8}Z_4ZKzczZt^9_B!%iY}`w?PSaS)Y+vi z4s26#CR4KN*Eu0n&n}B`vyhu7*ze2JF-@$AoqqC$`M>I5ZU1#M&o%ehw|aAA#woIW ztMi%hGEcTUg+n`6xtBfYIKnPJKFG8W_OiCQPg%G{J}bQ)$bx2`WIjB5*!yN{mZO@@ zmK^bBK}&3y?-@_(C5vI1%G+7RuGj4F%?P%PJaWMWYM8fwC7aJv#!jW(X5XT(G1Z6> z!=5EW?3;l$sxGZ#JI-!m>UKOxn&3xf$3Zr2-E5Znqn&-b9>f$2zq2U1!x<{Llg463 zofC7gBlahgZJGz|Mk|cnmqhVbI{&vEW=9>TLhQ^Z<{vCd-P3FIhZcwDbNr3#FlfK;&8e*MBdIq(!KjkzHO8_MP)(wn;h=gh*92C z8F%X_>rcMD(>!+6;~WR4Vk6Gpg;SeKdQ^#GLN0dut;z-&3vV(;A((mE#-H!3Q>OX15^9H6(&0;827V6%7ZJT zkZ+krzGn;WwHM;IUmVm+fi|>Z|bmq;Xy=Nty<|UHvYCkdWc3|1A zg}A{rACC?4AlpTsm!clsuS!s=ze+#RA{rvYv+(`PFq-|ku;?D`bNrm(`IE8&hdg1h zwT5~Yhw#B}78(r`aD@0O=TtIuR=85vPh|yG{!vFnbS?f|-wNqF0eJme29;l0@pN!2 zJnqS&?8;_DP9@*qUKjkGy8;(~DdT>I8?0v@$3tEpER1Tw^}$2fm6A*BIP$XJ+lD3Y zd~x-#Goo`k!R-|b{SJ3x2&Tik*$Cm;v@_Tiht0Q(aJWhw4m*iGnKy*V;UOk@Egz;~ zQN+*nTne;yl|BG|5Y2?@Zg&Xn!!7lAE-^C11e4_?bfDVJ!BzCvEySbvG_F(3wT`x4a6=78^*9H#IYV$&(w z%Z^rJ&Xg6H?M}M(b<&s{6v&@49YW;I9DcVH4ccyyE1Cs~`AUdvsep*BA=WGDQl}_& zM_pKfMIFT7?JUQ&!czQcD#UL48YEnCgTy4=TVH2je`q0i?h?-?a}}CM`);a`LUBP9 zHriO}@9^QrYu=wseRUJ=sMq7@#w2LG$Uxk6bqEhQ!IGa&kGGWdiW+-mGfUx|%TewiLLd1u$e=hzsj_@Nahm4#qFR@9})Bp?A}@Kx<6s-hs3_^?B^= 
zMUQkQQduJ2q%Xy#?<%+)Ws86YYkgIBZv3Kd_oesSU^rBbX%6I>(q4jf%I4q@E5pTu z>oIm&&}j3dBAD(HWQbKE{>FVxJ=$r%>CVSr-g+4JMuIu*!hg6Xyy?7tp@RlYfM2d5qbw!qD@a5XG}d1bk!bTnrRoS;*Z&;df;?E&FWrB zg5z5bu2FVcZudKGT%iolv}Jy2Aw$Ho-mSW@rCUOj)5|8j&udLT5XSD`g# zzJYjEDmJcv#mq$daA@aN@a$>?4^zkDZwhc2E5aPE6wJ|nh|5BA@%@ewaRg0pMrE%~ zwr?b^@-*Vxop9U;sKe#3ZUl?i!P4&y`)*T2`-%pnS4b^z2t4~c!_QdH;O{1 z#Ta`>H1YR0v74RAyY%8UOKNn-l9zVG5miO+P&ppSb|R|G7D_oXSav@Ub0%nKW^4t| z5p9%;E1}|dl1;mE5yn${@xIvwq9rS6?@2x5ud`77pc}mO{j-yofJnY2 zibS^}*meQ@_MArU)wS4~vj^YTgc7ss2(rUQ5PHrS`JhaUBjxB0nZi6b2pj45oTXBw zyDQ!vywo@G!(uN!s5XNCN;y_=2EuiRJ_hDl;Lnz3%pahRmuuoUaHJNq%dCj$wNHn; zn0AVk?K$~*$%PRwpDMe`ByauNgC1Jb!Gy;#i;pQU;+_>rmaeWmu zQ(oojw|peUWuq(8lJ@0i(Q!cs%l2(X+T8-!*DgRSJr6vJtr&{VL1)8kL?#ttJ3U*C zDXTHM>L3&;Uy^YrlyYkEFne_fQaZGUFRwszLJ|g3o6%NkfWVm3DEl6S%K^!FED%qu zj+HcT6`*mu5Ht&%@#?UKf#toKXjmeI=k#5<_oWuL2IQggD?r|a1_~QOF!o;qgx-i4 zRc@jDUTFz7WHzHFXHxt4?fc;Wu?mwnqbX-nj9cbgQR10|ecOp4N7=70V=M7eo)|2- z)VG{W9avl@$jKtzo;;4*vJVieC!LMNSNqxa45$T=JZ!}$nlEJM3tiouz_3RGV^%#Ms#UO-f@7>rkhj_aD5oq4OkbagJuAj+7MuU^d6kk8HB#5 zFXoKo!Lq9ruUr!GeTxPD(jJ(XyAi3B#oV0|2RSW8Ofwh7vPfY_8`GUFuKhwD`$&W7TKMj*~ft>3iWamypaP1%VXEt@c zoL`A+=XG((WIArD%Oc|`@olOcVSOkOrw*3FQm7u?QEL%%k9gPH?qc$MIhI7VLgTVE z-mTdVv8}#vDN#puOF23htiF(ir~vLgSe|ilwXZbT&0A-%9*gNuEYwr%G=ZWsA z&tzb7IRJc}X|#VbhSv*IXzvv!53DITA8?}cxixiEZZkOcmh^uWZ6sFMkv`i3tyUSR z*(^r`n@RI)65?Ef5&B~nHX1v^szm~S9!FyGm#x9}l3qj#&cpqskuV)Ti}NBIh_P3O z?b~zF^~4upI^NjRMy$h$h3MKRhV&s0T*`CAP2W`*9^|5(3~+V54Vp)NAndXhkw+@v zuuBsU1Arj%Z63K!-rgBaXp|#O&L{{?Uo7EcMy%K8jpXsM$DjMgxN$y+?)_CLx!!;S zs*Z3>^1@0(Biw}>WiLz-99W4*@4ArxhCDT##D0sFg&=(&4ZBQ{yTK1(T#EV|#hvlw z>|bWI^8k`wbwFSu22oP25YVE0?)5Yj+PtF9icZ=AiQ@H@c;t}hSn6xM=EpO&h77~jq4>K_2)rL8f%Mo$jff(y|aL`a5i7XZwzkKoHLm;+f?Sy^%D|VM705aA- zI8-?ks(yLcx26+m6H758R27@O;*tL^4!9YBez}F{P-q~ZBrh>7l#q5#7RC)~cuL;c zREs$n`z4D>)eQ3AQD>QuCZa;;VJkOvmRr`~$5TDXT&qXlbvLk9Q(PGP#x@Tc;lC*A z`y(&E-!CoP*4(D|mi&r!sfldce7Y07%fVjiQ_!8Kfrb027j89a5g&iD;TQGz&dCj< z$JG7#j~nd49IY`0>Ye(v7W~sq@Z_~5eybP2jkJsmZGK4a)P}EA3a0UK;Lsa+NQdwu z%3T z1qbpc+0TE(8FpNPA)*1qlX1&vywPSZX6?0r3mJt%72S1Lj{7 z!&*LikD?pyYo(y$mxjdA+1NK*i&)9=P^+8*G9RFB{xa;LeAh^S2Jy7Ff}gxKwMX5s zWy@-)N}3^Nog6+=_vh{mFF12j7v8N!FzLv}Lse33)bK4HY_)<9@3kHM8|2H4+z zimhzfhyKp>__>(Q_>g&&9jAG3OoyJ!WN>nRhRY@L9@1U=%il1F4p(c~mSo_GX&vU% zPI{fZ4@O_DMoRc{9DY0#U%iMC{iF{m_S}%ylqD923LeRu(Ec_Wp{+b{Jzxem)g36` zv=*B8*OH&39?9oJ;d{*=ocD6cd#VmYVv!2uZNw=$CtDwykT;9GR4?jaJv$N{F2q84 zNIS8hYwVDN75=uY!@B=Q80Y;w%!x3+|-%uJy3k_iQRUQVh#Dhz4#7kEn zEdRoTUV5Lotlq``hy|gpOBto+ya?}-#$(zesy!wK+B-wEM0{Y?k2Dcx`<_Lc&%j~X za_sn1itqN32v{A(tnTr{Rm=;S2{UmzjtkPFE3~@Gr(>Is0K!fs<5OiEs*l;A>nF`r z>T}U`IshltBXK8Y8EMZP5TPA=W^6qSPD*JHP1z3HxY-D=bb)+-0p2eyMc{Mlcj~Bz z|BYFcwdsPC(M2xZ^L2>&kQCQmYJsN_*@u|HXbpMdMtN4_u^G zFy<}_=Mf9IZNJ7EELNjzvoE&asX$w#3!QE5a6jaRjdwKBD?Q38z6;?*bUXRois5yH z?sN8OC?Y*!%Pe(lUnqgeyJs=|$r_v;YohKaPK0(7dwzuaKE`c`xulD~4jMXTAIL9K znu#)f<@R0Acn8 zJX-~B((FpTHp2UT7P3FL|z|8}+XSv#sW25vw@aDk!p}Clm_kx9cPlqI5HMVHALXP}QVYXLT)FneWQJzR> z3Grl~()(PgOGl^Q903IeXb=v;WK9*Z0eo;nybc3f=Rhp26*gD$@gc(udvBX#pAh9X z?>pjP{*dI7!x_K#4lixnD6S@+pTM;?eI+48$}o>KUpgF5w)S*W}Ge&z)p1Z%xH(Pca;i zafNO0eRju~4?bnTn8%GV7QW>HTRNB6-nRT`yQ_ff>>0cF&5^1B~!Jlf2+xMUYvokHgoFviHO`x|A}Pm=%v%?t2cbCXY={aV+eJp~D;B zi>)7sAN#HkX}mKLaLpV0C9QF^(FObTW@FhUS)BVf73M?#*v+=Ttae$KzOtwqehpIZ z$Sh}Kes`mqyvDZ{&c|Q!FMKr#A?A@4-0}vnjj{+)9o&d?RU(Z}QD5;5c}+Llfa`1r z`d{rOT_Fp-Ro-Z@(1nhjFh#N24ZnA(d;v(FZLvX(ytd3Gv z>3le94ZHDk7M=Acbzn()bOe_qB623!xr@v2@k1nK=PfWuS-{zi_gG6x8jeqkfORMF z?P!J=x+RQClU2kv(ZZdAY$%^pz)}SZ_!S95?(td_eX9ea(E+hnaxlLM$Rr z1IMQ6$jQ7*Zw0Z;1Yka_WZwGA 
z+|rGpL9?}RwKjMbpJRihJIwy7hlBLpe%45yb1g}D**7!8I8DracbAonkiMQ)h->_t zU?4LS!jrLx_`MpwUQtkaI?OUQ8^XM*8$#ziaBwEEwU&;v%|jY^V==;_jpss#{Pu2t zsT1~h3TDm^XTco*(={aCzkbHd=%kD>QKU zTmi0mrQx}!7vk@U!R@vu23xz)_qascQ+6xNdUBbt1)z9Epo0>hfZca%|Wwp zE^;fbyi|s*&V2AZUJZ}SbFj@~3rb1nZFpk_Gkt>PgeKv9(n}^;nu>FMcGNpWK4+(? z*nZst{@0JQp0Y|f5@V$9TN6^rOWRuEga2}J5uu@mgdZG`bdv*<--FBzwC`1+toS(P zToo%Y7$^hxRBq_HP&UVW8}@S(EAx{8Zi_F&_CXV3>x*&6N(1H5fw-D(f~VA(Y253AZ2SE>F#}cDzQ&b06s$;ic7Dppmm4#zqD(-U)l(9 z4h39G@rAw#976UO=q(d#hp%Z-Av@Z-(h4vs_SPbWu zd%)3mDPGlgQ^u6M`p=2unG*!Q3=!}fOohOv8a%p{fZMm+Ai`75^i$=rtU43x+chyb zR~)MwYt&B{$l+iYWn^iGd?B|3mg3~mxlj8a0dahP`W7R4E-xH zbGe+xTK7^+c`bx(`^YOls}nERwL)f_Fa7$}80xk`YD5k8`l!OKZ!PBbyFt%e2orXp zw6me!HA#CM=?=lIL8|n4qk*d~vCx>dAFsLAA?egV=J68caMsr6P?TVs8GGHI? zfq^O`9IuK37iGQ{YEV{s#WZ{>T?BLL;rr7^oZ#0cXf5)B*hz-XZp6m=ubmi91&Cdn zg!CGIe5M@1u{~>1!#e}}>cnv)sZwL!Ar08d%3(!rCN`&aVCOEr zvVgg%1`eE5M<2}`%P-g>qiHrS-B!m2gXK`4(u^sAW_Wiu9~vvx!azzBewupV|0D?g z8=-8fStEA!rNeV!Ag)^w8*9G4c;+kwQtV-*|;x+xwg88oV~VIZ()6XPL33Cra7plXF?_wICbP{lUXX}f%soGH2jR6YkA6!8!G8LPRYgD)&S)G_Qs#yg%FMQ0rwPX9JCREq$JH)^iCH3 z?>_UZ=D_$S4qRFNj+uFrziqxfWW~locy5cU74spZ$4hK3GxCcbVsQdKxM->ZjZd8TyNwIZaROLRy>7##{U-^{K!i*@ zT8gNb%bAn(_BOm2r`=Z_?d4wZA&|$9c7C>aJI;+fMFC9qtkGSy*cHv{#E1G-0>P7{ zmzdM+wmyOSei`yR7Nhg9Bh3aGc=lQir*=_C;vFXpB$w#)oo>YrA$op8ig`N8aC>ftf>IA0UfKx%HPo+{odGB5S4{0Nv85FQkt0H$=#x@->G@7g z;)gSC+)$?szy#ijmIbV|2&3^Gm2jbE9Wd7(V;dgTLn*mait;y>CcGunfX3B|`?X&C``lFdU(QrE#>x@>DkGvYZ6PATEpcst4sJI+;z zu}nbxCSCOBE~WkQBx`$<2fL&8ko!&V!^2|4$e~?$e=+`AS>c%05<362vH9tIcwN!J z?O8j(Z`Fo`H@e79KfzFMORfD zM5}c$`ych7zNf4WG|;_w8ZO$mLRT*tcOQl0{`ei%5pju)EOo$_KPwRTnR3lL{It7L zt?l%%h;#cY89ai{`1!>FhjzPSM3?$+d)<&cFcaUspR?<(_M#PG>CZ-Yb^Dv=3NzY)X7oLbayGnkwfz#X;_9xr(K%v<$bhM62X3r1blKTq4{Jpb}q5UIAw=>IJEF-UKbL_h-eV`=B)+(SPGU&CZ@bn9pKAJRM&KQ7&z0^@%`}Jbsl@Y3#%G z0*C|_A>lzdE{7;$-tJ;Ni_bveN9whrp7^*g57?vQ8}Z?>44Q;3}PTx@&*aZat7O8DIXuaMKMHgF`4Lf4skQk0+GtA$Ug05*M-y=)sha zyP;zd}24 z?*y;6`gj;u4aNKj%s)w9=C48cwU%KqpEnM~%R|bl7Pr)A;OGS-^y=y0iPCnR)34WH z$CwAygK0P7>J}b01@A5D_1xwN zf$DV#>flGPjynERcE-rMHgs6#!`U|jvkY1=?k|c3G~X>hb{>yE6yUJZX5zrqu!K=g ztRrr^Uc7wxnTFxfgALRIz-fPvl&s)x+XT~&yZ)rkAVh=_nwv#SaiUjdp=oHMuz@`njPWRni4+=3X zSBRoY%CmBY!%N*A2D!eZdDlT)!4tZQ%6PnY87!s@VBerMR??2<##49j^`^6j>$XFv zXg%7`6`_f^fW7Y~zPCHZnR~Ya~ zUW1w~3SqQ~~L?l@rM?q@|-0zTZf^F1 zQh6bswQk0MlQS-lxg&3eI{BA?>7oY^NE+jc9OA?^(;ew)Jwy#@cC|Q0UBblQR;b6j zFWiXZ5yPXgO18T1I8MHx-`hJsXdj59{v;I)lb%$5EetP*vx%874+UR}p=NRe8RMGx z)6jrQy;+Z{OSY=Zum1xPtwj=7GH$a~~~Bp2%B8Lfe6Q>NC5zuQP7A7)Qv z+wr3C20RCQ!R;4=$>TxT+fSZknP5EciorF3)%c)okK~7(IAW2HI=W+ido%|L_B_N% z(#4St9vE7dgaaP!c;^N30ClpcRg_czeTr>c+z+Y4!AKWr$EKsi;-sw4y1z-_Q_Mn^ zjWl+5W5c;93_OcJuW0YAT!N8_4o?wl&2kr-QO}8emjNK27A1}eH!8iQgNr&8sB+BAfH@@^#8n&eAor1lzdQ}%%&QtkVhK7D>3#qC(L z(HDHfVbDzK!awgV7<;=KnL!gQyNweuuL@EBU^i}ya-c3Z2ck~KxX)XR+nEBGGK+c= z!Iw!8*?jbdmhqxYdn3`)i0&~nZYMqCz{Jy#S) zqmhu_sDb&i>v6J@_OKYSQ@eJV^+GQc&B7@XuHe#*H>4c0ve{_i4S zFk>z0{ktGc^Z)EsWw0!qhj~Xc5f)mF+TyQJA>ZQe&$IA&b14)%Vl+5o+QIQp6Q=d$ zc=_io(p;|L)gCvz3Mt1T2N&=~XJSi|IpP-;Vn)LvY`ZLh6NgK&dus_Qj~JqyG;PCe z#K*a64=I6m#Igghq<_vYd5GCYOhZdS6El|Bi{!V9F@JU{zA3H5)qD9kq7sbhFUlck zr;U*xw#ZC0g!b}2tgGZiUin7EkpEqvaX-RNQfKOdT$J)G!p$H*=m#^j4#&VcNDGAl zvIu(X3E5wl4PGuYL;6i!@;GP0g8tkcl)Ehoi$wQAMd)Z3Ln(?;w%`onTTF4cgP25? zzTo)J)*vC9`jT5}aEWINnu7bOvpA3Zx&8=kbj3{CgO;imLd%>F>vSD)|06L|WoJWl zia9>-F2u{@TGWM`g=>kF^X#BJ;-?;bAa4<$jRtNlSc1E+b%;Yf9pkUpLH2MizDCYN z;^7QPKbv40jTN|dBLpqYGFYxez0=PvVD_dQ9+6?>@19Hk>{N(t@p9dp!6n*& z-5#2#7UqJX4f!xg3vvF=k6!NS;J%x#J4w92efbVh-C2T=g<(*Cumqb!g0Sa+4%U5K zgf>}qyrOKlKpt`Q+l(Nzf%;4O59)mFs78&IF|m2Qp|~akH~*}`^KXIRJC+ai6kGf! 
zQvB6M z&qo)=mX}!c4qg~jPPV_+1@V1)l!H%Wc^aaaHL{CMX`Ko#4Na*3%+Q`2ww=keQ!a%o z0uSDo;Y6wflq>UjkEyEXq0R>o_C6?jR{+lt6yc)Nx&UspZSH`P$? z9Dpqef7rJNoLWjET6p9-9dqy1VpJj%kIws`frEOR(&X`(e0ydkQPkVn19!?MR4Wk& zOeCL}UoW+SEWGe9#stFFRtfintT{V z5a#j4+@(?Ma;G?)1QU_1V2yqeLG(=#o!j?`e37)1$a-Eve4;vBJ+lzEB*Jm5QveG^ zlTpc^3f$R(;zz$(;m0h*7p}uSk9aLEg)aQCQ-s2JDo))Z7Un=7_OGLEJd--$oDKYv zH{sefT})ivhydFpdIl7b;~bAXM^l8R=wg+OCyZ8_;(-Ho>R88Pbyg9Q_s+qpAR*-O zXv5`06)RTUiq)KnD9^)E$49S@ENMB>Xl6Wo{5K(+r2v^>$F=j;M2EIh{aDVJ#!&Zqy+To^JpDWAG4 zllX65bY9NF84zD`h5*!$Q6HI{0r=~Wz%SdjR-OxqnZZRBQ zXa>{mH8@qe5DLLRnAKHPn$>=>MJuM@lXD#2PW8r1JtZjL`~5e+8D|E;^E`X6 zb>G(o&!CU&rDQ0Y>)dh2QW=|c-Aq=$RE5Eb1C0N_`z)?)83bygV3n7NU$K^Om371L z(eEtQ;{p=K#@VWA)QQ|Kg0_A!?ykL#*u19}r=lKW*MfYkkRqOx`XX#)+nLmF4fL;1 zg0Qq4I+w=aKg!!53#z0!7WHg~WbvtU0gR4#<3vahO6lHxZS(*VZ~tWVnx3$=jsu^L zEo^r3;i613-2?LB)ux7odGqnOM3(wVa+vUh2Yg$|A$@o~`>5-M9IFcGEHI{B+GJCg zT6ugeqP@CKEe^a`0GYGIGuKr@N(;!mt#30!$a+0Lh9_3SFHbZ$1b@25~@$r&5^<=eCN8BpW zn=!Cn(S_1BE8^^(f!<*);=FXjqQM*Ua&}>|sxFjPrQuk|1Ps2Cgy0u@%nAvEfDCm? z@2Q(eELn(hy?tzOm~u%p|2lbwJU%0-)akLolHLGRd0E4i=Q=KV%0qsLSh4$zF)Ol+ z`z(@mdT1$W<(XM5#D z{HRqxTk>&sHgg9{A-`nd4}Q+>f0MBB)LYiuAB%}L4#@qi2}gJ!{Kh0G|D288lxL0J zG=QVt$JmM~-2uM!gRK%oR|>fI$&72HfDy^PUtPQ*m5Q0CERk zu};%y6n2F|Ecx)(MFMFwOh2 zg~#HxwKz^nc;TqO2Cj1j@JOx#w#ADPp*NMd9AX$Xr9E1A8MLP~LFJ$^Os0s^F2Mvp zo`qm2RR+`Qs<4q}Ipqu6p%~?Y%Az&6sAPuh>`FX2q7Cz4G3f43!|pDB7-VS?W9$)E zZ(=)!4+s<6(FmDCCGcwrfMFHwLq=!8$#5Rbg;ikr;XRy*FTZr+RAkzxQvXYY+j>3( zg#`YEhGT4dh6B`kIOO_s zg{G}HrW}fba3ApBnKC?%wa4c`J91@JqI1+4-OCzqg8E+%1=mCKk{f;&r(-R@EVP=w zvfk@CXp$!uhcYpyKiZo-bbTf=-+FoN@Zc+eDWAb=i+;ji@W6 zJ&o%@JWV5hb|-bfwMMA_>kr|B_6S`|9-DJU@GsXkIq#o}NT*wDb?^+F)*gc3adKwV zb>i-1>T4E8U=RI_-6x*H@|i~Hl`DZ~IpqSnws7AF2x53v6LwtPil6;O5DGYq#dm9= zeOiL{Np3jHSqJ4KA;|GuicOO?;mH1_unzEm@F5dOQeKe5rDv9*BZ@Y3AxKBalz;0Z zmU7q=XFd{3H)9caZwq63<6QK;AkToC0DccQL*sxIroNLvKd%dphbrUAkOsz2*dWU` z0EzCZs9sVI+egZHIFyZxtCi7MLwwF44zBfELe<^~Vqvn7(XzzAvKVv~=)f|kl^c54 z30|x3u~S~Or|z#usT=W)D9d$X8=a}^-5`GFHXF)&f<$31GJn&rd2EO#qd&$mPSMyl zVHqykirLJ;0%*MpQYbJ;5|6a4K>%{hIR>SY}0??+ORL$F4 zEU%mk1xxa*7I{Ky=T&aedL!(1zQ7)dXkqr1rLc$;hk}W6sjO%M>;-C zH^wcm9Gu^;f-A9i8SlbkY^m|a>q6pU+qfe%F#);vEjIJC}W`Mwq=)_H;5 z`rXFHU!G%kR9CPXtv*)hA;Sz;zF=1}g_v#x7`yDsuJd+V8Wr?u`YPjku25EQV zrTjU`=GpFJPj2ynuSkJo)GUs813|p|=7&M*XG-ZC!r*})9214HJldPi&?a~xx*dfJ z9qci{NaW>d&j94}SNu~OjO8Gq@XWEV@RDmUVg1hxOJ4alXqhg=f_N_nJt5;dn z1r5gi1=EOSLJXt~5e#u`;Pl7=hT?|M{q>HWm`=>h0u?k5+CV|=HqN-4V6|xlZV#11 zv40R>o;yLS)eRn&x0n;L?hLAJU^P=4J!hst(ZS0^a?WfRvlDD|kXYg}+n|D##MP;R z@Ea5A)_7vUxd=EeKMXd5ylAV_pdYJ=yz+D0jF5@wX|BN3&;96Rx#&7XyF%wIsMrXi z&vzzPom+~61Ln9WnTwtF{V*_~=eUv!L~_V)#-9Ywue#`pnh6AV;P%Q1rtgX#v)HGe zuvwLbP@ZhsH49^O2Ql=%X5vcwXI7P3haO_P9cz%r(pxi;YAOh+ZXR@VoS{W?Lfw@f z2)ogWsEzs;kxw+i=I{rC6(#$+!ckvgf^3f6O-Vk zIfecn`C|TUN8487Fi^f!wXY7J#YN1Ukb`oRD9Inmr_#CnnfrZO)X+;val9n*^4#t`l>!`!>hxnfk2=G>b-yZ=S z-z|s4-%+@7QwSQSp}4F&6D5azU`!d6Z9N4rxh8{i$wmkXm&4I-;t-=dqul9ST!^*9 z^>Q%?I2+;T(JVAj2H;UeG@i~PH%f&SviACdXFJWRWLoe&Odq3-v`_!7hdBKnsFwX@ zHT%-=c)uP_{~-sk1QlBHZ@{b0P2C1dpN<*nguN@h^)oUm+W_ZRgO8PMtal zA(%8%25v6-!~jZ1fK?`OzNg~U7IIy$3qk6e<(QQ>#txU1LtslSpER!+EG5e*&H|`|3p*kbjUUGVPa7wQ*hb)x745`m zzy9SzC>-?z$<0gi(^M~l6nBb{JK7I_DPP;Toqmt6e6V$t zX6chUxHq=y;P;w-_HMo?WV^|K5@dmn!^ybuia3pJnb=u-hLvaUfTb9{Pm=9mJV_m= zSI8Sry%CPLl;K}H{~;n{mZ4xh&BZv2k;gj$suSpEH<|#aOHx?2p4cQtS_m;tfQpL> zB5kFh7+Xs74{~{Hq~dK^3?x&l@ifr@1E-uQgKv$RND(}A<^^wiB*LW^VYu}*D?Yps z!HbA_HKYpZzkk?S0S_#>cboYYMdOxnJY<@FGrspVc>Z3Po?BCB2QCCN@*O-<^+Mz| zS-5{u2iNc;I~YM-brn^J{fI^1rYGD}KJ?krdB@UZT82+%Xh9Qtu-)Z2T>ssHGVeSccO51N-!_9LG!5(jo=k`GfDKWr4}T^5Ev 
zv{(E}2e#rER|uCb#Ii~QEDRWDUkr%xJ}(Kq5r&BGkA`$oEV>ry;r3(di0#ruZ21@V zj{3g;#Dt+lOp;>9$84}A8BMz61^3aT)Xo?arwr#l^P6l@Y(AohZLo-bwhCsa$ZxKT zpPHHUc`U&mrv`G0TVr6IGK!RMGA~I7OevLtrr~t(A5q6FT?vz()mFq-xx&;AD>?pAMuN*4}7NzW8L=Xc3KCm31QghEeAbIa^I9BV3rh}zvEX?c8?t8 zc9PsJNjXsM9b(@?J*ek8fZ3<=@!m2Yxsd(qs7f2jK6wSoAC;=h?40 zobg2|XmQzvff|_VTwaBVn@>VvDV+oUn}jkBLuNrSibvca^m-+p$?9W6`AP_N+2GA} zVnlFOLvv*_ytIl?SlfXm2T$O?J;r#dKzjiFbYe{hqODMf=6=*&ZEeNbpTtu<+K)(% zKBgb=$12+Ue|<*m?8a&Mb-e>)Pd6g;B^S$rO2{!`kG-V}aDmSn=>~lW3p61{Q3)dc zxe+t_8~fJm52Fd>lG*Qo4flwK;2Mr@eey9{#iHrwUhLPKgkrx2^d6>+0c$})s})Ap z48W&$Dw=36?SG;UQ#5yT*E|Wrw_O8hTj~s1?GilR<_)vhMkM(iU|ZVkU_y7Y|7d=r zdWMVrYs#UumG%o;=b^sN9FC8R;CN~a=B+ozwuz}2l$nKL8(SPKrB0WB9de0-aV?lk zWG82%NVX1FE=pm3$^_ijtH$+g@~n8L5^tY|Clw3Vl#+%reLxd z2YL&H)Vfl+!$uS(G8uVkP)i-e5t$vRafl%X@&UVZ2`I}GqOAXWCU$fR9?H4m<#kmYNTi&e{btVH zZX;+O-Om(P<-)IdJs$B%;`-$hs8N@Ag=-tqH@#=Z9T9lCiMT|{ROsB_>^d2z-KwD2?23jmDP$zm&-O4c^~QYQ^uYjoIs0*m z@^V1~qR=lQ$BE!8 z8BBp$G(Xb5NnsCl+4+?xU~RD#^~PpEcAG6?F3=8VDzP(Sg#Le~I2bL2x;r14q>DGo z{>9_UeGRPn{*8Tz6#-ANCl2L=V96~@sC0f{V;zc667)qO{dstuRELlghWIe48OM6` zpf0!#zduT1WM4JD+4y4S{16OJ48?K35;!c@puJ%NSLAdJGMCFDAU+2zi|dfOUJ(DDkXDPt*#T;JQ@@DCR7KhT&&@TMaJL$3PnY2P6x!n$rsC(xMG&5I2s0C?2fw@?T&L|A5w}C@ zfIG&&B_iTs8I<_M@%dl@v6I%JH;>qMQOan68~%9);2UKL?B?6zG@XA(&ZnZ8p2-OZ zG$5(62pzU_u_d()T@9j`e44UGB|p?uuK)iYeM97QY)W*1`}G;nzA28^eKja14vkk>0ve5Eu({2J_Lnr{ec6cN z-o0Y;Gr4sw2nWdrte-20&eLm&Mb?Fp`#G5V#~j}m`eNgmPza0dg1V17d_CqP zeN+}TsRP_SW_*aAR1L4k>kuJIU9aQ)*k42Zg?l16F&)_3*a2B{E2NOGH+lUs*jQCV zbCEqtk67W^vS7**n?NDn5_JM~Xcgo&{oZ_!>CZGn8ik1*@NU5Z*hUa$|*S0K*O?` zSJ|;ZG1}WIKy-)~djHO`-+L9%^o@G_0Tvwh%lg=PHkqAXG|K*5N=3|+=@_}-0*@Dp zaD68N6X^#mPPi6(F5P1m%RaK}Vaf9=AiuB?KFE6zCtMV6gZ0d+ z>J+oTHWdd;##yYF16KD^{a`-91LBMwlleHGc zc-3*AnKpi7qAz#i4rLxjj2f|wGBE!hL}DHvd1{xhL!i|ScGK4YDHk0PXJ};1R~!zN z>UNCJT?JnMYAhJ>fgrsvr%D}ULxP&{`|3>h7zf;aKp77wAMCCtKGOmNM7jx)pRp8O zfx(#4YXM2E9$b3Xj=xD$plAn3{g*^NOJc0v`OCIT)}ni238oW|W{ROIc-Bdyyj=vM zy2QQTs04|^U5xj9IB|FCQMOqh8F7uMvL_cnU?uUh)bKHLHwIRjLE}OSLUL!oyZHxO z`_K>1HRw72eJVC;9OCYzxp{!&K6Y?a0o}ybtK1p{}Wo4+Y^O!rv94fhz&0U=q}QV zQM%J^(Nv|*Af4;Bs$w9_6nFNB!a2Q=J{$MgSMn`oap%K=Tq72}Iq+>%ARd7$+`iK1 z$ISx!_5IOOC4n2uzcbE&CUzA|!tymgJ{EmoZwm)Fwf~6GaGkQ9*R%1tJc0b|3`?ui z$*cX5-R^UOl{WRZw+}$-y*$#=`7nM@0*cv|oU>DkVW>}>VV6KUpU%V8!S#4rBZuS? 
zKKL8cUEP^{h;GlBa`p@uetgOJDD&ZRlGu}LrO`=@{PA<2+1}w7Y>B)DczRb5d)E<< z2W7}JYKZi4At2%vd!LbjhlYvR8vTrYiWI| zeCP#-HHhnIfb=(S{E}IMclYISo$_qc<_BP%ks4(Y9moX~h*$wf^2~AZ^^b%+#{LK$a-Zc+4T^06Cf4N7%SN`NJlqlap-@c||f|UTFx^hs5=( z;NXzp7;F2dfDy||D9O8kH^3NtYTNMdW(Fea4uXfgfR;9wabFA2JeiA`t2nTrIZV2j z3zokhVHGV5O6PuZ8pt!2eE$|3-|Ipio1-w1NC7|fv2R|8g2Oro%*>8~y}h)lNT?k| zZ|30u=-+$YZ+vai5;)MzOmW*5a!qbV(jMvqZ*G9xQyE;{vk0jTt8xFiJd~#SQMV`n zzb8bZ{yYca^nDnk^J>P*^)wd^2P+MPUfyMd3kSkxN*R_vPNw{ZE(Ra+Vrk<#M3)d( zW3LC2J~zO#EEyk;^rFJL2p$#Y$hsH-hmuD4oG*h(rZL{w9l#&y#pG99hpXRhu>1TT zT-ra2<_y%KwfDs9=jFH%Vhru2nmDqMT-PBL#zW%LSoM`U=dtstKeLBAt(n+FIi6z< zI(VI5Mqapuu+Mu2gUaJfNoxjHd?#<};S~mfQ=3uncP{d-?SX&88st@P2A_5_YMki% z9NUWEe>M0tOxc*uZtyi)V*HmWIOmjcKf@fIqBXD&nT^0y4xQ&BaQWK_oGC2Dc-9Q8 zs;$GwxD#duoMPSHt1yyZjfuSr5c#1F_Qa)VBDZ?7gB2D&eZ*SHEAuzj9g4K$*Lm@V zVVKx`e0tbh&4J&*d$wPYp0~EH7`7E7Kkj{YY{z7T-9E&wJh;ya9Url87Udl0pVZg8 z_=#mMwF?REr zVn5BJfLQX&n_KVt}vOx&vMs!g{zt-u@D|Vv50xMr9pf>O;JNRr8@%F_rZ&x@X zho{k}+OnEs`ITUMnVk7nci#9dWyTKcB3Uq#?GuHFnv3M052y>c&7_SyZL@y8Q zl2lQ2<0I=8RKVxUtsI{mCCZV1X9whJ@lI+ECZ-x={dDRn1Wv=ohw;$wC3o(VO9;?Y zLOgX)Z;Zn_syIZmlpWn@#?+9V)o6=qFP-J)YUmQdIJ zVjcA|jnLligKe6=@b6lR`ZxZV6C?uueZ(Kmt2dsu&=?ut!eOLUh2&XVutJSYiXXDPqm536j`>dKdcxIEMq^AW-e-m?ldm=FsiH|YC2^nvrkzJLGzYCR6 z@vsnE)GTpm)peHmeG%e#deKRHfM4GWh-n}VCCb0@J~M|AM;bD%YM3BwL>W9s96qK2 zrS`Q*uqwgVr7g(3>`t!hK61;9Gu3x__|r-J`Bn>f@1t`B?Y=|rtwhhO25yU)64Hey zqp8#ZIkWa6wTw70Tvu#=K+ZhJX6*8*#hU}ekbXM>r`At_DfQIcQ_gY1d8prS-;PbC zeYg~n?gf#<_8JpRXheK)0eqC{>{Q%{(F`N}dQbn1S>w}@ zVP-93hTzvVSo@z3HXWnc;=)1PR$2r%`!zTu?FqNeX0)V-qiK2}CLbkkv*r?{d`v^t zi2#fppM#KgBjfK%q;PEJPV3i5eVYHjljz(*g#p$QTGh!+4;~MyPf&pyNEqBN4Vvs zgIZRLWRnKmv?oX0mvC|!i6U};3e6&laDY>em*2b)lhFt+PY`~dX~+IU?vVep7yo|! zWUJ?pZ}`z-1Sra4Yh5Ba3%{{7ln;0`&6{&|zyU@{6L7A{1Ux!J;16&n?icxrTb+re zSpwtsLI|CkU^;z~v< zN@Ny>VoOXS0%Qd!%aDMMuK}oU(?jxxbx8crh8P&4u=X@Y`5{m4<$UsB5TD~`)eOYa zyrol}I!z;*XvPJ`S5<}uGwiXUellg>b73fFjVm%%5I-A-ym|xTq(q=yem0&O>q1t% z2s<V1GFBEaFu))21DKfMPn^ zEf&wJ*S$09npVv|E3GrxS<=9CEg!LaD(jfD{sETl)5nf`HLzC~-?7VSbms1fXTJVE zO#N*llT!GtKfAYs2|Jx+vr8J;?G-(&EGd&s9?xb0w@O*o_5-ZQt%%JQNMk+)ebnc% zWLX+MY%orRiObY8d#w{JN8u~KJkghtImV38ok?|x@$OKC?`^NyYM;KyQr%9oTa=a|_^z^_XJ7N2v`vMK}t zTtm2cxL|F-3l_~ozO)DvY!O{c_aIpuJ}ij49C?~w%X3ceY``zyFl-YHhVrIB9ACbQ zKHIV|m^J~)w7X<2zG0_>mp8Q-kk^ z

qEhefvp8X|NtrC9_!VsEfWr`fppGy+EC3v1LC$GXpQ2yJr0`|UGvelri&?cztp zs3?YXX}4(3P(HpBsRD7RmspLzcO3Dcza0uc{Gj1L*?dh$sOeMYE{I&gN>h>CyAXT6 zsB@}&wUI~Xk;^sF5FcL$-xsOalH7nnt{(Zcy`hsIf|#Chh-6K`o|tIl%}Rvo6)VHZ zYcr5c&tzfotB@_<3_HgmSjEwqPwNf4XjF^qLd%dIZG}J68u6q|65qumAT6x}$6v09 zYOF*tWsHP6Dd#&Y4QbcTAiN?N>QjlQGIcIYt2_~GxSee@-3Z0;3={`&DDz&1qWXn+ zbSxM~%fhkc^<+#g^g~o;K6NZp5pW=%+_lxHawooP@>*1>8(_-170A8qium^3cwF3t zEqX54vGgEz{&2?6fnMZ#g_->Ru^j(OHZWE(hxX<}*k%?7u7n*n(d`?8>Q2qhB^IQ-H|lo(W||_jxBKggTbu$M zQIo|=IojC-FUFA#T6lg!7d^`@@M$ZsIza*D2Zdqn8ppjCZ-H0tfJ% za^l2j;p;2f-F{ZXLw*75ru(?4b_wnU3B#p`SZm^H#DFM)bZsW&FX|$*Z8A>vx#QL6ZY{!T=I}AE3Vd`2^Jr7W)u9;t-aHN2{WDOQzsIxI){EP76WpPa1Z<0#DCHOex3}U*Rw_hh8o+Y)EF|zP z!=C9zNV!S#{cA=r`V$6eaci8ODTakZWjOS_1u4IqP;k%<_ir@7BghQb_>0kQ>5jJF z#7>e?f^D6lAu?Cm@s`dC{_T%CY4e%eV#^{Sm9B8F`fCzDHlgn_%s*?JK{_s6I4`*Df zky54wgFAZQ`y55?z*-pmN+3VwA}BvG!OIWM_+_1iFLdWP^hFe}_hFUC>3=gj?m1gs60ai7|R5nt&p>lOCL5k>M&&WT07 zv@Y~F1rY;<@-mwokQI6z8Jmq!IbD@Hq0#VLP|IDC(}<#X$ynYc168$h4E2(CS)3fA z31&EB-HfgIlsTU$gNuO{D5{Nw^jF}Amm9gpow1G8h(%?Ql%4_S0p!vZXm&C~N%LGi0uUL)l z(ne-<))D?<1&}dpMjmBDcWrJ&Z+#}5;~6rXsh?ry5B)NAm}Ldv?JYg7v|S0DKeWS< zJ~!E$GjX|ZDJmq$&HeEXv*yx$?~Sm+at;p9rukA7C3`)>#UPt38-Vb4R^j!?0YK22<6maPn8h z)~US^8<>xqgV7kITwA=j05)fvK&*2UXEY=o7YBHtL_KR^k=wXwx(w$Od$1-f8nOIy zaeH?SdcI4Vw#;J)$R~%TYzef2wK#N}B))Mp6dxqP(sK`P_$uJ|KYc+N7PE$v^9H!Fc;Q!Z*O-e;_jp#v;r1gK1n;qIB2+(5TBaYB>Gnqd@T4-HYXC3 zBdcEAeklJ;d))ckAyEZb_2Ik63v;eM?XhV2?J3G8AwY;EahpWIyI$ zHFqlI)Jrktm@1UJXs7m;dKh*u+0{{c=l4aUAw7>=6HZv=Fb@|Gm(V?03^PuXTRXso zxD)YcTJDFAL(*W2i*WF8Dt7x+;L<1ZDDJ6+Yk)T1?n*$;+Bo9%M`CXoxnln+<70_0 zW(jl{Cp;rB?@^_Ahb~4z%|`IK2Xse4XI% z$`%jiFTo~%y89ec$Fc=_$Vi_Lo>lIsGqc4|;s)sSIiU6Ua*SM_fs!f2dvB%pj#M1w zUMz8;F9xSpgkn91)R4ngxO}*RJJisM_43D=QN1&Ai2X9g--#fv7}#0O#s^IUe0R6S z8;w7>%woX#=tv9@>R(iC@%j%Qy*INS5 zjG7^Nz8l}q8l&o5I%KM6!%(sj+V^7d?vyK@WzWVkF@N;mUxDXmp0MA?i(vO>267{_ z;o>U_DJNf~%TllKs1S^Ynq3b!waefO~F49O>AP0`dKZf>F&XdS8M4gU7=$AQ^o32J;`Zm0; zQo<@b1!B$f;^xD82##N29ifI04wJy%r)qS*@PH`!uNGXQIpGIUJpO)*-D>uO=G_(0 zHXzpdSK#d)x?yWOgZ$#sLg$$yde=w^p zTi!Id9qu7O9ff>-9=Zok^ znVczR3*jN60#lxOKOGspe{zl9P-XKAjMe|0n|ea?e@izxZOZN!Z@KyE<*)@wN zC34r-!{9glSnew9nFh~J@Pdix#wL`_p2Qzh-qYtSdVmr6U)Qnb3CrX2k)To~E~oiNIE z@?U4UAIqWfY9;OMjBqNy0+}BKAmJZ{aq@!{-84aoNd-*KWr07}9S`4mB5U4NMC?u= zzvL_&yA=hKS%#=MxRky0+6RBdIVf~9M)u7ToY+`Nov2WZiFjZ`oe=7OyTD;$29$i# zVHtLky}A&HAnJ0fEeOVvZIp2f%)^bv)NN=t!#WvFd`}Z3mWC+$)jqJ!N9)*Kzh2{W zQ%^BZ8Db+`5XIoc3K(8BLfSWa=IvC5^{^UtJ{2M5QXY+G7%?oAkW9M{lWZF{6vGfQi9SpJePV{|d~vFhGWUYS zsQt_E{rm}*{#K5>tkV!^VU7=5I7ofbfmiibc%od5`!96RrnDCR?!0(ymy30$D)3mG z_~QThLPu2{V?rfp*WAmMJvxWnFe}&xAq(Vkn&ImhjN8Y3VHlx@Z@;|oUG0uDmb zgm}4bm)M8fEqJ}$k*na*i@iT9k$Scbu92nq-9?Pq&yg7Yevd7s_s+A!b|@Vr-`RK! 
[GIT binary patch data omitted]
-    x_test >>= 1
-
     x_train = x_train.astype('float32')
     x_test = x_test.astype('float32')
-    if args.clip:
-        x_train /= 127
-        x_test /= 127
-    else:
-        x_train /= 255
-        x_test /= 255
+    x_train = (x_train / 128) - 1
+    x_test = (x_test / 128) - 1
+
     print('x_train shape:', x_train.shape)
     print(x_train.shape[0], 'train samples')
     print(x_test.shape[0], 'test samples')
@@ -90,16 +80,15 @@ def train(args):
     y_test = keras.utils.to_categorical(y_test, num_classes)
 
     model = Sequential()
-    model.add(Conv2D(32, kernel_size=(5, 5), input_shape=input_shape))
+    model.add(Conv2D(32, kernel_size=(3, 3), strides=(2, 2), input_shape=input_shape))
     model.add(Activation('relu'))
     model.add(MaxPooling2D(pool_size=(2, 2)))
-    model.add(Conv2D(64, (5, 5)))
+    model.add(Conv2D(64, (3, 3), strides=(1, 1)))
     if args.batch_norm:
         model.add(BatchNormalization())
     model.add(Activation('relu'))
     model.add(MaxPooling2D(pool_size=(2, 2)))
     model.add(Flatten())
-    #model.add(Flatten(data_format='channels_first'))
     model.add(Dense(num_classes))
     if args.batch_norm:
         model.add(BatchNormalization())
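The preprocessing change above drops the old clip/255 scaling in favour of a symmetric mapping of the uint8 pixels onto roughly [-1, 1), the range a signed 8-bit input with a 1/128 scale can represent. A minimal sketch of the new scheme, using a dummy NumPy array in place of the MNIST data (illustrative only, not the project's training code):

```python
import numpy as np

# Stand-in for the uint8 MNIST images that train.py loads.
x_train = np.random.randint(0, 256, size=(4, 28, 28), dtype=np.uint8)

x_train = x_train.astype('float32')
# Normalisation introduced by the patch: 0..255 -> [-1.0, 0.9921875],
# i.e. a range that a signed 8-bit input with scale 1/128 covers exactly.
x_train = (x_train / 128) - 1

print(x_train.min(), x_train.max())
```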
diff --git a/examples/nntool/mnist/train_model.mk b/examples/nntool/mnist/train_model.mk
new file mode 100644
index 000000000..c17bb7710
--- /dev/null
+++ b/examples/nntool/mnist/train_model.mk
@@ -0,0 +1,21 @@
+# Copyright (C) 2020 GreenWaves Technologies
+# All rights reserved.
+
+# This software may be modified and distributed under the terms
+# of the BSD license. See the LICENSE file for details.
+
+MODEL_TRAIN = model/train.py
+MODEL_CONVERT = model/h5_to_tflite.py
+MODEL_H5 = model/$(MODEL_PREFIX).h5
+# Increase this to improve accuracy
+TRAINING_EPOCHS?=1
+
+$(IMAGES):
+	echo "GENERATING INPUT IMAGES"
+	(mkdir -p $(IMAGES); $(MODEL_PYTHON) model/save_samples.py -d $@ -n 5)
+
+$(MODEL_H5):
+	$(MODEL_PYTHON) $(MODEL_TRAIN) $@ -e $(TRAINING_EPOCHS)
+
+$(TRAINED_TFLITE_MODEL): $(MODEL_H5) | $(IMAGES)
+	$(MODEL_PYTHON) $(MODEL_CONVERT) $< $@
diff --git a/examples/nntool/visual_wake/Makefile b/examples/nntool/visual_wake/Makefile
index cde741ccc..80b51b7a3 100644
--- a/examples/nntool/visual_wake/Makefile
+++ b/examples/nntool/visual_wake/Makefile
@@ -7,7 +7,7 @@ ifndef GAP_SDK_HOME
 $(error Source sourceme in gap_sdk first)
 endif
 
-
+MODEL_PREFIX = vww
 include common.mk
 
 IMAGE=$(CURDIR)/images/COCO_val2014_000000174838_1.ppm
@@ -16,23 +16,27 @@ io=host
 
 QUANT_BITS=8
 BUILD_DIR=BUILD
+MODEL_SQ8=1
 
 $(info Building GAP8 mode with $(QUANT_BITS) bit quantization)
 
 NNTOOL_SCRIPT=model/nntool_script
-MODEL_SUFFIX = _8BIT
-TRAINED_TFLITE_MODEL=model/visual_wake.tflite
+MODEL_SUFFIX = _SQ8BIT
+
+#LOAD A TFLITE QUANTIZED GRAPH
+NNTOOL_EXTRA_FLAGS= -q
 
 include ../common/model_decl.mk
+TRAINED_TFLITE_MODEL=model/visual_wake_quant.tflite
 
 # Here we set the memory allocation for the generated kernels
 # REMEMBER THAT THE L1 MEMORY ALLOCATION MUST INCLUDE SPACE
 # FOR ALLOCATED STACKS!
-CLUSTER_STACK_SIZE=2048
+CLUSTER_STACK_SIZE=4028
 CLUSTER_SLAVE_STACK_SIZE=1024
 TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* 7)
 MODEL_L1_MEMORY=$(shell expr 60000 \- $(TOTAL_STACK_SIZE))
-MODEL_L2_MEMORY=370000
+MODEL_L2_MEMORY=200000
 MODEL_L3_MEMORY=8388608
 # hram - HyperBus RAM
 # qspiram - Quad SPI RAM
@@ -42,14 +46,14 @@ MODEL_L3_EXEC=hram
 MODEL_L3_CONST=hflash
 
 pulpChip = GAP
-PULP_APP = vww
+PULP_APP = vww
 USE_PMSIS_BSP=1
 
 APP = vww
-APP_SRCS += $(MODEL_PREFIX).c $(MODEL_COMMON_SRCS) $(MODEL_SRCS)
+APP_SRCS += vww.c $(MODEL_GEN_C) $(MODEL_COMMON_SRCS) $(CNN_LIB)
 
-APP_CFLAGS += -O3 -s -mno-memcpy -fno-tree-loop-distribute-patterns
-APP_CFLAGS += -I. -I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) -I$(TILER_CNN_KERNEL_PATH) -I$(MODEL_BUILD)
+APP_CFLAGS += -g -O3 -mno-memcpy -fno-tree-loop-distribute-patterns
+APP_CFLAGS += -I. -I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(realpath $(MODEL_BUILD))
 APP_CFLAGS += -DPERF -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS)
 APP_CFLAGS += -DSTACK_SIZE=$(CLUSTER_STACK_SIZE) -DSLAVE_STACK_SIZE=$(CLUSTER_SLAVE_STACK_SIZE)
 APP_CFLAGS += -DAT_IMAGE=$(IMAGE)
@@ -63,5 +67,7 @@ all:: model
 clean:: clean_model
 
 include ../common/model_rules.mk
+$(info APP_SRCS... $(APP_SRCS))
+$(info APP_CFLAGS... $(APP_CFLAGS))
 
 include $(RULES_DIR)/pmsis_rules.mk
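The L1 budget in the Makefile above is derived from the stack reservations: one master stack plus seven slave stacks are subtracted from a 60000-byte pool before the remainder is handed to the AutoTiler. A quick check of that arithmetic (plain Python mirroring the `expr` calls; the core count of 7 comes straight from the Makefile):

```python
# Mirrors the shell `expr` arithmetic in the visual_wake Makefile.
cluster_stack_size = 4028
cluster_slave_stack_size = 1024
slave_cores = 7

total_stack_size = cluster_stack_size + cluster_slave_stack_size * slave_cores
model_l1_memory = 60000 - total_stack_size

print(total_stack_size)   # 11196
print(model_l1_memory)    # 48804 -> MODEL_L1_MEMORY handed to the AutoTiler
```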
diff --git a/examples/nntool/visual_wake/README.md b/examples/nntool/visual_wake/README.md
index aaac44909..f1a5aef36 100644
--- a/examples/nntool/visual_wake/README.md
+++ b/examples/nntool/visual_wake/README.md
@@ -2,9 +2,9 @@
 
 This is an implementation of the visual wakewords challenge winner at https://github.com/mit-han-lab/VWW.
 
-The tflite file is a converted version of the trained float model and quantization is carried out in 8 bit activations and weights using sample data from the visual wake words dataset converted to ppm format.
+The tflite file is a converted version of the optimized uint8 model released by the MIT group.
 
-There aretwo different builds. The first builds in emulation mode where the AutoTiler generated model is compiled using the host gcc and can be run on the host with sample input. In this mode the calls to the gap SDK are aliased onto the linux API (or ignored). The model runs in a single thread so the cluster cores are not modeled. This mode is interesting for validating that the model is generating correct results and evaluating the real quantization error using the AutoTiler CNN kernels. You can launch this build using the command:
+There are two different builds. The first builds in emulation mode where the AutoTiler generated model is compiled using the host gcc and can be run on the host with sample input. In this mode the calls to the gap SDK are aliased onto the linux API (or ignored). The model runs in a single thread so the cluster cores are not modeled. This mode is interesting for validating that the model is generating correct results and evaluating the real quantization error using the AutoTiler CNN kernels. You can launch this build using the command:
 
 ```
 make -f emul.mk clean all
@@ -16,7 +16,7 @@ This produces a executaple file "vww\_emul" which accepts one argument, the imag
 ```
 ./vww_emul images/COCO_val2014_000000174838_1.ppm
 ```
 
-The images have been tagged with the expected output. The \_1 at the end of a filename indicates that there is a person in the image and a \_0 indicates no person. The emul binary also dumps the tensors produced at every layer and the actual weights and biases. The AutoTiler may have changed the order of these tensors to reduce the use of 2D DMA transactions from external memory.
+The images have been tagged with the expected output. The \_1 at the end of a filename indicates that there is a person in the image and a \_0 indicates no person. The emul binary also prints out the tensors produced at every layer and the actual weights and biases. The AutoTiler may have changed the order of these tensors to reduce the use of 2D DMA transactions from external memory. To disable printing of the tensors, set graph_dump_tensor to 0 in the nntool_script_emul.
 
 The second build command builds for GAP but the output can be run on a real GAP development board such as GAPUINO or on the platform simulator GVSOC. Running on GVSOC allows the generation of execution traces. In this mode performance data is generated with the number of cycles used by each layer and the overall graph and the number of MACs executed per cycle.
diff --git a/examples/nntool/visual_wake/common.mk b/examples/nntool/visual_wake/common.mk
index 12603ee79..f3cfd41c2 100644
--- a/examples/nntool/visual_wake/common.mk
+++ b/examples/nntool/visual_wake/common.mk
@@ -4,7 +4,6 @@
 # This software may be modified and distributed under the terms
 # of the BSD license. See the LICENSE file for details.
 
-MODEL_PREFIX=vww
 AT_INPUT_WIDTH=238
 AT_INPUT_HEIGHT=208
 AT_INPUT_COLORS=3
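Since the README encodes the expected class in the test image filename (`_1` = person present, `_0` = no person), a host-side check of the emulation output only needs the suffix. A small sketch of that convention; the helper name is made up for illustration:

```python
import os

def expected_label(image_path: str) -> int:
    # Per the README's naming convention: *_1.ppm contains a person, *_0.ppm does not.
    stem = os.path.splitext(os.path.basename(image_path))[0]
    return int(stem.rsplit('_', 1)[1])

print(expected_label('images/COCO_val2014_000000174838_1.ppm'))  # 1 -> person
```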
diff --git a/examples/nntool/visual_wake/emul.mk b/examples/nntool/visual_wake/emul.mk
index 758df0205..bfa0a7d9f 100644
--- a/examples/nntool/visual_wake/emul.mk
+++ b/examples/nntool/visual_wake/emul.mk
@@ -5,9 +5,11 @@
 # of the BSD license. See the LICENSE file for details.
 
 include common.mk
-
+MODEL_PREFIX = vww
+EMUL_MAIN=vww_emul
 QUANT_BITS = 8
-MODEL_SUFFIX=_$(QUANT_BITS)BIT_EMUL
+MODEL_SQ8 = 1
+MODEL_SUFFIX=_SQ8BIT_EMUL
 
 $(info Building emulation mode with 8 bit quantization)
 
@@ -15,30 +17,35 @@ $(info Building emulation mode with 8 bit quantization)
 # the quantization. This is because in 8 bit mode we used signed
 # 8 bit so the input to the model needs to be shifted 1 bit
 
-NNTOOL_SCRIPT=model/nntool_script_emul8
-TRAINED_TFLITE_MODEL=model/visual_wake.tflite
+NNTOOL_SCRIPT=model/nntool_script_emul
 include ../common/model_decl.mk
+TRAINED_TFLITE_MODEL=model/visual_wake_quant.tflite
 
 MODEL_GEN_EXTRA_FLAGS= -f $(MODEL_BUILD)
+NNTOOL_EXTRA_FLAGS= -q
+
 CC = gcc
-CFLAGS += -g -O0 -D__EMUL__ -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS) -DPERF
-INCLUDES = -I. -I$(TILER_EMU_INC) -I$(TILER_INC) -I$(TILER_CNN_GENERATOR_PATH) -I$(TILER_CNN_KERNEL_PATH) -I$(MODEL_BUILD) -I$(MODEL_COMMON_INC)
+CFLAGS += -g -m32 -O1 -D__EMUL__ -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS) -DPERF
+INCLUDES = -I. -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD) -I$(MODEL_COMMON_INC)
 LFLAGS =
 LIBS =
-SRCS = $(MODEL_PREFIX).c $(MODEL_COMMON_SRCS) $(MODEL_SRCS)
-
+SRCS = $(EMUL_MAIN).c $(MODEL_GEN_C) $(MODEL_COMMON_SRCS) $(CNN_LIB)
+$(info CNN_LIB++ $(CNN_LIB))
+$(info SRCS++ $(SRCS))
 BUILD_DIR = BUILD_EMUL
 
 OBJS = $(patsubst %.c, $(BUILD_DIR)/%.o, $(SRCS))
 
 MAIN = $(MODEL_PREFIX)_emul
-
 # Here we set the memory allocation for the generated kernels
 # REMEMBER THAT THE L1 MEMORY ALLOCATION MUST INCLUDE SPACE
 # FOR ALLOCATED STACKS!
-MODEL_L1_MEMORY=52000
-MODEL_L2_MEMORY=307200
+CLUSTER_STACK_SIZE=2048
+CLUSTER_SLAVE_STACK_SIZE=1024
+TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* 7)
+MODEL_L1_MEMORY=$(shell expr 60000 \- $(TOTAL_STACK_SIZE))
+MODEL_L2_MEMORY=370000
 MODEL_L3_MEMORY=8388608
 # hram - HyperBus RAM
 # qspiram - Quad SPI RAM
@@ -47,7 +54,7 @@ MODEL_L3_EXEC=hram
 # qpsiflash - Quad SPI Flash
 MODEL_L3_CONST=hflash
 
-all: model $(MAIN)
+all: model $(EMUL_MAIN)
 
 $(OBJS) : $(BUILD_DIR)/%.o : %.c
 	@mkdir -p $(dir $@)
diff --git a/examples/nntool/visual_wake/model/nntool_script b/examples/nntool/visual_wake/model/nntool_script
index 70d0c70d0..48ede4bb5 100644
--- a/examples/nntool/visual_wake/model/nntool_script
+++ b/examples/nntool/visual_wake/model/nntool_script
@@ -1,19 +1,15 @@
-set debug on
-weight_equalization 0.001
+set debug true
 adjust
-fusions
-set input_norm_func "x: (x>>1)/128"
-set input_divisor 1
-set input_offset 0
-set l2_ram_ext_managed 0
-# set dump_tensors 1
-aquant -f 8 images/*.ppm -T
-qtune 65 out 6 10
-nodeoption * ENABLEIM2COL 1
-nodeoption 0 ALLOCATE 1
-set graph_reorder_constant_in 1
-set graph_produce_node_names 1
-set graph_produce_operinfos 1
-set graph_monitor_cycles 1
-set graph_const_exec_from_flash 1
+fusions --scale8
+set input_norm_func "x: x/128-1"
+imageformat input_1 rgb888 offset_int8
+set l2_ram_ext_managed false
+set graph_reorder_constant_in true
+set graph_produce_node_names true
+set graph_produce_operinfos true
+set graph_monitor_cycles true
+set graph_const_exec_from_flash true
+#set graph_dump_tensor 7
+set graph_trace_exec true
 save_state
+
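The script above pairs a float normalisation (`x: x/128-1`) with `imageformat input_1 rgb888 offset_int8`. The sketch below assumes that offset_int8 simply re-centres the uint8 RGB data around zero (an offset of -128) and that the input is quantised with a scale of 1/128, in which case the two views describe the same values:

```python
import numpy as np

pixels = np.array([0, 127, 128, 255], dtype=np.uint8)

# Float path, as evaluated through the input_norm_func expression.
float_input = pixels.astype(np.float32) / 128 - 1

# Assumed offset_int8 path: shift the uint8 data down by 128 into a signed byte.
int8_input = (pixels.astype(np.int16) - 128).astype(np.int8)

# With a quantisation scale of 1/128 the two representations coincide.
print(float_input)               # [-1.     -0.0078  0.      0.9922]
print(int8_input * (1.0 / 128))  # same values
```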
diff --git a/examples/nntool/visual_wake/model/nntool_script_emul b/examples/nntool/visual_wake/model/nntool_script_emul
new file mode 100644
index 000000000..ea2d4f9b3
--- /dev/null
+++ b/examples/nntool/visual_wake/model/nntool_script_emul
@@ -0,0 +1,15 @@
+set debug true
+adjust
+fusions --scale8
+set input_norm_func "x: x/128-1"
+imageformat input_1 rgb888 offset_int8
+set l2_ram_ext_managed false
+set graph_reorder_constant_in true
+set graph_produce_node_names true
+set graph_produce_operinfos true
+set graph_monitor_cycles true
+set graph_const_exec_from_flash true
+set graph_dump_tensor 7
+set graph_trace_exec true
+save_state
+
diff --git a/examples/nntool/visual_wake/model/visual_wake_quant.tflite b/examples/nntool/visual_wake/model/visual_wake_quant.tflite
new file mode 100644
index 0000000000000000000000000000000000000000..8d33e4f36cfb83209d69eff2e1815d31ed61b076
GIT binary patch
literal 309136
[GIT binary patch data omitted]
zSOly*a)rmRPTUQ2s&OV-%D?N4#8E1nZzM+vAtE5B#K!OgZ69>TNPlB}p@PT;*YwPVdfzvrI9SfVD(#AUYcJf80oc zp@(Da>x_w9zNoA`@Q~iCh>C%A|3SX7oe31NxFot^p;^Cy%!QG9K>+)RT?di^;T(jo z*_D-}e5PfNS~X-bzXUom2}N#-#RUE`8fUj+kbHNMo3G)G7bax}9veJ!=taTjLE6bisgQtIRy~#fB3s zkgQD^-3;d8xok&_(T+?B&9tZkF18iT@Ls--YXy6oWQig42TwRN0s)jp-T>6g3&W0> zW!w`>TTLXFOH4d@I3T+o$TL{kMP<>|5EmhYAriDwQeX9;G{c#!ABe{6{;|lc+K_TI zEsOb5{_x)te}_fRIl8^Dw)1nafK>&&@&RC*o3NhtDu5>fe*X~GkAi?Yft->|=JgiSTAngz^90hs52JQa;;23_`e(k(oXxAe7 zch~Zqlb}2rRI7)*Zns+;X0Cs0>a`J@9nt7V70rzuj&3hW+hsf4@4} zZnlc!N~_iBcY$(lII8z1gF&}GoB~lveYf9jZ#Vn>QMt3z8cchGR&UhrL*Z^~d(Z<9 zr*@-P>kk@@LA%)>_4<=sXS>tx^xFMa4XiA8yZuI|Q}1kUz3!p!Ui)FUGAnj#!{LYZVWpJsHOHCCz2;yr8}#aXgM)mz+Aj>JmEO)6=*l{k z-oxRf-I+{B-A=vT?~W(!UcXly%v#-XZ!qdl+tX%$Tx(Z)y$3)uQg3&A{RW(PZ#ZuD z8kPQTw+Cm|YuDkdV87L9ce<_CU^1z}{~jE1A0pb#-cGYSh4Yxf>9+@iTodxopjf5e zXb;+*PPgCfw_26Vbkb-Rf%UH40q=+X`e4!+76;w#ptIBH?=**#Zfo3z%hrO6*xuRC^V0F&y@*nNo4qhgVFk1W&`T zUU~}8<#9LD+6J<5xFMsxVIS^YrUKbpZx6c9I?d6neOL!#r*<8l)OGF)lY?X|13Ua)&~vJc#T z2Ixx_n#29>~Dn@xB^2L0Avr_*VVO7O}UHv6q%uQr_2 zhdZ@af7Iwb@3gl2gTb`h2Zhv1wE-Oe@Gw;>-A-??+db&+f=6t*-y2VowQe_GZPa$R zyVF5$IP4Y<$NhG_QJIX|hQpsfr@a=uv)jYjHaw`k z&Y;lm^oHSfuUl_aoBe6yUbnY>*shOKgY)Ng2#HY1BoA-i)(}p<-P?w@M!npH??-d5 z-hNo=Osa2`df*3g+UY@+xdB|;bZgwK)Qj*IYu3h-gY8bGz1>Lwabx4CJL%^thuK

YCxRUMm(?NGQns)2Ge*I|L+^s{m>$Rt?YJ1!}rzPv{k9s{|$b;`s zv)66ZD)rrI?|3j74ZDqMWz=p1%ig57Q*U+$)z+xn7!7*uYJUWGeezlU@5f2!_ER+& z-xOhN(1!5^0%L%4Uz7rbzlQM3FrGLc6Z~5U7>p58koP}99s#yD;Ws3I^1p-q%nxDb z0|@8#I=>BhPay3v?AKsB5B0tPW&URv|9lzZ&gZ26B}DxWh+ z{Xeijf$$N;e+lwEf^uotZ@~UBj91QeVOiMk!?@;ep#1v~{!fVic^K35&dWd@7?dx- z_NO3z0C6us+Gk<=KOpZ5u&u#n4eEIe;S}Otgkx;M{`*koeb{~(%H?6~brbgg48o5f z1R&qnApSn=pX;K-Q0@iF_jlLwoRh$_&S5|8zZyktCIdxv3TO#{Jbr-NvE|39d=7S4fwY zpt-1JU3OdQf5~N)&ne88l?btj-)YERq_vRL;pUHP8E%S}jRP8Nu=5)vp?VN%whLCwY0d8UwXTqtfS+UsW&iE7x@E zmz*Q>Sz!Z#6{C&fTnw3GQcl~=^7@Ki@-wfRUcI&YCE~X-%M!$L+jA}5qfl|JV99LX zREig}eoOh?J6Ek2$BO zz1T4<|N1|HQ{9aH6UC3eL`CwexF8>jm*Vr5E0{^k*T8&A_G?=%32%;SUP|hjAIHUw z_0OT0C(XNJ#uZE@Q3)3|%VxxEDH*L>T10AGG;Q#H^nYdUs@5)Am6ENEI8N%_tE;5_ zdT!;WNvDWyfG>l}$KwbJo;DmMHJiLa17q6?B2t;jP2Iqx8>U;gTxzhi;M`i2bATyk z4AdYwxfhkknU}Q~XrP*d0STcaxB%#%y1&BS^eNO1y*hs5)yVR)(fT41RR=eWEUlFC zW-rO&)&-430?TvDx55&bbJDTo{KZSik}hJyZP-d+pfZ8MlS8^@5}b)Ojo+lvH1B9O zRW#UI=@zv5d98&dRNRJhbK{arzJhF(tUi;Q0a;P3;EVMLS9R0nCafCuRdSUB*&~^S zkXo&j%c|8=h*P1~S*!$M2H7o8i*h22-pp>&CQifNw3_T3=vskxs|H%C(b@Dar-5`h z2ttCWtr3H68T=F0%yz^rT~Jv6mx%rLSI#!f5!)?xo?%{cV^QLQSz%khjO$ie>oQOs zcTF1(hhBrOBgPhg&682u{SuAZb&4&6Na7YBa#Kphn%cJp#yqQdTyaCiuWgtVw1w15 zvG59QCRv&43Im$LRAB9zU$4ZqH!L;X(z-^e+F)-uRXW_tc`a781YZ(W8XZd+j+d0i zt5Wo`0yi<>l!Y!UoYaLpM>u5~aOV?KliGuq-D;5C=9PO4TY_nLM*v*Rpq^{ zUw5Q7l?&>n&`nW_nm0bTu1?@ejm*K!T|?XjiMKBC*sT@ZbTeXKc3)r?XoJqPlFBU^ zorRDixFuI#wz;mYe-)7iH=SQDtbRqDzx5RZ$Z#1qd0A#on{zE*Lgr&zLFP8TS!Nw@ zJgrL|GiVlNpzS3!({i1h)ViG;{w>w3wgsnEXcXlP;jbOJtzQng6k(lJ zYI(7|TqIQ&sQrWiEGE6TK3$x5nnPc)>iQNEmag?CuO#z@4Lx`35{ci`vkIp|52k2_ zIq8x|LH&xux~ddGn9bw&@^{oAIc)oNt=zGtl#;T97Q~IM7OBK)-o$A2E4VX!*QQZy zI21QEIx1@M@HVT_6<(2T%ID3FEjw@WDyml({GPcDVNP z?z%_M=cH<*o{}LdO0r2_^3*gu8#Yi2ml&G!``c(!%wC z!yt9btXh*>YcMSn`09dPqxD3$97}eQTM4ly^M*y8XvsDtEmfR#szsDzZXP-Cyiz{@ z8I^CT;or1rX*(TTaxK%8O5!UjO<}%+U!A|Xp};WpEwE(>gQ0@=7g%8foBKK#zU?Z> zHP!m9%nchCG$>S{OT1`YQ7D!Q|FL5U@Yi&>n!Wplg>hV(=6x!}~t^#)4pI-uiUNn+%_^`$x0Rh+z~#+SXX zsMZCAUAknkIAoOj6@eh^5uNe@s#0#r`E@OLFghG61uI)Zq#%fDRy%;~EKxQE9PUNX zv<7h(lf$RdT3?>KrE{!lWjrm9qD~j@3_4a3(DqGeFq=N94H~@I4V6{1w18}+=C5C) zl$c&4`YZx&HLN1HWHOsUio9qR9Vn}CS+v|+s?aDR9bW`N&MV9-KN)!`u>DHdOd2*x z-<}tqt#5&}^_pXW zp=5?IP0UkjGAzAAVq#FDWVO00r;6K(HM`~O6oK8?Sr4prqW18X9*v@Y8vXHzXZ7wk zn8?^E;4*4?^^S2fA0v=#d?gpHUY9B)X4z%y z1^VJw9Zi)~Vly(TH9LyxOxcJ&s#j^fW*HsQwbvF_9h9BkvKw3`ebVe;rM|6Y>6S;^ z@|!oa5=?5;{H9t`VXX#Z3{0<=DbGU6&{f(Gcu@TmBTe??i%Y_xr;If*%{MYqg zm|J-&E9QI>jdjH+k=#~$m2Q=Wb6IqlN#({hHpXU*%QxIwT6Aku9)m5lu}T@&RkyZY z&a7c+BDQpQUSldF8+TonHTl|IetoN~%sJO|cN{uhS966|D8xC50bO`SxIr3U1pi=@ zY_+%PZEe*T*KS>vF7e!5gZVm5X;e!uyT6Pfi0YbMFdG6}a+zVx<$D>mlfKP4*@B7P zzl|sY@`cY?j{{EChE8?aEtd43rzEn_#&1Qw>JT=mDb<}bnj1u)Cc1MWJX3!j5vgYYSs zV}1a6?*QicMZiZt24(&gY^Q()Er6}S_2!fFdE#GzIpcZQIsnJL1m*rIq%A|4NByHOX zT*39jsabmOw2|j~AF~)2_~xS~YpRK*GS_U{k6ybr+|J(q?qb-See%y~@!QY#Y1WpB z))G`>yKZj(;KO)*_%Q#+?<0n1(jrNH{P=b4=-G}!l|MZo>s3tg`jZfPwr+nM5Ip^s ztMNF4guZhm+nqi7KRlU8<2xaLwc}0oDWnpqgpq=~E%_+>`W)FA)C{keJe6!XN+LDx zG0Zj&4t8>j?ojued0opJuYXj{eV>1uWpYL#(wdp~{f`gi#llDLkAtUJHs5X-cgfD- zC_Axs3noF!?mgVC`D^yJNThtTK^MnMrOo2_@woN0`m7eqI}>coKS;9oO7T>`)9j$x zRNvi+N8*`y$twz7{Y^7vl2$|gN$)T7_7#V= z^hc%fy-{)bV5GPncCT}C2HTNb8Pid>_Kmknem1D?}iNb-G5r` z9>`Q7I@6((C+WmSHD|QnyM62@E7d38{7+?M8Z8H#y#KT@e6kyxMbp7*qxU<5 za`^tNQ9JH2N6kare3B?t2w(#0wsWHGv@-#zrM`Wy<+FEVv}oh^n}dT(wbghyb*88m z?w69O<1W`X1S^>)s8IGwAZqf5{13=fv?~VRyEkI1J>a%-#Y3{4*BS%yU0ZncrYJV@ zPeWY3SR>P+!29<;d^nwQqx++FA(9Nnx5aFOJ&2WZsdULPIS9f!({?4*zUMDTO1n%Y z6)$JpT+(ij^_~tQm0f-+TuJ47$4@k|)LAS%*q`~yw?2B~tVmZ%bm(DuTlWxzG~K!O 
z-u-4Q6{&S?V3~Jhd!El8^r^}T6YiyRC$t;g%hCR*%K6;@fm8yzU^+*$-sA_&y}8h= zxL-SL#Y>NaZ)FHETaD0X)g-r@4fx)kmC}Ky@a#4dyF2|zY-ij1TA}iIBsR`Ya+~R+ z6gDdD#To@pq;g}=@Pw`5ZwSxDJux16J9M;L{@!SNIhwSsZF8f_ZZ0u9JQZ21Evh|P zl{Rr>bcocE-^*apU~_t}R}5r>(MmIwh*hFRLox@FJi~N4T^diT^{4T8XdDzDcmhr8 zM5SrA-eagNHm)8=r!L|LPcr$BR?xUFbsQ_l?geW*-DxP#O|RdJF#6d8G)}W|M%VQ3e3ts{HC|r&V3uth9=wIK=D1Ib-yKy^atN* z2Kv8aC-u7rLi%-IBjXgpx$Mk~hMMiuN53%mC^;@48m)MxR;}6J6w>)!pN0Qn^r-T@ z{nnXyk8Hf3uDpsi+e9gn*^X2DZ-S=SQ4e$=jtq7CeCr!x`}O|gH;QM)XSqCI2xtHH z-RHG&^UzxyG~>)cIrqF9dL!N0Q+U_U$hYnaC-^gL0@^K!`NV!Z6H4#=_B+KLPq8RW zo0-Jn(+B=^t6!i_-g^+#%10uEOXfNn>>|_V2ld`+&I}=kg_}chdV*eOJ}mB!4dUN z?6|5dhKmQy9(EWF&6=%&mwl9lM>qQ*K62L&H5mfA8f{eHPM?ACn(kOm9g*Q)x7mv@ zd;1+-WH4Qhx8LUP$sasy`x}+8kanCQ7V4yAiNAZi=hUTaS#Ks=&z6cpy}{?uY%Jy! z2CRr}3f*1-AEv@Pyeu%x63J=kxy03TSCZ8vRrH7$1-F*B=?9}#{w*t1^a0$~G=~SSMR^vGz z9#Z$+VCh@QV;NsOQEi200($sylX~OSaw0qq2G2&IV6%z5*4@u_F8tMZ9!HL6iT|rK zIz3Y~&5iPF};YxG=!}c2=hYI_nMB}J{H=GYzxr5%>TcC|GtVf=;6k_%yTF>b} zxIg8x#mKYMvS(l1tG9?mFO;uly7|epSDyq9lAV)YHW5E4eb^ip!nvbFbDzwe{l8!s znfq-f`MbvKG~_ug*8R5HDEENqJmz@~hGnZ5Jf1dtuNa*u^cDf_$FQOw@YvbEML zq2zQ_iDCHy!P?-fj`RyGv>{SrsT-ZAcu(tr5Q^kCOKqh>MvHt$TnmlS)##P zOXA7yNvI*dBl45U!E1+;xcz`l?&YP^_l{)7*1aYU?LOVjS1(4X+8S+83yt{wbo3z* zTrK{c@i|-$Wy{{kM+jU7hA0!TP)sGOcTOGIlLg`HtY(-hIK5CEZ-+z{m2nT^`2FnO} zq3Em@@#APL*n663J!)%mm3ZMPS9_ccFr?SG-4NHk={WTgmGSrrKZVyvv@jJHV)k(p%|n_hTwz!DHfR>l$LAb zov`)0uQl24xK6v<;?BEu|Iu8a8XrlclTPGy6xmCLu{z@#cM7RmfhyI4OavtDD#`kX zxkrKA=zb%eJmdeQcu;eDD)-yDNAYo}mF7E-v6FXtp`!2tknw5(FVa!U5z3a6E7JX3gyG?D(1PY2Ve|^k?x6?B{-mSUa+q>N}I2?Eg0w4)S2qZ&61{4$|{GoupMj%2RospH9 z&%f_|?|bik-|ySnKK|wdlj8JRUdbrM6ym8oi+ngr08#nP6TZ%*%Kk_Cz!A@EHEf^! za-&^t=MU-;@b%Ya0+}}QB-bc4v4(wE>{%^+6P&HajE!xr590rSdrS<)okuqZR{>h(x(p}dFuYen2>S7()o?*`cIK| zMCvgSI1evaY^+(!N5Z?cs`-3cR7Y<suy7?oQIeO^rJuD& zi8WdO@KaGPmY(E&U6|{Zb#MVzFNe#3&RhgnYs}!@;MI>8TxX5Dm2_N6qk63nr5&X5AIrLqViE#Zwv)JpVio?myDT~ zkV4%}ae4RPbm@TK)(f30DXIJ@;>`5>JK|!zBu$EDv7B&je*8y`wUgvt4NXur!)2}3Kx%3^_oHe_ zYc4Zn`)Qn?>Z}v*D*qRUHt2EZnL^}HqXScg)f9^om z4zSr#yy8nYX^W_2$62rFYq&?Lyn@u5xs` zU#z(V=0$(FlPT8<>20hoC8UATh&R)>9~^WWx%ysuF`LM|Z=Xbq#iPB+s#UEW<~sCN z{gy&V$1y5RwI1&%peMM|)?$mQhhM559ZPp7%zjbsywloSrPy|2=5dvJ)bDOQ&3(35 z>Bhu2{!=>St0U^XRS<^^U+lzoTm9&c?Nz2*7Z~*o_Q#nqscFy z_pEIVbbXSOVk})c@U|7yGWJxoixeWUuoZkce3pEe%L-ohG&;TagUFvVJ`UB=Fz8P@7 z%_Q$(DlW>nF_DcyQ?r^wbLTzZ(pHxOcd2>vW~CAFmmh6+CMC9V0Q)QXy>0I)Ra-Yp z$tGJJ0x0tK?zr|a{%~L=&-h3$+fHQdvmVWinhmSmA0RI#h@hvp65)p0u5HJ!_c~s% zLns%LiFhqloS0H0h1Skh85(A&?5bw~wwTukA2bIK<5Em5)zZ19wKv%5t7>5wAu_(* z$ejLi5UTGSUF+i8#eTaTkKq7(r#HIya*;;o&6b&8aHD({(+7QvVsa7dfNo2CH?yCt zf4e^MwZ~60mDH1Ft~%J&8AGf5XseU?aVpj=jhs5Dr{%NOL3MkO(8`fd3TCR+y^Y!F zJ4KBW+V_jOL+4gGmthL|+JSDk#fgH?=O2@8vR1W5>xdElwCmFwPb#DAi*Z=Yba%(2 zyA1Ep+jqe;YE7*8f23y5o+Xn)>}{zC;NCy0^G<*&X>owx&q&!yW{dFAY(dr7vL z$YI{-&Ao~$Vl`#Hu>8cOkQJ|LiAH2n(*Prxpv_LCH;JSw15K*2e%BF1?g#nX3*2;y#Svh~!-X~&nA9`bq z&wr}^a6fi!^uBZbX}kx%f5-n?J_}CN@=K5W!eZ`8%|dFUd2^!LA_;DN*REC5_fGOe`2kZMZsDJ$_yWyNqOe!dI!mM)&8C>hWNqH8i_L;! 
zX9A^ex>!<6F(qCx6kCXfYZY`eKGONs=XAN%O4(K8hozI+vq3|y)m0@MOYvt7Q}Gry z7gk69XfQ>Nq?)ynyj|UWFsvTaRg7JZZY4g&PNc*48}DtO{^2ll%B_wDt>x)VMx?!^ zR+L9ui^n4efRp`^q7r;KHcrBp(Vc#gPi!0*LK;x#+6~f`)y+B^g+%tvyoEr`SmTb65t{~nAiu6){UYXKBS|1@tLwI|J3dd ze|GTwNb`|ecQx*n*k=2W1ht7Lu2~yewUin2*<^BKB2%BDo7yjJI`3hRKEc!oPl8R9 znBB&LhgE;l@YRdiRCBAA7MoAmYTuO@j#2|lqHL?%Jd9Ub>27uXaCevl4XeZIS~Atw z?@{1d??4S#iV1TgwA0R)*@^L>Q_=i}cwfYt(ym|}9P%nY7Hh|2TOPN9xD^j(QWeA; z?N{xc^@#suJkI6s9~CRB`X{9`26Z-D zTDF|Bn_Kzm9zQ5_u~KDGF@~cN-LtpR=zQtX#wJtR)Q7qbv#^47Z0T6v95#xBonq~7 zL65gmk$RagrhBqo$puSQHC3o~Z7Y3T?j(yj*qlkUf*aPE+Hm!=g|ZlOiXXCJ{bPX$4`ji&&|y`r&PM;603n) za9{lBmqu3~Ax*DOW-zQtij_TBXKmI7y<_{dfNM29Wppac^}Se&UNxG!Wqjiz%C#LzgOTar2|JLUj(~=*8wN&e}#jS zb3Z|E!Sw|=!}${Esd?e7=1tI_`wZgz2=K^eK_BfSxPAg*>R@Z|cR@$*Ih?_W5dN!R zbMWhs{?FlT<`JC5^x&MP4L%KT!TkWva7eiS23#@7=dVD?tqy4nA#4otejCy_hWDRB zd;x3={sPW)4&mLug>;_6fzfM_&pwp-ixB_UAl?@t?_Y&Fjln+Q??L%5!#UCy!5-p& zg!t#>Z$i8ngna~I{yo@i{1XUw24#dKzu*82H50x;wqM8t{@DSv!+!_rD5lgg|;)T*e^qS{uPw(S0T)OsL#KJa{d{F`zh4B0T%=D&+X-d@C}DH|3^^Y{|m0) z0&KT`2j^htHXYZ%)?^;S{3V>bRiUgugSL^N?uXDG{{y`L5tQ}Zx8(-Z%@6fH*Sow9 z*mu7M=|6`2S-AfiRQw`@xemX70_i*8H&o#Z7ofbq0?+4Xd*^3&Je21Y>hb{E_ZN`v z2DICcAkKAY$4?(l;tku|1R|RUqWBcL7a2nQs@4ieg<*g82s%S zBJ?|{kYNUVrUOtVi4u&8;5bHeJRH@KgiFGC7JwOOk`B`(g<*_<&^Q?ucmiNcNQS|A zp5-`(q_H5!5Ew!MxE9V3IE9lKiZZ+aKqtHrzy+437)X~O84O3<8g&~X10>H&1W!;D zL1R4FH{vWs251uS#sr2H5J;aTFotGGjt~iq7g&-PDGH;&zK#r$1Whs?g~BfYcnL#rg%c(RjgdlB}V{nQ_ z354WOlnR6)IjAK^kUS(!P&DP|36eq?3?N@5T4dpY(FB6w4Fbh{C=IVL4lp7xYy}Tv z0g6BnRF&u;M=5?b3^}prVi53lIUL8CFcI(rOc;xE9)RZ{LV_1SFKEQe;31l&a6!b- zpo2mwl4V5(V3;Tp&|X+l#G#d=SSX1v_;3_~))m|s2hy5fAQX3);P@c44dKt>f-FLB zQOOWSvm{1wG?-69twmZupg&0lp>PBmUZ$X{DU2hbmCv6@8fD1ypHLVyj6n5p6t+>J zT|y#B(m2BqG(iCn3Q3begr#|g(*RK`B%q8;G3Ve~G$sMG61f5e!9@z%ju2Uh!c&9~ z=z>K_5+gB%rBx^>BXa~xNjQVa1OY7sC1L=%4d{g72>!e?Ig*FA!%3XL8H&X)=n^QM zmlsGf484jI6!gP+*Ajs2MH48{sSK&$G>Ss6P>cWw9jrznB9xUTaf;ytoWeXjIUS?} zghE9*SfO!_ z;w1#=3gu}Uh>OF|n~D3IMkm2!mxcc10PzGUA5dYSU>FC;V+>C-ka(Eocr47YdYIz~ zfr6Gt=^z=TQJz`BaE8UPkO-y1a3EM5dI(w+x{{Ef8yQg`NQvQanu3NxIEF<V~mp)=nw_x-W1RH0ze8VEh16^ zvV+!wPXQo4LgrZ@3J%(qrG+rX2XQJakTWC(0A|E_WEklzG%5$Y0T+=F2Kg&sX9~2y zQVi(D8j=*0F~Xtni5MQg=R)5DEhIctAbp{NiFzAuEDQ)?8WmL*f+3ye6~P^%umHQ( znbi?g4r_D%0JV094H6W9vXE^bM=UcM0wH`n!;psIP(4Y8$Dm^N*;-618swfB;1Ttujmym=#Hq1O|*0%t=Wc=z@R|Miaok1z@MZG;t0% zISOnMvH{cz2AB~?VCQ9}nR6n+2!tnSHpIZVg7*@ILq`G_+c|F~pkhF}BFWJ@@CZ_X zi)A^~jR8;N;V%sNQ4pWNodgkQ@hnh1g7INKc+Q(hHxOe0n#RkcFsxv-go3O{F^J}4 z0Cf^*1%@KJhILkqEs3`e6={;>0mX?X zB@`AtXb2PH62V9j0)^=#NWcX39;yX`gb4UbQ8O68@-%2L`ofI>Y_t=|N(2vUZWjx5 ziU-2-3JfThi}GTkKP$#@G%HHF=%<{X94a6Lt_MSF6zbss8$gc)MBt*Aq*l}?!;Hnl z{6vzsz4DTW5EopO4`bOBlV;BO1E3|Y;Z05uFlWf6>8gWHh&1Z-psQr_!b9@IkK9YM z>yP(-wmk{C-}oo{KVR-Se!9Chm3?^g3jU+}OB?QmI2=O$%JS?Ua&UYYxOz#SK72cI z=_zyA%pVpn`L^d)*|(dYHZDGfGMz5GzW0Nj;7>mI*I)VI z@m2n>@1!o@e|KefC3v>``@6q)HhFsR=EAdQw|)PheQ)0kFDy15YvEq@ljjS+egEdl z3D0r0pc3-aY6ze|G$f3~DZ!NtVqLHIh77JdG|aEKT^y#s1EQ30ng$R%FDCi&vVr4? z1w{}%q+|6)Sb#}e2?074?t|$!s0(hHCCPWio(4mjmVkc)_q^#wohu?S=OY2Cl#Wvo znq@F5fa}gsT?AI{0#5}gz{~dmZ*~VTC&tQ{>BR|xS16w4Nd!kV44@PV*5oNaL!dap zqIft&lLYBO7%m2rJ7|}~JPuSG4p6e6LKVPW1K27KgRp>Zg`$4yT$lj@g|WX#^I<1} zz<7pX#8NIwCD$l=I!JoL81CVLyJ@FAB;7!vcmfMTZfB1(65F zW*Fu%m@Y9E<~6EfS3e4L!uZ6#*hdU3`P%xq=P)e!=RT0 zm@a|uAOuBXAhaOB#z3;7Lw*nlS%PDjD26If6a;jkLkI=vr7&|7VZa>2{S+I5Bw)G$ zra`hIDA{6SilgBrivchY@SgxhM#T`#V_ue|KstaKf~HxX$5~vXkr05h0^5U`L^3Gg zT?`pU0vPZbjz%z(f$5$TVQ?EhCP47CieTJ|dqzQIt)!uj@I|D!rUpYZpzFx7evH*! 
z*8%ZK;P4eUE)XFE4?A7KAm&@!(*g2KB#R*|=>^?s4q+H743nrD1aTYsis3=l!xRocSr@$|$Y%`5QNWx@4haGVtqc+| zj0%nAVG`y6E;JS*!|;`d$p@n(VBCO`FK7zIEs93#!+ zG%LtJZ?qT3pie;NR>OoxkOV*pq=ByhwLmWb`G72sb6_S#0Re+}!$FlX0>}b5F@iFo za|VWi*a=cf2ucBrTrdDN)Q`&`<c(glgmH<3R;Hi<&R07u2US1ksfdY% zLQpVhU_S_9q=FGJiv)m8vpBGOD6ku(O<;Yh;fFQ?QifT=kA$GHK`Mg`VXX>79rbD; zQFAbUd052)s{=xVv1Cv(pi=@vAwcS5dlXv-j|!{mYe z$D{v$-{?84-{Y`;|DRzk|0UR$ILBZw!QR6Py!$KIBS^z_2>S%T4)6aBg#EAK+cfNJ zMBvK9UWW_zEza#YzX1CR|2^yxT!(N@c>Y&#*CFoz1L6M;To2&)??Rd(`28Wodjw%_ zz`I8f|CjJC2-mrNXbh5i25Hsc_jlmi6r^*Gtv`hK{}lEo6nL*f-2W58{{+5YgZHqV z@i+FNhw$w;;o&;G&%^J(2lqdRZ|8eE|1G4m4!_Uw-k_jlmoe}ZtI z!SnwO`JCIno|o~<@O%y0?&t7)j^*(?kZ%*hUpkKm`BosWbN|?|Ed)y0U-*I(zLRjj z0CyY0m7rYbnEvP3kiQM-d>7)J+qT*epMv{8fcPT3`xb<+K)$~T;r~9wn}zGVAHH+m zmJs%Q&oBt(vmxBMPi`LG{S?wW$C_M+Z|6GQAHki3=Wjzg7W_WP=0CT!{cj-epTKh& z@;ZjQaQAQZxqk!v2~>7tHM0~+ir{iMw!Ros3ybEDlE|QoPAYsV@`H@Qw=sVi9b3)V zt)1dXE$bCC&C|lB+7iGGH@fLANJ%}OD(GDoUK!L*GM#`{1GI^3Lp;;-es}LK7;=i{ z2Nw;xdr*m_3;`s##A8A^&M5cTjDFea@cz5hCc8hr-_T2$=fL2z`m6C4oJPb!HT&oS zc|bLCvN%GBVSb$N`8rwguIE9-WJheNJwAeiMr%*bq=(w9|GqB=$3@r5_1Xb9IBi;k zwA}`b@UZq)nzySszCCq^T&Im``s6*v{ItU z!92De?Ueet?OFzDMV)({d^(f(D61Yn-6BpKbC2u$iF-}m=;GoLm|7V58SBkPD53g3 zkN(JmKQj(~B0mX!r#C+y=zn2c>sr}#F0&&ww2G^?DH@fPSo9~S@w_+PIQ&ySM^Ls| zZZ&dtu860LCbq#n+S@AJtu}90bPizGu#>7qKKZqWXi{D`9Mzs|Ldq`cK}1J zvVLPamGxoZOzhWFYjQUod(n=&(nTjBlJwoePCWHi(*n>pB|N(<`4FDKXY z?M(it5G%zVI16nebTyakK5U^e#|DzMr8-NmW&00AxR(RqwQF$ z#e`#%P6wruu|y%6oXR#m;#OO4ls6e(-PY~(XrXz=rm_c>^pKfwiP-q5F<5hV{A_pk zr#o!dd8d#yTPv~l&PN=2dmGUJT(H-CA?c({Mf1hR*Dq%ilWl>mG1cu%iVo)lq}1#U zg+jtFlaD*(8XTv^Wu)OrkMecw)n=&N2lP0pUD%$4*OMt!r8~PR`a@A^Oiu&#ke|XD& zrz?yjv3aq(Arww^{9EaF08cMLc(A}ycrJ(k-en7 zO_wTB#NEbCt7!p&iOufcMlskYd5!9@l$j;EyKqdQi-Q&P0Z` zLn(czD3u6JoX!ZsD}P4VRv)-loMLY$Iuxq*r_~nIr{v`2xzhzyo@4oy<*QnZL38P> zMQsa7f1bml$NK%4@=L#0JyA|Gwfu6>a+5DIU9-QDtuTp2qd15uClkHAq|IyHjS_#7 zFYc(w%vn<}#@8dezRH&7RT_Mvyf%v6?I#1IPT#P1VmYQ5K5Y3iTSDS{g*(w)N{xmZ z!92b@b}_`i$_;5F_Bg)W1xnv9Ot}p<8;jM-X=JdR(Q2`zAe#|2RXa?Q2jY%n^Ys>K zZjCch-EJ_ojjiSFjrv`0e9U$>hIHh(tMEB)hOEXi$IU)}N!($<*Il#UCC3f}Z?iL8 zXee%eurYknAqRAI2AtWQg+J`4jnU$6&AuPo+X!*NW?W?>Wy))rv>KKz)J3|}jY-Ms zbE%Na$`rT2=b_Y(#g|=`f}GyQEBzP78dz5#9Ba<|>{N$ZP28$F3rXupcW1ARGYRD0 zv|jpT64FNDDt^n1M6NM`^>*ZPYI8%5#?SIQ8mo2)ci{ef4Gtbcg@`&sMQPw`xHFMY zH86XP59JdyOr(RYjCrqdI9JPRn_vo~rkA_95)*J|_7?0_M!3EhdG|Tzx~Xi;zfsK; zAMx}4Vz$+RH3c?ortwTfZ^lkfi1n*h#Dz_Hz)+bD$TuX)=DEga!kiQ5UIxxi}wqbJIx`8)R;#ZR#I z(c%Xm`wJ^dD}E=NZYRt70CB(8yj42|i-`~mZUlW&qzhm%sP$Z`8ZXyv@}Vo$TdD}& z$p@FK``W*MSa$$`p&Xe_B`W6btL1idBj#?uQab%$VjoM~D!b~10bHr~ zGYjE;%WYY^V}B{oj-&F6b`l-fcJc5q-6+Qv=D*+6YblbqH=I@i@9%{iTQ6E`S^pIHS;s7qb9t72srd+YUV;5bC|BRT78Y50pI{Z^s#is%$d<58MVbw$H$ z7B|ebr`5^_i9(AR*VSfZsvqpBnaF|CY!AtNlJzz2GGhQu;nuSg9VfJNOYLMNbGMi8jGhh^ek+lQrI-y9T_<%-_YOOq?G|>&I1p0fbn8AZ zI@MjE!QygRuWywmma^YlPKLxH85kJpQ$tzdnz3@Z!6tL-$Xa%-_9P!(sfJK7TX#EJ zm^&J&X-yx)fI{A$x3;B5kGz=}uBUxUD~7x*CJ(-qnwI9?QiNCTfN;14o}0FZem=W{ zqNwfQ=lU~OJ#yZe%uKpWcy?5L+Tp8`D<6PSKF$zfZ}uRQ&s5I}RzeXXlvb9xc>RyT zJ5jaKT~s^QdrvM0$YW;PPIQbw--nrUCK=55$2!z2d(}noyYf6)E|nT! 
ziX4URq&RszfUhbsHjSgLaPBDd}`mde!^)-sZ;;$7;)iXc)(tJ5I2 zj9go871*R2jV&bXzoQVs1b=}Zybm=s%pl=kv%Ld77Ll@9WW0=_03%jZ1!WMX^ zwX=AyXRbGT)|iPtmtB7FoQH2eEoRj1&U`S~kcMgrVgW$bGk781DqVq~WmlV_WiY_!2b^ah{r&@&= zIx5d})(z%FeIPoob zJGpJ*+k*_PuxzcgLq&H1l8mZH6IWt}0(z%#Rg(dZ8x4S=J5zKpQ70#sRR)1*@z9!h+2MT=QHMhXj1`5z%n** zl_Z&qm73^B(IjE-!1;)@dlwCD8LjB!CLv_w*!N!;`zqTfqWzUcM>R1>ZE*uJKHU)} z>DuL%e|al!4)+SoluctdQl(b>39Z3aSufj4YNLGXC^h<-$=w?5CAsKsb!lM^)7Hc5Be8Mx{pBaLcW{R^6)jqIzUdZ|#)OVErJye`EBI zdVz7N(#;oA*g|ZhI&1^e*>5$%q*La$LZy6`*#*GI$i`gx)0d^BMJVQM{GjmU$2;-|+V ze|zAIZ!0S11jF4*uC<;3U!ShYr6PYf6*`luc)6k;u{ASRRY z_N%_{zHJ+WOe(pVs>ea5?U*a;wB(D7jIf65=;x<5!CIL~qEBvR-O=|6SBkY+P#977 zdqwkZF`a7Dd3{73HMN4u_P7?aXe`Lin#bHR?`&&FiX`UynIB2*^?@Bw>Y<9}QPf;R z&gmkeB3Ajh`blbO+V_p+XfxsTl_nCX9c1##(PY~^EvrgidVw0jY)28v zRR0!RsXwHqU4>$p_&J+$bVDWb=q_)klYs~Mek0qD$8*Eg^zvc#eeTI;L@hii4v$^W zog<;B+5r=Gy~i5MNr3%d=yHUrE;8KN6C$DVwI~VXg9Wg+{s2?=GY6G zgLEg3^nF3P_v2VLW@~vp? zzU1!U4wj55OA; z5$qwyJ*k>m&!W}EbW*ZJ&xJF(n?W`#TR6)V3qfacJw8qy<5Wew*WOAoeGHZ@LLrt# z`Fga+8N(lcf*m8$B^tH(KFDWIY}zVcgQ*j8%^J z+DfRpoK~!o5mxw>ZaWbfrM>3TTWjfL1V3^`DxtZ2Xme>fYxidx-uehPUIss?r8Vys zm3fjG47pN&gW2AUM;j^AmCc2u#<5+ku6vE_UUVo&Bs;UFt3TH0or-#&H9k3#j<{*-3R{7HLw*;nOKR_C3MD$SfErp$0EAEF#*`5f5o%gv*vZ*xg&q`0Hm#LK=i z7mVDjEms{)snCBk+L?7eY&hDcWIvFTWGCaw9;7$qmWFM-s7j85t>Vb+SNqwQ%}2&W z%6;$HNxMo1Pt`O+X0!9r)l}?sna$?8vnZx#)?KCj4S66nxV@;2xEW7wr z_MJT{$iHkBMf7=kXf7VTiUj+eSkkQ6>$&uHI!l>5P38)3%@;+3%-@yndv>HQwr2op zM%CaSE*DGM-Ji+XLZuuxw(e+o0L}Sf-zJSoBf{=yHcZ#&n}uhU@*}^J7j(-i%l!wb zoyO@=H5@HBB)Q92Bfhgbm^IqvbWKg~$ChG4ac=U;S^u$CCc;~VY$UXGBmVN5vXd!v zRtCLine;U?QfVd%AvPMVCo^5&d>JhiuSf;GTIpBK#}V)E%AOCV*NtgrDxN}qCw1rE zO7gJH(?j97bCQ~$`TfyZ^|fBww_nR&%o#3KF#kQoFx}MD~SLEYz;1VmYfBx<84YbU_miss`t4NoUPUBsHuVscLmW z6YNgr2}jvF@aSAN5!+!SZFebL&BIZkm?dc|SvLRl;F?`ZmJR2)EM5a=8E~S!TZ}YK z@bnWb3Z=w9+w^PJQLU~O2*bqga1nfviR-)ij|SwSs&~}YJW;4e#j<7h*zRcTPatjk zrK`}i3$_wgTej-etH~t5sh11XoROHR5(f*2m{1qbTA`gEq+Toe?@9Y>&DH=&G+N*3 zilm$-3MU)cJ>X=~WZucw$JyuR4mq2=Ia^p*XNq^x9xA0*g-UD7t869P#X+2Gqp$Uo zQadB>d_P%UaMiN!t4O!RiOp1F7(QU#aXz{!i1B`8HKBU+PP82N<^X~voOU{JZ7dC! z_Z_{xp2c6yXlZ<7)%ZM|2?~j5%7Cq{txRdXUXTnV)0D^U&HRm)kjoW{;v4<@-f+1q z#P-s)SR3y+nBA7bi*Sz4N@cfQEP!8*{Crn;kxe4H>m@AG8o_Q*T+AZi1ht>cWk={! z5RhsseKq#tAmuG~TB&ljg)~hgS6dU_@0(DcLUBKoQzBeyV3#sUDV$PuA(^L z@5Aw|bE8SvO0afnlabh9pCfQbJ=f%n^p?^ajm^Q@l_Eg~xW+_K>IGsC(0G&N?FMh> zlLdZjx*Y#XDuvO*t&dqT2;{N5&%qaOKX=(_QKil-CdcL~2!Q;f+O>X^IVyDpJKIO_~808O^2m zCvP`Kk!_5{cA{n{-QDZlX@oMXFS9qxtxDuUGDD&_Shb?A>+h90m?@$K&i zdXe>5F|z5|8I=a#UO!0Lhn1=vi+i;ao2hiX`TL_zXn8aew(4X%cs(g)i*(iUmP&$3 z@2&q5lq`Mq50g=5Az6IHHv6Lu`tESQe#=x&i?GRg@;6sD{6w#T zv1yVlk8GC~Lt&qeF4VSqNQW93sO@X_vsFt&;EboKgyQjZYOh;HOJp}*A7$0?qi)WG z6Bn|Q>urjg-Gjo6)wqQuj9Ymct=4zja%#7XgMnbNZkY``yi<^zZvcQbtOvIfqmp0U~D26w`T}V0yIxQB1175y#Cidx-`$HdOm;wlBwxUC|!p#=Rq-cH7R2dZ{9h zqbJ7Ayj{1_B^p0ygm&wRik_}D$_=pq))29+zG1~-bNNj#jYOXa)|&&?vaD9Clvx+F zP6e!m)-7KtQtD(Ho`d|P(xd^fpg(M4bxufcFJzui_gqibugh9H@rNZgjt6`3gw9*} zPN@)UAM+dZnHjIw{y?4ts%WBVgM&E2853q7CY5$$P%SBlCDQ9{d9%Dbh{Y2hV!y|#`PP}O8t_wP^Evx%NnFBFgEYIiruSxct~N8Y!6EYHYv>0Mvioo(NXQ43Z? 
zHyT^VrTyf*XX4SBKg?!5KU%K$m^(x8&@o=Q`6{yYidgT8A|9(J12xx&d^J+Hw#E(W z7K>Gqha{T9DoQ$4FB6PUT^Fv_%bs?Qdd5q5PBgYgS-+W5+>eJD6LxiWi=EWX`>HQl z2g}k%zC6%y?_>!ej!N+kE>~jgU7LPXx${HwpoBJ}di|(x1f3?4W)9@ccGU}NN`va% zuA0mA#K=axf|ur0z$2(kZ4|tEo{*$_+LzKX+vcQf$Co<`$C7jHorXh`^=!qvr3R!1 zUWzYr_lm?D-GvF;rg2Qoa$c!pA3s?cE_;PYsa$&8#v)3lCzU?pc5bDP`~z~_j+XmH zV`Qr*8M#?}FB-i!VG&l&_aE0&Au@&}w$CJXKJi)$8%w7n}<#cS2Kd#9i;MCB1D5@orP5ak2hLuPTni=~)3d9%dubWa8yB3`^(9A)er*-B zMtxKG9L%aqx7fIzTwPQyI3sGnw?sFQC7%~BM4AJi*J}{(iC5h%4p*GN7nuDDpZjL- zPFTR*0;&ca%jV7H>m+~~E=vobkLh01-k!F7i_*>5Ekq}91R-YMTCrMQZ{m$raO2Ee zTf$u-uV+5#z3G^CBBaylL_Kl--QT8Hu<7afNHplt9k1Q;g;Z(Ix9Iujm8s}oi)(-V z4fMm#>(~BsxbQ0+_zAs6Z!IVBD{+HKtPq>t?D8aJmr`gvgSK zUS9HkdC@uNSXspHXpVSAMgkr3U&NW&m02==$(r(`;2F5Oup;?=Z_u;p;`R4LYJ(N$%RSm&SEfl4MQ#kmV&ES=F_?|JA+=!T>@JTZ(s#`S2NHVu-sS+ zURMlNrO}ISQpQ8#!t^va%>-G_<93taC3@9Vz7?=SFv#L~=xWNHSp7O20bEn^i07*d zRJ8iG=IRb%;p}TyuZXOEI1?<(=oI0dRlebdm7oytgjc@dY0oZr*KW>5FoRRfB_XpM zrZb?wiUeinYqtLiwvM%hLFnyEr24uzwxln zD~c-Z{)$)3s_$A(r%Jywi(wbfwQZ%CF&%cUd3fhS&?W&Z^!Hh zJ|~{Na_g(tUcdZ1QDolK#4i=Z6U-tUEN>ERH4_z{rkfLLDA+>saE=)uT0bgdpvvcUoO-wb_W4Gp{& zy-pa@(kw46eeX6wP0de*0?u&Z8wq0V%^2#7%2zR-UcP#b@UQt6R~J2?ai<8P@;ay` zit5dXy^4w{hpvS@Sb6C-FY=(hD9-YNXJy5_ti`+w4)i)`M56Bq5x0<`mH;!E11l>J z_%xq3gV@YlQ%f{Guf+95CaQQ6fjJk!hkOA%2?ize-Zh26t}DXrpm`Hq-fM()r*Vy* z%KH|`*o`#e0sT2AsJ(?y%r%W;x4wciYr_SXqr>(|Y;lppa*|udT|{WX>wCi&zAa<^ zNE%cQ-QLC5l`!+Y8EI+BniCl8Vz@Sa4Go!r?=D7GrW_%`0%lHq%Yo=BzZi}!Gv5wj z=B$6I?RA8Fix;q~RE_W}=Bg_vW|mwxScRT)pFQz~* zmypH4ydgL)dbxk@_RnS+*H@!{A^1)(duI@J8lKdG&lwMrew2J8kSPM1{i1LofCs0P z7#=dw+ul`a8kFuF-ml4E-F{PC5Xd|u$gX(gVsvS7E<~niVriKH4qC@Z85R7uanB9r zmP_={A%4v{1Im`vyv~&9x$Dx!o8VEDTXW9O=gcKNaBi7~eeJct9O>FftX^`-F&RK= z9V@1y(b>0iw_o=J+29mfz3zE4>Rwdk%VpwqEOw1%Oz)a^g<{Z|Yk_6cJ@0kTxEz_~ z8&WGkqoB6!_g{C%T;SFeM`sl*X{@0u0jJ-w8d>oW8b+$}f`_xF)hQ20FOy3O*hqL* zlz=<14El(7g0ASYN12yN-?C$EF|_8H^Ak%56Z93xT%|SmY1R)=SEH=l^ncPy+zp$Fj(o(<|z2FX9 zHq83`N*!51YIAd|9(D=5*%8?w9+1K}ZfS2q{Tz}-eJvsv?Wrqk;oGiMNJ6;lWnBE) z?42*=zWTEnL^;{Rr)FDWGbcn%%;KZlCNzIO!TsuLc7_!B0g{{*u61 zfp@}gE>v~aS8p4F|98H0BLHgBDiufHdn>+dPa_ML!eJj69dfHP+;yLMm0WAB1}-fp zqk&l@;NX`-Q6W61cDV~e8jlMzOwoPu=FO`!?lq??wCcQhA;e$05m=6g#c;faPVv%8 z2)PZ8*sk8#d#Cz!6Re)*zpc^2H-#j2@wF=?lP2c_H*VLi$t$u^TgZRyM(Tg-O|RT3 z{HB+T$_j(rkYEtojBn;vcrG=!^q#qJgUKKWqN1z#TMCK=MUFS$V?8${zBhE&Q&}pWrqpa6)*;>!D^=bIuzJ%gmJ9 z3l0RF!Ym>?-LiKHrdm_n_=TlFM4a9i+aH4<{}>P^Wo*CRc{(x;ktzE@0KoHS2Ea_a&%^ey>;t7 zVwjk{@^wy2;@=v6V~(Ak_PrUrHO)HzKce0PSd#247nDNmTRoic0wjTikTfHW@9AlC zU3u@l_f=V0m6hISWqEhYT<=ZyZQngT-QzW*(MY3_gb+L#5J>xg#l#Z8J_2~a!UB&M zJNwsz2qNyenR(2AOlBSXf8W0Y-%!SIw$LgB)3wm03ewINsrW@rnsExDj2x&i;q7j@ zBUsqt)U~T5&>GKX41BufdLzzL0g+r%;n;#lsg*!z{qVLijbb6x9g8 zqP(st45ZI}CRcJ!tL^J{r(iKN$@1dJXdkJPWiU!m3k4b(Ze7eI4A)tzxU(n|0X|jW zhH2EQ^(Xb_U?N)#lS!t8dPrfQ2$42rk?3L7(H?GrYO?qE6Z@^1M}N|xfF}RY?#$Dm zxRY=^)ku@KmueEpKVBCC%taD@?{4QxXTi{IC_%E7g08#7`FU5c%4MESIC$9yMlLWq zOFFO$^rfjrYncHk&oh%Cfw5Ma!Lkh9N(KBJeSu85kN^I4tcgaH+leSbuB^)qudX9-%!IFFYaRB4aP<6n^E~Dsmb18*~yy(V{ zy9!h0l)&N@!BBKlcdwaTSRp?%*eV{ZS9Cf0zf_7bxk6w&Z7YEJ?!BTfK4d|ol25IE zAsvb(Mvb4jzz_0OB$($xn2FFz!SN=KsB6&j>}KZ(xBGkuL|)bR!I(- zvJj_}R)&hFutywBpGYE%M6^>)3r~oKEptLt7O+Snl1Az|;nKh#!iRAQ-Rq#?blg=E zy#>*YwNf$hP@CrHm}^a#;sKSZr?YGEOi_*7o^4ohB3&$@n0lAwj9}2)=U=5lgTmup zf<#_&fsMYr){Q^XHV#~MkL>@lIC=BCb&$EPe0eh@iuYsn1C4`gXff9Y#b$!P(Ejrm+--u4C9%C zGg|gLN_Z@)P*F^Xub{OH+X^wMpn(}fwlaxe%-ECGun1L-#bV-uFWcqaFkz)wiPPuV zMTa4l7)Yw`4qmzLiC8C_8bMb#=7z80&N0vnYQkZ*TIgZ5H|ZP?lMHS~%2dS{44&zH`8?hXQ>VS@C8@$}OX+IVwow#HQq7sEc1nquv_lD8 z308%iw~}=NN^3Peh{YQl*0RuVbi#BVxAUcr8sA87%6%b$BrP(Z3OI8OWe_~4&OX;=7756iy**+{Muv$j-)`GN;O@`4} 
z{*>YU7%J+S6*z;i9h}3a&HkgDTR;A6I9x9?ef-r7>-6?p9&P$ucFQ+iuYLMd)YY0L z;>q&R*2GCyb3yOph}S87|F% z3dlv&TJqz0Kd-k*&Tu8s(NuC1e59xg8Nh!k!Jf|TYdFCSe9w{V7-H#kM9Ua$+r%-z z^kW`u zF`V=wpDyt7j;|3-C!58he>${d0yav^LbDuB=dz@=9VF(4*~{)qN^R{ssgtA$nOSb0 zOTGf2{k+Qbs@&^19%4q>RM5Vc1EejtZwU<&gsqm%wTkJxDKX0?df7ap`VcbxY(@5F zidM&PF!H0ZUZ^j{c9x5_6c(-|!32+)%#f&$zED8Q^#p{26CP^%L>%%C`q;MguNG60`sC6Es+0E(!p6fF~QOVfRd6Oz(W)21SbMAV1j|PfMZk&9(kM+NR_TDKz%_=G=S5aG?3g%{GbO44`99; zxHIY$-DGOSgcL-UmjRo`^J-lhV2vh63c$i50=)&#;ysBJQT8oWViiE$vq~4hZom;C z$(<&|C*&5Z@+`xtEGLK}!vpk`q@jJFw1NZ$VPt}rS#}*bIvRjlQ<@~aC7|n|WuXK( zRvV-HC?yg5Lk0=gOAgIT2L20-lzMFFHK_rD;>Es;Hf_k-hgyj*981S7M|pv?kM zu`5e7zJ{}uAk-V=s3s{ualldaIS!VmWR(G$kMjE^4!$qITPqSR0x*wkO1%k&cY&cr zXaNYCnk31!0SzRLYuME-{FtZ!6V2+N+BwV*dvzsx556R_*UKvVOb>`DHaWUipA;*D z%oo=wWQt;c(ux6!u2xD9hingEL^x~~K14*Yxy>tb3O;8rGd@v)$D*zPF#+8tpn1qq z8;C!ut-457S1AFJctK@>pBYT316cs5VF9}aqqYIt%A#@jJ~5G}325VH3iwG>u`?u_ z-8o%=jIgXEbF3=!9NPgZ83y+F7*Uomzp*ReXP{ zN)oPu-6xsplpAea?J&TzQ-(82j!J==fru9>P_DUvsjk=F4aFuj)&H|cElEJFT2Z_w=YtbmucZWez1 zy-YQIcICvotDNoMRj$0A1n7E0}@oO?rI4sh(c}9=c}z;ZEI5*lwZV&LZhwr z>SMCg!>OSJ-30KT4ERJ%D)f-Tx0(aI-vln1mQZOQ5mlnUI?a>Ez=FL!EM@?@4Cvb~ z23^)jsVUt#!vHA_Yck#=_h$+qds`UbO2uxv(W})vOqGT9=X@+Jh_De=l2Lpr)uHbJ zY7kI(Qu{(pXebp-7U@8e`ipjD)&0$T#>YZ3olSuYpsZJ=9<(dAx_7`AX?!Xt(2jollcXVAO^77o@X zc(;M^OcSm2CS_te^<~kFh?2hrz_+V?f;m+cT;TR`a@_7?z#-JzmIk8=FH$+0oyjmD za3ovdGBPglyqE&`FTi#i!fL$-fZVzOA9sUN>HwN$1yQ!qB-bo4^*jJ;yMoGyD#uUQ z2Fx~FG}Lev#|I5g;hTLpZD4b(P-YlOgnl&J6a+X0kmWg-3Dat@Opk&g0J>A51oRYw zDscj^IZ0uOW+hM|1oXNzBVa_31p!7cnj8Vu1uFu(D93Zqn6P3+@giGo1CV~IFbo2F zGE9SL!bpINGLRigsw@C6gvd!vx?O1i8IwRGEObVK&?H$INSlaX7A44+;Chn6j|3jT z&U_O%VIWUhZ2;X02^0?u5E(VtIc1j!6{jVDY4eH-bPF=cuhmH4HQ?&BEK2iXdQRts zvd}kDv{VHZ3Ip`bpyUV*C-RCS~E8<+E(J&}cA35;U0c8LrS5 zDWW$e=|-b1u43hkC^pymF|LU7lUXz>MDVIouhrND@X5%Wgbh>->#zdZCdPC6m^Pe| ziTNzkfmg|q5KaM6R7V2KSWpG3!6*a_%5?>YA050pifSt=(}i)4mw5OXSwhUiRTSZ+ zh%6(+2wrFBKMa|Neu`#Vut=`8pqB7a1B4$h)0F|>>xVrC_+Nl(s!pnH3V__)paUH( zs}hU|AAN`Gp>@D(3v>mhG4K&}VYCJiIyF$Fc~1<}50Puj3eCtc)`3ni=yfYfb?{gJ zG4DNvdDzdu{P4>#H~bjP|Eh5R&oGah&j~*Ob6o}IW}kp(A3Xn0a0QsV&21_d`v~Osqj3EY$SXj63GOox{}i78Vt$42zlHms z203$l_&-3L+mOdg|L*rC$m=hmYHZ0x ze+J^rZCrg5{O8zh4cz}Ig#Xmv{M)*{$?z<57FVM~* z;LjU_yaVu?+X{XH{50^q2J`O*qlV!Y z|LE^vJp%IIg7|aUzYXh=_koNdl;;TI{W3iMKIHXhaQ~0s)}X9Iknum@dIsOik3m`* z?q7xWS%q?N;JyU;{?WhJWxk&JAjtbyNc;bS%z4;9LRo(v%4&f!{RONCeiQ6|y&uY$ z1vwvtu>T6z*XHGiwDUET9rF1$lq(O{7vTC7@DKhj-2Wq_xd{1RhxNq|0|()^pj;n@ zJl=ve$Dcs@14w@c`Tjk`t%95(q;0@D<>$d~j=i5_-sg6mqu}=?s3QX0iFY8)9gy)o zNdGGk{=={~`xx*5l%T9zkmk=I&pAiopTo5Q^|C?yIlg}p@?L{FeGuA{gfcze3@mEmBT_~Fc{AGx95$<1w^8PoF@gz7O`28j1G5;PafBPTnsq^#gUhmFHW=-Yk<+Y+SgkIj z%V^hnEq06D>oYFc92%QZ8?-vWcBs*1@p$bTv(p;!IBhnQXE9^37)>UP(GWA(jD{tr z&SG^~T@H^~uYHbiy3G!^?xXhSR`knmhi}DWT1h7tQY#kg;&LEq&+1Ka^ODhKcNmrf zI=gPg?DkpCJDfVhd0WhBHQ5c`r~MA|`4ywvY4>UEhEBwD(crggeP+Af5q~P`aoZg$ zMx$}rYB0FW^YX7m?M8>mWwaTBW`|kt@|ZO)m(ilv_|0~Q$*T8Rd@jo(6fog3In8?O zlHcVp7|eQ$!Edx#>^i5;X13WqHkaA9taZC=UbhvpTUarftbXs3{aH;ou;Oq!{Vv~yafohDPr z=Dh5*ESo(ByIT*|9!(CLEu{6B9R`iVYqME(k$}c-b6ac%`-;)#@)~?lK%2>CaX;qr zY1~2cg5#22XSJHWCZpZra2V}Im(64}n!J9!$)>XgqZYT#x#IVjbnp`W?q!q37+m(4 zJuZD9u1jhxddG^_Z85sdHj81wsdqkSbQnx#o7ovOg#$sG`8nHS$YRzT!%H!*(`GUy zJj)K9Qy()!m5gqq#byq=?Dx4yX4#6kVxbkI!4k2?ykVz@@GPkd`b!s$asMMB%W`DJ z^EsQxn|83qWixIJlBQJM@7DjE&TF%pb=rz;{qdwe=uGDq0It`QtMdq#(X++;F2L|u|!Q)&gfrW^yf99 zz>-d<2|e!i`F++)3ybm3ExQvdk>GQelA4gtw(Lx~f70x>Uec8HArpK9j-od15&ah* zzpM+oMg_Mm8+K+F{3~HCSbQ{lqB_k*zdezr-+OXMSV!VBUz%i17o7WnV7(owEqr+4 zQNQ{9E9PKI6Lnl}JYEW#yzb?6*K@fv(&cVEm8tXxm%r}H)*ZI<&k5A`HWD^KsiTXN 
zr)$RhyW5UT)_m^cqc%HF8iY2uUod7eHn+|u3faat%36kNOUn2_nHCLANMwgX}y39ed$rXueohycb&kT*O_n90Kv-N`0ykK>^bSrMV!DP3YHD-s- zV1pl#N$>X>oi2ybX>%C0T3_6dF<7hyi^=MYMVuzH&*Zj4HYT&lZ%aV$XL33WUW?YH zy=Z|>$fDJG+%9X(?KZfaMyFc~odf*LJ^E$nd(8%oR=Ws2h{5eOFGg%Go!jVfyPd`r zyUpr^-?}ld?6SK}eyhW9$!N9fHAb7;=}lWqUZYz(9}Aooozdj+2VK76Qn;#3rdPrV zGjxD4>t(Mq;(p9}`RA83%L%J5xajceW0#y-uipD1%SW{<7oh3N7h_L3po`2kBL?S^ z(ePY=T#UJrwfG|mU3@tLM(6!un@M9rqs5}b;ISE?q;|8xY;=U(OL{ld(zv33R&NQL ztpVer!RpXJ4fGD1Y1v`3x-9mz*?Y+yaOeUqdmw0pMuUFRW%DF#HlNk0OBI{}msw}9 z2K)hYNawIYPj1w#xcxed&F#^_fMJ5((_^*-9U6zjuCp1PR=qZ$*C&lOv(93&*zFM? zw3E^82)d!0FuUzGXn(8SW%0s@qJbucPDW?+LeJyzJZtutoB_X4>h;=^=CBn$Mt96?_geKvOUPi>I!qQDM6+464s8g&5jV7*anWlsvaE+0K}@UH zE?ux2J$n5De2tE!75zEA)1vX@JXO2dwDc5=#-@z^iOY-6IhGwBlY2faEd-o)z13oJ zueg(+@LC-{r!TMbUd}mvNuw`f4L+lDx`I)w-DCBHj8>-^W+5K<0XPh1n;z=oGFWs$ z7=f1cHmg?SvYT>dGkpFgqcvuN(a>pFarmMAYs<<27u*)hvc(J|aKvfT z*}O|8mpd5nT9@pWk612v0tTNaWzbuVmlMw9vdv(LyM5NQ<_G`sAM2?m%*}op=3@T< z^Ue9XAp`TapNDz!@4>u|hWR4_VfzsF6!?3=y#sMVFt4-19Iy#t^YsS?{yztEzW)=# z4j>Io@!oeHTm=082gv^yn7jTS%q^b)za5Bw4*b6lb3YmG;}Cxv{67Tue-HjWK~=#|3u*rv^7>~;YX$cc;Qt5k`~!&lqfm}7L!9r!wGMI} z5O?n8yjAZ)y!o8|MQ{!9i^28gJZ}j5<9T{W^A8~JPe9I3zn&jVmBr%vCC(bAc908ty!t2!teMYtv=zrW{`J`4kW9okN=@R`a~J(sAdY`o%16=qdG9EIB(l($FA-3CQH(qYy<*b%2m zGaF0gJ=yaDQbwkEDz;j23QR7h%TpK2sc|savlP;ZMb<|XPi6eBkh@3bJfUhKx^mM} zOr(2ClI3cTbhV36ppvL$)$2QCl$<-Mzt*s;aF#1sg2~$?xr`t zSMxt(N8?*&W(b32=Lvl^kFGcS_&9G@r&lsT-K5nUa=couEs&14e=1*|s%S^ayO z6;M^HQ0wuH%{Kc&bE%l}42;(!UZEAFT$f!b1F&g3KK(Lv=!_e6<|~hn&r02c!(T~- zY5+{+kY?o>fZnBvM2*e-`9~wym2M>8ZY*!Ci$Qfx+?>VyTxn5TfHOeU!+H&h(^`__ zqjY&<6g+rqF~|9{mPAb4S@GZ{cRIYE%~+B=%_i;Mekz{I66IO}3pv|>iL6qIZCYvj zv8)r*7c8~Q1+H4)lfrPPmGmk}OQm5$>j^5VMezWZ9Lm+U%aXa*A~V)JW0mCsRRemU zOa|FNs4K;gAE{0i=kUYbcqB@3E>A=@d2oEWC?SKT{BAW}UhQOZ&lX;1DnlZV0DZ_W zOe*eXQn|q8hjA``vcWu>5F$C`VpC|NzA#~$xoFX7L#i3EIBNDMj%tWZqk(+QyByUSs|9_~}hr>0Y&`u9>?reYILb8LE4>N5~)%vKD@RN40xD}~9 zjU*EB&d0tJXbqmRHC3G4=4+ReU!|Wbs7aD{MO2CQW5XrdI7h-feGNF3fC& zcqd#eW#hrJ4N1pKu4Ia{q3uedh9&)lTjH`5KSG8i(RW8O3gLvk>T+4~u${IY&O^7! 
zZNCXE*AXt^+VJ_gik(Wp8EzF3DSw)37;_h*vjCm~$VGX>Roc0kJUnTWt)fnxVnQ#5SbW>+z;Uv%yT1I#6UYVozfF!#m<2OgEo zf}u+n@w%y~#Bfl9SWAD-f3$%z!Ai-`M0+_d7SM;nJKFli2AMBzXG@!o4v@x*pmAIo zMoRoFEM(v~bGQNLfdx=%D%Bc0aTD;|MvJQ~>v5sAf`=}nytZbSBSo{~j-`SHlZ=&X z*_CoRUPDY=$$ObdyW{ps@0b?*e+XCO7PUw0%OyP}$)68HZsK_6@TJJGVt(|KscGt@VI-ewD89I+iCT^Y?VQ@pNd7dqgo<_!FYMKc zM#YO|`CMj!yJ94%J`q?-3gzPKM%I#v`-kB;(X^F1Xs}Sq7Sw?+PLD;7EESTp&?{$R z@c@#Gi%igWBoXCYoG5sh`g12~w37dnl+Oo*bOBFJFW8Ea>}b6l&2s#uTE&z=#QoOG<}MjNq^%u7pNRWHlOzI9dv;p~VuycAtofrd$(;&tBof=YiOpC_h#;l{m>A z7ptWwVXmTUt#oq{CpN4bk(yYV^d|pY0M{nggtKA1= z-EEDfveS&PqA2jXXY}7n94r;;=k|4qM0-;=?8rPZq53t4ik9CTyzekKhR( z;sO3#%92dAd&bskptEC-whfkO`^x`b{D}Stk5|hz9q8Wiig~xX(z>XhK9bD5Ozb_C z&-H9itDi<`jm;aEI4S+`sRirmam1G><|dzE%K^lC(V?iHr;$sEjq|O|2p`Igi-(wI zui%7VP0hMJ7W*9OT(;%2Wi}BDmyY9A-cxoWd1ROePUpnNL=@o!%Sds>ta)u{7(}~m4k;$oelf=SkwVEt%`W zb9~xhjiqoSn85PkvYw4OF`GT2wu@yM2~t*nQj5*1yV;@@F_c2>*nyT#rFtH10j)0v zpP-FQ`CcSMm%YAJv=5aY<&9^f27;#O)aMHb6GD+@un^F2U6_8%0&#Kd$sx&z&~O$3 zg_7XHVRT59>w6pJ9Xy{ckQSkC8RVlTcMz>c3Vs49)RS(Gu@$VY7{^J!uWIM z!ow0+Mvz~@s9)7yB5l!MVxI}prsZ+rSJ^DTaMg58T6uzvrP+3DL~rCj-poJSC8kcpx*_+bkBgg@r93I#u~il#+IX3K7%dcYT&l{ZEte|^!C%h>eXZ3tngeY^x|k)s ziE65pQENUY7WDy1dW0!b*^!&rDw&5F4f@)?6-%vuIC zQmJ%`-EMj#o8^{KG>>I`Gm21=@Svkfs}(vEPm)DfWI0RXnp6y50&VEFzuXqiIod$t z)u`<(ttHxJ5-(C4HL*}k2g_z!lZ@`yx@AkC>wPkw#tiJ1+LR5c)`)i|Wh_>W`Rfse zizL}b9dC*VSSJC@9p~-j-AX8#_KIq-93ptIP9m#8Iu!RIF|)^L#(^il%`aNR!J`IK z^5FsJEYk6r(FE3jbrqe=uAwocou(-!g11(}>5*6f)p0Z-rmF&rr|Zr}FI81KOFYL# z0|gRK$|&>%TtO_P^YL`b6UPEFW%8HmNhaU8mDY_9#@|Wr_pgZ=eH~oZcn?0>h<-O<~i>i@J zR>Iscrrlli08Aa!EUVf@hE5WcPm@xeLHZFiB!-;C0H!F`JhRP zirHwsoM+mryHe}1>7@cM7WhIi=gWq$e$9t>aU!yauwOG2sx7g$T9f0TCM^asWY(0R zolJ{x6=GyDo<6V`!8! zUL4_q*+dXST%c#hdkCS7&AHR;{~8SRII96bnC(o_o}bYxj7NZK(GDV3bXcp$1s#$5T* z(j}~wbR1u_fKp<#IVddUx@Ej(#xhew3(Ml(sy0%LFgF=GP~ik>C5xlQ+SLlZoJk5B zuJosjV9~?YC?rA*%*A)}VeHURAWgH}V`T@WJ~Q5QSUVDGa~1PXHNDHO6p=4@-hR4D zRw_>(+4FjTal=@06_dluO<2DK;vsvQZ6&LD8qJAi#ePsDc#3Jxw4-L~u_Gw{dus$s|;ND4Y*8thFn~U>wY^=zPA6F=r3?M-R3?j9e=I zR@I}AC)AB>GmuMYDF*hnQ)Tzc#m2{&XW>wRO=u*#dDj?XuK0L8d`LT)luKGcXL!8p zaK>sWU!hoyKkbfFvuvz!XtHMg6b{DhV4LfGUjcUm_K4mGdqhvd6}FMyt3befZ)zR( zk1XKoaK8IB*rWR{>@PhI@#Z=VpM-t35?tpT^>aS;zlQVRV@LzH@0r2Pby>dz@_q@< zh<^w6u$r)E^}DcNg+Tlx*ymb@^W8Zg`IF%0cJ}VUb$-*T9OQfu;(rii zZ9|zhA$%WXeh}{GT=$QFe-ZpH|MNI(s5un=1&*Xk+awr zcWLV})jG|#ZxtgiqN{Yj;J?~;Z+=O-fo;}dJ4P#sj%E`gi0h ze*ao8#F>zr^(lG#=(v(?Ia|ofh2HF}5Kx9+xsn{nFY4;A+!zEesE6fTvLW?r`5iuq zp=M#@=HY3!+7HrqpXhB$!yU22U2C_CuM+KzM(=C(x&s-TK0)YPXXag zJbZI=wJNAd_K){=cCZs{bGr&mwP37UEPtNwY*x%XE!-czP?8R>1oF9#_=Qu@n)JSN z2fM}X4mBrAe~O5yHMU8$(+N~r6<+0*x4Q}?+4>0D-e~a| z&%-OJX2<>QmDD}r`^FzruI*d|uy=zc+E@6^*G&90~1ZePt!9x8Wa`~Bi;wlBGKK~1^0p&Q(?Ag{Pq-zKZ=!QwA=jU8X% z@#x;rJB=o;wr4x(5k6EuH_KM`2+0{2>-|uTPhbf0Di~L}602=i#@=W!xx1>8YP!Xj zKUFN;_TTD0aAw|p@BqKbUVYxu>kXRQkNni9)@?s7Pso#(kBaxd|Lg=mf1~qk>bBVF6LU|)+k)9_{@WtcsEiRG#4u)~=%R>`rexw5mp@!ery<(ZP896un*ubTDEt&Q-|$ z_`X^lwQjx5ZX?#-t#*8X+%9Fz(^h@EUD#9tFRqr^@V+(IcNc!+_O%yNyP4!i9=54Y zEYLB2MHYg-#o8?Rtt3ahu2eUqZP?{l#;+21p%HH#AGKO{bHRs{S}&C&*EZUFn@3c; z@pAG-m4Wy4PCU(ZWHZ^@A0Jnxd%{^$v^`&3 zzN(CIE}y+QzFL(A{MB*dKB&OQQ>UqFc09QEx)-i^oNnXRK~LIbTq(-%JKL?)lOuxU!8Eb*x9o-Nx#;<*tB<6@msG*a(pj& zrk;%Lg4%wg=F8}{*}^@E8`RPj^f31P#!Ir7#oN0Nx`UU;zt0G%+Ra|S`B>c-5yTzn zR;H7*l-oV{J+R(s?q}4yyxw*1!I1CYYMl;RvGz4zV^ozVU#17Wa2&lr9e7Fm<8O9+ z*N>3QGnAYi<`rhIU4L+Ty(!(uA@|#@%Z;Pn$uz{ArW*|aU306<%~Z*Q^Y?pi3;Eb@h{INBuS#wY2KD-2*T3)6A-Av2xXvNJ zeywv{8)9n4c`tuT-0a*yTI^c7eLdR)9y+X_;l}Lg19USRtIfcG(T(=?-FRIPKLch+ zxxz)dcjv}poz53)(?Y&Pl_Xk@Z*B&BTxZAcI-(^EsiBcq-{K4QjrQ$(Pv3lPU)(z! 
zT_e}`uBGoNcPq_0(nH5&1s|QHTPb=#t)jE>)=xX1yu%>$-RJLU)E~TKktWq^ww2PA z?eA)>=Vo7Yypi_5@m1=TVcTPVb2MI${pWsMm49^WbV0(5RV2eRMC^-TbfQwdcdbPr=X`%R*Q?jCG#c(Og7a-~+RiZnXWEpKYCzr=3tA8Bi6z0c;o$_v$# z*=xAEdHuDv>j&uO6Q@ph zu&O;;QKyamw7Q8za7z*9{tggOwsCz#5)|a3DBDPtLdRxOwk1v%r9Gkm=t4Mr(32!lutzT0Wa`j!~VReE` z{E>$*Cf_L)Pp3WkX0bMiJm1ai*j$#*^nS2BJDCti#QqFGcYF0e9wa7y@*e$qK%%$I zk+VYQAT>zdpf)_k=JUfld%>4^sc>_qB-h@I76---RYkqQZVZj{8{_>u-~NSNZ{Vw) zu)l@2D~!-`jlgBEXfub=5WIUi6ET?ZXF)rJRdMVLQ=i<;od% z`a--n{l8!z`2ExJ(HD-Bv2FQuJA?Xnzotz7GOKd78^qW5&*J?9HlXZvBK!3_6n{3_ zqFFBY=IwAkfA^}4-3{#C-#=B)T+dVLgSG8YUv0g1Wfo48_~5lYb|=OZT8BCO;H9#Q zADlmRL~3rqGw$MN>vUKMzHnLBfqj-z^_uaGnbg{xr06!&D0{CB1}xAnCCRM^-1WFB zm?N7nwY&9xueSwIhgBxo-@CE*Vl^I>LN%43y#H{!9ubS7(cZD#W ziL^|KRqiHH-iS1de0pWJet2gGAq?j`UrklG=I>@-A2iv~;r+En=^}P;>!tY8&fx3R z)nclfSlmBu-afm8)-(BMuj9uX5yh$sB^oa2QUdh$U~1h1EXVI zW@qRQnL?uiOHH@gH~0?w%zg@$dV%z0Es<;pty^4`*~g{~)rR+&E&N)LquGKhV z>N|aOHjQbjH?b2i{zHXHqnG!36k zD$1ydz<^oys#6cMiFdD2Qvamhl^?2s;m%CD{BSb-h^&LD<&Pj$1C z%+W*^o0YYDyBgUqh-MvMG{c>PRxuG=ut z)UYLn*Pr^zwat@~HSp`FuC3VK3)2JcFJ9fhvHS4NICYkKDE8N{yk=#Io**}|^;)I+FvzRad$84m({h^*Yt*MpcdjFS^(P zB)gjM7T}aWuXM)kRiG+tE2S^q>0Fci{RB>by^+4juAY&v&cKRf@0qO`lQ#F2)y>Hr z-O}z@@;A_ZbZtr`J>6N1J{k1~yGPOe)jG9ra5lr;M)A6GyZc9<=PIq5W`e&^**YGd zJ7^xZKKe!5t@QJ6emghm_I)$vJwed^-3o}VF)B~az;gXlv-n@mlS z-LZ(~*Ve{>o!%H6+1isGuvLe*3yN|gRJXb5+Rg!{e>J(OoTyjOo{qS>dpPNL8vMuf zG1EA`t90gKR-Msg{vJf(u(AQK*2(%H-t>Dwsl$%(y`O#NgT-(TNtKt!Bxp()G=$d z?C44m&3Bx8&pQqWH3{H4>pf4Uet0@NxHq%)iH+<;d9c+KW`iX$pBRr1q)rm6ort04 zYVv$CJ>iSN{Zebx-{UcPkQi+o#9oZIxycyW+`Ijmqg7)oLEpe!hWx=`mhfx5XyG)^ zSEc;Rk?q2O>mL^Rdv63%#KyYPZvaoCP}%-mr6V_ctugLU{nLd~`(zKDY{_`$(v!kY z2mfYq^-gG~tTg(sD;q@SHdVacWOiQ??ljjaw$*wkIulusQqE)dtvl;$q^|}CxYzF} z@mcd)SGhZ-#&r5M1j_#wvKvSDh%X&~ z{Iy(@45yO#M)Sp*y-|7eTRqJ?J)!bW?XSOj=xHVSqmUv$oV@g)vOz3uFnfnvysr#H zLuWha+mgSO2<;b?YajnSv1Y*1r{uWM?=BuHYvrcQHAZADx`u5HwW_S%Zm9WJ5Twlu z%^s8K64uEJ!3PpIV-K-VXMJZ^2uJzXqb+yxC**2!@7j24Jm7};d}-#qvR;}^F#a8K zI6&e%CT{%r&P_f(zH%pg_06zr>x#Gu_B4aL-Ri{00!3r@#agaS?Zz@|n{}l)QjoFc zP1jbdlni5OWT&d7>C*KY7RVlSsFT~hN$K3R)8|(YN6{UQ zFP2<(D-hcYUzwKph!2Z^+!hY6$z%hIq_Szia zz6pC>e+B;U0Y2ov1D@imuow2B4}pC{*kk+NU%mI_T@eavOWUwJ_QPO--W%r!=HHH`!LuHY=WE| zq}zo!e*=4XeRw_t|0z6Q0UXB`l&cD9e+c&UUIy71$aohv@_rol@4O)Ur$N?tp*$P# zd=bLG0{Q+O$TmXQ&w{@O(j~zE=fKYZVFTD}oPzvc1=(+c|6c;uFHn~`rsk)i z3^dgBhv)49ac@95{%;5$K>SxA{(l3v1lX4@#8+XD@&AD&+!oB~`)9@^ty0Tl8M zl=*MK-v)b@bx8k15btfM=ifnjUjdBBcOiWn>cK&r9;E#$^k`M@YbKT@YFT zm8{b(iD^p$t*}j*FM?rp9up)Pmjy5vt;kZ7k!3;w6e&&vEbu!rps?G3a_-0&rQ+3o zic|Ct&?+@ZJlLuXC>3GBqN_?x8uc<6g8c$&)Do0Nmtc*uQu1D!E7Y;hCnb?8{SY3>aa5+HE!~2Wk4bVTARhEfO`ewRs*gn3Y3WAy}G0{ zQCb89XI5QDwl)F_jKQ*^Ojs&@G%!w(3cS&$yK;C(2PIF%8Rq&QjuW*BOW*Q7e&631+}L<{VouvdsophxM8u){o#> z0fQ@DI0=+RQS1sh^!H2=8BwE&+Ts+Z-6e#U*kLFCKT+=iBukPW<`I%2sGtZD07VfX z5eVQ8cRath{<(Q?T>CEH-gnQm_uhN&efM$ zQ-A;gh%yirg#IBE!I-G%n#!!K%&g2m|IGUH|G?KmK5R3`{K2^27rO8WVDp4M)u>nR zjrjw?crqNYhdto7AP{H-0;zP&>&x1LA$#DhKrR`8td_$*3wS%U<#=XKAQkio)<*bI zq-@K=lA$M8aHc93aOASf8Lz>oZl^x2bHbvDt8a~Zoe6)!=Z=nZrCnPnw2~@qq1Vm*=(PoH2 z4g+ah)@R8?3Vx$2;aN__f(c*L?Q^@e8Bbp6D~F;H7nrA*q6v4*X1DYUMUOQMyRl(X z7Ohy8J&yPwRf`zY?qb~I&VnZf4>76_G-+HhvU3xFE9Ge=& zpPKZ~OdY=~7;a6vTYD*6$anqZfq(mjbNT(J{pY`Ae;&EJ!hZkbt%iNwInaAPP8tpF z9AztAD>%K7EeY6$1!#Zp3&i{;Pr_qJrNfyp?4*Y6*-2MU6)Xf|NoXuCOE?ajx?NUFq2x5hGWJL$xvta5v%%x+CCuA`VVcLYb(~4TdicPY?tRK-2vpO&|l3#2W~~M&m%h6O07i zzFx#-in$WYfTDLt1L>r@5ex$8DG2h`AI^q@iBbfng#ZJ00n5goNVs7ic`)jU04+V} z%a_2f32b7wg5#A?!QTa^E|G95;z*|w4tFe^0S^LUwL9{k>_*FpZ9l#okigK<2s&Jm zU=65Nu42yJtmdP=HJ7KhZgIx#!>l7#-cCAQE}!276A5q`60&>Cj!7#Dwp~70*zAP^ 
z7^Bo`IPxWs>Q>7g12|W4JA%d46w`(zD<&5R~fP&k%qisaKfASN7F#p@#lT8 zVZ92V?NAcjA^3wakeVS+#BT*695~7Fn?NQyYtDMqZ&pS@l}=~lPH#An@qq7(x+kbA zIbaXJ9(IO%VLXEh%z^xkxu6roy~aR1<^n02f^GB>aP<*MfFuYxgT7EK=mmGa9#1CY zcY1w5jt{&2AWj2;;DQ%MaA-UUhbQX~Idm!LHsI70yuKKsu&QbI$6^t0GU15EBOYTo zrL%(hM>-ibC1FQ-+Ly2-E~KzW>oxnqKTaTIaK@7`#Gz zQXa&Sd(-D$jRynvbir+IWxOC`ExwA$ln7Qb>QKCqbijz}OvO8qy2<2)0p1!M2E2v5 zCk37jybfE%(MT@|mCDw{1Hg(1!Y9nF{oUl6fKAnOOGc`(|By#W~S zHR-J1m$wWCB-wVHCINET~Kqqcp=(wOu79s3{v(5y4^2ZQXf_q+o zu1SYEIGS+&9mw~EFM@v$Y3d=p6kNOTpT&p61;?Mj@fgzi zKE%J+=l`n^??TUyg6@Bz-}m8q0*(tG246zli@o}C$WsrVC4}=g;MyB-{vSc7{~P$< z5Am+RGmCKk4%`=j@c#+&x&+rQxalki|79qT7j)~3y5d4wx8WclZ};Grf_kB#JQq6o zg)PDcod1vC`-gY)7Nl_x^7RE=FM}RFg!BMe{`c#UmIdZC3-CLFu%AL&7kk6M0_nd6 zc@sk(#;~9N!au@~A-&h(`b8dJh5zI5d*S!%7|wqVbAl%q<%Bf;2J-x4xGxFM`&%g6 zK3w~2NVg5o`xA)&Zy^2;;J6F#@5gZM_u%&p2xmijzXt6`4fCmAh53RAo_`0*^clnr zLwY~{|I3p2+CTb7fzsQZlgQYXAKabUUs<&tP5M$K^Kh@U(Y94etBnUWi?>;uE0_I> zMt;B3H>)cimo2i{QtC3+yKkq$kq48Ke*+=TkX-Sk19tAEZY5TVgupY9hSTw=S#iJJ ze_(M-rTY3^ZunL6=i9q~

z|41t^c+m7DGimT({-L@}m@X5`YZ2pgbpCun@ z53k-AtS*o5c|Lq^eIrS$x}&ft)eHoL_KmpMr8}qcsv?Pd9qb2^MDu39qI=wq|@>E_Ss9VO3-%%?hn_l zw)c*QL#_!I)J}7C z4FTtBoF|W`dyP@dS>N>bgX^aHV#4i@F_-zBnq+{jd&<&3v?Wc;!L4zwKD*tom-Dr) z&f;5}|K4@@<+>`p?M^Src*kByPlR+Fke);(VCE>~8Y zjo5yE&$1O&HwFisX9@eD<6OQkvex}!PocZbR44b*@k&$4)OJ$zTk+TXoD z@I1EdY2N)R@FAdUsT=-|*S#Sf_Jb?$?(M-x*YL#S*!1Q*Z{z~64|=?Irjw7=eD1{K zelmaCTU@p{^|v=+MscrODj&w`3j5AdbsQ+nR?1g8OY%$MwB2Jp6ZPal~|?{ zUpKAYi3D1ua;=>-rDCO%VMRQs=QCgwR`eEnyLs<$&Fs=vs>$HsmMK^a6xyD?dM#K_ z9aSvNRAT1Ho_f@qm`fp9FEGR+j=8=)$P?K$EK0;_Fk)D$o))FsW1!n|10Pmy?T&{U{R^sPYw^>Hogcy zUfFtn^T@k(mw)gO3q?AODR=W&H$Iry(=q>QWvw}~AAkc?M`)CC^^0qDZ~g6w?u$(qm z9uLl^Z!?>Yu^0b*Xm+(@KY5a=+d|XKL%%+gOgUFPT`!TjrR-NXw(PCq@i09agHJ@u z_9R)UhO6ryPuXSh&M5%DQSF~>g5~28b+6p7PqvLYtW~^R`wc~GDmU)}){<@Xd>$-Nc z?*#X6H4*(j`bdy5rE=$~eC9F0eox8PQMm2S*}fWDbz$rcw${H z#!SLkH+kaMtS7$~8ZIB59R~JqTkf2L+Fyv4AFh^-9F}dn%=ai3cdif-n)rk zl=gffUhOcp_r$m6yF09}8x7&J{&{AG8kJkF$f+ z^FckZTfh6;u;^9FmUye#pxmv>FH6-wnqrBvMxls6OE=dmB@?Ds4+_{6LomYpZ@Qnb&DYGH|PKAEodwg=V7iadWU zxTQ~Z((h6?q7hB*Riq>~c|wiUbl`7>sD*^N0Y)iXy0f)nz}Yl>rdc06=)5~MJh?YI zEyNRnVQl9xHf=kN>+WQ>`S_k&=gU+C4%1+}Ri2#f#mrF@Bs_9!sa z)$`f2de#|r)(RD{BWY#^kFC+ircLEd@%YwnvEO`XUGx80wxsc?^X|-o2@f>b!hAPTIEzKmDwxwPb*u)(@l2LdILH z<~r$8JeF)0!^J|W+zOP+#Sq8y@X{-je&H;-AAI3Y4hIU&W;l8b+}EW{+YBF#l2K_e z!nfVZ?1hi+-rb5N;5+MNuak>S(}%8tVz}E~F3{Y9ddSsG0pmNHVJdQjI9WkA`_Y^{ z%I(!=>bDBbj4iOFAK$i#YPmQF$Yd~8Tv^?SeXuc(2upgGFR93-4Xb_lPQ1H*a^%>~ z`VWeJXSlp=KUu#OP|^L~jONm2`Sq%_WOB*jTX({cSlJt{_$q;aXYN^K{Ou&HPEL<++pGXgnC*j2*A+*xRlAgYA&>P`l-UR_Yk`f>iYHO1W!8eYua#BD zxVgoC)q`1(g{(`rX0JP=r|a>pXDMA(vytU4soRx$-o9XSR zy73JwMl>GCs0+6ehYhOy zIIy+sP(7ksPuyjp?XLnYH_CFXOnvFqQu&VN5xF_ae#v`6QnF!SwMLEs;!N*9Lk-bh zw={O8gzfB(HITj)SX-W!hevi*z(cD4$N_WROe&r9kNQvZC+D)NT`_3=V4U=-wiTRK zITC9Xnx7l;jeytMKKHtO-I{m9p6lIr-aJT$mcCF-W6e1DzEkiRZClPHNj94&Zbg@lT9)_Lv;JmHx35dpQ~n>Ey4QX$`+4uyhD3jI=5cLA zaB%0raLXjps2T-9CY06h9eliM=@bIVa@{d{l>1&F_8TgjDe|Q7?2*b8`5#oCuF>3sHn;qV!X-@>j2T!_T?^GaLyx)FjwM?0j;kd!N#IM1y1?2CO5kIP z#MRK5?7G3EHC{Q{tG-3`WViCG+g5kIQfizpMJD!@YHJieOov>C?OL+7774je;%-i= zJoSAM!4LAWL~y>fL3quTIl-Da7*Fi1S@#U-&CSRIuS)38yx6raEtX@8=Ywf5R|%TZ ztnHZi{nI;Mu_1VWRj2ZUlg)2cI-V`xgHkTuRGw$w4uCOd-s3k@x8(O-pDH^$g5o)w z4+cFeF%u6DmCbR#OOS1i^{unWx@jqTplh1sx*wcg{!wG2Kk+H(h&$QOaAo=T3wL&M z{=hZx`?z$)Hw3%lr6dyk(Bgbb%lyK%`0}2O_Q|?wN7R$u|MQd2)V=uYzyJBAJDxT3 z>&M2SzLvk!6ro#&IdAm^H(B;SIXsMQ9sZ*8&~dc>^VRdpdVIuzujmu6{QnMplG)~u zl-$Fu@}@ud>Hh8>=k%bOE*MAo^sqJI*0)Y;x#MDF=HOwtS{$^eqF$CFYR#Z@rIw?e z%6y_f+UkGcKQC(9(eP@1BkY`p4tE_kO=Vad#+ttLM*LQI_lNz^bX%{@C2~IMwr*T& zCYp`3vC`~@OWwp?SH4;6*Q$xeTH9gi(@#cY&XENFkj4#}c&%c+luGgzCX(c1Y9oPGo*0ccbLRlH3VKb=-mcM_hSG4ikxVYh zqg(r_(SeYewuWlW$Gf$3uNU(hN`+#!=^bQZ`Bv#daG3{{PM}g6gi52)BoYo9n^kM0 z=B{O>ZIN#9xY6jrx^pf2y9W3w1k}n@H?ls;yC#SGSy4Tl2NaW~wb40m9jLYpZEe?; zukS|I{f+%{q0<1A>2|Cns-|k|&7sJ1D=GJn-GheD;mv9a%n*G&m0)*z50?UyNYY@A;PYK$x7mbe~PvmCW3y zkL}cZVW;=tmlBrk$bCyuY#ctX^_8Yk1f6cV9UuF)bhW(<@R#-N9{7Zw)>DP7w0t$& zzFU`tuXa7d`1!P-ez4QENIPrkd+lsV6>m>_{pft8*+u)_mAgyfqwXDDDx*jY1M7pV zq@I3vFV{&tbo3wI_n+zR7uTwL!|qPS%V;R4E{YSAChs;sdb<3HMPE+5{@5$saVMLx z>J$3782v57ajBd%R@`Zo+s}$-)3#HA)#2L6Z|d(9T)jx7VGH)i_{l@<-8qD)S$_~VX!A?+AAZ9Tag z7Hq~pzVG0%qi9)uH{p1v-z%(bZo~)9<0lz?Kxy|Ty`ff$j5`LZ6XX4~ttQQz663Z0 zTjTb6CsO>Qn>SuenIix8dHYXm(-v=9+m)-+K~dN27KcW)^|ve*L(lBX&DTu#pL&*z zn%aZ-`u@mP>zA6nutiuh(z^@Ts&?P`%xBO13^#r@HJ{aTt4sEj<@0{9I120Uv=fKf zXsgy8td+8x34=G60nGSHygAs)gqyiSuCP_|xsrWXFuJMQ;&;{#o5}S0x{T#~sf2rq zrigQS9FO*HH9ft7bTo`iR>sV}zDa7lhwHHH9WEH%iCnTBPK68N4Z%{maEGfL#rD++CRU$_F_L%1Q=iEIK0{OHV$m< zsiYxW|L%+2@sIXSM(dV+=ThfksnuB?_&sCiu}NdwER`eK9h>2YdbgGcSO>vpt(7~G 
zd^Q>Q;AU<2PWFRFM(P=RmnGZF$!U|}eH`-!?;GQ;ce*uLNnal42f3ThwTfCdJgZD? zhm!4lINVqx-pwSp+@o`ws&P`f(_UHSZ50Mi)5z>U2KYt2zq@x3crmJ4OM_w8p|hH+ zyKCRH{%xgJ-;K9y$?czp|VEBky^`eip!j699!Y9;bOmgsHGjWVfPb8q#nDAihX zh8B6DXdxyQObXF6mu*@IH%_enoexUe+bcu;Lz~{Z7D>d|UubKSmU_lH$=E8>#$g`}AyOam$|RZ%;Q(183bS(HHXG@&55Hf9QxBZ?9-0soRrMJAcyO^9(lY z4N*H=_ZQ|Wp&otms4x2M_|1C-&B%Gb^d(ho>OvjsB`MDlO5MsHPOa&OzHb2IZ2E(E zH_|N7Cd(DSi>kfnd=N~;2NnDJ^P6L7qM9SyYu_*&34DDL}fR(jMp=+XCJ)Q zj@{Z;`V&o8ZGY?dOx|l8){EWlcZVf~_^EtjC1JZ8S-vMN+U{n<;aEZb_EKu|GGc?} z2TwqO+$) zz3vG++q|)Ia!WDYI!hdzP7CbQWV_h`{Bl|xFxc+Up500ZZz|%-9af}n-}fx}(8P8i z@Yx;DJ+UU$xy|Yp*P|(OxfqiRLd8gDSEMuekYM&5_k$#J!&T!t#xaiGkvy7s?*)oI zcz4p&Cr(pqD36Ew+X;*x$u3X&&CayRY`S{G^@j(}#G+!W;xM?Hj+;OAJpGK>n14x^ zxJJEGeG#ip+o_FMvp1XlUbHjDB4wweRatXaiYpC4(z=)bY^-GUR)iN^=;2Nj0E z!a6t{OD*5c|N74rwVPDwW7kCIYi$0|c>8(av~79Ezp|kvmfvW%vWd-Hqtta3RJ%rH z%$V$tT2k*xFzeXJq&>Cq+x1%;*{&gy)Oho~L@y1d)rY}P(t33%;3@2_R;^KII(Nrb z%H~a>L~X}fyypv>?r%(;9bG4s5_-8Kz)kswRX)tW_dSIdN2eTd{JrX0ke}UHf4pg` zR5D>hE*!5Hd(FmXDO1QaYq0`=*CUk*0Kp5ca;jPh9ax%hqO&ckQH_efF2NqMkf@U1eHV_U@Ek z^tT$#;dGpNFYgIhmu=uf?Ji$;B~p6DZJI=$=6ff~=H2+FtFsxM#8<%@WOJ{)r7<)b z-c9X~)~$ALE&Xts4Rm$dVs%e*vv3;Ew${=I+0vg&dVylA{84h8s_Qz*w^)Gp?p-=Q zTo1*!Tg7HC&O3W3-N>meslvltLyvn44{~`|ey~G&n&A|(W7H0 zCXZxF&rKUQRj9ox81#1>Ay+WH^E}hO`Gep_x)@aGJWmTxP27EX$ef2b6=27~wpR);>Iz|8XOG)@{@#?R(w&n%jQ3FR2{uQH^l@RJNB> z58K+tcyH}gl`#)C5`%3UHif9ly_}++&CFFO{?C6XRE1JoHfpT zcu4|O=hA_S=tlcN^&|bKN%zBIaYv`#IIB5NBjTp_G`o4XzAhMh{MBmH5Q=3R!g4S_ zDEY$Mi@D{~oVBnO2sZ|;P#Ly-Im+vW6j(xO3P;d#s*z+qo=liC*+ijUSqTlJ@n)p* z{L_7XEAfenIW*k}tc`UJaf;r3VvP0gyF*3iBy67SmhB~^bm}gJn?>=+>>Uh^drwWZ zwWCqtBk^ddaBx=a%?&hzM9}5RO`o|E&Pusj=v-}_uT1ayQ}>^D8Hsc0zVRD{r(rvI zGPpk7k4BH10q>-czUOns@>a!_4nGA(v*}n96P~Q z$PP9d`QYTaOKS|@D)e?i2g}v${@fee^ftaHo@JG}@SWsCR%-jS70&mPCf$1xZ_e#57V@^fb)~%XlX#HK2aewnR;N4e!&uOq z2(D$SeS6DNwn|K1@B9a%nm@$MNshB&`A*~gJNj@M z88VGkx0{{4rlVsjriypF4^q9NCFp9l+`;|8Ui-FLCG3ZJsimKv*A(5bCk5nE>mG$Q zbE%=Sm@Mh``a4_J)xIsNwmUUduSt|0wG-w)Ej`Qsq}(ZmI@KApvUHlmkM7o=He&9` zympZD)WSK?US;!zOm>NzN*!na zPv%r4%n6Kg*B3- zL$ze;_8jt9_>fm?B=7HMlTRzV>WxHzKWVRjB<$(N!=^g>vgC7ZMtdgk)s)Ll(cjA+ zZOp4D$E)RzkK9;)eJ}Bi6aTIDz9yh{SM5{7uwrX&hnH=IhBv|)+0)0dj_&A8*E@CH z);TijrrV!TH)CUEUY>lGIex>?3QU?`8q9{lyURO4zQ~dEZ^pL*-NbELs9YgF?u>N5 z8_ZOHYWS|#DO)|Vn*60eshJp6qT^Dr>ID*Yr#j4}JRLBrPoz|*YTI6IeVjf9$or*H?3XYR0gybWy@T$Z${&vS9h`_N;pyA#ni)Cb@jZKjItfrleN@7%2Ew$b)&Dy zGcvzbDe%9gYkt@@HCusrJm@LqIx(ro;A`EKSi#(UxpX@~AJx{v-{2k|TJ80P5ZT;BWiY%?z0_5drX|>`V9!--6;e(ZD+J}X2 z{fYiG@T6MQuKXOF2eD^=R^Ojn`3?R3Xrby8=r(tsccWidHS?2PtG!c;)^cfIKils5 z;457{oXKqEi-mRxT$S3(-ekctXp~~rM#Ym3du!o-J6SAvvRPZaY-=R*p4PaL@`CYV zI@)bT^r@BAa6I1tlY&?|6`Y#x@1mdWgl-kX{>BUMS|&M7wboq&t#X(RwHk?AIVY72 zT{*g?#+KJyktnr*YHTZz%WZR)(dNd+3RjMn?zHKM!VV zBjv%)bNhk#lgMyuyw=*#`@%xU`PHq4dAf2a2$Wx2v@cefbISVyRzQ~yR3f(_FQ9YcpdKh6|fT z=@h|s>_0$UKI}cY;8Xq`{QhT%vjypW1mSA1CA$rAZh{Tmhmeo2L)<>3{|yM^Ksx^h z(*38fhh_lR58=Kmkmvsg!d`;(FKh!N5GMrpok6-SI4*cPE-0T8!v7ND{VmweRlxqu z3D5Wu*y<@E?FsyD!0!cj=O05F=kWgs;_~3wg7m#$#5jWZJvje0DDQo!8zwya`;do^ z;QB8^IbT57{|jN?gtGrBJnL5=t#z=KdjfGj|4ZO&!vDVs_tD@vML2#L;uj$eE?mdp zz8E}{3H9*@5dUqs{sn}86^@S}ZVRp*LAibb_5U1f@IHfl{bP8C7vcW|p7B!%zYgc$ zhq&WEhdP1Ze+}_Jf_$7o**}6hdjswEhKLe+kn68ic(G_g{hZUWNTdPI$(jLHJkUS`E^$LcIS3dCx+; zBPjFl!1=#|yv;)x{~`>o1>l($c;*WTyWpID6~g;){UVLOgFR3|xc&;n{cqsc0mq*~ zoPP=Sj~6T?Hk4TgdHM*_`#<2lei!bO!u{WZIIlwfZbH})()o|@{4%h1*5LmC7V`K_ z2)hPtCJD!vaE%3VF4~It<7ud5veIehZim{1)#tT6Ob)9!J1=GEQ);h016U6CYYQ9<<;hSPk4^E&MQU+|D}IR* zKmzy;_8gDAjIl}V3Ipe(2nO^MI+r9A-xe(Ls0G1md=`6_#TD=o6w$#mn1UN@icQCe zSpmZQ3Xj9T&S#P6JclD7Y0O0uV~}hag7HWSlYp21S)9wnS;PWB1~3jny+YwEU`AY8 
zmQhdk|dyJs}N)%$rFjO~jmM zVN4!N2TrG7Ctkx)4wIy=^O&^n6POKQvP2k*&E|91fVM#y6qiKdxjaljaCkQ`0RBL^ zF`kaOi12VePLT*PD-v-Q1Som?&=+!(QdGxj3IBuQP>gwq4Af;VW)nq6KOZ z#jsh{>>C0x3GK?l0xy~5Wtv3D5Yh-tz|`=}^IVE9Q!LxnD&z*D#1Sq@T}I^?oyoew zP@&+G7MJk(bPk58tXhIXDb6gLD`Z_lfkeb&ns5P|OAt88G?UmjfLbBoa(t2-m*zM$ zjAD~BV!(nh85|Bl@DS#LR4Bk`3>Ju_~UQ+RJ8b$dyO9Y>~#F6s(GN9W~Tmx>LLrI22!qYD+ zfCKe4oQp68R}dw3{W|+9DaLTEr?pp@=ud2m-hu51S%bsEkXn z3^MX%E^w?+u8Vk+HIG28G0Axefm6ghLQv2knM_i|2OGSZT1PQd3}}!mTn>gn z)1@#WR6p_($pc;q=MpfzF7r4n(VJ|5p->ot^EuR_T<}}782d5@Ks?u2(0fqg%~=-3 zLWTe26$%!B*KFZ zILjpFu3|uAnup#@LUR+LJSgO=R~K*rCPKK#EEDJ-hyW!C#KW!bgF+gtBJXZ(dsfvJcJ? z(k=uxj14R{MWISuO6T6V%9c?g8b^Qvb_RGiJUW42S&$F}MZo(I0=oobP$Uc{7>e+z zxp^&cpyaRN3m4cfra;LBLJxs6Ns^0l*d$IvK^_S~vRSy-Fct~~FaWj?bQM7&uVI(D z7$sa3;UxX~JV7l}S832RAa%e|Av^*@plS&WBNrfpiUkDv>;gfj2qR~PK`sz%8A?Os zD2HSTpf6nK1HcR?gSkuf^HZnLc~Ey5l01?Cr}Y8 zb_S z?i^Rbq)?Ll4i2<2kg;IK0pim_?LB{oL!_&8%;PH_dG)re_QCc#zkH4HAx zi~|lyinEm*s7iqo=QAmm0J4fMN?#{=gdE}asmlx+8^?jCc2yw41YDF5LF45zzlF#t zgb5v%%fV40N5oQb@da?o$G(ou&`Erj2R@pv;0po}3UJ^QM^4Nva^~jP8Xlj+5@2+` z6c?b-+$cMtBQOyNJ`$4vf(qq{NrFYP!9A`@wBL$Qk!spE!Z~)+;uVP%fazO~>qxq{)c;fnkghB~~ z2Z9fT70JM)7zRCJ5f^f)8H_8!0h5JdCe|XCrvROh!I9zf3OdE-;P^EV5TJ4h#9Rzi zR7#2fyb%#1pf3TBiX|eUe}J+kxq6|lK}m2LjP)cB#3{}vC>lWtT^yXL$I1=ElP7$g`Z^twd? z;p1#VLU0Ks@Bk6eTtUX*e31MM&_tlTq<{ov57cLp6yeNS5k4mX3BkfC&<}Vt5Y!8Z zNWfPh20l7ZM@W$PYyyO&6u@r;Q2MAvilCr504$Dop#XBY1dqofQ4zUpHxn*Q1j?-d zbW=*eN52D_F#+NTDvT2I5D5j9#1{~BVNwGBgxt zF35!(<}&eh1n6JDn^Z6`BTs|~1w00DUV3S zP>bG zQOMZ_4klMJ^-GdPblE5e_+<<=9Gj(=w45cGe1Rp988N1dp)M&9I-C32Vnzcf7>x$i zV`_{Y(1|JLRm`lL_s`N|ihz#vu#8d{>0}}F8PdmJnB#K)C69jnIwwkKXxa>A;LcNS z2dTd%wN1wecX`3hK*DAf!DcAj4ChO9j#MF)S&ae_TOeSuXj;~5Y@to^OefIDd5mQa zI3ZUHRMJH&DqUjEeSq1RYQNEF;4@W9%!(s&Et4k~GO1;w1x06VsCr$aUDVQb67f7& zF0)`X^P;jLXVW;G8Cqe<>{VDagb7{ZGJ;04fN_Ns$QUAwClPBz6d#f5c#A5~#E}`H zh|3o9L9||{(d7#w8fL^PCSL&EiI4Hj3YaH~Fg_dA$#^)4h$L+2Q3zkAHZA%2KyZ5* zzd^{@i-bl@$q0$kBtaA?S0h^zDij>4R;LIdR$G+LPBQy4&RuOWQ? zJj|J)$8u#{NCo2q;FDx8@>qbXoI!Y(Ap5}61^lXbkzm7Yjw#`BKwgpD>pURa5kTlu z!9N&9$kh~YMvVPB@g@OyMc4ICIOL@XZ6Y%ZP(YXSUgJQ0Vj#b296Z(L?jvlNCSfJ#JYELdFN@wFF;!2AY~ z!8q_RNuijbf&iF@YeY6NPGixKS2zqF#bz^kASX#W0pc2@GbkZ)uAF~Sh4U~I0+m1p z*h-3Eyau|d0_6+f9}X^{lN8W@DWUipsMrh&6lrW87vYN-=u;Gz34bs*C)j)@JQ^Co zEFIbus*<2yHeuE{+7Vvfu{TJAC#*}x1?i}WIyo@>ye zW|Wbp>sdmz%%#Q=qXLM*Os9rrl<`O{PZwHXl1fBwaffE~4Be8F!O_Yi*t`okSbP@n zf%!DOPOeeNmMI-qM`tS)QYG^Ss|c`&1>$bB(co;*0>ca^yTF?L=1j@(Ix}(On~a%- z8N^9%b5m`+a^q#+aUQSz3+GQhowC?Lj-N4m%fHH9q($tE2w)jYaqwhuz>R-*{^rb; z*$Ugh2*eUU&KK*=1kJ;y<%5lA-hEv(qURQ9f5JXw`J%&C0%s*0fK+}Zv$rq^+A;qq z-JL;%tDI~jbu=GClZDbXWOm7GZ?0slW6NfDpMw^Nci$>x402{mCGu` z*qs^1Nxa+PY|*{#8aKbdzVt)fB?p`(S zoRw7=soD9Tadw!l(4ZX`(NbO-m-hYq4!sbtVZ~`|`zpe1v6Io%3fO!)ONFbfY2Xa~ z-4V?!NBs! 
zh4f{aCIW?d^v7L8!NjE&tOIQ#`lpn{nPhOqcBXz+jxeFE^V~%k1UawbI0xowz#~QYoJ$x23sy?L7+pXBA5Fn(9|r5ma~N`= zwc{+9cq6!k#X%4TzQE=Y3s;!bJQwu5c?K6}inu(Iq#*Y&xWS;uBB|LW#;-A;3@A+L zc^GFNR&;@!0l;0M09JcxxDe)Q7zPW~0=|%g5-6-g!8)&;oMkOq*{=cz8Kf?UYvzj- zB)$j>q7;E~C?O`Z0Fic4fGr{-(Bvg-1g5^P5G03-@CAUQUf|NXJP`C0N1}a&KyVJm zgSWzl2_qoTLC(WE41+>=0LLZf5YTa9NeNa4zQ>koNZ0&g*bG2E6H{dZZHNJASzIq; zTyrwM@&6I^Ucs_s*_BvRqtJxrLsASiK@B--8a2sovRzeIb=C9ln!I__dvB2*Y2bl` zPo($W+|29UyU*7v@7l20&88Pc&CrN3AyFa4=!f5o(ENznZ}g!L1dsJ!Qu>Dc5 zzuJAW#eCA&xAc}fZ|-Q^{jI}ycEPrI$DG+-@2Hxtv-56U?b;^)9sY0sK5j73h^V{2 zDIFPhcA2md9#KaRN~A`M`-^Ygdi>7Ub&s|vbEVb%lkRMB*|^W!cgCRb*y6W!=ilq< z^h4;cWNxyijQSe)@N4(Kad@S_RBE2ljr8@ zI}dc`!Opxg{)d*(^R&#`TetYmdF9);4Nclxx&QFfmb%zI-oihq8#;O;jPx3{^YHxL zukP*bZy9|%#}8#`udTl`>2$6Af64v+|57v<8KkU#@Nsbsp3p+n7?kM?D9HewvQ3akJ2gWp{r z>|QIfz41hR@eT9swpscX{>Uttq~+|E@x1)l`YXfxO0RPJzRo;;aHvlHx2mYR6|3~d z_sWga`q%I5M|`f-{reb_PFrzH0!PeY=*@L{B%pscqyIKsj)xG-MsyHRrR};1lCY+4 zCxX?oP4DxCOyFA;w&(yqGN7(KXF3{7L^4L^&Aa8Ddv|eN*cQ)58mVY0=6>im8sm|O zD-kt>Bz?lTqncBOSR|B4>h;>*MC6ZU^#^$o72;cpE?5zjx0NwE+6-FahQWa3$oR7RzpE z_cy@oTxW3tpysl}b8a*w0HSdDe|I|!**=7-Ixj?Q7I!$f{h#__hcgt4`K*>5!fs}*PHjCby_nrEE^G?XPZ_sTSLIzXF6+pM0K<9kt?w-SG1C-EeeC;#t^_idp z_nP z$lpzG+qQRfMi>_EI(PTM!Rjr?-oDuZgKWs|vTQlM+g_vI1nDdc8#{&v+h*v>^iF_e z!SC7vn06SVR^X!GFmA}mY+K&J} z0BN?Vv}*-41CXg(Fj`q4vV|b{A>{k^eFFr-faHS_J4D-IlMmdsEyk?b>v97E>@wX3 zd^}(W#!_>V{+_^4n<&1gpV9%dsDJ zAut3QJ>YC_u-H(@S3^awFJO#;xjcaDF~s!HoP^gx_51C-FTKU=g!>^w5`5gvJHE8( zA-VlcFmB)0>2{*qF~?TKVuUKeIOvbatDEfx< z#={5OclE?h)l?{krwI>5r0&J7`Y4i3#Ii>H>Mm;l>My55LV;X-e=i)^f8*gUujq?e z+?el%Ro(tWhb{gKp5Aw(rg$c^yDvu(%Pkm7^zj|N$m@~8c zePc2jO6~2i@d&z2??&1lBVx4a%zN>qc{{%Qb?g4l_f1>-A$@dzFK#v_-p<)l_YtlC zz-$U6L#Dl<53y{U{1DerMq4fuW03Usgzs#dwjb_dyS%x8BHfICj+C~77K1;MPkOS} zUF~+psQVpLW-sXmGp~JJ(yzPoaEHTp9r2VSHE@--Amy_s|LbV=ow&i9M|JyB60y7s zsLCD!5t6l{3x~7CfG4Pb{X4dzPFDiV;R9~FPk$?D_J++#izjU}M2!HiJ^--r9%B32 z!`*w9*M!r)QLm2%of)|4VEdZKZ!-gkZ{Ej^5lBM;RMjCCLm&{0rX#z-9Yfw`-;dl) zT7Yds5L+bf-w#;zuWutrkeqpA(6alM)4!i|Ia5J{0T9$(t2vl76oR3Lwyax%z$xws z{*JG7D`br&;+}oPlLvg#7~YKlEov&50d9PaV3=P+S0)FbG_J4PLSB2)d@F2$2hamh zfN|?J&1BaDaJY+@e%-XQ1^4^58Bq6K1iXwgfFOH)nQ*`cq$BWe#2#=gc9+iheSpAs zo!%J4XZwKWL*N$M^6s1Mj$3|9Bp5Y@bU;np(!e8n6Lv_%_e^0E0vs)nGy#Adyl!vA zi~td}?Q|4*zXw>Y&8&y6 z5&ExvooV;3*}V%qwaH_*@7{5k-n7Q7j)?KwJ0VEf0XxWq?f_ii^*;-S8C`%betsRs~4_d1#Ir(P!?exQgHo!w?OS@q_(AQP}T0(ZTZ*TgiW2@c~{94ZZ5 zyO4jFZKj|Z*8ALMU=Ct1GzGqC-nID+P9GAmnvLRMeENKFuDUfjJJT3sJTsCm?HUYZrFe5Q1Fe`3T-f!G> znWNrV$nFeSyasnDs*A#i8S=tvpUdiYSW<2PhVR178nu`LLA?#`PnX?h$yyRYtLaw2 z88yKe;_~dI{gATw-7tCt9BIJYz4p6a0ImUs0jiUOTgv(xW_KAqa7|3MAg~ICy$~cE zJ3#2Cybm2FfZXov`(1{>_B*?_9pGJ!dp?M2AtZ>v_-OEWqi=lM^%}-@nd3%aINc$W z`5pW<4`&Th#9TM#8%d|Dnrl0h|x&608Bs@4ki#VblPoFbxm*9Vi_z z8Sg`hg83n!Z$X3Z9h=Q!-t{>wImezGurG(s;C1P}yT4(A8=w#< z;QQ{Ia1Vl8pdNxE=n+DW{dX7_9M2I>?%xaV=`+@QCW{#{#B95_{N_CT94`#p;JFOf z9UgC&)%;Bu{v&Ye9)krQv)fjG0HU`53~l~BBOtH44*ypJhC9G30Q_YI>f|+D;C6PaAKWA1eb;@6-ppoW5YEFD3$vd6eU~Na_4|{%QK*CQ z8-x2{0C)XR;Q+e<7Qhj6K@9jWtS(z%-w+FK0d~0O2c9Epiv;ca`lvN&GVUat-d_s= zzHA8_f#mbsBX*Bj59pxNZnygkFbK!sAqHLuE-^eScH392A&)Jff79o+7_1qi2X(z0 zH9NeHyIz~y9I<}mwq@__-EE!m9%RaCyDyP81IgiXlpwBvyVh#N_e~F>uwr-r+d#qr zTMV?Fb=&Gp1Ap$l6Ep1X1#a)}IUoe~>j4CahMiWkD++Ok_p64yDQeqyc=z@JFigV} znXsgtPSb<14YzK6H)J*clGkJ~TcW`87$fE+Q13?TJrB^ZelGxpp{&Cjg!=%JYkdMp z{Qpb&|LZS5fqKLL3bgtD4(chBF#SiUV+%l?VF7B-{u$J<{X1BG6YBbEFmJ>3C2ap% zH?RTQq44<^Vbb99YaPX}LjB?lYB}G5`p7ucS{9(zaUVW!!}_1Xx_<@rfGTKsH(~iR z*ycO1k2b8o1MPna?f(bd);;n)7$X*^|Ag6j`R1h9}BeoBg`Yv{u9{#ZD_YaTNbwY z0On_~j)dQL5B3d_vor8tf|HuCcZJ)rh2hQR5;WGtwf{$SOJJ{y6f1(GzzYY8R 
[base85-encoded GIT binary patch payload omitted: this span carries the raw binary data for files added by this patch and is not human-readable; no further information is recoverable from it.]
zJ`Oj|HPr5qJA@PxtF%n-iALF2*H?78>1=Z7f-#yd`>ZteOvj(rr()EE^jiU|Nk*LsAUay)Tr^rMx z=^HB7Sp7+HO}~M;qmp8T6{eDPB{DYLURQWP3qo#+6pK@Su-JF55?FE^R3+gCS!%pk zc)(?tZ%__@P+-X)`lGGLh2c~u5t5vOiPE;YU8@k7unAnpb`!^74VYE~1u(Q|TI0-+xIcN3= zMa>nFGV>sB-gJFoi0S2{_DF>&g&FHf%*G)~gNpJq~=>P(yvCJi#7Xf8q-%M20eSuM*B z_Jr5N+ZahBcrMmiF8jwpmnI(McqM|ud7k{2D{v>5&9u^-ea3BTg;kWKU5uD%8LS{R z%Oaca`sx?qY^tX0vP+$s+A`d=%UYk~Y@q%tk>j19rnRhaQI@mCGT3)BpE@ynbYU`+ zv4?d*{mHg|%7LjfWIC59IKq_NC2m<@N$aWyr!oYQhFTnOrUGIW;i6&SgMO%eA($X z$jf$_>S8lApPP?a;B%8bQL)w5K@AH!@U>)Ng;d*Nf7RD?d-A?o&Us3Md1Ha$xTTZQVm(L4 z!^4RVot$+RK*K|N1umKwv%E}tg2b4V%(pmuyljtHIbBD6I(!B_Y+@to5x3e_v&ucu zWF{G&Cq^X{vmZ?m7FT9wSfsH~LpCgHamS$JOnXls<`Xe6u5l`DntL*iQj_6ZqG@V$ zF{Ww?B{G>VTCm{8lWB{wgSg$CJHR*fs&kf7w~U|l5R>7waeXP~_pIv2JI-`&b@mz` zLwaZ9)|b{6iWjt~+iKgEhgmmaCgqZqk(x&{!SO_`sjbkJN@OHY zdP#StrVRR#2c0$^w&c5ym820uV+&VWRNU*KEG_w^Mrh-ZbUKqc$foUfCep?NVl=8f z=8vPHb#9b%snL{6$s!Z>b(7O`d>#=G2Vxc#Z;M9cR(FFVH(M=UQmeRPYkFrOeZdz@ zbtlJjkwQ1%lBEw-8*;IaM_59P8+8Y4l(Q5J%$iCw+FxaXyHV$$B=4xz1EZLyko6mP`C{rO|Wppnax{ zDBFI!W2IrUa}8!nJJ-O;a$r7??tRR|1!j}a+N|vr=cVx}B<+_5C3M&D6c|)?9}5%M z*A3D0@k%d*xcvECT8ke5{`dTe(8_tx6nQXP%M6D+Wc1T!cKo^e*u+x#IM4K|Bx$ec zyyFSQdUdGh8OQWsE6j5xTr|yBqBcK5p6q@DV{I9<5npo!t=02(S7xIcG+Va8X30KlDCJ#&n{O~Sr=1AFt0f&($9%@|oOSVXFo+}dl)G&;mc230 zd?|lhe-W0QJgBuAd47pCavSsUs51g~h<&V4(q>(Sj*9Tgnt@_sy$}?d>aF0@88gAh z$wa#tC{PO1LZ&1uS2Ji@HwqT}j$tu-5t%%+2cYTQl`ip#a&a+nA0JL%D;@qDo0apfv> z5iG*0qAgK+xHtZp?5JV#R#44q97%P8n|JxNE!4^`#du#T3r6I}(@C3484g2 z%bNq^qzh@gISsby=FbMo+>$>(VIXB>?Tsg zbi$p+T`O&WW8RgN)=Eapq8IMQmA^GhIu0=r11NKh^YL?$D7{~nz^0~sJ$Jq4-oPl^ z^(x8~JaAILPkNOLHg9sR>d7eI?Xnu7yL2`Hlp<*inb8m z&9B9BaazU=H_UlZX0w(oKhX#_g#J*`)eMHw21B~DBNZ#=?YS2GE;mmvOT)4#Vbt zy2#|Z%TJ^5G>q=Fo%xkn$F|3NLNKRSmI8Sqj^>E?Knyxyp zKi|aF*HiP+B;}|KjRZD47i~pVOSMRaoap(m`s}LJpd+J{#~b2HPEN+78}!wjN4ZcM zcW`IP%DOP+ePz`@X~BFc^^&BMuP+3%s;e?iaml_G;NzB+ZnQa4W9W*A)g&CBiqta8 zYq^xEeerC{$tR~}#BHayziy%Jmq04CNNb!cha7HpvvWq zjeLdckPvfYE19`-c1BLEMiaajRGr}eb=GzGWYNNIOk0IYDT83zn}%lCao|8~ z66ZU$5^X?{K<#dndlA)lR0%d*YnL|(tHWB|^fmDW+&dS>!Nhqp?kr*XSTdSZWVy(Q zeW`4p?0VwKs4F{f(X^s&GrzU&liCiUH=eQ>;ZEv|i^7#Mu2IyORuZO@a&OWJavrZJ zQVIP8vSsNQJXBI6zT*ffQf$%~=qOSgifS`UV79qQ!KpjCZ8w)Oo{edDoC7FQ>r75bk!n(jy)uPJd?Q z;v#nOXvm3kmV&q$Qe2X1!$(Y$EN(2#yQS&wxJ9A2RjL7}JcGz5kb*wBRNTsOX81%B zki|>>Rk+xyrUTb(laUx-ZLvL0Vn4AM%?3OvIJtn2JC~=z8Hx)~O40fB-3DwCzmWcX zbG29SCuGs24@`tbUDhaE!d?f9yuJ)ZUVdsdPf)(O@fJeXWmOs-Po zMfWig&tggK>d?Urde$TA&6urEC*7Jjtn!-STSu^ z?zfLQRGxWPb>XQXSEYmjlm-2p$$7aW>Fek+E|Z2bMrZx2smUxA^E1+r`HA_3ntjuo zPMC$2RH}i6%$Luxrn%2cNw9d3uqSSZ!q8+pp{Oc0PuZ4R_72hE92N zp3vBmA%reEX{PHi5JZ@Ax^XTacE=HMLbFns$Bx_#r4IWi=cr({16OLi8v6|INUPb) zw$P;qFEi~Jxioj2EN*&biMDrYuF{Ac9S4=1>VzebR1l=Olu~FN=0a9Zw(HNiF-=5w z3LTv}JynW5al_)Mkk@K%ho6j;CGNa_IxunGpSBX5cdb@CS~^)%cS{;4p4xO}_>^vi z7;6_o`P95qAE*?-ZcQ3dubZ?zb+|?-I3XT@ZVGw?Hhk=HjPGv}6Xwx@|h^#y!B zg1e`6SPMSis5!eg|0PA1D5sD)t?!&NkhM(kLc18pG^giNZZG}(3*~xF?u-(NPWIAq zaTbNIesT%fiw@i1oUJyV&L3rp7$Od^VYcfFwldekblJGCv#y6aQ^5dcPa9-qtc8bT z1fOS&29qHw2^mwlv}jS)JaeSLG?mD)#f7p<%i4Siv{EPiL!q4a=B8WfmL|*gBLz_8 zpuop`7S@VbEMeP5a~rz1m=UmY-cWVtEB#dldMTJy0B)Z23rVCLg8QJ zXF)?y4RqGfM=eCsmU7LQC0TDY!ruI7OyhSZbZ{5-+kWh5)Xb*lrju-P$U-IEXd))I z@V1U|k#`po#}8?u1}_@UUbN=7$_2}2$Q$Se;eG_mR43hYWVjl$m#cF$a=Og8LiS8q zgyFWx(J2~thXYGRRbFLwBg&@No~SM;SsQDx&=#4ZaW3MG+azM66BVkNCNvW`s*thK zc^l$jBHnpc9rVCPt4}xEx)DZn^>93=S@34631iYP+wvkFbt%@%&}T9eEtO5>3sG6N zj4+c#Esj&8oAwH~>PC#YXlvGa${47+F?<-VXXi3GMjwiAXqjXU^{7Oa@)|czxNOG= zlEls5>|Thw5*;{48?`j&9yO%kR+`z6Z47gcu%qNC!VLrLL33}pY~Lgjm&ht*lhQ#; z%4k_wrLWxqRdc2E0h3qaD;MV&unAa^r|T{}sDz8;$yugaIw($7IgK{dT$!u+SIWkQ 
zK59QJPgIQVP>B(TV4q|;Y%<1D&%}LW8>_@ks?k-780j3PnjXS(jn^cpYrhJLe)fR=%>X zb)@vw1gSnsy4`Slv0ctc7kU3QSqsrk7?u;~tqA4T7Gbz-#afK+Gb5jK$RQsVZ{gbyx0EUELZ{ z=KR*H*UdrWW1z^lbGa2B3E8U#Ivy%v^fYqCjMtuXM*Y_&I=nfKYm_E}H1p?>t zr$*z;8r7-ht5)5nhrrcv7vz8|<6SWhPsU9T!@;Evm!juWRs=g*>v3iZZYYL~S7w7m z&>Z(rx4yd0ob{tKU=(?oRv8%76yY#!XUNT%*65zLRE#w-%4h4v=Dlz`z+b5ZO1eq2 zGfmHM#;eix^spsQUM3~RtYUEpi#UQGdo(z+%l)LDX z70)+`1ufiN)h1pXwXmYB-Id3ffI2Q8#pKI|bLaKcQp$P_4>8W-lb3=QW)_v8SI!qG z`R-PD&ahQ!#7mYMiH$xtYnoe2y9!GVkE5XopZVsB;-VyJjOi7DindZS@SEaDku4L{g`T9o9^^U!aE->+xkUXzit)$Kz2% zO^%jq+{NU}b}ErIdR9pyQ8qD2Bvp7S=?Z1X9ceq9dWnr+v2?NPQ)idf{n1M+vQ>*Z z83#R8)8=IoPAZedz*>GOcGP8NjVG1ZH<~%mty(61p~Z}?YArciRT+!8XW_twl}$0{ zRifqYlRl67bW6gFNzw_$F|P-dQ!7VU*XnpJIZV!F7s@@hW*$3v4MseQNV{l9h>oAF z(c$2$1}c@|yj#X_-5kz6Yk+I5bBz90&8`~}yWvs+f$03Pm~qjntgyEq)9MjpU|(lt zXu&j<2rY-guI6XbXl&SBi_3;8Yt&`Rt8)w^=~ABH7A>bwg?+Ou&PqY0wM#B<8+>_C zJCz7ZFeZoVs=6tHWLjX(IGsKdFolv?TSBk!8)E*b8|@GoIJ9EYUp6_Avg`7cBkffp zOsbH*+s)g~nz_p2@CGYKavx?CshhP^paKB2w7Ej#+Eq!fPEZ=P85D2W&FxYOc|oZg zj_j|np(^a*qxe~aY=g`DBhG@E^6&*()rw5WG*JRM*z>?CFHr4fErq*Hv(0=eJ6<%( zu)4(;$Ht$9gS)vRs4A3A9zY^4Z;~QC%B-f?Iq+Nr4c+ihM`GSBXJMtdf=1C3mJNlY zG;DZIF&`iTW|NJy+x5QE=|XPOM|e##O~S2fi{m3?b;(*zDRjv>rbl(^mipLw8LwjY zuLM&0@z02i;YISasuTGgeYs`-KgVM4C4On|srZ7n-!{${j9 znLj>}?nM!8>fBi zh6rVQG4gV(XvK&k6A9S9E{((S4~ua@H|6sf<8Fpk&qvBp>wI|4=y&m?6m-!bW$bJ) zsP#&o6SNa6Opw-@uT|Gobt!bPDK_nOVA-F>;8}=Jw#kOlal!%QdUJ~MSf$-{LB?psI zUDv^#J+Rdk*F~>Kj{9_@8H<6ngbFw(!GcoUJA(Q$1l=qV*|j*_$J7?{C2~KWm1sP^ zdY146Y7~eXE@+Dy_B&9;;4~0Rk(`kG&CGw@~nAGcv7S$j$4uoUhkLI7@EC& z^Oi^=4Z!)f>98j!IRj1ALy(Xu{L4xu8j_=J$@aYj1r*E(Y6Tj~tGFMF#|$lJ}B zE_3yY!tjCPuf6uO0&wr@|M{u=DF=SafuC~VryTex2Y$+dpK{=*9Qgke2R=4_Z!P!^ zfj~d>gllm>xA^HPG|)6AG*_NUsgff%u2#T?u<6|hTvw)!l*n%D^kedM?f zUvCU8AUKW_3;y2OuQHtj*eD;FVh?y7{t_K-2M?>chH(Rk5PbQi;5Ovm!I8K?G$+>G15XuVLTi=!*%naqT9*b}Kvcu1@$ zPP!F=_Q7-7*%i+_nT|RC>b&T}lVYP)d5YAUwpJhiS~n;JH4PRtWH=kW@nrCGj*jiu)EI=I- zpTln6AdZj+>_uY(`!qLy+qXIqZm5Eqi7U%dYMpoF4vj7a;#X8-mog4!p%SF*dI#%y z=G^8DDSuhlxoF-RcakQJM1LjMlordzvm(uJVQ$lBK04iY|8=hL>8UTUpLf=hk}LEi z);T&sWqrEUi4&^Mhn7qqONHT#dN#>OQSB=Z(ey~rk~O*%S3gE#N7NpCI8N=?`;x`E zHFqyXqNf?#tGIJtjd!;()>MZ2qxlTI;<6SN+Z2$CeJ}In#&8XU;b~vIl-A zF!#J1L&-yW57k(lr#$|rgX;M&>Sxw8GgaJR$9{u948|9J?d01oEPed9rTeC1V3U~* zh=b0t(Gy|Es3msL`@#}2Y5LCCpBSGiYg|knA-cG{0$qr+I@%kEE=ZoU-KTy-A# zZwnb_juTh+e_qotcZUuu3iVjx&th!05V&YD&CT9=aY$m7BRu2UFo*bsorP3YBM`^-1m7z(u4|OU-3wW$syFz;`fey?AixL#4!W(R%Ue z#8~vmrz)@{h$wERNYb6L6EZ$6wK?jc0tmqPk_Q@T_{{R6ThN z{eDawwLdCcnLM|YtuMC8V*2>RSDrQdwm4g|7z@=-JNCUOdwTUm{7mp^+x0KP_wwIQ z&zxww&wMc$DWO*uWCnJ_OOBeXU-D@X&9vSmOf2NTp#kkEvSLRk7xy}cE^lHdU$xj5 z^+z3Z=h#5i-Z`21tXiF6|A*XUIVHqDP0M-H{>{Y?GZzw!kUZznCRbDP$cZP2)zFvX zL0d+zMpeCZlC!q^CTg1^oC>Sx$!e=q{G>q zUB9__@>^4lui{8HIIg$kT4Dc_h2`dDf8MxM$~^leOXx~69GN$I7GkKuwjkMO3P0OM zgHKe;S?!U;ObYMB7v>4R@~Ud%+Q=&kG;mYrQG_Z7`8R5tm zB5`g?1~)aVpC^;ZbFq$;TQqV#7yaKOD><|M_iw#+Wb}W2JhxTR=a`Z|*6|cli)TXN zGhXC+;)xl4q4@aqPvEZ#>+zD}6Tg)ZlR3_MH04UW9t(f|v%=TM!gs<1F!ax_y>|2G z1cG-y{@QDw6211?{qWoS!S~~G>jJ^s;JNL86#^dt|2?xnV6q4VE{Q4s(mf^R_Dk9-u;JSGt6#smW4e*$_r zfxwZ4`2Qjhd>d$eD+6>sD-irFJntJ72xNJPdmL~=J#&zjc?81y0XxhEe^7+-!2j%| zK%fGAfr3D=1;1}Wo*(RkdL0x9Mt%b@&I$zkN>CnbD3qRn`U7^sbKtA+dj)9x40uT2 zEeizqe+Y0w+D+h%3F1o%NHYyIJ}3~ZsiB@K!0`-}t%S67$X62x{uOYJJTDO3f%NZ$ z-?x4Z@<91R4X6*Kw*dwQ(yjqdQ-J0EXQ8f8)~hWj7yJnDuU`SWS}6ZPkO9cyatGwn27e6b9R*%O_^XgE1bH3;EZ^w@4fwYsAA)+ifaYfa^9O)0koNN9P_7j4{u1zo5C~#! 
zkk>DR?1A=QLE8J?2Xw&y1<27sLi_qvz%v6hAZ-`qxcOo5P}f)A3u&P)CJ&_fMQ8`W z`>#U!??Jc*^nL(s=T!=50uM?LKpBwE12TUZ+T@`FP}X}uzDA($1%CVj z@Vx352%Z5RUp@u&A#LF2A^-c~e=*3C2i|Z3!Mjg_TmaAa0`MEi{v+@d{8x~75Xyk| zvId=a4bq0*1w8yUctXAU{pI^%;4%0=f$+bAoN6JE4V3>fAdf$Ihd?k9 zf!{^IC*b2Di2Kg3gO5R39|8LCy9=-%g8$zQX>E}2@^PS@fVT0oKpU_=69-u?fn25_ z4Lm;u&sU-S90GoppiO*4515947PO-^;P0nC379}O$0ShB&jEiyZnX5Z*Pv!U1L7Hs zKR{XlMGJt0;BDYR90CxjKn4C8@ESO0Y{egb^s$E#yT)wOE6rd819RzhO069w9)9CL zjosz4=~1w5_3)J5=6(vY3LcWZ_S!@6ydONU`wbeI96i1v7$1S(`gq^OsMhJ=|D`?X0xJiuPv_Pn;r;#R2iSiY&`s3d zd$;mB-M9C7^I?eWV;%wjcJO`L{b#>o+w`8;ohJ^v8FeXbW>l}n9v*P%s2#;!z=vmF zd+mSksN08tMxX!C#NVhJbVmVnJ3$xxW8k-GJ-RzBo!;Rxy)KFYr3p1KU`hVkYkl54 z0RBGkeeC_Z2EcE-{{O%Gci$ZF{-1^?2R!)Qzni+#fV>bj;Qf8~@a@(lwe`1mWk;M*P#`uJeV*yiK2 zZiT)guj%5%(H`@T?M}Fq+9Rl(})PR@2d=H}#e&~-6@9anH@Gx}>3}y^>?HF80 zz1e7T;Yz1ljk+E_p|x@tT-|}~7w=))=3hV0&b|W6m>+{}x3>8D)Tz_%&cAJZQ1qS{C6E`Q2mE{Q zJ&b$xmmTf!>HkPOe7A0QK5~74;QBmy=zmTP`pTuk*9Uz3%iy=}{GS~df8id+{ye)q zPwnI*EVH)xxCh1^GQPL+SOQ#+K)wgS-?yXOA8voIT2upUU;NYmKEU?<&;I3|2IPe- z1OC0|$MA2*c(X_Q+fBxAOn-tA{eO&q2O!@z#&_StxZC=uKS#eC{Jk;pnTKCr9tmFF zgYNb?*O%LFW8BppbpP``x;xwdhaulb!T0IzX#Y>`-nMq*&As+-*j;{hD0kcSF+O|q zYlA)-$eRZ8V~;<6M*?{vY9PON-NU%E{cmG@KP1{Azn=i(oAP_Nad({X`tgSA<9tAM zZP3n^_CGgRzx?t09>4o`$O};eoIn3#aPH`*Pwvjke|PyJxV#5W>l-Lv@2gikvH{MG z`(7LHlSi)KIkN-jJMZD#IUe`r4X+vlneP~n_rSTMufEAffzY&jKPcS8*XQ9wpZvi< z*3!x80X7k6ia7!KAZozF|8x)ATWF7aVB5Prejn)UHlA$b>%-g&<9Byp{JDD=-$Hx* z2{69N$d-9`oO~bPJ=e$ind^;#oWJwI4-VSndp`bacMFggq6WNt$32`o<=lTyxgUJL zZ|{)v9oPPq0Z^}7j-5sZI8zemYIRDAj8_4+&ZhU+o z=a2obU%yL3UWgj-^6mF+^A>^M4Ka_`qKs9`Nz0qW7)>@d(^<5v!H$HuO@Gfw8_3H=v z|8GU_-u?vSgW#YZ{|5s7dbq&%>(Q6rc0F_uo&~?XSF%T^`q4@9E!N=sd*F8f&A#j7 z_{YEdhXMXST8a(cWt3l@dg;$z$3NIl`Zq}Pe(*!!`};`!I`p4A-l>25`gYJXjt(}H zY{Rd;cEbK9fBqH9+KWFbh}+4ZJ@)ec!~A)_WB2~Fy`BW}10I>j=LbCMy!XBVkN)vP z_rLTm$P3W}8UG6e`gPbjZtm<)JJvyOxe7bqn_mE0KkmKn*?+k;=ucp`bvzl}+M_iqN+zI(d(5ume7R z_)otwz^A}IaJK+?A!@+K{~H2*KJLV~E#rP4-qC0G*p~g#uGHTL+lTLt&jV#{yFR|h z|7LlB@7cFKJdpE5_E+yd26-WBfbX9m@Wc4}ed7@LKE56N+XLV3{QC~j*sXu)pv=DO zV=H{wGr+de`@jI(5yj{3d~FA|e}puBY(ntcvfatM9rN}c*mi4zuXhy2^&i8BLOy^q z;NAD{;oI4NKMcS1-(&h~+a3LPk7;BF@9y>A_q%tOYX{2Pc75J;Zx;r<`@3Izy0hr-_O_g{V`1rew*$dOxdkL4MvT(IQ)8LCIC5tU@&Tj)_P>EZAN$V!z0Ko3_8t9uk3G8`*za-n z`%wNPkoN)bJLR8*|F>OV?hpO-2M2xVd*|YZ`v0f@W$Dg$cVPQ#_-!xw_p$Ai{~q6e z{(bpVyW@N--+>B{7a$CH_*c8&+#bV!5#pW$e^35<%+I^Qd~>2cM(>XEhu1q+|LgN$ z;Mrdoj7cK;`Mb|S-ff)!>)*lo&i^;gpJ1+e6X##;finbUZ@a#n<)YWujQ{YCCk8n0 zd+hMtKiYxwyO5?Y_r8p``_nH$_CitHO|8DR*=IfuhNj}8vF6SR!FMjw3nnAyb zWqx~r@mBM%?|cmM0{H>PzktAA{ife9cdl!Hf^qW=L4ONq_2;$w!M`8;wu~4E_g$Zl ztMj!1#<@p7GLZ4jZ@=&E7k6O%b4b(2xO3dup1b?=kG;Jg9d~T(?)fUW#<=LY_w9~zn}0xJ zz(dKGOaq+1eBf^eINuxm!#nSSybwLW`KJ)*P<*ATCN@VUGPT^iE%U7zlSqn8GBXOF%9jUxP= z2k)-DPIs_>@kfxR&x3yZ-noC#|4w|5-X8lGKgyG%uOEjIJR*DTwRZ*HtlJ+#S^c_A zfZwhg9OD{{Lp6BrJRh;g2b&$8=*{+d8EEu*^8on!z;BoR!~1=oDZXdGgR{NbfcH@o z_y5#(1>jK?S9gU75hEf3A|j?SRn$}wQ%pj{fGMRGHPw_-OCf4RM1%+tK~oGgB2tKm zh=>spBO+P^3=}OSs|G|wL>iSMB2q+4k)onfo09*WJ2Tl~cV>4Jzkj~(vh&`|z4yFx z@16Jdy|*E_)WWwLy5aatez3-^>vIphNG`aN=TCO$1zrJ@#`s+SeU4yTwZBTRojC2S zV1MXgNb=%ibcoGf$4y==#Qt=M$qUcP&1eY!_5K2u&5 z+;3srg*n7%*O$Ra`}TV}o@@C?j8=hFOmXx0Ti9kJ#`)r3>ntY(Cd$(cz*~V z4 zB=#|1m|xa)tLM7Gxf};_r@Hxz=ejTpx^OM{-bW`q&L$7ZpM72E0T}g*$^j3~$*^r2 zseb{qFVdA=qRiHrGAl zNzlWPa9D>9v6Vnr%jAuE6s{vfAbK6~T=RG-jQ93RAb*yKX=-m7W#r%?XB4Ikt;9}=Fwo9kJ@HVfNE%po@W zI-mKQ4M~J#Ky)t0oyykbSD>ui&0iVpvJs=}p4@ooVMzFU107;yU6^_>`OF$gf@q9C zHP_Ge(0B0@et+HdzTa1t?+vg`->hrn`&9J#9fRE2-*Y_|epc`#d8N`T9(40!?*EdP z3CFdOyjneWlH}FHs~!%PLk~m3%X)N(jj=M=%(ZhVPF`nX|L~h{D86&-&BJcLb;7Vw 
zH%z=a^edO~J##UhI?Ij2e(uw)|IY-+HAU(fY{5h8f}NlTb-_`K4snz~h@(4%_$iFf z5QvVCXTAC1*Chu$$6@n%*@chGbxBR}$#PxN@}AV{UFtYweR^Hv9f(i++|PP2>v`+~ zwH7Gojkvv`q2EKomH2o~`^XRT!5B8?aNIxNI@VVmd@Zwly{dghJ08!)OnA;-p>G2P zGDbHvw4WV{eGonvd%HeP!8U880Akli>@_!3A8X-*^_~o24w?KOi#{83%k_J-WeS^c zU4uE)4L^RqzYrsyk>_X4KnC99Y5#EO_^V+n{P_8NIt9WyxEK;W_rs}H7XoYH!*%>) zV2_Q*?|P?tnZhQ=zYcSF{NcHF8TQ3Ees4jVM93*>Xy`I0l-FyuuV`F*)<-YbmGCji z^6}R=j>nT@F^{Jm-g((?`FbUefncQ zH0w0F(&%SQ`|`~G^FlFy3HvyA0K`7eTHJV+3U?Sb8!Ou<|>^6&Z$!TZ)RyofoppAWJ5vdUqJI$562f<5@Ml!-WDrsxcduim7&DC=CbT$Cp3MgW zrSd*(LW{{YU7(90sjU~#A%{_pEAH=4W-+h43+F$}$Mf1pv^aU3#Eerry!*0W`4SG7 z!WTKTzZcrMn1wN&i%qR1_5G*9owvMS%|^~$&o;`ThautoIdsT*G(J2FNlZSHC*t@2 zmaiq+S9pBwYe~(D6L$Ii76n*IYL@yn#dTwMqGs_%4Rc@c5eZ)JBY1 z-}^6a0*8&y`AZhK>&ko|gA0kV>pSjP=weu6{0usbag=j~XK8YuD~{k@pKX@?N^L(p zhb+!L=joU!6ezr>p`p){p<{X4kYWZ3eo3|!iTJ|Wx^Z$jRV|mJR zEEaE`ugWnGB~Srh<1JqcwXg8nOUHh5?czCUxRU^S0lagw!}9T@$A@)Z^4yp9mUo~K zKKeWr;%tHT5ha&B_3b{4TG(Hp>`P5Q0sE|J=CV2OnKe5P`!0nvea%)SJ#dk`o=aiV zG_Kj7^u9#y6@B*JiX3j=)cd7iKj@+>7%R{rM&dCT8As+#62!U&^l%Z$H%}-B;3EzC z$&d~ZW81!8@Y8o55NtVH)Nd&}P5dUfOJS3>=5ehXo@e$N@5!@BQ2Hc30`KMYdpeW@ z^R*Ag$ak&fDa@FKQwLwmlrQ1>G5BZ)Auo(YDIA`GPtSF!T*;?by5oE|^h~3z{vH37 z$42`P>2FI>S${ zUz!Wg^@t1JP5irDhW5Z5ytgyj^7RmG(pPwXTCw%yjz6|X;A5NeA$2+n_QQEM&kzbQ zmI*O2_FM->IB&0OJSFdb$awC*GQc{{ovce`O%-e6C4jc3<>8Cq7#F24-c_9 zPrR?8VUO}9e3ZjiI3Fet7h=rL!zQfLjd?idId>kG!nSGT#P#>Srb8D)!pTf@VsPTQ zUWv^~SG*7Nt@0(D%z!U)!kjnf)pYck^QjQ)y25jMboo@2R4#S%F%Nc4BOk8c0nUdm zhJ+8^2O}R*Y9;RUH3gY^e?!AjFJf?r}|LBSzP6 zW=2B~LlSeoV@HhH5R-3)OJ2+IIg{sFt{?8o_T?W~Ojzc|aSwFah$Cy@WwQQO^m<)l zdiiIog5O#=_)aHli+Rs;9d-EG5pvV08^(>hb@;7#u&n=zA{Kw_FM(g?Wh$gI#NaPQ z-!$5}BlL!!v{JH+N$sp)qNMbx09b$Cxmxuk< z`!Sw#S_FN8dJnE+;_CPth6gQNr4C$XjUaaxE>G%3;+nI<&DR9jW0RV2{HA;=^e`-Z z-HT2*Uj`S?K^Csqm*x7(ByFV|SFG>P-JoZ1G3LaT4+*aW#?iV?@Wj!7ov_)1&*shb z`?+ch-ybwSu4Cc&x?>-?({-9@&fmE&x$BTQYZ_zbdapPOx)_p}m7qgz$b<2njDD)# zXT2xjxu0(3l6-BD+vtJ49(<*+YZ~#n?#t}3@bP>=e1jndpIO7mjkQjAVi$d#m{H~C z@h;dljW~C1>n!J@HU*31d~iaC^57KcVo0v@cprkb#~kB1h*y-@iO{AIXKerknsD*RBd35A&BkqTC#>(|f>_X^aNPLe+hrSsrDt1cN+`+C}wDBi>;)YLU@ocyk18gaV*26q?eVn}d~Lx(u+_3adF zGrtNTJs~F69-QF`Elk$J2iNOlNC$|?BXi$|*j)E)R$JJ34kI?!vC%hqSb%-@^;v9e z>V!GuWjC(Lu*XJR**8y?wd|Ujqq3H@9?{~}tD%P>;p`4{;>|w~E^mF*Hxu7M3ty4O zNB+ZkWA2f|P)yZzfAK5sd@8|Q)5wkMH`*hiiy`5L@%7~s*A6?MJZpzDpE~^2jgQv` zrV*d(?{2h#E`|i(Xmp6rSC{tpaSy)8*je|@)_QP;*X4Qds(WWWzF8+ZR(jIz;7SW8 z*PCcK?e)WRgCy$uQMlG!2j+PJxpe(rc@Xq4BwUWt814Er-*Y!&%&t!lMl05;KI=WW zu7^D~;#xJSMC$YTbt&@Q>9hw{1P@xcZo?dMmk%-dZ{xDh4<1~RG1ljYigoThH2C1s zi*Yt|F)ZATL?=AH2ID-8Wk5`gHsOAY{#JnhkD}TgU!FVJ$fN7G*u9{KA;EYnI>hLT zFEQHj^#vDz0mkh0*Zgq!u*dJcn9oKYroLJ#yi9#n-7`48v>~`p`IT$z z;h577!o25vUjO%n=sfZ)XI6;hvRbz9x^>*m$Gj(44?mq@%hVIPjg7uWEpL}SHW=dI zX6$2(n8!xP=AaAqd+vcrr6s)Tt~0|2)V-X1Z{grMhd6lNGdPHsQi?AJyw>SE{+7Hk zV@CDNy-4@4uMGYfzhp=Uh|!I8|7|??W=#aC)5YFeZ{2bCu z{1Mbv!B@1p-7qFM>-^eK&JKlda$5v3InVE#6k@v$5*=H3Ud2c0VPJ68+l%nWMs6-V zHBZ*R_9G|faJymMNw4lx*oDjAW1n3I#M>9rG`ybm4}N_4Irr|2-lNxtIAs2j!(s?I zWE~JMg_wQ~iH<4q^b{Y9`0ld7%_HNyKBs~5O9h*nu z&w(@*b3`)2MlSsLc&s0{AZxY4A$87cD)LwYA&=BEJQ5Rm)|fo^bAEW8&v?UK=jMI| zAPA)PEDQjSgbm4D*gy+E8r_yo#(7rpV@w6 zC`XBd+%hjroga@rh1je*_h4&Yo$mw&SDo`(la28${NN4A&%hTWB|k?l`cZI;!Y*9? 
z26OEEB;LM|=(!M%*Inm(;fMANC+JEeuD5!nIe$s|j=N=B5 zU+1Zt+&r>Y*ofo#3ICCt%=`Bw$;s5NMKztEhausS_-r28&w)heF%ok`GQvh0{P=j> zxAI3>3lt8i^L)%FkHk(MQz3>&Vj2R`m^`=het4Z1zvZs;2jSZ^Vj6h;W5Vg@DXoOl z{%PmejD#+Rgj3cHF`GK)u~>E9RQwUtmcduFIzQ>>8#2FFc!b+);h)?VLC7tS!JN;D z&8l+`w(z=%?@Uy?*`1$H!Y3PXELnM$oX`8zj>+NnEeY#`tD%RkaQQ2A$R+cGLX6P?)o?YKfd~yzvA|+ zp<6>7{t|xe`k_v*J6 zcJk)SOJcY3(u2MEd0FehKNEi0h_^?b`fcJ>bB0S^em9i`iBexp#UDX!Dttw&_fFlv&X}+8$o1J3z(a0} zAmosUT*~`xe4uT%Kg1sL)#7-Sz zXD)CaC9J+Y*qdKpsUNuU^Z6bdF{IzqLh$$4_OR5~qv=CyIzbOZk_&y&iBey2V~$7y z*iD0fIXPzc`+2NHo!Gj9B#I=v+G9RBN7a424M#YKOeO;9BcmKLs z_36Jd$739>q&_Y~C(iotU}}DSwqA5{&dj?a{jrd*A;U9N!)x(^>pZBSUAl>hn$)^=doCI-c5yZrzHTo1{v##$v*qT@8dx61K=QqM98)JKSX*a3! z9m8Lf{OplaRP(;VE?j0{4!JaSPP}~~(Q_djue;6hCKw|@Ci=FpEq99{~aHt)6pTPo;q(T{s?Mkz*n?7zh&m8j262?IeH1O zklP}NsdICk$sD!n+=DH=ZsMbKGcdU7d;om1F+VHjt(Q9gp%Q_Uic4porgcy`)bpZQs*a^ohx;|bauz;>!Azi!YO~VK~6n&-cZ9POuV|AsdT*#sSnqMxi3d~j>m>VA zcb$v?CN}c&@k1BOdQ-LGU8$4T=a$!82t5o*OfNtuN}a@wIU<;_n+rd_cz$@>_RQ4^ zht$dWm>;c9_`ZW)126vndeLRZJ~y9t!#5lGT=K~qQX?ZCyGCl{#x|E#Pl6tXgwONP zA)iAaCNH`EHy68$*9!Ra@p}5EnvAxeh4Pc@o}Hh>WSX6FfUFZ}p;tU7va)}YTr zJe~ueHjm_$cqzoh=i@OjHM`JL|LNv&7<{r3L%Zz(sq@a6Pf4BUeU@F*33?b39)HX^ z5XPJHiN7zT$(S7pnFc>T9#8*nRptVPL+U&gKJ7YRh;8CAnCv?Dyh!kaPl(0)-F%ww zx5P4hSe4ZI#6D+BoxiuNU-fY4VMzEq8y)g#>U<&kcAYmByMxw18T|QpU0%92<6DJC zcuj$SJ3o00rp}3taW}E{U~6`r*Z#}RXAyj|5zA8(PLMhu^YUXk-2V9N(ZSWwLs$6x z5ju8$F2uH7=W$|o*7;`m@$tBL=8ai>4utAF8GdaZdtjS*DGMQXoqI4fyUvroaPw%s zj}gO#bt~k$_xVZPrOpd(%&0kP;Xey=?D{Iew!zP1vFpq80RIQqSDoiLM!-KC@$_Dw zB**dYxr?Q~l84<`y#aa{l6>ff4)ej(R{{EVeKi%ktG@QapD!=+F9>7~IvC1}GvVLP z3m${1FJiOn%Y&`i^_BW>cV3w1LB#TMp*oL#{Z3^Lx4(FBX0Q|V&=o$3-L9_!Y}@q} zCuV1TrNNJn$G?1aN7e!h#~EQbn3H`WW^MA|h^#3`K&ztQICuT{%ySNf_~1MF(elgl zAlzFU4mg2a`1A2n)~zOEwZbFo_vyeBZ+>~OH9Nn`zI5jo-xXkEJ{`LHTXlc(=$|FO z4sA#Zj)xwGB)@onCR%>QiP@Q7GvLR^W50gmvW_Yo!sBVcVe`oRB7RB%#9qHWm?Cp6 z&MV%{U%7c44!>-~@Z8%+WZmm?@*}c-FMRBZnri4_NO3}kEjWb7j_^qyOCY>&Q3x?S5|e!%@jRUO)=Hdo2D%=0^JzaH`?&K0xlXHH ztgh1@ZQrT7E%X3U_~h}D&mj=v=W*EQk)~pI&>BdCKOe9CaGkb5;SpX>g?~FgiOHNt zh%FytV(r1!>^dLyjhjz%ABb4a8t|RedB;-qyMwP+-Vz)NJq!t-?a{IGlbG%ENSv6R zbzTfVJ|1f({UU3hg@eCkvT-mc`$A%H7g^hmfYwLB@!^&GGe>_LsuNtrLg$s>iB z>^kv0WdFf+lKY)IuJ(1~&X-n6oveTJ0=eEg`;F}CUeE(XiR;Pe@K{Wp@Hp%`X)1OH zt$}>_^TqdxiAOUwC_GXpZQ(z>PRz49VzcYSgRR+hGUte!PrhHw#@H6^u8=x;cE@!& z+#dbuhMK9+Ls$6Z?;gn~adMoP?K+7Qv$IYr;K#?~hO^JhYWts1ekQ@M&0_(!iN|2F z&xamNku?+N0=^SCT=#$M_2#AfOXPfb>Dw*keE9b%UsP{V_=U$4fz{@b`1?XkoyU#Y zk&t`g$H(LB_7gJ)eIMen4Sd>l&U;vf#{%^2I`_OR@Pq3-?Z0k5Z-aj};_3ZV2jTOo z+n$#?A8^85)!m?nA>s1`bnH6kaoBa^d*0ck^j~uj;>JhSd30z0~_R?_U`Rj)op!37=e(?EEBVyUydp?5y)L z`0?>LeA5|O-zpr!V=MTzc`U%T$xmXk>)eB>*>zs~KR1u#;Fpa!QXY9(^77Qje<^kT z(UR1f)zHI`@W`0kJQiTPFC<3nBQZM?ax?t+c>Heii4KFQ+{2>YciLOd2= zI~(HpEq9!&BL4qT_;am`C9us#zG^qN5x)L-e7V%|Ck6La?^8I1FTR&dzS8x4;!Hu` zgH5xd{v({7u;b(7(fKE4UfmGl17_vDBVOmRPoXcyCi3qp#znzJ#ba+6J+5fXxSL@k z3Dy(vtI;GJ4bN3BUhM1r*Sijs>zExY-j(}_nZJ2`9sU<~{Qbs6yvIW;9k3-$cp{hN ziE(3}_|JqyV!!H!;;Zf)W&W?ABtgR*Ahu)wJ|s398*R2c*eQeOw1nh9VBT5C9Y{M! z9d2S3Loy({;j|Jm7}6E8xoaRW3z7>-gsjDlky1zx$YI=^SOO`6Btz@FqzIArLBneW7i^h47fsl5PAc!x6WH11bT3iH8fMi1s zfW+mHLP#3q5KioskP(p1kexWe&VuAa5+R#$qAG#pKn~+1u@+JaNrxQ4qPqk#3X%-@ z9E;xzkOD|5qz;S8Jjg&u2gr65@(f5G3mQ_3;+X)+h8#jdR6z&`QV%NLR>S zs7!+lP^y8$>K%y_U{>-96v}V4_=)L(lkg8eDXO(X1|boFjPBB6jnS!+`D zCu^FmEtHeA9ol0{!aFcUO80o~qAq===}+!ES?i#y8|$LAWUy5i$AL2J4M~!E&UGSf zBBcV!`V({nPJzu1+Ai(0%>$Nth2=QTaUI90nA;K3UhC2h+d=j0{5Wn>a@8~aPUOes ze-ig=8??#R8Sk+KqhOwaIVBKIr#{<2=qy*hyqN!lc{3r?A=ITEwngeWbtC2@F)-f? 
zod+PaL7QxRH>ig?_7;q@i?hkt|3}Ca2z6^Ysh&@}Y?<<^t>^+1M|i~#zLq|J8WIV|8Xseo5HbxdhYMVP>d-> zdM@p<)d53>((~rwU6?lkG9E%dw8O^t6R@W-4@zVB9|oN}A+$l8Z28FDdiC7RS?{-F z&NUF~P@ioYbPmM8JPPwhLvDjmmv-28sXA?pTe^y)m$L%sjD*kzZL%dI*G4HlZ`^Lg zyh6xL5b72{*tUuP5N8})69dD|m~#V!b7_}t1~42_dfvE=z`R=_w?L>%J8ZdO7%~(m zUJl6ra0qSCCR-Bnwn{x8bH?rWm~%OVI@D*&flj67pE>JVbB195P{{8f)TJG^x$4<+ zW84nJz?=u2>mjs3n`}Y&=erB&dgFF22J<1;L8wbRY}3X6v2iO>w!FOk8hXEi&<^dg z3p6^t8-njL{yj;i? z5bEYY*!qe85N90guGSkbhM!?hUkK;YE?Ye?6evA!+;~m)3&_tQ)TJG^D$xsJSQ5iO z`R@;*4ccTYLB8*f;XfPm`apU^s7pI++fKqm`A8??z*5B~*9&l`iD zn3oQ@1VUZfVN=i1L&wjt7h+)eDdt=Z;au8fD@DHVReIhSWMEz<R5Vmya%+vh)YUd}IcM&8FLS5Qno384# zF>bqKVEz$wE`ZPmZL(Fte`odHfH!VuW8RM;sSxVY4%;O0AL5K-h02zfw{tKj1;V+s z%a(z>tx$U2xSfZ2=R>+fs7pI+heR)gVQ&oo$n&jA&l@+M$cz)J z(GFWL@gL%hV_ns=eJ_R&X5b98$tr9wkntxy1{)2h{g&cuUmv-2`SG8>@4df|1 zUe3ON&UX;npiQ<^O~q8@~EX_svYFeK?? z@p14C=6wq}452RVuoZ-1$X9l}9FYI7A+$l8Y~7K!Gh+DvC+7VN@;QXMw8J(@)oEi4 z_>KT}d;Nce!OtMHL7Qw#;6F*dJK&AM$C&pCWH*Gmw8J)5{2%+hYOu29#jpo@yCAef zyKHI5_gPBMTRWd(-ag1)2z6x%g*ktQP>1?#^Pt0b1km+j-h{!;kT*FALOW~+)idtK zxaF!idO53x&PE7r&?Z}FdI)uChmG$6U{B+Hi~-7ymjm)&3!x3#WaGO*wd(noGj4yyoaZ6b zp*~w4bgE)teg*T^LSBYYmv-0|tJ-diTYU`7%b>FcLL0QnwiW)<^*aJBvE2^R6~gO4 zesbS($V$ix2z61WrCu^g!EczJvfI*&nUgErZck;i39&l}5!Fz;c=YzTE} zhi$*C9U&e$wlxNZIhZpG!nw4|RtXG=Y8~>%@=?s22bl|@F72=l3B!=5?07jK|Bpau zgErYxk;~I!_&*c#&Vq1_p)T#P-L0M{H^yLV4F9J<=L`sK&?Z|Y{3q(?_`WsmRLtuL zVSZ7UcGyOX|6^m2uWWfa=mfp?5Za+#w(h8b8A{I^gVQmu3#2oIy0pW#Tl7L0>SFjm z89JvyXoEJ{mcW0K=F+Z%9vCz0dRz3{K~92Dmv&TZgZ=6mT^qHdF&+iVrk4x)N`lZH z?X#sLhvzB1V`?HDV+KnC`mG?xL#RtTs{KE=w8*MBD{Cqj=lG@pS1$mt&$jS9-ewK8~t>%T}spQrfD|gid<9oBC{dj^5si&z^c;mfqJ-eI{I~x3AUX z*X!*Bv!~vdrT6typ9xp$?Q8Y; z^?Lh8y**NIk5!)udhJe`qQ|G}@pR<*T)l6B-nUqNCM?t2tMz!b-W~y+O?uxpy>Ey5 zOxUBh_v`Vm^mYYwj_Q4_RBffSRi6o+^maG(+43B{eWBj&rMLU2&xHPZ`)WNtNN*=# z1>qVpT<;&P_m5Yf36u2pR6Sm|j+gZ_(2^@dYt@I{{{wjop=SU9 literal 0 HcmV?d00001 diff --git a/examples/nntool/visual_wake/vww.c b/examples/nntool/visual_wake/vww.c index f8158224e..84ec96b16 100644 --- a/examples/nntool/visual_wake/vww.c +++ b/examples/nntool/visual_wake/vww.c @@ -11,7 +11,7 @@ #include "vww.h" #include "vwwKernels.h" -#include "ImgIO.h" +#include "gaplib/ImgIO.h" #define __XSTR(__s) __STR(__s) #define __STR(__s) #__s @@ -30,137 +30,90 @@ AT_HYPERFLASH_FS_EXT_ADDR_TYPE __PREFIX(_L3_Flash) = 0; -#ifdef __EMUL__ - #include - #include - #include - #include - #include - #include - #ifndef TENSOR_DUMP_FILE - #define TENSOR_DUMP_FILE "tensor_dump_file.dat" - #endif -#endif - // Softmax always outputs Q15 short int even from 8 bit input -L2_MEM short int *ResOut; +signed short int Output_1[2]; typedef signed char IMAGE_IN_T; -L2_MEM IMAGE_IN_T *ImageIn; +unsigned char Input_1[AT_INPUT_SIZE]; static void RunNetwork() { - printf("Running on cluster\n"); + printf("Running on cluster\n"); #ifdef PERF - printf("Start timer\n"); - gap_cl_starttimer(); - gap_cl_resethwtimer(); + printf("Start timer\n"); + gap_cl_starttimer(); + gap_cl_resethwtimer(); #endif - __PREFIX(CNN)(ResOut); - printf("Runner completed\n"); - - printf("\n"); - - //Checki Results - if (ResOut[1] > ResOut[0]) { - printf("person seen (%d, %d)\n", ResOut[0], ResOut[1]); - } else { - printf("no person seen (%d, %d)\n", ResOut[0], ResOut[1]); - } - printf("\n"); + __PREFIX(CNN)(Input_1, Output_1); + printf("Runner completed\n"); + + printf("\n"); } -#if defined(__EMUL__) -int main(int argc, char *argv[]) -{ - if (argc < 2) { - printf("Usage: %s [image_file]\n", argv[0]); - exit(1); - } - char *ImageName = argv[1]; - if (dt_open_dump_file(TENSOR_DUMP_FILE)) { - 
printf("Failed to open tensor dump file %s.\n", TENSOR_DUMP_FILE); - exit(1); - } -#else int start() { - char *ImageName = __XSTR(AT_IMAGE); - struct pi_device cluster_dev; - struct pi_cluster_task *task; - struct pi_cluster_conf conf; -// gv_vcd_configure(0, NULL); -#endif + char *ImageName = __XSTR(AT_IMAGE); + struct pi_device cluster_dev; + struct pi_cluster_task *task; + struct pi_cluster_conf conf; + // gv_vcd_configure(0, NULL); + + //Input image size + + printf("Entering main controller\n"); + + pi_cluster_conf_init(&conf); + pi_open_from_conf(&cluster_dev, (void *)&conf); + pi_cluster_open(&cluster_dev); + pi_freq_set(PI_FREQ_DOMAIN_CL,175000000); + pi_freq_set(PI_FREQ_DOMAIN_FC,250000000); + task = pmsis_l2_malloc(sizeof(struct pi_cluster_task)); + if (!task) { + printf("failed to allocate memory for task\n"); + } + memset(task, 0, sizeof(struct pi_cluster_task)); + task->entry = &RunNetwork; + task->stack_size = STACK_SIZE; + task->slave_stack_size = SLAVE_STACK_SIZE; + task->arg = NULL; - //Input image size - - printf("Entering main controller\n"); - -#ifndef __EMUL__ - pi_cluster_conf_init(&conf); - pi_open_from_conf(&cluster_dev, (void *)&conf); - pi_cluster_open(&cluster_dev); -// pi_freq_set(PI_FREQ_DOMAIN_CL,175000000); - task = pmsis_l2_malloc(sizeof(struct pi_cluster_task)); - memset(task, 0, sizeof(struct pi_cluster_task)); - task->entry = &RunNetwork; - task->stack_size = STACK_SIZE; - task->slave_stack_size = SLAVE_STACK_SIZE; - task->arg = NULL; -#endif + printf("Constructor\n"); - // Allocate some stacks for cluster in L1, rt_nb_pe returns how many cores exist. -// void *stacks = rt_alloc(RT_ALLOC_CL_DATA, STACK_SIZE*rt_nb_pe()); -// if (stacks == NULL) return -1; - - printf("Constructor\n"); - - // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!! - if (__PREFIX(CNN_Construct)()) - { - printf("Graph constructor exited with an error\n"); - return 1; - } - -#ifndef NO_IMAGE - printf("Reading image\n"); - //Reading Image from Bridge - if (ReadImageFromFile(ImageName, AT_INPUT_WIDTH, AT_INPUT_HEIGHT, AT_INPUT_COLORS, (char *)vww_L2_Memory, AT_INPUT_SIZE*sizeof(IMAGE_IN_T), 1, 0)) { - printf("Failed to load image %s\n", ImageName); - return 1; - } - printf("Finished reading image\n"); -#endif -#ifdef PRINT_IMAGE - for (int i=0; i Output_1[0]) { + printf("person seen (%d, %d)\n", Output_1[0], Output_1[1]); + } else { + printf("no person seen (%d, %d)\n", Output_1[0], Output_1[1]); + } + printf("\n"); - printf("Call cluster\n"); - // Execute the function "RunNetwork" on the cluster. 
-#ifdef __EMUL__ - RunNetwork(NULL); -#else - pi_cluster_send_task_to_cl(&cluster_dev, task); -#endif - - __PREFIX(CNN_Destruct)(); + __PREFIX(CNN_Destruct)(); #ifdef PERF { unsigned int TotalCycles = 0, TotalOper = 0; printf("\n"); for (int i=0; i<(sizeof(AT_GraphPerf)/sizeof(unsigned int)); i++) { - printf("%45s: Cycles: %10d, Operations: %10d, Operations/Cycle: %f\n", AT_GraphNodeNames[i], AT_GraphPerf[i], AT_GraphOperInfosNames[i], ((float) AT_GraphOperInfosNames[i])/ AT_GraphPerf[i]); + printf("%45s: Cycles: %10d, Operations: %10d, Operations/Cycle: %f\n", AT_GraphNodeNames[i], + AT_GraphPerf[i], AT_GraphOperInfosNames[i], ((float) AT_GraphOperInfosNames[i])/ AT_GraphPerf[i]); TotalCycles += AT_GraphPerf[i]; TotalOper += AT_GraphOperInfosNames[i]; } printf("\n"); @@ -169,19 +122,12 @@ int start() } #endif -#ifdef __EMUL__ - dt_close_dump_file(); -#else - pmsis_exit(0); -#endif - - printf("Ended\n"); - return 0; + pmsis_exit(0); + printf("Ended\n"); + return 0; } -#ifndef __EMUL__ int main(void) { - return pmsis_kickoff((void *) start); + return pmsis_kickoff((void *) start); } -#endif diff --git a/examples/nntool/visual_wake/vww.h b/examples/nntool/visual_wake/vww.h index 7870ebe44..6724cccca 100644 --- a/examples/nntool/visual_wake/vww.h +++ b/examples/nntool/visual_wake/vww.h @@ -13,7 +13,6 @@ #include #include #include -#include "helpers.h" #endif extern AT_HYPERFLASH_FS_EXT_ADDR_TYPE __PREFIX(_L3_Flash); diff --git a/examples/nntool/visual_wake/vww_emul.c b/examples/nntool/visual_wake/vww_emul.c new file mode 100644 index 000000000..783304455 --- /dev/null +++ b/examples/nntool/visual_wake/vww_emul.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. + * + */ + +#include + +#include "vww_emul.h" +#include "vwwKernels.h" +#include "gaplib/ImgIO.h" + +#define __XSTR(__s) __STR(__s) +#define __STR(__s) #__s + + +#define AT_INPUT_SIZE (AT_INPUT_WIDTH*AT_INPUT_HEIGHT*AT_INPUT_COLORS) + +#ifndef STACK_SIZE +#define STACK_SIZE 2048 +#endif + +AT_HYPERFLASH_FS_EXT_ADDR_TYPE __PREFIX(_L3_Flash) = 0; + +#include +#include +#include +#include +#include +#include + +// Softmax always outputs Q15 short int even from 8 bit input +signed short int Output_1[2]; +typedef signed char IMAGE_IN_T; +unsigned char Input_1[AT_INPUT_SIZE]; + +static void RunNetwork() +{ + printf("Running on cluster Input_1(%p) Output_1(%p)\n", Input_1, Output_1); + __PREFIX(CNN)(Input_1, Output_1); + printf("Runner completed Input_1(%p) Output_1(%p)\n", Input_1, Output_1); + + printf("\n"); + + //Checki Results + if (Output_1[1] > Output_1[0]) { + printf("person seen (%d, %d)\n", Output_1[0], Output_1[1]); + } else { + printf("no person seen (%d, %d)\n", Output_1[0], Output_1[1]); + } + printf("\n"); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + printf("Usage: %s [image_file]\n", argv[0]); + exit(1); + } + char *ImageName = argv[1]; + + //Input image size + + printf("Entering main controller\n"); + + printf("Constructor\n"); + + // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!! 
+ if (__PREFIX(CNN_Construct)()) + { + printf("Graph constructor exited with an error\n"); + return 1; + } + + printf("Reading image Input_1(%p)\n", Input_1); + //Reading Image from Bridge + if (ReadImageFromFile(ImageName, AT_INPUT_WIDTH, AT_INPUT_HEIGHT, AT_INPUT_COLORS, Input_1, AT_INPUT_SIZE*sizeof(IMAGE_IN_T), IMGIO_OUTPUT_CHAR, 0)) { + printf("Failed to load image %s\n", ImageName); + return 1; + } + printf("Finished reading image\n"); + + printf("Call cluster Input_1(%p)\n", Input_1); + // Execute the function "RunNetwork" on the cluster. + RunNetwork(NULL); + + __PREFIX(CNN_Destruct)(); + + printf("Ended\n"); + return 0; +} + diff --git a/examples/nntool/visual_wake/vww_emul.h b/examples/nntool/visual_wake/vww_emul.h new file mode 100644 index 000000000..88a4c860c --- /dev/null +++ b/examples/nntool/visual_wake/vww_emul.h @@ -0,0 +1,18 @@ + +#ifndef __VWW_H__ +#define __VWW_H__ + +#define __PREFIX(x) vww ## x + +#include "Gap.h" + +#include +#include +#include +#include +#include +#include + +extern AT_HYPERFLASH_FS_EXT_ADDR_TYPE __PREFIX(_L3_Flash); + +#endif diff --git a/examples/pmsis/test_periph/i2s/output/Makefile b/examples/pmsis/test_periph/i2s/output/Makefile new file mode 100644 index 000000000..1ceceecb4 --- /dev/null +++ b/examples/pmsis/test_periph/i2s/output/Makefile @@ -0,0 +1,36 @@ +APP = test +APP_SRCS = test.c +APP_CFLAGS = -O3 -g +APP_LDFLAGS = -lgcc + +CONFIG_I2S=1 + +SAMPLING_FREQ = 44100 +WORD_SIZE = 16 +SIGNAL_FREQ = 4000 +STIM_WAV_0_0 ?= $(CONFIG_BUILD_DIR)/stim_0_0.wav + +override config_args += --config-opt=**/runner/gvsoc_dpi/enabled=true + +override config_args += --config-opt=board/components/mic0/mic/stim=$(STIM_WAV_0_0) +override config_args += --config-opt=board/components/mic0/mic/stim_incr_start=0x0055 +override config_args += --config-opt=board/components/mic0/mic/stim_mode=incr + +override config_args += --config-opt=board/components/mic1/mic/stim=$(STIM_WAV_0_0) +override config_args += --config-opt=board/components/mic1/mic/stim_incr_start=0x1055 +override config_args += --config-opt=board/components/mic1/mic/stim_mode=incr + +override config_args += --config-opt=board/components/mic2/mic/stim=$(STIM_WAV_0_0) +override config_args += --config-opt=board/components/mic2/mic/stim_incr_start=0x2055 +override config_args += --config-opt=board/components/mic2/mic/stim_mode=incr + +override config_args += --config-opt=board/components/mic3/mic/stim=$(STIM_WAV_0_0) +override config_args += --config-opt=board/components/mic3/mic/stim_incr_start=0x3055 +override config_args += --config-opt=board/components/mic3/mic/stim_mode=incr + + +gen: + sox -n -r $(SAMPLING_FREQ) --bits $(WORD_SIZE) $(STIM_WAV_0_0) synth 3 sine $(SIGNAL_FREQ) vol 0.995 + + +include $(RULES_DIR)/pmsis_rules.mk diff --git a/examples/pmsis/test_periph/i2s/output/test.c b/examples/pmsis/test_periph/i2s/output/test.c new file mode 100644 index 000000000..b80fead6e --- /dev/null +++ b/examples/pmsis/test_periph/i2s/output/test.c @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2018 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This example shows how to use I2S to send data to a DAC and/or speaker. */ + +#include "pmsis.h" +#include "bsp/bsp.h" + +#define NB_ELEM ( 256 ) +#define ELEM_SIZE ( sizeof(uint16_t) ) +#define BUFF_SIZE ( NB_ELEM * ELEM_SIZE ) + +#define NB_CHANNELS ( 1 ) +#define NB_ACTIVE_CHANNELS ( 1 ) + +PI_L2 static uint8_t ch_buff[NB_CHANNELS][2][BUFF_SIZE]; +static struct pi_device i2s; + +static void buffer_init(void *buffer, uint32_t size) +{ + uint8_t *buff = (uint8_t *) buffer; + for (uint32_t i=0; i 0 && nb_transfers < 2) { - current_size[current_task] = ITER_SIZE; - if (remaining_size < ITER_SIZE) - current_size[current_task] = remaining_size; + int iter_size = ITER_SIZE; + if (remaining_size < iter_size) + iter_size = remaining_size; pi_task_t *task = &ctrl_tasks[current_task]; // Enqueue a transfer. The callback will be called once the transfer is finished // so that a new one is enqueued while another one is already running - pi_camera_capture_async(&camera, buff[current_task], current_size[current_task], pi_task_callback(task, handle_transfer_end, NULL)); + pi_camera_capture_async(&camera, buff[current_task], iter_size, pi_task_callback(task, handle_transfer_end, (void *) current_task)); - remaining_size -= current_size[current_task]; + current_size[current_task] = iter_size; + remaining_size -= iter_size; nb_transfers++; current_task ^= 1; } @@ -77,25 +76,20 @@ static void enqueue_transfer() static void handle_transfer_end(void *arg) { nb_transfers--; - current_buff = current_task; + current_buff = (unsigned char) arg; enqueue_transfer(); - if (saved_size < BUFF_SIZE && nb_hyper_transfers < 2) - { - pi_task_t *task = &ram_tasks[current_task]; - pi_ram_write_async(&ram, (l3_buff+saved_size), buff[current_buff], (uint32_t) current_size[current_buff], pi_task_callback(task, handle_ram_end, NULL)); - saved_size += current_size[current_buff]; - nb_hyper_transfers ++; - } + pi_task_t cb_tx; + pi_ram_write_async(&ram, (l3_buff+saved_size), buff[current_buff], (uint32_t) current_size[current_buff], pi_task_callback(&cb_tx, handle_ram_end, NULL)); + + saved_size += current_size[current_buff]; } static void handle_ram_end(void *arg) { - total_hyper_transfers ++; - nb_hyper_transfers --; - if (nb_transfers == 0 && saved_size == BUFF_SIZE && nb_hyper_transfers == 0) + if (nb_transfers == 0 && saved_size == BUFF_SIZE) done = 1; } @@ -197,12 +191,11 @@ static void ov5640_test_pattern(struct pi_device *device) static int test_entry() { - //printf("Entering main controller\n"); + printf("Entering main controller\n"); pi_freq_set(PI_FREQ_DOMAIN_FC, 250000000); pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES); - pi_perf_reset(); - uint32_t cycles = pi_perf_read(PI_PERF_ACTIVE_CYCLES); - uint32_t tim_cycles = pi_perf_read(PI_PERF_CYCLES); + uint32_t cycles, tim_cycles; + uint32_t start = pi_perf_read(PI_PERF_CYCLES); pi_perf_start(); @@ -257,7 +250,7 @@ static int test_entry() unsigned int cfg_glob = (*(volatile unsigned int *)(long)(0x1A1024A0)); - cfg_glob |= ((0x4<<1)|0x1); // enable frame drop, and drop 1 image + cfg_glob |= ((0x5<<1)|0x1); // enable frame drop, and drop 1 image (*(volatile unsigned int *)(long)(0x1A1024A0)) = cfg_glob; //while (1) diff --git a/gvsoc/gvsoc/bin/gvsoc_analyze_insn b/gvsoc/gvsoc/bin/gvsoc_analyze_insn new file mode 100755 index 000000000..6d985340f --- /dev/null +++ b/gvsoc/gvsoc/bin/gvsoc_analyze_insn @@ -0,0 +1,205 @@ +#!/usr/bin/env 
python3 + +import argparse +import os +from subprocess import Popen, PIPE +import re +from prettytable import PrettyTable +import collections + + +class Insn(object): + + def __init__(self, label): + self.label = label + self.nb = 0 + self.min = -1 + self.max = -1 + self.total = 0 + + def add_instance(self, cycles): + self.nb += 1 + self.total += cycles + if self.min == -1 or cycles < self.min: + self.min = cycles + if self.max == -1 or cycles > self.max: + self.max = cycles + + +class Trace_line(object): + + def __init__(self, time, cycles, path, debug, mode, pc, instr, label): + self.time = time + self.cycles = cycles + self.path = path + self.debug = debug + self.mode = mode + self.pc = pc + self.instr = instr + self.label = label + self.duration = 1 + + def set_duration(self, next_line_cycles): + self.duration = next_line_cycles - self.cycles + + + +class Trace_file(object): + + def __init__(self, path): + self.insns = {} + self.lines = [] + + + with open(path) as f: + prev_line = None + for line in f.readlines()[1:]: + try: + time, cycles, path, debug, mode, pc, instr = re.findall('([ \t]*\d+):([ \t]*\d+):([ \t]*\[.*\])[ \t]*([^ ^\t]*)[ \t]*([^ ^\t]*)[ \t]*([^ ^\t]*)[ \t]*(.*)', line)[0] + except: + time, cycles, pc, opcode, instr = re.findall('[ \t]*(\d+ns)[ \t]*(\d+)[ \t]*([^ ^\t]*)[ \t]*([^ ^\t]*)[ \t]*(.*)', line)[0] + debug = None + path = None + mode = None + + label = instr.split()[0] + cycles = int(cycles, 0) + + if label.find("c.") == 0: + label = label.replace("c.", "") + + if label == 'li': + label = 'add' + elif label == 'mv': + label = 'add' + elif label.find('add') == 0: + label = 'add' + elif label.find('jr') == 0: + label = 'jalr' + elif label.find('swsp') == 0: + label = 'sw' + elif label.find('lwsp') == 0: + label = 'lw' + elif label.find('p.extract') == 0: + label = 'p.extract' + elif label.find('p.bclr') == 0: + label = 'p.p.bclr' + elif label.find('beq') == 0: + label = 'beq' + elif label.find('pv.shuffle') == 0: + label = 'pv.shuffle' + + line = Trace_line(time, cycles, path, debug, mode, pc, instr, label) + self.lines.append(line) + + if prev_line is not None: + prev_line.set_duration(cycles) + + prev_line = line + + + for line in self.lines: + if self.insns.get(line.label) is None: + self.insns[line.label] = Insn(line.label) + + self.insns[line.label].add_instance(line.duration) + + + def dump(self): + for name, insn in self.insns.items(): + print ('%s %d %f' % (name, insn.nb, float(insn.total) / insn.nb)) + + + + +parser = argparse.ArgumentParser(description='Generate PC debug info') + +parser.add_argument("--trace", dest="traces", default=[], action="append", help="Specify trace input file") + +args = parser.parse_args() + + +trace_files = collections.OrderedDict() + +for trace_file_path in args.traces: + trace_files[trace_file_path] = Trace_file(trace_file_path) + + +insns = [] + +for path, trace_file in trace_files.items(): + for label, insn in trace_file.insns.items(): + found_insn = None + for insn_stat in insns: + if insn_stat[0] == label: + found_insn = insn_stat + break + + if found_insn is None: + found_insn = [ label, insn.total] + insns.append(found_insn) + + + +rows = ['Instruction'] + +for path, trace_file in trace_files.items(): + rows.append('Occurences (%s)' % path) + rows.append('Duration (%s)' % path) + +if len(trace_files.values()) == 2: + rows.append('diff') + + +table = PrettyTable(rows) + +table.float_format = ".2" +table.align = "r" +table.align['Instruction'] = 'l' + +total = 0 + +for insn in insns: + row = [insn[0]] + trace_insns = [] 
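The per-instruction statistics assembled above follow a simple cycle-delta rule from Trace_file: each trace line's duration is the next line's cycle stamp minus its own, and durations are then accumulated per normalized opcode label. A standalone toy illustration of that rule, not part of the script (the final trace line, which has no successor, is simply dropped here for brevity):

# Toy illustration only: per-instruction durations from consecutive cycle stamps.
trace = [(100, "add"), (101, "lw"), (104, "add"), (105, "beq")]   # (cycle, label)

totals = {}
counts = {}
for (cycle, label), (next_cycle, _) in zip(trace, trace[1:]):
    duration = next_cycle - cycle        # same rule as Trace_line.set_duration
    totals[label] = totals.get(label, 0) + duration
    counts[label] = counts.get(label, 0) + 1

for label, total in totals.items():
    print('%s %d %f' % (label, counts[label], float(total) / counts[label]))
# Prints: add 2 1.000000, then lw 1 3.000000.

The script itself is driven with one or two trace files via the repeated --trace option, e.g. gvsoc_analyze_insn --trace run_a.log --trace run_b.log (file names here are hypothetical); when two traces are given, the last table column reports the per-instruction cycle difference between them.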
+ for path, trace_file in trace_files.items(): + trace_insn = trace_file.insns.get(insn[0]) + if trace_insn is None: + duration = 0 + nb = 0 + trace_insns.append(None) + else: + duration = float(trace_insn.total) / trace_insn.nb + nb = trace_insn.nb + trace_insns.append(trace_insn) + + row.append(nb) + row.append(duration) + + if len(trace_files.values()) == 2: + if trace_insns[0] is None: + count0 = 0 + else: + count0 = trace_insns[0].total + if trace_insns[1] is None: + count1 = 0 + else: + count1 = trace_insns[1].total + + row.append(count1 - count0) + total += count1 - count0 + + table.add_row(row) + + +if len(trace_files.values()) == 2: + row = ['Total'] + + for path, trace_file in trace_files.items(): + row.append('') + row.append('') + + row.append(total) + table.add_row(row) + +print (table) \ No newline at end of file diff --git a/gvsoc/gvsoc/dpi-wrapper/Makefile b/gvsoc/gvsoc/dpi-wrapper/Makefile index 568c01f76..c589e12de 100644 --- a/gvsoc/gvsoc/dpi-wrapper/Makefile +++ b/gvsoc/gvsoc/dpi-wrapper/Makefile @@ -6,7 +6,7 @@ CFLAGS += -I$(INSTALL_DIR)/include -fPIC LDFLAGS += -L$(INSTALL_DIR)/lib -fPIC -shared -O3 -g -ljson DPI_CFLAGS += $(CFLAGS) -DUSE_DPI -DPI_LDFLAGS += $(LDFLAGS) -lpulpvp-debug +DPI_LDFLAGS += $(LDFLAGS) -lpulpvp-sv DPI_CFLAGS += -Iext/sv/include -Iext/nosv diff --git a/gvsoc/gvsoc/dpi-wrapper/src/dpi.cpp b/gvsoc/gvsoc/dpi-wrapper/src/dpi.cpp index 74d3b3545..df632ea38 100644 --- a/gvsoc/gvsoc/dpi-wrapper/src/dpi.cpp +++ b/gvsoc/gvsoc/dpi-wrapper/src/dpi.cpp @@ -18,6 +18,7 @@ * Authors: Germain Haugou, ETH (germain.haugou@iis.ee.ethz.ch) */ + #include #include #include diff --git a/gvsoc/gvsoc/engine/include/gv/gvsoc.h b/gvsoc/gvsoc/engine/include/gv/gvsoc.h index 0c4906e1b..a63c349d9 100644 --- a/gvsoc/gvsoc/engine/include/gv/gvsoc.h +++ b/gvsoc/gvsoc/engine/include/gv/gvsoc.h @@ -41,4 +41,4 @@ void *gv_chip_pad_bind(void *handle, char *name, int ext_handle); } #endif -#endif \ No newline at end of file +#endif diff --git a/gvsoc/gvsoc/models/Makefile b/gvsoc/gvsoc/models/Makefile index f230c1ea6..4adaa65ac 100644 --- a/gvsoc/gvsoc/models/Makefile +++ b/gvsoc/gvsoc/models/Makefile @@ -21,7 +21,8 @@ VP_DIRS=memory pulp pulp/fll pulp/stdout pulp/chips/pulpissimo cpu/iss \ pulp/chips/multino pulp/efuse board pulp/chips/arnold \ devices/hyperbus devices/spiflash vendor/dolphin pulp/chips/pulpissimo_v1 \ pulp/rtc pulp/gpio pulp/chips/gap_rev1 pulp/chips/pulp_v1 pulp/chips/vivosoc3_1 \ - pulp/mram pulp/hwce cache pulp/chips/gap8_revc pulp/hwacc devices/uart devices/sound + pulp/mram pulp/hwce cache pulp/chips/gap8_revc pulp/hwacc devices/uart devices/sound \ + devices/testbench -include $(ROOT_VP_BUILD_DIR)/props.mk diff --git a/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/int.h b/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/int.h index 71ec60964..8fd3290cc 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/int.h +++ b/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/int.h @@ -54,6 +54,14 @@ static inline unsigned int lib_XOR(iss_cpu_state_t *s, unsigned int a, unsigned static inline unsigned int lib_OR(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return a | b; } static inline unsigned int lib_AND(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return a & b; } +static inline uint64_t lib_SLL_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a << b; } +static inline uint64_t lib_SRL_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a >> b; } +static inline uint64_t lib_SRA_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return ((int32_t)a) >> b; 
} +static inline uint64_t lib_ROR_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return (a >> b) | (a << (32 - b)); } +static inline uint64_t lib_XOR_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a ^ b; } +static inline uint64_t lib_OR_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a | b; } +static inline uint64_t lib_AND_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a & b; } + @@ -84,10 +92,13 @@ static inline unsigned int lib_ADD_C(iss_cpu_state_t *s, unsigned int a, unsigne #endif static inline unsigned int lib_ADD(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return a + b; } +static inline uint64_t lib_ADD_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a + b; } #ifdef ISS_STATE_HAS_CARRY static inline unsigned int lib_ADDC_C(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return addWithCarry(s, a, b); } #endif static inline unsigned int lib_SUB(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return a - b; } +static inline uint64_t lib_SUB_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a - b; } + #ifdef ISS_STATE_HAS_CARRY static inline unsigned int lib_SUB_C(iss_cpu_state_t *s, unsigned int a, unsigned int b) { @@ -109,6 +120,12 @@ static inline unsigned int lib_MACC(iss_cpu_state_t *s, unsigned int a, unsigned static inline unsigned int lib_MSU(iss_cpu_state_t *s, unsigned int a, unsigned int b, unsigned int c) { return a - b * c; } static inline unsigned int lib_MMUL(iss_cpu_state_t *s, unsigned int a, unsigned int b, unsigned int c) { return - b * c; } + +static inline uint64_t lib_MACS_64(iss_cpu_state_t *s, int64_t a, int64_t b, int64_t c) { return a + b * c; } +static inline uint64_t lib_MSUS_64(iss_cpu_state_t *s, int64_t a, int64_t b, int64_t c) { return a - b * c; } +static inline uint64_t lib_MACU_64(iss_cpu_state_t *s, uint64_t a, uint64_t b, uint64_t c) { return a + b * c; } +static inline uint64_t lib_MSUU_64(iss_cpu_state_t *s, uint64_t a, uint64_t b, uint64_t c) { return a - b * c; } + #define SL(val) ((int16_t)((val) & 0xffff)) #define SH(val) ((int16_t)(((val)>>16) & 0xffff)) #define ZL(val) ((uint16_t)((val) & 0xffff)) @@ -279,6 +296,8 @@ static inline unsigned int lib_MMUL_ZH_SH(iss_cpu_state_t *s, unsigned int b, un static inline unsigned int lib_MULS(iss_cpu_state_t *s, int a, int b) { return a * b; } static inline unsigned int lib_MULU(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return a * b; } +static inline uint64_t lib_MULS_64(iss_cpu_state_t *s, int64_t a, int64_t b) { return a * b; } +static inline uint64_t lib_MULU_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a * b; } static inline unsigned int lib_DIVS(iss_cpu_state_t *s, int a, int b) { if (b == 0) return 0; else return a / b; } static inline unsigned int lib_DIVU(iss_cpu_state_t *s, unsigned int a, unsigned int b) { if (b == 0) return 0; else return a / b; } static inline unsigned int lib_MINU(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return a < b ? a : b; } @@ -289,6 +308,11 @@ static inline int lib_ABS(iss_cpu_state_t *s, int a) { return a >= 0 ? a : -a; } static inline unsigned int lib_AVGU(iss_cpu_state_t *s, unsigned int a, unsigned int b) { return (a + b) >> 1; } static inline int lib_AVGS(iss_cpu_state_t *s, int a, int b) { return (a + b) >> 1; } +static inline uint64_t lib_MINU_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a < b ? a : b; } +static inline int64_t lib_MINS_64(iss_cpu_state_t *s, int a, int64_t b) { return a < b ? 
a : b; } +static inline uint64_t lib_MAXU_64(iss_cpu_state_t *s, uint64_t a, uint64_t b) { return a > b ? a : b; } +static inline int64_t lib_MAXS_64(iss_cpu_state_t *s, int a, int64_t b) { return a > b ? a : b; } +static inline int64_t lib_ABS_64(iss_cpu_state_t *s, int64_t a) { return a >= 0 ? a : -a; } @@ -352,6 +376,17 @@ static inline unsigned int lib_CNT(iss_cpu_state_t *s, unsigned int t) { #endif } +static inline unsigned int lib_CNT_64(iss_cpu_state_t *s, uint64_t t) { +#if 1 + return __builtin_popcount(t); +#else + uint64_t v = cpu->regs[pc->inReg[0]]; + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + cpu->regs[pc->outReg[0]] = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; +#endif +} + static inline unsigned int lib_CLB(iss_cpu_state_t *s, unsigned int t) { if (t == 0) return 0; diff --git a/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/macros.h b/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/macros.h index 32386af81..ab269cf6c 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/macros.h +++ b/gvsoc/gvsoc/models/cpu/iss/include/isa_lib/macros.h @@ -40,6 +40,9 @@ #define REG_SET(reg,val) iss_set_reg(iss, insn->out_regs[reg], val) #define IN_REG_SET(reg,val) iss_set_reg(iss, insn->in_regs[reg], val) +#define REG64_GET(reg) iss_get_reg64(iss, insn->in_regs[reg]) +#define REG64_SET(reg,val) iss_set_reg64(iss, insn->out_regs[reg], val) + #define SIM_GET(index) insn->sim[index] #define UIM_GET(index) insn->uim[index] diff --git a/gvsoc/gvsoc/models/cpu/iss/include/iss.hpp b/gvsoc/gvsoc/models/cpu/iss/include/iss.hpp index 191187000..c2d1f4e38 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/iss.hpp +++ b/gvsoc/gvsoc/models/cpu/iss/include/iss.hpp @@ -35,6 +35,7 @@ #include "pulp_v2.hpp" #include "rvXgap8.hpp" #include "rvXgap9.hpp" +#include "rvXint64.hpp" #include "rnnext.hpp" #endif diff --git a/gvsoc/gvsoc/models/cpu/iss/include/regs.hpp b/gvsoc/gvsoc/models/cpu/iss/include/regs.hpp index 921e65df0..4f38dc713 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/regs.hpp +++ b/gvsoc/gvsoc/models/cpu/iss/include/regs.hpp @@ -54,6 +54,28 @@ static inline iss_reg_t iss_get_reg(iss_t *iss, int reg) return iss_get_reg_untimed(iss, reg); } +static inline iss_reg64_t iss_get_reg64_untimed(iss_t *iss, int reg) +{ + if (reg == 0) + return 0; + else + return (((uint64_t)iss->cpu.regfile.regs[reg+1]) << 32) + iss->cpu.regfile.regs[reg]; +} + +static inline void iss_set_reg64(iss_t *iss, int reg, iss_reg64_t value) +{ + if (reg != 0) + { + iss->cpu.regfile.regs[reg] = value & 0xFFFFFFFF; + iss->cpu.regfile.regs[reg+1] = value >> 32; + } +} + +static inline iss_reg64_t iss_get_reg64(iss_t *iss, int reg) +{ + return iss_get_reg64_untimed(iss, reg); +} + static inline iss_reg_t iss_get_reg_for_jump(iss_t *iss, int reg) { //unsigned long regCycle = cpu->regsCycle[reg]; //if (cpu->cycles < regCycle + 1) { diff --git a/gvsoc/gvsoc/models/cpu/iss/include/rvXint64.hpp b/gvsoc/gvsoc/models/cpu/iss/include/rvXint64.hpp new file mode 100644 index 000000000..8317e246e --- /dev/null +++ b/gvsoc/gvsoc/models/cpu/iss/include/rvXint64.hpp @@ -0,0 +1,321 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Germain Haugou, ETH (germain.haugou@iis.ee.ethz.ch) + */ + +#ifndef __CPU_ISS_RVXINT64_HPP +#define __CPU_ISS_RVXINT64_HPP + +#include "iss_core.hpp" +#include "isa_lib/int.h" +#include "isa_lib/macros.h" + + + +static inline iss_insn_t *add_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_ADD_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *sub_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_SUB_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *sll_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_SLL_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *slt_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG_SET(0, (int32_t)REG64_GET(0) < (int32_t)REG64_GET(1)); + return insn->next; +} + + +static inline iss_insn_t *sltu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG_SET(0, REG64_GET(0) < REG64_GET(1)); + return insn->next; +} + + +static inline iss_insn_t *xor_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_XOR_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *srl_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_SRL_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *sra_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_SRA_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *or_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_OR_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *and_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG_SET(0, LIB_CALL2(lib_AND_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *slli_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_SLL_64, REG64_GET(0), UIM_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *srli_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_SRL_64, REG64_GET(0), UIM_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *srai_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_SRA_64, REG64_GET(0), UIM_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *addi_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_ADD_64, REG64_GET(0), SIM_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *slti_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG_SET(0, (int32_t)REG64_GET(0) < insn->sim[0]); + return insn->next; +} + + +static inline iss_insn_t *sltiu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG_SET(0, REG64_GET(0) < (uint32_t)SIM_GET(0)); + return insn->next; +} + + +static inline iss_insn_t *xori_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_XOR_64, REG64_GET(0), SIM_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *ori_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_OR_64, REG64_GET(0), SIM_GET(0))); + 
return insn->next; +} + + +static inline iss_insn_t *andi_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_AND_64, REG64_GET(0), SIM_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *p_abs_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL1(lib_ABS_64, REG64_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *p_seq_d_exec(iss_t *iss, iss_insn_t *insn) +{ + + REG_SET(0, REG64_GET(0) == REG64_GET(1)); + return insn->next; +} + + +static inline iss_insn_t *p_sne_d_exec(iss_t *iss, iss_insn_t *insn) +{ + + REG_SET(0, REG64_GET(0) != REG64_GET(1)); + return insn->next; +} + + +static inline iss_insn_t *p_slet_d_exec(iss_t *iss, iss_insn_t *insn) +{ + + REG_SET(0, (int64_t)REG64_GET(0) <= (int64_t)REG64_GET(1)); + return insn->next; +} + + +static inline iss_insn_t *p_sletu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG_SET(0, REG64_GET(0) <= REG64_GET(1)); + return insn->next; +} + + +static inline iss_insn_t *p_min_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_MINS_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_minu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_MINU_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_max_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_MAXS_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_maxu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_MAXU_64, REG64_GET(0), REG64_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_cnt_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG_SET(0, LIB_CALL1(lib_CNT_64, REG64_GET(0))); + return insn->next; +} + + +static inline iss_insn_t *p_exths_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, iss_get_signed_value64(REG_GET(0), 16)); + return insn->next; +} + + +static inline iss_insn_t *p_exthz_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, iss_get_field64(REG_GET(0), 0, 16)); + return insn->next; +} + + +static inline iss_insn_t *p_extbs_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, iss_get_signed_value64(REG_GET(0), 8)); + return insn->next; +} + + +static inline iss_insn_t *p_extbz_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, iss_get_field64(REG_GET(0), 0, 8)); + return insn->next; +} + + +static inline iss_insn_t *p_extws_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, iss_get_signed_value64(REG_GET(0), 8)); + return insn->next; +} + + +static inline iss_insn_t *p_extwz_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, iss_get_field64(REG_GET(0), 0, 8)); + return insn->next; +} + + +static inline iss_insn_t *p_mac_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL3(lib_MACS_64, REG64_GET(2), REG_GET(0), REG_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_msu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL3(lib_MSUS_64, REG64_GET(2), REG_GET(0), REG_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_macu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL3(lib_MACU_64, REG64_GET(2), REG_GET(0), REG_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_msuu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL3(lib_MSUU_64, REG64_GET(2), REG_GET(0), REG_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_muls_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_MULS_64, 
REG_GET(0), REG_GET(1))); + return insn->next; +} + + +static inline iss_insn_t *p_mulu_d_exec(iss_t *iss, iss_insn_t *insn) +{ + REG64_SET(0, LIB_CALL2(lib_MULU_64, REG_GET(0), REG_GET(1))); + + return insn->next; +} + + +#endif \ No newline at end of file diff --git a/gvsoc/gvsoc/models/cpu/iss/include/types.hpp b/gvsoc/gvsoc/models/cpu/iss/include/types.hpp index f8ec5e632..f61da1d91 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/types.hpp +++ b/gvsoc/gvsoc/models/cpu/iss/include/types.hpp @@ -44,6 +44,10 @@ #error Unknown core version #endif +typedef uint64_t iss_reg64_t; + +#define PRIxFULLREG64 "16.16" PRIx64 + #if defined(ISS_WORD_64) #define ISS_OPCODE_MAX_SIZE 8 @@ -124,6 +128,7 @@ typedef enum { ISS_DECODER_ARG_FLAG_PREINC = 2, ISS_DECODER_ARG_FLAG_COMPRESSED = 4, ISS_DECODER_ARG_FLAG_FREG = 8, + ISS_DECODER_ARG_FLAG_REG64 = 16, } iss_decoder_arg_flag_e; typedef struct iss_insn_arg_s { @@ -134,6 +139,7 @@ typedef struct iss_insn_arg_s { struct { int index; iss_reg_t value; + iss_reg64_t value_64; } reg; struct { iss_sim_t value; diff --git a/gvsoc/gvsoc/models/cpu/iss/include/utils.hpp b/gvsoc/gvsoc/models/cpu/iss/include/utils.hpp index 96b39a2a6..dc30c9972 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/utils.hpp +++ b/gvsoc/gvsoc/models/cpu/iss/include/utils.hpp @@ -38,4 +38,19 @@ static inline iss_opcode_t iss_get_zext_value(iss_opcode_t val, int bits) return ((unsigned int)val) << (ISS_REG_WIDTH-bits) >> (ISS_REG_WIDTH-bits); } +static inline uint64_t iss_get_field64(iss_opcode_t val, int shift, int bits) +{ + return (val >> shift) & ((1<<bits)-1); +} + +static inline int64_t iss_get_signed_value64(iss_opcode_t val, int bits) +{ + return ((int64_t)val) << (ISS_REG_WIDTH-bits) >> (ISS_REG_WIDTH-bits); +} + +static inline uint64_t iss_get_zext_value64(iss_opcode_t val, int bits) +{ + return ((uint64_t)val) << (ISS_REG_WIDTH-bits) >> (ISS_REG_WIDTH-bits); +} + #endif diff --git a/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_gen.py b/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_gen.py index 8747f67b6..a5d7faf39 100644 --- a/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_gen.py +++ b/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_gen.py @@ -261,6 +261,10 @@ def __init__(self, id, ranges, dumpName=True): def genExtract(self, isaFile, level): dump(isaFile, level, ' pc->outReg[%d] = %s + NB_REGS;\n' % (self.id, self.ranges.gen())) +class OutReg64(OutReg): + def __init__(self, id, ranges, dumpName=True): + super(OutReg64, self).__init__(id=id, ranges=ranges, dumpName=dumpName, flags=['ISS_DECODER_ARG_FLAG_REG64']) + class OutRegComp(OutReg): def __init__(self, id, ranges, dumpName=True): super(OutRegComp, self).__init__(id=id, ranges=ranges, dumpName=dumpName, flags=['ISS_DECODER_ARG_FLAG_COMPRESSED']) @@ -313,6 +317,10 @@ def __init__(self, id, ranges, dumpName=True): def genExtract(self, isaFile, level): dump(isaFile, level, ' pc->inReg[%d] = %s + NB_REGS;\n' % (self.id, self.ranges.gen())) +class InReg64(InReg): + def __init__(self, id, ranges, dumpName=True): + super(InReg64, self).__init__(id=id, ranges=ranges, dumpName=dumpName, flags=['ISS_DECODER_ARG_FLAG_REG64']) + class InRegComp(InReg): def __init__(self, id, ranges, dumpName=True): super(InRegComp, self).__init__(id=id, ranges=ranges, dumpName=dumpName, flags=['ISS_DECODER_ARG_FLAG_COMPRESSED']) diff --git a/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_riscv_gen.py b/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_riscv_gen.py index 53ad77111..b255d24b7 100755 --- a/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_riscv_gen.py +++ b/gvsoc/gvsoc/models/cpu/iss/isa_gen/isa_riscv_gen.py @@ -94,6 +94,11 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N InReg (0, Range(15, 5)), InReg
(1, Range(20, 5)), ] + elif format == 'R64': + self.args = [ OutReg64(0, Range(7, 5)), + InReg64 (0, Range(15, 5)), + InReg64 (1, Range(20, 5)), + ] elif format == 'BITREV': self.args = [ OutReg(0, Range(7, 5)), InReg (0, Range(15, 5)), @@ -188,6 +193,12 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N InReg (0, Range(15, 5)), InReg (1, Range(20, 5)), ] + elif format == 'RRRR64': + self.args = [ OutReg64(0, Range(7, 5)), + InReg64 (2, Range(7, 5), dumpName=False), + InReg64 (0, Range(15, 5)), + InReg64 (1, Range(20, 5)), + ] elif format == 'RRRR2': self.args = [ OutReg(0, Range(7, 5)), InReg (0, Range(7, 5), dumpName=False), @@ -223,6 +234,10 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N self.args = [ OutReg(0, Range(7, 5)), InReg (0, Range(15, 5)), ] + elif format == 'R1_64': + self.args = [ OutReg64(0, Range(7, 5)), + InReg64 (0, Range(15, 5)), + ] elif format == 'RRU': self.args = [ OutReg(0, Range(7, 5)), InReg (0, Range(15, 5)), @@ -252,6 +267,12 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N InReg (1, Range(20, 5)), InReg (2, Range(25, 5)), ] + elif format == 'RR64': + self.args = [ OutReg64(0, Range(7, 5)), + InReg64 (0, Range(15, 5)), + InReg64 (1, Range(20, 5)), + InReg64 (2, Range(7, 5)), + ] elif format == 'SR': self.args = [ InReg (1, Range(20, 5)), Indirect(InReg (0, Range(15, 5)), InReg (2, Range(7, 5))), @@ -265,6 +286,11 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N InReg (0, Range(15, 5)), SignedImm(0, Range(20, 12)), ] + elif format == 'I64': + self.args = [ OutReg64(0, Range(7, 5)), + InReg64 (0, Range(15, 5)), + SignedImm(0, Range(20, 5)), + ] elif format == 'Z': self.args = [ ] @@ -1760,6 +1786,56 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N R5('p.bitrev', 'BITREV', '11000-- ----- ----- 101 ----- 0110011', mapTo="gap9_BITREV"), ]) +int64 = IsaSubset('int64', +[ + R5('add.d', 'R64', '0010000 ----- ----- 000 ----- 0110011'), + R5('sub.d', 'R64', '0110000 ----- ----- 000 ----- 0110011'), + R5('sll.d', 'R64', '0010000 ----- ----- 001 ----- 0110011'), + R5('slt.d', 'R64', '0011000 ----- ----- 010 ----- 0110011'), + R5('sltu.d', 'R64', '0011000 ----- ----- 011 ----- 0110011'), + R5('xor.d', 'R64', '0010000 ----- ----- 100 ----- 0110011'), + R5('srl.d', 'R64', '0010000 ----- ----- 101 ----- 0110011'), + R5('sra.d', 'R64', '0110000 ----- ----- 101 ----- 0110011'), + R5('or.d', 'R64', '0110000 ----- ----- 110 ----- 0110011'), + R5('and.d', 'R64', '0110000 ----- ----- 111 ----- 0110011'), + + R5('slli.d', 'I64', '0010000 ----- ----- 001 ----- 0010011'), + R5('srli.d', 'I64', '0010010 ----- ----- 101 ----- 0010011'), + R5('srai.d', 'I64', '0110010 ----- ----- 101 ----- 0010011'), + R5('addi.d', 'I64', '0010001 ----- ----- 001 ----- 0010011'), + + R5('slti.d', 'I64', '0011000 ----- ----- 010 ----- 0011011'), + R5('sltiu.d', 'I64', '0011000 ----- ----- 011 ----- 0011011'), + R5('xori.d', 'I64', '0010000 ----- ----- 100 ----- 0011011'), + R5('ori.d', 'I64', '0010000 ----- ----- 110 ----- 0011011'), + R5('andi.d', 'I64', '0010000 ----- ----- 111 ----- 0011011'), + + R5('p.abs.d', 'R1_64', '0010010 00000 ----- 000 ----- 0110011'), + R5('p.seq.d', 'RRRR64', '0011011 ----- ----- 010 ----- 0110011'), + R5('p.slet.d', 'RRRR64', '0011010 ----- ----- 010 ----- 0110011'), + R5('p.sletu.d', 'RRRR64', '0011010 ----- ----- 011 ----- 0110011'), + R5('p.sne.d', 'RRRR64', '0011011 ----- ----- 011 ----- 0110011'), 
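A quick aside on the encoding strings in the int64 table (the remaining entries continue right below): the register fields sit at the usual R-type positions, so the generator's Range(7, 5), Range(15, 5) and Range(20, 5) arguments are just rd, rs1 and rs2, while the fixed characters spell out funct7, funct3 and the opcode. A small, independent field-extraction sketch under that assumption, using the add.d row above as the example encoding:

    #include <stdint.h>
    #include <stdio.h>

    /* C equivalent of the generator's Range(first, width) bit-field extraction. */
    static uint32_t field(uint32_t insn, int first, int width)
    {
        return (insn >> first) & ((1u << width) - 1);
    }

    int main(void)
    {
        /* add.d x10, x12, x14: funct7=0010000, funct3=000, opcode=0110011,
           matching the table entry above. */
        uint32_t insn = (0x10u << 25) | (14u << 20) | (12u << 15) | (0u << 12)
                      | (10u << 7) | 0x33u;

        printf("opcode=%02x funct3=%u funct7=%02x rd=%u rs1=%u rs2=%u\n",
               field(insn, 0, 7), field(insn, 12, 3), field(insn, 25, 7),
               field(insn, 7, 5), field(insn, 15, 5), field(insn, 20, 5));
        return 0;
    }

The OutReg64/InReg64 wrappers only add ISS_DECODER_ARG_FLAG_REG64 on top of that extraction, so later stages know the decoded index names a register pair rather than a single register.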
+ R5('p.min.d', 'RRRR64', '0010010 ----- ----- 100 ----- 0110011'), + R5('p.minu.d', 'RRRR64', '0010010 ----- ----- 101 ----- 0110011'), + R5('p.max.d', 'RRRR64', '0010010 ----- ----- 110 ----- 0110011'), + R5('p.maxu.d', 'RRRR64', '0010010 ----- ----- 111 ----- 0110011'), + R5('p.cnt.d', 'R1_64', '0011010 00000 ----- 001 ----- 0110011'), + R5('p.exths.d', 'R1_64', '0110010 00000 ----- 000 ----- 0110011'), + R5('p.exthz.d', 'R1_64', '0110010 00000 ----- 001 ----- 0110011'), + R5('p.extbs.d', 'R1_64', '0110010 00000 ----- 011 ----- 0110011'), + R5('p.extbz.d', 'R1_64', '0010010 00000 ----- 100 ----- 0110011'), + R5('p.extws.d', 'R1_64', '0110010 00000 ----- 101 ----- 0110011'), + R5('p.extwz.d', 'R1_64', '0110010 00000 ----- 110 ----- 0110011'), + + R5('p.mac.d', 'RR64', '0111001 ----- ----- 000 ----- 0110011'), + R5('p.msu.d', 'RR64', '0111001 ----- ----- 001 ----- 0110011'), + R5('p.macu.d', 'RR64', '0111001 ----- ----- 010 ----- 0110011'), + R5('p.msuu.d', 'RR64', '0111001 ----- ----- 011 ----- 0110011'), + R5('p.muls.d', 'R64', '0111001 ----- ----- 100 ----- 0110011'), + R5('p.mulu.d', 'R64', '0111001 ----- ----- 101 ----- 0110011'), +]) + + parser = argparse.ArgumentParser(description='Generate ISA for RISCV') parser.add_argument("--version", dest="version", default=1, type=int, metavar="VALUE", help="Specify ISA version") @@ -1801,6 +1877,7 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N IsaDecodeTree('sfloat', [Xf16, Xf16alt, Xf8, Xfvec, Xfaux]), IsaDecodeTree('gap8', [gap8]), IsaDecodeTree('gap9', [gap9]), + IsaDecodeTree('int64', [int64]), #IsaTree('fpud', rv32d), #IsaTree('gap8', gap8), #IsaTree('priv_pulp_v2', priv_pulp_v2), @@ -1836,7 +1913,7 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N elif "mul" in insn.tags: insn.get_out_reg(0).set_latency(2) elif "mulh" in insn.tags: - insn.get_out_reg(0).set_latency(3) + insn.set_latency(5) elif "div" in insn.tags: insn.get_out_reg(0).set_latency(31) @@ -1852,7 +1929,7 @@ def __init__(self, label, format, encoding, decode=None, N=None, L=None, mapTo=N elif "mul" in insn.tags: insn.get_out_reg(0).set_latency(3) elif "mulh" in insn.tags: - insn.get_out_reg(0).set_latency(3) + insn.set_latency(5) elif "div" in insn.tags: insn.get_out_reg(0).set_latency(37) diff --git a/gvsoc/gvsoc/models/cpu/iss/src/trace.cpp b/gvsoc/gvsoc/models/cpu/iss/src/trace.cpp index 51c423b4f..98bcb6e79 100644 --- a/gvsoc/gvsoc/models/cpu/iss/src/trace.cpp +++ b/gvsoc/gvsoc/models/cpu/iss/src/trace.cpp @@ -154,7 +154,7 @@ static inline int iss_trace_dump_reg(iss_t *iss, iss_insn_t *insn, char *buff, u return sprintf(buff, "x%d", reg); } -static char *iss_trace_dump_reg_value(iss_t *iss, iss_insn_t *insn, char *buff, bool is_out, int reg, unsigned int saved_value, iss_decoder_arg_t **prev_arg, bool is_long) +static char *iss_trace_dump_reg_value(iss_t *iss, iss_insn_t *insn, char *buff, bool is_out, int reg, uint64_t saved_value, iss_decoder_arg_t *arg, iss_decoder_arg_t **prev_arg, bool is_long) { char regStr[16]; iss_trace_dump_reg(iss, insn, regStr, reg); @@ -163,7 +163,10 @@ static char *iss_trace_dump_reg_value(iss_t *iss, iss_insn_t *insn, char *buff, if (is_out) buff += sprintf(buff, "="); else buff += sprintf(buff, ":"); - buff += sprintf(buff, "%8.8x ", saved_value); + if (arg->flags & ISS_DECODER_ARG_FLAG_REG64) + buff += sprintf(buff, "%" PRIxFULLREG64 " ", saved_value); + else + buff += sprintf(buff, "%" PRIxFULLREG " ", (iss_reg_t)saved_value); return buff; } @@ -173,17 +176,17 @@ 
static char *iss_trace_dump_arg_value(iss_t *iss, iss_insn_t *insn, char *buff, { if ((dump_out && arg->type == ISS_DECODER_ARG_TYPE_OUT_REG) || (!dump_out && arg->type == ISS_DECODER_ARG_TYPE_IN_REG)) { - buff = iss_trace_dump_reg_value(iss, insn, buff, arg->type == ISS_DECODER_ARG_TYPE_OUT_REG, insn_arg->u.reg.index, saved_arg->u.reg.value, prev_arg, is_long); + buff = iss_trace_dump_reg_value(iss, insn, buff, arg->type == ISS_DECODER_ARG_TYPE_OUT_REG, insn_arg->u.reg.index, arg->flags & ISS_DECODER_ARG_FLAG_REG64 ? saved_arg->u.reg.value_64 : saved_arg->u.reg.value, arg, prev_arg, is_long); } } else if (arg->type == ISS_DECODER_ARG_TYPE_INDIRECT_IMM) { - if (!dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 0, insn_arg->u.indirect_imm.reg_index, saved_arg->u.indirect_imm.reg_value, prev_arg, is_long); + if (!dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 0, insn_arg->u.indirect_imm.reg_index, saved_arg->u.indirect_imm.reg_value, arg, prev_arg, is_long); iss_addr_t addr; if (arg->flags & ISS_DECODER_ARG_FLAG_POSTINC) { addr = saved_arg->u.indirect_imm.reg_value; - if (dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 1, insn_arg->u.indirect_imm.reg_index, addr + insn_arg->u.indirect_imm.imm, prev_arg, is_long); + if (dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 1, insn_arg->u.indirect_imm.reg_index, addr + insn_arg->u.indirect_imm.imm, arg, prev_arg, is_long); } else { @@ -193,13 +196,13 @@ static char *iss_trace_dump_arg_value(iss_t *iss, iss_insn_t *insn, char *buff, } else if (arg->type == ISS_DECODER_ARG_TYPE_INDIRECT_REG) { - if (!dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 0, insn_arg->u.indirect_reg.offset_reg_index, saved_arg->u.indirect_reg.offset_reg_value, prev_arg, is_long); - if (!dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 0, insn_arg->u.indirect_reg.base_reg_index, saved_arg->u.indirect_reg.base_reg_value, prev_arg, is_long); + if (!dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 0, insn_arg->u.indirect_reg.offset_reg_index, saved_arg->u.indirect_reg.offset_reg_value, arg, prev_arg, is_long); + if (!dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 0, insn_arg->u.indirect_reg.base_reg_index, saved_arg->u.indirect_reg.base_reg_value, arg, prev_arg, is_long); iss_addr_t addr; if (arg->flags & ISS_DECODER_ARG_FLAG_POSTINC) { addr = saved_arg->u.indirect_reg.base_reg_value; - if (dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 1, insn_arg->u.indirect_reg.base_reg_index, addr + insn_arg->u.indirect_reg.offset_reg_value, prev_arg, is_long); + if (dump_out) buff = iss_trace_dump_reg_value(iss, insn, buff, 1, insn_arg->u.indirect_reg.base_reg_index, addr + insn_arg->u.indirect_reg.offset_reg_value, arg, prev_arg, is_long); } else { @@ -374,7 +377,12 @@ static void iss_trace_save_arg(iss_t *iss, iss_insn_t *insn, iss_insn_arg_t *ins if (save_out && arg->type == ISS_DECODER_ARG_TYPE_OUT_REG || !save_out && arg->type == ISS_DECODER_ARG_TYPE_IN_REG) { - saved_arg->u.reg.value = iss_get_reg_untimed(iss, insn_arg->u.reg.index); + if (arg->flags & ISS_DECODER_ARG_FLAG_REG64) + { + saved_arg->u.reg.value_64 = iss_get_reg64_untimed(iss, insn_arg->u.reg.index); + } + else + saved_arg->u.reg.value = iss_get_reg_untimed(iss, insn_arg->u.reg.index); } } diff --git a/gvsoc/gvsoc/models/devices/testbench/Makefile b/gvsoc/gvsoc/models/devices/testbench/Makefile new file mode 100644 index 000000000..355a391c8 --- /dev/null +++ b/gvsoc/gvsoc/models/devices/testbench/Makefile @@ -0,0 +1,2 @@ 
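A note on the trace changes completed above: types.hpp defines PRIxFULLREG64 as "16.16" PRIx64, so a 64-bit register value is dumped as a full 16 hex digits while 32-bit registers keep the shorter fixed-width form, and the widened uint64_t saved_value parameter lets one call path carry either width (the REG64 flag on the argument selects the branch). A tiny formatting sketch, standalone and not the simulator code itself (the 32-bit PRIxFULLREG definition is assumed here):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PRIxFULLREG    "8.8"   PRIx32   /* assumed 32-bit counterpart */
    #define PRIxFULLREG64  "16.16" PRIx64

    int main(void)
    {
        uint32_t reg32 = 0x1234;
        uint64_t reg64 = 0x1122334455667788ULL;

        /* Mirrors the two branches added to iss_trace_dump_reg_value(). */
        printf("x5=%" PRIxFULLREG "\n", reg32);    /* x5=00001234         */
        printf("x6=%" PRIxFULLREG64 "\n", reg64);  /* x6=1122334455667788 */
        return 0;
    }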
+IMPLEMENTATIONS += devices/testbench/testbench +devices/testbench/testbench_SRCS = devices/testbench/testbench.cpp diff --git a/gvsoc/gvsoc/models/devices/testbench/testbench.cpp b/gvsoc/gvsoc/models/devices/testbench/testbench.cpp new file mode 100644 index 000000000..394a892bb --- /dev/null +++ b/gvsoc/gvsoc/models/devices/testbench/testbench.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2018 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define PI_TESTBENCH_CMD_GPIO_LOOPBACK 1 +#define PI_TESTBENCH_MAX_REQ_SIZE 256 + + +typedef struct { + uint8_t input; + uint8_t output; + uint8_t enabled; +} pi_testbench_req_t; + + +typedef enum { + STATE_WAITING_CMD, + STATE_WAITING_REQUEST +} testbench_state_e; + +class Gpio +{ +public: + vp::wire_slave itf; + + int loopback = -1; + uint32_t value; +}; + + +class Testbench : public vp::component +{ +public: + Testbench(js::config *config); + + int build(); + + void uart_tx_sampling(); + +private: + + void uart_start_tx_sampling(int baudrate); + void uart_stop_tx_sampling(); + void handle_received_byte(uint8_t byte); + + void handle_gpio_loopback(); + + static void uart_sync(void *__this, int data); + static void gpio_sync(void *__this, int value, int id); + + static void uart_sampling_handler(void *__this, vp::clock_event *event); + + testbench_state_e state; + string ctrl_type; + uint64_t period; + bool uart_tx_wait_start = true; + bool uart_tx_wait_stop = false; + int uart_current_tx; + uint64_t uart_baudrate; + int uart_nb_bits; + bool uart_sampling_tx = false; + uint8_t uart_byte; + int nb_gpio; + int req_size; + int current_req_size; + uint8_t req[PI_TESTBENCH_MAX_REQ_SIZE]; + uint8_t cmd; + + std::vector gpios; + vp::uart_slave uart_in; + + vp::trace trace; + + vp::clock_event *uart_sampling_event; + vp::clock_master clock_cfg; +}; + +Testbench::Testbench(js::config *config) + : vp::component(config) +{ +} + + +int Testbench::build() +{ + traces.new_trace("trace", &trace, vp::DEBUG); + + this->new_master_port("clock_cfg", &clock_cfg); + + this->ctrl_type = get_js_config()->get("ctrl_type")->get_str(); + this->nb_gpio = get_js_config()->get("nb_gpio")->get_int(); + + if (this->ctrl_type == "uart") + { + this->uart_baudrate = get_js_config()->get("uart_baudrate")->get_int(); + this->uart_in.set_sync_meth(&Testbench::uart_sync); + this->new_slave_port("ctrl", &this->uart_in); + this->uart_sampling_event = event_new(Testbench::uart_sampling_handler); + } + + this->gpios.resize(this->nb_gpio); + + for (int i=0; inb_gpio; i++) + { + this->gpios[i].itf.set_sync_meth_muxed(&Testbench::gpio_sync, i); + this->new_slave_port("gpio" + std::to_string(i), &this->gpios[i].itf); + } + + this->state = STATE_WAITING_CMD; + + return 0; +} + + +void Testbench::uart_tx_sampling() +{ + 
this->trace.msg(vp::trace::LEVEL_TRACE, "Sampling bit (value: %d)\n", uart_current_tx); + + if (uart_tx_wait_stop) + { + if (uart_current_tx == 1) + { + this->trace.msg(vp::trace::LEVEL_TRACE, "Received stop bit\n", uart_current_tx); + uart_tx_wait_start = true; + uart_tx_wait_stop = false; + this->uart_stop_tx_sampling(); + } + } + else + { + this->trace.msg(vp::trace::LEVEL_TRACE, "Received data bit (data: %d)\n", uart_current_tx); + uart_byte = (uart_byte >> 1) | (uart_current_tx << 7); + uart_nb_bits++; + if (uart_nb_bits == 8) + { + this->trace.msg(vp::trace::LEVEL_DEBUG, "Sampled TX byte (value: 0x%x)\n", uart_byte); + this->trace.msg(vp::trace::LEVEL_TRACE, "Waiting for stop bit\n"); + uart_tx_wait_stop = true; + this->handle_received_byte(uart_byte); + } + } +} + + +void Testbench::uart_sampling_handler(void *__this, vp::clock_event *event) +{ + Testbench *_this = (Testbench *)__this; + + _this->uart_tx_sampling(); + + if (_this->uart_sampling_tx) + { + _this->event_enqueue(_this->uart_sampling_event, 2); + } +} + + +void Testbench::uart_sync(void *__this, int data) +{ + Testbench *_this = (Testbench *)__this; + + _this->trace.msg(vp::trace::LEVEL_TRACE, "UART sync (value: %d, waiting_start: %d)\n", data, _this->uart_tx_wait_start); + + _this->uart_current_tx = data; + + if (_this->uart_tx_wait_start && data == 0) + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Received start bit\n"); + + _this->uart_start_tx_sampling(_this->uart_baudrate); + _this->uart_tx_wait_start = false; + _this->uart_nb_bits = 0; + } +} + + +void Testbench::gpio_sync(void *__this, int value, int id) +{ + Testbench *_this = (Testbench *)__this; + Gpio *gpio = &_this->gpios[id]; + + _this->trace.msg(vp::trace::LEVEL_DEBUG, "Received GPIO sync (id: %d)\n", id); + + gpio->value = value; + + if (gpio->loopback != -1) + { + _this->trace.msg(vp::trace::LEVEL_DEBUG, "Generating gpio on loopback (id: %d)\n", gpio->loopback); + _this->gpios[gpio->loopback].itf.sync(value); + } +} + + +void Testbench::uart_start_tx_sampling(int baudrate) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Start TX sampling (baudrate: %d)\n", this->uart_baudrate); + + // We set the frequency to twice the baudrate to be able sampling in the + // middle of the cycle + this->clock_cfg.set_frequency(this->uart_baudrate*2); + + this->uart_sampling_tx = 1; + + this->event_reenqueue(this->uart_sampling_event, 3); +} + + +void Testbench::uart_stop_tx_sampling(void) +{ + this->uart_sampling_tx = 0; + + if (this->uart_sampling_event->is_enqueued()) + { + this->event_cancel(this->uart_sampling_event); + } +} + + +void Testbench::handle_received_byte(uint8_t byte) +{ + if (this->state == STATE_WAITING_CMD) + { + this->cmd = byte; + + switch (byte) { + case PI_TESTBENCH_CMD_GPIO_LOOPBACK: + this->state = STATE_WAITING_REQUEST; + this->req_size = sizeof(pi_testbench_req_t); + this->current_req_size = 0; + break; + } + } + else if (this->state == STATE_WAITING_REQUEST) + { + this->req[this->current_req_size++] = byte; + if (this->current_req_size == this->req_size) + { + this->state = STATE_WAITING_CMD; + + switch (this->cmd) { + case PI_TESTBENCH_CMD_GPIO_LOOPBACK: + this->handle_gpio_loopback(); + break; + } + + } + } +} + + +void Testbench::handle_gpio_loopback() +{ + pi_testbench_req_t *req = (pi_testbench_req_t *)this->req; + + this->trace.msg(vp::trace::LEVEL_INFO, "Handling GPIO loopback (enabled: %d, output: %d, intput: %d)\n", req->enabled, req->output, req->input); + + if (req->enabled) + { + this->gpios[req->output].loopback = req->input; + 
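An aside on the command protocol implemented by handle_received_byte() above (its GPIO-loopback handler concludes just below): the testbench first consumes a single command byte, then, for PI_TESTBENCH_CMD_GPIO_LOOPBACK, a fixed-size pi_testbench_req_t payload, byte by byte, before acting on it. A hedged sketch of what the sending side could look like, assuming only the struct layout shown in this file; the transport hook is hypothetical and would wrap the DUT's UART write on a real target:

    #include <stdint.h>

    #define PI_TESTBENCH_CMD_GPIO_LOOPBACK 1

    typedef struct {
        uint8_t input;      /* GPIO the testbench will drive back */
        uint8_t output;     /* GPIO the testbench observes        */
        uint8_t enabled;
    } pi_testbench_req_t;

    /* Hypothetical transport: on the target this would be a UART write wrapper. */
    extern void testbench_uart_send(const void *buf, uint32_t len);

    /* Ask the testbench to mirror whatever the DUT drives on 'output' onto 'input'. */
    static void testbench_gpio_loopback(uint8_t output, uint8_t input, uint8_t enable)
    {
        uint8_t cmd = PI_TESTBENCH_CMD_GPIO_LOOPBACK;
        pi_testbench_req_t req = { .input = input, .output = output, .enabled = enable };

        testbench_uart_send(&cmd, 1);            /* consumed in STATE_WAITING_CMD     */
        testbench_uart_send(&req, sizeof(req));  /* consumed in STATE_WAITING_REQUEST */
    }

The model above just copies raw bytes into its request buffer, so matching struct packing and byte order between the two sides is up to the real harness.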
this->gpios[req->input].itf.sync(this->gpios[req->output].value); + } + else + { + this->gpios[req->output].loopback = -1; + } +} + + +extern "C" vp::component *vp_constructor(js::config *config) +{ + return new Testbench(config); +} diff --git a/gvsoc/gvsoc/models/utils/dpi_chip_wrapper.cpp b/gvsoc/gvsoc/models/utils/dpi_chip_wrapper.cpp index f548cb165..ed9b8d138 100644 --- a/gvsoc/gvsoc/models/utils/dpi_chip_wrapper.cpp +++ b/gvsoc/gvsoc/models/utils/dpi_chip_wrapper.cpp @@ -30,6 +30,7 @@ #include #include #include +#include using namespace std; @@ -75,6 +76,7 @@ class Qspim_group : public Pad_group int *cs; }; + class I2s_group : public Pad_group { public: @@ -94,6 +96,20 @@ class I2s_group : public Pad_group int sdo; }; + +class Gpio_group : public Pad_group +{ +public: + Gpio_group(dpi_chip_wrapper *top, std::string name) : Pad_group(top, name) {} + void edge(Dpi_chip_wrapper_callback *callback, int64_t timestamp, int data); + void rx_edge(int data); + bool bind(std::string pad_name, Dpi_chip_wrapper_callback *callback); + vp::trace trace; + vp::wire_master master; + Dpi_chip_wrapper_callback *rx_callback; +}; + + class Uart_group : public Pad_group { public: @@ -108,6 +124,7 @@ class Uart_group : public Pad_group Dpi_chip_wrapper_callback *rx_callback; }; + class Hyper_group : public Pad_group { public: @@ -137,6 +154,8 @@ class dpi_chip_wrapper : public vp::component static void uart_sync(void *__this, int data, int id); static void hyper_sync_cycle(void *__this, int data, int id); static void i2s_slave_edge(void *__this, int sck, int ws, int sd, int id); + static void gpio_rx_edge(void *__this, int data, int id); + static void gpio_sync(void *__this, int data, int id); vp::trace trace; @@ -241,6 +260,16 @@ int dpi_chip_wrapper::build() traces.new_trace_event(name + "/rx", &group->rx_trace, 1); nb_itf++; } + else if (type == "gpio") + { + Gpio_group *group = new Gpio_group(this, name); + new_master_port(name, &group->master); + traces.new_trace(name, &group->trace, vp::WARNING); + + group->master.set_sync_meth_muxed(&dpi_chip_wrapper::gpio_rx_edge, nb_itf); + this->groups.push_back(group); + nb_itf++; + } else if (type == "i2s") { I2s_group *group = new I2s_group(this, name); @@ -260,6 +289,7 @@ int dpi_chip_wrapper::build() return 0; } + void dpi_chip_wrapper::qspim_sync(void *__this, int data_0, int data_1, int data_2, int data_3, int mask, int id) { #if 0 @@ -279,6 +309,7 @@ void dpi_chip_wrapper::qspim_sync(void *__this, int data_0, int data_1, int data #endif } + void dpi_chip_wrapper::uart_rx_edge(void *__this, int data, int id) { dpi_chip_wrapper *_this = (dpi_chip_wrapper *)__this; @@ -286,6 +317,15 @@ void dpi_chip_wrapper::uart_rx_edge(void *__this, int data, int id) group->rx_edge(data); } + +void dpi_chip_wrapper::gpio_rx_edge(void *__this, int data, int id) +{ + dpi_chip_wrapper *_this = (dpi_chip_wrapper *)__this; + Gpio_group *group = static_cast(_this->groups[id]); + group->rx_edge(data); +} + + void dpi_chip_wrapper::i2s_slave_edge(void *__this, int sck, int ws, int sd, int id) { dpi_chip_wrapper *_this = (dpi_chip_wrapper *)__this; @@ -293,6 +333,7 @@ void dpi_chip_wrapper::i2s_slave_edge(void *__this, int sck, int ws, int sd, int group->rx_edge(sck, ws, sd); } + void dpi_chip_wrapper::hyper_sync_cycle(void *__this, int data, int id) { #if 0 @@ -513,10 +554,40 @@ void Uart_group::rx_edge(int data) dpi_external_edge(this->rx_callback->handle, data); } + +/* + * GPIO + */ + +bool Gpio_group::bind(std::string pad_name, Dpi_chip_wrapper_callback *callback) +{ + 
this->rx_callback = callback; + return false; +} + + +void Gpio_group::edge(Dpi_chip_wrapper_callback *callback, int64_t timestamp, int data) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "GPIO edge (timestamp: %ld, name: %s, value: %d)\n", timestamp, callback->name.c_str(), data); + + if (this->master.is_bound()) + { + this->master.sync(data); + } +} + + +void Gpio_group::rx_edge(int data) +{ + dpi_external_edge(this->rx_callback->handle, data); +} + + void dpi_chip_wrapper::start() { } + extern "C" vp::component *vp_constructor(js::config *config) { return new dpi_chip_wrapper(config); diff --git a/libs/gap_lib/Makefile b/libs/gap_lib/Makefile index f860d85c1..d4bf72207 100644 --- a/libs/gap_lib/Makefile +++ b/libs/gap_lib/Makefile @@ -2,6 +2,8 @@ SRC = \ jpeg/dct.c jpeg/jpeg_constants.c jpeg/jpeg_encoder.c jpeg/cluster.c \ img_io/ImgIO.c +CFLAGS+= -I$(TILER_PATH)/include -I$(CURDIR)/include/gaplib + ifdef GAP_SDK_HOME include $(CURDIR)/rules/gap_sdk.mk else diff --git a/libs/gap_lib/img_io/ImgIO.c b/libs/gap_lib/img_io/ImgIO.c index 8eed6ae7a..011f5a57e 100644 --- a/libs/gap_lib/img_io/ImgIO.c +++ b/libs/gap_lib/img_io/ImgIO.c @@ -1,417 +1,448 @@ -/* - * Copyright 2019 GreenWaves Technologies, SAS - * * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
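Some context for the ImgIO.c rewrite whose removed version starts here and whose replacement follows further down: both the old and the new reader parse the same binary PGM/PPM header, that is a 'P5' (grayscale) or 'P6' (RGB) magic, ASCII width and height, optional '#' comment lines, and a maximum value that must be 255, with the pixel payload starting right after the final newline (which is why both record the header length and seek past it). A minimal, independent example of producing such a header:

    #include <stdio.h>

    /* Emit a binary PGM/PPM header: "P5\n<W> <H>\n255\n", or "P6..." for RGB. */
    static int ppm_header(char *buf, int size, unsigned w, unsigned h, int is_rgb)
    {
        return snprintf(buf, size, "P%c\n%u %u\n255\n", is_rgb ? '6' : '5', w, h);
    }

    int main(void)
    {
        char hdr[64];
        int len = ppm_header(hdr, sizeof(hdr), 324, 244, 0);
        printf("header (%d bytes):\n%s", len, hdr);
        return 0;
    }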
- */ - -#include -#include -#include -#include -#include "pmsis.h" -#include "gaplib/ImgIO.h" -#include "bsp/bsp.h" -#include "bsp/fs.h" - -#define PPM_HEADER 40 - -#define Max(a, b) (((a)>(b))?(a):(b)) -#define Min(a, b) (((a)<(b))?(a):(b)) - -#define ALIGN(Value, Size) (((Value)&((1<<(Size))-1))?((((Value)>>(Size))+1)<<(Size)):(Value)) - -#define CHUNK_SIZE 8192 - -#define PRINTF printf - -unsigned char *img_rgb888; - -static void progress_bar(char * OutString, int n, int tot) -{ - int tot_chars = 30; - PRINTF("%s",OutString); - PRINTF(" ["); - int chars = (n*tot_chars)/tot; - - for (int i=0; i= '0') && ((C) <= '9')) - unsigned int Val, Ind = 0; - - if ((ImgIn[0] == 'P') && (ImgIn[1] == '5') && (ImgIn[2] == '\n')) - { - *IsRGB = 0; - } - else if ((ImgIn[0] == 'P') && (ImgIn[1] == '6') && (ImgIn[2] == '\n')) - { - *IsRGB = 1; - } - else - { - return 0; - } - Ind = 3; - - Ind=SkipComment(ImgIn, Ind); - while (!IS_DIGIT(ImgIn[Ind])) - { - Ind++; - } - Val = 0; - while (IS_DIGIT(ImgIn[Ind])) - { - Val = Val*10 + (ImgIn[Ind] - 0x30); - Ind++; - } - *W = Val; - Ind=SkipComment(ImgIn, Ind); - while (!IS_DIGIT(ImgIn[Ind])) - { - Ind++; - } - Val = 0; - while (IS_DIGIT(ImgIn[Ind])) - { - Val = Val*10 + (ImgIn[Ind] - 0x30); - Ind++; - } - *H = Val; - - Ind=SkipComment(ImgIn, Ind); - while (!IS_DIGIT(ImgIn[Ind])) - { - Ind++; - } - Val = 0; - while (IS_DIGIT(ImgIn[Ind])) - { - Val = Val*10 + (ImgIn[Ind] - 0x30); - Ind++; - } - if (Val != 255) - { - return 0; - } - while (ImgIn[Ind] != 0xA) - { - Ind++; - } - - return (Ind+1); -#undef IS_DIGIT -} - -static unsigned int GetInputImageInfos(char *Name, unsigned int *W, unsigned int *H, unsigned int *IsRGB, unsigned int *HeaderSize) -{ - struct pi_fs_conf conf; - pi_fs_conf_init(&conf); - struct pi_device fs; - - conf.type = PI_FS_HOST; - - pi_open_from_conf(&fs, &conf); - - if (pi_fs_mount(&fs)) - return -2; - - static int Debug = 0; - void *File = pi_fs_open(&fs, Name, PI_FS_FLAGS_READ); - unsigned int Err = 0; - - *W = 0; *H = 0; *IsRGB = 0; *HeaderSize = 0; - - if (Debug) - { - PRINTF("File: %s open: %s\n", Name, File?"Ok":"Failed"); - } - if (File) - { - unsigned char *Header = (unsigned char *) pmsis_l2_malloc(256); - Err |= (Header == NULL); - if (Err) - { - return Err; - } - if (pi_fs_read(File,Header, 256) == 256) - { - unsigned int i; - *HeaderSize = ReadPPMHeader(Header, W, H, IsRGB); - if (Debug) - { - PRINTF("Image %s: [W: %d, H: %d] %s, HeaderSize: %d\n", Name, *W, *H, *IsRGB?"Color":"Gray", *HeaderSize); - for (i=0; i<*HeaderSize; i++) - { - PRINTF("%c", Header[i]); - } - PRINTF("\n"); - } - } - else - { - Err = 2; - } - pmsis_l2_malloc_free(Header, 256); - pi_fs_close(File); - pi_fs_unmount(&fs); - } - return Err; -} - -unsigned char *ReadImageFromFile(char *ImageName, unsigned int *W, unsigned int *H, unsigned char *InBuffer, unsigned int BuffSize) -{ - void *File = NULL; - unsigned int IsRGB, HeaderSize, Size, AlignedSize, ReadSize=0; - unsigned char *ImagePtr = 0; - int Allocated = 0; - - struct pi_fs_conf conf; - pi_fs_conf_init(&conf); - struct pi_device fs; - - conf.type = PI_FS_HOST; - - pi_open_from_conf(&fs, &conf); - - if (pi_fs_mount(&fs)) - return NULL; - - if (GetInputImageInfos(ImageName, W, H, &IsRGB, &HeaderSize)) - { - PRINTF("Failed to get input images infos, %s\n", ImageName); goto Fail; - } - if (IsRGB) - { - PRINTF("Only Gray levels supported, found RGB\n"); goto Fail; - } - File = pi_fs_open(&fs, ImageName, PI_FS_FLAGS_READ); - if (File == 0) - { - PRINTF("Failed to open file, %s\n", ImageName); goto Fail; - } - 
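One more aside before the pixel-format code that follows: the removed write path below and the new ReadMultiChannelImageRGB565() later in this file both shuffle between 24-bit RGB888 and 16-bit RGB565 (5 red, 6 green, 5 blue bits, most significant bits kept). A compact pack/unpack pair for reference, assuming the same bit layout as the gap_lib code:

    #include <stdint.h>

    /* RGB888 -> RGB565: keep the top 5/6/5 bits of each channel. */
    static uint16_t rgb565_pack(uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
    }

    /* RGB565 -> RGB888: shift each field back up; the low bits stay zero. */
    static void rgb565_unpack(uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        *r = (uint8_t)((p >> 11) << 3);
        *g = (uint8_t)(((p >> 5) & 0x3F) << 2);
        *b = (uint8_t)((p & 0x1F) << 3);
    }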
Size = (*W)*(*H)*(IsRGB?3:1); - if (InBuffer && (BuffSize >= Size)) - { - AlignedSize = Size; - ImagePtr = InBuffer; - } - else - { - Allocated = 1; - AlignedSize = ALIGN(Size, 2); - ImagePtr = (unsigned char *) pmsis_l2_malloc(AlignedSize); - } - if (ImagePtr == 0) - { - PRINTF("Failed to allocate %d bytes for input image\n", AlignedSize); goto Fail; - } - pi_fs_seek(File,HeaderSize); - { - unsigned char *TargetImg = ImagePtr; - unsigned int RemainSize = AlignedSize; - - while (RemainSize > 0) - { - unsigned int Chunk = Min(4096, RemainSize); - unsigned R = pi_fs_read(File,TargetImg, Chunk); - ReadSize+=R; - if (R!=Chunk) break; - TargetImg += Chunk; RemainSize -= Chunk; - } - } - if (AlignedSize!=ReadSize) - { - PRINTF("Error, expects %d bytes but got %d\n", AlignedSize, ReadSize); goto Fail; - } - pi_fs_close(File); - pi_fs_unmount(&fs); - PRINTF("Image %s, [W: %d, H: %d], Gray, Size: %d bytes, Loaded sucessfully\n", ImageName, *W, *H, AlignedSize); - - return (ImagePtr); -Fail: - if (ImagePtr && Allocated) - { - pmsis_l2_malloc_free(ImagePtr, AlignedSize); - } - pi_fs_close(File); - pi_fs_unmount(&fs); - PRINTF("Failed to load image %s from flash\n", ImageName); - return 0; -} - -static void WritePPMHeader(void *FD, unsigned int W, unsigned int H, unsigned char imgFormat) -{ - // BYPASS mode, no need any header - if (imgFormat == BYPASS_IO) - return ; - - unsigned int Ind = 0, x, i, L; - unsigned char *Buffer = (unsigned char *) pmsis_l2_malloc(PPM_HEADER * sizeof(unsigned char)); - - /* P5* */ - Buffer[Ind++] = 0x50; // P - if (imgFormat == GRAY_SCALE_IO) Buffer[Ind++] = 0x35; // 5 - else Buffer[Ind++] = 0x36; // 6 - Buffer[Ind++] = 0xA; // - - /* W */ - x = W; L=0; - while (x>0) - { - x = x/10; - L++; - } - x = W; i = 1; - while (x>0) - { - Buffer[Ind+L-i] = 0x30 + (x%10); - i++; - x=x/10; - } - Ind += L; - Buffer[Ind++] = 0x20; - - /* H */ - x = H; L=0; - while (x>0) - { - x = x/10; - L++; - } - x = H; i = 1; - while (x>0) - { - Buffer[Ind+L-i] = 0x30 + (x%10); - i++; - x=x/10; - } - Ind += L; - Buffer[Ind++] = 0xA; - - /* 255 */ - Buffer[Ind++] = 0x32; - Buffer[Ind++] = 0x35; - Buffer[Ind++] = 0x35; - Buffer[Ind++] = 0xA; - - for (unsigned int a=0; a> 11); // 5 - green = (unsigned short)((pixel & 0x07E0) >> 5); // 6 - blue = (unsigned short)(pixel & 0x001F); // 5 - - output[ind] = red << 3; /* red */ - output[ind+1] = green << 2; /* green */ - output[ind+2] = blue << 3; /* blue */ - ind += 3; - } -} - -int WriteImageToFile(char *ImageName, unsigned int W, unsigned int H, unsigned char PixelSize, unsigned char *OutBuffer, unsigned char imgFormat) -{ - struct pi_fs_conf conf; - pi_fs_conf_init(&conf); - struct pi_device fs; - - conf.type = PI_FS_HOST; - - pi_open_from_conf(&fs, &conf); - - if (pi_fs_mount(&fs)) - return 0; - - void *File = pi_fs_open(&fs, ImageName, PI_FS_FLAGS_WRITE); - - int ret = 0; - WritePPMHeader(File,W,H, imgFormat); - - if(imgFormat == RGB565_IO) - { - unsigned int rgb888_size = (CHUNK_SIZE/2)*3; // size of 888 image in byte - img_rgb888 = (unsigned char *) pmsis_l2_malloc(rgb888_size); - - int steps = (W*H*PixelSize) / CHUNK_SIZE; // convert and fs write times - - for(int i=0;i(b))?(a):(b)) +#define Min(a, b) (((a)<(b))?(a):(b)) + +#define ALIGN(Value, Size) (((Value)&((1<<(Size))-1))?((((Value)>>(Size))+1)<<(Size)):(Value)) + +#define CHUNK_SIZE 8192 + +unsigned char *img_rgb888; + + +static void progress_bar(char * OutString, int n, int tot) +{ + int tot_chars = 30; + printf("%s",OutString); + printf(" ["); + int chars = (n*tot_chars)/tot; + + for(int 
i=0;i= '0') && ((C) <= '9')) + unsigned int val = 0; + SkipCommentAndWhiteSpace(pImg, buf_len, i); + while (*i < buf_len && !IS_DIGIT(pImg[*i])) { + (*i)++; + } + while (*i < buf_len && IS_DIGIT(pImg[*i])) { + val = val * 10 + (pImg[*i] - 0x30); + (*i)++; + } + return val; +#undef IS_DIGIT +} + +static int ReadPPMHeader(unsigned char *ImgIn, unsigned int *W, unsigned int *H, unsigned int *BytesPerPixel, unsigned int *HeaderLen, int buf_len) +{ + *W = *H = *BytesPerPixel = *HeaderLen = 0; + + if (ImgIn[0] == 'P' && ImgIn[1] == '5' && ImgIn[2] == '\n') *BytesPerPixel = 1; + else if (ImgIn[0] == 'P' && ImgIn[1] == '6' && ImgIn[2] == '\n') *BytesPerPixel = 3; + else return 1; + + int i = 3; + + *W = ReadValue(ImgIn, buf_len, &i); + *H = ReadValue(ImgIn, buf_len, &i); + unsigned int Val = ReadValue(ImgIn, buf_len, &i); + + if (Val != 255) return 1; + + while (ImgIn[i++] != 0xA) {}; + *HeaderLen = i; + return 0; +} + +static int GetInputImageInfos(char *Name, unsigned int *W, unsigned int *H, unsigned int *BytesPerPixel, unsigned int *HeaderSize) +{ + *W = 0; *H = 0; *BytesPerPixel = 0; *HeaderSize = 0; + switch_fs_t fs; + __FS_INIT(fs); + switch_file_t File = __OPEN_READ(fs, Name); + + if (!File) { + printf("Unable to open file %s\n", Name); + return 1; + } + + unsigned int Err = 0; + unsigned char *Header = (unsigned char *) gap_allocL2(256); + Err |= (Header == 0); + if (__READ(File, Header, 256) == 256) { + unsigned int i; + if (ReadPPMHeader(Header, W, H, BytesPerPixel, HeaderSize, 256)) { + printf("Unable to load header %s", Name); + Err = 1; + } else { + printf("Image %s: [W: %d, H: %d] Bytes per pixel %d, HeaderSize: %d\n", Name, *W, *H, *BytesPerPixel, *HeaderSize); + for (i=0; i<*HeaderSize;i++) printf("%c", Header[i]); + printf("\n"); + } + } else { + printf("Unable to read header %s", Name); + Err = 1; + } + gap_freeL2(Header, 256); + __CLOSE(File); + __FS_DEINIT(fs); + return Err; +} + +static int ReadMultiChannelImageRGB565(switch_file_t File, unsigned short * InBuffer, int W, int H) +{ + unsigned int RowSize = W*3; + unsigned char InputBuf[RowSize]; + unsigned short * pInBuffer = InBuffer; + + for (int CurRow=0; CurRow < H; CurRow++) { + int RemainBytes = RowSize; + unsigned char *pInpBuf = InputBuf; + while (RemainBytes > 0) { + __int_ssize_t len = __READ(File, pInpBuf, RemainBytes); + if (!len) return 1; + RemainBytes -= len; + pInpBuf += len; + } + for (int j=0, i=0; i < W; i++) { + pInBuffer[W * CurRow + i] = ((((uint16_t)InputBuf[j]&0xf8)<<8)|(((uint16_t)InputBuf[j+1]&0xfc)<<3)|(((uint16_t)InputBuf[j+2]&0xf8)>>3)); + j+=3; + } + } + return 0; +} + +static int ReadMultiChannelImageTranspose2CHW(switch_file_t File, signed char * InBuffer, int W, int H, int BytesPerPixel) +{ + unsigned int RowSize = W*BytesPerPixel, ChannelSize = W * H; + unsigned char InputBuf[RowSize]; + signed char * pInBuffer = InBuffer; + + for (int CurRow=0; CurRow < H; CurRow++) { + int RemainBytes = RowSize; + unsigned char *pInpBuf = InputBuf; + while (RemainBytes > 0) { + __int_ssize_t len = __READ(File, pInpBuf, RemainBytes); + if (!len) return 1; + RemainBytes -= len; + pInpBuf += len; + } + for (int i=0; i < W; i++) { + for (int j=0; j < BytesPerPixel; j++) { + pInBuffer[ChannelSize * j + W * CurRow + i] = InputBuf[i * BytesPerPixel + j]; + } + } + } + return 0; +} + +static int ReadMultiChannelImage(switch_file_t File, signed char * InBuffer, int W, int H, int BytesPerPixel) +{ + unsigned int RowSize = W*BytesPerPixel, ChannelSize = W * H; + unsigned char InputBuf[RowSize]; + signed char * 
pInBuffer = InBuffer; + + for (int CurRow=0; CurRow < H; CurRow++) { + int RemainBytes = RowSize; + unsigned char *pInpBuf = InputBuf; + while (RemainBytes > 0) { + __int_ssize_t len = __READ(File, pInpBuf, RemainBytes); + if (!len) return 1; + RemainBytes -= len; + pInpBuf += len; + } + for (int i=0; i < W; i++) { + for (int j=0; j < BytesPerPixel; j++) { + pInBuffer[RowSize * CurRow + i * BytesPerPixel + j] = InputBuf[i * BytesPerPixel + j]; + } + } + } + return 0; +} + +static int ReadMultiChannelImageShortTranspose2CHW(switch_file_t File, short int * InBuffer, int W, int H, int BytesPerPixel) +{ + unsigned int RowSize = W*BytesPerPixel, ChannelSize = W * H; + unsigned char InputBuf[RowSize]; + short int * pInBuffer = InBuffer; + + for (int CurRow=0; CurRow < H; CurRow++) { + int RemainBytes = RowSize; + unsigned char *pInpBuf = InputBuf; + while (RemainBytes > 0) { + __int_ssize_t len = __READ(File, pInpBuf, RemainBytes); + if (!len) return 1; + RemainBytes -= len; + pInpBuf += len; + } + for (int i=0; i < W; i++) { + for (int j=0; j < BytesPerPixel; j++) { + pInBuffer[ChannelSize * j + W * CurRow + i] = (short int) (InputBuf[i * BytesPerPixel + j]); + } + } + } + return 0; +} + +static int ReadMultiChannelImageShort(switch_file_t File, short int * InBuffer, int W, int H, int BytesPerPixel) +{ + unsigned int RowSize = W*BytesPerPixel, ChannelSize = W * H; + unsigned char InputBuf[RowSize]; + short int * pInBuffer = InBuffer; + + for (int CurRow=0; CurRow < H; CurRow++) { + int RemainBytes = RowSize; + unsigned char *pInpBuf = InputBuf; + while (RemainBytes > 0) { + __int_ssize_t len = __READ(File, pInpBuf, RemainBytes); + if (!len) return 1; + RemainBytes -= len; + pInpBuf += len; + } + for (int i=0; i < W; i++) { + for (int j=0; j < BytesPerPixel; j++) { + pInBuffer[RowSize * CurRow + i * BytesPerPixel + j] = (short int) (InputBuf[i * BytesPerPixel + j]); + } + } + } + return 0; +} + +int ReadImageFromFile(char *ImageName, unsigned int DesiredW, unsigned int DesiredH, unsigned int DesiredBytesPerPixel, void *InBuffer, unsigned int BuffSize, img_io_out_t out_type, int Transpose2CHW) +{ + switch_file_t File = (switch_file_t) 0; + unsigned int BytesPerPixel, W, H, HeaderSize, Size, ReadSize=0; + + if (GetInputImageInfos(ImageName, &W, &H, &BytesPerPixel, &HeaderSize)) { + printf("Failed to get input images infos, %s\n", ImageName); goto Fail; + } + if (BytesPerPixel != DesiredBytesPerPixel) { + printf("Expecting %d bytes per pixel image, %s\n", BytesPerPixel, ImageName); goto Fail; + } + if (DesiredH != H || DesiredW != W) { + printf("Expecting [%dx%d] image, got [%dx%d] %s\n", DesiredW, DesiredH, W, H, ImageName); goto Fail; + } + switch_fs_t fs; + __FS_INIT(fs); + File = __OPEN_READ(fs, ImageName); + if (File == 0) { + printf("Failed to open file, %s\n", ImageName); goto Fail; + } + + Size = W*H*BytesPerPixel; + if (out_type == IMGIO_OUTPUT_RGB565) { + if (BuffSize < W*H*2) { + printf("Buffer is too small, %s\n", ImageName); goto Fail; + } + } else { + if (BuffSize < Size) { + printf("Buffer is too small, %s\n", ImageName); goto Fail; + } + } + __SEEK(File, HeaderSize); + int res; + switch (out_type) { + case IMGIO_OUTPUT_CHAR: + if (Transpose2CHW){ + res = ReadMultiChannelImageTranspose2CHW(File, (signed char *)InBuffer, W, H, BytesPerPixel); + } else { + res = ReadMultiChannelImage(File, (signed char *)InBuffer, W, H, BytesPerPixel); + } + break; + case IMGIO_OUTPUT_SHORT: + if (Transpose2CHW){ + res = ReadMultiChannelImageShortTranspose2CHW(File, (short int *)InBuffer, W, H, 
BytesPerPixel); + } else { + res = ReadMultiChannelImageShort(File, (short int *)InBuffer, W, H, BytesPerPixel); + } + break; + case IMGIO_OUTPUT_RGB565: + res = ReadMultiChannelImageRGB565(File, (unsigned short *)InBuffer, W, H); + break; + default: + res = 1; + } + if (res) { + printf("Input ended unexpectedly or bad format, %s\n", ImageName); goto Fail; + } + __CLOSE(File); + __FS_DEINIT(fs); + printf("Image %s, [W: %d, H: %d], Bytes per pixel %d, Size: %d bytes, Loaded successfully\n", ImageName, W, H, BytesPerPixel, Size); + + return 0; +Fail: + __CLOSE(File); + __FS_DEINIT(fs); + printf("Failed to load image %s from flash\n", ImageName); + return 1; +} + +static void WritePPMHeader(void *FD, unsigned int W, unsigned int H, unsigned char imgFormat) +{ + // BYPASS mode, no need any header + if (imgFormat == BYPASS_IO) + return ; + + unsigned int Ind = 0, x, i, L; + unsigned char *Buffer = (unsigned char *) gap_allocL2(PPM_HEADER * sizeof(unsigned char)); + + /* P5* */ + Buffer[Ind++] = 0x50; // P + if (imgFormat == GRAY_SCALE_IO) Buffer[Ind++] = 0x35; // 5 + else Buffer[Ind++] = 0x36; // 6 + Buffer[Ind++] = 0xA; // + + /* W */ + x = W; L=0; + while (x>0) + { + x = x/10; + L++; + } + x = W; i = 1; + while (x>0) + { + Buffer[Ind+L-i] = 0x30 + (x%10); + i++; + x=x/10; + } + Ind += L; + Buffer[Ind++] = 0x20; + + /* H */ + x = H; L=0; + while (x>0) + { + x = x/10; + L++; + } + x = H; i = 1; + while (x>0) + { + Buffer[Ind+L-i] = 0x30 + (x%10); + i++; + x=x/10; + } + Ind += L; + Buffer[Ind++] = 0xA; + + /* 255 */ + Buffer[Ind++] = 0x32; + Buffer[Ind++] = 0x35; + Buffer[Ind++] = 0x35; + Buffer[Ind++] = 0xA; + + for (unsigned int a=0; a> 11); // 5 + green = (unsigned short)((pixel & 0x07E0) >> 5); // 6 + blue = (unsigned short)(pixel & 0x001F); // 5 + + output[ind] = red << 3; /* red */ + output[ind+1] = green << 2; /* green */ + output[ind+2] = blue << 3; /* blue */ + ind += 3; + } +} + + +int WriteImageToFile(char *ImageName, unsigned int W, unsigned int H, unsigned char PixelSize, unsigned char *OutBuffer, unsigned char imgFormat) +{ + + switch_fs_t fs; + __FS_INIT(fs); + + void *File = __OPEN_WRITE(fs, ImageName); + + int ret = 0; + WritePPMHeader(File,W,H, imgFormat); + + if(imgFormat == RGB565_IO) + { + unsigned int rgb888_size = (CHUNK_SIZE/2)*3; // size of 888 image in byte + img_rgb888 = (unsigned char *) gap_allocL2(rgb888_size); + + int steps = (W*H*PixelSize) / CHUNK_SIZE; // convert and fs write times + + for(int i=0;i +#include +#include +#include +#include +#include + +#define PPM_HEADER 40 + +#ifdef __EMUL__ +#include + #include + #include + typedef int switch_fs_t; + typedef int switch_file_t; + #define __FS_INIT(__X) 1 + #define __FS_DEINIT(__X) + #define __OPEN_READ(__FS, __NAME) open(__NAME, O_RDONLY, 0) + #define __OPEN_WRITE(__FS, __NAME) open(__NAME, O_RDWR | O_CREAT, S_IRWXU) + #define __CLOSE(__FD) close(__FD) + #define __SEEK(__FD, __POS) lseek(__FD, __POS, SEEK_SET) + #define __READ(__FD, __BUF, __LEN) read(__FD, __BUF, __LEN) + #define __WRITE(__FD, __BUF, __LEN) write(__FD, __BUF, __LEN) + #define __WRITEATCLUSTER(__FD, __POS, __BUF, __LEN) write(__FD, __BUF, __LEN) + #define __int_ssize_t ssize_t +#else + #include "bsp/fs.h" + typedef pi_fs_file_t * switch_file_t; + typedef struct { + struct pi_fs_conf conf; + struct pi_device fs; + } switch_fs_t; + + static inline void switch_init_fs(switch_fs_t * fs) { + pi_fs_conf_init(&fs->conf); + fs->conf.type = PI_FS_HOST; + pi_open_from_conf(&fs->fs, &fs->conf); + pi_fs_mount(&fs->fs); + } + + static inline int32_t 
switch_writeat_cl(pi_fs_file_t *file, uint32_t index, void *buffer, + uint32_t size) { + pi_cl_fs_req_t evt; + pi_cl_fs_copy(file, index, buffer, size, 0, &evt); + return pi_cl_fs_wait(&evt); + } + + #define __FS_INIT(__FS) \ + do { \ + pi_fs_conf_init(&__FS.conf); \ + __FS.conf.type = PI_FS_HOST; \ + pi_open_from_conf(&__FS.fs, &__FS.conf); \ + pi_fs_mount(&__FS.fs); \ + } while (0) + #define strerror(__x) "error unknown" + #define __FS_DEINIT(__FS) pi_fs_unmount(&__FS.fs) + #define __OPEN_READ(__FS, __NAME) pi_fs_open(&__FS.fs, __NAME, PI_FS_FLAGS_READ) + #define __OPEN_WRITE(__FS, __NAME) pi_fs_open(&__FS.fs, __NAME, PI_FS_FLAGS_WRITE) + #define __CLOSE(__FD) pi_fs_close(__FD) + #define __SEEK(__FD, __POS) pi_fs_seek(__FD, (__POS)) + #define __READ(__FD, __BUF, __LEN) pi_fs_read(__FD, (void *)(__BUF), (__LEN)) + #define __WRITE(__FD, __BUF, __LEN) pi_fs_write(__FD, (void *)(__BUF), (__LEN)) + #define __WRITEATCLUSTER(__FD, __POS, __BUF, __LEN) switch_writeat_cl(__FD, __POS, __BUF, __LEN) + #define __int_ssize_t unsigned int +#endif + +#endif diff --git a/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/drivers/udma/i2s/i2s_internal.c b/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/drivers/udma/i2s/i2s_internal.c index b1cb1b2fb..ca8a35417 100644 --- a/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/drivers/udma/i2s/i2s_internal.c +++ b/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/drivers/udma/i2s/i2s_internal.c @@ -284,7 +284,7 @@ static int32_t __pi_i2s_conf_apply(struct i2s_itf_data_s *itf_data) * are used, so both i2s_0 and i2s_1 use i2s_0 internal clock. */ itf_data->clk = (g_i2s_flags & PI_I2S_SETUP_SINGLE_CLOCK) ? 0 : itf_data->i2s_id; - if ((itf_data->clk == 0) && (itf_data->options & PI_I2S_OPT_EXT_CLK)) + if ((g_i2s_flags & PI_I2S_SETUP_SINGLE_CLOCK) && (itf_data->options & PI_I2S_OPT_EXT_CLK)) { I2S_TRACE_ERR("Error clock configuration : Single internal clock and " "external clock are defined !\n"); diff --git a/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/include/pmsis/implem/drivers/perf/perf.h b/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/include/pmsis/implem/drivers/perf/perf.h index 3cc4aab0e..f92d39fe1 100644 --- a/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/include/pmsis/implem/drivers/perf/perf.h +++ b/rtos/freeRTOS/vendors/gwt/TARGET_GWT/pmsis/implem/pmsis_implem_gap8/include/pmsis/implem/drivers/perf/perf.h @@ -74,6 +74,19 @@ static inline void pi_perf_reset() } } +static inline void pi_perf_fc_reset() +{ + __pi_perf_fc_reset(); +} + +static inline void pi_perf_cl_reset() +{ + /* Reset all performance counters to 0. */ + #if (FEATURE_CLUSTER == 1) + __pi_perf_cl_reset(); + #endif /* FEATURE_CLUSTER */ +} + static inline void pi_perf_start() { /* Initialize timer if needed and start counters. */ @@ -89,6 +102,19 @@ static inline void pi_perf_start() } } +static inline void pi_perf_fc_start() +{ + __pi_perf_fc_start(); +} + +static inline void pi_perf_cl_start() +{ + /* Initialize timer if needed and start counters. */ + #if (FEATURE_CLUSTER == 1) + __pi_perf_cl_start(); + #endif /* FEATURE_CLUSTER */ +} + static inline void pi_perf_stop() { /* Stop counters and timers, and save values. 
*/ @@ -118,4 +144,16 @@ static inline unsigned int pi_perf_read(int id) } } +static inline unsigned int pi_perf_fc_read(int id) +{ + return __pi_perf_fc_read(id); +} + +static inline unsigned int pi_perf_cl_read(int id) +{ + #if (FEATURE_CLUSTER == 1) + return __pi_perf_cl_read(id); + #endif /* FEATURE_CLUSTER */ +} + #endif /* __PI_PERF_H__ */ diff --git a/rtos/pmsis/pmsis_api/include/pmsis/drivers/uart.h b/rtos/pmsis/pmsis_api/include/pmsis/drivers/uart.h index f4667f728..00410fe27 100644 --- a/rtos/pmsis/pmsis_api/include/pmsis/drivers/uart.h +++ b/rtos/pmsis/pmsis_api/include/pmsis/drivers/uart.h @@ -163,7 +163,14 @@ enum pi_uart_ioctl_cmd * * This command disables flow control on UART device. */ - PI_UART_IOCTL_DISABLE_FLOW_CONTROL = 6 + PI_UART_IOCTL_DISABLE_FLOW_CONTROL = 6, + + /** + * \brief Flush UART TX. + * + * This command will wait until all pending buffers are flushed outside + */ + PI_UART_IOCTL_FLUSH = 7 }; /** diff --git a/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk b/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk index d358bd8b3..7b03e8a1c 100644 --- a/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk +++ b/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk @@ -32,7 +32,7 @@ else ifeq ($(BOARD_NAME), gap9_v2) PMSIS_BSP_SRC = $(GAP9_SRC) endif -EXCLUDE_FROM_SRCS= transport/transport.c transport/nina_w10/nina_w10.c flash/spiflash/spiflash.c +EXCLUDE_FROM_SRCS= transport/transport.c transport/nina_w10/nina_w10.c PMSIS_BSP_SRCS := $(filter-out $(EXCLUDE_FROM_SRCS), $(PMSIS_BSP_SRC)) PMSIS_BSP_SRCS := $(foreach f, $(PMSIS_BSP_SRCS), $(PMSIS_BSP_DIR)/$f) diff --git a/rtos/pulp/pulp-os/drivers/drivers.mk b/rtos/pulp/pulp-os/drivers/drivers.mk index 976fba4ae..1d1ac7e7e 100644 --- a/rtos/pulp/pulp-os/drivers/drivers.mk +++ b/rtos/pulp/pulp-os/drivers/drivers.mk @@ -102,7 +102,6 @@ endif # GPIO -ifeq '$(pulp_chip_family)' 'gap' ifeq '$(CONFIG_GPIO_ENABLED)' '1' PULP_FC_CFLAGS += -DRT_CONFIG_GPIO_ENABLED ifneq '$(gpio/version)' '' @@ -110,7 +109,6 @@ PULP_LIB_FC_SRCS_rt += drivers/gpio/gpio-v$(gpio/version).c #PULP_LIB_FC_ASM_SRCS_rt += drivers/gpio/gpio-v$(gpio/version)_asm.S endif endif -endif diff --git a/rtos/pulp/pulp-os/drivers/gpio/gpio-v3.c b/rtos/pulp/pulp-os/drivers/gpio/gpio-v3.c new file mode 100644 index 000000000..49d11871f --- /dev/null +++ b/rtos/pulp/pulp-os/drivers/gpio/gpio-v3.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
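An aside on the pi_perf_fc_* / pi_perf_cl_* wrappers added to perf.h above: they address the fabric-controller or cluster counters explicitly instead of letting pi_perf_* pick the domain from the calling core. A hedged usage sketch for the FC side, assuming the usual PMSIS event ids (PI_PERF_CYCLES) and the existing pi_perf_conf()/pi_perf_stop() entry points:

    #include "pmsis.h"

    /* Count the active cycles spent in a piece of FC code. */
    static uint32_t measure_fc_cycles(void (*work)(void))
    {
        pi_perf_conf(1 << PI_PERF_CYCLES);

        pi_perf_fc_reset();
        pi_perf_fc_start();
        work();
        pi_perf_stop();                      /* generic stop still saves the values */

        return pi_perf_fc_read(PI_PERF_CYCLES);
    }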
+ */ + +#include "pmsis.h" + +#define NB_GPIO_PORT ((ARCHI_NB_GPIO+31)/32) + +extern RT_FC_TINY_DATA uint32_t __rt_gpio_status; + +typedef struct +{ + int port; +} pi_gpio_t; + +static pi_gpio_t __rt_gpio[NB_GPIO_PORT]; + + + +void pi_gpio_conf_init(struct pi_gpio_conf *conf) +{ + conf->port = 0; +} + + + +int pi_gpio_open(struct pi_device *device) +{ + int irq = rt_irq_disable(); + + struct pi_gpio_conf *conf = (struct pi_gpio_conf *)device->config; + + if (conf->port >= NB_GPIO_PORT) + goto error; + + pi_gpio_t *gpio = &__rt_gpio[conf->port]; + + device->data = (void *)gpio; + + gpio->port = conf->port; + + rt_irq_restore(irq); + + return 0; + +error: + rt_irq_restore(irq); + return -1; +} + + +int pi_gpio_pin_configure(struct pi_device *device, pi_gpio_e pin, pi_gpio_flags_e flags) +{ + if (pin & PI_GPIO_IS_GPIO_MASK) + { + pi_pad_e pad = ((pin & ~PI_GPIO_IS_GPIO_MASK)>> PI_GPIO_NUM_SHIFT); + /* Setup first pad for GPIO. */ + + pi_pad_set_function(pad, PI_PAD_FUNC1); + } + pin = (pin & PI_GPIO_NUM_MASK); + return pi_gpio_mask_configure(device, 1<> pin) & 1; + return 0; +} + +int pi_gpio_pin_task_add(struct pi_device *device, uint32_t pin, pi_task_t *task, pi_gpio_notif_e flags) +{ + pin = (pin & PI_GPIO_NUM_MASK); + return 0; +} + +int pi_gpio_pin_task_remove(struct pi_device *device, uint32_t pin) +{ + pin = (pin & PI_GPIO_NUM_MASK); + return 0; +} + +int pi_gpio_mask_configure(struct pi_device *device, uint32_t mask, pi_gpio_flags_e flags) +{ + int irq = rt_irq_disable(); + int is_out = flags & PI_GPIO_OUTPUT; + hal_gpio_set_dir(mask, is_out); + + if (is_out) + hal_gpio_en_set(hal_gpio_en_get() & ~mask); + else + hal_gpio_en_set(hal_gpio_en_get() | mask); + + rt_irq_restore(irq); + + return 0; +} + +int pi_gpio_mask_write(struct pi_device *device, uint32_t mask, uint32_t value) +{ + hal_gpio_set_value(mask, value); + return 0; +} + +int pi_gpio_mask_read(struct pi_device *device, uint32_t mask, uint32_t *value) +{ + *value = hal_gpio_get_value(); + return 0; +} + +int pi_gpio_mask_task_add(struct pi_device *device, uint32_t mask, pi_task_t *task, pi_gpio_notif_e flags) +{ + return 0; +} + +int pi_gpio_mask_task_remove(struct pi_device *device, uint32_t mask) +{ + return 0; +} + +void pi_gpio_pin_notif_configure(struct pi_device *device, uint32_t pin, pi_gpio_notif_e flags) +{ + pin = (pin & PI_GPIO_NUM_MASK); + int irq = rt_irq_disable(); + if (flags == PI_GPIO_NOTIF_NONE) + { + //hal_gpio_inten_set(hal_gpio_inten_get() & ~(1<> pin) & 1; +} diff --git a/rtos/pulp/pulp-os/drivers/pwm/pwm-v1.c b/rtos/pulp/pulp-os/drivers/pwm/pwm-v1.c index 32ff19aa3..8f27782f4 100644 --- a/rtos/pulp/pulp-os/drivers/pwm/pwm-v1.c +++ b/rtos/pulp/pulp-os/drivers/pwm/pwm-v1.c @@ -64,9 +64,6 @@ int32_t pi_pwm_open(struct pi_device *device) device->data = (void *)timer; - pi_pad_set_function(PI_PAD_33_B12_TIMER0_CH2, PI_PAD_33_B12_TIMER0_CH2_FUNC0); - - timer->open_count++; if (timer->open_count == 1) { diff --git a/rtos/pulp/pulp-os/libs/io/prf.c b/rtos/pulp/pulp-os/libs/io/prf.c index a9d264d4e..3ce41f726 100644 --- a/rtos/pulp/pulp-os/libs/io/prf.c +++ b/rtos/pulp/pulp-os/libs/io/prf.c @@ -12,64 +12,60 @@ #include #include #include - -#ifndef MAXFLD -#define MAXFLD 200 -#endif +#include +#include #ifndef EOF #define EOF -1 #endif +#define CONFIG_MINIMAL_LIBC_LL_PRINTF 1 + +#ifdef CONFIG_MINIMAL_LIBC_LL_PRINTF +#define VALTYPE long long +#else +#define VALTYPE long +#endif + static void _uc(char *buf) { - for (/**/; *buf; buf++) { + do { if (*buf >= 'a' && *buf <= 'z') { *buf += 'A' - 'a'; } - } + } while 
(*buf++); } -/* Convention note: "end" as passed in is the standard "byte after - * last character" style, but... - */ -static int _reverse_and_pad(char *start, char *end, int minlen) -{ - int len; - - while (end - start < minlen) { - *end++ = '0'; - } - - *end = 0; - len = end - start; - for (end--; end > start; end--, start++) { - char tmp = *end; - *end = *start; - *start = tmp; - } - return len; -} - -/* Writes the specified number into the buffer in the given base, +/* + * Writes the specified number into the buffer in the given base, * using the digit characters 0-9a-z (i.e. base>36 will start writing - * odd bytes), padding with leading zeros up to the minimum length. + * odd bytes). */ -static int _to_x(char *buf, uint32_t n, int base, int minlen) +static int _to_x(char *buf, unsigned VALTYPE n, unsigned int base) { - char *buf0 = buf; + char *start = buf; + int len; do { - int d = n % base; + unsigned int d = n % base; n /= base; *buf++ = '0' + d + (d > 9 ? ('a' - '0' - 10) : 0); } while (n); - return _reverse_and_pad(buf0, buf, minlen); + + *buf = 0; + len = buf - start; + + for (buf--; buf > start; buf--, start++) { + char tmp = *buf; + *buf = *start; + *start = tmp; + } + + return len; } -static int _to_hex(char *buf, uint32_t value, - int alt_form, int precision, int prefix) +static int _to_hex(char *buf, unsigned VALTYPE value, bool alt_form, char prefix) { int len; char *buf0 = buf; @@ -79,7 +75,7 @@ static int _to_hex(char *buf, uint32_t value, *buf++ = 'x'; } - len = _to_x(buf, value, 16, precision); + len = _to_x(buf, value, 16); if (prefix == 'X') { _uc(buf0); } @@ -87,7 +83,7 @@ static int _to_hex(char *buf, uint32_t value, return len + (buf - buf0); } -static int _to_octal(char *buf, uint32_t value, int alt_form, int precision) +static int _to_octal(char *buf, unsigned VALTYPE value, bool alt_form) { char *buf0 = buf; @@ -99,32 +95,28 @@ static int _to_octal(char *buf, uint32_t value, int alt_form, int precision) return 1; } } - return (buf - buf0) + _to_x(buf, value, 8, precision); + return (buf - buf0) + _to_x(buf, value, 8); } -static int _to_udec(char *buf, uint32_t value, int precision) +static int _to_udec(char *buf, unsigned VALTYPE value) { - return _to_x(buf, value, 10, precision); + return _to_x(buf, value, 10); } -static int _to_dec(char *buf, int32_t value, int fplus, int fspace, int precision) +static int _to_dec(char *buf, VALTYPE value, bool fplus, bool fspace) { char *start = buf; -#if (MAXFLD < 10) - #error buffer size MAXFLD is too small -#endif - if (value < 0) { *buf++ = '-'; - if (value != (int32_t)0x80000000) - value = -value; - } else if (fplus) + value = -value; + } else if (fplus) { *buf++ = '+'; - else if (fspace) + } else if (fspace) { *buf++ = ' '; + } - return (buf + _to_udec(buf, (uint32_t) value, precision)) - start; + return (buf + _to_udec(buf, value)) - start; } static void _rlrshift(uint64_t *v) @@ -132,7 +124,8 @@ static void _rlrshift(uint64_t *v) *v = (*v & 1) + (*v >> 1); } -/* Tiny integer divide-by-five routine. The full 64 bit division +/* + * Tiny integer divide-by-five routine. The full 64 bit division * implementations in libgcc are very large on some architectures, and * currently nothing in Zephyr pulls it into the link. 
So it makes * sense to define this much smaller special case here to avoid @@ -149,19 +142,22 @@ static void _rlrshift(uint64_t *v) */ static void _ldiv5(uint64_t *v) { - uint32_t i, hi; - uint64_t rem = *v, quot = 0, q; + uint32_t hi; + uint64_t rem = *v, quot = 0U, q; + int i; + static const char shifts[] = { 32, 3, 0 }; - /* Usage in this file wants rounded behavior, not truncation. So add + /* + * Usage in this file wants rounded behavior, not truncation. So add * two to get the threshold right. */ - rem += 2; + rem += 2U; for (i = 0; i < 3; i++) { hi = rem >> shifts[i]; - q = (uint64_t)(hi / 5) << shifts[i]; - rem -= q * 5; + q = (uint64_t)(hi / 5U) << shifts[i]; + rem -= q * 5U; quot += q; } @@ -170,16 +166,18 @@ static void _ldiv5(uint64_t *v) static char _get_digit(uint64_t *fr, int *digit_count) { - int rval; + char rval; if (*digit_count > 0) { *digit_count -= 1; - *fr = *fr * 10; + *fr = *fr * 10U; rval = ((*fr >> 60) & 0xF) + '0'; *fr &= 0x0FFFFFFFFFFFFFFFull; - } else + } else { rval = '0'; - return (char) (rval); + } + + return rval; } /* @@ -195,6 +193,7 @@ static char _get_digit(uint64_t *fr, int *digit_count) * "fplus" TRUE if "+" conversion flag in effect. * "fspace" TRUE if " " conversion flag in effect. * "precision" Desired precision (negative if undefined). + * "zeropad" To store padding info to be inserted later */ /* @@ -207,27 +206,34 @@ static char _get_digit(uint64_t *fr, int *digit_count) #define MAXFP1 0xFFFFFFFF /* Largest # if first fp format */ #define HIGHBIT64 (1ull<<63) -static int _to_float(char *buf, uint64_t double_temp, int c, - int falt, int fplus, int fspace, int precision) +struct zero_padding { int predot, postdot, trail; }; + +static int _to_float(char *buf, uint64_t double_temp, char c, + bool falt, bool fplus, bool fspace, int precision, + struct zero_padding *zp) { - register int decexp; - register int exp; - int sign; - int digit_count; - uint64_t fract; - uint64_t ltemp; - int prune_zero; - char *start = buf; + int decexp; + int exp; + bool sign; + int digit_count; + uint64_t fract; + uint64_t ltemp; + bool prune_zero; + char *start = buf; exp = double_temp >> 52 & 0x7ff; fract = (double_temp << 11) & ~HIGHBIT64; sign = !!(double_temp & HIGHBIT64); + if (sign) { + *buf++ = '-'; + } else if (fplus) { + *buf++ = '+'; + } else if (fspace) { + *buf++ = ' '; + } if (exp == 0x7ff) { - if (sign) { - *buf++ = '-'; - } if (!fract) { if (isupper(c)) { *buf++ = 'I'; @@ -258,18 +264,14 @@ static int _to_float(char *buf, uint64_t double_temp, int c, } if ((exp | fract) != 0) { + if (exp == 0) { + /* this is a denormal */ + while (((fract <<= 1) & HIGHBIT64) == 0) { + exp--; + } + } exp -= (1023 - 1); /* +1 since .1 vs 1. 
*/ fract |= HIGHBIT64; - decexp = true; /* Wasn't zero */ - } else - decexp = false; /* It was zero */ - - if (decexp && sign) { - *buf++ = '-'; - } else if (fplus) { - *buf++ = '+'; - } else if (fspace) { - *buf++ = ' '; } decexp = 0; @@ -278,7 +280,7 @@ static int _to_float(char *buf, uint64_t double_temp, int c, _rlrshift(&fract); exp++; } - fract *= 5; + fract *= 5U; exp++; decexp--; @@ -303,30 +305,38 @@ static int _to_float(char *buf, uint64_t double_temp, int c, exp++; } - if (precision < 0) + if (precision < 0) { precision = 6; /* Default precision if none given */ + } + prune_zero = false; /* Assume trailing 0's allowed */ if ((c == 'g') || (c == 'G')) { - if (!falt && (precision > 0)) - prune_zero = true; - if ((decexp < (-4 + 1)) || (decexp > (precision + 1))) { - if (c == 'g') - c = 'e'; - else - c = 'E'; - } else + if (decexp < (-4 + 1) || decexp > precision) { + c += 'e' - 'g'; + if (precision > 0) { + precision--; + } + } else { c = 'f'; + precision -= decexp; + } + if (!falt && (precision > 0)) { + prune_zero = true; + } } if (c == 'f') { exp = precision + decexp; - if (exp < 0) + if (exp < 0) { exp = 0; - } else + } + } else { exp = precision + 1; + } digit_count = 16; - if (exp > 16) + if (exp > 16) { exp = 16; + } ltemp = 0x0800000000000000; while (exp--) { @@ -343,111 +353,118 @@ static int _to_float(char *buf, uint64_t double_temp, int c, if (c == 'f') { if (decexp > 0) { - while (decexp > 0) { + while (decexp > 0 && digit_count > 0) { *buf++ = _get_digit(&fract, &digit_count); decexp--; } - } else + zp->predot = decexp; + decexp = 0; + } else { *buf++ = '0'; - if (falt || (precision > 0)) + } + if (falt || (precision > 0)) { *buf++ = '.'; - while (precision-- > 0) { - if (decexp < 0) { - *buf++ = '0'; - decexp++; - } else - *buf++ = _get_digit(&fract, &digit_count); } + if (decexp < 0 && precision > 0) { + zp->postdot = -decexp; + if (zp->postdot > precision) { + zp->postdot = precision; + } + precision -= zp->postdot; + } + while (precision > 0 && digit_count > 0) { + *buf++ = _get_digit(&fract, &digit_count); + precision--; + } + zp->trail = precision; } else { *buf = _get_digit(&fract, &digit_count); - if (*buf++ != '0') + if (*buf++ != '0') { decexp--; - if (falt || (precision > 0)) + } + if (falt || (precision > 0)) { *buf++ = '.'; - while (precision-- > 0) + } + while (precision > 0 && digit_count > 0) { *buf++ = _get_digit(&fract, &digit_count); + precision--; + } + zp->trail = precision; } if (prune_zero) { + zp->trail = 0; while (*--buf == '0') ; - if (*buf != '.') + if (*buf != '.') { buf++; + } } if ((c == 'e') || (c == 'E')) { - *buf++ = (char) c; + *buf++ = c; if (decexp < 0) { decexp = -decexp; *buf++ = '-'; - } else + } else { *buf++ = '+'; - *buf++ = (char) ((decexp / 10) + '0'); + } + if (decexp >= 100) { + *buf++ = (decexp / 100) + '0'; + decexp %= 100; + } + *buf++ = (decexp / 10) + '0'; decexp %= 10; - *buf++ = (char) (decexp + '0'); + *buf++ = decexp + '0'; } *buf = 0; return buf - start; } -static int _atoi(char **sptr) +static int _atoi(const char **sptr) { - register char *p; - register int i; + const char *p = *sptr - 1; + int i = 0; - i = 0; - p = *sptr; - p--; - while (isdigit(((int) *p))) + while (isdigit(*p)) { i = 10 * i + *p++ - '0'; + } *sptr = p; return i; } -int _prf(int (*func)(), void *dest, char *format, va_list vargs) +int _prf(int (*func)(), void *dest, const char *format, va_list vargs) { /* - * Due the fact that buffer is passed to functions in this file, - * they assume that it's size if MAXFLD + 1. 
In need of change - * the buffer size, either MAXFLD should be changed or the change - * has to be propagated across the file + * The work buffer has to accommodate for the largest data length. + * The max range octal length is one prefix + 3 bits per digit + * meaning 12 bytes on 32-bit and 23 bytes on 64-bit. + * The float code may extract up to 16 digits, plus a prefix, + * a leading 0, a dot, and an exponent in the form e+xxx for + * a total of 24. Add a trailing NULL so it is 25. */ - char buf[MAXFLD + 1]; - register int c; - int count; - register char *cptr; - int falt; - int fminus; - int fplus; - int fspace; - register int i; - int need_justifying; - char pad; - int precision; - int prefix; - int width; - char *cptr_temp; - int32_t *int32ptr_temp; - int32_t int32_temp; - uint32_t uint32_temp; - uint64_t double_temp; + char buf[25]; + char c; + int count; + char *cptr; + bool falt, fminus, fplus, fspace, fzero; + int i; + int width, precision; + int clen, prefix, zero_head; + struct zero_padding zero; + VALTYPE val; + +#define PUTC(c) do { if ((*func)(c, dest) == EOF) return EOF; } while (false) count = 0; while ((c = *format++)) { if (c != '%') { - if ((*func) (c, dest) == EOF) { - return EOF; - } - + PUTC(c); count++; - } else { - fminus = fplus = fspace = falt = false; - pad = ' '; /* Default pad character */ - precision = -1; /* No precision specified */ - + fminus = fplus = fspace = falt = fzero = false; while (strchr("-+ #0", (c = *format++)) != NULL) { switch (c) { case '-': @@ -467,7 +484,7 @@ int _prf(int (*func)(), void *dest, char *format, va_list vargs) break; case '0': - pad = '0'; + fzero = true; break; case '\0': @@ -477,85 +494,88 @@ int _prf(int (*func)(), void *dest, char *format, va_list vargs) if (c == '*') { /* Is the width a parameter? */ - width = (int32_t) va_arg(vargs, int32_t); + width = va_arg(vargs, int); if (width < 0) { fminus = true; width = -width; } c = *format++; - } else if (!isdigit(c)) + } else if (!isdigit(c)) { width = 0; - else { + } else { width = _atoi(&format); /* Find width */ c = *format++; } - /* - * If is INT_MIN, then its absolute value can - * not be expressed as a positive number using 32-bit - * two's complement. To cover that case, cast it to - * an unsigned before comparing it against MAXFLD. - */ - if ((unsigned) width > MAXFLD) { - width = MAXFLD; - } - + precision = -1; if (c == '.') { c = *format++; if (c == '*') { - precision = (int32_t) - va_arg(vargs, int32_t); - } else + precision = va_arg(vargs, int); + } else { precision = _atoi(&format); + } - if (precision > MAXFLD) - precision = -1; c = *format++; } /* - * This implementation only checks that the following format - * specifiers are followed by an appropriate type: + * This implementation only supports the following + * length modifiers: * h: short + * hh: char * l: long - * L: long double + * ll: long long * z: size_t or ssize_t - * No further special processing is done for them. */ - - if (strchr("hlLz", c) != NULL) { + i = 0; + if (strchr("hlz", c) != NULL) { i = c; c = *format++; - /* - * Here there was a switch() block - * which was doing nothing useful, I - * am still puzzled at why it was left - * over. Maybe before it contained - * stuff that was needed, but in its - * current form, it was being - * optimized out. 
- */ + if (i == 'l' && c == 'l') { + i = 'L'; + c = *format++; + } else if (i == 'h' && c == 'h') { + i = 'H'; + c = *format++; + } } - need_justifying = false; + cptr = buf; prefix = 0; + zero.predot = zero.postdot = zero.trail = 0; + switch (c) { case 'c': - buf[0] = (char) ((int32_t) va_arg(vargs, int32_t)); - buf[1] = '\0'; - need_justifying = true; - c = 1; + buf[0] = va_arg(vargs, int); + clen = 1; + precision = 0; break; case 'd': case 'i': - int32_temp = (int32_t) va_arg(vargs, int32_t); - c = _to_dec(buf, int32_temp, fplus, fspace, precision); - if (fplus || fspace || (int32_temp < 0)) + switch (i) { + case 'l': + val = va_arg(vargs, long); + break; +#ifdef CONFIG_MINIMAL_LIBC_LL_PRINTF + case 'L': + val = va_arg(vargs, long long); + break; +#endif + case 'z': + val = va_arg(vargs, ssize_t); + break; + case 'h': + case 'H': + default: + val = va_arg(vargs, int); + break; + } + clen = _to_dec(buf, val, fplus, fspace); + if (fplus || fspace || val < 0) { prefix = 1; - need_justifying = true; - if (precision != -1) - pad = ' '; + } break; case 'e': @@ -564,120 +584,221 @@ int _prf(int (*func)(), void *dest, char *format, va_list vargs) case 'F': case 'g': case 'G': - /* standard platforms which supports double */ { + uint64_t double_val; + + /* standard platforms which supports double */ union { double d; uint64_t i; } u; - u.d = (double) va_arg(vargs, double); - double_temp = u.i; - } + u.d = va_arg(vargs, double); + double_val = u.i; - c = _to_float(buf, double_temp, c, falt, fplus, - fspace, precision); - if (fplus || fspace || (buf[0] == '-')) + clen = _to_float(buf, double_val, c, falt, + fplus, fspace, precision, + &zero); + if (fplus || fspace || (buf[0] == '-')) { prefix = 1; - need_justifying = true; + } + clen += zero.predot + zero.postdot + zero.trail; + if (!isdigit(buf[prefix])) { + /* inf or nan: no zero padding */ + fzero = false; + } + precision = -1; break; + } case 'n': - int32ptr_temp = (int32_t *)va_arg(vargs, int32_t *); - *int32ptr_temp = count; - break; - - case 'o': - uint32_temp = (uint32_t) va_arg(vargs, uint32_t); - c = _to_octal(buf, uint32_temp, falt, precision); - need_justifying = true; - if (precision != -1) - pad = ' '; - break; + switch (i) { + case 'h': + *va_arg(vargs, short *) = count; + break; + case 'H': + *va_arg(vargs, char *) = count; + break; + case 'l': + *va_arg(vargs, long *) = count; + break; +#ifdef CONFIG_MINIMAL_LIBC_LL_PRINTF + case 'L': + *va_arg(vargs, long long *) = count; + break; +#endif + case 'z': + *va_arg(vargs, ssize_t *) = count; + break; + default: + *va_arg(vargs, int *) = count; + break; + } + continue; case 'p': - uint32_temp = (uint32_t) va_arg(vargs, uint32_t); - c = _to_hex(buf, uint32_temp, true, 8, (int) 'x'); - need_justifying = true; - if (precision != -1) - pad = ' '; + val = (uintptr_t) va_arg(vargs, void *); + clen = _to_hex(buf, val, true, 'x'); + prefix = 2; break; case 's': - cptr_temp = (char *) va_arg(vargs, char *); + cptr = va_arg(vargs, char *); /* Get the string length */ - for (c = 0; c < MAXFLD; c++) { - if (cptr_temp[c] == '\0') { + if (precision < 0) { + precision = INT_MAX; + } + for (clen = 0; clen < precision; clen++) { + if (cptr[clen] == '\0') { break; } } - if ((precision >= 0) && (precision < c)) - c = precision; - if (c > 0) { - memcpy(buf, cptr_temp, (size_t) c); - need_justifying = true; - } + precision = 0; break; + case 'o': case 'u': - uint32_temp = (uint32_t) va_arg(vargs, uint32_t); - c = _to_udec(buf, uint32_temp, precision); - need_justifying = true; - if (precision != -1) - pad 
= ' '; - break; - case 'x': case 'X': - uint32_temp = (uint32_t) va_arg(vargs, uint32_t); - c = _to_hex(buf, uint32_temp, falt, precision, c); - if (falt) - prefix = 2; - need_justifying = true; - if (precision != -1) - pad = ' '; + switch (i) { + case 'l': + val = va_arg(vargs, unsigned long); + break; +#ifdef CONFIG_MINIMAL_LIBC_LL_PRINTF + case 'L': + val = va_arg(vargs, unsigned long long); + break; +#endif + case 'z': + val = va_arg(vargs, size_t); + break; + case 'h': + case 'H': + default: + val = va_arg(vargs, unsigned int); + break; + } + if (c == 'o') { + clen = _to_octal(buf, val, falt); + } else if (c == 'u') { + clen = _to_udec(buf, val); + } else { + clen = _to_hex(buf, val, falt, c); + if (falt) { + prefix = 2; + } + } break; case '%': - if ((*func)('%', dest) == EOF) { - return EOF; - } - + PUTC('%'); count++; - break; + continue; + + default: + PUTC('%'); + PUTC(c); + count += 2; + continue; case 0: return count; } - if (c >= MAXFLD + 1) - return EOF; - - if (need_justifying) { - if (c < width) { - if (fminus) { - /* Left justify? */ - for (i = c; i < width; i++) - buf[i] = ' '; - } else { - /* Right justify */ - (void) memmove((buf + (width - c)), buf, (size_t) (c - + 1)); - if (pad == ' ') - prefix = 0; - c = width - c + prefix; - for (i = prefix; i < c; i++) - buf[i] = pad; - } - c = width; + if (precision >= 0) { + zero_head = precision - clen + prefix; + } else if (fzero) { + zero_head = width - clen; + } else { + zero_head = 0; + } + if (zero_head < 0) { + zero_head = 0; + } + width -= clen + zero_head; + + /* padding for right justification */ + if (!fminus && width > 0) { + count += width; + while (width-- > 0) { + PUTC(' '); + } + } + + /* data prefix */ + clen -= prefix; + count += prefix; + while (prefix-- > 0) { + PUTC(*cptr++); + } + + /* zero-padded head */ + count += zero_head; + while (zero_head-- > 0) { + PUTC('0'); + } + + /* + * main data: + * + * In the case of floats, 3 possible zero-padding + * are included in the clen count, either with + * xxxxxx. + * or with + * x.xxxxxx[e+xx] + * In the non-float cases, those predot, postdot and + * tail params are equal to 0. 
+ */ + count += clen; + if (zero.predot) { + c = *cptr; + while (isdigit(c)) { + PUTC(c); + clen--; + c = *++cptr; + } + clen -= zero.predot; + while (zero.predot-- > 0) { + PUTC('0'); + } + } + if (zero.postdot) { + do { + c = *cptr++; + PUTC(c); + clen--; + } while (c != '.'); + clen -= zero.postdot; + while (zero.postdot-- > 0) { + PUTC('0'); + } + } + if (zero.trail) { + c = *cptr; + while (isdigit(c) || c == '.') { + PUTC(c); + clen--; + c = *++cptr; + } + clen -= zero.trail; + while (zero.trail-- > 0) { + PUTC('0'); } + } + while (clen-- > 0) { + PUTC(*cptr++); + } - for (cptr = buf; c > 0; c--, cptr++, count++) { - if ((*func)(*cptr, dest) == EOF) - return EOF; + /* padding for left justification */ + if (width > 0) { + count += width; + while (width-- > 0) { + PUTC(' '); } } } } return count; + +#undef PUTC } diff --git a/tools/autotiler_v3/Makefile b/tools/autotiler_v3/Makefile index 5ad3a7b48..2258541b8 100644 --- a/tools/autotiler_v3/Makefile +++ b/tools/autotiler_v3/Makefile @@ -1,4 +1,4 @@ -TILER_VER=3.0.5 +TILER_VER=3.0.6 export TILER_LIB=libtile.${TILER_VER}.a ifdef GAP_SDK_HOME export TILER_URL=$(GAP_SDK_HOME)/.tiler_url diff --git a/tools/autotiler_v3/generators/CNN/CNN_AT_Misc.c b/tools/autotiler_v3/generators/CNN/CNN_AT_Misc.c new file mode 100644 index 000000000..18c82161f --- /dev/null +++ b/tools/autotiler_v3/generators/CNN/CNN_AT_Misc.c @@ -0,0 +1,253 @@ +#include "Gap.h" +#include "CNN_BasicKernels_SQ8.h" + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); + +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +/* Tensor Dump */ +typedef enum { + AT_MEM_UNDEF, + AT_MEM_L3_HRAM, + AT_MEM_L3_QSPIRAM, + AT_MEM_L3_OSPIRAM, + AT_MEM_L3_HFLASH, + AT_MEM_L3_QSPIFLASH, + AT_MEM_L3_OSPIFLASH, + AT_MEM_L3_MRAMFLASH, + AT_MEM_L2, + AT_MEM_L1, + AT_MEM_LAST +} AT_MemLocation_T; + +static void *AT_TensorGetNextPage( + int Loc, + void *L3_Device, + void *L3_Event, + unsigned int Size, + void *L2_BufferAddr, + void *Addr, + int Offset) + +{ + switch (Loc) { + case AT_MEM_L3_HRAM: + AT_HYPERRAM_CL_COPY((AT_HYPERRAM_T *) L3_Device, (AT_HYPERRAM_EXT_ADDR_TYPE) (Addr+Offset), (AT_HYPERRAM_INT_ADDR_TYPE) L2_BufferAddr, Size, 0, L3_Event); + AT_HYPERRAM_CL_WAIT((AT_HYPERRAM_T *) L3_Device, L3_Event); + break; +#if 0 + case AT_MEM_L3_QSPIRAM: + AT_QSPIRAM_CL_COPY((AT_QSPIRAM_T *) L3_Device, (AT_QSPIRAM_EXT_ADDR_TYPE) (Addr+Offset), (AT_QSPIRAM_INT_ADDR_TYPE) L2_BufferAddr, Size, 1, L3_Event); + AT_QSPIRAM_CL_WAIT((AT_QSPIRAM_T *) L3_Device, L3_Event); + break; + case AT_MEM_L3_OSPIRAM: + AT_OSPIRAM_CL_COPY((AT_OSPIRAM_T *) L3_Device, (AT_OSPIRAM_EXT_ADDR_TYPE) (Addr+Offset), (AT_OSPIRAM_INT_ADDR_TYPE) L2_BufferAddr, Size, 1, L3_Event); + AT_OSPIRAM_CL_WAIT((AT_OSPIRAM_T *) L3_Device, L3_Event); + break; +#endif + case AT_MEM_L3_HFLASH: + AT_HYPERFLASH_FS_CL_COPY((AT_HYPERFLASH_FS_T *) L3_Device, (AT_HYPERFLASH_FS_EXT_ADDR_TYPE) (Addr+Offset), (AT_HYPERFLASH_FS_INT_ADDR_TYPE) L2_BufferAddr, Size, 0, L3_Event); + AT_HYPERFLASH_FS_CL_WAIT((AT_HYPERFLASH_FS_T *) L3_Device, L3_Event); + break; +#if 0 + case AT_MEM_L3_QSPIFLASH: + AT_QSPIFLASH_FS_CL_COPY((AT_QSPIFLASH_FS_T *) L3_Device, (AT_QSPIFLASH_FS_EXT_ADDR_TYPE) (Addr+Offset), (AT_QSPIFLASH_FS_INT_ADDR_TYPE) L2_BufferAddr, Size, 1, L3_Event); + 
AT_QSPIFLASH_FS_CL_WAIT((AT_QSPIFLASH_FS_T *) L3_Device, L3_Event); + break; + case AT_MEM_L3_OSPIFLASH: + AT_OSPIFLASH_FS_CL_COPY((AT_OSPIFLASH_FS_T *) L3_Device, (AT_OSPIFLASH_FS_EXT_ADDR_TYPE) (Addr+Offset), (AT_OSPIFLASH_FS_INT_ADDR_TYPE) L2_BufferAddr, Size, 1, L3_Event); + AT_OSPIFLASH_FS_CL_WAIT((AT_OSPIFLASH_FS_T *) L3_Device, L3_Event); + break; + case AT_MEM_L3_MRAMFLASH: + AT_EMRAMFLASH_FS_CL_COPY((AT_EMRAMFLASH_FS_T *) L3_Device, (AT_EMRAMFLASH_FS_EXT_ADDR_TYPE) (Addr+Offset), (AT_EMRAMFLASH_FS_INT_ADDR_TYPE) L2_BufferAddr, Size, 1, L3_Event); + AT_EMRAMFLASH_FS_CL_WAIT((AT_EMRAMFLASH_FS_T *) L3_Device, L3_Event); + break; +#endif + case AT_MEM_L2: + case AT_MEM_L1: + return Addr; + } + return L2_BufferAddr; +} + +void AT_DumpTensor( + char *NodeName, + char *ArgName, + int Loc, + void *L3_Device, + void *L3_Event, + int ItemSize, + int Dim, + int D0, + int D1, + int D2, + int D3, + int D4, + void *L2_BufferAddr, + unsigned int L2_BufferSize, + void *Addr) +{ + int MAX_PER_LINE = 30; + int SizeToRead = D0*D1*D2*D3*D4*ItemSize; + int InBuffer=0; + if (L2_BufferSize==0) L2_BufferSize = SizeToRead; + int Item = 0; + int ReadSoFar = 0; + void *BaseAddr = Addr; + + printf("Node: %s, Argument: %s, Dim: %d, [%d][%d][%d][%d][%d] ItemSize: %d\n", NodeName, ArgName, Dim, D0,D1,D2,D3,D4, ItemSize); + for (int d0=0; d0=5) printf("D%d: %d\n", Dim-5, d0); + for (int d1=0; d1=4) printf("D%d: %d\n", Dim-4, d1); + for (int d2=0; d2=3) printf("D%d: %d\n", Dim-3, d2); + for (int d3=0; d3=2) printf("D%d: %d - D%d:0..%d\n", Dim-2, d3, Dim-1, D4); + else printf("D%d:0..%d\n", Dim-1, D4); + for (int d4=0; d4In; + int W = Arg->W; + int H = Arg->H; + int Feat = Arg->Feat; + int Size = Arg->Size; + int Pad = Arg->Pad; + + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(Feat); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Feat); + + if (Arg->Orientation == 0) { /* Horizontal */ + int ClrSize = W*Pad*Size; + for (unsigned int i=First; iIn; + int W = Arg->W; + int H = Arg->H; + int Size = Arg->Size; + int Pad = Arg->Pad; + + unsigned int CoreId = gap_coreid(); + if (Arg->Orientation == 0) { /* Horizontal */ + int ClrSize = W*Pad*Size; + unsigned int Chunk = ChunkSize(ClrSize); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, ClrSize); + char *Base = (In + W*(H-Pad)*Size); + for (unsigned int i=First; i 1) { + AT_FORK(gap_ncore(), (void *) AT_KerParTileClear, (void *) &Arg); + __CALL(AT_KerParTileClear, (AT_KerTileClear_T *) &Arg); + } else { + AT_FORK(gap_ncore(), (void *) AT_KerTileClear, (void *) &Arg); + __CALL(AT_KerTileClear, (AT_KerTileClear_T *) &Arg); + } +} diff --git a/tools/autotiler_v3/generators/CNN/CNN_Activation_SQ8.c b/tools/autotiler_v3/generators/CNN/CNN_Activation_SQ8.c new file mode 100644 index 000000000..83f538a46 --- /dev/null +++ b/tools/autotiler_v3/generators/CNN/CNN_Activation_SQ8.c @@ -0,0 +1,1208 @@ +#include "Gap.h" +#include "CNN_BasicKernels_SQ8.h" + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); + +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +/* + * Standalone activation +*/ +static void Ker_Activation_SQ8( + signed char * __restrict__ In, + signed char * __restrict__ Out, + unsigned int N, + 
CNN_ActivationOper_T Activation, + unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0 + ) + +{ + for (unsigned int i=0; iFeat; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Arg->W*Arg->H; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Arg->W*Arg->H; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Arg->W*Arg->H; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Arg->W*Arg->H; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = 
(signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Arg->W*Arg->H; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Arg->W*Arg->H; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat; + unsigned int Size = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + S = Size*Max(0, Last-First); + for (int c=First; cFeat; + unsigned int Size = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + S = Size*Max(0, Last-First); + for (int c=First; cFeat; + unsigned int Size = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + S = Size*Max(0, Last-First); + for (int c=First; cFeat; + unsigned int Size = Arg->W*Arg->H; 
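+	/*
+	 * In-place channel-parallel reduction: ChunkSize(S) splits the Feat channels over the
+	 * cluster cores, core CoreId handling channels [First, Last). The 8-bit outputs overwrite
+	 * the 32-bit accumulators (Out = In + First*Size), using the per-channel Scale/ScaleN and
+	 * the activation constants (ActScale/ActScaleN, A0, B0, C0) read from the Infos block.
+	 */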
+ unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + S = Size*Max(0, Last-First); + for (int c=First; cFeat; + unsigned int Size = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + S = Size*Max(0, Last-First); + for (int c=First; cFeat; + unsigned int Size = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + S = Size*Max(0, Last-First); + for (int c=First; cFeat; + unsigned S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char 
*__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ In = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned int S = Arg->W*Arg->H; 
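+	/*
+	 * Feature-serial, spatially-parallel variant: here ChunkSize(S) splits the W*H plane,
+	 * so each core walks all Feat channels but only reduces its own [First, Last) slice of
+	 * every plane, writing the results back in place through InOut.
+	 */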
+ unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ InOut = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned int S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ InOut = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned int S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ InOut = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned int S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ InOut = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat; + unsigned int S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ InOut = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + 
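+	/*
+	 * Per-channel requantization: Scale/ScaleN hold the multiplicative factor and
+	 * normalization shift, while ActScale/ActScaleN and A0/B0/C0 are fetched from the
+	 * Infos block at the AT_INF_* offsets before the activation is applied.
+	 */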
for (int c=0; cFeat; + unsigned int S = Arg->W*Arg->H; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + int * __restrict__ InOut = (int *__restrict__) Arg->In; + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=0; cFeat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + unsigned int Size = Arg->W*Arg->H; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + if (ActScale) for (int c=First; cFeat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + unsigned int Size = Arg->W*Arg->H; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + if (ActScale) for (int c=First; cFeat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + unsigned int Size = Arg->W*Arg->H; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + unsigned int Size = Arg->W*Arg->H; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cFeat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + unsigned int Size = Arg->W*Arg->H; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int ActScale = ((unsigned char 
*)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + for (int c=First; cW*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + + if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELU, ActScale, ActScaleN, A0, B0, C0); + else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELU, A0); + gap_waitbarrier(0); +} + +void Ker_ReLUN_SQ8(KerActivation_SQ8_T *Arg) + +{ + unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + + if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0); + else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELUN, A0); + gap_waitbarrier(0); +} + +void Ker_HSigmoid_SQ8(KerActivation_SQ8_T *Arg) + +{ + unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + + Ker_Activation_SQ8(In+First, Out+First, Size, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0); + gap_waitbarrier(0); +} + +void Ker_HSwish_SQ8(KerActivation_SQ8_T *Arg) + +{ + unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + + Ker_Activation_SQ8(In+First, Out+First, Size, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0); + gap_waitbarrier(0); +} + +void Ker_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg) + +{ + unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = 
gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); + signed char * __restrict__ In = (signed char *__restrict__) Arg->In; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; + unsigned int Size = Max(0, Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + + Ker_Activation_SQ8(In+First, Out+First, Size, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0); + gap_waitbarrier(0); +} diff --git a/tools/autotiler_v3/generators/CNN/CNN_BasicKernels.h b/tools/autotiler_v3/generators/CNN/CNN_BasicKernels.h index 28fe68ed5..9cb070f4d 100644 --- a/tools/autotiler_v3/generators/CNN/CNN_BasicKernels.h +++ b/tools/autotiler_v3/generators/CNN/CNN_BasicKernels.h @@ -194,9 +194,9 @@ typedef KerConv_fps_T KerConvDW_fps_T; /* Alias to separate regular conv from d /* Double precision */ typedef struct { short int * __restrict__ In; /**< Pointer to input tile */ - unsigned short int W; /**< Width of the input tile */ + unsigned short int W; /**< Width of the input tile */ unsigned short int UsedW; /**< Part of the input width to be used */ - unsigned short int H; /**< Height of the input tile */ + unsigned short int H; /**< Height of the input tile */ unsigned short int UsedH; /**< Part of the input height to be used */ unsigned short int InFeatures; /**< Number of output features, used for channel parallel kernels */ unsigned short int OutFeatures; /**< Number of output features, used for channel parallel kernels */ @@ -216,9 +216,9 @@ typedef struct { typedef struct { signed char * __restrict__ In; /**< Pointer to input tile */ - unsigned short int W; /**< Width of the input tile */ + unsigned short int W; /**< Width of the input tile */ unsigned short int UsedW; /**< Part of the input width to be used */ - unsigned short int H; /**< Height of the input tile */ + unsigned short int H; /**< Height of the input tile */ unsigned short int UsedH; /**< Part of the input height to be used */ unsigned short int InFeatures; /**< Number of output features, used for channel parallel kernels */ unsigned short int OutFeatures; /**< Number of output features, used for channel parallel kernels */ @@ -250,7 +250,7 @@ typedef struct { signed char NormBias; /**< Bias to ConvOut correction factor */ unsigned short int InFeatures; /**< Number of channels, used only for channel parallel kernels */ int LB; /**< If OutUB then Out=UB or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< If Out>UB then Out=UB */ short int * __restrict__ MulBias; /**< Output Multiplicative bias */ } KerDP_fp_T; @@ -263,7 +263,7 @@ typedef struct { signed char NormBias; /**< Precision of the multiplicative bias */ unsigned short int InFeatures; /**< Number of channels, used only for channel parallel kernels */ int LB; /**< If OutUB then Out=UB or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< If Out>UB then Out=UB */ signed char * __restrict__ MulBias; /**< Output Multiplicative bias */ } KerDP_fps_T; @@ -292,7 +292,7 @@ typedef struct { unsigned char Sy; /**< Stride Sy, used only if Sx!=Sy */ unsigned char Dy; /**< Dilation Dy, used only if Dx!=Dy */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the 
output */ } KerReLUPool_fp_T; typedef struct { @@ -342,7 +342,7 @@ typedef struct { unsigned char Sy; /**< Stride Sy, used only if Sx!=Sy */ unsigned char Dy; /**< Dilation Dy, used only if Dx!=Dy */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ } KerReLUPool_fps_T; typedef struct { @@ -388,7 +388,7 @@ typedef struct { unsigned char Norm; /**< Normalization factor */ signed char NormBias; /**< Bias Normalization factor */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ } KerLinearLayerReLU_fp_T; typedef struct { @@ -402,7 +402,7 @@ typedef struct { unsigned char Norm; /**< Normalization factor */ signed char NormBias; /**< Bias Normalization factor */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ } KerLinearLayerReLU_fps_T; typedef struct { @@ -416,7 +416,7 @@ typedef struct { unsigned char Norm; /**< Normalization factor */ signed char NormBias; /**< Normalization factor for the bias */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ } KerLinearLayerReLU_fp_fps_fp_T; typedef struct { @@ -430,9 +430,23 @@ typedef struct { unsigned char Norm; /**< Normalization factor */ signed char NormBias; /**< Normalization factor for the bias */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ } KerLinearLayerReLU_fps_fps_fp_T; +typedef struct { + signed char * __restrict__ In; /**< Pointer to input tile */ + unsigned short int InSize; /**< Size of the the tile */ + unsigned short int TotalInSize; /**< Total input size in case parallelization is performed on outputs */ + unsigned short int OutSize; /**< Size of the output tile */ + signed char * __restrict__ Filter; /**< Pointer to filter tile, width is TotalInSize */ + signed char * __restrict__ Bias; /**< Pointer to bias tile, size is OutSize */ + int * __restrict__ Out; /**< Pointer to output tile, size if OutSize */ + unsigned char Norm; /**< Normalization factor */ + signed char NormBias; /**< Normalization factor for the bias */ + int LB; /**< Lower bound for the output */ + int UB; /**< Upper bound for the output */ +} KerLinearLayerReLU_fps_fps_fpd_T; + typedef struct { short int * __restrict__ In; /**< Pointer to input tile */ unsigned short int InSize; /**< Size of the the tile */ @@ -444,7 +458,7 @@ typedef struct { unsigned char Norm; /**< Normalization factor */ signed char NormBias; /**< Normalization factor for the bias */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ } KerLinearLayerReLU_fp_fp_fpd_T; typedef struct { @@ -476,7 +490,7 @@ typedef struct { short int * __restrict__ Bias; /**< Pointer to Bias */ short int * __restrict__ Out; /**< Pointer to Out */ int LB; /**< Min output value */ - int UB; /**< Max output value or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Max output value */ char Norm; /**< Precision, 
input/output/filter */ signed char NormBias; /**< Bias precision */ char Oper; /**< Activation operation after linear layer, see CNN_Activation_Oper_T */ @@ -487,7 +501,7 @@ typedef struct { signed char * __restrict__ Bias; /**< Pointer to Bias */ signed char * __restrict__ Out; /**< Pointer to Out */ int LB; /**< Min output value */ - int UB; /**< Max output value or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Max output value */ char Norm; /**< Precision, input/output/filter */ signed char NormBias; /**< Bias precision */ char Oper; /**< Activation operation after linear layer, see CNN_Activation_Oper_T */ @@ -505,7 +519,7 @@ typedef struct { unsigned short int H; /**< Input Height */ unsigned short int N; /**< Number of (input, input, output) */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ unsigned char In1_Q; /**< In1 quantization */ unsigned char In2_Q; /**< In2 quantization */ unsigned char Out_Q; /**< Out quantization */ @@ -519,7 +533,7 @@ typedef struct { unsigned short int H; /**< Input Height */ unsigned short int N; /**< Number of (input, input, output) */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ unsigned char In1_Q; /**< In1 quantization */ unsigned char In2_Q; /**< In2 quantization */ unsigned char Out_Q; /**< Out quantization */ @@ -533,7 +547,7 @@ typedef struct { unsigned short int H; /**< Input Height */ unsigned short int N; /**< Number of channels/features */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ short int *__restrict__ ScaleScalar; /**< Pointer to optional Scalar value to be applied to all channels */ unsigned char Scale_Q; /**< Scalar fixed point format */ unsigned char Norm; /**< Normalization factor to be applied to result */ @@ -547,7 +561,7 @@ typedef struct { unsigned short int H; /**< Input Height */ unsigned short int N; /**< Number of channels/features */ int LB; /**< Lower bound for the output */ - int UB; /**< Upper bound for the output or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< Upper bound for the output */ signed char *__restrict__ ScaleScalar; /**< Pointer to optional Scalar value to be applied to all channels */ unsigned char Scale_Q; /**< Scalar fixed point format */ unsigned char Norm; /**< Normalization factor to be applied to result */ @@ -566,7 +580,7 @@ typedef struct { unsigned short int OutFirstCol; /**< Equal M2FirstCol */ short int * __restrict__ BufferColIn2; /**< In case vectorization is used will be used to copy a column of In2 into a line */ int LB; /**< If OutUB then Out=UB or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< If Out>UB then Out=UB */ unsigned char Norm; /**< Fixed point format */ signed char NormBias; /**< Precision of the bias */ unsigned char NormMulBias; /**< Precision of the multiplicative bias */ @@ -590,7 +604,7 @@ typedef struct { unsigned short int OutFirstCol; /**< Equal M2FirstCol */ short int * __restrict__ BufferColIn2; /**< In case vectorization is used will be used to copy a column of In2 into a line */ int LB; /**< If OutUB then Out=UB or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< If Out>UB then Out=UB */ unsigned 
char Norm; /**< Fixed point format */ signed char NormBias; /**< Precision of the bias */ unsigned char NormMulBias; /**< Precision of the multiplicative bias */ @@ -614,7 +628,7 @@ typedef struct { unsigned short int OutFirstCol; /**< Equal M2FirstCol */ signed char * __restrict__ BufferColIn2;/**< In case vectorization is used will be used to copy a column of In2 into a line */ int LB; /**< If OutUB then Out=UB or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< If Out>UB then Out=UB */ unsigned char Norm; /**< Fixed point format */ signed char NormBias; /**< Precision of the bias */ unsigned char NormMulBias; /**< Precision of the multiplicative bias */ @@ -638,7 +652,7 @@ typedef struct { unsigned short int OutFirstCol; /**< Equal M2FirstCol */ signed char * __restrict__ BufferColIn2;/**< In case vectorization is used will be used to copy a column of In2 into a line */ int LB; /**< If OutUB then Out=UB or pointer to upper bound vector if RELUN_VECTOR */ + int UB; /**< If Out>UB then Out=UB */ unsigned char Norm; /**< Fixed point format */ signed char NormBias; /**< Precision of the bias */ unsigned char NormMulBias; /**< Precision of the multiplicative bias */ @@ -1267,12 +1281,10 @@ extern void KerConvNxMDxDyStrideSxSy_DP_fps(KerConv_DP_fps_T *Arg); Feature maps of bytes (_fps) or half words (_fp) */ extern void KerParReLU_fp(KerReLUPool_fp_T *Arg); -extern void KerParReLUN_Vector_fp(KerReLUPool_fp_T *Arg); extern void KerParHswish_fp(KerReLUPool_fp_T *Arg); extern void KerParHsigmoid_fp(KerReLUPool_fp_T *Arg); extern void KerParReLU_fps(KerReLUPool_fps_T *Arg); -extern void KerParReLUN_Vector_fps(KerReLUPool_fps_T *Arg); extern void KerParHswish_fps(KerReLUPool_fps_T *Arg); extern void KerParHsigmoid_fps(KerReLUPool_fps_T *Arg); @@ -1280,9 +1292,7 @@ extern void KerParHsigmoid_fps(KerReLUPool_fps_T *Arg); Feature maps of bytes (_fps) or half words (_fp) */ extern void KerReLU_fp(KerReLUPool_fp_T *Arg); -extern void KerReLUN_Vector_fp(KerReLUPool_fp_T *Arg); extern void KerReLU_fps(KerReLUPool_fps_T *Arg); -extern void KerReLUN_Vector_fps(KerReLUPool_fps_T *Arg); /******************************************************************************************************************************/ /**************** DOUBLE PRECISION TO SINGLE PRECISION REDUCTION **************************************************************/ @@ -1293,26 +1303,20 @@ extern void KerReLUN_Vector_fps(KerReLUPool_fps_T *Arg); /* Input is Double precision on 32 bits Qx.2N, Output is Single precision on 16 bits Qx.N, input and output are disjoints */ extern void KerDP_fp(KerDP_fp_T *Arg); -extern void KerDP_ReLUN_Vector_fp(KerDP_fp_T *Arg); /* Input is Double precision on 32 bits Qx.2N, Output is Single precision on 16 bits Qx.N, input and output point to same location */ extern void KerDP_IO_fp(KerDP_fp_T *Arg); -extern void KerDP_IO_ReLUN_Vector_fp(KerDP_fp_T *Arg); /* Input is Double precision on 32 bits Qx.2N, Output is Single precision on 16 bits Qx.N, input and output are disjoints, same MulBias applied to all out channels */ extern void KerDPMulBiasScalar_fp(KerDP_fp_T *Arg); -extern void KerDPMulBiasScalar_ReLUN_Vector_fp(KerDP_fp_T *Arg); /* Input is Double precision on 32 bits Qx.2N, Output is Single precision on 16 bits Qx.N, input and output point to same location, same MulBias applied to all out channels */ extern void KerDPMulBiasScalar_IO_fp(KerDP_fp_T *Arg); -extern void KerDPMulBiasScalar_IO_ReLUN_Vector_fp(KerDP_fp_T *Arg); /* Input is Double precision on 32 bits Qx.2N, Output is 
Single precision on 16 bits Qx.N, input and output are disjoints, each out channel has its own MulBias */ extern void KerDPMulBias_fp(KerDP_fp_T *Arg); -extern void KerDPMulBias_ReLUN_Vector_fp(KerDP_fp_T *Arg); /* Input is Double precision on 32 bits Qx.2N, Output is Single precision on 16 bits Qx.N, input and output point to same location, each out channel has its own MulBias */ extern void KerDPMulBias_IO_fp(KerDP_fp_T *Arg); -extern void KerDPMulBias_IO_ReLUN_Vector_fp(KerDP_fp_T *Arg); /* Input is Double precision on 32 bits Qx.2N, Output is Single precision on 16 bits Qx.N, input and output are disjoints out is (in * ReLU6(in+3))/6 */ extern void KerDP_hswish_fp(KerDP_fp_T *Arg); @@ -1334,26 +1338,20 @@ extern void KerDP_IO_leakyrelu_fp(KerDP_fp_T *Arg); /* Input is Double precision on 16 or 32 bits Qx.2N, Output is Single precision on 8 bits Qx.N, input and output are disjoints */ extern void KerDP_fps(KerDP_fps_T *Arg); -extern void KerDP_ReLUN_Vector_fps(KerDP_fps_T *Arg); /* Input is Double precision on 16 or 32 bits Qx.2N, Output is Single precision on 8 bits Qx.N, input and output point to same location */ extern void KerDP_IO_fps(KerDP_fps_T *Arg); -extern void KerDP_IO_ReLUN_Vector_fps(KerDP_fps_T *Arg); /* Input is Double precision on 16 or 32 bits Qx.2N, Output is Single precision on 8 bits Qx.N, input and output are disjoints, same MulBias applied to all out channels */ extern void KerDPMulBiasScalar_fps(KerDP_fps_T *Arg); -extern void KerDPMulBiasScalar_ReLUN_Vector_fps(KerDP_fps_T *Arg); /* Input is Double precision on 16 or 32 bits Qx.2N, Output is Single precision on 8 bits Qx.N, input and output point to same location, same MulBias applied to all out channels */ extern void KerDPMulBiasScalar_IO_fps(KerDP_fps_T *Arg); -extern void KerDPMulBiasScalar_IO_ReLUN_Vector_fps(KerDP_fps_T *Arg); /* Input is Double precision on 16 or 32 bits Qx.2N, Output is Single precision on 8 bits Qx.N, input and output are disjoints, each out channel has its own MulBias */ extern void KerDPMulBias_fps(KerDP_fps_T *Arg); -extern void KerDPMulBias_ReLUN_Vector_fps(KerDP_fps_T *Arg); /* Input is Double precision on 16 or 32 bits Qx.2N, Output is Single precision on 8 bits Qx.N, input and output point to same location, each out channel has its own MulBias */ extern void KerDPMulBias_IO_fps(KerDP_fps_T *Arg); -extern void KerDPMulBias_IO_ReLUN_Vector_fps(KerDP_fps_T *Arg); /* Input is Double precision on 16 or 32 bits Qx.2N, Output is Single precision on 8 bits Qx.N, input and output are disjoints out is (in * ReLU6(in+3))/6 */ extern void KerDP_hswish_fps(KerDP_fps_T *Arg); @@ -1441,15 +1439,10 @@ extern void KerPoolNxMStrideSxSy_fps(KerReLUPool_fps_T *Arg); /* A single output is evaluated in parallel on all cores */ extern void KerLinearLayerReLU_fp(KerLinearLayerReLU_fp_T *Arg); -extern void KerLinearLayerReLUN_Vector_fp(KerLinearLayerReLU_fp_T *Arg); extern void KerLinearLayerReLU_fps(KerLinearLayerReLU_fps_T *Arg); -extern void KerLinearLayerReLUN_Vector_fps(KerLinearLayerReLU_fps_T *Arg); extern void KerLinearLayerReLU_fp_fps_fp(KerLinearLayerReLU_fp_fps_fp_T *Arg); -extern void KerLinearLayerReLUN_Vector_fp_fps_fp(KerLinearLayerReLU_fp_fps_fp_T *Arg); extern void KerLinearLayerReLU_fp_fp_fpd(KerLinearLayerReLU_fp_fp_fpd_T *Arg); -extern void KerLinearLayerReLUN_Vector_fp_fp_fpd(KerLinearLayerReLU_fp_fp_fpd_T *Arg); extern void KerLinearLayerReLU_fps_fps_fp(KerLinearLayerReLU_fps_fps_fp_T *Arg); -extern void KerLinearLayerReLUN_Vector_fps_fps_fp(KerLinearLayerReLU_fps_fps_fp_T *Arg); /* 
A single output is evaluated in parallel on all cores, double precision output, need reduction step after */ extern void KerDPLinearLayer_fp(KerDPLinearLayer_fp_T *Arg); @@ -1460,21 +1453,17 @@ extern void KerDPLinearLayerReduct_fps(KerDPLinearLayerReduct_fps_T *Arg); /* Several output are evaluated in parallel, one per core */ extern void KerParLinearLayerReLU_fp(KerLinearLayerReLU_fp_T *Arg); -extern void KerParLinearLayerReLUN_Vector_fp(KerLinearLayerReLU_fp_T *Arg); extern void KerParLinearLayerHswish_fp(KerLinearLayerReLU_fp_T *Arg); extern void KerParLinearLayerHsigmoid_fp(KerLinearLayerReLU_fp_T *Arg); extern void KerParLinearLayerReLU_fps(KerLinearLayerReLU_fps_T *Arg); -extern void KerParLinearLayerReLUN_Vector_fps(KerLinearLayerReLU_fps_T *Arg); extern void KerParLinearLayerHswish_fps(KerLinearLayerReLU_fps_T *Arg); extern void KerParLinearLayerHsigmoid_fps(KerLinearLayerReLU_fps_T *Arg); extern void KerParLinearLayerReLU_fps_fps_fp(KerLinearLayerReLU_fps_fps_fp_T *Arg); -extern void KerParLinearLayerReLUN_Vector_fps_fps_fp(KerLinearLayerReLU_fps_fps_fp_T *Arg); +extern void KerParLinearLayerReLU_fps_fps_fpd(KerLinearLayerReLU_fps_fps_fpd_T *Arg); extern void KerParLinearLayerReLU_fp_fps_fp(KerLinearLayerReLU_fp_fps_fp_T *Arg); -extern void KerParLinearLayerReLUN_Vector_fp_fps_fp(KerLinearLayerReLU_fp_fps_fp_T *Arg); extern void KerParLinearLayerReLU_fp_fp_fpd(KerLinearLayerReLU_fp_fp_fpd_T *Arg); -extern void KerParLinearLayerReLUN_Vector_fp_fp_fpd(KerLinearLayerReLU_fp_fp_fpd_T *Arg); /******************************************************************************************************************************/ @@ -1491,73 +1480,51 @@ extern void KerParMatAddReLU_fp(KerMat3_fp_T *Arg); extern void KerParMatAddReLU_fps(KerMat3_fps_T *Arg); extern void KerParMatMul_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMul_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulSxSy_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulSxSy_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMul_fpd_fp(KerMatMul_fpd_fp_T *Arg); -extern void KerParMatMul_ReLUN_Vector_fpd_fp(KerMatMul_fpd_fp_T *Arg); extern void KerParMatMulSxSy_fpd_fp(KerMatMul_fpd_fp_T *Arg); -extern void KerParMatMulSxSy_ReLUN_Vector_fpd_fp(KerMatMul_fpd_fp_T *Arg); extern void KerParMatMulScaleScalar_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulScaleScalar_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulScaleScalarSxSy_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulScaleScalarSxSy_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulScaleScalar_fpd_fp(KerMatMul_fpd_fp_T *Arg); -extern void KerParMatMulScaleScalar_ReLUN_Vector_fpd_fp(KerMatMul_fpd_fp_T *Arg); extern void KerParMatMulScaleScalarSxSy_fpd_fp(KerMatMul_fpd_fp_T *Arg); -extern void KerParMatMulScaleScalarSxSy_ReLUN_Vector_fpd_fp(KerMatMul_fpd_fp_T *Arg); extern void KerParMatMulScale_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulScale_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulScaleSxSy_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulScaleSxSy_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulScale_fpd_fp(KerMatMul_fpd_fp_T *Arg); -extern void KerParMatMulScale_ReLUN_Vector_fpd_fp(KerMatMul_fpd_fp_T *Arg); extern void KerParMatMulScaleSxSy_fpd_fp(KerMatMul_fpd_fp_T *Arg); -extern void KerParMatMulScaleSxSy_ReLUN_Vector_fpd_fp(KerMatMul_fpd_fp_T *Arg); extern void KerParMatMul_fps(KerMatMul_fps_T *Arg); -extern void KerParMatMul_ReLUN_Vector_fps(KerMatMul_fps_T *Arg); extern void 
KerParMatMulSxSy_fps(KerMatMul_fps_T *Arg); -extern void KerParMatMulSxSy_ReLUN_Vector_fps(KerMatMul_fps_T *Arg); extern void KerParMatMul_fp_fps(KerMatMul_fp_fps_T *Arg); -extern void KerParMatMul_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg); extern void KerParMatMulSxSy_fp_fps(KerMatMul_fp_fps_T *Arg); -extern void KerParMatMulSxSy_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg); extern void KerParMatMulScaleScalar_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulScaleScalarSxSy_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulScaleScalar_fp_fps(KerMatMul_fp_fps_T *Arg); -extern void KerParMatMulScaleScalar_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg); extern void KerParMatMulScaleScalarSxSy_fp_fps(KerMatMul_fp_fps_T *Arg); -extern void KerParMatMulScaleScalarSxSy_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg); extern void KerParMatMulScale_fps(KerMatMul_fps_T *Arg); -extern void KerParMatMulScale_ReLUN_Vector_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulScaleSxSy_fps(KerMatMul_fps_T *Arg); -extern void KerParMatMulScaleSxSy_ReLUN_Vector_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulScale_fp_fps(KerMatMul_fp_fps_T *Arg); -extern void KerParMatMulScale_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg); extern void KerParMatMulScaleSxSy_fp_fps(KerMatMul_fp_fps_T *Arg); -extern void KerParMatMulScaleSxSy_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg); extern void KerParMatMulHswish_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulHswishSxSy_fp(KerMatMul_fp_T *Arg); @@ -1575,48 +1542,36 @@ extern void KerParMatMulLeakyrelu_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulLeakyreluSxSy_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulSmallFeat_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulSmallFeat_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulHswishSmallFeat_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulHsigmoidSmallFeat_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulLeakyreluSmallFeat_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulScaleScalarSmallFeat_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulScaleScalarSmallFeat_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulScaleSmallFeat_fp(KerMatMul_fp_T *Arg); -extern void KerParMatMulScaleSmallFeat_ReLUN_Vector_fp(KerMatMul_fp_T *Arg); extern void KerParMatMulSmallFeat_fps(KerMatMul_fps_T *Arg); -extern void KerParMatMulSmallFeat_ReLUN_Vector_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulHswishSmallFeat_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulHsigmoidSmallFeat_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulLeakyreluSmallFeat_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulScaleScalarSmallFeat_fps(KerMatMul_fps_T *Arg); -extern void KerParMatMulScaleScalarSmallFeat_ReLUN_Vector_fps(KerMatMul_fps_T *Arg); extern void KerParMatMulScaleSmallFeat_fps(KerMatMul_fps_T *Arg); -extern void KerParMatMulScaleSmallFeat_ReLUN_Vector_fps(KerMatMul_fps_T *Arg); extern void KerParMatScaleVector_fp(KerMatScale_fp_T *Arg); -extern void KerParMatScaleVector_ReLUN_Vector_fp(KerMatScale_fp_T *Arg); extern void KerParMatScaleVector_fps(KerMatScale_fps_T *Arg); -extern void KerParMatScaleVector_ReLUN_Vector_fps(KerMatScale_fps_T *Arg); extern void KerParMatScaleScalar_fp(KerMatScale_fp_T *Arg); -extern void KerParMatScaleScalar_ReLUN_Vector_fp(KerMatScale_fp_T *Arg); extern void KerParMatScaleScalar_fps(KerMatScale_fps_T *Arg); -extern void KerParMatScaleScalar_ReLUN_Vector_fps(KerMatScale_fps_T *Arg); extern void KerParMatScaleVectorScalar_fp(KerMatScale_fp_T *Arg); -extern void 
KerParMatScaleVectorScalar_ReLUN_Vector_fp(KerMatScale_fp_T *Arg); extern void KerParMatScaleVectorScalar_fps(KerMatScale_fps_T *Arg); -extern void KerParMatScaleVectorScalar_ReLUN_Vector_fps(KerMatScale_fps_T *Arg); extern void CNN_ParTranspose_fps(KerMatTranspose_fps_T *Arg); extern void CNN_ParTransposeSxSy_fps(KerMatTranspose_fps_T *Arg); diff --git a/tools/autotiler_v3/generators/CNN/CNN_BasicKernels_SQ8.h b/tools/autotiler_v3/generators/CNN/CNN_BasicKernels_SQ8.h new file mode 100644 index 000000000..d82244007 --- /dev/null +++ b/tools/autotiler_v3/generators/CNN/CNN_BasicKernels_SQ8.h @@ -0,0 +1,745 @@ +#ifndef __CNN_BASICKERNELS_SQ8__ +#define __CNN_BASICKERNELS_SQ8__ +#include "Gap.h" + +#ifdef __pulp__ +#ifndef Min +#define Min(a, b) __builtin_pulp_minsi((a), (b)) +#endif +#ifndef Max +#define Max(a, b) __builtin_pulp_maxsi((a), (b)) +#endif +#else +#define Min(a, b) (((a)<(b))?(a):(b)) +#define Max(a, b) (((a)>(b))?(a):(b)) +#endif + + +#define NORM_ROUND +#ifdef NORM_ROUND +#define AT_NORM(x, n) gap_roundnorm_reg((x), (n)) +#else +#define AT_NORM(x, n) gap_norm_reg((x), (n)) +#endif + +#define AT_SCALE(X, Scale, ScaleN) AT_NORM((X)*(Scale), (ScaleN)) + +#define AT_LSHIFT(x, n) ((x)<<(n)) +#define AT_RSHIFT(x, n) ((x)>>(n)) + +#ifdef GENASM +#ifdef __EMUL__ +#define gap_ncore() 8 +#define gap_coreid() __builtin_pulp_CoreId() +#endif +#endif + +#define AT_INF_BIASL_SM 0 +#define AT_INF_ACTSCALE 0 +#define AT_INF_ACTSCALEN 1 +#define AT_INF_A0 2 +#define AT_INF_B0 3 +#define AT_INF_C0 4 + +#define AT_INF_BIASN 5 +#define AT_INF_IN1SCALE 5 +#define AT_INF_SCALE 5 + +#define AT_INF_SCALEN 6 +#define AT_INF_IN1SCALEN 6 + +#define AT_INF_OUTSCALE 7 +#define AT_INF_OUTSCALEN 8 + +#define AT_INF_DIM 9 + +typedef enum { + ACT_NONE = 0, + ACT_RELU, + ACT_RELUN, + ACT_HSIGMOID, + ACT_HSWISH, + ACT_LEAKYRELU, +} CNN_ActivationOper_T; + +/****************************************************************************************************************** + Bias setting for convolution and linear layers +******************************************************************************************************************/ +typedef struct { + int *__restrict__ Out; + unsigned short int W; + unsigned short int H; + unsigned short int Feat; + void *__restrict__ Bias; + unsigned char NormBias; +} KerSetBias_SQ8_T; + +/****************************************************************************************************************** + Linear, Bias 32b, 16b or 8b + 1) 32b output with Bias assumed to be set in Out before, need channel scaling and reduction after + 2) Output can be fully evaluated (FullFeat) then Bias setting scaling and activation (ReLU, ReLUN) is + performed in a single shot +******************************************************************************************************************/ +typedef struct { + signed char * __restrict__ In; + signed char * __restrict__ Weights; + void *__restrict__ Bias; + int * __restrict__ Out; + unsigned short int InDim; + unsigned short int TotalInDim; + unsigned short int OutDim; + unsigned char *__restrict__ Scale; + unsigned char *__restrict__ ScaleN; + signed char *__restrict__ Infos; +} KerLinear_SQ8_T; + +/****************************************************************************************************************** + Point Wise and Depth Convolution, 32b output + Bias assumed to be set in Out before for Point Wise conv, directly managed for depth wise conv + Need scaling and reduction after 
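/*
 * Illustrative sketch only (not part of the library): a plain-C emulation of the
 * requantization step behind the AT_NORM/AT_SCALE macros defined above. With
 * NORM_ROUND defined, gap_roundnorm_reg(x, n) is assumed to behave as a
 * round-to-nearest arithmetic right shift by n, so AT_SCALE(Acc, Scale, ScaleN)
 * multiplies a 32b accumulator by an 8b per-channel Scale and shifts back by ScaleN.
 * Scale and ScaleN are the bytes stored in the Infos block at AT_INF_ACTSCALE and
 * AT_INF_ACTSCALEN (see the LeakyReLU kernel earlier in this patch). RoundNorm and
 * RequantSQ8 are illustrative helper names; the final clip to the signed 8b range is
 * an assumption about how a caller would narrow the result.
 */
#include <stdint.h>

static inline int32_t RoundNorm(int32_t x, unsigned int n)
{
	/* Rounding arithmetic shift right, matching the NORM_ROUND variant of AT_NORM */
	return (n == 0) ? x : ((x + (1 << (n - 1))) >> n);
}

static inline int8_t RequantSQ8(int32_t Acc, uint8_t Scale, uint8_t ScaleN)
{
	int32_t v = RoundNorm(Acc * (int32_t) Scale, ScaleN);	/* AT_SCALE(Acc, Scale, ScaleN) */
	if (v >  127) v =  127;					/* saturate to signed 8b output range */
	if (v < -128) v = -128;
	return (int8_t) v;
}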
+******************************************************************************************************************/ + +typedef struct { + signed char * __restrict__ In; /**< Pointer to input tile */ + unsigned short int W; /**< Width of the input tile */ + unsigned short int UsedW; /**< Part of the input width to be used */ + unsigned short int H; /**< Height of the input tile */ + unsigned short int UsedH; /**< Part of the input height to be used */ + unsigned short int InFeatures; /**< Number of input features, used for channel parallel kernels */ + unsigned short int OutFeatures; /**< Number of output features, used for channel parallel kernels */ + unsigned short int TotalInFeatures; /**< Total Input feature space in current tile */ + signed char * __restrict__ Filter; /**< Pointer to convolution coefficients. (Nx x Ny) coeffs in Q15 */ + signed char * __restrict__ Bias; /**< Pointer to bias tile, used when convolution is depth wise */ + int * __restrict__ Out; /**< Pointer to output tile, this tile can have up to N-1 lines and N-1 column than In depending on Pad */ + v4s Pad; /**< Paddding, 0: Left, 1: Right, 2: Top, 3: Bottom */ + unsigned char NormBias; /**< Bias init correction factor */ + unsigned char Orientation; /**< Tile orientation: 1 => Horizontal, 0 => Vertical, used only for feature parallel kernels */ + unsigned char N; /**< Dimension of the convolution: Nx, NxN, used only for general versions */ + unsigned char S; /**< Output stride, S, used only for general versions */ + unsigned char D; /**< Dilation Dx */ + unsigned char Ny; /**< Filter Ny, used only if Nx!=Ny */ + unsigned char Sy; /**< Stride Sy, used only if Sx!=Sy */ + unsigned char Dy; /**< Dilation Dy, used only if Dx!=Dy */ +} KerConv_SQ8_T; + +/****************************************************************************************************************** + Reduction scaling and activation after double precision convolution or linear layer +******************************************************************************************************************/ +typedef struct { + int *__restrict__ In; + void *__restrict__ Out; + unsigned short int Feat; + unsigned short int W; + unsigned short int H; + unsigned char * __restrict__ Scale; + unsigned char * __restrict__ ScaleN; + signed char * __restrict__ Infos; +} KerConvLinReduct_SQ8_T; + +/****************************************************************************************************************** + Standalone scaling and activation +******************************************************************************************************************/ +typedef struct { + signed char *__restrict__ In; + signed char *__restrict__ Out; + unsigned short int Feat; + unsigned short int W; + unsigned short int H; + signed char * __restrict__ Infos; +} KerActivation_SQ8_T; + + +/****************************************************************************************************************** + Pooling followed by optional scaling and activation +******************************************************************************************************************/ +typedef struct { + signed char * __restrict__ In; + signed char * __restrict__ Out; + unsigned short int Feat; + unsigned short int W; + unsigned short int UsedW; + unsigned short int H; + unsigned short int UsedH; + unsigned char PoolMax; + unsigned char FS; /* Filter Size, x */ + unsigned char FSy; /* Filter Size, y */ + unsigned char S; /* Filter Stride, x */ + unsigned char Sy; /* Filter Stride, y */ + 
unsigned char D; + unsigned char Dy; + unsigned char Orientation; + unsigned char DoScale; + v4s Pad; + signed char * __restrict__ Infos; +} KerPool_SQ8_T; + +/****************************************************************************************************************** + Global average Pooling followed by optional scaling and activation +******************************************************************************************************************/ +typedef struct { + void * __restrict__ In; /**< Pointer to input tile */ + unsigned short int W; /**< Width of the input tile */ + unsigned short int H; /**< Height of the input tile */ + unsigned short int Feat; /**< Number of features, used only for channel parallel kernels */ + unsigned char FirstTile; /**< 1 if this tile is the first one of its group */ + unsigned char DoScale; /**< Apply Scaling */ + void * __restrict__ Out; /**< Pointer to output tile */ + signed char * __restrict__ Infos; +} KerGlobalPool_SQ8_T; + + +/****************************************************************************************************************** + Tensor multiplication used for 1x1 convolution. Channel centric scaling. +******************************************************************************************************************/ + +typedef struct { + signed char * __restrict__ In1; /**< First input matrix tile, convolution weights */ + unsigned short int W_In1; /**< First input matrix tile width */ + unsigned short int H_In1; /**< First input matrix tile height */ + signed char * __restrict__ In2; /**< Second input matrix tile, convolution features */ + unsigned short int W_In2; /**< Second input matrix tile width, height is by construction H_In1 */ + void * __restrict__ Bias; /**< Bias input tile, convolution bias */ + unsigned char * __restrict__ Scale; /**< Scale Factor to be applied after convolution */ + unsigned char * __restrict__ ScaleN; /**< Normalization Factor to be applied after scaling */ + signed char * __restrict__ Out; /**< Output matrix tile, W=W_In2, H=H_In1 by construction */ + unsigned short int W_Out; /**< Output matrix full width */ + unsigned short int OutFirstCol; /**< Equal M2FirstCol */ + signed char * __restrict__ BufferColIn2;/**< In case vectorization is used will be used to copy a column of In2 into a line */ + unsigned char NormBias; /**< Normalization factor to be applied to Bias */ + unsigned char ColFirst; /**< 1 if product is formed with a vertical tile from In1 and a horizontal from In2, 0 if Hor tile In1 Ver tile In2 */ + unsigned char Sx; /**< When used for 1x1 convolution In2 is a feature maps [H_In2=W_In1=InFeat, W_In2=W*H], Sx applies to W and Sy to H */ + unsigned char Sy; /**< When used for 1x1 convolution In2 is a feature maps [H_In2=W_In1=InFeat, W_In2=W*H], Sy applies to W and Sy to H */ + unsigned short int W; /**< When used for 1x1 convolution In2 is a feature maps [H_In2=W_In1=InFeat, W_In2=W*H], W */ + unsigned short int H; /**< When used for 1x1 convolution In2 is a feature maps [H_In2=W_In1=InFeat, W_In2=W*H], H */ + signed char * __restrict__ Infos; /**< Scaling and constants data */ +} KerMatMul_SQ8_T; + +/****************************************************************************************************************** + Tensor addition with optional In1 and Output scaling +******************************************************************************************************************/ + +typedef struct { + signed char * __restrict__ In1; /**< First input tensor */ + signed char * 
__restrict__ In2; /**< Second input tensor */ + signed char * __restrict__ Out; /**< Onput tensor */ + unsigned short int Feat; /**< Number of features */ + unsigned short int W; /**< Feature width */ + unsigned short int H; /**< Feature height */ + unsigned char DoScale; /**< Apply Scaling */ + signed char * __restrict__ Infos; /**< Scaling and constants data */ +} KerMat3_SQ8_T; + +/****************************************************************************************************************** + Matrix transposition and 3D Tensor dim permutation +******************************************************************************************************************/ + +typedef struct { + signed char *__restrict__ In; /**< Input matrix */ + signed char *__restrict__ Out; /**< Output matrix */ + unsigned int Feat; /**< Number of matrices */ + unsigned int W; /**< Matrix width */ + unsigned int H; /**< Matrix height */ + unsigned char Sx; /**< Stride for W dimension */ + unsigned char Sy; /**< Stride for H dimension */ +} KerMatTranspose_fps_T; + + +/****************************************************************************************************************** + SoftMax, Q15 output +******************************************************************************************************************/ + +typedef struct { + signed char *__restrict__ In; /**< Pointer to input tile */ + unsigned short int N; /**< Size of the tile */ + unsigned short int Norm; /**< Normalization factor */ + short int *__restrict__ Out; /**< Pointer to output tile */ + signed char * __restrict__ Infos; /**< Scaling and constants data */ +} KerSoftMax_SQ8_T; + +/****************************************************************************************************************** + Bias setting for convolution and linear layers, output is 32b, input is 8,16 or 32b +******************************************************************************************************************/ + +/* Features in parallel */ +void KerParSetBiasB32_SQ8(KerSetBias_SQ8_T *Arg); +void KerParSetBiasB16_SQ8(KerSetBias_SQ8_T *Arg); +void KerParSetBiasB8_SQ8(KerSetBias_SQ8_T *Arg); + +/* Each feature in parallel */ +void KerSetBiasB32_SQ8(KerSetBias_SQ8_T *Arg); +void KerSetBiasB16_SQ8(KerSetBias_SQ8_T *Arg); +void KerSetBiasB8_SQ8(KerSetBias_SQ8_T *Arg); + + +/****************************************************************************************************************** + Convolution, Bias is assigned separately. Output is 32 bits. 
Parallel Features and Features Parallel +******************************************************************************************************************/ + +void KerParConv1x1Stride1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv1x1Stride2_SQ8(KerConv_SQ8_T *Arg); +void KerParConv1x1StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerParConv3x1Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv3x1Stride2x1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv1x3Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv1x3Stride1x2_SQ8(KerConv_SQ8_T *Arg); +void KerParConv3x3Stride1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv3x3Stride2_SQ8(KerConv_SQ8_T *Arg); +void KerParConv3x3StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerParConv5x1Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv5x1Stride2x1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv1x5Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv1x5Stride1x2_SQ8(KerConv_SQ8_T *Arg); +void KerParConv5x5Stride1_SQ8(KerConv_SQ8_T *Arg); +void KerParConv5x5Stride2_SQ8(KerConv_SQ8_T *Arg); +void KerParConv5x5StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerParConv7x7StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerParConvNxNStrideS_SQ8(KerConv_SQ8_T *Arg); +void KerParConvNxMStrideSxSy_SQ8(KerConv_SQ8_T *Arg); +void KerParConvNxMDxDyStrideSxSy_SQ8(KerConv_SQ8_T *Arg); + + +void KerConv1x1Stride1_SQ8(KerConv_SQ8_T *Arg); +void KerConv1x1Stride2_SQ8(KerConv_SQ8_T *Arg); +void KerConv1x1StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerConv3x1Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerConv3x1Stride2x1_SQ8(KerConv_SQ8_T *Arg); +void KerConv1x3Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerConv1x3Stride1x2_SQ8(KerConv_SQ8_T *Arg); +void KerConv3x3Stride1_SQ8(KerConv_SQ8_T *Arg); +void KerConv3x3Stride2_SQ8(KerConv_SQ8_T *Arg); +void KerConv3x3StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerConv5x1Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerConv5x1Stride2x1_SQ8(KerConv_SQ8_T *Arg); +void KerConv1x5Stride1x1_SQ8(KerConv_SQ8_T *Arg); +void KerConv1x5Stride1x2_SQ8(KerConv_SQ8_T *Arg); +void KerConv5x5Stride1_SQ8(KerConv_SQ8_T *Arg); +void KerConv5x5Stride2_SQ8(KerConv_SQ8_T *Arg); +void KerConv5x5StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerConv7x7StrideS_SQ8(KerConv_SQ8_T *Arg); +void KerConvNxNStrideS_SQ8(KerConv_SQ8_T *Arg); +void KerConvNxMStrideSxSy_SQ8(KerConv_SQ8_T *Arg); +void KerConvNxMDxDyStrideSxSy_SQ8(KerConv_SQ8_T *Arg); + + +/****************************************************************************************************************** + Depth Wise convolution, 8b, 16b and 32b Bias. Output is 32 bits. 
Parallel Features and Features Parallel +******************************************************************************************************************/ + +/* 8 Bits Bias, Features in parallel */ +void KerParConvDW1x1Stride1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x1Stride2B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x1StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x1Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x1Stride2x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x3Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x3Stride1x2B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3Stride1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3Stride2B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x1Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x1Stride2x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x5Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x5Stride1x2B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5Stride1B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5Stride2B8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW7x7StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxNStrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxMStrideSxSyB8_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxMDxDyStrideSxSyB8_SQ8(KerConv_SQ8_T *Arg); + +/* 16 Bits Bias, Features in parallel */ +void KerParConvDW1x1Stride1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x1Stride2B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x1StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x1Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x1Stride2x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x3Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x3Stride1x2B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3Stride1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3Stride2B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x1Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x1Stride2x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x5Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x5Stride1x2B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5Stride1B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5Stride2B16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW7x7StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxNStrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxMStrideSxSyB16_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxMDxDyStrideSxSyB16_SQ8(KerConv_SQ8_T *Arg); + +/* 32 Bits Bias, Features in parallel */ +void KerParConvDW1x1Stride1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x1Stride2B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x1StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x1Stride1x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x1Stride2x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x3Stride1x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x3Stride1x2B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3Stride1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3Stride2B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW3x3StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x1Stride1x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x1Stride2x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x5Stride1x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW1x5Stride1x2B32_SQ8(KerConv_SQ8_T *Arg); +void 
KerParConvDW5x5Stride1B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5Stride2B32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW5x5StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDW7x7StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxNStrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxMStrideSxSyB32_SQ8(KerConv_SQ8_T *Arg); +void KerParConvDWNxMDxDyStrideSxSyB32_SQ8(KerConv_SQ8_T *Arg); + +/* 8 Bits Bias, One Feature in parallel */ +void KerConvDW1x1Stride1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x1Stride2B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x1StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x1Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x1Stride2x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x3Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x3Stride1x2B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3Stride1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3Stride2B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x1Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x1Stride2x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x5Stride1x1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x5Stride1x2B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5Stride1B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5Stride2B8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW7x7StrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxNStrideSB8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxMStrideSxSyB8_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxMDxDyStrideSxSyB8_SQ8(KerConv_SQ8_T *Arg); + +/* 16 Bits Bias, One Feature in parallel */ +void KerConvDW1x1Stride1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x1Stride2B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x1StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x1Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x1Stride2x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x3Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x3Stride1x2B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3Stride1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3Stride2B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x1Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x1Stride2x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x5Stride1x1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x5Stride1x2B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5Stride1B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5Stride2B16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW7x7StrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxNStrideSB16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxMStrideSxSyB16_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxMDxDyStrideSxSyB16_SQ8(KerConv_SQ8_T *Arg); + +/* 32 Bits Bias, One Feature in parallel */ +void KerConvDW1x1Stride1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x1Stride2B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x1StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x1Stride1x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x1Stride2x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x3Stride1x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x3Stride1x2B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3Stride1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3Stride2B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW3x3StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x1Stride1x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x1Stride2x1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW1x5Stride1x1B32_SQ8(KerConv_SQ8_T 
*Arg); +void KerConvDW1x5Stride1x2B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5Stride1B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5Stride2B32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW5x5StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDW7x7StrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxNStrideSB32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxMStrideSxSyB32_SQ8(KerConv_SQ8_T *Arg); +void KerConvDWNxMDxDyStrideSxSyB32_SQ8(KerConv_SQ8_T *Arg); + +/****************************************************************************************************************** + Input Scaling followed by an optional activation. Parallel Feature, Feature Parallel + Input is assumed to be the 32b unnormalized output of a convolution or a linear layer + Optional activation is applied to the scaled input and can be optionaly scaled also + Output is a scaled 8b quantized tensor + Channel Centric (CC) +******************************************************************************************************************/ + +/* + * Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated in parallel +*/ +void KerParReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReduct_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReduct_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReduct_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReduct_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReduct_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); + +/* + * Input Scaling and reduction to 8b then channel centric activation, Out location = In location. Features are evaluated in parallel +*/ +void KerParReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerParReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); + +/* + * Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated one after the other in parallel +*/ +void KerReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReduct_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReduct_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReduct_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReduct_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReduct_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); + +/* + * Input Scaling and reduction to 8b then channel centric activation, Out location = In location. Features are evaluated one after the other in parallel +*/ +void KerReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg); +void KerReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg); + +/****************************************************************************************************************** + Stand alone activation. 
Parallel Feature, Feature Parallel + Input is a scaled 8b tensor + Output is a scaled 8b tensor, Scale can be different from the one of input +******************************************************************************************************************/ + +/* + * Standalone Scaled Activation, Features are evaluated in parallel +*/ +void KerPar_ReLU_SQ8(KerActivation_SQ8_T *Arg); +void KerPar_ReLUN_SQ8(KerActivation_SQ8_T *Arg); +void KerPar_HSigmoid_SQ8(KerActivation_SQ8_T *Arg); +void KerPar_HSwish_SQ8(KerActivation_SQ8_T *Arg); +void KerPar_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg); + +/* + * Standalone Scaled Activation, Features are evaluated one after the other in parallel +*/ +void Ker_ReLU_SQ8(KerActivation_SQ8_T *Arg); +void Ker_ReLUN_SQ8(KerActivation_SQ8_T *Arg); +void Ker_HSigmoid_SQ8(KerActivation_SQ8_T *Arg); +void Ker_HSwish_SQ8(KerActivation_SQ8_T *Arg); +void Ker_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg); + + +/****************************************************************************************************************** + Pooling group. + Performs Max, Average or Global average pooling followed by an optional Scaling or Scaling and Activation. + Several output feature maps are evaluated in parallel, one feature map per core +******************************************************************************************************************/ + +void KerParPool2x2Stride2_SQ8(KerPool_SQ8_T *Arg); +void KerParPool2x2Stride2_ReLU_SQ8(KerPool_SQ8_T *Arg); +void KerParPool2x2Stride2_ReLUN_SQ8(KerPool_SQ8_T *Arg); + +void KerParPoolNxNStrideS_SQ8(KerPool_SQ8_T *Arg); +void KerParPoolNxNStrideS_ReLU_SQ8(KerPool_SQ8_T *Arg); +void KerParPoolNxNStrideS_ReLUN_SQ8(KerPool_SQ8_T *Arg); + +void KerParPoolNxMStrideSxSy_SQ8(KerPool_SQ8_T *Arg); +void KerParPoolNxMStrideSxSy_ReLU_SQ8(KerPool_SQ8_T *Arg); +void KerParPoolNxMStrideSxSy_ReLUN_SQ8(KerPool_SQ8_T *Arg); + +void KerParGlobalMaxPool_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalMaxPool_Reduct_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalMaxPool_Reduct_ReLU_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalMaxPool_Reduct_ReLUN_SQ8(KerGlobalPool_SQ8_T *Arg); + +void KerParGlobalAvgPool_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalAvgPool_Reduct_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalAvgPool_Reduct_ReLU_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalAvgPool_Reduct_ReLUN_SQ8(KerGlobalPool_SQ8_T *Arg); + +void KerParGlobalMaxPoolFullFeat_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalMaxPoolFullFeat_ReLU_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalMaxPoolFullFeat_ReLUN_SQ8(KerGlobalPool_SQ8_T *Arg); + +void KerParGlobalAvgPoolFullFeat_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalAvgPoolFullFeat_ReLU_SQ8(KerGlobalPool_SQ8_T *Arg); +void KerParGlobalAvgPoolFullFeat_ReLUN_SQ8(KerGlobalPool_SQ8_T *Arg); + + +/************************************************************************************************************************************************* + Pooling group. + Performs Max or Average pooling followed by an optional Scaling or Scaling and Activation. 
+ One output feature map is evaluated in parallel on all cores +*************************************************************************************************************************************************/ + +void KerPool2x2Stride2_SQ8(KerPool_SQ8_T *Arg); +void KerPool2x2Stride2_ReLU_SQ8(KerPool_SQ8_T *Arg); +void KerPool2x2Stride2_ReLUN_SQ8(KerPool_SQ8_T *Arg); + +void KerPoolNxNStrideS_SQ8(KerPool_SQ8_T *Arg); +void KerPoolNxNStrideS_ReLU_SQ8(KerPool_SQ8_T *Arg); +void KerPoolNxNStrideS_ReLUN_SQ8(KerPool_SQ8_T *Arg); + +void KerPoolNxMStrideSxSy_SQ8(KerPool_SQ8_T *Arg); +void KerPoolNxMStrideSxSy_ReLU_SQ8(KerPool_SQ8_T *Arg); +void KerPoolNxMStrideSxSy_ReLUN_SQ8(KerPool_SQ8_T *Arg); + + +/************************************************************************************************************************************************* + Tensor Addition with Input1 and Output optionally scaled (Tensor centric) followed by optional activation +*************************************************************************************************************************************************/ + +void KerParMatAdd_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatAdd_ReLU_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatAdd_ReLUN_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatAdd_HSigmoid_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatAdd_HSwish_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatAdd_LeakyReLU_SQ8(KerMat3_SQ8_T *Arg); + +/************************************************************************************************************************************************* + Matrix mult with channel centric scaling, followed by optional activation: ReLU and ReLUN, other activations should be performed + using standalone activation kernels + In1 convolution weights + In2 convolution Features +*************************************************************************************************************************************************/ + +void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg); + +void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg); + +void KerParMatMulB32_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB32_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulSxSyB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg); + +/************************************************************************************************************************************************* + Matrix mult with channel centric scaling for small first matrix in the product, goal is to improve parallelism in this specific situation + followed by an optional activation: ReLU and ReLUN, other activations should be performed using standalone activation kernels + + In1 fits completly in shared L1, convolution weights + In2 has been transposed before being used, convolution Features + Parallelization scheme partition In2 along H_In2 
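/*
 * Simplified scalar sketch of the parallelization scheme described above, for
 * illustration only: In1 (the weights, H_In1 x W_In1) is assumed fully resident in
 * shared L1, In2 has been transposed so that each of its rows is one feature vector
 * of W_In1 elements, and the rows of In2 are partitioned across the cores. N2 (the
 * row count of the transposed In2) and the output layout are illustrative
 * assumptions; the real KerParMatMul*_SF_SQ8 kernels are vectorized and also apply
 * bias, per-channel Scale/ScaleN and the optional activation taken from Infos.
 */
static void MatMulSmallFeat_Sketch(const signed char *In1, int W_In1, int H_In1,
				   const signed char *In2, int N2, int *Out)
{
	unsigned int CoreId = gap_coreid();
	unsigned int Chunk  = ChunkSize(N2);		/* rows of transposed In2 per core */
	unsigned int First  = Chunk*CoreId;
	unsigned int Last   = Min(First+Chunk, (unsigned int) N2);

	for (unsigned int l2 = First; l2 < Last; l2++)		/* this core's slice of In2 */
		for (int l1 = 0; l1 < H_In1; l1++) {		/* all weight rows */
			int Acc = 0;
			for (int c = 0; c < W_In1; c++) Acc += In1[l1*W_In1+c]*In2[l2*W_In1+c];
			Out[l1*N2 + l2] = Acc;			/* 32b accumulator, reduced/scaled later */
		}
	gap_waitbarrier(0);
}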
+*************************************************************************************************************************************************/ + +void KerParMatMulB8_SF_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB8_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB8_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg); + +void KerParMatMulB16_SF_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB16_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB16_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg); + +void KerParMatMulB32_SF_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB32_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg); +void KerParMatMulB32_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg); + +/************************************************************************************************************************************************* + Matrix by Vector Multiplication followed by an optional Activation (all of them supported) +*************************************************************************************************************************************************/ + +void KerParMatVectMul_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatVectMul_ReLU_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatVectMul_ReLUN_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatVectMul_HSigmoid_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatVectMul_HSwish_SQ8(KerMat3_SQ8_T *Arg); +void KerParMatVectMul_LeakyReLU_SQ8(KerMat3_SQ8_T *Arg); + + +/************************************************************************************************************************************************* + Linear Layer with 32b output. + When an output cannot be evaluated by a single call (Non FullFeat) Bias assumed to be set before. Scaling and activation done after + When FullFeat Bias setting, Linear and channel scaling all performed in the same call. Bias can be 8,16 or 32b +*************************************************************************************************************************************************/ + +void KerParLinearLayer_SQ8(KerLinear_SQ8_T *Arg); + +void KerParLinearLayerFullFeatB8_SQ8(KerLinear_SQ8_T *Arg); +void KerParLinearLayerFullFeatB8_ReLU_SQ8(KerLinear_SQ8_T *Arg); +void KerParLinearLayerFullFeatB8_ReLUN_SQ8(KerLinear_SQ8_T *Arg); + +void KerParLinearLayerFullFeatB16_SQ8(KerLinear_SQ8_T *Arg); +void KerParLinearLayerFullFeatB16_ReLU_SQ8(KerLinear_SQ8_T *Arg); +void KerParLinearLayerFullFeatB16_ReLUN_SQ8(KerLinear_SQ8_T *Arg); + +void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg); +void KerParLinearLayerFullFeatB32_ReLU_SQ8(KerLinear_SQ8_T *Arg); +void KerParLinearLayerFullFeatB32_ReLUN_SQ8(KerLinear_SQ8_T *Arg); + +/************************************************************************************************************************************************* + List of Matrix Transposition, no scaling +*************************************************************************************************************************************************/ + +void CNN_ParTranspose_fps(KerMatTranspose_fps_T *Arg); +void CNN_ParTransposeSxSy_fps(KerMatTranspose_fps_T *Arg); +void CNN_Transpose_fps(KerMatTranspose_fps_T *Arg); +void CNN_TransposeSxSy_fps(KerMatTranspose_fps_T *Arg); + +/************************************************************************************************************************************************* + 3D Tensor dimension permutations, no scaling +*************************************************************************************************************************************************/ + +void 
CNN_MatPermCHW2CWH_fps(KerMatTranspose_fps_T *Arg); +void CNN_MatPermCHW2HWC_fps(KerMatTranspose_fps_T *Arg); +void CNN_MatPermCHW2WHC_fps(KerMatTranspose_fps_T *Arg); +void CNN_MatPermCHW2WCH_fps(KerMatTranspose_fps_T *Arg); +void CNN_MatPermCHW2HCW_fps(KerMatTranspose_fps_T *Arg); + +/************************************************************************************************************************************************* + SotMax, Q15 Output +*************************************************************************************************************************************************/ + +void KerParSoftMax_SQ8(KerSoftMax_SQ8_T *Arg); + +/************************************************************************************************************************************************* + AT book keeping functions +*************************************************************************************************************************************************/ + +void AT_TileClear( + char *__restrict__ In, /**< Tile */ + int W, /**< Tile width */ + int H, /**< Tile height */ + int Feat, /**< Number of features */ + int Size, /**< Tile element size in bytes */ + int Pad, /**< Height or width of the area to be 0 padded */ + int Orientation /**< 0: Horizontal tile, 1: Vertical tile */ + ); + +void AT_DumpTensor( + char *NodeName, /**< Graph Node Name, a User Kernel */ + char *ArgName, /**< Argument name of this user kernel */ + int Loc, /**< Exec location if this argument, AT_MEM_xyz */ + void *L3_Device, /**< Pointer to device descriptor in case Loc is external */ + void *L3_Event, /**< Pointer to a read event for this device descriptor if any */ + int ItemSize, /**< Data type size in bytes */ + int Dim, /**< Number of dimensions, up to 5, from D0 most outer to D4 most inner */ + int D0, /**< Actual value of this dimension if defined, 1 otherwise */ + int D1, /**< Actual value of this dimension if defined, 1 otherwise */ + int D2, /**< Actual value of this dimension if defined, 1 otherwise */ + int D3, /**< Actual value of this dimension if defined, 1 otherwise */ + int D4, /**< Actual value of this dimension if defined, 1 otherwise */ + void *L2_BufferAddr, /**< In case exec loc is external pointer to a buffer in L2 to host partial copy of Arg */ + unsigned int L2_BufferSize, /**< Size of this buffer */ + void *Addr /**< Address of Arg */ + ); + + +#endif diff --git a/tools/autotiler_v3/generators/CNN/CNN_BiasReLULinear_BasicKernels.c b/tools/autotiler_v3/generators/CNN/CNN_BiasReLULinear_BasicKernels.c index f1b132ae9..2bb1a7c46 100644 --- a/tools/autotiler_v3/generators/CNN/CNN_BiasReLULinear_BasicKernels.c +++ b/tools/autotiler_v3/generators/CNN/CNN_BiasReLULinear_BasicKernels.c @@ -316,7 +316,7 @@ void KerSetBias_fpd(KerSetBias_fpd_T *Arg) // int Bias = AT_LSHIFT(*Arg->Bias, NormBias); int Bias = *Arg->Bias; - for (i=0; i<(Iter); i++) Out[i] = Bias; + for (i=0; i<(Iter); i++) Out[First+i] = Bias; gap_waitbarrier(0); } @@ -388,7 +388,7 @@ void KerSetBias_fpd_fp(KerSetBias_fpd_fp_T *Arg) // int Bias = AT_LSHIFT(*Arg->Bias, NormBias); int Bias = *Arg->Bias; - for (i=0; i<(Iter); i++) Out[i] = Bias; + for (i=0; i<(Iter); i++) Out[First+i] = Bias; gap_waitbarrier(0); } @@ -410,7 +410,7 @@ void KerSetBias_fpd_fps(KerSetBias_fpd_fps_T *Arg) // int Bias = AT_LSHIFT(*Arg->Bias, NormBias); int Bias = *Arg->Bias; - for (i=0; i<(Iter); i++) Out[i] = Bias; + for (i=0; i<(Iter); i++) Out[First+i] = Bias; gap_waitbarrier(0); } @@ -456,7 +456,7 @@ void KerSetBias_DP_fp(KerSetBias_fpd_fp_T *Arg) int 
i; int Bias = AT_LSHIFT(*Arg->Bias, NormBias); - for (i=0; i<(Iter); i++) Out[i] = Bias; + for (i=0; i<(Iter); i++) Out[First+i] = Bias; gap_waitbarrier(0); } @@ -477,7 +477,7 @@ void KerSetBias_DP_fps(KerSetBias_fpd_fps_T *Arg) int i; int Bias = AT_LSHIFT(*Arg->Bias, NormBias); - for (i=0; i<(Iter); i++) Out[i] = Bias; + for (i=0; i<(Iter); i++) Out[First+i] = Bias; gap_waitbarrier(0); } @@ -675,61 +675,14 @@ void KerParReLU_fps(KerReLUPool_fps_T *Arg) unsigned int OutFeatures = Arg->OutFeatures; signed char * __restrict__ Out = Arg->Out; int LB = Arg->LB; - int UB = Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(OutFeatures); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, OutFeatures); - - for (unsigned int of=First; ofIn; - unsigned int W = Arg->W; - unsigned int H = Arg->H; - unsigned int Wo = W; - unsigned int Ho = H; - unsigned int OutFeatures = Arg->OutFeatures; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - int UB = Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(OutFeatures); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, OutFeatures); - - for (unsigned int of=First; ofIn; - unsigned int W = Arg->W; - unsigned int H = Arg->H; - unsigned int Wo = W; - unsigned int Ho = H; - unsigned int OutFeatures = Arg->OutFeatures; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; + char *UB = (char *) Arg->UB; unsigned int CoreId = gap_coreid(); unsigned int Chunk = ChunkSize(OutFeatures); unsigned int First = Chunk*CoreId; unsigned int Last = Min(First+Chunk, OutFeatures); - for (unsigned int of=First; ofIn; - int W = Arg->W; - int H = Arg->H; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = *((char *) Arg->UB); - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize((W*H)/2); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Minu(First+ChunkCell, (W*H)/2); - v2s * VectIn = (v2s *) In; - v2s * VectOut = (v2s *) Out; - int i, j; - - for (i=First; iIn; - int W = Arg->W; - int H = Arg->H; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = *((char *) Arg->UB); - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize((W*H)/4); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Minu(First+ChunkCell, (W*H)/4); - v4s * VectIn = (v4s *) In; - v4s * VectOut = (v4s *) Out; - int i, j; - - for (i=First; i> eventhough should have bias in max i+w bits. Report an error? - Norm = i+w-o - NormBias = i+w-b - LB, UB: o - */ - short int * __restrict__ In = Arg->In; - int InSize = Arg->InSize; - const short int * __restrict__ Filter = Arg->Filter; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - short int * __restrict__ Out = Arg->Out; - int OutSize = Arg->OutSize; - int LB = Arg->LB; - char *UB = (char *)Arg->UB; - static L1_CL_MEM int Reduct[8]; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(InSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, InSize); - int Iter = Max(0, Last-First); - - for (int i=0; i> eventhough should have bias in max i+w bits. Report an error? 
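/*
   A host-side sketch of the per-core chunking that the KerSetBias fixes above rely on: each core
   owns the slice [First, First+Iter) of Out, so the store must be Out[First+i] rather than Out[i]
   (with Out[i] every core would overwrite chunk 0). gap_coreid()/ChunkSize() are replaced by a
   plain loop over 8 simulated cores; names and sizes here are illustrative only.
*/
#include <stdio.h>

#define EX_NCORES 8

static void SetBiasAllCores(int *Out, int OutSize, int Bias)
{
        int Chunk = (OutSize + EX_NCORES - 1) / EX_NCORES;      /* ceil split, as ChunkSize() gives for 8 cores */
        for (int CoreId = 0; CoreId < EX_NCORES; CoreId++) {
                int First = Chunk * CoreId;
                int Last  = (First + Chunk < OutSize) ? (First + Chunk) : OutSize;
                int Iter  = (Last > First) ? (Last - First) : 0;
                for (int i = 0; i < Iter; i++) Out[First + i] = Bias;   /* corrected indexing */
        }
}

int main(void)
{
        int Out[21];
        SetBiasAllCores(Out, 21, -5);
        for (int i = 0; i < 21; i++) printf("%d ", Out[i]);     /* prints -5 twenty-one times */
        printf("\n");
        return 0;
}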
- Norm = i+w-o - Bias = i+w-b - LB, UB: o - */ - signed char * __restrict__ In = Arg->In; - int InSize = Arg->InSize; - const signed char * __restrict__ Filter = Arg->Filter; - const signed char * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - signed char * __restrict__ Out = Arg->Out; - int OutSize = Arg->OutSize; - int LB = Arg->LB; - char *UB = (char *)Arg->UB; - static L1_CL_MEM int Reduct[8]; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(InSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, InSize); - int Iter = Max(0, Last-First); - - for (int i=0; i> eventhough should have bias in max i+w bits. Report an error? - Norm = i+w-o - Bias = i+w-b - LB, UB: o - */ - short int * __restrict__ In = Arg->In; - int InSize = Arg->InSize; - const signed char * __restrict__ Filter = Arg->Filter; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - short int * __restrict__ Out = Arg->Out; - int OutSize = Arg->OutSize; - int LB = Arg->LB; - char *UB = (char *)Arg->UB; - static L1_CL_MEM int Reduct[8]; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(InSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, InSize); - int Iter = Max(0, Last-First); - - for (int i=0; i> eventhough should have bias in max i+w bits. Report an error? - Norm = i+w-o - Bias = i+w-b - LB, UB: o - */ - signed char * __restrict__ In = Arg->In; - int InSize = Arg->InSize; - const signed char * __restrict__ Filter = Arg->Filter; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - short int * __restrict__ Out = Arg->Out; - int OutSize = Arg->OutSize; - int LB = Arg->LB; - char *UB = (char *)Arg->UB; - static L1_CL_MEM int Reduct[8]; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(InSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, InSize); - int Iter = Max(0, Last-First); - - for (int i=0; i> eventhough should have bias in max i+w bits. Report an error? 
- Norm = i+w-o - Bias = i+w-b - LB, UB: o - */ - short int * __restrict__ In = Arg->In; - int InSize = Arg->InSize; - const short int * __restrict__ Filter = Arg->Filter; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - int * __restrict__ Out = Arg->Out; - int OutSize = Arg->OutSize; - int LB = Arg->LB; - char *UB = (char *)Arg->UB; - static L1_CL_MEM int Reduct[8]; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(InSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, InSize); - int Iter = Max(0, Last-First); - - for (int i=0; iLB), *((char *)Arg->UB)); - break; default: /* No Activation LB=-32768, UB=32767, ReLU: LB=0,UB=32767, ReLUN: LB=0,UB=N<LB), Arg->UB); @@ -1565,9 +1245,6 @@ void KerDPLinearLayerReduct_fps(KerDPLinearLayerReduct_fps_T *Arg) Acc = gap_clip(Neg*Input1+Pos*Input, 7); } break; - case KACT_RELUN_VECTOR: - Acc = Min(Max(AT_NORM(Acc, Norm), Arg->LB), *((char *)Arg->UB)); - break; default: /* No Activation LB=-128, UB=127, ReLU: LB=0,UB=127, ReLUN: LB=0,UB=N<LB), Arg->UB); @@ -1621,48 +1298,6 @@ void KerParLinearLayerReLU_fp(KerLinearLayerReLU_fp_T *Arg) gap_waitbarrier(0); } -void KerParLinearLayerReLUN_Vector_fp(KerLinearLayerReLU_fp_T *Arg) - -{ - /* - NormBias: w+i-b - Norm: w+i-o - LB, UB: in o format - */ - short int * __restrict__ In = Arg->In; - int TotalInSize = Arg->TotalInSize; - int InSize = Arg->InSize; - const short int * __restrict__ Filter = Arg->Filter; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *UB = (char *) Arg->UB; - - int OutSize = Arg->OutSize; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(OutSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, OutSize); - int i,j; - v2s * __restrict__ VectIn = (v2s *) In; - - /* Don't use this kernel for partial evaluation of an output */ - for (i=First; iIn; - int TotalInSize = Arg->TotalInSize; - int InSize = Arg->InSize; - const signed char * __restrict__ Filter = Arg->Filter; - const signed char * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *UB = (char *) Arg->UB; - - int OutSize = Arg->OutSize; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(OutSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, OutSize); - int i,j; - v4s * __restrict__ VectIn = (v4s *) In; - - /* Don't use this kernel for partial evaluation of an output */ - for (i=First; iTotalInSize; int InSize = Arg->InSize; const signed char * __restrict__ Filter = Arg->Filter; - short int * __restrict__ Bias = Arg->Bias; + const signed char * __restrict__ Bias = Arg->Bias; unsigned int Norm = Arg->Norm; int NormBias = Arg->NormBias; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *UB = (char *) Arg->UB; + int * __restrict__ Out = Arg->Out; + int LB = Arg->LB, UB = Arg->UB; int OutSize = Arg->OutSize; unsigned int CoreId = gap_coreid(); @@ -2001,7 +1594,7 @@ void KerParLinearLayerReLUN_Vector_fps_fps_fp(KerLinearLayerReLU_fps_fps_fp_T *A } if (InSize&0x4) Acc = gap_sumdotp4(VectIn[InSize/4-1], VectFilter[InSize/4-1], Acc); for (j=((InSize/4)*4); jIn; - int TotalInSize = Arg->TotalInSize; - int InSize = 
Arg->InSize; - const signed char * __restrict__ Filter = Arg->Filter; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *UB = (char *) Arg->UB; - - int OutSize = Arg->OutSize; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(OutSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, OutSize); - int i,j; - v2s * __restrict__ VectIn = (v2s *) In; - - for (i=First; iIn; - int TotalInSize = Arg->TotalInSize; - int InSize = Arg->InSize; - const short int * __restrict__ Filter = Arg->Filter; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int Norm = Arg->Norm; - int NormBias = Arg->NormBias; - int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *UB = (char *) Arg->UB; - - int OutSize = Arg->OutSize; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(OutSize); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, OutSize); - int i,j; - v2s * __restrict__ VectIn = (v2s *) In; - - for (i=First; iIn; - short int * __restrict__ Out = Arg->Out; - int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - - for (int f=First; fIn; - short int * __restrict__ Out = Arg->Out; - int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int M = *Arg->MulBias; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - - for (int f=First; fIn; - short int * __restrict__ Out = Arg->Out; - int Feat = Arg->InFeatures; - int S = Arg->W*Arg->H; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - short int * __restrict__ MB = Arg->MulBias; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - - for (int f=First; fIn; - short int * __restrict__ Out = Arg->Out; - int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int Norm = Arg->Norm; - int i,j,k,U,A,B,Log2Core; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - /* First normalize In, each parallel chunk overwrites it's own input - After we are done In contains groups of contiguous normalized values - each group beeing followed by an empty group of exactly the same size, these - one need to be supressed, second step is taking care of this reduction */ - for (int f=First; fIn; - short int * __restrict__ Out = Arg->Out; - int Feat = Arg->InFeatures; - int S = Arg->W*Arg->H; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int M = *Arg->MulBias; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int i,j,k,U,A,B,Log2Core; - unsigned int CoreId = gap_coreid(); - unsigned int 
ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - /* First normalize In, each parallel chunk overwrites it's own input - After we are done In contains groups of contiguous normalized values - each group beeing followed by an empty group of exactly the same size, these - one need to be supressed, second step is taking care of this reduction */ - for (int f=First; fIn; - short int * __restrict__ Out = Arg->Out; - int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - short int * __restrict__ MB = Arg->MulBias; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int i,j,k,U,A,B,Log2Core; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - /* First normalize In, each parallel chunk overwrites it's own input - After we are done In contains groups of contiguous normalized values - each group beeing followed by an empty group of exactly the same size, these - one need to be supressed, second step is taking care of this reduction */ - for (int f=First; fIn; - signed char * __restrict__ Out = Arg->Out; - unsigned int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); + unsigned int Last = Min(First+ChunkCell, S); + int Size = Max(0, Last-First); - for (int f=First; fIn; - signed char * __restrict__ Out = Arg->Out; - unsigned int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - int LB = Arg->LB; - char *pUB = (char *)Arg->UB; - int M = *Arg->MulBias; - unsigned int Norm = Arg->Norm+Arg->NormBias; - int i, j; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - for (int f=First; fIn; - signed char * __restrict__ Out = Arg->Out; - unsigned int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - signed char * __restrict__ MB = Arg->MulBias; - unsigned int Norm = Arg->Norm+Arg->NormBias; - int i, j; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - for (int f=First; fIn; - signed char * __restrict__ Out = Arg->Out; - unsigned int S = Arg->W*Arg->H; - int Feat = Arg->InFeatures; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int i,j,k,U,A,B,Log2Core; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - /* First normalize In, each parallel chunk overwrites it's own input - After we are done In contains groups of contiguous normalized values - each group beeing followed by an empty group of exactly the same size, these - one need to be supressed, second step is taking care of this reduction */ - for (int f=First; fIn; - signed char * __restrict__ Out = 
Arg->Out; - int Feat = Arg->InFeatures; - int S = Arg->W*Arg->H; - unsigned int Norm = Arg->Norm+Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int M = *Arg->MulBias; - int i,j,k,U,A,B,Log2Core; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - - /* First normalize In, each parallel chunk overwrites it's own input - After we are done In contains groups of contiguous normalized values - each group beeing followed by an empty group of exactly the same size, these - one need to be supressed, second step is taking care of this reduction */ - for (int f=First; fIn; - signed char * __restrict__ Out = Arg->Out; - int Feat = Arg->InFeatures; - int S = Arg->W*Arg->H; - unsigned int Norm = Arg->Norm+Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - signed char * __restrict__ MB = Arg->MulBias; - int i,j,k,U,A,B,Log2Core; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Feat); - unsigned int First = CoreId*ChunkCell; - unsigned int Last = Min(First+ChunkCell, Feat); - int Size = S*Max(0, Last-First); - - - /* First normalize In, each parallel chunk overwrites it's own input - After we are done In contains groups of contiguous normalized values - each group beeing followed by an empty group of exactly the same size, these - one need to be supressed, second step is taking care of this reduction */ - for (int f=First; f>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + + +/* Set output features maps initial bias group + + KerParSetBiasB32_SQ8 Bias and out are int, output feature maps are evaluated in parallel (one per core) + KerParSetBiasB16_SQ8 Bias is half word, out is int, output feature maps are evaluated in parallel (one per core) + KerParSetBiasB8_SQ8 Bias is byte, out is int, output feature maps are evaluated in parallel (one per core) + + KerSetBiasB32_SQ8 Bias and out are int, a single output feature map is evaluated in parallel on all cores + KerSetBiasB16_SQ8 Bias is short, out is int, a single output feature map is evaluated in parallel on all cores + KerSetBiasB8_SQ8 Bias is byte, out is int, a single output feature map is evaluated in parallel on all cores +*/ + +/* Set Bias, all outputs evalauted in parallel */ +void KerParSetBiasB32_SQ8(KerSetBias_SQ8_T *Arg) + +{ + int * __restrict__ Out = Arg->Out; + unsigned int W = Arg->W, H = Arg->H, Feat = Arg->Feat; + int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofOut; + unsigned int W = Arg->W, H = Arg->H, Feat = Arg->Feat; + short int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofOut; + unsigned int W = Arg->W, H = Arg->H, Feat = Arg->Feat; + signed char * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofOut; + unsigned int W = Arg->W, H = Arg->H, Feat = Arg->Feat; + int * __restrict__ Bias = (int * __restrict__) Arg->Bias; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = 
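/*
   A standalone check of the ChunkSize() arithmetic introduced above: for a power-of-two core
   count, Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0) is ceil(X/NCore), e.g. 100 output features on
   8 cores give a chunk of 13, so cores 0..6 take 13 features each and core 7 takes the remaining
   9. gap_ncore()/gap_fl1() are stubbed with compile-time constants in this sketch.
*/
#include <assert.h>

#define EX_NCORE      8
#define EX_LOG2_NCORE 3

static unsigned int ChunkSizeRef(unsigned int X)
{
        return (X >> EX_LOG2_NCORE) + ((X & (EX_NCORE - 1)) != 0);
}

int main(void)
{
        assert(ChunkSizeRef(100) == 13);        /* 7*13 + 9 = 100 */
        assert(ChunkSizeRef(64)  ==  8);        /* exact split */
        assert(ChunkSizeRef(5)   ==  1);        /* cores 5..7 end up with an empty [First,Last) range */
        return 0;
}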
gap_coreid(), ChunkCell = ChunkSize(W*H), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, W*H), Iter = Max(0, Last-First); + + for (unsigned int f=0; fOut; + unsigned int W = Arg->W, H = Arg->H, Feat = Arg->Feat; + short int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(W*H), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, W*H), Iter = Max(0, Last-First); + + for (int f=0; fOut; + unsigned int W = Arg->W, H = Arg->H, Feat = Arg->Feat; + signed char * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(W*H), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, W*H), Iter = Max(0, Last-First); + + for (int f=0; fIn; + int TotalInDim = Arg->TotalInDim; // Reorganize weight layout to make TotalInDim = InDim + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + int * __restrict__ Out = Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const signed char * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const signed char * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const signed char * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + int A0 = Arg->Infos[AT_INF_A0]; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const short int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = 
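/*
   A host-side sketch of the role of the per-output Scale[]/ScaleN[] arrays and of NormBias
   (Infos[AT_INF_BIASN]) in the SQ8 linear-layer kernels above, under the assumption that
   requantization is a multiply followed by an arithmetic right shift; the exact rounding and
   saturation of the real AT_SCALE/gap_clip primitives may differ. All names below are illustrative.
*/
#include <stdint.h>

static int8_t Clip8(int32_t X)
{
        if (X >  127) return  127;
        if (X < -128) return -128;
        return (int8_t) X;
}

/* One output of a fully connected layer: int8 inputs/weights, 32-bit accumulation, int8 result */
static int8_t LinearRowRef(const int8_t *In, const int8_t *Weights, int InDim,
                           int32_t Bias, unsigned int NormBias,
                           uint8_t Scale, uint8_t ScaleN)
{
        int32_t Acc = Bias << NormBias;                         /* bias pre-shifted into the accumulator */
        for (int j = 0; j < InDim; j++) Acc += In[j] * Weights[j];
        return Clip8((Acc * (int32_t) Scale) >> ScaleN);        /* per-output requantization to 8 bits */
}

int main(void)
{
        const int8_t In[4] = {10, -20, 30, -40}, W[4] = {1, 2, 3, 4};
        return LinearRowRef(In, W, 4, 5, 0, 128, 8);            /* tiny made-up example */
}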
Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const short int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const short int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + int A0 = Arg->Infos[AT_INF_A0]; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iIn; + unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; + const signed char * __restrict__ Weights = Arg->Weights; + const int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; + unsigned char *Scale = Arg->Scale; + unsigned char *ScaleN = Arg->ScaleN; + int A0 = Arg->Infos[AT_INF_A0]; + signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); + v4s * __restrict__ VectIn = (v4s *) In; + + for (int i=First; iUsedH, FS, PadIn[2], S)); unsigned int InFeatures = Arg->InFeatures; - +#ifdef OLD for (unsigned int of=First; of>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +static int FirstDefinedOutput(unsigned int F, unsigned int Pad, unsigned int Stride) + +{ + // k*S - (F-1)/2 >=0 => k >= (((F-1)/2) + S-1)/S + + return ((Pad+Stride-1)/Stride); +} + +static int LastDefinedOutput(unsigned int DimIn, unsigned int F, unsigned int PadL, unsigned int 
Stride) + +{ + // k*S + ((F-1)/2 - PadL + F/2) < Dim => k < (Dim-((F-1)/2 - PadL + (F/2)) + S-1)/S + + return ((DimIn - ((F-1)/2 - PadL + (F/2)) + Stride-1)/Stride); +} + +static int __attribute__ ((always_inline)) MinCond(int a, int b) + +{ + if (a<0 || b<0) printf("MinCond(%d, %d)\n", a, b); +#ifdef DIM_ALWAYS_GREATER_THAN_FILTER + return a; +#else + return Max(0, Min(a, b)); +#endif +} + +static void __attribute__ ((noinline)) KerConv2x1from3x1StrideNx1_V_SQ8( + signed char * __restrict__ In, + int W, int PadTOrg, + int Wo, int Ho, int Ho_F, int Ho_L, + int Bias, + int * __restrict__ Out, + signed char * __restrict__ Filter, + int FilterConf + ) +{ + int V0,V1; + int C0,C1; + signed char *PtIn; + int *PtOut; + + if (FilterConf) { /* Right Side */ + C0 = Filter[0]; C1 = Filter[1]; + } else { /* Left Side */ + C0 = Filter[1]; C1 = Filter[2]; + } + PtIn = In + (Ho_F*1-PadTOrg)*W; PtOut = Out+Ho_F*Wo; + for (unsigned int i=Ho_F; i [2..4 x 0] PadL==2 + C0 = *((v4s*) (Filter + 0*5+2)); C0[3] = 0; + break; + case 1: // [0..4 x 0] => [1..4 x 0] PadL==1 + C0 = *((v4s*) (Filter + 0*5+1)); + break; + case 3: // [0..4 x 0] => [0..3 x 0] PadR==1 + C0 = *((v4s*) (Filter + 0*5+0)); + break; + case 4: // [0..4 x 0] => [0..2 x 0] PadR==2 + C0 = *((v4s*) (Filter + 0*5+0)); C0 = (v4s)(((int)C0)<<8); + break; + case 5: // [0..4 x 0] => [0..2 x 0] PadR==2, Wo==1 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + break; + } + PtIn = In + (Ho_F*1-PadOrg[2])*W; PtOut = Out+Ho_F*Wo; + V0 = * (v4s *) PtIn; PtIn += W; + for (unsigned int i=Ho_F; i [2..4 x 0..4] PadL == 2 + C0 = *((v4s*) (Filter + 0*5+2)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+2)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+2)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+2)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+2)); C4[3] = 0; + break; + case 1: // [0..4 x 0..4] => [1..4 x 0..4] PadL == 1 + C0 = *((v4s*) (Filter + 0*5+1)); + C1 = *((v4s*) (Filter + 1*5+1)); + C2 = *((v4s*) (Filter + 2*5+1)); + C3 = *((v4s*) (Filter + 3*5+1)); + C4 = *((v4s*) (Filter + 4*5+1)); + break; + case 3: // [0..4 x 0..4] => [0..3 x 0..4] PadR == 1 + C0 = *((v4s*) (Filter + 0*5+0)); + C1 = *((v4s*) (Filter + 1*5+0)); + C2 = *((v4s*) (Filter + 2*5+0)); + C3 = *((v4s*) (Filter + 3*5+0)); + C4 = *((v4s*) (Filter + 4*5+0)); + break; + case 4: // [0..4 x 0..4] => [1..3 x 0..4] PadR == 2 + C0 = *((v4s*) (Filter + 0*5+0)); C0 = (v4s)(((int)C0)<<8); + C1 = *((v4s*) (Filter + 1*5+0)); C1 = (v4s)(((int)C1)<<8); + C2 = *((v4s*) (Filter + 2*5+0)); C2 = (v4s)(((int)C2)<<8); + C3 = *((v4s*) (Filter + 3*5+0)); C3 = (v4s)(((int)C3)<<8); + C4 = *((v4s*) (Filter + 4*5+0)); C4 = (v4s)(((int)C4)<<8); + break; + case 5: // [0..4 x 0..4] => [0..2 x 0..4] PadR == 2, Wo==1 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+0)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+0)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+0)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+0)); C4[3] = 0; + break; + } + if (PadT==2) { + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = (v4s){0,0,0,0}; + } else if (PadT) { // == 1 + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; + V1 = *((v4s *) PtIn); PtIn += W; + } else { // Ho_F==0 + PtIn = In + (Ho_F*1-PadTOrg)*W; + V0 = *((v4s *) PtIn); PtIn += W; + V1 = *((v4s *) PtIn); PtIn += W; + } + V2 = *((v4s *) PtIn); PtIn += W; + V3 = *((v4s *) PtIn); PtIn += W; + PtOut = Out+Ho_F*Wo; + if (Ho==1) { + int Acc = Bias; Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc); Acc = gap_sumdotp4(V2, C2, Acc); *PtOut = Acc; + return; + } + for (unsigned int 
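/*
   A worked check of FirstDefinedOutput()/LastDefinedOutput() above for a 3x3 filter, stride 1, a
   16-pixel input row and 1 pixel of padding on each side: Wo = (16-3+1+1)/1 + 1 = 16 outputs, of
   which only columns 0 and 15 touch padding, so Wo_F = 1 and Wo_L = 15 and the border kernels only
   have to produce those two columns. The helpers below simply mirror the expressions above.
*/
#include <assert.h>

static int FirstDefinedOutputRef(unsigned int Pad, unsigned int Stride)
{
        return (Pad + Stride - 1) / Stride;
}

static int LastDefinedOutputRef(unsigned int DimIn, unsigned int F, unsigned int PadL, unsigned int Stride)
{
        return (DimIn - ((F - 1) / 2 - PadL + (F / 2)) + Stride - 1) / Stride;
}

int main(void)
{
        assert(FirstDefinedOutputRef(1, 1) == 1);               /* first column needing no left padding */
        assert(LastDefinedOutputRef(16, 3, 1, 1) == 15);        /* columns 1..14 need no padding at all */
        return 0;
}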
i=Ho_F; i [2..4 x 0..4] PadL == 2 + C0 = *((v4s*) (Filter + 0*5+2)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+2)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+2)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+2)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+2)); C4[3] = 0; + break; + case 1: // [0..4 x 0..4] => [1..4 x 0..4] PadL==1 + C0 = *((v4s*) (Filter + 0*5+1)); + C1 = *((v4s*) (Filter + 1*5+1)); + C2 = *((v4s*) (Filter + 2*5+1)); + C3 = *((v4s*) (Filter + 3*5+1)); + C4 = *((v4s*) (Filter + 4*5+1)); + break; + case 3: // [0..4 x 0..4] => [0..3 x 0..4] PadR==1 + C0 = *((v4s*) (Filter + 0*5+0)); + C1 = *((v4s*) (Filter + 1*5+0)); + C2 = *((v4s*) (Filter + 2*5+0)); + C3 = *((v4s*) (Filter + 3*5+0)); + C4 = *((v4s*) (Filter + 4*5+0)); + break; + case 4: // [0..4 x 0..4] => [0..2 x 0..4] PadR==2 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+0)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+0)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+0)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+0)); C4[3] = 0; + break; + } + if (PadT==2) { + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = (v4s){0,0,0,0}; + } else if (PadT) { // == 1 + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = *((v4s *) PtIn); PtIn += W; + } else { + PtIn = In + (Ho_F*2-PadTOrg)*W; + V0 = *((v4s *) PtIn); PtIn += W; + V1 = *((v4s *) PtIn); PtIn += W; + } + PtOut = Out+Ho_F*Wo; + V2 = *((v4s *) PtIn); PtIn += W; + for (unsigned int i=Ho_F; i 2 since we have specialized form for smaller strides */ + v4s V0, V1, V2, V3, V4; + v4s C0, C1, C2, C3, C4; + signed char *PtIn; + int *PtOut; + int PadL = PadOrg[0], PadT = Pad[2], PadTOrg = PadOrg[2], PadB = Pad[3]; + + switch (FilterConf) { + case 2: // [0..4 x 0..4] => [2..4 x 0..4] PadL==2 + C0 = *((v4s*) (Filter + 0*5+2)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+2)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+2)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+2)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+2)); C4[3] = 0; + break; + case 1: // [0..4 x 0..4] => [1..4 x 0..4] PadL==1 + C0 = *((v4s*) (Filter + 0*5+1)); + C1 = *((v4s*) (Filter + 1*5+1)); + C2 = *((v4s*) (Filter + 2*5+1)); + C3 = *((v4s*) (Filter + 3*5+1)); + C4 = *((v4s*) (Filter + 4*5+1)); + break; + case 3: // [0..4 x 0..4] => [0..3 x 0..4] PadR==1 + C0 = *((v4s*) (Filter + 0*5+0)); + C1 = *((v4s*) (Filter + 1*5+0)); + C2 = *((v4s*) (Filter + 2*5+0)); + C3 = *((v4s*) (Filter + 3*5+0)); + C4 = *((v4s*) (Filter + 4*5+0)); + break; + case 4: // [0..4 x 0..4] => [0..2 x 0..4] PadR==2 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+0)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+0)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+0)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+0)); C4[3] = 0; + break; + } + if (PadT==2) { + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = (v4s){0,0,0,0}; + } else if (PadT) { // == 1 + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = *((v4s *) PtIn); PtIn += W; + } else { + PtIn = In + (Ho_F*Stride-PadTOrg)*W; PtOut = Out+Ho_F*Wo; + V0 = *((v4s *) PtIn); PtIn += W; + V1 = *((v4s *) PtIn); PtIn += W; + } + PtOut = Out+Ho_F*Wo; + + + for (unsigned int i=Ho_F; iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*Stride; // iff Wi_L>Wi_F + + if (PadT) { /* Top */ + int ht = PadTOrg, hb = H - Hi_F + Fh/2; + for (unsigned int h=0; h F by definition of Ho_L so we can remove and use ht only + for (unsigned int w=Wo_F; w F by definition of Ho_L so we can remove and use ht only + for (unsigned int h=Ho_F; h F by definition of Ho_L so we can remove and use ht only. 
ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*StrideX; // iff Wi_L>Wi_F + + if (PadT) { /* Top */ + int ht = PadTOrg, hb = H - Hi_F + Fh/2; + for (unsigned int h=0; h F by definition of Ho_L so we can remove and use ht only + for (unsigned int w=Wo_F; w F by definition of Ho_L so we can remove and use ht only + for (unsigned int h=Ho_F; h F by definition of Ho_L so we can remove and use ht only. ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iHi_F + int Wi_F = (TFw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*StrideX; // iff Wi_L>Wi_F + int Prec=10; + int InvDh = ((1< F by definition of Ho_L so we can remove and use ht only + for (unsigned int h=Ho_F; h 2 */ + int Fh=3, Fw=3; + int PadLOrg = PadOrg[0], PadTOrg = PadOrg[2]; + int PadL = Pad[0], PadR = Pad[1], PadT = Pad[2], PadB = Pad[3]; + + if (PadL) KerConv2x3from3x3StrideS_V_SQ8(In, W, PadTOrg, Wo, Ho, Ho_F, Ho_L, Stride, Bias, Out, Filter, 0); + if (PadR) KerConv2x3from3x3StrideS_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadTOrg, Wo, Ho, Ho_F, Ho_L, Stride, Bias, Out+Wo-1, Filter, 1); + if (PadT) KerConv3x2from3x3StrideS_H_SQ8(In, W, PadLOrg, Wo, Wo_F, Wo_L, Stride, Bias, Out+Wo_F, Filter, 0); + if (PadB) KerConv3x2from3x3StrideS_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Stride, Bias, Out+Ho_L*Wo+Wo_F, Filter, 1); +} + +static void __attribute__ ((noinline)) KerConv5x1BorderStrideNx1_SQ8( + signed char *__restrict__ In, + int *__restrict__ Out, + signed char *__restrict__ Filter, + int W, + int H, + int Wo, + int Wo_F, + int Wo_L, + int Ho, + int Ho_F, + int Ho_L, + int Stride, + v4s Pad, + v4s PadOrg, + int Bias + ) + +{ + int PadLOrg = PadOrg[0], PadTOrg = PadOrg[2]; + int PadL = Pad[0], PadR = Pad[1], PadT = Pad[2], PadB = Pad[3]; + + if (PadL) { + if (Wo_F==2) { + KerConv4x1from5x1StrideNx1_V_SQ8(In, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out, Filter, 2); + KerConv4x1from5x1StrideNx1_V_SQ8(In, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+1, Filter, 1); + } else KerConv4x1from5x1StrideNx1_V_SQ8(In, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out, Filter, PadL); + } + if (PadR) { + if ((Wo-Wo_L)==2) { + KerConv4x1from5x1StrideNx1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-2, Filter, 3); + KerConv4x1from5x1StrideNx1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-1, Filter, 4); + } else if (Wo==1) KerConv4x1from5x1StrideNx1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-1, Filter, 5); + else KerConv4x1from5x1StrideNx1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-1, Filter, PadR+2); + } +} + +static void __attribute__ ((noinline)) KerConv1x5BorderStride1xN_SQ8( + signed char *__restrict__ In, + int *__restrict__ Out, + signed char *__restrict__ Filter, + int W, + int H, + int Wo, + int Wo_F, + int Wo_L, + int Ho, + int Ho_F, + int Ho_L, + int Stride, + v4s Pad, + v4s PadOrg, + int Bias + ) + +{ + int PadLOrg = PadOrg[0], PadTOrg = PadOrg[2]; + int PadL = Pad[0], PadR = Pad[1], PadT = Pad[2], PadB = Pad[3]; + + if (PadT) { + if (Ho_F==2) { // Happens only if stride = 1 + KerConv1x4from1x5Stride1xN_H_SQ8(In, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Wo_F, 
Filter, 2); + KerConv1x4from1x5Stride1xN_H_SQ8(In, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Wo_F+Wo, Filter, 1); + } else KerConv1x4from1x5Stride1xN_H_SQ8(In, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Wo_F, Filter, PadT); + } + if (PadB) { + if ((Ho-Ho_L)==2) { // Happens only if stride == 1 + KerConv1x4from1x5Stride1xN_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Ho_L*Wo+Wo_F, Filter, 3); + KerConv1x4from1x5Stride1xN_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+(Ho_L+1)*Wo+Wo_F, Filter, 4); + } else if (Ho==1) KerConv1x4from1x5Stride1xN_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Ho_L*Wo+Wo_F, Filter, 5); + else KerConv1x4from1x5Stride1xN_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Ho_L*Wo+Wo_F, Filter, PadB+2); + } +} + +static void __attribute__ ((noinline)) KerConv5x5BorderStride1_SQ8( + signed char *__restrict__ In, + int *__restrict__ Out, + signed char *__restrict__ Filter, + int W, + int H, + int Wo, + int Wo_F, + int Wo_L, + int Ho, + int Ho_F, + int Ho_L, + v4s Pad, + v4s PadOrg, + int Bias + ) + +{ + /* With stride=1 we are sure that padding will be 2, 2 for a given dim */ + int Fh=5, Fw=5, Stride=1; + int PadLOrg = PadOrg[0], PadTOrg = PadOrg[2]; + int PadL = Pad[0], PadR = Pad[1], PadT = Pad[2], PadB = Pad[3]; + + if (PadL==2) { + KerConv4x5from5x5Stride1_V_SQ8(In, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out, Filter, 2); + KerConv4x5from5x5Stride1_V_SQ8(In, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+1, Filter, 1); + } else if (PadL==1) KerConv4x5from5x5Stride1_V_SQ8(In, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out, Filter, 1); + if (PadR==2) { + if (Wo==1) KerConv4x5from5x5Stride1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-1, Filter, 5); + else { + KerConv4x5from5x5Stride1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-2, Filter, 3); + KerConv4x5from5x5Stride1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-1, Filter, 4); + } + } else if (PadR==1) KerConv4x5from5x5Stride1_V_SQ8(In+Wo_L*Stride-PadLOrg, W, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-1, Filter, 3); + if (PadT==2) { + KerConv5x4from5x5Stride1_H_SQ8(In, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Wo_F, Filter, 2); + KerConv5x4from5x5Stride1_H_SQ8(In, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Wo_F+Wo, Filter, 1); + } else if (PadT==1) KerConv5x4from5x5Stride1_H_SQ8(In, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Wo_F, Filter, 1); + if (PadB==2) { + if (Ho==1) KerConv5x4from5x5Stride1_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+(Ho_L)*Wo+Wo_F, Filter, 5); + else { + KerConv5x4from5x5Stride1_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Ho_L*Wo+Wo_F, Filter, 3); + KerConv5x4from5x5Stride1_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+(Ho_L+1)*Wo+Wo_F, Filter, 4); + } + } else if (PadB==1) KerConv5x4from5x5Stride1_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, PadLOrg, Wo, Wo_F, Wo_L, Bias, Out+Ho_L*Wo+Wo_F, Filter, 3); +} + +static void __attribute__ ((noinline)) KerConv5x5BorderStride2_SQ8( + signed char *__restrict__ In, + int *__restrict__ Out, + signed char *__restrict__ Filter, + int W, + int H, + int Wo, + int Wo_F, + int Wo_L, + int Ho, + int Ho_F, + int Ho_L, + v4s Pad, + v4s PadOrg, + int Bias + ) + +{ + /* Max padding is 4, distributed Pad/2, Pad-Pad/2 or Pad-Pad/2, Pad, with stride 2 each padded area cannot produce more than 1 outputput */ + int 
Fh=5, Fw=5, Stride=2; + int PadLOrg = PadOrg[0], PadTOrg = PadOrg[2]; + int PadL = Pad[0], PadR = Pad[1], PadT = Pad[2], PadB = Pad[3]; + + if (PadL) KerConv4x5from5x5Stride2_V_SQ8(In, W, H, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out, Filter, PadL); + if (PadR) KerConv4x5from5x5Stride2_V_SQ8(In+Wo_L*Stride-PadLOrg, W, H, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Bias, Out+Wo-1, Filter, PadR+2); + + if (PadT) KerConv5x4from5x5Stride2_H_SQ8(In, W, H, PadLOrg, PadTOrg, Wo, Wo_F, Wo_L, Bias, Out+Wo_F, Filter, PadT); + if (PadB) KerConv5x4from5x5Stride2_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, H, PadLOrg, PadTOrg, Wo, Wo_F, Wo_L, Bias, Out+Ho_L*Wo+Wo_F, Filter, PadB+2); +} + +static void __attribute__ ((noinline)) KerConv5x5BorderStrideS_SQ8( + signed char *__restrict__ In, + int *__restrict__ Out, + signed char *__restrict__ Filter, + int W, + int H, + int Wo, + int Wo_F, + int Wo_L, + int Ho, + int Ho_F, + int Ho_L, + int Stride, + v4s Pad, + v4s PadOrg, + int Bias + ) + +{ + /* Stride is assumed to be >2 since we have specialized variants therefore no more than 1 output can be created in each padded area */ + int Fh=5, Fw=5; + int PadLOrg = PadOrg[0], PadTOrg = PadOrg[2]; + int PadL = Pad[0], PadR = Pad[1], PadT = Pad[2], PadB = Pad[3]; + + if (PadL) KerConv4x5from5x5StrideS_V_SQ8(In, W, H, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Stride, Bias, Out, Filter, PadL); + if (PadR) KerConv4x5from5x5StrideS_V_SQ8(In+Wo_L*Stride-PadLOrg, W, H, PadOrg, Pad, Wo, Ho, Ho_F, Ho_L, Stride, Bias, Out+Wo-1, Filter, PadR+2); + + if (PadT) KerConv5x4from5x5StrideS_H_SQ8(In, W, H, PadLOrg, PadTOrg, Wo, Wo_F, Wo_L, Stride, Bias, Out+Wo_F, Filter, PadT); + if (PadB) KerConv5x4from5x5StrideS_H_SQ8(In+(Ho_L*Stride-PadTOrg)*W, W, H, PadLOrg, PadTOrg, Wo, Wo_F, Wo_L, Stride, Bias, Out+Ho_L*Wo+Wo_F, Filter, PadB+2); +} + +/* Convolution, body processing (covers both padded and non padded variants) + + Input feature maps, Output feature maps and filter on bytes + + KerConv1x1Stride1_Body_SQ8 1x1 convolution, stride 1 + KerConv1x1Stride2_Body_SQ8 1x1 convolution, stride 2 + KerConv1x1StrideS_Body_SQ8 1x1 convolution, stride S + + KerConv3x1Stride1x1_Body_SQ8 3x1 convolution, stride 1x1 + KerConv3x1Stride2x1_Body_SQ8 3x1 convolution, stride 2x1 + KerConv1x3Stride1x1_Body_SQ8 1x3 convolution, stride 1x1 + KerConv1x3Stride1x2_Body_SQ8 1x3 convolution, stride 1x2 + KerConv3x3Stride1_Body_SQ8 3x3 convolution, stride 1 + KerConv3x3Stride2_Body_SQ8 3x3 convolution, stride 2 + KerConv3x3StrideS_Body_SQ8 3x3 convolution, stride S + + KerConv5x1Stride1x1_Body_SQ8 5x1 convolution, stride 1x1 + KerConv5x1Stride2x1_Body_SQ8 5x1 convolution, stride 2x1 + KerConv1x5Stride1x1_Body_SQ8 1x5 convolution, stride 1x1 + KerConv1x5Stride1x2_Body_SQ8 1x5 convolution, stride 1x2 + KerConv5x5Stride1_Body_SQ8 5x5 convolution, stride 1 + KerConv5x5Stride2_Body_SQ8 5x5 convolution, stride 2 + KerConv5x5StrideS_Body_SQ8 5x5 convolution, stride S + KerConv7x7StrideS_Body_SQ8 7x7 convolution, stride S + + KerConvNxNStrideS_Body_SQ8 NxN convolution, stride S + KerConvNxMStrideSxSy_Body_SQ8 NxM convolution, stride Sx, Sy + KerConvNxMDxDyStrideSxSy_Body_SQ8 NxM convolution, dilation Dx,Dy, stride Sx, Sy + +*/ + +static void __attribute__ ((noinline)) KerConv1x1Stride1_Body_SQ8( + signed char *__restrict__ In, + int *__restrict__ Out, + signed char *__restrict__ Filter, + int W, + int H, + int Wo, + int Wo_F, + int Wo_L, + int Ho, + int Ho_F, + int Ho_L, + v4s Pad, + int Bias + ) + +{ + int Stride = 1; + int PadL = Pad[0], PadT = Pad[2]; + + int C0 = Filter[0]; + int IterW = 
Wo_L-Wo_F; + for (unsigned int h=Ho_F; hIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + for (unsigned int of=First; ofS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, 
FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, 
OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + for (unsigned int of=First; ofS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ Out = (int * __restrict__) Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias; + int * __restrict__ 
Out = (int * __restrict__) Arg->Out;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(OutFeatures);
+	unsigned int First = Chunk*CoreId;
+	unsigned int Last = Min(First+Chunk, OutFeatures);
+	v4s PadIn = Arg->Pad;
+
+	int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx));
+	int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy));
+
+	for (unsigned int of=First; of<Last; of++) {

+	unsigned int FS=Arg->N, S=Arg->S;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	unsigned int NormBias = Arg->NormBias;
+	unsigned int OutFeatures = Arg->OutFeatures;
+	signed char * __restrict__ Filter = Arg->Filter;
+	signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias;
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(OutFeatures);
+	unsigned int First = Chunk*CoreId;
+	unsigned int Last = Min(First+Chunk, OutFeatures);
+	v4s PadIn = Arg->Pad;
+
+	int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S));
+	int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S));
+
+	for (unsigned int of=First; of<Last; of++) {

+	unsigned int FSx=Arg->N, Sx=Arg->S;
+	unsigned int FSy=Arg->Ny, Sy=Arg->Sy;
+	int Dx=Arg->D, Dy=Arg->Dy;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	unsigned int NormBias = Arg->NormBias;
+	unsigned int OutFeatures = Arg->OutFeatures;
+	signed char * __restrict__ Filter = Arg->Filter;
+	signed char * __restrict__ Bias_SQ8 = (signed char * __restrict__) Arg->Bias;
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(OutFeatures);
+	unsigned int First = Chunk*CoreId;
+	unsigned int Last = Min(First+Chunk, OutFeatures);
+	v4s PadIn = Arg->Pad;
+
+	int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx));
+	int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, (Dy*(FSy-1)+1), PadIn[2], Sy));
+
+	for (unsigned int of=First; of<Last; of++) {

+	unsigned int FS=Arg->N, S=Arg->S;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	unsigned int NormBias = Arg->NormBias;
+	unsigned int OutFeatures = Arg->OutFeatures;
+	signed char * __restrict__ Filter = Arg->Filter;
+	short int * __restrict__ Bias = (short int * __restrict__) Arg->Bias;
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(OutFeatures);
+	unsigned int First = Chunk*CoreId;
+	unsigned int Last = Min(First+Chunk, OutFeatures);
+	v4s PadIn = Arg->Pad;
+
+	int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S));
+	int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S));
+
+	for (unsigned int of=First; of<Last; of++) {

+	unsigned int FSx=Arg->N, Sx=Arg->S;
+	unsigned int FSy=Arg->Ny, Sy=Arg->Sy;
+	int Dx=Arg->D, Dy=Arg->Dy;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	unsigned int NormBias = Arg->NormBias;
+	unsigned int OutFeatures = Arg->OutFeatures;
+	signed char * __restrict__ Filter = Arg->Filter;
+	short int * __restrict__ Bias = (short int * __restrict__) Arg->Bias;
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(OutFeatures);
+	unsigned int First = Chunk*CoreId;
+	unsigned int Last = Min(First+Chunk, OutFeatures);
+	v4s PadIn = Arg->Pad;
+
+	int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx));
+	int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, (Dy*(FSy-1)+1), PadIn[2], Sy));
+
+	for (unsigned int of=First; of<Last; of++) {
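For readers scanning these prologues, the spatial-geometry computation they all share can be checked in isolation. The sketch below is illustrative only and not part of the patch (conv_out_dim and its parameter names are invented); it mirrors the Wo/Ho expressions above, where a filter of size F dilated by D covers an effective span of D*(F-1)+1.

#include <stdio.h>

/* Illustrative sketch: output length of one strided, padded convolution axis,
   matching the Wo/Ho expressions in the kernel prologues above. */
static int conv_out_dim(int in_size, int F, int pad_l, int pad_r, int S, int D)
{
	int Feff = D*(F-1) + 1;                  /* effective filter span with dilation */
	return (in_size - Feff + pad_l + pad_r)/S + 1;
}

int main(void)
{
	printf("%d\n", conv_out_dim(32, 3, 1, 1, 1, 1));   /* 3x3, stride 1, pad 1/1 -> 32 */
	printf("%d\n", conv_out_dim(32, 3, 1, 1, 1, 2));   /* same filter, dilation 2 -> 30 */
	return 0;
}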
+	unsigned int FS=Arg->N, S=Arg->S;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	unsigned int NormBias = Arg->NormBias;
+	unsigned int OutFeatures = Arg->OutFeatures;
+	signed char * __restrict__ Filter = Arg->Filter;
+	int * __restrict__ Bias = (int * __restrict__) Arg->Bias;
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(OutFeatures);
+	unsigned int First = Chunk*CoreId;
+	unsigned int Last = Min(First+Chunk, OutFeatures);
+	v4s PadIn = Arg->Pad;
+
+	int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S));
+	int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S));
+
+	for (unsigned int of=First; of<Last; of++) {

+	unsigned int FSx=Arg->N, Sx=Arg->S;
+	unsigned int FSy=Arg->Ny, Sy=Arg->Sy;
+	int Dx=Arg->D, Dy=Arg->Dy;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	unsigned int NormBias = Arg->NormBias;
+	unsigned int OutFeatures = Arg->OutFeatures;
+	signed char * __restrict__ Filter = Arg->Filter;
+	int * __restrict__ Bias = (int * __restrict__) Arg->Bias;
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(OutFeatures);
+	unsigned int First = Chunk*CoreId;
+	unsigned int Last = Min(First+Chunk, OutFeatures);
+	v4s PadIn = Arg->Pad;
+
+	int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx));
+	int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, (Dy*(FSy-1)+1), PadIn[2], Sy));
+
+	for (unsigned int of=First; of<Last; of++) {
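The other idiom repeated in every prologue is splitting one dimension into per-core [First, Last) ranges. The standalone sketch below is illustrative only: NUM_CORES and chunk_size are assumptions standing in for the ChunkSize()/gap_coreid() pair used in the patch. The depthwise kernels that follow apply the same split to Wo or Ho depending on Arg->Orientation, and mask the border padding with expressions such as PadIn[0] *= (First==0) so that only the cores owning an edge apply it.

#include <stdio.h>

#define NUM_CORES 8                          /* assumption for this sketch */

/* Illustrative sketch: ceil(N/NUM_CORES), in the spirit of ChunkSize(). */
static unsigned int chunk_size(unsigned int N)
{
	return (N + NUM_CORES - 1)/NUM_CORES;
}

int main(void)
{
	unsigned int N = 19;                     /* e.g. 19 output features */
	for (unsigned int core = 0; core < NUM_CORES; core++) {
		unsigned int Chunk = chunk_size(N);
		unsigned int First = Chunk*core;
		unsigned int Last  = (First + Chunk < N) ? (First + Chunk) : N;
		/* Cores past the end get an empty range, hence the "if (First<Last)"
		   and "of<Last" guards in the kernels. */
		printf("core %u: First=%u Last=%u%s\n", core, First, Last,
		       (First < Last) ? "" : "  (no work)");
	}
	return 0;
}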
+	unsigned int FS=Arg->N, S=Arg->S;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	signed char * __restrict__ Filter = Arg->Filter;
+	unsigned int NormBias = Arg->NormBias;
+	int B = AT_LSHIFT(((signed char *__restrict__) Arg->Bias)[0], NormBias);
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+	v4s PadIn = Arg->Pad;
+	int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S));
+	int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S));
+	unsigned int CoreId = gap_coreid();
+	v4s PadOrg = PadIn;
+	unsigned int Chunk, First, Last;
+
+	if (Arg->Orientation) { // Horizontal
+		Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo);
+		PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo);
+		Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L);
+	} else {
+		Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho);
+		PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho);
+		Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L);
+	}
+	if (First<Last) {

+	unsigned int FSx=Arg->N, Sx=Arg->S;
+	unsigned int FSy=Arg->Ny, Sy=Arg->Sy;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	signed char * __restrict__ Filter = Arg->Filter;
+	unsigned int NormBias = Arg->NormBias;
+	int B = AT_LSHIFT(((signed char *__restrict__) Arg->Bias)[0], NormBias);
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+	v4s PadIn = Arg->Pad;
+	int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx));
+	int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1;
+	int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy));
+	unsigned int CoreId = gap_coreid();
+	v4s PadOrg = PadIn;
+	unsigned int Chunk, First, Last;
+
+	if (Arg->Orientation) { // Horizontal
+		Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo);
+		PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo);
+		Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L);
+	} else {
+		Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho);
+		PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho);
+		Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L);
+	}
+	if (First<Last) {

+	unsigned int FS=Arg->N, S=Arg->S;
+	signed char * __restrict__ In = Arg->In;
+	unsigned int W = Arg->W;
+	unsigned int H = Arg->H;
+	signed char * __restrict__ Filter = Arg->Filter;
+	unsigned int NormBias = Arg->NormBias;
+	int B = AT_LSHIFT(((signed char *__restrict__) Arg->Bias)[0], NormBias);
+	int * __restrict__ Out = (int * __restrict__) Arg->Out;
+	v4s PadIn = Arg->Pad;
+	int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1;
+	int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S));
+	int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S
+ 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((signed char *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + int Dx=Arg->D, Dy=Arg->Dy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((signed char *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx)); + int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, (Dy*(FSy-1)+1), PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ 
Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int 
* __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], 
NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int 
*__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int 
*__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int 
*__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, S=Arg->S; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char 
* __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + int Dx=Arg->D, Dy=Arg->Dy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((short int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx)); + int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, (Dy*(FSy-1)+1), PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, 
Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, 
Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + 
} else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = 
ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); 
+ } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = 
ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L 
= Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, S=Arg->S; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + int Dx=Arg->D, Dy=Arg->Dy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + unsigned int NormBias = Arg->NormBias; + int B = AT_LSHIFT(((int *__restrict__) Arg->Bias)[0], NormBias); + int * __restrict__ Out = (int * __restrict__) Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx)); + int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, 
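The preamble above is what parallelizes every kernel in this hunk: each core takes a ChunkSize()-sized slice of the output rows or columns, padding work is kept only by the cores that own the first or last slice, and FirstDefinedOutput/LastDefinedOutput bound the outputs that need no padding at all. The standalone sketch below (not part of the patch) mimics that split on a single output dimension; NCORES, ChunkSz, FirstDefined and LastDefined are illustrative stand-ins, and an even row split is assumed in place of the gap_ncore()-based split in the real code.

```c
/* Sketch of the per-core work split used by the kernel preambles; hypothetical names. */
#include <stdio.h>

#define NCORES 8

static int Mini(int a, int b) { return (a < b) ? a : b; }
static int Maxi(int a, int b) { return (a > b) ? a : b; }

/* k*S - Pad >= 0  =>  k >= ceil(Pad/S)  (Pad == (F-1)/2 for "same" padding) */
static int FirstDefined(int Pad, int S) { return (Pad + S - 1) / S; }

/* k*S + ((F-1)/2 - PadL + F/2) < Dim  =>  k < ceil((Dim - ((F-1)/2 - PadL + F/2)) / S) */
static int LastDefined(int Dim, int F, int PadL, int S) {
	return (Dim - ((F - 1) / 2 - PadL + F / 2) + S - 1) / S;
}

/* Rows per core, rounded up so every output row is owned by exactly one core. */
static int ChunkSz(int X) { return (X + NCORES - 1) / NCORES; }

int main(void) {
	int H = 25, F = 3, S = 1, PadT = 1, PadB = 1;
	int Ho   = (H - F + PadT + PadB) / S + 1;          /* padded output height        */
	int Ho_F = Mini(Ho, FirstDefined(PadT, S));        /* rows [0, Ho_F) need top pad */
	int Ho_L = Maxi(Ho_F, LastDefined(H, F, PadT, S)); /* rows [Ho_L, Ho) need bottom pad */

	for (int core = 0; core < NCORES; core++) {
		int Chunk = ChunkSz(Ho);
		int First = Chunk * core, Last = Mini(First + Chunk, Ho);
		/* Only the chunks containing the first/last rows keep their padding work. */
		int padT = PadT * (First == 0), padB = PadB * (Last == Ho);
		if (First < Last)
			printf("core %d: rows [%d..%d)  padT=%d padB=%d  centre rows [%d..%d)\n",
			       core, First, Last, padT, padB,
			       Maxi(First, Ho_F), Mini(Last, Ho_L));
	}
	return 0;
}
```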
(Dy*(FSy-1)+1), PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (First>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +static int FirstDefinedOutput(unsigned int F, unsigned int Pad, unsigned int Stride) + +{ + // k*S - (F-1)/2 >=0 => k >= (((F-1)/2) + S-1)/S + + return ((Pad+Stride-1)/Stride); +} + +static int LastDefinedOutput(unsigned int DimIn, unsigned int F, unsigned int PadL, unsigned int Stride) + +{ + // k*S + ((F-1)/2 - PadL + F/2) < Dim => k < (Dim-((F-1)/2 - PadL + (F/2)) + S-1)/S + + return ((DimIn - ((F-1)/2 - PadL + (F/2)) + Stride-1)/Stride); +} + +static int __attribute__ ((always_inline)) MinCond(int a, int b) + +{ +#ifdef DIM_ALWAYS_GREATER_THAN_FILTER + return a; +#else + return Max(0, Min(a, b)); +#endif +} + +/* Padded Convolution Border processing + + Zero padding support. Implementation is based on partial convolutions derived from the original filter + + Input feature maps, Output feature maps and filter on bytes + + KerConv3x1BorderStrideNx1_SQ8 + |------ KerConv2x1from3x1StrideNx1_V_SQ8 3x1 convolution, stride Nx1, Left and right 0 padded stripes processing. + + KerConv1x3BorderStride1xN_SQ8 + |------ KerConv1x2from1x3Stride1xN_H_SQ8 1x3 convolution, stride 1xN, Left and right 0 padded stripes processing. + + KerConv3x3BorderStride1_SQ8 + |------ KerConv2x3from3x3Stride1_V_SQ8 3x3 convolution, stride 1, Left and right 0 padded stripes processing. + |------ KerConv3x2from3x3Stride1_H_SQ8 3x3 convolution, stride 1, Top and bottom 0 padded stripes processing. + + KerConv3x3BorderStride2_SQ8 + |------ KerConv2x3from3x3Stride2_V_SQ8 3x3 convolution, stride 2, Left and right 0 padded stripes processing. + |------ KerConv3x2from3x3Stride2_H_SQ8 3x3 convolution, stride 2, Top and bottom 0 padded stripes processing. + + KerConv3x3BorderStrideS_SQ8 + |------ KerConv2x3from3x3StrideS_V_SQ8 3x3 convolution, stride S, Left and right 0 padded stripes processing. + |------ KerConv3x2from3x3StrideS_H_SQ8 3x3 convolution, stride S, Top and bottom 0 padded stripes processing. + + KerConv5x1BorderStrideNx1_SQ8 + |------ KerConv4x1from5x1StrideNx1_V_SQ8 5x1 convolution, stride Nx1, Left and right 0 padded stripes processing. + + KerConv1x5BorderStride1xN_SQ8 + |------ KerConv1x4from1x5Stride1xN_H_SQ8 1x5 convolution, stride 1xN, Left and right 0 padded stripes processing. + + KerConv5x5BorderStride1_SQ8 + |------ KerConv4x5from5x5Stride1_V_SQ8 5x5 convolution, stride 1, Left and right 0 padded stripes processing. + |------ KerConv5x4from5x5Stride1_H_SQ8 5x5 convolution, stride 1, Top and bottom 0 padded stripes processing. + + KerConv5x5BorderStride2_SQ8 + |------ KerConv4x5from5x5Stride2_V_SQ8 5x5 convolution, stride 2, Left and right 0 padded stripes processing. + |------ KerConv5x4from5x5Stride2_H_SQ8 5x5 convolution, stride 2, Top and bottom 0 padded stripes processing. + + KerConv5x5BorderStrideS_SQ8 + |------ KerConv4x5from5x5StrideS_V_SQ8 5x5 convolution, stride S, Left and right 0 padded stripes processing. 
+	|------	KerConv5x4from5x5StrideS_H_SQ8		5x5 convolution, stride S, Top and bottom 0 padded stripes processing.
+
+	KerConvNxNStrideS_Border_fp		NxN convolution, stride S, Left, Right, Top and Bottom borders
+
+	KerConvNxMStrideSxSy_Border_fp		NxM convolution, stride Sx,Sy, Left, Right, Top and Bottom borders
+
+	KerConvNxMDxDyStrideSxSy_Border_fp	NxM convolution, dilation Dx,Dy, stride Sx,Sy, Left, Right, Top and Bottom borders
+
+
+	Input feature maps, Output feature maps and filter on half words
+
+	KerConv3x1BorderStrideNx1_fp
+	|------	KerConv2x1from3x1StrideNx1_V_fp		3x1 convolution, stride Nx1, Left and right 0 padded stripes processing.
+
+	KerConv1x3BorderStride1xN_fp
+	|------	KerConv1x2from1x3Stride1xN_H_fp		1x3 convolution, stride 1xN, Left and right 0 padded stripes processing.
+
+	KerConv3x3BorderStride1_fp
+	|------	KerConv2x3from3x3Stride1_V_fp		3x3 convolution, stride 1, Left and right 0 padded stripes processing.
+	|------	KerConv3x2from3x3Stride1_H_fp		3x3 convolution, stride 1, Top and bottom 0 padded stripes processing.
+
+	KerConv3x3BorderStride2_fp
+	|------	KerConv2x3from3x3Stride2_V_fp		3x3 convolution, stride 2, Left and right 0 padded stripes processing.
+	|------	KerConv3x2from3x3Stride2_H_fp		3x3 convolution, stride 2, Top and bottom 0 padded stripes processing.
+
+	KerConv3x3BorderStrideS_fp
+	|------	KerConv2x3from3x3StrideS_V_fp		3x3 convolution, stride S, Left and right 0 padded stripes processing.
+	|------	KerConv3x2from3x3StrideS_H_fp		3x3 convolution, stride S, Top and bottom 0 padded stripes processing.
+
+	KerConv5x1BorderStrideNx1_fp
+	|------	KerConv4x1from5x1StrideNx1_V_fp		5x1 convolution, stride Nx1, Left and right 0 padded stripes processing.
+
+	KerConv1x5BorderStride1xN_fp
+	|------	KerConv1x4from1x5Stride1xN_H_fp		1x5 convolution, stride 1xN, Left and right 0 padded stripes processing.
+
+	KerConv5x5BorderStride1_fp
+	|------	KerConv4x5from5x5Stride1_V_fp		5x5 convolution, stride 1, Left and right 0 padded stripes processing.
+	|------	KerConv5x4from5x5Stride1_H_fp		5x5 convolution, stride 1, Top and bottom 0 padded stripes processing.
+
+	KerConv5x5BorderStride2_fp
+	|------	KerConv4x5from5x5Stride2_V_fp		5x5 convolution, stride 2, Left and right 0 padded stripes processing.
+	|------	KerConv5x4from5x5Stride2_H_fp		5x5 convolution, stride 2, Top and bottom 0 padded stripes processing.
+
+	KerConv5x5BorderStrideS_fp
+	|------	KerConv4x5from5x5StrideS_V_fp		5x5 convolution, stride S, Left and right 0 padded stripes processing.
+	|------	KerConv5x4from5x5StrideS_H_fp		5x5 convolution, stride S, Top and bottom 0 padded stripes processing.
+ + KerConvNxNStrideS_Border_SQ8 NxN convolution, stride S, Left, Right, Top and Bottom borders + + KerConvNxMStrideSxSy_Border_SQ8 NxM convolution, stride Sx,Sy, Left, Right, Top and Bottom borders + + KerConvNxMDxDyStrideSxSy_Border_SQ8 NxM convolution, dilation Dx,Dy, stride Sx,Sy, Left, Right, Top and Bottom borders +*/ + +static void __attribute__ ((noinline)) KerConv2x1from3x1StrideNx1_V_SQ8( + signed char * __restrict__ In, + int W, int PadTOrg, + int Wo, int Ho, int Ho_F, int Ho_L, + int * __restrict__ Out, + signed char * __restrict__ Filter, + int FilterConf + ) +{ + int V0,V1; + int C0,C1; + signed char *PtIn; + int *PtOut; + + if (FilterConf) { /* Right Side */ + C0 = Filter[0]; C1 = Filter[1]; + } else { /* Left Side */ + C0 = Filter[1]; C1 = Filter[2]; + } + PtIn = In + (Ho_F*1-PadTOrg)*W; PtOut = Out+Ho_F*Wo; + for (unsigned int i=Ho_F; i [2..4 x 0] => PadL==2 + C0 = *((v4s*) (Filter + 0*5+2)); C0[3] = 0; + break; + case 1: // [0..4 x 0] => [1..4 x 0] => PadL==1 + C0 = *((v4s*) (Filter + 0*5+1)); + break; + case 3: // [0..4 x 0] => [0..3 x 0] => PadR==1 + C0 = *((v4s*) (Filter + 0*5+0)); + break; + case 4: // [0..4 x 0] => [0..2 x 0] => PadR==2 + C0 = *((v4s*) (Filter + 0*5+0)); C0 = (v4s)(((int)C0)<<8); + break; + case 5: // [0..4 x 0] => [0..2 x 0] PadR==2, Wo==1 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + break; + } + PtIn = In + (Ho_F*1-PadOrg[2])*W; PtOut = Out+Ho_F*Wo; + V0 = * (v4s *) PtIn; PtIn += W; + for (unsigned int i=Ho_F; i [2..4 x 0..4] PadL == 2 + C0 = *((v4s*) (Filter + 0*5+2)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+2)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+2)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+2)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+2)); C4[3] = 0; + break; + case 1: // [0..4 x 0..4] => [1..4 x 0..4] PadL == 1 + C0 = *((v4s*) (Filter + 0*5+1)); + C1 = *((v4s*) (Filter + 1*5+1)); + C2 = *((v4s*) (Filter + 2*5+1)); + C3 = *((v4s*) (Filter + 3*5+1)); + C4 = *((v4s*) (Filter + 4*5+1)); + break; + case 3: // [0..4 x 0..4] => [0..3 x 0..4] PadR == 1 + C0 = *((v4s*) (Filter + 0*5+0)); + C1 = *((v4s*) (Filter + 1*5+0)); + C2 = *((v4s*) (Filter + 2*5+0)); + C3 = *((v4s*) (Filter + 3*5+0)); + C4 = *((v4s*) (Filter + 4*5+0)); + break; + case 4: // [0..4 x 0..4] => [0..2 x 0..4] PadR == 2 + C0 = *((v4s*) (Filter + 0*5+0)); C0 = (v4s)(((int)C0)<<8); + C1 = *((v4s*) (Filter + 1*5+0)); C1 = (v4s)(((int)C1)<<8); + C2 = *((v4s*) (Filter + 2*5+0)); C2 = (v4s)(((int)C2)<<8); + C3 = *((v4s*) (Filter + 3*5+0)); C3 = (v4s)(((int)C3)<<8); + C4 = *((v4s*) (Filter + 4*5+0)); C4 = (v4s)(((int)C4)<<8); + break; + case 5: // [0..4 x 0..4] => [0..2 x 0..4] PadR == 2, Wo==1 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+0)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+0)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+0)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+0)); C4[3] = 0; + break; + } + if (PadT==2) { + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = (v4s){0,0,0,0}; + } else if (PadT) { // == 1 + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; + V1 = *((v4s *) PtIn); PtIn += W; + } else { // Ho_F==0 + PtIn = In + (Ho_F*1-PadTOrg)*W; + V0 = *((v4s *) PtIn); PtIn += W; + V1 = *((v4s *) PtIn); PtIn += W; + } + V2 = *((v4s *) PtIn); PtIn += W; + if (Ho==1) { + int Acc = *PtOut; Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc); Acc = gap_sumdotp4(V2, C2, Acc); *PtOut = Acc; + return; + } + V3 = *((v4s *) PtIn); PtIn += W; + PtOut = Out+Ho_F*Wo; + for (unsigned int i=Ho_F; i [2..4 x 0..4] PadL == 2 + C0 = *((v4s*) (Filter + 
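The vectorised border kernels accumulate with gap_sumdotp4(V, C, Acc), the GAP SIMD builtin that multiplies the four signed bytes of V and C pairwise and adds the four products to the 32-bit accumulator Acc (as in the Ho==1 path just above). A portable scalar sketch of that accumulation pattern, with made-up data, useful for reading the code off-target:

#include <stdio.h>

/* Scalar stand-in for gap_sumdotp4(): 4x (int8 * int8) products added to Acc
   (assumed builtin semantics). */
static int sumdotp4(const signed char V[4], const signed char C[4], int Acc)
{
    for (int i = 0; i < 4; i++) Acc += (int)V[i] * (int)C[i];
    return Acc;
}

int main(void)
{
    /* One output point: bias plus three row/filter-row dot products. */
    signed char V0[4] = {1, 2, 3, 4},  V1[4] = {5, 6, 7, 8},  V2[4] = {-1, -2, -3, -4};
    signed char C0[4] = {1, 1, 1, 0},  C1[4] = {2, 2, 2, 0},  C2[4] = {1, 0, -1, 0};
    int Acc = 10;                                  /* e.g. the pre-loaded bias */
    Acc = sumdotp4(V0, C0, Acc);
    Acc = sumdotp4(V1, C1, Acc);
    Acc = sumdotp4(V2, C2, Acc);
    printf("Acc = %d\n", Acc);                     /* 10 + 6 + 36 + 2 = 54 */
    return 0;
}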
0*5+2)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+2)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+2)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+2)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+2)); C4[3] = 0; + break; + case 1: // [0..4 x 0..4] => [1..4 x 0..4] PadL==1 + C0 = *((v4s*) (Filter + 0*5+1)); + C1 = *((v4s*) (Filter + 1*5+1)); + C2 = *((v4s*) (Filter + 2*5+1)); + C3 = *((v4s*) (Filter + 3*5+1)); + C4 = *((v4s*) (Filter + 4*5+1)); + break; + case 3: // [0..4 x 0..4] => [0..3 x 0..4] PadR==1 + C0 = *((v4s*) (Filter + 0*5+0)); + C1 = *((v4s*) (Filter + 1*5+0)); + C2 = *((v4s*) (Filter + 2*5+0)); + C3 = *((v4s*) (Filter + 3*5+0)); + C4 = *((v4s*) (Filter + 4*5+0)); + break; + case 4: // [0..4 x 0..4] => [0..2 x 0..4] PadR==2 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+0)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+0)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+0)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+0)); C4[3] = 0; + break; + } + if (PadT==2) { + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = (v4s){0,0,0,0}; + } else if (PadT) { // == 1 + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = *((v4s *) PtIn); PtIn += W; + } else { + PtIn = In + (Ho_F*2-PadTOrg)*W; + V0 = *((v4s *) PtIn); PtIn += W; + V1 = *((v4s *) PtIn); PtIn += W; + } + PtOut = Out+Ho_F*Wo; + V2 = *((v4s *) PtIn); PtIn += W; + for (unsigned int i=Ho_F; i [2..4 x 0..4] PadL==2 + C0 = *((v4s*) (Filter + 0*5+2)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+2)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+2)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+2)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+2)); C4[3] = 0; + break; + case 1: // [0..4 x 0..4] => [1..4 x 0..4] PadL==1 + C0 = *((v4s*) (Filter + 0*5+1)); + C1 = *((v4s*) (Filter + 1*5+1)); + C2 = *((v4s*) (Filter + 2*5+1)); + C3 = *((v4s*) (Filter + 3*5+1)); + C4 = *((v4s*) (Filter + 4*5+1)); + break; + case 3: // [0..4 x 0..4] => [0..3 x 0..4] PadR==1 + C0 = *((v4s*) (Filter + 0*5+0)); + C1 = *((v4s*) (Filter + 1*5+0)); + C2 = *((v4s*) (Filter + 2*5+0)); + C3 = *((v4s*) (Filter + 3*5+0)); + C4 = *((v4s*) (Filter + 4*5+0)); + break; + case 4: // [0..4 x 0..4] => [0..2 x 0..4] PadR==2 + C0 = *((v4s*) (Filter + 0*5+0)); C0[3] = 0; + C1 = *((v4s*) (Filter + 1*5+0)); C1[3] = 0; + C2 = *((v4s*) (Filter + 2*5+0)); C2[3] = 0; + C3 = *((v4s*) (Filter + 3*5+0)); C3[3] = 0; + C4 = *((v4s*) (Filter + 4*5+0)); C4[3] = 0; + break; + } + if (PadT==2) { + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = (v4s){0,0,0,0}; + } else if (PadT) { // == 1 + PtIn = In; Ho_F = 0; + V0 = (v4s){0,0,0,0}; V1 = *((v4s *) PtIn); PtIn += W; + } else { + PtIn = In + (Ho_F*Stride-PadTOrg)*W; PtOut = Out+Ho_F*Wo; + V0 = *((v4s *) PtIn); PtIn += W; + V1 = *((v4s *) PtIn); PtIn += W; + } + PtOut = Out+Ho_F*Wo; + + for (unsigned int i=Ho_F; iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*Stride; // iff Wi_L>Wi_F + + if (PadT) { /* Top */ + int ht = PadTOrg, hb = H - Hi_F + Fh/2; + for (unsigned int h=0; h F by definition of Ho_L so we can remove and use ht only + for (unsigned int w=Wo_F; w F by definition of Ho_L so we can remove and use ht only + for (unsigned int h=Ho_F; h F by definition of Ho_L so we can remove and use ht only. 
ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*StrideX; // iff Wi_L>Wi_F + + if (PadT) { /* Top */ + int ht = PadTOrg, hb = H - Hi_F + Fh/2; + for (unsigned int h=0; h F by definition of Ho_L so we can remove and use ht only + for (unsigned int w=Wo_F; w F by definition of Ho_L so we can remove and use ht only + for (unsigned int h=Ho_F; h F by definition of Ho_L so we can remove and use ht only. ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iHi_F + int Wi_F = (TFw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*StrideX; // iff Wi_L>Wi_F + int Prec=10; + int InvDh = ((1< F by definition of Ho_L so we can remove and use ht only + for (unsigned int h=Ho_F; hIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, 
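For the generic NxN/NxM border kernels, each padded output row keeps only the filter rows that actually fall on the input; the code above tracks this incrementally through ht/hb (and wl/wr for the columns). A simplified sketch of the top-border case, assuming bottom clipping never triggers and using the same bookkeeping:

#include <stdio.h>

int main(void)
{
    int Fh = 5, Stride = 1, PadT = 2;              /* hypothetical filter/stride/padding   */
    int Ho_F = (PadT + Stride - 1) / Stride;       /* first pad-free output row            */
    int ht = PadT;                                 /* filter rows still falling on padding */
    for (int h = 0; h < Ho_F; h++) {
        int Fh_min = ht;                           /* first filter row inside the input    */
        printf("output row %d uses filter rows [%d..%d)\n", h, Fh_min, Fh);
        ht -= Stride; if (ht < 0) ht = 0;
    }
    return 0;
}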
LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, 
LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + unsigned int Iter = Max(0, Last-First); + for (unsigned int i=0; iIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + for (unsigned int of=First; ofS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, 
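The *_SQ8 entry points in this stretch all share the same shape: the output features are chunked across cores (First..Last), and for each output feature the kernel accumulates one plane convolution per input feature, with TotalInFeatures presumably serving as the weight-layout stride between output features. The per-plane loops are not visible here, so the sketch below uses a naive stand-in for the optimized plane kernel; it is a schematic, not the library code:

/* Naive stand-in for the optimized per-plane convolution (valid mode, no padding). */
void ConvPlaneAcc(const signed char *In, int W,
                  const signed char *F, int FS, int S, int *Out, int Wo, int Ho)
{
    for (int y = 0; y < Ho; y++)
        for (int x = 0; x < Wo; x++) {
            int Acc = Out[y * Wo + x];
            for (int fy = 0; fy < FS; fy++)
                for (int fx = 0; fx < FS; fx++)
                    Acc += In[(y * S + fy) * W + (x * S + fx)] * F[fy * FS + fx];
            Out[y * Wo + x] = Acc;
        }
}

/* Schematic per-core dispatch: output features [First..Last) on this core,
   each accumulating over all input features (assumed weight layout). */
void ConvOutFeatures(const signed char *In, int W, int H,
                     const signed char *Filter, int FS, int S,
                     int *Out, int Wo, int Ho,
                     unsigned int First, unsigned int Last,
                     unsigned int InFeatures, int TotalInFeatures)
{
    for (unsigned int of = First; of < Last; of++)
        for (unsigned int ifm = 0; ifm < InFeatures; ifm++)
            ConvPlaneAcc(In + ifm * W * H, W,
                         Filter + ((int)of * TotalInFeatures + (int)ifm) * FS * FS, FS, S,
                         Out + of * Wo * Ho, Wo, Ho);
}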
PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], 
S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofN, S=Arg->S; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = 
(Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + int Dx=Arg->D, Dy=Arg->Dy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + int TotalInFeatures = Arg->TotalInFeatures; + unsigned int OutFeatures = Arg->OutFeatures; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(OutFeatures); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, OutFeatures); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx)); + int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, (Dy*(FSy-1)+1), PadIn[2], Sy)); + + unsigned int InFeatures = Arg->InFeatures; + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = 
(Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId 
= gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); 
PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstTotalInFeatures, InFeatures = Arg->InFeatures, OutFeatures = Arg->OutFeatures; + for (unsigned int of=0; ofIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = 
(Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = 
PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstIn; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstS; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); 
+ PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, S=Arg->S; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L); + } + if (FirstN, Sx=Arg->S; + unsigned int FSy=Arg->Ny, Sy=Arg->Sy; + int Dx=Arg->D, Dy=Arg->Dy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + signed char * __restrict__ Filter = Arg->Filter; + int * __restrict__ Out = Arg->Out; + v4s PadIn = Arg->Pad; + int Wo = (Arg->UsedW-(Dx*(FSx-1)+1)+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput((Dx*(FSx-1)+1), PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, (Dx*(FSx-1)+1), PadIn[0], Sx)); + int Ho = (Arg->UsedH-(Dy*(FSy-1)+1)+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput((Dy*(FSy-1)+1), PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, (Dy*(FSy-1)+1), PadIn[2], Sy)); + unsigned int CoreId = gap_coreid(); + v4s PadOrg = PadIn; + unsigned int Chunk, First, Last; + + if (Arg->Orientation) { // Horizontal + Chunk = ChunkSize(Wo); First = Chunk*CoreId; Last = Min(First+Chunk, Wo); + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + Wo_F = Max(First, Wo_F); Wo_L = Min(Last, Wo_L); + } else { + Chunk = ChunkSize(Ho); First = Chunk*CoreId; Last = Min(First+Chunk, Ho); + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + Ho_F = Max(First, 
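In the dilated variants (the DxDy kernels), every occurrence of the filter span is replaced by the effective span D*(FS-1)+1, since a dilated filter of size FS with dilation D covers that many input samples; the same substitution drives the Wo/Ho and border computations above. A quick numeric check under assumed sizes:

#include <stdio.h>

int main(void)
{
    int W = 32, F = 3, D = 2, S = 1, PadL = 0, PadR = 0;   /* hypothetical layer */
    int EffF = D * (F - 1) + 1;                            /* effective span: 5  */
    int Wo = (W - EffF + PadL + PadR) / S + 1;             /* 28 outputs         */
    printf("effective filter span %d -> Wo = %d\n", EffF, Wo);
    return 0;
}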
Ho_F); Ho_L = Min(Last, Ho_L); + } + if (First +#include +#include "AutoTilerLib.h" +#include "CNN_Generators.h" +#include "CNN_Generator_Util.h" +#include "Gap.h" + +int CNN_EncodePoolOperation(KernelOper_T PoolOper, KernelOper_T ReLUOper) + +{ + int Pool=0; + int ReLU = (ReLUOper==KOP_RELU)?1:0; + + switch (PoolOper) { + case KOP_GLOBAL_MAXPOOL: + case KOP_MAXPOOL: Pool = 0; break; + case KOP_GLOBAL_AVGPOOL: + case KOP_AVGPOOL: Pool = 1; break; + } + return ((Pool<<1)|ReLU); +} + +int CNN_EncodeActivation(KernelOper_T Oper) + +{ + switch (Oper) { + case KOP_RELU: return 1; + case KOP_RELUN: return 2; + case KOP_HSIGMOID: return 3; + case KOP_HSWISH: return 4; + case KOP_LEAKYRELU: return 5; + default: return 0; + } +} + +int CNN_Gcd(int a, int b) + +{ + int x, y, z; + + x = Abs (a); y = Abs (b); + while (x > 0) { + z = y % x; y = x; x = z; + } + return y; +} + +int CNN_Scm(int a, int b) + +{ + return ((a*b)/CNN_Gcd(a,b)); +} + +int CNN_UsedInputDimension(int Dim, int F, int S, int D, int Pad) + +{ + /* Dim: input dimension, F: Filter dim, S: Stride, D: Dilation, Pad: pad values (sum of both sides) */ + return ((Dim-1)*S+(D*(F-1)+1)-Pad); +} + +int CNN_TotalPaddingValue(int Dim, int F, int S, int D) + +{ + /* F: Filter dim, S: Stride, D: Dilation */ + return ((Dim%S) == 0)?Max((D*(F-1)+1)-S, 0):Max((D*(F-1)+1) - (Dim%S), 0); +} + +v4s CNN_EdgePaddingValue(AT_PadType PadType, int Padw, int Padh) + +{ + v4s Pad; + switch (PadType) { + case PAD_LEFT: Pad = (v4s) {Padw, 0, Padh, 0}; break; + case PAD_RIGHT: Pad = (v4s) {0, Padw, 0, Padh}; break; + case PAD_BALANCED_LEFT: Pad = (v4s) {Padw-Padw/2, Padw/2, Padh-Padh/2, Padh/2}; break; + case PAD_BALANCED_RIGHT: Pad = (v4s) {Padw/2, Padw-Padw/2, Padh/2, Padh-Padh/2}; break; + default: GenTilingError("CNN_EdgePaddingValue: unknown padding method %d", PadType); + } + return Pad; +} + +void CNN_LayerOutputDim(int Width, int Height, + KernelOper_T ConvOper, int Fcx, int Fcy, int Dcx, int Dcy, int Scx, int Scy, int ConvPad, + KernelOper_T PoolOper, int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, int PoolPad, + int *Wc, int *Hc, int *Wo, int *Ho, + int *Pcw, int *Pch, int *Ppw, int *Pph) + +{ + /* Convolution: Fc = Filter dim, Sc = Stride, Dc = Dilation + Pooling : Fp = Filter dim, Sp = Stride, Dp = Dilation + 3 different configurations: + Convolution then Pooling + Convolution + Pooling + Wc, Hc : convolution output dimension if present, otherwise returns Width, Eight + Wo, Ho : If conv then pool output dimension after conv and pooling, if pool only pool out dim, if conv only conv out dim + Pcw, Pch: Horizontal and vertical padding for convolution + Ppw, Pph: Horizontal and vertical padding for pooling + */ + int PadCw=0, PadCh=0; + int PadPw=0, PadPh=0; + + if (ConvOper==KOP_NONE) { + Fcx=1; Dcx=1; Scx=1; Fcy=1; Dcy=1; Scy=1; + } + if (PoolOper==KOP_NONE) { + Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1; + } + if (ConvOper!=KOP_NONE && ConvPad) { + PadCw = CNN_TotalPaddingValue(Width, Fcx, Scx, Dcx); PadCh = CNN_TotalPaddingValue(Height, Fcy, Scy, Dcy); + } + int ConvW = (Width - (Dcx*(Fcx-1)+1) + PadCw)/Scx + 1; + int ConvH = (Height - (Dcy*(Fcy-1)+1) + PadCh)/Scy + 1; + + if (Wc) *Wc = ConvW; else ConvW = Width; + if (Hc) *Hc = ConvH; else ConvH = Height; + + if (PoolOper!=KOP_NONE && PoolPad) { + PadPw = CNN_TotalPaddingValue(ConvW, Fpx, Spx, Dpx); PadPh = CNN_TotalPaddingValue(ConvH, Fpy, Spy, Dpy); + } + + if (Wo) *Wo = (ConvW - (Dpx*(Fpx-1)+1) + PadPw)/Spx + 1; + if (Ho) *Ho = (ConvH - (Dpy*(Fpy-1)+1) + PadPh)/Spy + 1; + if (Pcw) *Pcw = 
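CNN_Gcd()/CNN_Scm() above are plain Euclid and the smallest common multiple built from it, and the two encoders pack the pooling and activation choices into small integers for the kernels ((Pool<<1)|ReLU, and activation codes 1..5). A standalone restatement with a couple of checked values:

#include <stdio.h>

/* Standalone restatement of CNN_Gcd()/CNN_Scm() for a quick check (positive inputs). */
static int Gcd(int a, int b) { while (a) { int t = b % a; b = a; a = t; } return b; }
static int Scm(int a, int b) { return (a * b) / Gcd(a, b); }

int main(void)
{
    printf("Gcd(12, 18) = %d, Scm(12, 18) = %d\n", Gcd(12, 18), Scm(12, 18)); /* 6, 36 */
    /* CNN_EncodePoolOperation: (Pool<<1)|ReLU -> MaxPool+ReLU = 1, AvgPool+ReLU = 3, AvgPool only = 2
       CNN_EncodeActivation:    RELU=1, RELUN=2, HSIGMOID=3, HSWISH=4, LEAKYRELU=5, none=0 */
    return 0;
}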
PadCw; + if (Pch) *Pch = PadCh; + if (Ppw) *Ppw = PadPw; + if (Pph) *Pph = PadPh; +} + +void CNN_TileOverlap(Tile_Orientation_T TileOrientation, + int Fcx, int Fcy, int Dcx, int Dcy, int Scx, int Scy, + int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, + int *OverlapC, int *OverlapP) + +{ + /* Convolution: Fc = Filter dim, Sc = Stride, Dc = Dilation + Pooling : Fp = Filter dim, Sp = Stride, Dp = Dilation + 3 different configurations: + Convolution then Pooling + Convolution + Pooling + */ + if (OverlapC == 0) { + Fcx = Scx = Dcx = 1; Fcy = Scy = Dcy = 1; + } + if (OverlapP == 0) { + Fpx = Spx = Dpx = 1; Fpy = Spy = Dpy = 1; + } + int OverlapCx = (Dcx*(Fcx-1)+1) + Scx*((Dpx*(Fpx-1)+1)-Spx-1); + int OverlapCy = (Dcy*(Fcy-1)+1)+ Scy*((Dpy*(Fpy-1)+1)-Spy-1); + int OverlapPx = (Dpx*(Fpx-1)+1)-Spx; + int OverlapPy = (Dpy*(Fpy-1)+1)-Spy; + + if (OverlapC) *OverlapC = (TileOrientation==TILE_HOR)?OverlapCy:OverlapCx; + if (OverlapP) *OverlapP = (TileOrientation==TILE_HOR)?OverlapPy:OverlapPx; + + +} + +int CNN_CheckIfRepresentable(int Value, int Nbits) + +{ + return ((Abs(Value)&((1<(int)(b))?(a):(b)) +#define Max(a, b) (((a)>(b))?(a):(b)) +#define Min(a, b) (((a)<(b))?(a):(b)) +#define Abs(x) (((x)<0)?-(x):(x)) + +#define D0 KER_ITER_D0 +#define D1 KER_ITER_D1 +#define D2 KER_ITER_D2 +#define D3 KER_ITER_D3 +#define T0 KER_ITER_TILE0 +#define T1 KER_ITER_TILE1 +#define T2 KER_ITER_TILE2 + +#define MAXDPPREC + +#ifdef MAXDPPREC +#define DP_fps_S 4 +#else +#define DP_fps_S 2 +#endif + +int CNN_EncodePoolOperation(KernelOper_T PoolOper, KernelOper_T ReLUOper); +int CNN_EncodeActivation(KernelOper_T Oper); +int CNN_Gcd(int a, int b); +int CNN_Scm(int a, int b); +int CNN_UsedInputDimension(int Dim, int F, int S, int D, int Pad); +int CNN_TotalPaddingValue(int Dim, int F, int S, int D); +v4s CNN_EdgePaddingValue(AT_PadType PadType, int Padw, int Padh); +void CNN_LayerOutputDim(int Width, int Height, + KernelOper_T ConvOper, int Fcx, int Fcy, int Dcx, int Dcy, int Scx, int Scy, int ConvPad, + KernelOper_T PoolOper, int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, int PoolPad, + int *Wc, int *Hc, int *Wo, int *Ho, + int *Pcw, int *Pch, int *Ppw, int *Pph); +void CNN_TileOverlap(Tile_Orientation_T TileOrientation, + int Fcx, int Fcy, int Dcx, int Dcy, int Scx, int Scy, + int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, + int *OverlapC, int *OverlapP); +int CNN_CheckIfRepresentable(int Value, int Nbits); +int CNN_SetUpperLowerBounds(KernelOper_T ReLUOper, int DataSize, int DoReLU, int *LB, int *UB, int ReluN, int Precision); +#endif diff --git a/tools/autotiler_v3/generators/CNN/CNN_Generators.c b/tools/autotiler_v3/generators/CNN/CNN_Generators.c index 30a3f0643..0665c8456 100644 --- a/tools/autotiler_v3/generators/CNN/CNN_Generators.c +++ b/tools/autotiler_v3/generators/CNN/CNN_Generators.c @@ -2,8 +2,10 @@ #include #include "AutoTilerLib.h" #include "CNN_Generators.h" +#include "CNN_Generator_Util.h" #include "Gap.h" +#ifdef OLD #define MaxS(a, b) (((int)(a)>(int)(b))?(a):(b)) #define Max(a, b) (((a)>(b))?(a):(b)) #define Min(a, b) (((a)<(b))?(a):(b)) @@ -24,6 +26,7 @@ #else #define DP_fps_S 2 #endif +#endif void LoadCNNLibrary() @@ -632,6 +635,21 @@ void LoadCNNLibrary() TCArg("int", "UB") ) ); + LibKernelTemplate("KerLinearLayerReLU_fps_fps_fpd_T", + CArgs(11, + TCArg("signed char * __restrict__", "In"), + TCArg("unsigned short int", "InSize"), + TCArg("unsigned short int", "TotalInSize"), + TCArg("unsigned short int", "OutSize"), + TCArg("signed char * __restrict__", "Filter"), + 
TCArg("signed char * __restrict__", "Bias"), + TCArg("int * __restrict__", "Out"), + TCArg("unsigned char", "Norm"), + TCArg("signed char", "NormBias"), + TCArg("int", "LB"), + TCArg("int", "UB") + ) + ); LibKernelTemplate("KerLinearLayerReLU_fp_fp_fpd_T", CArgs(11, TCArg("short int * __restrict__", "In"), @@ -844,15 +862,12 @@ void LoadCNNLibrary() /* Linear Rectification (ReLU) */ LibKernel("KerParReLU_fp", CALL_PARALLEL, 0, "KerReLUPool_fp_T", CNN_Match(CNN_OperList(2, KOP_RELU, KOP_RELUN), 0, 1, CNN_Type(2,0,0,0,2), 0,0,0,0,0,0)); - LibKernel("KerParReLUN_Vector_fp", CALL_PARALLEL, 0, "KerReLUPool_fp_T", CNN_Match(CNN_OperList(1, KOP_RELUN_VECTOR), 0, 1, CNN_Type(2,0,0,0,2), 0,0,0,0,0,0)); LibKernel("KerParHswish_fp", CALL_PARALLEL, 0, "KerReLUPool_fp_T", CNN_Match(CNN_OperList(1, KOP_HSWISH), 0, 1, CNN_Type(2,0,0,0,2), 0,0,0,0,0,0)); LibKernel("KerParHsigmoid_fp", CALL_PARALLEL, 0, "KerReLUPool_fp_T", CNN_Match(CNN_OperList(1, KOP_HSIGMOID), 0, 1, CNN_Type(2,0,0,0,2), 0,0,0,0,0,0)); /* Linear layer followed by an optional activation, don't use when partial evaluation of the output is needed */ LibKernel("KerParLinearLayerReLU_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerReLUN_Vector_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 1, - CNN_Type(2,2,2,0,2), 0,0,0,0,0,0)); LibKernel("KerParLinearLayerHswish_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(2,2,2,0,2), 0,0,0,0,0,0)); LibKernel("KerParLinearLayerHsigmoid_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSIGMOID), 1, @@ -869,84 +884,48 @@ void LoadCNNLibrary() /* Matrix scaling, one scalar per channel */ LibKernel("KerParMatScaleVector_fp", CALL_PARALLEL, 0, "KerMatScale_fp_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,0,0,2), 0,0,0,0,0,0)); - LibKernel("KerParMatScaleVector_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatScale_fp_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,0,0,2), 0,0,0,0,0,0)); /* Matrix scaling, single scalar for all channels */ LibKernel("KerParMatScaleScalar_fp", CALL_PARALLEL, 0, "KerMatScale_fp_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,0,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerParMatScaleScalar_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatScale_fp_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,0,2,0,2), 0,0,0,0,0,0)); /* Matrix scaling, single scalar for all channels then one scalar per channel */ LibKernel("KerParMatScaleVectorScalar_fp", CALL_PARALLEL, 0, "KerMatScale_fp_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerParMatScaleVectorScalar_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatScale_fp_T",CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,0,2), 0,0,0,0,0,0)); /* Matrix multiplication */ LibKernel("KerParMatMul_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, 
KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,0,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMul_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,0,2), 0,0,0,0,1,1)); LibKernel("KerParMatMulSxSy_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,0,2), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSy_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,0,2), 0,0,0,0,-1,-1)); LibKernel("KerParMatMul_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,4,0,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMul_ReLUN_Vector_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,4,0,2), 0,0,0,0,1,1)); LibKernel("KerParMatMulSxSy_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,4,0,2), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSy_ReLUN_Vector_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,4,0,2), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulSmallFeat_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,0,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMulSmallFeat_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,0,2), 0,0,0,0,1,1)); /* Matrix multiplication, output scaled, single scalar for all channels */ LibKernel("KerParMatMulScaleScalar_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleScalar_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleScalarSxSy_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,2,2), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulScaleScalarSxSy_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,2,2), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScaleScalar_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,4,4,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleScalar_ReLUN_Vector_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,4,4,2), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleScalarSxSy_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,4,4,2), 0,0,0,0,-1,-1)); - 
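The geometry helpers introduced above in CNN_Generator_Util.c are what the generators registered here rely on to derive output sizes and padding. A minimal usage sketch, not part of the patch: it assumes KOP_CONV and KOP_MAXPOOL are the operation codes from AutoTilerLibTypes.h (CNN_LayerOutputDim only tests the operators against KOP_NONE), and runs a 32x32 input through a 3x3/stride-1 convolution with "same"-style padding followed by an unpadded 2x2/stride-2 max pooling:

        #include <stdio.h>
        #include "AutoTilerLib.h"
        #include "CNN_Generator_Util.h"

        static void ExampleLayerGeometry(void)
        {
                int Wc, Hc, Wo, Ho, Pcw, Pch, Ppw, Pph;
                /* Width=32, Height=32; Conv: 3x3, dilation 1, stride 1, ConvPad=1; Pool: 2x2, dilation 1, stride 2, PoolPad=0 */
                CNN_LayerOutputDim(32, 32,
                                   KOP_CONV,    3, 3, 1, 1, 1, 1, 1,
                                   KOP_MAXPOOL, 2, 2, 1, 1, 2, 2, 0,
                                   &Wc, &Hc, &Wo, &Ho, &Pcw, &Pch, &Ppw, &Pph);
                /* CNN_TotalPaddingValue(32,3,1,1) = 2, so Pcw = Pch = 2 (total padding, split later by
                   CNN_EdgePaddingValue); Wc = Hc = (32-3+2)/1+1 = 32 and Wo = Ho = (32-2)/2+1 = 16 */
                printf("Conv out %dx%d, Pool out %dx%d\n", Wc, Hc, Wo, Ho);
        }

The same filter/stride/dilation parameters feed CNN_TileOverlap() when the tiler needs to know how much input two adjacent tiles must share.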
LibKernel("KerParMatMulScaleScalarSxSy_ReLUN_Vector_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,4,4,2), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScaleScalarSmallFeat_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR_SM1), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleScalarSmallFeat_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR_SM1), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); /* Matrix multiplication, output scaled, one scalar per channel */ LibKernel("KerParMatMulScale_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScale_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleSxSy_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,2,2), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulScaleSxSy_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,2,2), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScale_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,4,4,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScale_ReLUN_Vector_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,4,4,2), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleSxSy_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,4,4,2), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulScaleSxSy_ReLUN_Vector_fpd_fp", CALL_PARALLEL, 0, "KerMatMul_fpd_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,4,4,2), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScaleSmallFeat_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SM1), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleSmallFeat_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SM1), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(2,2,2,2,2), 0,0,0,0,1,1)); /* Matrix multiplication with H Swish reduction */ LibKernel("KerParMatMulHswish_fp", CALL_PARALLEL, 0, "KerMatMul_fp_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), @@ -1121,20 +1100,17 @@ void LoadCNNLibrary() /* Linear Rectification (ReLU) */ LibKernel("KerReLU_fp", CALL_PARALLEL, 0, "KerReLUPool_fp_T", CNN_Match(CNN_OperList(2, KOP_RELU, KOP_RELUN), 0, 0, CNN_Type(2,0,0,0,2), 0,0,0,0,0,0)); - LibKernel("KerReLU_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerReLUPool_fp_T", CNN_Match(CNN_OperList(1, KOP_RELUN_VECTOR), 0, 0, CNN_Type(2,0,0,0,2), 0,0,0,0,0,0)); /* Linear layer followed by an optional activation */ LibKernel("KerLinearLayerReLU_fp", 
CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 0, CNN_Type(2,2,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerLinearLayerReLUN_Vector_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 0, - CNN_Type(2,2,2,0,2), 0,0,0,0,0,0)); /* Full precision Linear layer */ LibKernel("KerDPLinearLayer_fp", CALL_PARALLEL, 0, "KerDPLinearLayer_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR_DP), 0, 0, CNN_Type(2,2,0,0,4), 0,0,0,0,0,0)); LibKernel("KerDPLinearLayer_fp_fps", CALL_PARALLEL, 0, "KerDPLinearLayer_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_LINEAR_DP), 0, 0, CNN_Type(2,1,0,0,4), 0,0,0,0,0,0)); LibKernel("KerDPLinearLayerReduct_fp", CALL_SEQUENTIAL_STRUCT, 0, "KerDPLinearLayerReduct_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_LINEAR), - CNN_OperList(7, KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU), + CNN_OperList(6, KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU), 0, CNN_Type(4,2,0,0,2), 0,0,0,0,0,0)); /****************************************************************************************************************/ @@ -1337,7 +1313,6 @@ void LoadCNNLibrary() /* Linear Rectification (ReLU) */ LibKernel("KerParReLU_fps", CALL_PARALLEL, 0, "KerReLUPool_fps_T", CNN_Match(CNN_OperList(2, KOP_RELU, KOP_RELUN), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerParReLUN_Vector_fps", CALL_PARALLEL, 0, "KerReLUPool_fps_T", CNN_Match(CNN_OperList(1, KOP_RELUN_VECTOR), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("KerParHswish_fps", CALL_PARALLEL, 0, "KerReLUPool_fps_T", CNN_Match(CNN_OperList(1, KOP_HSWISH), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("KerParHsigmoid_fps", CALL_PARALLEL, 0, "KerReLUPool_fps_T", CNN_Match(CNN_OperList(1, KOP_HSIGMOID), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); @@ -1363,84 +1338,48 @@ void LoadCNNLibrary() /* Matrix scaling, one scalar per channel */ LibKernel("KerParMatScaleVector_fps", CALL_PARALLEL, 0, "KerMatScale_fps_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerParMatScaleVector_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatScale_fps_T",CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); /* Matrix scaling, single scalar for all channels */ LibKernel("KerParMatScaleScalar_fps", CALL_PARALLEL, 0, "KerMatScale_fps_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,0,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerParMatScaleScalar_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatScale_fps_T",CNN_Match(CNN_OperList(1, KOP_MATSCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,0,1,0,1), 0,0,0,0,0,0)); /* Matrix scaling, single scalar for all channels then one scalar per channel */ LibKernel("KerParMatScaleVectorScalar_fps", CALL_PARALLEL, 0, "KerMatScale_fps_T", CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerParMatScaleVectorScalar_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatScale_fps_T",CNN_Match(CNN_OperList(1, KOP_MATSCALE_VECTOR_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); /* Matrix multiplication */ LibKernel("KerParMatMul_fps", CALL_PARALLEL, 0, 
"KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMul_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); LibKernel("KerParMatMulSxSy_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSy_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); LibKernel("KerParMatMul_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMul_ReLUN_Vector_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); LibKernel("KerParMatMulSxSy_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSy_ReLUN_Vector_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulSmallFeat_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulSmallFeat_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); /* Matrix multiplication, output scaled, single scalar for all channels */ LibKernel("KerParMatMulScaleScalar_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleScalar_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleScalarSxSy_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,1,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulScaleScalarSxSy_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,1,1), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScaleScalar_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,2,2,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleScalar_ReLUN_Vector_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,2,2,1), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleScalarSxSy_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(3, KOP_RELU, 
KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,2,2,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulScaleScalarSxSy_ReLUN_Vector_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,2,2,1), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScaleScalarSmallFeat_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR_SM1), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleScalarSmallFeat_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SCALAR_SM1), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); /* Matrix multiplication, output scaled, one scalar per channel */ LibKernel("KerParMatMulScale_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScale_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleSxSy_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,1,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulScaleSxSy_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,1,1), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScale_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,2,2,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScale_ReLUN_Vector_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,2,2,1), 0,0,0,0,1,1)); LibKernel("KerParMatMulScaleSxSy_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,2,2,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulScaleSxSy_ReLUN_Vector_fp_fps", CALL_PARALLEL, 0, "KerMatMul_fp_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,2,2,1), 0,0,0,0,-1,-1)); LibKernel("KerParMatMulScaleSmallFeat_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SM1), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulScaleSmallFeat_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T",CNN_Match(CNN_OperList(1, KOP_MATMUL_SCALE_SM1), CNN_OperList(1, KOP_RELUN_VECTOR), - 1, CNN_Type(1,1,1,1,1), 0,0,0,0,1,1)); /* Matrix multiplication with H Swish reduction */ LibKernel("KerParMatMulHswish_fps", CALL_PARALLEL, 0, "KerMatMul_fps_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), @@ -1613,19 +1552,16 @@ void LoadCNNLibrary() /* Linear Rectification (ReLU) */ LibKernel("KerReLU_fps", CALL_PARALLEL, 0, "KerReLUPool_fps_T", CNN_Match(CNN_OperList(2, KOP_RELU, KOP_RELUN), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReLUN_Vector_fps", CALL_PARALLEL, 0, "KerReLUPool_fps_T", CNN_Match(CNN_OperList(1, KOP_RELUN_VECTOR), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); /* Linear 
layer followed by an optional activation */ LibKernel("KerLinearLayerReLU_fps", CALL_PARALLEL, 0, "KerLinearLayerReLU_fps_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 0, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerLinearLayerReLUN_Vector_fps", CALL_PARALLEL, 0, "KerLinearLayerReLU_fps_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 0, - CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); /* Full precision Linear Layer followed by a reduction */ LibKernel("KerDPLinearLayer_fps", CALL_PARALLEL, 0, "KerDPLinearLayer_fps_T", CNN_Match(CNN_OperList(1, KOP_LINEAR_DP), 0, 0, CNN_Type(1,1,0,0,4), 0,0,0,0,0,0)); LibKernel("KerDPLinearLayerReduct_fps", CALL_SEQUENTIAL_STRUCT, 0, "KerDPLinearLayerReduct_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_LINEAR), - CNN_OperList(7, KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU), + CNN_OperList(6, KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU), 0, CNN_Type(4,1,0,0,1), 0,0,0,0,0,0)); @@ -1638,30 +1574,18 @@ void LoadCNNLibrary() LibKernel("KerDP_fp", CALL_PARALLEL, 0, "KerDP_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); - LibKernel("KerDP_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerDP_fp_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); LibKernel("KerDP_IO_fp", CALL_PARALLEL, 0, "KerDP_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); - LibKernel("KerDP_IO_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerDP_fp_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); LibKernel("KerDPMulBiasScalar_fp", CALL_PARALLEL, 0, "KerDP_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerDPMulBiasScalar_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerDP_fp_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); LibKernel("KerDPMulBiasScalar_IO_fp", CALL_PARALLEL, 0, "KerDP_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerDPMulBiasScalar_IO_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerDP_fp_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); LibKernel("KerDPMulBias_fp", CALL_PARALLEL, 0, "KerDP_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerDPMulBias_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerDP_fp_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); LibKernel("KerDPMulBias_IO_fp", CALL_PARALLEL, 0, "KerDP_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerDPMulBias_IO_ReLUN_Vector_fp", CALL_PARALLEL, 0, "KerDP_fp_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(4,0,2,0,2), 0,0,0,0,0,0)); 
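With every KOP_RELUN_VECTOR registration removed in these hunks, the activation selector that the reduction kernels receive through the Imm(CNN_EncodeActivation(ReLUOper)) bindings later in this file is renumbered. A small sketch, not part of the patch, printing the new codes (old values shown for comparison; the KOP_* constants come from AutoTilerLibTypes.h):

        #include <stdio.h>
        #include "AutoTilerLib.h"
        #include "CNN_Generator_Util.h"

        static void ShowActivationCodes(void)
        {
                /* New encoding from CNN_EncodeActivation(); KOP_RELUN_VECTOR previously occupied code 3 */
                printf("KOP_NONE      -> %d\n", CNN_EncodeActivation(KOP_NONE));      /* 0 */
                printf("KOP_RELU      -> %d\n", CNN_EncodeActivation(KOP_RELU));      /* 1 */
                printf("KOP_RELUN     -> %d\n", CNN_EncodeActivation(KOP_RELUN));     /* 2 */
                printf("KOP_HSIGMOID  -> %d\n", CNN_EncodeActivation(KOP_HSIGMOID));  /* 3, was 4 */
                printf("KOP_HSWISH    -> %d\n", CNN_EncodeActivation(KOP_HSWISH));    /* 4, was 5 */
                printf("KOP_LEAKYRELU -> %d\n", CNN_EncodeActivation(KOP_LEAKYRELU)); /* 5, was 6 */
        }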
LibKernel("KerDP_hswish_fp", CALL_PARALLEL, 0, "KerDP_fp_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), -1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); @@ -1684,31 +1608,19 @@ void LoadCNNLibrary() LibKernel("KerDP_fps", CALL_PARALLEL, 0, "KerDP_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(DP_fps_S,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerDP_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerDP_fps_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(DP_fps_S,0,0,0,1), 0,0,0,0,0,0)); LibKernel("KerDP_IO_fps", CALL_PARALLEL, 0, "KerDP_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(DP_fps_S,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerDP_IO_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerDP_fps_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(DP_fps_S,0,0,0,1), 0,0,0,0,0,0)); LibKernel("KerDPMulBiasScalar_fps", CALL_PARALLEL, 0, "KerDP_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerDPMulBiasScalar_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerDP_fps_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); LibKernel("KerDPMulBiasScalar_IO_fps", CALL_PARALLEL, 0, "KerDP_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS_SCALAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerDPMulBiasScalar_IO_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerDP_fps_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS_SCALAR), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); LibKernel("KerDPMulBias_fps", CALL_PARALLEL, 0, "KerDP_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerDPMulBias_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerDP_fps_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_MULBIAS), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); LibKernel("KerDPMulBias_IO_fps", CALL_PARALLEL, 0, "KerDP_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerDPMulBias_IO_ReLUN_Vector_fps", CALL_PARALLEL, 0, "KerDP_fps_T", - CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_MULBIAS), CNN_OperList(1, KOP_RELUN_VECTOR), -1, CNN_Type(DP_fps_S,0,1,0,1), 0,0,0,0,0,0)); LibKernel("KerDP_hswish_fps", CALL_PARALLEL, 0, "KerDP_fps_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), -1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); @@ -1737,16 +1649,12 @@ void LoadCNNLibrary() /* Linear layer followed by an optional ReLU */ LibKernel("KerParLinearLayerReLU_fps_fps_fp",CALL_PARALLEL, 0, "KerLinearLayerReLU_fps_fps_fp_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(1,1,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerReLUN_Vector_fps_fps_fp",CALL_PARALLEL, 0, "KerLinearLayerReLU_fps_fps_fp_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 1, - CNN_Type(1,1,2,0,2), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerReLU_fps_fps_fpd",CALL_PARALLEL, 0, 
"KerLinearLayerReLU_fps_fps_fpd_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, + CNN_Type(1,1,1,0,4), 0,0,0,0,0,0)); LibKernel("KerParLinearLayerReLU_fp_fps_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fps_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,1,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerReLUN_Vector_fp_fps_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fps_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 1, - CNN_Type(2,1,2,0,2), 0,0,0,0,0,0)); LibKernel("KerParLinearLayerReLU_fp_fp_fpd", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fp_fpd_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 1, CNN_Type(2,2,2,0,4), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerReLUN_Vector_fp_fp_fpd", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fp_fpd_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 1, - CNN_Type(2,2,2,0,4), 0,0,0,0,0,0)); /****************************************************************************************************************/ @@ -1761,21 +1669,15 @@ void LoadCNNLibrary() /* Linear layer followed by an optional ReLU */ LibKernel("KerLinearLayerReLU_fp_fps_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fps_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 0, CNN_Type(2,1,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerLinearLayerReLUN_Vector_fp_fps_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fps_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 0, - CNN_Type(2,1,2,0,2), 0,0,0,0,0,0)); LibKernel("KerLinearLayerReLU_fps_fps_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fps_fps_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 0, CNN_Type(1,1,2,0,2), 0,0,0,0,0,0)); - LibKernel("KerLinearLayerReLUN_Vector_fps_fps_fp", CALL_PARALLEL, 0, "KerLinearLayerReLU_fps_fps_fp_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 0, - CNN_Type(1,1,2,0,2), 0,0,0,0,0,0)); LibKernel("KerLinearLayerReLU_fp_fp_fpd", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fp_fpd_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(3, KOP_RELU, KOP_RELUN, KOP_NONE), 0, CNN_Type(2,2,2,0,4), 0,0,0,0,0,0)); - LibKernel("KerLinearLayerReLUN_Vector_fp_fp_fpd", CALL_PARALLEL, 0, "KerLinearLayerReLU_fp_fp_fpd_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN_VECTOR), 0, - CNN_Type(2,2,2,0,4), 0,0,0,0,0,0)); } - -static int EncodePoolOperation(KernelOper_T PoolOper, KernelOper_T ReLUOper) +#ifdef OLD +static int CNN_EncodePoolOperation(KernelOper_T PoolOper, KernelOper_T ReLUOper) { int Pool=0; @@ -1790,21 +1692,20 @@ static int EncodePoolOperation(KernelOper_T PoolOper, KernelOper_T ReLUOper) return ((Pool<<1)|ReLU); } -static int EncodeActivation(KernelOper_T Oper) +static int CNN_EncodeActivation(KernelOper_T Oper) { switch (Oper) { case KOP_RELU: return 1; case KOP_RELUN: return 2; - case KOP_RELUN_VECTOR: return 3; - case KOP_HSIGMOID: return 4; - case KOP_HSWISH: return 5; - case KOP_LEAKYRELU: return 6; + case KOP_HSIGMOID: return 3; + case KOP_HSWISH: return 4; + case KOP_LEAKYRELU: return 5; default: return 0; } } -static int Gcd(int a, int b) +static int CNN_Gcd(int a, int b) { int x, y, z; @@ -1816,27 +1717,27 @@ static int Gcd(int a, int b) return y; } -static int Scm(int a, int b) +static int CNN_Scm(int a, int b) { - return ((a*b)/Gcd(a,b)); + 
return ((a*b)/CNN_Gcd(a,b)); } -static int UsedInputDimension(int Dim, int F, int S, int D, int PadL, int PadR) +static int CNN_UsedInputDimension(int Dim, int F, int S, int D, int Pad) { - /* Dim: input dimension, F: Filter dim, S: Stride, D: Dilation, PadL,PadR: pad values on both sides */ - return ((Dim-1)*S+(D*(F-1)+1)-PadL-PadR); + /* Dim: input dimension, F: Filter dim, S: Stride, D: Dilation, Pad: pad values (sum of both sides) */ + return ((Dim-1)*S+(D*(F-1)+1)-Pad); } -static int TotalPaddingValue(int Dim, int F, int S, int D) +static int CNN_TotalPaddingValue(int Dim, int F, int S, int D) { /* F: Filter dim, S: Stride, D: Dilation */ return ((Dim%S) == 0)?Max((D*(F-1)+1)-S, 0):Max((D*(F-1)+1) - (Dim%S), 0); } -static v4s EdgePaddingValue(AT_PadType PadType, int Padw, int Padh) +static v4s CNN_EdgePaddingValue(AT_PadType PadType, int Padw, int Padh) { v4s Pad; @@ -1845,15 +1746,16 @@ static v4s EdgePaddingValue(AT_PadType PadType, int Padw, int Padh) case PAD_RIGHT: Pad = (v4s) {0, Padw, 0, Padh}; break; case PAD_BALANCED_LEFT: Pad = (v4s) {Padw-Padw/2, Padw/2, Padh-Padh/2, Padh/2}; break; case PAD_BALANCED_RIGHT: Pad = (v4s) {Padw/2, Padw-Padw/2, Padh/2, Padh-Padh/2}; break; - default: GenTilingError("EdgePaddingValue: unknown padding method %d", PadType); + default: GenTilingError("CNN_EdgePaddingValue: unknown padding method %d", PadType); } return Pad; } -static void ConvOutDim( int Width, int Height, +static void CNN_LayerOutputDim( int Width, int Height, KernelOper_T ConvOper, int Fcx, int Fcy, int Dcx, int Dcy, int Scx, int Scy, int ConvPad, KernelOper_T PoolOper, int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, int PoolPad, - int *Wc, int *Hc, int *Wo, int *Ho) + int *Wc, int *Hc, int *Wo, int *Ho, + int *Pcw, int *Pch, int *Ppw, int *Pph) { /* Convolution: Fc = Filter dim, Sc = Stride, Dc = Dilation @@ -1862,8 +1764,10 @@ static void ConvOutDim( int Width, int Height, Convolution then Pooling Convolution Pooling - Wc, Hc: convolution output dimension if present, otherwise returns Width, Eight - Wo, Ho: If conv then pool output dimension after conv and pooling, if pool only pool out dim, if conv only conv out dim + Wc, Hc : convolution output dimension if present, otherwise returns Width, Eight + Wo, Ho : If conv then pool output dimension after conv and pooling, if pool only pool out dim, if conv only conv out dim + Pcw, Pch: Horizontal and vertical padding for convolution + Ppw, Pph: Horizontal and vertical padding for pooling */ int PadCw=0, PadCh=0; int PadPw=0, PadPh=0; @@ -1875,7 +1779,7 @@ static void ConvOutDim( int Width, int Height, Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1; } if (ConvOper!=KOP_NONE && ConvPad) { - PadCw = TotalPaddingValue(Width, Fcx, Scx, Dcx); PadCh = TotalPaddingValue(Height, Fcy, Scy, Dcy); + PadCw = CNN_TotalPaddingValue(Width, Fcx, Scx, Dcx); PadCh = CNN_TotalPaddingValue(Height, Fcy, Scy, Dcy); } int ConvW = (Width - (Dcx*(Fcx-1)+1) + PadCw)/Scx + 1; int ConvH = (Height - (Dcy*(Fcy-1)+1) + PadCh)/Scy + 1; @@ -1884,14 +1788,18 @@ static void ConvOutDim( int Width, int Height, if (Hc) *Hc = ConvH; else ConvH = Height; if (PoolOper!=KOP_NONE && PoolPad) { - PadPw = TotalPaddingValue(ConvW, Fpx, Spx, Dpx); PadPh = TotalPaddingValue(ConvH, Fpy, Spy, Dpy); + PadPw = CNN_TotalPaddingValue(ConvW, Fpx, Spx, Dpx); PadPh = CNN_TotalPaddingValue(ConvH, Fpy, Spy, Dpy); } if (Wo) *Wo = (ConvW - (Dpx*(Fpx-1)+1) + PadPw)/Spx + 1; if (Ho) *Ho = (ConvH - (Dpy*(Fpy-1)+1) + PadPh)/Spy + 1; + if (Pcw) *Pcw = PadCw; + if (Pch) *Pch = PadCh; + if (Ppw) *Ppw 
= PadPw; + if (Pph) *Pph = PadPh; } -static int TileOverlap(Tile_Orientation_T TileOrientation, +static void CNN_TileOverlap(Tile_Orientation_T TileOrientation, int Fcx, int Fcy, int Dcx, int Dcy, int Scx, int Scy, int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, int *OverlapC, int *OverlapP @@ -1922,13 +1830,13 @@ static int TileOverlap(Tile_Orientation_T TileOrientation, } -static int CheckIfRepresentable(int Value, int Nbits) +static int CNN_CheckIfRepresentable(int Value, int Nbits) { return ((Abs(Value)&((1<MulBiasScalar != -1) MulBiasScalar = Ctrl->MulBiasScalar; } - ConvOutDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, + CNN_LayerOutputDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, - &Wc, &Hc, &Wo, &Ho); + &Wc, &Hc, &Wo, &Ho, 0, 0, 0, 0); if ((InFeat%GroupIn)||(OutFeat%GroupOut)||((InFeat/GroupIn)!=(OutFeat/GroupOut))) GenTilingError("CNN_GroupedConvolutionPoolReLU: %s cannot divide In(%d)/Out(%d) feature spaces with these group parameters: GroupIn %d, GroupOut: %d", @@ -3282,24 +3146,20 @@ int CNN_GroupedConvolutionMulBiasPoolReLU( ); CloseKernelGroup(); - CKernel_Arg_T **KCArgs = AllocateCArgs(5+(ReLUOper==KOP_RELUN_VECTOR)); + CKernel_Arg_T **KCArgs = AllocateCArgs(5); int Ca=0; KCArgs[Ca++] = TCArg(CNN_ArgDataType(In_DataSize,1,1), "In"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(Filter_DataSize,1,1), "Filter"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(MulBias_DataSize,1,1),"MulBias"); - if (ReLUOper==KOP_RELUN_VECTOR) - KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "ReLUN"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out"); - Object_T **KArgs = AllocateKerArgs(5+(ReLUOper==KOP_RELUN_VECTOR)); + Object_T **KArgs = AllocateKerArgs(5); int Ka=0; KArgs[Ka++] = KerGroupArg("In", O_IN, NGroups*GroupIn*Width*Height, In_DataSize, "In"); KArgs[Ka++] = KerGroupArg("Filter", O_IN, NGroups*GroupIn*GroupOut*Fcx*Fcy, Filter_DataSize, "Filter"); KArgs[Ka++] = KerGroupArg("Bias", O_IN, NGroups*GroupOut, Bias_DataSize, "Bias"); KArgs[Ka++] = KerGroupArg("MulBias",O_IN, MulBiasScalar?1:NGroups*GroupOut, MulBias_DataSize, "MulBias"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerGroupArg("ReLUN", O_IN, NGroups*GroupOut, 1, "ReLUN"); KArgs[Ka++] = KerGroupArg("Out", O_OUT, NGroups*GroupOut*Wo*Ho, Out_DataSize, "Out"); UKGroup = UserKernelGroupK(Name, @@ -3308,15 +3168,6 @@ int CNN_GroupedConvolutionMulBiasPoolReLU( 0, Calls(1, UserKernelCall(BodyName, LOC_GROUP, - (ReLUOper==KOP_RELUN_VECTOR)? 
- Bindings(6, - KG_ArgOper("In", '*', GroupIn*Width*Height*In_DataSize), - KG_ArgOper("Filter", '*', GroupIn*GroupOut*Fcx*Fcy*Filter_DataSize), - KG_ArgOper("Bias", '*', GroupOut*Bias_DataSize), - KG_ArgOper("MulBias",'*', MulBiasScalar?1:GroupOut*Bias_DataSize), - KG_ArgOper("ReLUN", '*', GroupOut*1), - KG_ArgOper("Out", '*', GroupOut*Wo*Ho*Out_DataSize) - ): Bindings(5, KG_ArgOper("In", '*', GroupIn*Width*Height*In_DataSize), KG_ArgOper("Filter", '*', GroupIn*GroupOut*Fcx*Fcy*Filter_DataSize), @@ -3362,7 +3213,7 @@ int CNN_GroupedConvolutionMulBiasPoolReLU( Spy: Pooling stride, y dimension ReLUOper: Optional activation function: if (PoolOper!=KOP_NONE) KOP_RELU or KOP_NONE - else Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + else Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU Signature: Name(In, Out) Name(In, ReLUN, Out) @@ -3466,7 +3317,7 @@ int CNN_PoolReLU( PadInp[1] = Max(0, PadInp[1]-(Width-UsedWidth)); PadInp[3] = Max(0, PadInp[3]-(Height-UsedHeight)); /* Set output Lower and Upper bounds */ - if (SetUpperLowerBounds(ReLUOper, Out_DataSize, (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR), &OutLB, &OutUB, ReluN, Out_Q)) + if (CNN_SetUpperLowerBounds(ReLUOper, Out_DataSize, (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN), &OutLB, &OutUB, ReluN, Out_Q)) GenTilingError("CNN_PoolReLU %s, cannot represent saturation value with given out fixed point format %d", Name, Out_Q); if (PoolOper) LayerOp += OutFeat*Wo*Ho*Fpx*Fpy; @@ -3486,18 +3337,14 @@ int CNN_PoolReLU( printf("Nb Oper : %lld\n", LayerOp); } - CKernel_Arg_T **KCArgs = AllocateCArgs(2+(ReLUOper==KOP_RELUN_VECTOR)); + CKernel_Arg_T **KCArgs = AllocateCArgs(2); int Ca=0; KCArgs[Ca++] = TCArg(CNN_ArgDataType(In_DataSize,1,1), "In"); - if (ReLUOper==KOP_RELUN_VECTOR) - KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "ReLUN"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out"); - Object_T **KArgs = AllocateKerArgs(2+(ReLUOper==KOP_RELUN_VECTOR)); + Object_T **KArgs = AllocateKerArgs(2); int Ka=0; KArgs[Ka++] = KerArgP("In", KerArgSpace(2,D0,T0), OBJ_IN_DB|InL3, Width, Height, UsedWidth, UsedHeight, PadInp,PadInp, In_DataSize, OverlapP, 0, TileCons, "In"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg ("ReLUN", KerArgSpace(1,D0), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg ("Out", KerArgSpace(2,D0,T0), OBJ_OUT_DB|OutL3, Wo, Ho, Out_DataSize, 0, 0, 0, "Out"); Kernel_T *Kernel = UserKernel(Name, @@ -3523,7 +3370,7 @@ int CNN_PoolReLU( NeedFpx?Imm(Fpx):AT_IGNORE_ARG_BINDING, /* Pooling Fx */ NeedSpx?Imm(Spx):AT_IGNORE_ARG_BINDING, /* Pooling Stridex */ Imm((TileOrientation==TILE_HOR)?1:0), /* Pooling Orientation */ - Imm(EncodePoolOperation(PoolOper, ReLUOper)), /* Pooling operation with optional ReLU */ + Imm(CNN_EncodePoolOperation(PoolOper, ReLUOper)), /* Pooling operation with optional ReLU */ NeedDpx?Imm(Dpx):AT_IGNORE_ARG_BINDING, /* Pooling Dx */ NeedFpy?Imm(Fpy):AT_IGNORE_ARG_BINDING, /* Pooling Fy */ NeedSpy?Imm(Spy):AT_IGNORE_ARG_BINDING, /* Pooling Stridey */ @@ -3555,9 +3402,7 @@ int CNN_PoolReLU( Imm(OutLB), /* Activation lower bound, clip or relu */ (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Use UB to pass Norm */ - ((ReLUOper==KOP_RELUN_VECTOR)? 
- K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)) /* Activation upper bound, clip or relu */ + Imm(OutUB) /* Activation upper bound, clip or relu */ ) ) ), @@ -3568,7 +3413,6 @@ int CNN_PoolReLU( AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); AddKernelArgDim(Name, "In", 4, InFeat, Height, Width, In_DataSize); - if (ReLUOper==KOP_RELUN_VECTOR) AddKernelArgDim(Name, "ReLUN", 2, OutFeat, 1); AddKernelArgDim(Name, "Out", 4, OutFeat, Ho, Wo, Out_DataSize); AT_PrepareForTest(Name, @@ -3799,7 +3643,7 @@ int CNN_GlobalPool( OutDim: Number of outputs LinearOper Should always be KOP_LINEAR - ReLUOper Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + ReLUOper Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU Signature: Name(In, Filter, Bias, Out) Name(In, Filter, Bias, ReLUN, Out) @@ -3862,7 +3706,7 @@ int CNN_LinearReLU( if (LinearKerName==0) GenTilingError("CNN_LinearReLU Kernel: %s, Can't find a matching %s basic kernel", Name, ReLUOper?"with linear rectification":""); /* Set output Lower and Upper bounds */ - if (SetUpperLowerBounds(ReLUOper, Out_DataSize, (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR), &OutLB, &OutUB, ReluN, Out_Q)) + if (CNN_SetUpperLowerBounds(ReLUOper, Out_DataSize, (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN), &OutLB, &OutUB, ReluN, Out_Q)) GenTilingError("CNN_LinearReLU %s, cannot represent saturation value with given out fixed point format %d", Name, Out_Q); LayerOp += InDim*OutDim; @@ -3878,24 +3722,20 @@ int CNN_LinearReLU( } Kernel_T *Kernel; - CKernel_Arg_T **KCArgs = AllocateCArgs(4+(ReLUOper==KOP_RELUN_VECTOR)); + CKernel_Arg_T **KCArgs = AllocateCArgs(4); int Ca=0; KCArgs[Ca++] = TCArg(CNN_ArgDataType(In_DataSize,1,1), "In"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(Filter_DataSize,1,1), "Filter"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"); - if (ReLUOper==KOP_RELUN_VECTOR) - KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "ReLUN"); KCArgs[Ca++] = TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out"); AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); - Object_T **KArgs = AllocateKerArgs(4+(ReLUOper==KOP_RELUN_VECTOR)); + Object_T **KArgs = AllocateKerArgs(4); int Ka=0; KArgs[Ka++] = KerArg("In", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|InL3, 1, 1, InDim*In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Filter", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST|FilterL3, 1, 1, InDim*Filter_DataSize, 0, 0, 0, "Filter"); KArgs[Ka++] = KerArg("Bias", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST|BiasL3, 1, 1, Bias_DataSize, 0, 0, 0, "Bias"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(1,D0), OBJ_OUT_DB|OutL3, 1, 1, Out_DataSize, 0, 0, 0, "Out"); Kernel = UserKernel(Name, @@ -3917,9 +3757,7 @@ int CNN_LinearReLU( Imm(OutLB), /* Conv out lower bound, clip or relu */ (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Output format */ - ((ReLUOper==KOP_RELUN_VECTOR)? 
- K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)) /* Activation upper bound, clip or relu */ + Imm(OutUB) /* Activation upper bound, clip or relu */ ) ) ), @@ -3938,13 +3776,11 @@ int CNN_LinearReLU( if (LinearKerName==0 || ReductKerName==0) GenTilingError("CNN_LinearReLU Kernel: %s, Can't find a matching %s basic kernel", Name, ReLUOper?"with linear rectification":""); /* First try with Input as a buffer in */ - Object_T **KArgs = AllocateKerArgs(5+(ReLUOper==KOP_RELUN_VECTOR)); + Object_T **KArgs = AllocateKerArgs(5); int Ka=0; KArgs[Ka++] = KerArg("In", KerArgSpace(1,T0), OBJ_BUFFER_IN|InL3, 1, InDim, In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Filter", KerArgSpace(2,D0,T0), OBJ_IN_DB|O_CONST|FilterL3, 1, InDim, Filter_DataSize, 0, 0, 0, "Filter"); KArgs[Ka++] = KerArg("Bias", KerArgSpace(1,D0), OBJ_BUFFER_IN|O_CONST|BiasL3, 1, 1, Bias_DataSize, 0, 0, 0, "Bias"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(1,D0), OBJ_BUFFER_OUT|OutL3, 1, 1, Out_DataSize, 0, 0, 0, "Out"); KArgs[Ka++] = KerArg("Reduct", KerArgSpace(1,T0), O_BUFF|O_NTILED, 8, 1, 4, 0, 0, 0, 0); @@ -3953,16 +3789,16 @@ int CNN_LinearReLU( TileOrientation, KCArgs, Calls(2, - Call(LinearKerName, LOC_INNER_LOOP, + Call(LinearKerName, LOC_LOOP, Bindings(5, K_Arg("In", KER_ARG_TILE), /* Input tile */ K_Arg("Filter", KER_ARG_TILE), /* Filter tile */ K_Arg("Reduct", KER_ARG_TILE), /* Output tile */ - K_Arg("In", KER_ARG_TILE_H), /* Input tile size */ + K_Arg("Filter", KER_ARG_TILE_H), /* Input tile size */ Ker_IteratorIndex(T0) /* Which tile index */ ) ), - Call(ReductKerName, LOC_INNER_LOOP_EPILOG, + Call(ReductKerName, LOC_LOOP_EPILOG, Bindings(8, K_Arg("Reduct", KER_ARG_TILE), /* Input tile */ K_Arg("Bias", KER_ARG_TILE), /* Filter tile */ @@ -3970,13 +3806,11 @@ int CNN_LinearReLU( Imm(OutLB), /* LB */ (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Output fixed point format */ - ((ReLUOper==KOP_RELUN_VECTOR)? 
- K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ Imm(Filter_Q+In_Q-Out_Q), /* Normalization factor to be used for sum of product */ Imm(Filter_Q+In_Q-Bias_Q), /* Normalization factor to be used to adjust bias */ Imm(Out_Q), /* Output fixed point format */ - Imm(EncodeActivation(ReLUOper)) /* Oper, unused here */ + Imm(CNN_EncodeActivation(ReLUOper)) /* Oper, unused here */ ) ) ), @@ -3986,13 +3820,11 @@ int CNN_LinearReLU( AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); if (Log && (Kernel==0)) printf("Feature parallel with in buffered failed, switching to non buffered in form\n"); if (Kernel==0) { - Object_T **KArgs = AllocateKerArgs(5+(ReLUOper==KOP_RELUN_VECTOR)); + Object_T **KArgs = AllocateKerArgs(5); int Ka=0; KArgs[Ka++] = KerArg("In", KerArgSpace(1,T0), OBJ_IN_DB|InL3, 1, InDim, In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Filter", KerArgSpace(2,D0,T0), OBJ_IN_DB|O_CONST|FilterL3, 1, InDim, Filter_DataSize, 0, 0, 0, "Filter"); KArgs[Ka++] = KerArg("Bias", KerArgSpace(1,D0), OBJ_BUFFER_IN|O_CONST|BiasL3, 1, 1, Bias_DataSize, 0, 0, 0, "Bias"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(1,D0), OBJ_BUFFER_OUT|OutL3, 1, 1, Out_DataSize, 0, 0, 0, "Out"); KArgs[Ka++] = KerArg("Reduct", KerArgSpace(1,T0), O_BUFF|O_NTILED, 8, 1, 4, 0, 0, 0, 0); @@ -4001,16 +3833,16 @@ int CNN_LinearReLU( TileOrientation, KCArgs, Calls(2, - Call(LinearKerName, LOC_INNER_LOOP, + Call(LinearKerName, LOC_LOOP, Bindings(5, K_Arg("In", KER_ARG_TILE), /* Input tile */ K_Arg("Filter", KER_ARG_TILE), /* Filter tile */ K_Arg("Reduct", KER_ARG_TILE), /* Output tile */ - K_Arg("In", KER_ARG_TILE_H), /* Input tile size */ + K_Arg("Filter", KER_ARG_TILE_H), /* Input tile size */ Ker_IteratorIndex(T0) /* Which tile index */ ) ), - Call(ReductKerName, LOC_INNER_LOOP_EPILOG, + Call(ReductKerName, LOC_LOOP_EPILOG, Bindings(8, K_Arg("Reduct", KER_ARG_TILE), /* Input tile */ K_Arg("Bias", KER_ARG_TILE), /* Filter tile */ @@ -4018,12 +3850,10 @@ int CNN_LinearReLU( Imm(OutLB), /* LB */ (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Output fixed point format */ - ((ReLUOper==KOP_RELUN_VECTOR)? 
- K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ Imm(Filter_Q+In_Q-Out_Q), /* Normalization factor to be used for sum of product */ Imm(Filter_Q+In_Q-Bias_Q), /* Normalization factor to be used to adjust bias */ - Imm(EncodeActivation(ReLUOper)) /* Oper, unused here */ + Imm(CNN_EncodeActivation(ReLUOper)) /* Oper, unused here */ ) ) ), @@ -4039,7 +3869,6 @@ int CNN_LinearReLU( AddKernelArgDim(Name, "In", 2, InDim, In_DataSize); AddKernelArgDim(Name, "Filter", 3, OutDim, InDim, Filter_DataSize); AddKernelArgDim(Name, "Bias", 2, OutDim, Bias_DataSize); - if (ReLUOper==KOP_RELUN_VECTOR) AddKernelArgDim(Name, "ReLUN", 2, OutDim, 1); AddKernelArgDim(Name, "Out", 2, OutDim, Out_DataSize); AT_PrepareForTest(Name, @@ -4100,7 +3929,7 @@ int CNN_SoftMax( int Out_InL3, int Dim, - KernelOper_T SoftMaxOper + KernelOper_T SoftMaxOper ) { @@ -4129,7 +3958,7 @@ int CNN_SoftMax( TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(1, - Call(SoftMaxKerName, LOC_INNER_LOOP, + Call(SoftMaxKerName, LOC_LOOP, Bindings(4, K_Arg("In", KER_ARG_TILE), /* Input tile */ K_Arg("In", KER_ARG_TILE_H), /* Number of inputs */ @@ -4189,6 +4018,7 @@ int CNN_SoftMax( Height: Height of a given feature AddMatOper: Should always be KOP_MATADD + ReLUOper Optional activation function: KOP_RELU or KOP_NONE for no activation Signature: Name(In1, In2, Out) @@ -4196,6 +4026,8 @@ int CNN_SoftMax( *********************************************************************************************************************************************************************/ + +//This is just a wrapper for the old generator int CNN_MatAdd( char *Name, @@ -4218,7 +4050,42 @@ int CNN_MatAdd( int Width, int Height, - KernelOper_T AddMatOper + KernelOper_T AddMatOper){ + + KernelOper_T ReLUOper = KOP_NONE; + + return CNN_MatAddRelu(Name, Ctrl, + In1_DataSize, In2_DataSize, Out_DataSize, + In1_Q, In2_Q, Out_Q, + In1_InL3, In2_InL3, Out_InL3, + InFeat, OutFeat, Width, Height, + AddMatOper,ReLUOper); +} + +int CNN_MatAddRelu( + char *Name, + + CNN_GenControl_T *Ctrl, + + int In1_DataSize, + int In2_DataSize, + int Out_DataSize, + + int In1_Q, + int In2_Q, + int Out_Q, + + int In1_InL3, + int In2_InL3, + int Out_InL3, + + int InFeat, + int OutFeat, + int Width, + int Height, + + KernelOper_T AddMatOper, + KernelOper_T ReLUOper ) { @@ -4234,13 +4101,14 @@ int CNN_MatAdd( unsigned long long int LayerOp = 0; unsigned long long int LayerBandwidth = 0; int OutLB, OutUB; - KernelOper_T KernelOper = CNN_CompositeKernel(AddMatOper, KOP_NONE, KOP_NONE); + KernelOper_T KernelOper = CNN_CompositeKernel(AddMatOper, ReLUOper, KOP_NONE); - char *MatAddKerName = CNN_FindMatchingKernel(AddMatOper, KOP_NONE, ParFeat, In1_DataSize, In2_DataSize, 0, 0, Out_DataSize, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + char *MatAddKerName = CNN_FindMatchingKernel(AddMatOper, ReLUOper, ParFeat, In1_DataSize, In2_DataSize, 0, 0, Out_DataSize, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); if (MatAddKerName==0) GenTilingError("CNN_MatAdd Kernel: %s, Can't find a matching basic kernel", Name); + - SetUpperLowerBounds(KOP_NONE, Out_DataSize, 0, &OutLB, &OutUB, 0, Out_Q); + CNN_SetUpperLowerBounds(KOP_NONE, Out_DataSize, 0, &OutLB, &OutUB, 0, Out_Q); LayerOp += OutFeat * Width * Height; LayerBandwidth += Width*Height*In1_DataSize*InFeat; @@ -4256,7 +4124,7 @@ int CNN_MatAdd( TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(1, - Call(MatAddKerName, LOC_INNER_LOOP, + Call(MatAddKerName, 
LOC_LOOP, Bindings(11, K_Arg("In1", KER_ARG_TILE), /* First input tile */ K_Arg("In2", KER_ARG_TILE), /* Second input tile */ @@ -4378,7 +4246,7 @@ int CNN_MatAddDynAdjust( if ((In2_Q<0) || (In2_Q>(In2_DataSize*8 - 1))) GenTilingError("CNN_MatAddDynAdjust Kernel: %s, Incorrect quantization value for In2 %d", Name, In2_Q); if ((Out_Q<0) || (Out_Q>(Out_DataSize*8 - 1))) GenTilingError("CNN_MatAddDynAdjust Kernel: %s, Incorrect quantization value for Out %d", Name, Out_Q); - SetUpperLowerBounds(KOP_NONE, Out_DataSize, 0, &OutLB, &OutUB, 0, Out_Q); + CNN_SetUpperLowerBounds(KOP_NONE, Out_DataSize, 0, &OutLB, &OutUB, 0, Out_Q); LayerOp += OutFeat * Width * Height; LayerBandwidth += Width*Height*In1_DataSize*InFeat; @@ -4394,7 +4262,7 @@ int CNN_MatAddDynAdjust( TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(1, - Call(MatAddKerName, LOC_INNER_LOOP, + Call(MatAddKerName, LOC_LOOP, Bindings(11, K_Arg("In1", KER_ARG_TILE), /* First input tile */ K_Arg("In2", KER_ARG_TILE), /* Second input tile */ @@ -4466,7 +4334,7 @@ int CNN_MatAddDynAdjust( Height: Height of a given feature ScaleOper: Should always be KOP_MATSCALE_VECTOR, KOP_MATSCALE_SCALAR or KOP_MATSCALE_VECTOR_SCALAR - ReLUOper: Optional activation, should be KOP_NONE, KOP_RELU, KOP_RELUN or KOP_RELUN_VECTOR + ReLUOper: Optional activation, should be KOP_NONE, KOP_RELU, KOP_RELUN Signature: Name(In, Scalar, Out) Name(In, Scalar, Out, ReLUN) @@ -4541,7 +4409,7 @@ int CNN_MatScale( char *MatScaleKerName = CNN_FindMatchingKernel(ScaleOper, ReLUOper, ParFeat, In_DataSize, Vector_DataSize, Scalar_DataSize, 0, Out_DataSize, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); if (MatScaleKerName==0) GenTilingError("CNN_MatScale Kernel: %s, Can't find a matching basic kernel", Name); - SetUpperLowerBounds(ReLUOper, Out_DataSize, (ReLUOper!=KOP_NONE), &OutLB, &OutUB, 0, Out_Q); + CNN_SetUpperLowerBounds(ReLUOper, Out_DataSize, (ReLUOper!=KOP_NONE), &OutLB, &OutUB, 0, Out_Q); LayerOp += OutFeat * Width * Height; @@ -4557,47 +4425,35 @@ int CNN_MatScale( switch (ScaleOper) { case KOP_MATSCALE_VECTOR: - KerCArgs = AllocateCArgs(3+(ReLUOper==KOP_RELUN_VECTOR)); + KerCArgs = AllocateCArgs(3); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(In_DataSize,1,1), "In"); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(Vector_DataSize,1,1), "Vector"); - if (ReLUOper==KOP_RELUN_VECTOR) - KerCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "ReLUN"); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out"); - KArgs = AllocateKerArgs(3+(ReLUOper==KOP_RELUN_VECTOR)); + KArgs = AllocateKerArgs(3); KArgs[Ka++] = KerArg("In", KerArgSpace(2,D0,T0), O_IN|O_DB|InL3, 1, 1, Width*Height*In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Vector", KerArgSpace(1,D0), O_IN|O_DB|VectorL3, 1, 1, Vector_DataSize, 0, 0, 0, "Vector"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB|VectorL3, 1, 1, Width*Height*Out_DataSize, 0, 0, 0, "Out"); break; case KOP_MATSCALE_SCALAR: - KerCArgs = AllocateCArgs(3+(ReLUOper==KOP_RELUN_VECTOR)); + KerCArgs = AllocateCArgs(3); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(In_DataSize,1,1), "In"); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(Scalar_DataSize,1,1), "Scalar"); - if (ReLUOper==KOP_RELUN_VECTOR) - KerCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "ReLUN"); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out"); - KArgs = AllocateKerArgs(3+(ReLUOper==KOP_RELUN_VECTOR)); + KArgs = AllocateKerArgs(3); KArgs[Ka++] = KerArg("In", 
KerArgSpace(2,D0,T0), O_IN|O_DB|InL3, 1, 1, Width*Height*In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Scalar", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|VectorL3, 1, 1, Scalar_DataSize, 0, 0, 0, "Scalar"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB|VectorL3, 1, 1, Width*Height*Out_DataSize, 0, 0, 0, "Out"); break; case KOP_MATSCALE_VECTOR_SCALAR: - KerCArgs = AllocateCArgs(4+(ReLUOper==KOP_RELUN_VECTOR)); + KerCArgs = AllocateCArgs(4); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(In_DataSize,1,1), "In"); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(Vector_DataSize,1,1), "Vector"); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(Scalar_DataSize,1,1), "Scalar"); - if (ReLUOper==KOP_RELUN_VECTOR) - KerCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "ReLUN"); KerCArgs[Ca++] = TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out"); - KArgs = AllocateKerArgs(4+(ReLUOper==KOP_RELUN_VECTOR)); + KArgs = AllocateKerArgs(4); KArgs[Ka++] = KerArg("In", KerArgSpace(2,D0,T0), O_IN|O_DB|InL3, 1, 1, Width*Height*In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Vector", KerArgSpace(1,D0), O_IN|O_DB|VectorL3, 1, 1, Vector_DataSize, 0, 0, 0, "Vector"); KArgs[Ka++] = KerArg("Scalar", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|VectorL3, 1, 1, Scalar_DataSize, 0, 0, 0, "Scalar"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB|VectorL3, 1, 1, Width*Height*Out_DataSize, 0, 0, 0, "Out"); break; } @@ -4608,7 +4464,7 @@ int CNN_MatScale( TileOrientation, KerCArgs, Calls(1, - Call(MatScaleKerName, LOC_INNER_LOOP, + Call(MatScaleKerName, LOC_LOOP, Bindings(11, K_Arg("In", KER_ARG_TILE), /* First input tile */ (ScaleOper==KOP_MATSCALE_VECTOR||ScaleOper==KOP_MATSCALE_VECTOR_SCALAR)? @@ -4619,9 +4475,7 @@ int CNN_MatScale( Imm(Height), /* Input tile height */ K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of Matrices involved */ Imm(OutLB), /* Out lower bound */ - ((ReLUOper==KOP_RELUN_VECTOR)? - K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ (ScaleOper==KOP_MATSCALE_SCALAR||ScaleOper==KOP_MATSCALE_VECTOR_SCALAR)? 
K_Arg("Scalar", KER_ARG_TILE): /* Scalar Scale input tile */ AT_IGNORE_ARG_BINDING, /* Scale Scalar, not relevant here */ @@ -4640,28 +4494,22 @@ int CNN_MatScale( Ka=0; switch (ScaleOper) { case KOP_MATSCALE_VECTOR: - KArgs = AllocateKerArgs(3+(ReLUOper==KOP_RELUN_VECTOR)); + KArgs = AllocateKerArgs(3); KArgs[Ka++] = KerArg("In", KerArgSpace(2,D0,T0), O_IN|O_DB|InL3, Width, Height, In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Vector", KerArgSpace(1,D0), O_IN|O_DB|VectorL3, 1, 1, Vector_DataSize, 0, 0, 0, "Vector"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB|VectorL3, Width, Height, Out_DataSize, 0, 0, 0, "Out"); break; case KOP_MATSCALE_SCALAR: - KArgs = AllocateKerArgs(3+(ReLUOper==KOP_RELUN_VECTOR)); + KArgs = AllocateKerArgs(3); KArgs[Ka++] = KerArg("In", KerArgSpace(2,D0,T0), O_IN|O_DB|InL3, Width, Height, In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Scalar", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|VectorL3, 1, 1, Scalar_DataSize, 0, 0, 0, "Scalar"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB|VectorL3, Width, Height, Out_DataSize, 0, 0, 0, "Out"); break; case KOP_MATSCALE_VECTOR_SCALAR: - KArgs = AllocateKerArgs(4+(ReLUOper==KOP_RELUN_VECTOR)); + KArgs = AllocateKerArgs(4); KArgs[Ka++] = KerArg("In", KerArgSpace(2,D0,T0), O_IN|O_DB|InL3, Width, Height, In_DataSize, 0, 0, 0, "In"); KArgs[Ka++] = KerArg("Vector", KerArgSpace(1,D0), O_IN|O_DB|VectorL3, 1, 1, Vector_DataSize, 0, 0, 0, "Vector"); KArgs[Ka++] = KerArg("Scalar", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|VectorL3, 1, 1, Scalar_DataSize, 0, 0, 0, "Scalar"); - if (ReLUOper==KOP_RELUN_VECTOR) - KArgs[Ka++] = KerArg("ReLUN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ReLUN"); KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB|VectorL3, Width, Height, Out_DataSize, 0, 0, 0, "Out"); break; } @@ -4670,7 +4518,7 @@ int CNN_MatScale( TileOrientation, KerCArgs, Calls(1, - Call(MatScaleKerName, LOC_INNER_LOOP, + Call(MatScaleKerName, LOC_LOOP, Bindings(11, K_Arg("In", KER_ARG_TILE), /* First input tile */ (ScaleOper==KOP_MATSCALE_VECTOR||ScaleOper==KOP_MATSCALE_VECTOR_SCALAR)? @@ -4681,9 +4529,7 @@ int CNN_MatScale( K_Arg("In", KER_ARG_TILE_H), /* Input tile height */ K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of Matrices involved */ Imm(OutLB), /* Out lower bound */ - ((ReLUOper==KOP_RELUN_VECTOR)? - K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ (ScaleOper==KOP_MATSCALE_SCALAR||ScaleOper==KOP_MATSCALE_VECTOR_SCALAR)? 
K_Arg("Scalar", KER_ARG_TILE): /* Scalar Scale input tile */ AT_IGNORE_ARG_BINDING, /* Scalar Scale, not relevant here */ @@ -4704,7 +4550,6 @@ int CNN_MatScale( AddKernelArgDim(Name, "In", 4, InFeat, Height, Width, In_DataSize); if (ScaleOper==KOP_MATSCALE_VECTOR || ScaleOper==KOP_MATSCALE_VECTOR_SCALAR) AddKernelArgDim(Name, "Vector", 2, InFeat, Vector_DataSize); if (ScaleOper==KOP_MATSCALE_SCALAR || ScaleOper==KOP_MATSCALE_VECTOR_SCALAR) AddKernelArgDim(Name, "Scalar", 2, 1, Scalar_DataSize); - if (ReLUOper==KOP_RELUN_VECTOR) AddKernelArgDim(Name, "ReLUN", 2, OutFeat, 1); AddKernelArgDim(Name, "Out", 4, OutFeat, Height, Width, Out_DataSize); AT_PrepareForTest(Name, @@ -4761,7 +4606,7 @@ int CNN_MatScale( ReLU_UpperBound In case ReLUOper!=KOP_NONE Upper bound to be used for activation MatMulOper: Should always be KOP_MATMUL - ReLUOper: Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + ReLUOper: Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) Signature: Name(In2, In1, Bias, Out) Name(In2, In1, Bias, ReLUN, Out) @@ -4820,16 +4665,15 @@ int CNN_MatMul( int LineO = LineM1, ColO = ColM2; int OutLB, OutUB, ReluN = 6; int ConsT0 = Scx; - int MultiLineBuff = 0; - int Nbuff = MultiLineBuff?4:1; + int Nbuff; if (Ctrl) { if (Ctrl->ReluN != -1) ReluN = Ctrl->ReluN; } if (!(MatMulOper == KOP_MATMUL)) GenTilingError("CNN_MatMul Kernel: %s, MatMulOper should be KOP_MATMUL", Name); - if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) - GenTilingError("CNN_MatMul Kernel: %s, ReLUOper should be KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID or KOP_LEAKYRELU", Name); + if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) + GenTilingError("CNN_MatMul Kernel: %s, ReLUOper should be KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID or KOP_LEAKYRELU", Name); KernelOper_T KernelOper = CNN_CompositeKernel(MatMulOper, ReLUOper, KOP_NONE); if (ColM1 != LineM2) GenTilingError("CNN_MatMul: %s, Incorrect input matrices dimensions for a matrix multiplication: [%d x %d]*[%d x %d] %s", Name, LineM1, ColM1, LineM2, ColM2); @@ -4842,6 +4686,8 @@ int CNN_MatMul( if (MatMulKerName==0) GenTilingError("CNN_MatMul Kernel: %s, Can't find a matching basic kernel", Name); + if (In1_DataSize==1 && In2_DataSize==1 && Scx==1 && Scy==1) Nbuff = 4; else Nbuff = 1; + ColO = ((Width+Scx-1)/Scx) * ((Height+Scy-1)/Scy); LayerOp += ColM1*ColO*LineM1; LayerBandwidth += LineM1*(ColM1*ColM2*(In1_DataSize+In2_DataSize)); @@ -4855,9 +4701,9 @@ int CNN_MatMul( } else if (Out_DataSize==2) { OutLB = -32768; OutUB = 32767; } else GenTilingError("CNN_MatMul Kernel: %s, Unsupported Data Type Size for Output", Name); - if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR) { + if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN) { if (ReLU_LowerBound==0 && ReLU_UpperBound==0) { - if (SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) + if (CNN_SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) GenTilingError("CNN_MatMul %s, cannot represent saturation value with given out fixed point format %d", Name, Out_Q); } else { if (ReLU_LowerBound) OutLB = ReLU_LowerBound; else OutLB = 0; @@ -4878,15 +4724,14 @@ int CNN_MatMul( 
Kernel_T *Kernel = UserKernel(Name, KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)), TILE_HOR, - CArgs(5, + CArgs(4, TCArg(CNN_ArgDataType(In2_DataSize,1,1), "In2"), TCArg(CNN_ArgDataType(In1_DataSize,1,1), "In1"), TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"), - (ReLUOper==KOP_RELUN_VECTOR)?TCArg(CNN_ArgDataType(1,1,1), "ReLUN"):AT_NO_C_ARG, TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(1, - Call(MatMulKerName, LOC_INNER_LOOP, + Call(MatMulKerName, LOC_LOOP, Bindings(21, K_Arg("In1", KER_ARG_TILE), K_Arg("In1", KER_ARG_TILE_W), K_Arg("In1", KER_ARG_TILE_H), K_Arg("In2", KER_ARG_TILE), K_Arg("In2", KER_ARG_TILE_W), @@ -4896,9 +4741,7 @@ int CNN_MatMul( Imm(OutLB), (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Output fixed point format */ - ((ReLUOper==KOP_RELUN_VECTOR)? - K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ Imm(In1_Q+In2_Q-Out_Q), /* Out fixed point adjust */ Imm(In1_Q+In2_Q-Bias_Q), /* Bias fixed point adjust */ AT_IGNORE_ARG_BINDING, /* MulBias fixed point format, unused */ @@ -4911,22 +4754,18 @@ int CNN_MatMul( ) ), ColFirst? - KerArgs(6, + KerArgs(5, KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, In2_DataSize, 0, 0, 0, 0), KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST|In1L3, ColM1, LineM1, In1_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB|In2L3, ColM2, LineM2, In2_DataSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), KerArg("Bias", KerArgSpace(1, T0), O_IN|O_DB|O_CONST|BiasL3, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), - (ReLUOper==KOP_RELUN_VECTOR)? - KerArg("ReLUN", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, 1, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "ReLUN"):AT_NO_KER_ARG, KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB|OutL3, ColO, LineO, Out_DataSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out") ): - KerArgs(6, + KerArgs(5, KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, In2_DataSize, 0, 0, 0, 0), KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST|In1L3, ColM1, LineM1, In1_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB|In2L3, ColM2, LineM2, In2_DataSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), KerArg("Bias", KerArgSpace(1, T1), O_IN|O_DB|O_CONST|BiasL3, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), - (ReLUOper==KOP_RELUN_VECTOR)? 
- KerArg("ReLUN", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, 1, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "ReLUN"):AT_NO_KER_ARG, KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB|OutL3, ColO, LineO, Out_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out") ) ); @@ -4937,7 +4776,6 @@ int CNN_MatMul( AddKernelArgDim(Name, "In1", 3, LineM1, ColM1, In1_DataSize); AddKernelArgDim(Name, "In2", 4, LineM2, Height, Width, In2_DataSize); AddKernelArgDim(Name, "Bias", 2, LineO, Bias_DataSize); - if (ReLUOper==KOP_RELUN_VECTOR) AddKernelArgDim(Name, "ReLUN", 2, LineO, 1); AddKernelArgDim(Name, "Out", 3, LineO, ColO, Out_DataSize); AT_PrepareForTest(Name, @@ -4995,7 +4833,7 @@ int CNN_MatMul( ReLU_UpperBound In case ReLUOper!=KOP_NONE Upper bound to be used for activation MatMulOper Should always be KOP_MATMUL - ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) Signature: Name(In2, In1, Bias, Out) Name(In2, In1, Bias, ReLUN, Out) @@ -5061,7 +4899,7 @@ int CNN_MatMulSmallM1( } if (!(MatMulOper == KOP_MATMUL_SM1)) GenTilingError("CNN_MatMulSmallM1 Kernel: %s, MatMulOper should be KOP_MATMUL_SM1", Name); - if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) + if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) GenTilingError("CNN_MatMulSmallM1 Kernel: %s, ReLUOper should be KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID or KOP_RELUN", Name); KernelOper_T KernelOper = CNN_CompositeKernel(MatMulOper, ReLUOper, KOP_NONE); @@ -5088,9 +4926,9 @@ int CNN_MatMulSmallM1( } else if (Out_DataSize==2) { OutLB = -32768; OutUB = 32767; } else GenTilingError("CNN_MatMulSmallM1 Kernel: %s, Unsupported Data Type Size for Output", Name); - if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR) { + if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN) { if (ReLU_LowerBound==0 && ReLU_UpperBound==0) { - if (SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) + if (CNN_SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) GenTilingError("CNN_MatMulSmallM1 %s, cannot represent saturation value with given out fixed point format %d", Name, Out_Q); } else { if (ReLU_LowerBound) OutLB = ReLU_LowerBound; else OutLB = 0; @@ -5111,15 +4949,14 @@ int CNN_MatMulSmallM1( Kernel_T *Kernel = UserKernel(Name, KernelIterSpace(1, IterTiledSpace(T0)), TILE_VER, - CArgs(5, + CArgs(4, TCArg(CNN_ArgDataType(In2_DataSize,1,1), "In2"), TCArg(CNN_ArgDataType(In1_DataSize,1,1), "In1"), TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"), - (ReLUOper==KOP_RELUN_VECTOR)?TCArg(CNN_ArgDataType(1,1,1), "ReLUN"):AT_NO_C_ARG, TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(2, - Call(MatTransKerName, LOC_INNER_LOOP, + Call(MatTransKerName, LOC_LOOP, Bindings(7, K_Arg("In2", KER_ARG_TILE), /* Input tile */ K_Arg("TransIn2", KER_ARG_TILE), /* Transposed input tile */ @@ -5130,7 +4967,7 @@ int CNN_MatMulSmallM1( NeedScy?Imm(Scy):AT_IGNORE_ARG_BINDING ) ), - Call(MatMulKerName, LOC_INNER_LOOP, + Call(MatMulKerName, LOC_LOOP, Bindings(21, K_Arg("In1", KER_ARG_TILE), Imm(ColM1), Imm(LineM1), K_Arg("TransIn2", KER_ARG_TILE), K_Arg("TransIn2", KER_ARG_TILE_W), @@ -5140,9 +4977,7 @@ int 
CNN_MatMulSmallM1( Imm(OutLB), (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Output fixed point format */ - ((ReLUOper==KOP_RELUN_VECTOR)? - K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ Imm(In1_Q+In2_Q-Out_Q), /* Out fixed point format */ Imm(In1_Q+In2_Q-Bias_Q), /* Bias fixed point format */ AT_IGNORE_ARG_BINDING, /* MulBias fixed point format, unused */ @@ -5154,13 +4989,12 @@ int CNN_MatMulSmallM1( ) ) ), - KerArgs(6, + KerArgs(5, KerArg("In1", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED|O_CONST|In1L3, ColM1, LineM1, In1_DataSize, 0, 0, 0, "In1"), KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB|In2L3, ColM2, LineM2, In2_DataSize, 0, 0, TileCons, "In2"), KerArg("TransIn2", KerArgSpace(1, T0), O_BUFF|O_ONETILE, ColO, LineM2, In2_DataSize, 0, 0, 0, ""), + // KerArg("TransIn2", KerArgSpace(1, T0), O_BUFF|O_ONETILE, ColM2, LineM2, In2_DataSize, 0, 0, 0, ""), KerArg("Bias", KerArgSpace(1, T0), O_BUFF|O_IN|O_NTILED|O_CONST|BiasL3, 1, LineM1, Bias_DataSize, 0, 0, 0, "Bias"), - (ReLUOper==KOP_RELUN_VECTOR)? - KerArg("ReLUN", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, 1, LineO, 1, 0, 0, 0, "ReLUN"):AT_NO_KER_ARG, KerArg("Out", KerArgSpace(1, T0), O_OUT|O_DB|OutL3, ColO, LineM1, Out_DataSize, 0, 0, 0, "Out") ) ); @@ -5171,7 +5005,6 @@ int CNN_MatMulSmallM1( AddKernelArgDim(Name, "In1", 3, LineM1, ColM1, In1_DataSize); AddKernelArgDim(Name, "In2", 4, LineM2, Height, Width, In2_DataSize); AddKernelArgDim(Name, "Bias", 2, LineO, Bias_DataSize); - if (ReLUOper==KOP_RELUN_VECTOR) AddKernelArgDim(Name, "ReLUN", 2, LineO, 1); AddKernelArgDim(Name, "Out", 3, LineO, ColO, Out_DataSize); AT_PrepareForTest(Name, @@ -5231,7 +5064,7 @@ int CNN_MatMulSmallM1( ReLU_UpperBound In case ReLUOper!=KOP_NONE Upper bound to be used for activation MatMulOper Should always be KOP_MATMUL_SCALE or KOP_MATMUL_SCALE_SCALAR - ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) Signature: Name(In2, In1, Bias, MulBias, Out) Name(In2, In1, Bias, MulBias, ReLUN, Out) @@ -5295,13 +5128,14 @@ int CNN_MatMulScale( int OutLB, OutUB, ReluN = 6; int ConsT0 = Scx; int MulBiasScalar = (MatMulOper==KOP_MATMUL_SCALE_SCALAR); + int Nbuff; if (Ctrl) { if (Ctrl->ReluN != -1) ReluN = Ctrl->ReluN; } if (!(MatMulOper == KOP_MATMUL_SCALE || MatMulOper == KOP_MATMUL_SCALE_SCALAR)) GenTilingError("CNN_MatMulScale Kernel: %s, MatMulOper should be KOP_MATMUL_SCALE or KOP_MATMUL_SCALE_SCALAR", Name); - if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) + if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) GenTilingError("CNN_MatMulScale Kernel: %s, ReLUOper should be KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID or KOP_LEAKYRELU", Name); KernelOper_T KernelOper = CNN_CompositeKernel(MatMulOper, ReLUOper, KOP_NONE); @@ -5315,6 +5149,8 @@ int CNN_MatMulScale( if (MatMulKerName==0) GenTilingError("CNN_MatMulScale Kernel: %s, Can't find a matching basic kernel", Name); + if (In1_DataSize==1 && In2_DataSize==1 && Scx==1 && Scy==1) Nbuff = 4; else Nbuff = 1; + ColO = ((Width+Scx-1)/Scx) * ((Height+Scy-1)/Scy); LayerOp += 
ColM1*ColO*LineM1; @@ -5330,9 +5166,9 @@ int CNN_MatMulScale( } else if (Out_DataSize==2) { OutLB = -32768; OutUB = 32767; } else GenTilingError("CNN_MatMulScale Kernel: %s, Unsupported Data Type Size for Output", Name); - if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR) { + if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN) { if (ReLU_LowerBound==0 && ReLU_UpperBound==0) { - if (SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) + if (CNN_SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) GenTilingError("CNN_MatMulScale %s, cannot represent saturation value with given out fixed point format %d", Name, Out_Q); } else { if (ReLU_LowerBound) OutLB = ReLU_LowerBound; else OutLB = 0; @@ -5352,16 +5188,15 @@ int CNN_MatMulScale( Kernel_T *Kernel = UserKernel(Name, KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)), TILE_HOR, - CArgs(6, + CArgs(5, TCArg(CNN_ArgDataType(In2_DataSize,1,1), "In2"), TCArg(CNN_ArgDataType(In1_DataSize,1,1), "In1"), TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"), TCArg(CNN_ArgDataType(MulBias_DataSize,1,1), "MulBias"), - (ReLUOper==KOP_RELUN_VECTOR)?TCArg(CNN_ArgDataType(1,1,1), "ReLUN"):AT_NO_C_ARG, TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(1, - Call(MatMulKerName, LOC_INNER_LOOP, + Call(MatMulKerName, LOC_LOOP, Bindings(21, K_Arg("In1", KER_ARG_TILE), K_Arg("In1", KER_ARG_TILE_W), K_Arg("In1", KER_ARG_TILE_H), K_Arg("In2", KER_ARG_TILE), K_Arg("In2", KER_ARG_TILE_W), @@ -5371,9 +5206,7 @@ int CNN_MatMulScale( Imm(OutLB), (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Output fixed point format */ - ((ReLUOper==KOP_RELUN_VECTOR)? - K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ Imm(In1_Q+In2_Q-Out_Q), /* Fixed point adjust for output */ Imm(In1_Q+In2_Q-Bias_Q), /* Fixed point adjust for bias */ Imm(MulBias_Q), /* Fixed point format for multiplicative bias */ @@ -5386,28 +5219,24 @@ int CNN_MatMulScale( ) ), ColFirst? - KerArgs(7, - KerArg("KerBuff", KerArgSpace(1, T1), O_BUFF|O_NTILED, ColM1, 1, In2_DataSize, 0, 0, 0, 0), + KerArgs(6, + KerArg("KerBuff", KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, In2_DataSize, 0, 0, 0, 0), KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST|In1L3, ColM1, LineM1, In1_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB|In2L3, ColM2, LineM2, In2_DataSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), KerArg("Bias", KerArgSpace(1, T0), O_IN|O_DB|O_CONST|BiasL3, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), MulBiasScalar? KerArg("MulBias", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED|O_CONST|MulBiasL3, 1, 1, MulBias_DataSize, 0, 0, 0, "MulBias"): KerArg("MulBias", KerArgSpace(1, T0), O_IN|O_DB|O_CONST|MulBiasL3, 1, LineO, MulBias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "MulBias"), - (ReLUOper==KOP_RELUN_VECTOR)? 
- KerArg("ReLUN", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, 1, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "ReLUN"):AT_NO_KER_ARG, KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB|OutL3, ColO, LineO, Out_DataSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out") ): - KerArgs(7, - KerArg("KerBuff", KerArgSpace(1, T0), O_BUFF|O_NTILED, ColM1, 1, In2_DataSize, 0, 0, 0, 0), + KerArgs(6, + KerArg("KerBuff", KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, In2_DataSize, 0, 0, 0, 0), KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST|In1L3, ColM1, LineM1, In1_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB|In2L3, ColM2, LineM2, In2_DataSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), KerArg("Bias", KerArgSpace(1, T1), O_IN|O_DB|O_CONST|BiasL3, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), MulBiasScalar? KerArg("MulBias", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED|O_CONST|MulBiasL3, 1, 1, MulBias_DataSize, 0, 0, 0, "MulBias"): KerArg("MulBias", KerArgSpace(1, T1), O_IN|O_DB|O_CONST|MulBiasL3, 1, LineO, MulBias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "MulBias"), - (ReLUOper==KOP_RELUN_VECTOR)? - KerArg("ReLUN", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, 1, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "ReLUN"):AT_NO_KER_ARG, KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB|OutL3, ColO, LineO, Out_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out") ) ); @@ -5419,7 +5248,6 @@ int CNN_MatMulScale( AddKernelArgDim(Name, "In2", 4, LineM2, Height, Width, In2_DataSize); AddKernelArgDim(Name, "Bias", 2, LineO, Bias_DataSize); if (MulBiasScalar) AddKernelArgDim(Name, "MulBias", 2, 1, MulBias_DataSize); else AddKernelArgDim(Name, "MulBias", 2, LineO, MulBias_DataSize); - if (ReLUOper==KOP_RELUN_VECTOR) AddKernelArgDim(Name, "ReLUN", 2, LineO, 1); AddKernelArgDim(Name, "Out", 3, LineO, ColO, Out_DataSize); AT_PrepareForTest(Name, @@ -5480,7 +5308,7 @@ int CNN_MatMulScale( ReLU_UpperBound In case ReLUOper!=KOP_NONE Upper bound to be used for activation MatMulOper Should always be KOP_MATMUL_SCALE_SM1 or KOP_MATMUL_SCALE_SCALAR_SM1 - ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) Signature: Name(In2, In1, Bias, MulBias, Out) Name(In2, In1, Bias, MulBias, ReLUN, Out) @@ -5552,7 +5380,7 @@ int CNN_MatMulScaleSmallM1( if (!(MatMulOper == KOP_MATMUL_SCALE_SM1 || MatMulOper==KOP_MATMUL_SCALE_SCALAR_SM1)) GenTilingError("CNN_MatMulScaleSmallM1 Kernel: %s, MatMulOper should be KOP_MATMUL_SCALE_SM1 or KOP_MATMUL_SCALE_SCALAR_SM1", Name); - if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper==KOP_RELUN_VECTOR || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) + if (!(ReLUOper == KOP_NONE || ReLUOper == KOP_RELU || ReLUOper == KOP_RELUN || ReLUOper == KOP_HSWISH || ReLUOper == KOP_HSIGMOID || ReLUOper == KOP_LEAKYRELU)) GenTilingError("CNN_MatMulScaleSmallM1 Kernel: %s, ReLUOper should be KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU or KOP_RELUN", Name); KernelOper_T KernelOper = CNN_CompositeKernel(MatMulOper, ReLUOper, KOP_NONE); @@ -5581,9 +5409,9 @@ int CNN_MatMulScaleSmallM1( } else if (Out_DataSize==2) { OutLB = -32768; OutUB = 32767; } else GenTilingError("CNN_MatMulScaleSmallM1 Kernel: %s, Unsupported Data Type Size for Output", Name); - if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN 
|| ReLUOper==KOP_RELUN_VECTOR) { + if (ReLUOper==KOP_RELU || ReLUOper==KOP_RELUN) { if (ReLU_LowerBound==0 && ReLU_UpperBound==0) { - if (SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) + if (CNN_SetUpperLowerBounds(ReLUOper, Out_DataSize, 1, &OutLB, &OutUB, ReluN, Out_Q)) GenTilingError("CNN_MatMulScaleSmallM1 %s, cannot represent saturation value with given out fixed point format %d", Name, Out_Q); } else { if (ReLU_LowerBound) OutLB = ReLU_LowerBound; else OutLB = 0; @@ -5604,16 +5432,15 @@ int CNN_MatMulScaleSmallM1( Kernel_T *Kernel = UserKernel(Name, KernelIterSpace(1, IterTiledSpace(T0)), TILE_VER, - CArgs(6, + CArgs(5, TCArg(CNN_ArgDataType(In2_DataSize,1,1), "In2"), TCArg(CNN_ArgDataType(In1_DataSize,1,1), "In1"), TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"), TCArg(CNN_ArgDataType(MulBias_DataSize,1,1), "MulBias"), - (ReLUOper==KOP_RELUN_VECTOR)?TCArg(CNN_ArgDataType(1,1,1), "ReLUN"):AT_NO_C_ARG, TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(2, - Call(MatTransKerName, LOC_INNER_LOOP, + Call(MatTransKerName, LOC_LOOP, Bindings(7, K_Arg("In2", KER_ARG_TILE), /* Input tile */ K_Arg("TransIn2", KER_ARG_TILE), /* Transposed input tile */ @@ -5624,7 +5451,7 @@ int CNN_MatMulScaleSmallM1( NeedScy?Imm(Scy):AT_IGNORE_ARG_BINDING /* Stride y if != 1 */ ) ), - Call(MatMulKerName, LOC_INNER_LOOP, + Call(MatMulKerName, LOC_LOOP, Bindings(21, K_Arg("In1", KER_ARG_TILE), Imm(ColM1), Imm(LineM1), K_Arg("TransIn2", KER_ARG_TILE), K_Arg("TransIn2", KER_ARG_TILE_W), @@ -5634,9 +5461,7 @@ int CNN_MatMulScaleSmallM1( Imm(OutLB), /* Output lower bound */ (ReLUOper==KOP_HSWISH||ReLUOper==KOP_HSIGMOID)? Imm(Out_Q): /* Output fixed point format */ - ((ReLUOper==KOP_RELUN_VECTOR)? - K_Arg("ReLUN", KER_ARG_TILE): /* ReLUN input tile */ - Imm(OutUB)), /* Activation upper bound, clip or relu */ + Imm(OutUB), /* Activation upper bound, clip or relu */ Imm(In1_Q+In2_Q-Out_Q), /* Out fixed point adjust format */ Imm(In1_Q+In2_Q-Bias_Q), /* Bias fixed point adjust format */ Imm(MulBias_Q), /* MulBias fixed point format */ @@ -5648,7 +5473,7 @@ int CNN_MatMulScaleSmallM1( ) ) ), - KerArgs(7, + KerArgs(6, KerArg("In1", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED|O_CONST|In1L3, ColM1, LineM1, In1_DataSize, 0, 0, 0, "In1"), KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB|In2L3, ColM2, LineM2, In2_DataSize, 0, 0, TileCons, "In2"), KerArg("TransIn2", KerArgSpace(1, T0), O_BUFF|O_ONETILE, ColO, LineM2, In2_DataSize, 0, 0, 0, ""), @@ -5656,8 +5481,6 @@ int CNN_MatMulScaleSmallM1( MulBiasScalar? KerArg("MulBias", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED|O_CONST|MulBiasL3, 1, 1, MulBias_DataSize, 0, 0, 0, "MulBias"): KerArg("MulBias", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED|O_CONST|MulBiasL3, 1, LineM1, MulBias_DataSize, 0, 0, 0, "MulBias"), - (ReLUOper==KOP_RELUN_VECTOR)? 
- KerArg("ReLUN", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, 1, LineO, 1, 0, 0, 0, "ReLUN"):AT_NO_KER_ARG, KerArg("Out", KerArgSpace(1, T0), O_OUT|O_DB|OutL3, ColO, LineM1, Out_DataSize, 0, 0, 0, "Out") ) ); @@ -5669,7 +5492,6 @@ int CNN_MatMulScaleSmallM1( AddKernelArgDim(Name, "In2", 4, LineM2, Height, Width, In2_DataSize); AddKernelArgDim(Name, "Bias", 2, LineO, Bias_DataSize); if (MulBiasScalar) AddKernelArgDim(Name, "MulBias", 2, 1, MulBias_DataSize); else AddKernelArgDim(Name, "MulBias", 2, LineO, MulBias_DataSize); - if (ReLUOper==KOP_RELUN_VECTOR) AddKernelArgDim(Name, "ReLUN", 2, LineO, 1); AddKernelArgDim(Name, "Out", 3, LineO, ColO, Out_DataSize); AT_PrepareForTest(Name, @@ -5774,7 +5596,7 @@ int CNN_MatTranspose( TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out") ), Calls(1, - Call(MatTransKerName, LOC_INNER_LOOP, + Call(MatTransKerName, LOC_LOOP, Bindings(7, K_Arg("In", KER_ARG_TILE), /* Input tile */ K_Arg("Out", KER_ARG_TILE), /* Output tile */ @@ -5895,7 +5717,7 @@ int CNN_3DTensorPermute( TILE_VER, CArgs(2, TCArg(CNN_ArgDataType(In_DataSize,1,1), "In"), TCArg(CNN_ArgDataType(Out_DataSize,1,1), "Out")), Calls(1, - Call(MatPermKerName, LOC_INNER_LOOP, + Call(MatPermKerName, LOC_LOOP, Bindings(7, K_Arg("In", KER_ARG_TILE), /* Input tile */ K_Arg("Out", KER_ARG_TILE), /* Output tile */ diff --git a/tools/autotiler_v3/generators/CNN/CNN_Generators.h b/tools/autotiler_v3/generators/CNN/CNN_Generators.h index d2978da54..eff92b6e0 100644 --- a/tools/autotiler_v3/generators/CNN/CNN_Generators.h +++ b/tools/autotiler_v3/generators/CNN/CNN_Generators.h @@ -73,7 +73,7 @@ extern void LoadCNNLibrary(); \param Spy: Pooling filter stride y dimension \param PoolPad: 0: No padding, 1: Zero padding - \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU \param Signature: Name(In, Filter, Bias, Out) \param Name(In, Filter, Bias, ReLUN, Out) @@ -176,7 +176,7 @@ extern int CNN_ConvolutionPoolReLU( \param Spy: Pooling filter stride y dimension \param PoolPad: 0: No padding, 1: Zero padding - \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU \param Signature: Name(In, Filter, Bias, MulBias, Out) \param Name(In, Filter, Bias, MulBias, ReLUN, Out) @@ -281,7 +281,7 @@ extern int CNN_ConvolutionMulBiasPoolReLU( \param Spy: Pooling filter stride y dimension \param PoolPad: 0: No padding, 1: Zero padding - \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU \param Signature: Name(In, Filter, Bias, Out) \param Name(In, Filter, Bias, ReLUN, Out) @@ -389,7 +389,7 @@ extern int CNN_GroupedConvolutionPoolReLU( \param Spy: Pooling filter stride y dimension \param PoolPad: 0: No padding, 1: Zero padding - \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU \param Signature: Name(In, Filter, Bias, MulBias, Out) \param Name(In, Filter, Bias, MulBias, ReLUN, Out) @@ -485,9 +485,9 @@ extern int 
CNN_GroupedConvolutionMulBiasPoolReLU( \param PoolPad: 0: No padding, 1: Zero padding \param ReLUOper: Optional activation function: if (PoolOper!=KOP_NONE) KOP_RELU or KOP_NONE - \param else Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + \param else Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU - \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU \param Signature: Name(In, Out) \param Name(In, ReLUN, Out) @@ -608,7 +608,7 @@ extern int CNN_GlobalPool( \param OutDim: Number of outputs \param LinearOper: Should always be KOP_LINEAR - \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + \param ReLUOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU \param Signature: Name(In, Filter, Bias, Out) \param Name(In, Filter, Bias, ReLUN, Out) @@ -717,28 +717,54 @@ extern int CNN_SoftMax( */ extern int CNN_MatAdd( + char *Name, + + CNN_GenControl_T *Ctrl, + + int In1_DataSize, + int In2_DataSize, + int Out_DataSize, + + int In1_Q, + int In2_Q, + int Out_Q, + + int In1_InL3, + int In2_InL3, + int Out_InL3, + + int InFeat, + int OutFeat, + int Width, + int Height, + + KernelOper_T AddMatOper); + + +extern int CNN_MatAddRelu( char *Name, CNN_GenControl_T *Ctrl, - int In1_DataSize, - int In2_DataSize, - int Out_DataSize, + int In1_DataSize, + int In2_DataSize, + int Out_DataSize, int In1_Q, int In2_Q, int Out_Q, - int In1_InL3, - int In2_InL3, - int Out_InL3, + int In1_InL3, + int In2_InL3, + int Out_InL3, - int InFeat, - int OutFeat, - int Width, - int Height, + int InFeat, + int OutFeat, + int Width, + int Height, - KernelOper_T AddMatOper + KernelOper_T AddMatOper, + KernelOper_T ReLUOper ); /** \brief CNN_MatAddDynAdjust @@ -832,7 +858,7 @@ extern int CNN_MatAddDynAdjust( \param Height: Height of a given feature \param ScaleOper Should always be KOP_MATSCALE_VECTOR, KOP_MATSCALE_SCALAR or KOP_MATSCALE_VECTOR_SCALAR - \param ReLUOper Optional activation, should be KOP_NONE, KOP_RELU, KOP_RELUN or KOP_RELUN_VECTOR + \param ReLUOper Optional activation, should be KOP_NONE, KOP_RELU, KOP_RELUN \param Signature: Name(In, Scalar, Out) \param Name(In, Scalar, Out, ReLUN) @@ -915,7 +941,7 @@ extern int CNN_MatScale( \param ReLU_UpperBound In case ReLUOper!=KOP_NONE Upper bound to be used for activation \param MatMulOper: Should always be KOP_MATMUL - \param ReLUOper: Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + \param ReLUOper: Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) \param Signature: Name(In2, In1, Bias, Out) \param Name(In2, In1, Bias, ReLUN, Out) @@ -1001,7 +1027,7 @@ extern int CNN_MatMul( \param ReLU_UpperBound In case ReLUOper!=KOP_NONE Upper bound to be used for activation \param MatMulOper: Should always be KOP_MATMUL - \param ReLUOper: Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + \param ReLUOper: Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) \param Signature: Name(In2, In1, Bias, Out) \param Name(In2, In1, Bias, ReLUN, Out) @@ -1091,7 +1117,7 @@ extern int CNN_MatMulSmallM1( \param ReLU_UpperBound 
In case ReLUOper!=KOP_NONE Upper bound to be used for activation \param MatMulOper Should always be KOP_MATMUL_SCALE_SCALAR or KOP_MATMUL_SCALE_SCALAR - \param ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + \param ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) \param Signature: Name(In2, In1, Bias, Out) \param Name(In2, In1, Bias, ReLUN, Out) @@ -1184,7 +1210,7 @@ extern int CNN_MatMulScale( \param ReLU_UpperBound In case ReLUOper!=KOP_NONE Upper bound to be used for activation \param MatMulOper Should always be KOP_MATMUL_SCALE_SCALAR or KOP_MATMUL_SCALE_SCALAR - \param ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_RELUN_VECTOR, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + \param ReLUOper Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) \param Signature: Name(In2, In1, Bias, Out) \param Name(In2, In1, Bias, ReLUN, Out) @@ -1321,5 +1347,39 @@ int CNN_3DTensorPermute( KernelOper_T MatPermOper ); +int CNN_ConvNxN_HWCE( + char *Name, + CNN_GenControl_T *Ctrl, + + int In_DataSize, + int Filter_DataSize, + int Bias_DataSize, + int Out_DataSize, + + int In_Q, + int Filter_Q, + int Bias_Q, + int Out_Q, + + int In_InL3, + int Filter_InL3, + int Bias_InL3, + int Out_InL3, + + int InFeat, + int OutFeat, + int Width, + int Height, + + KernelOper_T ConvOper, + int Fcx, + int Fcy, + int Dcx, + int Dcy, + int Scx, + int Scy, + int ConvPad + ); + /** @} */ #endif diff --git a/tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.c b/tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.c new file mode 100644 index 000000000..e8d414c78 --- /dev/null +++ b/tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.c @@ -0,0 +1,3056 @@ +#include +#include +#include "AutoTilerLib.h" +#include "CNN_Generators_SQ8.h" +#include "CNN_Generator_Util.h" +#include "Gap.h" + +#define MaxS(a, b) (((int)(a)>(int)(b))?(a):(b)) +#define Max(a, b) (((a)>(b))?(a):(b)) +#define Min(a, b) (((a)<(b))?(a):(b)) +#define Abs(x) (((x)<0)?-(x):(x)) + +#define D0 KER_ITER_D0 +#define D1 KER_ITER_D1 +#define D2 KER_ITER_D2 +#define D3 KER_ITER_D3 +#define T0 KER_ITER_TILE0 +#define T1 KER_ITER_TILE1 +#define T2 KER_ITER_TILE2 + +#define AT_INF_BIASL_SM 0 +#define AT_INF_ACTSCALE 0 +#define AT_INF_ACTSCALEN 1 +#define AT_INF_A0 2 +#define AT_INF_B0 3 +#define AT_INF_C0 4 + +#define AT_INF_BIASN 5 +#define AT_INF_IN1SCALE 5 +#define AT_INF_SCALEN 5 + +#define AT_INF_IN1SCALEN 6 +#define AT_INF_OUTSCALE 7 +#define AT_INF_OUTSCALEN 8 + +#define AT_INF_DIM 9 + +void LoadCNN_SQ8_Library() + +{ + LibKernelTemplate("KerSetBias_SQ8_T", + CArgs(6, + TCArg("int * __restrict__", "Out"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H"), + TCArg("unsigned short int", "Feat"), + TCArg("void * __restrict__", "Bias"), + TCArg("unsigned char", "NormBias") + ) + ); + LibKernelTemplate("KerConv_SQ8_T", + CArgs(20, + TCArg("signed char * __restrict__", "In"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "UsedW"), + TCArg("unsigned short int", "H"), + TCArg("unsigned short int", "UsedH"), + TCArg("unsigned short int", "InFeatures"), + TCArg("unsigned short int", "OutFeatures"), + TCArg("unsigned short int", "TotalInFeatures"), + TCArg("signed char * __restrict__", "Filter"), + TCArg("signed char * __restrict__", "Bias"), + TCArg("int * __restrict__", "Out"), + TCArg("v4s", "Pad"), + TCArg("unsigned char", "NormBias"), + TCArg("unsigned char", "Orientation"), + 
TCArg("unsigned char", "N"), + TCArg("unsigned char", "S"), + TCArg("unsigned char", "D"), + TCArg("unsigned char", "Ny"), + TCArg("unsigned char", "Sy"), + TCArg("unsigned char", "Dy") + ) + ); + LibKernelTemplate("KerConvLinReduct_SQ8_T", + CArgs(8, + TCArg("int *__restrict__", "In"), + TCArg("void *__restrict__", "Out"), + TCArg("unsigned short int", "Feat"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H"), + TCArg("unsigned char *__restrict__", "Scale"), + TCArg("unsigned char *__restrict__", "ScaleN"), + TCArg("signed char *__restrict__", "Infos") + ) + ); + LibKernelTemplate("KerActivation_SQ8_T", + CArgs(6, + TCArg("signed char *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out"), + TCArg("unsigned short int", "Feat"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H"), + TCArg("signed char *__restrict__", "Infos") + ) + ); + LibKernelTemplate("KerPool_SQ8_T", + CArgs(18, + TCArg("signed char * __restrict__", "In"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "UsedW"), + TCArg("unsigned short int", "H"), + TCArg("unsigned short int", "UsedH"), + TCArg("unsigned short int", "Feat"), + TCArg("signed char * __restrict__", "Out"), + TCArg("v4s", "Pad"), + TCArg("unsigned char", "FS"), + TCArg("unsigned char", "S"), + TCArg("unsigned char", "D"), + TCArg("unsigned char", "FSy"), + TCArg("unsigned char", "Sy"), + TCArg("unsigned char", "Dy"), + TCArg("unsigned char", "PoolMax"), + TCArg("unsigned char", "Orientation"), + TCArg("unsigned char", "DoScale"), + TCArg("signed char * __restrict__", "Infos") + ) + ); + LibKernelTemplate("KerGlobalPool_SQ8_T", + CArgs(8, + TCArg("void * __restrict__", "In"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H"), + TCArg("unsigned short int", "Feat"), + TCArg("unsigned short int", "TileIndex"), + TCArg("int * __restrict__", "Out"), + TCArg("unsigned char", "DoScale"), + TCArg("void * __restrict__", "Infos") + ) + ); + LibKernelTemplate("KerLinear_SQ8_T", + CArgs(10, + TCArg("signed char * __restrict__", "In"), + TCArg("signed char * __restrict__", "Weights"), + TCArg("void * __restrict__", "Bias"), + TCArg("void * __restrict__", "Out"), + TCArg("unsigned short int", "InDim"), + TCArg("unsigned short int", "TotalInDim"), + TCArg("unsigned short int", "OutDim"), + TCArg("unsigned char *__restrict__", "Scale"), + TCArg("unsigned char *__restrict__", "ScaleN"), + TCArg("signed char *__restrict__", "Infos") + ) + ); + LibKernelTemplate("KerMat3_SQ8_T", + CArgs(8, + TCArg("signed char *__restrict__", "In1"), + TCArg("signed char *__restrict__", "In2"), + TCArg("signed char *__restrict__", "Out"), + TCArg("unsigned short int", "Feat"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H"), + TCArg("unsigned char", "DoScale"), + TCArg("signed char *__restrict__", "Infos") + ) + ); + LibKernelTemplate("KerMatMul_SQ8_T", + CArgs(19, + TCArg("signed char * __restrict__", "In1"), + TCArg("unsigned short int", "W_In1"), + TCArg("unsigned short int", "H_In1"), + TCArg("signed char * __restrict__", "In2"), + TCArg("unsigned short int", "W_In2"), + TCArg("void * __restrict__", "Bias"), + TCArg("unsigned char * __restrict__", "Scale"), + TCArg("unsigned char * __restrict__", "ScaleN"), + TCArg("signed char * __restrict__", "Out"), + TCArg("unsigned short int", "W_Out"), + TCArg("unsigned short int", "OutFirstCol"), + TCArg("signed char * __restrict__", "BufferColIn2"), + TCArg("unsigned char", "NormBias"), + TCArg("unsigned char", "ColFirst"), + 
TCArg("unsigned char", "Sx"), + TCArg("unsigned char", "Sy"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H"), + TCArg("signed char *__restrict__", "Infos") + ) + ); + LibKernelTemplate("KerMatTranspose_fps_T", + CArgs(7, + TCArg("signed char *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out"), + TCArg("unsigned short int", "Feat"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H"), + TCArg("unsigned char", "Sx"), + TCArg("unsigned char", "Sy") + ) + ); + LibKernelTemplate("KerSoftMax_SQ8_T", + CArgs(5, + TCArg("signed char *__restrict__", "In"), + TCArg("unsigned short int", "N"), + TCArg("unsigned short int", "Norm"), + TCArg("short int *__restrict__", "Out"), + TCArg("signed char *__restrict__", "Infos") + ) + ); + + /****************************************************************************************************************/ + /* Kernels for features and coefficients on 8 bits. Kernels for multiple output features evaluated in parallel */ + /****************************************************************************************************************/ + + /* Bias setting */ + LibKernel("KerParSetBiasB8_SQ8", CALL_PARALLEL, 0, "KerSetBias_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SETBIAS), 0, 1, CNN_Type(1,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerParSetBiasB16_SQ8", CALL_PARALLEL, 0, "KerSetBias_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SETBIAS), 0, 1, CNN_Type(2,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerParSetBiasB32_SQ8", CALL_PARALLEL, 0, "KerSetBias_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SETBIAS), 0, 1, CNN_Type(4,0,0,0,4), 0,0,0,0,0,0)); + + /* Convolutions with 32b output, Bias set before */ + LibKernel("KerParConv1x1Stride1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 1,1,1,1,1,1)); + LibKernel("KerParConv1x1Stride2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 1,1,1,1,2,2)); + LibKernel("KerParConv1x1StrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerParConv3x1Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 3,1,1,1,1,1)); + LibKernel("KerParConv3x1Stride2x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 3,1,1,1,2,1)); + LibKernel("KerParConv1x3Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 1,3,1,1,1,1)); + LibKernel("KerParConv1x3Stride1x2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 1,3,1,1,1,2)); + + LibKernel("KerParConv3x3Stride1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 3,3,1,1,1,1)); + LibKernel("KerParConv3x3Stride2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 3,3,1,1,2,2)); + LibKernel("KerParConv3x3StrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerParConv5x1Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 5,1,1,1,1,1)); + LibKernel("KerParConv5x1Stride2x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 5,1,1,1,2,1)); + 
LibKernel("KerParConv1x5Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 1,5,1,1,1,1)); + LibKernel("KerParConv1x5Stride1x2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 1,5,1,1,1,2)); + + LibKernel("KerParConv5x5Stride1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 5,5,1,1,1,1)); + LibKernel("KerParConv5x5Stride2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 5,5,1,1,2,2)); + LibKernel("KerParConv5x5StrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 5,5,1,1,-1,-2)); + LibKernel("KerParConv7x7StrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), 7,7,1,1,-1,-2)); + + LibKernel("KerParConvNxNStrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), -1,-2,1,1,-1,-2)); + LibKernel("KerParConvNxMStrideSxSy_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerParConvNxMDxDyStrideSxSy_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 1, CNN_Type(1,1,1,0,4), -1,-1,-1,-1,-1,-1)); + + /* Depth Wise Convolutions, 8b bias, 32b output */ + LibKernel("KerParConvDW1x1Stride1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 1,1,1,1,1,1)); + LibKernel("KerParConvDW1x1Stride2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 1,1,1,1,2,2)); + LibKernel("KerParConvDW1x1StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerParConvDW3x1Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 3,1,1,1,1,1)); + LibKernel("KerParConvDW3x1Stride2x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 3,1,1,1,2,1)); + LibKernel("KerParConvDW1x3Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 1,3,1,1,1,1)); + LibKernel("KerParConvDW1x3Stride1x2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 1,3,1,1,1,2)); + + LibKernel("KerParConvDW3x3Stride1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 3,3,1,1,1,1)); + LibKernel("KerParConvDW3x3Stride2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 3,3,1,1,2,2)); + LibKernel("KerParConvDW3x3StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerParConvDW5x1Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 5,1,1,1,1,1)); + LibKernel("KerParConvDW5x1Stride2x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 5,1,1,1,2,1)); + LibKernel("KerParConvDW1x5Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 
1, CNN_Type(1,1,1,0,4), 1,5,1,1,1,1)); + LibKernel("KerParConvDW1x5Stride1x2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 1,5,1,1,1,2)); + + LibKernel("KerParConvDW5x5Stride1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 5,5,1,1,1,1)); + LibKernel("KerParConvDW5x5Stride2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 5,5,1,1,2,2)); + LibKernel("KerParConvDW5x5StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 5,5,1,1,-1,-2)); + + LibKernel("KerParConvDW7x7StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), 7,7,1,1,-1,-2)); + + LibKernel("KerParConvDWNxNStrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), -1,-2,1,1,-1,-2)); + LibKernel("KerParConvDWNxMStrideSxSyB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerParConvDWNxMDxDyStrideSxSyB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,1,0,4), -1,-1,-1,-1,-1,-1)); + + /* Depth Wise Convolutions, 16b bias, 32b output */ + LibKernel("KerParConvDW1x1Stride1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 1,1,1,1,1,1)); + LibKernel("KerParConvDW1x1Stride2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 1,1,1,1,2,2)); + LibKernel("KerParConvDW1x1StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerParConvDW3x1Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 3,1,1,1,1,1)); + LibKernel("KerParConvDW3x1Stride2x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 3,1,1,1,2,1)); + LibKernel("KerParConvDW1x3Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 1,3,1,1,1,1)); + LibKernel("KerParConvDW1x3Stride1x2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 1,3,1,1,1,2)); + + LibKernel("KerParConvDW3x3Stride1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 3,3,1,1,1,1)); + LibKernel("KerParConvDW3x3Stride2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 3,3,1,1,2,2)); + LibKernel("KerParConvDW3x3StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerParConvDW5x1Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 5,1,1,1,1,1)); + LibKernel("KerParConvDW5x1Stride2x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 5,1,1,1,2,1)); + LibKernel("KerParConvDW1x5Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 1,5,1,1,1,1)); + 
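
The depth-wise kernels are registered three times each, differing only in the bias width passed to CNN_Type() (1, 2 or 4 bytes, i.e. the B8/B16/B32 name suffixes). A minimal sketch of that correspondence follows; the helper is hypothetical and purely illustrative, the actual selection being performed by CNN_FindMatchingKernel() from the registered CNN_Match() descriptors.

/* Hypothetical helper: map the bias byte width used in CNN_Type() to the
   suffix of the registered depth-wise kernel variants above. */
static const char *DWBiasSuffix(int Bias_DataSize)
{
        switch (Bias_DataSize) {
                case 1:  return "B8";
                case 2:  return "B16";
                case 4:  return "B32";
                default: return 0;   /* unsupported bias width */
        }
}
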
LibKernel("KerParConvDW1x5Stride1x2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 1,5,1,1,1,2)); + + LibKernel("KerParConvDW5x5Stride1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 5,5,1,1,1,1)); + LibKernel("KerParConvDW5x5Stride2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 5,5,1,1,2,2)); + LibKernel("KerParConvDW5x5StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 5,5,1,1,-1,-2)); + + LibKernel("KerParConvDW7x7StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), 7,7,1,1,-1,-2)); + + LibKernel("KerParConvDWNxNStrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), -1,-2,1,1,-1,-2)); + LibKernel("KerParConvDWNxMStrideSxSyB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerParConvDWNxMDxDyStrideSxSyB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,2,0,4), -1,-1,-1,-1,-1,-1)); + + /* Depth Wise Convolutions, 32b bias, 32b output */ + LibKernel("KerParConvDW1x1Stride1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 1,1,1,1,1,1)); + LibKernel("KerParConvDW1x1Stride2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 1,1,1,1,2,2)); + LibKernel("KerParConvDW1x1StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerParConvDW3x1Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 3,1,1,1,1,1)); + LibKernel("KerParConvDW3x1Stride2x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 3,1,1,1,2,1)); + LibKernel("KerParConvDW1x3Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 1,3,1,1,1,1)); + LibKernel("KerParConvDW1x3Stride1x2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 1,3,1,1,1,2)); + + LibKernel("KerParConvDW3x3Stride1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 3,3,1,1,1,1)); + LibKernel("KerParConvDW3x3Stride2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 3,3,1,1,2,2)); + LibKernel("KerParConvDW3x3StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerParConvDW5x1Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 5,1,1,1,1,1)); + LibKernel("KerParConvDW5x1Stride2x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 5,1,1,1,2,1)); + LibKernel("KerParConvDW1x5Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 1,5,1,1,1,1)); + 
LibKernel("KerParConvDW1x5Stride1x2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 1,5,1,1,1,2)); + + LibKernel("KerParConvDW5x5Stride1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 5,5,1,1,1,1)); + LibKernel("KerParConvDW5x5Stride2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 5,5,1,1,2,2)); + LibKernel("KerParConvDW5x5StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 5,5,1,1,-1,-2)); + + LibKernel("KerParConvDW7x7StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), 7,7,1,1,-1,-2)); + + LibKernel("KerParConvDWNxNStrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), -1,-2,1,1,-1,-2)); + LibKernel("KerParConvDWNxMStrideSxSyB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerParConvDWNxMDxDyStrideSxSyB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 1, CNN_Type(1,1,4,0,4), -1,-1,-1,-1,-1,-1)); + + /* Linear layer, 32b output with bias set before and scaling/activation done after */ + LibKernel("KerParLinearLayer_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), 0, 1, CNN_Type(1,1,0,0,4), 0,0,0,0,0,0)); + + /* Linear layer, 8b output with bias and scaling/activation (ReLU, ReLUN) done in a single shot */ + LibKernel("KerParLinearLayerFullFeatB8_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParLinearLayerFullFeatB16_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParLinearLayerFullFeatB32_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + + /* Convolution or Linear output reduction with per channel scaling and optional activation. 
Out != In and In Place (IO) */ + LibKernel("KerParReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParReductIO_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReductIO_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReductIO_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReductIO_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReductIO_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParReductIO_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + /* Activation wth tensor centric scaling */ + LibKernel("KerPar_ReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELU), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerPar_ReLUN_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELUN), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerPar_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_HSIGMOID), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerPar_HSwish_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_HSWISH), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerPar_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LEAKYRELU), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + + /* Pooling (Max or Avg) with tensor centric scaling and optional ReLU or ReLUN activation */ + LibKernel("KerParPool2x2Stride2_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,1), 2,2,1,1,2,2)); + 
LibKernel("KerParPool2x2Stride2_ReLU_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(1,0,0,0,1), 2,2,1,1,2,2)); + LibKernel("KerParPool2x2Stride2_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(1,0,0,0,1), 2,2,1,1,2,2)); + + LibKernel("KerParPoolNxNStrideS_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,1), -1,-2,1,1,-1,-2)); + LibKernel("KerParPoolNxNStrideS_ReLU_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(1,0,0,0,1), -1,-2,1,1,-1,-2)); + LibKernel("KerParPoolNxNStrideS_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(1,0,0,0,1), -1,-2,1,1,-1,-2)); + + LibKernel("KerParPoolNxMStrideSxSy_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParPoolNxMStrideSxSy_ReLU_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParPoolNxMStrideSxSy_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + + /* Global Pooling (Max or Avg) with tensor centric scaling and optional ReLU or ReLUN activation */ + LibKernel("KerParGlobalMaxPoolFullFeat_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParGlobalMaxPoolFullFeat_ReLU_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParGlobalMaxPoolFullFeat_ReLUN_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParGlobalMaxPool_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerParGlobalMaxPool_Reduct_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL_REDUCT), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParGlobalMaxPool_Reduct_ReLU_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T",CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL_REDUCT), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParGlobalMaxPool_Reduct_ReLUN_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T",CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL_REDUCT), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParGlobalAvgPoolFullFeat_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParGlobalAvgPoolFullFeat_ReLU_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_AVGPOOL), 
CNN_OperList(1, KOP_RELU), 1, + CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParGlobalAvgPoolFullFeat_ReLUN_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParGlobalAvgPool_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerParGlobalAvgPool_Reduct_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_AVGPOOL_REDUCT), CNN_OperList(1, KOP_NONE), 1, + CNN_Type(1,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerParGlobalAvgPool_Reduct_ReLU_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T",CNN_Match(CNN_OperList(1, KOP_GLOBAL_AVGPOOL_REDUCT), CNN_OperList(1, KOP_RELU), 1, + CNN_Type(1,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerParGlobalAvgPool_Reduct_ReLUN_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T",CNN_Match(CNN_OperList(1, KOP_GLOBAL_AVGPOOL_REDUCT), CNN_OperList(1, KOP_RELUN), 1, + CNN_Type(1,0,0,0,4), 0,0,0,0,0,0)); + + /* Matrix Algebra */ + + /* Matrix Addition with tensor centric scaling and optional activation */ + LibKernel("KerParMatAdd_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATADD), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParMatAdd_ReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATADD), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParMatAdd_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATADD), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParMatAdd_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATADD), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParMatAdd_HSwish_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATADD), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerParMatAdd_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATADD), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,0,0,1), 0,0,0,0,0,0)); + + /* Matrix Multiplication for 1x1 convolutions with channel scaling and optional ReLU or ReLUN activation */ + /* 8b Bias */ + LibKernel("KerParMatMulB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMuSxSylB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + + /* 16b Bias */ + 
LibKernel("KerParMatMulB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMuSxSylB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + + /* 32b Bias */ + LibKernel("KerParMatMulB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMuSxSylB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + + /* Matrix Multiplication for 1x1 convolutions with channel scaling and optional ReLU or ReLUN activation, optimized form when In1 fits entirely into shared L1 */ + /* 8b Bias */ + LibKernel("KerParMatMulB8_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + + /* 16b Bias */ + LibKernel("KerParMatMulB16_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); 
+ + /* 32b Bias */ + LibKernel("KerParMatMulB32_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + + + /* Matrix by vector multiplication with tensor centric scaling and optional activation */ + LibKernel("KerParMatVectMul_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_ReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_HSwish_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + + /* Matrix Transposition, no scaling */ + LibKernel("CNN_ParTranspose_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATTRANSP), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,1,1)); + LibKernel("CNN_ParTransposeSxSy_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T",CNN_Match(CNN_OperList(1, KOP_MATTRANSP), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,-1,-1)); + LibKernel("CNN_Transpose_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATTRANSP), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,1,1)); + LibKernel("CNN_TransposeSxSy_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATTRANSP), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,-1,-1)); + + /* Tensor Permutation, no scaling */ + LibKernel("CNN_MatPermCHW2CWH_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATPERM_CHW2CWH), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,1,1)); + LibKernel("CNN_MatPermCHW2HWC_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATPERM_CHW2HWC), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,1,1)); + LibKernel("CNN_MatPermCHW2WHC_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATPERM_CHW2WHC), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,1,1)); + LibKernel("CNN_MatPermCHW2WCH_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATPERM_CHW2WCH), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,1,1)); + LibKernel("CNN_MatPermCHW2HCW_fps", CALL_PARALLEL, 0, "KerMatTranspose_fps_T", CNN_Match(CNN_OperList(1, KOP_MATPERM_CHW2HCW), 0, 1, CNN_Type(1,0,0,0,1), 0,0,0,0,1,1)); + + /* SoftMax, pre scaling */ + LibKernel("KerParSoftMax_SQ8", CALL_PARALLEL, 0, "KerSoftMax_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SOFTMAX), 0, -1, 
CNN_Type(1,0,0,0,2), 0,0,0,0,0,0)); + + + + /****************************************************************************************************************/ + /* Kernels for features and coefficients on 8 bits. Kernels for a single feature evaluated in parallel */ + /****************************************************************************************************************/ + + /* Bias setting */ + LibKernel("KerSetBiasB8_SQ8", CALL_PARALLEL, 0, "KerSetBias_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SETBIAS), 0, 0, CNN_Type(1,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerSetBiasB16_SQ8", CALL_PARALLEL, 0, "KerSetBias_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SETBIAS), 0, 0, CNN_Type(2,0,0,0,4), 0,0,0,0,0,0)); + LibKernel("KerSetBiasB32_SQ8", CALL_PARALLEL, 0, "KerSetBias_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SETBIAS), 0, 0, CNN_Type(4,0,0,0,4), 0,0,0,0,0,0)); + + /* Convolutions with 32b output */ + LibKernel("KerConv1x1Stride1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 1,1,1,1,1,1)); + LibKernel("KerConv1x1Stride2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 1,1,1,1,2,2)); + LibKernel("KerConv1x1StrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerConv3x1Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 3,1,1,1,1,1)); + LibKernel("KerConv3x1Stride2x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 3,1,1,1,2,1)); + LibKernel("KerConv1x3Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 1,3,1,1,1,1)); + LibKernel("KerConv1x3Stride1x2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 1,3,1,1,1,2)); + + LibKernel("KerConv3x3Stride1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 3,3,1,1,1,1)); + LibKernel("KerConv3x3Stride2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 3,3,1,1,2,2)); + LibKernel("KerConv3x3StrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerConv5x1Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 5,1,1,1,1,1)); + LibKernel("KerConv5x1Stride2x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 5,1,1,1,2,1)); + LibKernel("KerConv1x5Stride1x1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 1,5,1,1,1,1)); + LibKernel("KerConv1x5Stride1x2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 1,5,1,1,1,2)); + + LibKernel("KerConv5x5Stride1_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 5,5,1,1,1,1)); + LibKernel("KerConv5x5Stride2_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 5,5,1,1,2,2)); + LibKernel("KerConv5x5StrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 5,5,1,1,-1,-2)); + + LibKernel("KerConv7x7StrideS_SQ8", CALL_PARALLEL, 
0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), 7,7,1,1,-1,-2)); + + LibKernel("KerConvNxNStrideS_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), -1,-2,1,1,-1,-2)); + LibKernel("KerConvNxMStrideSxSy_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerConvNxMDxDyStrideSxSy_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV), 0, 0, CNN_Type(1,1,1,0,4), -1,-1,-1,-1,-1,-1)); + + /* Depth Wise Convolutions, 8b bias, 32b output */ + LibKernel("KerConvDW1x1Stride1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 1,1,1,1,1,1)); + LibKernel("KerConvDW1x1Stride2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 1,1,1,1,2,2)); + LibKernel("KerConvDW1x1StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerConvDW3x1Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 3,1,1,1,1,1)); + LibKernel("KerConvDW3x1Stride2x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 3,1,1,1,2,1)); + LibKernel("KerConvDW1x3Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 1,3,1,1,1,1)); + LibKernel("KerConvDW1x3Stride1x2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 1,3,1,1,1,2)); + + LibKernel("KerConvDW3x3Stride1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 3,3,1,1,1,1)); + LibKernel("KerConvDW3x3Stride2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 3,3,1,1,2,2)); + LibKernel("KerConvDW3x3StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerConvDW5x1Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 5,1,1,1,1,1)); + LibKernel("KerConvDW5x1Stride2x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 5,1,1,1,2,1)); + LibKernel("KerConvDW1x5Stride1x1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 1,5,1,1,1,1)); + LibKernel("KerConvDW1x5Stride1x2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 1,5,1,1,1,2)); + + LibKernel("KerConvDW5x5Stride1B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 5,5,1,1,1,1)); + LibKernel("KerConvDW5x5Stride2B8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 5,5,1,1,2,2)); + LibKernel("KerConvDW5x5StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 5,5,1,1,-1,-2)); + + LibKernel("KerConvDW7x7StrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), 7,7,1,1,-1,-2)); + + 
LibKernel("KerConvDWNxNStrideSB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), -1,-2,1,1,-1,-2)); + LibKernel("KerConvDWNxMStrideSxSyB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerConvDWNxMDxDyStrideSxSyB8_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,1,0,4), -1,-1,-1,-1,-1,-1)); + + /* Depth Wise Convolutions, 16b bias, 32b output */ + LibKernel("KerConvDW1x1Stride1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 1,1,1,1,1,1)); + LibKernel("KerConvDW1x1Stride2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 1,1,1,1,2,2)); + LibKernel("KerConvDW1x1StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerConvDW3x1Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 3,1,1,1,1,1)); + LibKernel("KerConvDW3x1Stride2x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 3,1,1,1,2,1)); + LibKernel("KerConvDW1x3Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 1,3,1,1,1,1)); + LibKernel("KerConvDW1x3Stride1x2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 1,3,1,1,1,2)); + + LibKernel("KerConvDW3x3Stride1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 3,3,1,1,1,1)); + LibKernel("KerConvDW3x3Stride2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 3,3,1,1,2,2)); + LibKernel("KerConvDW3x3StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerConvDW5x1Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 5,1,1,1,1,1)); + LibKernel("KerConvDW5x1Stride2x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 5,1,1,1,2,1)); + LibKernel("KerConvDW1x5Stride1x1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 1,5,1,1,1,1)); + LibKernel("KerConvDW1x5Stride1x2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 1,5,1,1,1,2)); + + LibKernel("KerConvDW5x5Stride1B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 5,5,1,1,1,1)); + LibKernel("KerConvDW5x5Stride2B16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 5,5,1,1,2,2)); + LibKernel("KerConvDW5x5StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 5,5,1,1,-1,-2)); + + LibKernel("KerConvDW7x7StrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), 7,7,1,1,-1,-2)); + + LibKernel("KerConvDWNxNStrideSB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, 
KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), -1,-2,1,1,-1,-2)); + LibKernel("KerConvDWNxMStrideSxSyB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerConvDWNxMDxDyStrideSxSyB16_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,2,0,4), -1,-1,-1,-1,-1,-1)); + + /* Depth Wise Convolutions, 32b bias, 32b output */ + LibKernel("KerConvDW1x1Stride1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 1,1,1,1,1,1)); + LibKernel("KerConvDW1x1Stride2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 1,1,1,1,2,2)); + LibKernel("KerConvDW1x1StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 1,1,1,1,-1,-2)); + + LibKernel("KerConvDW3x1Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 3,1,1,1,1,1)); + LibKernel("KerConvDW3x1Stride2x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 3,1,1,1,2,1)); + LibKernel("KerConvDW1x3Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 1,3,1,1,1,1)); + LibKernel("KerConvDW1x3Stride1x2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 1,3,1,1,1,2)); + + LibKernel("KerConvDW3x3Stride1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 3,3,1,1,1,1)); + LibKernel("KerConvDW3x3Stride2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 3,3,1,1,2,2)); + LibKernel("KerConvDW3x3StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 3,3,1,1,-1,-2)); + + LibKernel("KerConvDW5x1Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 5,1,1,1,1,1)); + LibKernel("KerConvDW5x1Stride2x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 5,1,1,1,2,1)); + LibKernel("KerConvDW1x5Stride1x1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 1,5,1,1,1,1)); + LibKernel("KerConvDW1x5Stride1x2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 1,5,1,1,1,2)); + + LibKernel("KerConvDW5x5Stride1B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 5,5,1,1,1,1)); + LibKernel("KerConvDW5x5Stride2B32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 5,5,1,1,2,2)); + LibKernel("KerConvDW5x5StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 5,5,1,1,-1,-2)); + + LibKernel("KerConvDW7x7StrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), 7,7,1,1,-1,-2)); + + LibKernel("KerConvDWNxNStrideSB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), -1,-2,1,1,-1,-2)); + 
LibKernel("KerConvDWNxMStrideSxSyB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), -1,-1,1,1,-1,-1)); + + LibKernel("KerConvDWNxMDxDyStrideSxSyB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), -1,-1,-1,-1,-1,-1)); + + /* Convolution, Linear output reduction with per channel scaling and optional activation. Out != In and In Place (IO) */ + LibKernel("KerReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 0, + CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + /* Activations with tensor centric scaling */ + LibKernel("Ker_ReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELU), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("Ker_ReLUN_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELUN), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("Ker_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_HSIGMOID), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("Ker_HSwish_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_HSWISH), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("Ker_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", 
CNN_Match(CNN_OperList(1, KOP_LEAKYRELU), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); + + + /* Pooling (Max or Avg) with tensor centric scaling and optional ReLU or ReLUN activation */ + LibKernel("KerPool2x2Stride2_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 0, + CNN_Type(1,0,0,0,1), 2,2,1,1,2,2)); + LibKernel("KerPool2x2Stride2_ReLU_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 0, + CNN_Type(1,0,0,0,1), 2,2,1,1,2,2)); + LibKernel("KerPool2x2Stride2_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 0, + CNN_Type(1,0,0,0,1), 2,2,1,1,2,2)); + + LibKernel("KerPoolNxNStrideS_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 0, + CNN_Type(1,0,0,0,1), -1,-2,1,1,-1,-2)); + LibKernel("KerPoolNxNStrideS_ReLU_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 0, + CNN_Type(1,0,0,0,1), -1,-2,1,1,-1,-2)); + LibKernel("KerPoolNxNStrideS_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 0, + CNN_Type(1,0,0,0,1), -1,-2,1,1,-1,-2)); + + LibKernel("KerPoolNxMStrideSxSy_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 0, + CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPoolNxMStrideSxSy_ReLU_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 0, + CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPoolNxMStrideSxSy_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 0, + CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); +} + +/********************************************************************************************************************************************************************* + Generator for Convolutions with channel centric scaling, followed by an optional pooling (Max or Average), + followed by an optional Activation. 
+ + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options (TileOrientation, Parallel Features, Use HWCE), Def=(TILE_HOR, 1, 0) + + Bias_DataSize: 1: byte, 2: half word, 4: word + Scale_DataSize: 1: byte, 2: half word, 4: word + + InFeat: Number of input feature maps + OutFeat: Number of output feature maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + ConvOper: Type of convolution, Regular convolution: KOP_CONV, Depth wise convolution: KOP_CONV_DW + Fcx: Convolution filter x dimension + Fcy: Convolution filter y dimension + Dcx: Convolution filter dilation factor, x dimension + Dcy: Convolution filter dilation factor, y dimension + Scx: Convolution filter stride x dimension + Scy: Convolution filter stride y dimension + ConvPad: 0: No padding, 1: Zero padding + + PoolOper: Type of Pooling, KOP_NONE, Max Pooling: KOP_MAXPOOL, Average Pooling: KOP_AVGPOOL + Fpx: Pooling filter x dimension + Fpy: Pooling filter y dimension + Dpx: Pooling filter dilation factor, x dimension + Dpy: Pooling filter dilation factor, y dimension + Spx: Pooling filter stride x dimension + Spy: Pooling filter stride y dimension + PoolPad: 0: No padding, 1: Zero padding + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) + + CNN_ConvolutionPoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_ConvolutionPoolAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int InFeat, + int OutFeat, + int Width, + int Height, + + KernelOper_T ConvOper, + int Fcx, + int Fcy, + int Dcx, + int Dcy, + int Scx, + int Scy, + int ConvPad, + + KernelOper_T PoolOper, + int Fpx, + int Fpy, + int Dpx, + int Dpy, + int Spx, + int Spy, + int PoolPad, + + KernelOper_T ActOper + ) + +{ + if (ConvOper==KOP_NONE) { + if (PoolOper!=KOP_NONE) + return CNN_PoolAct_SQ8(Name, Ctrl, InFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper); + else if (ActOper!=KOP_NONE) + return CNN_Act_SQ8(Name, Ctrl, InFeat, Width, Height, ActOper); + else GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, All requested operations are KOP_NONE", Name); + } + + int ParFeat = 1; + Tile_Orientation_T TileOrientation = TILE_HOR; + AT_PadType PadType = PAD_BALANCED_LEFT; + if (PoolOper==KOP_NONE) { + Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1; + } + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; + if (Ctrl->PadType != -1) PadType = Ctrl->PadType; + } + int OverlapC, OverlapP; + int TileCons; + int Wo, Ho, Wc, Hc; + int PadCw=0, PadCh=0, PadPw=0, PadPh=0; + v4s PadInp = (v4s){0,0,0,0}, PadInc = (v4s){0,0,0,0}, PadIncT = (v4s){0,0,0,0}; + char *ConvKerName=0, *PoolKerName=0, *ActKerName=0, *SetBiasKerName=0, *DPReductionKerName=0; + int DWConv=(ConvOper==KOP_CONV_DW); + int NeedFcx, NeedFcy, NeedDcx, NeedDcy, NeedScx, NeedScy, NeedFpx, NeedFpy, NeedDpx, NeedDpy, NeedSpx, NeedSpy; + int Os=(DWConv?D0:D1); + int UsedWidth, UsedHeight, UsedWc, UsedHc; + int InTileCons = 4; + int StandAloneAct = (ActOper!=KOP_NONE); + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int
Log=1; + + if (!(ConvOper == KOP_NONE || ConvOper == KOP_CONV || ConvOper == KOP_CONV_DW)) + GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, ConvOper, expecting KOP_NONE, KOP_CONV or KOP_CONV_DW", Name); + if (!(PoolOper == KOP_NONE || PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL)) + GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, PoolOper, expecting KOP_NONE, KOP_MAXPOOL or KOP_AVGPOOL", Name); + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + + if (DWConv && (InFeat != OutFeat)) GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, Depth wise convolution requested with InFeat:%d != OutFeat:%d", Name, InFeat, OutFeat); + + CNN_LayerOutputDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, + &Wc, &Hc, &Wo, &Ho, &PadCw, &PadCh, &PadPw, &PadPh); + PadInc = CNN_EdgePaddingValue(PadType, PadCw, PadCh); + PadInp = CNN_EdgePaddingValue(PadType, PadPw, PadPh); + /* Pad value for tiling, need to accrue phantom values created for Pool padding */ + PadIncT = (v4s) {PadInp[0]*Scx+PadInc[0], PadInp[1]*Scx+PadInc[1], PadInp[2]*Scy+PadInc[2], PadInp[3]*Scy+PadInc[3]}; + + CNN_TileOverlap(TileOrientation, Fcx, Fcy, Dcx, Dcy, Scx, Scy, Fpx, Fpy, Dpx, Dpy, Spx, Spy, &OverlapC, &OverlapP); + UsedWc = CNN_UsedInputDimension(Wo, Fpx, Spx, Dpx, PadPw); + UsedHc = CNN_UsedInputDimension(Ho, Fpy, Spy, Dpy, PadPh); + UsedWidth = CNN_UsedInputDimension(UsedWc, Fcx, Scx, Dcx, PadCw); + UsedHeight = CNN_UsedInputDimension(UsedHc, Fcy, Scy, Dcy, PadCh); + TileCons = (TileOrientation==TILE_HOR)?CNN_Scm(Scy, Spy):CNN_Scm(Scx, Spx); + + /* Re evaluate now that we know exactly what is used */ + PadInc[1] = Max(0, PadInc[1]-(Width-UsedWidth)); PadInc[3] = Max(0, PadInc[3]-(Height-UsedHeight)); + PadInp[1] = Max(0, PadInp[1]-(Wc-UsedWc)); PadInp[3] = Max(0, PadInp[3]-(Hc-UsedHc)); + PadIncT = (v4s) {PadInp[0]*Scx+PadInc[0], PadInp[1]*Scx+PadInc[1], PadInp[2]*Scy+PadInc[2], PadInp[3]*Scy+PadInc[3]}; + UsedWc = (Wo-1)*Spx+(Dpx*(Fpx-1)+1)-PadInp[0]-PadInp[1]; UsedHc = (Ho-1)*Spy+(Dpy*(Fpy-1)+1)-PadInp[2]-PadInp[3]; + UsedWidth = (UsedWc-1)*Scx+(Dcx*(Fcx-1)+1) -PadInc[0]-PadInc[1]; UsedHeight = (UsedHc-1)*Scy+(Dcy*(Fcy-1)+1)-PadInc[2]-PadInc[3]; + Wc = UsedWc; Hc = UsedHc; + + /* Layer number of operations and memory bandwidth requirements */ + LayerOp += Wc*Hc*Fcx*Fcy*OutFeat; + if (!DWConv) LayerOp *= InFeat; + if (PoolOper) LayerOp += OutFeat*Wo*Ho*Fpx*Fpy; + if (ActOper) LayerOp += OutFeat*Wo*Ho; + LayerBandwidth += Width*Height*1*InFeat*(DWConv?1:OutFeat); + LayerBandwidth += Wo*Ho*1*OutFeat; + LayerBandwidth += Fcx*Fcy*1*InFeat*(DWConv?1:OutFeat); + LayerBandwidth += Bias_DataSize*OutFeat; + + /* Basic Kernel Matching */ + if (!DWConv) { + SetBiasKerName = CNN_FindMatchingKernel(KOP_SETBIAS, KOP_NONE, ParFeat, Bias_DataSize, 0, 0, 0, 4, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (SetBiasKerName==0) GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, Can't find a matching Set Bias basic kernel", Name); + } + + ConvKerName = CNN_FindMatchingKernel(ConvOper, KOP_NONE, ParFeat, 1, 1, DWConv?Bias_DataSize:0, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy, + &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0); + if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, 
Can't find a matching Convolution basic kernel", Name); + + if (PoolOper != KOP_NONE) { + DPReductionKerName = CNN_FindMatchingKernel(KOP_DP_REDUCT_IO, KOP_NONE, ParFeat, 4, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, Can't find a matching Reduction basic kernel", Name); + } else { + DPReductionKerName = CNN_FindMatchingKernel(KOP_DP_REDUCT, ActOper, ParFeat, 4, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (DPReductionKerName==0) + DPReductionKerName = CNN_FindMatchingKernel(KOP_DP_REDUCT, KOP_NONE, ParFeat, 4, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + else if (ActOper) StandAloneAct = 0; + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, Can't find a matching Reduction basic kernel", Name); + } + + if (PoolOper!=KOP_NONE) { + PoolKerName = CNN_FindMatchingKernel(PoolOper, ActOper, ParFeat, 1, 0, 0, 0, 1, Fpx, Fpy, Dpx, Dpy, Spx, Spy, + &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); + if (PoolKerName==0) + PoolKerName = CNN_FindMatchingKernel(PoolOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, Fpx, Fpy, Dpx, Dpy, Spx, Spy, + &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); + else if (ActOper) StandAloneAct = 0; + if (PoolKerName==0) GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, Can't find a matching Pooling %s basic kernel", Name, ActOper?"with linear rectification":""); + } + if (ActOper && StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (ActKerName==0) GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, Can't find a matching Activation basic kernel", Name); + } + + if (Log) { + printf("InFeat: %d, OutFeat: %d\n", InFeat, OutFeat); + printf("Conv => W: %d, Pad:[%d,%d] PadT:[%d,%d] => Wc: %d, Filter:[%d,%d]\n", Width, PadInc[0], PadInc[1], PadIncT[0], PadIncT[1], Wc, Fcx, Fcy); + printf(" => H: %d, Pad:[%d,%d] PadT:[%d,%d] => Hc: %d\n", Height, PadInc[2], PadInc[3], PadIncT[2], PadIncT[3], Hc); + printf("Pool => Wc: %d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d]\n", Wc, PadInp[0], PadInp[1], Wo, Fpx, Fpy); + printf(" => Hc: %d, Pad:[%d,%d] => Ho: %d\n", Hc, PadInp[2], PadInp[3], Ho); + printf("OverlapC: %d\n", OverlapC); + printf("OverlapP: %d\n", OverlapP); + printf("TileCons: %d\n", TileCons); + printf("UsedIn : [%d x %d]\n", UsedWidth, UsedHeight); + printf("UsedC : [%d x %d]\n", UsedWc, UsedHc); + if (SetBiasKerName) printf("%20s: %s\n", "SetBiasKerName", SetBiasKerName); + if (ConvKerName) printf("%20s: %s\n", "ConvKerName", ConvKerName); + if (DPReductionKerName) printf("%20s: %s\n", "DPReductionKerName", DPReductionKerName); + if (PoolKerName) printf("%20s: %s\n", "PoolKerName", PoolKerName); + if (ActKerName) printf("%20s: %s\n", "ActKerName", ActKerName); + printf("Nb Oper : %lld\n", LayerOp); + + } + if (Ctrl && (Ctrl->EnableIm2Col==1) && (ConvOper==KOP_CONV) && (PoolOper==KOP_NONE) && (Fcx==1) && (Fcy==1) && (Dcx==1) && (Dcy==1)) { + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + // if ((InFeat+OutFeat)<80) { + if ((InFeat+OutFeat)<100) { + if (Log) printf("Mapping this convolution to matrix multiplication with small first operand\n"); + int Ok = CNN_MatMulSmallM1Act_SQ8(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL_SM1, ActOper); + if (!Ok&&Log) printf("Mapping this convolution to matrix multiplication with small first operand FAILED, trying with standard mult 
implementation\n"); + if (Ok) return Ok; + } + if (Log) printf("Mapping this convolution to matrix multiplication\n"); + int Ok = CNN_MatMulAct_SQ8(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper); + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + if (Ok) return Ok; + if (Log) printf("Mapping this convolution to matrix multiplication FAILED, reverting to standard implementation\n"); + } + + /* User kernel C arguments */ + CKernel_Arg_T **KCArgs = AllocateCArgs(7); + int Ca=0; + + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1, 1,1), "In"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1, 1,1), "Filter"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1, 1,1), "Out"); + KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1, 1,1), "Scale"); + KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1, 1,1), "ScaleN"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1, 1,1), "Infos"); + + /* User kernel kernel arguments */ + Object_T **KArgs = AllocateKerArgs(8); + int Ka=0; + + KArgs[Ka++] = KerArgP("In", KerArgSpace(2,D0,T0), O_IN|O_DB, Width, Height, UsedWidth, UsedHeight, PadIncT, PadInc, 1, OverlapC, 0, TileCons, "In"); + KArgs[Ka++] = KerArg ("Bias", KerArgSpace(1,Os), O_IN|O_DB|O_CONST, 1, 1, Bias_DataSize, 0, 0, 0, "Bias"); + KArgs[Ka++] = KerArg ("Scale", KerArgSpace(1,Os), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "Scale"); + KArgs[Ka++] = KerArg ("ScaleN", KerArgSpace(1,Os), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ScaleN"); + if (DWConv) + KArgs[Ka++] = KerArg ("Filter", KerArgSpace(1,Os), O_IN|O_DB|O_CONST, 1, 1, Fcx*Fcy, 0, 0, 0, "Filter"); + else + KArgs[Ka++] = KerArg ("Filter", KerArgSpace(2,Os,D0), O_IN|O_DB|O_CONST, 1, 1, Fcx*Fcy, 0, 0, 0, "Filter"); + KArgs[Ka++] = KerArg ("Out", KerArgSpace(2,Os,T0), O_OUT|O_DB, Wo, Ho, 1, 0, 0, 0, "Out"); + if (ParFeat) + KArgs[Ka++] = KerArgP("ConvOut",KerArgSpace(2,Os,T0), O_BUFF|O_ONETILE, Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, 4, OverlapP, 0, 0, ""); + else + KArgs[Ka++] = KerArgP("ConvOut",KerArgSpace(1,T0), O_BUFF|O_ONETILE, Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, 4, OverlapP, 0, 0, ""); + + KArgs[Ka++] = KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos"); + + Kernel_T *Kernel = UserKernel(Name, + ParFeat? + (DWConv? + KernelIterSpace(2, IterParSpace(D0, InFeat, 8), IterTiledSpace(T0)): + KernelIterSpace(3, IterParSpace(D1, OutFeat, 8), IterTiledSpace(T0), IterParSpace(D0, InFeat, InTileCons))): + (DWConv? 
+ KernelIterSpace(2, IterFixedSpace(D0, InFeat), IterTiledSpace(T0)): + KernelIterSpace(3, IterFixedSpace(D1, OutFeat), IterTiledSpace(T0), IterFixedSpace(D0, InFeat))), + TileOrientation, + KCArgs, + Calls(5, + (SetBiasKerName==0)?AT_NO_CALL: + Call(SetBiasKerName, LOC_D0_PROLOG, + Bindings(6, + K_Arg("ConvOut", KER_ARG_TILE), /* SetBias output tile */ + K_Arg("ConvOut", KER_ARG_TILE_W), /* SetBias output tile width */ + K_Arg("ConvOut", KER_ARG_TILE_H), /* SetBias output tile height */ + ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os):Imm(1), /* Number of output features in this tile */ + K_Arg("Bias", KER_ARG_TILE), /* SetBias Bias tile */ + K_TileOper("Infos", "char *", '@', AT_INF_BIASN) /* Bias Norm */ + )), + Call(ConvKerName, DWConv?LOC_LOOP:LOC_D0, + Bindings(20, + K_Arg("In", KER_ARG_TILE), /* Conv input tile */ + K_Arg("In", KER_ARG_TILE_W), /* Conv input tile width */ + K_Arg("In", KER_ARG_TILE_USEDW), /* Conv input tile width, used part of it */ + K_Arg("In", KER_ARG_TILE_H), /* Conv input tile height */ + K_Arg("In", KER_ARG_TILE_USEDH), /* Conv input tile height, used part of it */ + K_ArgPar("Filter", KER_ARG_PARTILE_SIZE, D0), /* Number of input features in this tile */ + ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os):Imm(1), /* Number of output features in this tile */ + K_ArgPar("Filter", KER_ARG_LOADEDPARTILE_SIZE, D0), /* Total number of input features currently in L1 memory, argument promotion */ + K_Arg("Filter", KER_ARG_TILE), /* Conv filter */ + DWConv?K_Arg("Bias", KER_ARG_TILE):AT_IGNORE_ARG_BINDING, /* Conv Bias when depth wise conv*/ + K_Arg("ConvOut", KER_ARG_TILE), /* Conv output */ + K_Arg("In", KER_ARG_TILE_PAD), /* Conv Padding */ + DWConv?K_TileOper("Infos", "char *", '@', AT_INF_BIASN):AT_IGNORE_ARG_BINDING, /* NormBias is depth wise conv */ + ParFeat?AT_IGNORE_ARG_BINDING:Imm((TileOrientation==TILE_HOR)?1:0), /* Orientation when feature parallel */ + NeedFcx?Imm(Fcx):AT_IGNORE_ARG_BINDING, /* Conv Fx */ + NeedScx?Imm(Scx):AT_IGNORE_ARG_BINDING, /* Conv Stridex */ + NeedDcx?Imm(Dcx):AT_IGNORE_ARG_BINDING, /* Conv Dx */ + NeedFcy?Imm(Fcy):AT_IGNORE_ARG_BINDING, /* Conv Fy */ + NeedScy?Imm(Scy):AT_IGNORE_ARG_BINDING, /* Conv Stridey */ + NeedDcy?Imm(Dcy):AT_IGNORE_ARG_BINDING /* Conv Dy */ + ) + ), + Call(DPReductionKerName, DWConv?LOC_LOOP:LOC_D0_EPILOG, /* DP Reduction also take care of optional activation */ + Bindings(8, + K_Arg("ConvOut", KER_ARG_TILE), /* Double precision input tile */ + K_Arg(PoolOper?"ConvOut":"Out", KER_ARG_TILE), /* Single precision output tile, warning use IO kernel when In=Out */ + ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os):Imm(1), /* Input tile Number of features */ + K_Arg("ConvOut", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("ConvOut", KER_ARG_TILE_H), /* Input tile height */ + K_Arg("Scale", KER_ARG_TILE), /* Per channel scale tile */ + K_Arg("ScaleN", KER_ARG_TILE), /* Per channel scale normalization tile */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (PoolKerName==0)?AT_NO_CALL: + Call(PoolKerName, DWConv?LOC_LOOP:LOC_D0_EPILOG, + Bindings(18, + K_Arg("ConvOut", KER_ARG_TILE), /* Pooling input tile */ + K_Arg("ConvOut", KER_ARG_TILE_W), /* Pooling input tile width */ + K_Arg("ConvOut", KER_ARG_TILE_USEDW), /* Pooling input tile width, used part of it */ + K_Arg("ConvOut", KER_ARG_TILE_H), /* Pooling input tile height */ + K_Arg("ConvOut", KER_ARG_TILE_USEDH), /* Pooling input tile height, used part of it */ + ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os):Imm(1), /* Number of 
output features in this tile */ + K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ + K_Arg("ConvOut", KER_ARG_TILE_PAD), /* Pooling Pad */ + NeedFpx?Imm(Fpx):AT_IGNORE_ARG_BINDING, /* Pooling Fx */ + NeedSpx?Imm(Spx):AT_IGNORE_ARG_BINDING, /* Pooling Stridex */ + NeedDpx?Imm(Dpx):AT_IGNORE_ARG_BINDING, /* Pooling Dx */ + NeedFpy?Imm(Fpy):AT_IGNORE_ARG_BINDING, /* Pooling Fy */ + NeedSpy?Imm(Spy):AT_IGNORE_ARG_BINDING, /* Pooling Stridey */ + NeedDpy?Imm(Dpy):AT_IGNORE_ARG_BINDING, /* Pooling Dy */ + Imm((PoolOper==KOP_MAXPOOL)?1:0), /* PoolMax or PoolAverage */ + ParFeat?AT_IGNORE_ARG_BINDING:Imm((TileOrientation==TILE_HOR)?1:0), /* Pooling Orientation when feature parallel */ + Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, DWConv?LOC_LOOP:LOC_D0_EPILOG, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + ParFeat?K_ArgPar("Out", KER_ARG_PARTILE_SIZE, Os):Imm(1), /* Number of features in this tile */ + K_Arg("Out", KER_ARG_TILE_W), /* Tile width */ + K_Arg("Out", KER_ARG_TILE_H), /* Tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KArgs + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 4, InFeat, Height, Width, 1); + if (DWConv) AddKernelArgDim(Name, "Filter", 4, InFeat, Fcx, Fcy, 1); + else AddKernelArgDim(Name, "Filter", 5, OutFeat, InFeat, Fcx, Fcy, 1); + AddKernelArgDim(Name, "Bias", 2, OutFeat, Bias_DataSize); + AddKernelArgDim(Name, "Out", 4, OutFeat, Ho, Wo, 1); + AddKernelArgDim(Name, "Scale", 2, OutFeat, 1); + AddKernelArgDim(Name, "ScaleN", 2, OutFeat, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + if (Ctrl && (Ctrl->In_L3)) SetKerArgInL3(Name, "In"); + if (Ctrl && (Ctrl->Filter_L3)) SetKerArgInL3(Name, "Filter"); + if (Ctrl && (Ctrl->Bias_L3)) SetKerArgInL3(Name, "Bias"); + if (Ctrl && (Ctrl->Out_L3)) SetKerArgInL3(Name, "Out"); + if (Ctrl && (Ctrl->Scale_L3)) SetKerArgInL3(Name, "Scale"); + if (Ctrl && (Ctrl->ScaleN_L3)) SetKerArgInL3(Name, "ScaleN"); + + AT_PrepareForTest_SQ8(Name, InFeat, OutFeat, Width, Height, Bias_DataSize, + ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, PadInc, + PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PadInp, + ActOper); + } + return (Kernel!=0); +} + +/********************************************************************************************************************************************************************* + Generator for Grouped Convolutions with channel centric scaling, followed by an optional pooling (Max or Average), + followed by an optional activation. 
+ + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features, Use double precision convolution, Use HWCE), Def=(TILE_HOR, 1, 0, 0) + + GroupIn: Size of the group for input features + GroupOut: Size of the group for output features + + Bias_DataSize: 1: byte, 2: half word, 4: word + Scale_DataSize: 1: byte, 2: half word, 4: word + + InFeat: Number of input feature's maps + OutFeat: Number of output feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + ConvOper: Type of convolution, Regular convolution: KOP_CONV + Fcx: Convolution filter x dimension + Fcy: Convolution filter y dimension + Dcx: Convolution filter dilation factor, x dimension + Dcy: Convolution filter dilation factor, y dimension + Scx: Convolution filter stride x dimension + Scy: Convolution filter stride y dimension + ConvPad: 0: No padding, 1: Zero padding + + PoolOper: Type of Pooling, KOP_NONE, Max Pooling: KOP_MAXPOOL, Average Pooling: KOP_AVGPOOL + Fpx: Pooling filter x dimension + Fpy: Pooling filter y dimension + Dpx: Pooling filter dilation factor, x dimension + Dpy: Pooling filter dilation factor, y dimension + Spx: Pooling filter stride x dimension + Spy: Pooling filter stride y dimension + PoolPad: 0: No padding, 1: Zero padding + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) + + CNN_GroupedConvolutionPoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_GroupedConvolutionPoolAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int GroupIn, + int GroupOut, + + int Bias_DataSize, + int Scale_DataSize, + + int InFeat, + int OutFeat, + int Width, + int Height, + + KernelOper_T ConvOper, + int Fcx, + int Fcy, + int Dcx, + int Dcy, + int Scx, + int Scy, + int ConvPad, + + KernelOper_T PoolOper, + int Fpx, + int Fpy, + int Dpx, + int Dpy, + int Spx, + int Spy, + int PoolPad, + + KernelOper_T ActOper + ) + +{ + char *BodyName = AppendNames(Name, "Body"); + KernelGroup_T *UKGroup; + int g, Wc, Hc, Wo, Ho; + int NGroups = InFeat/GroupIn; + + if (!(ConvOper == KOP_CONV)) + GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, ConvOper, expecting KOP_NONE, KOP_CONV or KOP_CONV_DW", Name); + if (!(PoolOper == KOP_NONE || PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL)) + GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, PoolOper, expecting KOP_NONE, KOP_MAXPOOL or KOP_AVGPOOL", Name); + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + + CNN_LayerOutputDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, &Wc, &Hc, &Wo, &Ho, 0, 0, 0, 0); + + if ((InFeat%GroupIn)||(OutFeat%GroupOut)||((InFeat/GroupIn)!=(OutFeat/GroupOut))) + GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: %s cannot divide In(%d)/Out(%d) feature spaces with these group parameters: GroupIn %d, GroupOut: %d", + Name, InFeat, OutFeat, GroupIn, GroupOut); + + OpenKernelGroup(Name); + 
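/* Note on the group partitioning checked above (illustrative arithmetic only, hypothetical sizes):
   the generator requires InFeat%GroupIn==0, OutFeat%GroupOut==0 and InFeat/GroupIn == OutFeat/GroupOut,
   and instantiates NGroups = InFeat/GroupIn copies of the body kernel. For example, with InFeat=64,
   OutFeat=128, GroupIn=16 and GroupOut=32 we get NGroups = 64/16 = 128/32 = 4, i.e. four convolutions
   mapping 16 input features to 32 output features each. The body kernel generated just below is a
   plain CNN_ConvolutionPoolAct_SQ8 operating on a single (GroupIn, GroupOut) slice. */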
CNN_ConvolutionPoolAct_SQ8(BodyName, Ctrl, + Bias_DataSize, Scale_DataSize, + GroupIn, GroupOut, Width, Height, + ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, + PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, + ActOper + ); + CloseKernelGroup(); + + CKernel_Arg_T **KCArgs = AllocateCArgs(7); + int Ca=0; + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "In"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Filter"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Out"); + KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1, 1,1), "Scale"); + KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1, 1,1), "ScaleN"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1, 1,1), "Infos"); + + Object_T **KArgs = AllocateKerArgs(7); + int Ka=0; + KArgs[Ka++] = KerGroupArg("In", O_IN, NGroups*GroupIn*Width*Height, 1, "In"); + KArgs[Ka++] = KerGroupArg("Filter", O_IN, NGroups*GroupIn*GroupOut*Fcx*Fcy, 1, "Filter"); + KArgs[Ka++] = KerGroupArg("Bias", O_IN, NGroups*GroupOut, Bias_DataSize, "Bias"); + KArgs[Ka++] = KerGroupArg("Out", O_OUT, NGroups*GroupOut*Wo*Ho, 1, "Out"); + KArgs[Ka++] = KerGroupArg("Scale", O_IN, NGroups*GroupOut, 1, "Scale"); + KArgs[Ka++] = KerGroupArg("ScaleN", O_IN, NGroups*GroupOut, 1, "ScaleN"); + KArgs[Ka++] = KerGroupArg("Infos", O_IN, AT_INF_DIM, 1, "Infos"); + + UKGroup = UserKernelGroupK(Name, + NGroups, + KCArgs, + 0, + Calls(1, + UserKernelCall(BodyName, LOC_GROUP, + Bindings(7, + KG_ArgOper("In", '*', GroupIn*Width*Height), + KG_ArgOper("Filter", '*', GroupIn*GroupOut*Fcx*Fcy), + KG_ArgOper("Bias", '*', GroupOut*Bias_DataSize), + KG_ArgOper("Out", '*', GroupOut*Wo*Ho), + KG_ArgOper("Scale", '*', GroupOut), + KG_ArgOper("ScaleN", '*', GroupOut), + KG_ArgOper("Infos", '+', 0) + + ) + ) + ), + KArgs + ); + return (UKGroup!=0); +} + +/********************************************************************************************************************************************************************* + Generator for Pooling (Max or Average) with tensor centric scaling followed by an optional activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + PoolOper: KOP_MAXPOOL or KOP_AVGPOOL + Fpx: Size of the pooling filter, x dimension + Fpy: Size of the pooling filter, y dimension + Dpx: Dilation factor, x dimension + Dpy: Dilation factor, y dimension + Spx: Pooling stride, x dimension + Spy: Pooling stride, y dimension + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Out, Infos) + + CNN_PoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_PoolAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T PoolOper, + int Fpx, + int Fpy, + int Dpx, + int Dpy, + int Spx, + int Spy, + int PoolPad, + + KernelOper_T ActOper + ) + +{ + if (PoolOper==KOP_NONE && ActOper!=KOP_NONE) return CNN_Act_SQ8(Name, Ctrl, Feat, Width, Height, ActOper); + + Tile_Orientation_T TileOrientation = TILE_HOR; + int ParFeat = 1; + AT_PadType PadType = PAD_BALANCED_LEFT; + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = 
(Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; + if (Ctrl->PadType != -1) PadType = Ctrl->PadType; + } + int TileCons, NeedFpx=0, NeedFpy=0, NeedDpx=0, NeedDpy=0, NeedSpx=0, NeedSpy=0, OverlapP; + int Wo, Ho; + int UsedWidth, UsedHeight; + int PadPw=0, PadPh=0; + v4s PadInp = (v4s){0,0,0,0}; + char *PoolKerName=0, *ActKerName=0; + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int StandAloneAct = (ActOper!=KOP_NONE); + int Log=1; + + if (!(PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL)) + GenTilingError("CNN_Pool_SQ8 Kernel: %s, PoolOper, expecting KOP_MAXPOOL or KOP_AVGPOOL", Name); + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + GenTilingError("CNN_Pool_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + + /* Set Kernel characteristics */ + CNN_LayerOutputDim(Width, Height, KOP_NONE, 1, 1, 1, 1, 1, 1, 1, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, 0, 0, &Wo, &Ho, 0, 0, &PadPw, &PadPh); + PadInp = CNN_EdgePaddingValue(PadType, PadPw, PadPh); + CNN_TileOverlap(TileOrientation, 1, 1, 1, 1, 1, 1, Fpx, Fpy, Dpx, Dpy, Spx, Spy, 0, &OverlapP); + UsedWidth = CNN_UsedInputDimension(Wo, Fpx, Spx, Dpx, PadPw); + UsedHeight = CNN_UsedInputDimension(Ho, Fpy, Spy, Dpy, PadPh); + TileCons = (TileOrientation==TILE_HOR)?(Spy):(Spx); + /* Re evaluate truly used width/height and update padding accordingly */ + PadInp[1] = Max(0, PadInp[1]-(Width-UsedWidth)); PadInp[3] = Max(0, PadInp[3]-(Height-UsedHeight)); + + + + PoolKerName = CNN_FindMatchingKernel(PoolOper, ActOper, ParFeat, 1, 0, 0, 0, 1, Fpx, Fpy, Dpx, Dpy, Spx, Spy, &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); + if (PoolKerName==0) PoolKerName = CNN_FindMatchingKernel(PoolOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, Fpx, Fpy, Dpx, Dpy, Spx, Spy, &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); + else if (ActOper) StandAloneAct = 0; + if (PoolKerName==0) GenTilingError("CNN_Pool_SQ8 Kernel: %s, Can't find a matching Pooling basic kernel", Name); + + if (ActOper && StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (ActKerName==0) GenTilingError("CNN_Pool_SQ8 Kernel: %s, Can't find a matching Activation basic kernel", Name); + } + + if (PoolOper) LayerOp += Feat*Wo*Ho*Fpx*Fpy; + if (ActOper) LayerOp += Feat*Wo*Ho; + + LayerBandwidth += Width*Height*1*Feat; + LayerBandwidth += Wo*Ho*1*Feat; + + if (Log) { + printf("Pool => W: %d, Pad:[%d,%d] => Wo: %d\n", Width, PadInp[0], PadInp[1], Wo); + printf(" => H: %d, Pad:[%d,%d] => Ho: %d\n", Height, PadInp[2], PadInp[3], Ho); + printf("OverlapP: %d\n", OverlapP); + printf("TileCons: %d\n", TileCons); + printf("UsedIn : [%d x %d]\n", UsedWidth, UsedHeight); + if (PoolKerName) printf("%20s: %s\n", "PoolKerName", PoolKerName); + if (ActKerName) printf("%20s: %s\n", "ActKerName", ActKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + + CKernel_Arg_T **KCArgs = AllocateCArgs(3); + int Ca=0; + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "In"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Out"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Infos"); + + Object_T **KArgs = AllocateKerArgs(3); + int Ka=0; + KArgs[Ka++] = KerArgP("In", KerArgSpace(2,D0,T0), OBJ_IN_DB, Width, Height, UsedWidth, UsedHeight, 
PadInp,PadInp, 1, OverlapP, 0, TileCons, "In"); + KArgs[Ka++] = KerArg ("Out", KerArgSpace(2,D0,T0), OBJ_OUT_DB, Wo, Ho, 1, 0, 0, 0, "Out"); + KArgs[Ka++] = KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos"); + + Kernel_T *Kernel = UserKernel(Name, + ParFeat? + KernelIterSpace(2, IterParSpace(D0, Feat, 8), IterTiledSpace(T0)): + KernelIterSpace(2, IterFixedSpace(D0, Feat), IterTiledSpace(T0)), + TileOrientation, + KCArgs, + Calls(2, + Call(PoolKerName, LOC_LOOP, + Bindings(18, + K_Arg("In", KER_ARG_TILE), + K_Arg("In", KER_ARG_TILE_W), + K_Arg("In", KER_ARG_TILE_USEDW), + K_Arg("In", KER_ARG_TILE_H), + K_Arg("In", KER_ARG_TILE_USEDH), + ParFeat?K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0):Imm(1), /* Number of features in this tile */ + K_Arg("Out", KER_ARG_TILE), + K_Arg("In", KER_ARG_TILE_PAD), + NeedFpx?Imm(Fpx):AT_IGNORE_ARG_BINDING, /* Pooling Fx */ + NeedSpx?Imm(Spx):AT_IGNORE_ARG_BINDING, /* Pooling Stridex */ + NeedDpx?Imm(Dpx):AT_IGNORE_ARG_BINDING, /* Pooling Dx */ + NeedFpy?Imm(Fpy):AT_IGNORE_ARG_BINDING, /* Pooling Fy */ + NeedSpy?Imm(Spy):AT_IGNORE_ARG_BINDING, /* Pooling Stridey */ + NeedDpy?Imm(Dpy):AT_IGNORE_ARG_BINDING, /* Pooling Dy */ + Imm((PoolOper==KOP_MAXPOOL)?1:0), /* PoolMax or PoolAvg */ + Imm((TileOrientation==TILE_HOR)?1:0), /* Pooling Orientation when feature parallel */ + Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + ParFeat?K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0):Imm(1), /* Number of features in this tile */ + K_Arg("Out", KER_ARG_TILE_W), /* Tile width */ + K_Arg("Out", KER_ARG_TILE_H), /* Tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KArgs + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Out", 4, Feat, Ho, Wo, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, Feat, Feat, Width, Height, 1, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PadInp, 0, 0,0,0,0,0,0,(v4s) 0, ActOper); + } + return (Kernel!=0); +} + + +/********************************************************************************************************************************************************************* + Generator for Activation with tensor centric scaling + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + ActOper: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Out, Infos) + + CNN_Act_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_Act_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T ActOper + ) + +{ + Tile_Orientation_T TileOrientation = TILE_HOR; + int ParFeat = 1; + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) 
ParFeat = Ctrl->ParallelFeatures; + } + int TileCons = 0; + char *ActKerName=0; + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int StandAloneAct = (ActOper!=KOP_NONE); + int Log=1; + + if (!(ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + GenTilingError("CNN_Act_SQ8 Kernel: %s, ActOper, expecting KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (ActKerName==0) GenTilingError("CNN_Act_SQ8 Kernel: %s, Can't find a matching Activation basic kernel", Name); + + LayerOp += Feat*Width*Height; + + LayerBandwidth += Width*Height*1*Feat; + LayerBandwidth += Width*Height*1*Feat; + + if (Log) { + printf("Act => W: %d, Wo: %d\n", Width, Width); + printf(" => H: %d, Ho: %d\n", Height, Height); + printf("%20s: %s\n", "ActKerName", ActKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + + Kernel_T *Kernel = UserKernel(Name, + ParFeat? + KernelIterSpace(2, IterParSpace(D0, Feat, 8), IterTiledSpace(T0)): + KernelIterSpace(2, IterFixedSpace(D0, Feat), IterTiledSpace(T0)), + TileOrientation, + CArgs(3, + TCArg(CNN_ArgDataType(1,1,1), "In"), + TCArg(CNN_ArgDataType(1,1,1), "Out"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(1, + Call(ActKerName, LOC_LOOP, + Bindings(6, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + ParFeat?K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0):Imm(1), /* Number of features in this tile */ + K_Arg("In", KER_ARG_TILE_W), /* Tile width */ + K_Arg("In", KER_ARG_TILE_H), /* Tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KerArgs(3, + KerArg("In", KerArgSpace(2,D0,T0), OBJ_IN_DB, Width, Height, 1, 0, 0, 0, "In"), + KerArg("Out", KerArgSpace(2,D0,T0), OBJ_OUT_DB, Width, Height, 1, 0, 0, 0, "Out"), + KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Out", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, ActOper, 0,0,0,0,0,0,(v4s) 0, 0, 0,0,0,0,0,0,(v4s) 0, KOP_NONE); + } + return (Kernel!=0); +} + +/********************************************************************************************************************************************************************* + Generator for Global Pooling (Max or Average) with tensor centric scaling and optional activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + PoolOper: KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL + + ActOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Out, Infos) + + + CNN_GlobalPoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_GlobalPoolAct_SQ8( + char *Name, + + CNN_GenControl_T 
*Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T PoolOper, + KernelOper_T ActOper + ) + +{ + Tile_Orientation_T TileOrientation = TILE_HOR; + int ParFeat = 1; + int Wo, Ho; + char *PoolKerName=0, *ActKerName=0; + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int StandAloneAct = (ActOper!=0); + int Log=1; + + if (!(PoolOper == KOP_GLOBAL_MAXPOOL || PoolOper == KOP_GLOBAL_AVGPOOL)) + GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, PoolOper should be KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL", Name); + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + + PoolKerName = CNN_FindMatchingKernel(PoolOper, ActOper, ParFeat, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (PoolKerName) StandAloneAct = 0; + else if (StandAloneAct) PoolKerName = CNN_FindMatchingKernel(PoolOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (PoolKerName==0) GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, Can't find a matching Pooling basic kernel", Name); + + if (StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (ActKerName==0) GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, Can't find a matching Activation basic kernel", Name); + } + + Wo = 1; Ho = 1; + + + if (PoolOper) LayerOp += Feat*Wo*Ho*Width*Height; + LayerBandwidth += Width*Height*1*Feat; + LayerBandwidth += Wo*Ho*1*Feat; + + if (Log) { + printf("Global Pool => W: %d => Wo: %d\n", Width, Wo); + printf(" => H: %d => Ho: %d\n", Height, Ho); + printf(" => Feat: %d\n", Feat); + if (PoolKerName) printf("%20s: %s\n", "PoolKerName", PoolKerName); + if (ActKerName) printf("%20s: %s\n", "ActKerName", ActKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + + Kernel_T *Kernel; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + /* First try moving entire features only */ + Kernel = UserKernel(Name, + KernelIterSpace(1, IterTiledSpace(T0)), + TileOrientation, + CArgs(3, + TCArg(CNN_ArgDataType(1,1,1), "In"), + TCArg(CNN_ArgDataType(1,1,1), "Out"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(2, + Call(PoolKerName, LOC_LOOP, + Bindings(8, + K_Arg("In", KER_ARG_TILE), /* In tile */ + Imm(Width), /* In tile width */ + Imm(Height), /* In Tile Height */ + K_Arg("Out", KER_ARG_TILE_H), /* Number of output features in this tile */ + AT_IGNORE_ARG_BINDING, /* In Tile Index, ignored here */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_Arg("Out", KER_ARG_TILE_H), /* Number of features in this tile */ + Imm(1), /* Tile width */ + Imm(1), /* Tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KerArgs(3, + KerArg("In", KerArgSpace(1,T0), OBJ_IN_DB, Width*Height, Feat, 1, 0, 0, 8, "In"), + KerArg("Out", KerArgSpace(1,T0), OBJ_OUT_DB, 1, Feat, 1, 0, 0, 0, "Out"), + KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") + ) + ); + 
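/* The attempt above runs with the "no solution" error disabled and tries to tile with a full
   feature (Width*Height bytes) as the atomic In tile, so the fused pooling kernel can reduce each
   feature in one shot. If the tiler finds no fitting solution (Kernel == 0), the fallback below
   re-plans the layer in parallel-feature form: a 4-byte per-feature accumulator ("DPOut") is
   updated tile by tile by the global pooling kernel, then a reduction kernel
   (KOP_GLOBAL_MAXPOOL_REDUCT / KOP_GLOBAL_AVGPOOL_REDUCT) produces the scaled 8-bit output in the
   loop epilog, with an optional stand-alone activation applied afterwards. */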
AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + if (Kernel == 0) { + char *PoolKerName=0, *PoolReductKerName=0, *ActKerName=0; + int StandAloneAct = (ActOper!=KOP_NONE); + KernelOper_T ReductOper = (PoolOper==KOP_GLOBAL_MAXPOOL)?KOP_GLOBAL_MAXPOOL_REDUCT:KOP_GLOBAL_AVGPOOL_REDUCT; + + PoolKerName = CNN_FindMatchingKernel(PoolOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (PoolKerName==0) GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, Can't find a matching global pooling basic kernel", Name); + + PoolReductKerName = CNN_FindMatchingKernel(ReductOper, ActOper, ParFeat, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (PoolReductKerName) StandAloneAct = 0; + else if (StandAloneAct) PoolReductKerName = CNN_FindMatchingKernel(ReductOper, KOP_NONE, ParFeat, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (PoolReductKerName==0) GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, Can't find a matching reduction basic kernel", Name); + + if (StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (ActKerName==0) GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, Can't find a matching activation basic kernel", Name); + } + if (Log) { + printf("Global Pool DP => W: %d => Wo: %d\n", Width, Wo); + printf(" => H: %d => Ho: %d\n", Height, Ho); + printf(" => Feat: %d\n", Feat); + if (PoolKerName) printf("%20s: %s\n", "PoolKerName", PoolKerName); + if (PoolReductKerName) printf("%20s: %s\n", "PoolReductKerName", PoolReductKerName); + if (ActKerName) printf("%20s: %s\n", "ActKerName", ActKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + Kernel = UserKernel(Name, + KernelIterSpace(2, IterParSpace(D0, Feat, 8), IterTiledSpace(T0)), + TileOrientation, + CArgs(3, + TCArg(CNN_ArgDataType(1,1,1), "In"), + TCArg(CNN_ArgDataType(1,1,1), "Out"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(3, + Call(PoolKerName, LOC_LOOP, + Bindings(8, + K_Arg("In", KER_ARG_TILE), /* In tile */ + K_Arg("In", KER_ARG_TILE_W), /* In tile width */ + K_Arg("In", KER_ARG_TILE_H), /* In Tile Height */ + K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of output features in this tile */ + K_Arg("In", KER_ARG_TILEINDEX), /* In Tile Index */ + K_Arg("DPOut", KER_ARG_TILE), /* Output tile */ + AT_IGNORE_ARG_BINDING, /* Scaling when no activation, not needed here */ + AT_IGNORE_ARG_BINDING /* Infos, not needed here */ + ) + ), + Call(PoolReductKerName, LOC_LOOP_EPILOG, + Bindings(8, + K_Arg("DPOut", KER_ARG_TILE), /* In tile */ + K_Arg("In", KER_ARG_TILE_W), /* In tile width */ + K_Arg("In", KER_ARG_TILE_H), /* In Tile Height */ + K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of output features in this tile */ + AT_IGNORE_ARG_BINDING, /* In Tile Index, not needed here */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP_EPILOG, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Number of features in this tile */ + Imm(1), /* Tile width */ + Imm(1), /* Tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KerArgs(4, + KerArg("In", KerArgSpace(2,D0,T0), OBJ_IN_DB, Width, Height, 1, 0, 0, 0, "In"), + KerArg("DPOut", KerArgSpace(1,D0), O_BUFF, 1, 1, 4, 0, 0, 0, ""), + 
KerArg("Out", KerArgSpace(1,D0), OBJ_OUT_DB, 1, 1, 1, 0, 0, 0, "Out"), + KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") + ) + ); + } + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Out", 4, Feat, 1, 1, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, PoolOper, 0,0,0,0,0,0, (v4s)0, 0, 0,0,0,0,0,0,(v4s) 0, ActOper); + } + return (Kernel!=0); +} + + +/********************************************************************************************************************************************************************* + Generator for Linear layers followed wth channel centric scaling followed by an optional activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 0) + + Bias_DataSize: 1: byte, 2: half word, 4: word + Scale_DataSize: 1: byte, 2: half word, 4: word + + InDim: Number of inputs + OutDim: Number of outputs + + LinearOper KOP_LINEAR + ActOper Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) + + CNN_LinearAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_LinearAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int InDim, + int OutDim, + + KernelOper_T LinearOper, + KernelOper_T ActOper + ) + +{ + int Log = 1; + int Iter; + Tile_Orientation_T TileOrientation = TILE_HOR; + int ParFeat = 1; + int StandAloneAct = (ActOper!=KOP_NONE); + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; + } + char *LinearKerName=0, *ActKerName=0; + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + + /* First try to map on Linear Kernel without reduction, for that In and one full line of Weights must fit into L1 */ + if (LinearOper != KOP_LINEAR) GenTilingError("CNN_LinearAct_SQ8 Kernel: %s, only KOP_LINEAR should be used as LinearOper argument", Name); + + LinearKerName = CNN_FindMatchingKernel(LinearOper, ActOper, ParFeat, 1, 1, Bias_DataSize, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (LinearKerName) StandAloneAct = 0; + else if (StandAloneAct) LinearKerName = CNN_FindMatchingKernel(LinearOper, KOP_NONE, ParFeat, 1, 1, Bias_DataSize, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (LinearKerName==0) GenTilingError("CNN_LinearAct_SQ8 Kernel: %s, Can't find a matching Linear basic kernel", Name); + + if (StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 1, Bias_DataSize, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (ActKerName==0) GenTilingError("CNN_LinearAct_SQ8 Kernel: %s, Can't find a matching Activation basic kernel", Name); + } + + LayerOp += InDim*OutDim; + if (ActOper) LayerOp += OutDim; + LayerBandwidth += InDim*OutDim*1; + LayerBandwidth += OutDim*1; + LayerBandwidth += InDim*OutDim*1; + LayerBandwidth += Bias_DataSize*OutDim; + + if (Log) { + printf("Linear Layer %s, %s: InDim: %d, OutDim: %d, Activation: %s\n", Name, 
CNN_KernelOperImage(LinearOper), InDim, OutDim, CNN_KernelOperImage(ActOper)); + if (LinearKerName) printf("Linear Kernel: %s\n", LinearKerName); + if (ActKerName) printf("Act Kernel: %s\n", ActKerName); + } + Kernel_T *Kernel; + + CKernel_Arg_T **KCArgs = AllocateCArgs(7); + int Ca=0; + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "In"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Filter"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Out"); + KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1,1,1), "Scale"); + KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1,1,1), "ScaleN"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Infos"); + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + + Object_T **KArgs = AllocateKerArgs(7); + int Ka=0; + KArgs[Ka++] = KerArg("In", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, InDim*1, 0, 0, 0, "In"); + KArgs[Ka++] = KerArg("Filter", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, InDim*1, 0, 0, 0, "Filter"); + KArgs[Ka++] = KerArg("Bias", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, Bias_DataSize, 0, 0, 0, "Bias"); + KArgs[Ka++] = KerArg("Out", KerArgSpace(1,D0), OBJ_OUT_DB, 1, 1, 1, 0, 0, 0, "Out"); + KArgs[Ka++] = KerArg("Scale", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "Scale"); + KArgs[Ka++] = KerArg("ScaleN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ScaleN"); + KArgs[Ka++] = KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos"); + + Kernel = UserKernel(Name, + KernelIterSpace(2, IterParSpace(D0, OutDim, 8), IterTiledSpace(T0)), + TileOrientation, + KCArgs, + Calls(2, + Call(LinearKerName, LOC_LOOP, + Bindings(10, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Filter", KER_ARG_TILE), /* Filter tile */ + K_Arg("Bias", KER_ARG_TILE), /* Bias tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + Imm(InDim), /* Input tile size */ + Imm(InDim), /* Total Input tile size */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Output tile size */ + K_Arg("Scale", KER_ARG_TILE), /* Scale tile */ + K_Arg("ScaleN", KER_ARG_TILE), /* Norm Scale tile */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP_EPILOG, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Number of features in this tile */ + Imm(1), /* Tile width */ + Imm(1), /* Tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KArgs + ); + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + if (Kernel==0) { + char *SetBiasKerName=0, *LinearKerName=0, *ReductKerName=0; + + SetBiasKerName = CNN_FindMatchingKernel(KOP_SETBIAS, KOP_NONE, ParFeat, Bias_DataSize, 0, 0, 0, 4, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (SetBiasKerName==0) GenTilingError("CNN_LinearAct_SQ8 Kernel: %s, Can't find a matching Set Bias basic kernel", Name); + + LinearKerName = CNN_FindMatchingKernel(KOP_LINEAR, KOP_NONE, ParFeat, 1, 1, 0, 0, 4, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (LinearKerName==0) GenTilingError("CNN_LinearAct_SQ8 Kernel: %s, Can't find a matching Linear basic kernel", Name); + + ReductKerName = CNN_FindMatchingKernel(KOP_DP_REDUCT, ActOper, ParFeat, 4, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (ReductKerName==0) GenTilingError("CNN_LinearAct_SQ8 Kernel: %s, Can't find a matching Reduction basic kernel", Name); + + if (Log) { + printf("Linear Layer %s, %s: InDim: %d, OutDim: %d, 
Activation: %s, output parallel failed, switching to feature parallel form\n", + Name, CNN_KernelOperImage(LinearOper), InDim, OutDim, CNN_KernelOperImage(ActOper)); + if (SetBiasKerName) printf("SetBias Kernel: %s\n", SetBiasKerName); + if (LinearKerName) printf("Linear Kernel : %s\n", LinearKerName); + if (ReductKerName) printf("Reduction Kernel: %s\n", ReductKerName); + } + + Object_T **KArgs = AllocateKerArgs(8); + int Ka=0; + KArgs[Ka++] = KerArg("In", KerArgSpace(1,T0), OBJ_IN_DB, 1, InDim, 1, 0, 0, 0, "In"); + KArgs[Ka++] = KerArg("Filter", KerArgSpace(2,D0,T0), OBJ_IN_DB|O_CONST, 1, InDim, 1, 0, 0, 0, "Filter"); + KArgs[Ka++] = KerArg("Bias", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, Bias_DataSize, 0, 0, 0, "Bias"); + KArgs[Ka++] = KerArg("LinOut", KerArgSpace(1,D0), O_BUFF|O_ONETILE, 1, 1, 4, 0, 0, 0, ""); + KArgs[Ka++] = KerArg("Out", KerArgSpace(1,D0), OBJ_OUT_DB, 1, 1, 1, 0, 0, 0, "Out"); + KArgs[Ka++] = KerArg("Scale", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "Scale"); + KArgs[Ka++] = KerArg("ScaleN", KerArgSpace(1,D0), OBJ_IN_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ScaleN"); + KArgs[Ka++] = KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos"); + + Kernel = UserKernel(Name, + KernelIterSpace(2, IterParSpace(D0, OutDim, 8), IterTiledSpace(T0)), + // KernelIterSpace(2, IterFixedSpace(D0, OutDim), IterTiledSpace(T0)), + TileOrientation, + KCArgs, + Calls(3, + Call(SetBiasKerName, LOC_LOOP_PROLOG, + Bindings(6, + K_Arg("LinOut", KER_ARG_TILE), /* SetBias output tile */ + Imm(1), /* SetBias output tile width */ + Imm(1), /* SetBias output tile height */ + K_ArgPar("LinOut", KER_ARG_PARTILE_SIZE, D0), /* Number of output features in this tile */ + K_Arg("Bias", KER_ARG_TILE), /* SetBias Bias tile */ + K_TileOper("Infos", "char *", '@', AT_INF_BIASN) /* Bias Norm */ + ) + ), + Call(LinearKerName, LOC_LOOP, + Bindings(10, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Filter", KER_ARG_TILE), /* Filter tile */ + AT_IGNORE_ARG_BINDING, /* Bias tile, ignored */ + K_Arg("LinOut", KER_ARG_TILE), /* Output tile */ + K_Arg("In", KER_ARG_TILE_H), /* Input tile height, number of in elements */ + K_Arg("In", KER_ARG_TILE_H), /* Total Input tile height, number of in elements */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Output tile size, number of out elements */ + AT_IGNORE_ARG_BINDING, /* Scale tile, ignored here */ + AT_IGNORE_ARG_BINDING, /* Norm Scale tile, ignored here */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + Call(ReductKerName, LOC_LOOP_EPILOG, /* Reduction also take care of optional activation */ + Bindings(8, + K_Arg("LinOut", KER_ARG_TILE), /* Double precision input tile */ + K_Arg("Out", KER_ARG_TILE), /* Single precision output tile */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Input tile Number of features */ + Imm(1), /* Input tile width */ + Imm(1), /* Input tile height */ + K_Arg("Scale", KER_ARG_TILE), /* Per channel scale tile */ + K_Arg("ScaleN", KER_ARG_TILE), /* Per channel scale normalization tile */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KArgs + ); + } + + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 2, InDim, 1); + AddKernelArgDim(Name, "Filter", 3, OutDim, InDim, 1); + AddKernelArgDim(Name, "Bias", 2, OutDim, Bias_DataSize); + AddKernelArgDim(Name, "Out", 2, OutDim, 1); + AddKernelArgDim(Name, "Scale", 2, OutDim, 1); + AddKernelArgDim(Name, "ScaleN", 2, 
OutDim, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, InDim,OutDim,1,1, Bias_DataSize, LinearOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper); + } + return (Kernel!=0); +} + +/********************************************************************************************************************************************************************* + Generator for SoftMax layers + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation), Def=(TILE_HOR) + + Dim: Number of inputs + + SoftMaxOper: Should always be KOP_SOFTMAX + + Signature: Name(In, Out, Infos) + + CNN_SoftMax + Input and output are assumed to fit within given shared L1 memory. Dim is partitionned into subsets of inputs and each subset is given to + a different code. By definition Output contains value is the [0.0 .. 1.0] range with sum(Output)=1.0. Results are always represented in Q15 +*********************************************************************************************************************************************************************/ + +int CNN_SoftMax_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Dim, + + KernelOper_T SoftMaxOper + ) + +{ + Tile_Orientation_T TileOrientation = TILE_HOR; + int ParFeat = 0; + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; + } + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + char *SoftMaxKerName = CNN_FindMatchingKernel(SoftMaxOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 2, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + + if (SoftMaxKerName==0) GenTilingError("CNN_SoftMax_SQ8 Kernel: %s, Can't find a matching basic kernel, warning 16 bits output only, Q15 output", Name); + + LayerOp += Dim; + LayerBandwidth += Dim*1 + Dim*2; + Kernel_T *Kernel = UserKernel(Name, + KernelIterSpace(1, IterTiledSpace(T0)), + TileOrientation, + CArgs(3, + TCArg(CNN_ArgDataType(1,1,1), "In"), + TCArg(CNN_ArgDataType(2,1,1), "Out"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(1, + Call(SoftMaxKerName, LOC_LOOP, + Bindings(5, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("In", KER_ARG_TILE_H), /* Number of inputs */ + K_TileOper("Infos", "char *", '@', AT_INF_BIASL_SM), /* Input left Norm factor */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KerArgs(3, + KerArg("In", KerArgSpace(1,T0), OBJ_BUFFER_IN, 1, Dim, 1, 0, 0, 8, "In"), + KerArg("Out", KerArgSpace(1,T0), OBJ_BUFFER_OUT, 1, Dim, 2, 0, 0, 0, "Out"), + KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 2, Dim, 1); + AddKernelArgDim(Name, "Out", 2, Dim, 2); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, Dim,Dim,1,1, 1, SoftMaxOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, 0); + } + return (Kernel!=0); +} + + +/********************************************************************************************************************************************************************* + Generator for Matrix Addition layers with input scale adjustment (tensor centric), output scaling (tensor centric) and optional activation + + Template: + Name: Name of the generated 
user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of features + Width: Width of a given feature + Height: Height of a given feature + + AddMatOper: Should always be KOP_MATADD + ActOper: Optional activation + + Signature: Name(In1, In2, Out, Infos) + + CNN_MatAddAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatAddAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T AddMatOper, + KernelOper_T ActOper +) + +{ + Tile_Orientation_T TileOrientation = TILE_HOR; + int ParFeat = 1; + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; + } + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int StandAloneAct = (ActOper!=KOP_NONE); + char *MatAddKerName=0, *ActKerName=0; + + MatAddKerName = CNN_FindMatchingKernel(AddMatOper, ActOper, ParFeat, 1, 1, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (MatAddKerName) StandAloneAct = 0; + else MatAddKerName = CNN_FindMatchingKernel(AddMatOper, KOP_NONE, ParFeat, 1, 1, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (MatAddKerName==0) GenTilingError("CNN_MatAddAct_SQ8 Kernel: %s, Can't find a matching basic kernel for MatAdd", Name); + + if (StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 1, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (ActKerName==0) GenTilingError("CNN_MatAddAct_SQ8 Kernel: %s, Can't find a matching basic kerne for Activationl", Name); + } + + LayerOp += Feat * Width * Height; + LayerBandwidth += Width*Height*1*Feat; + LayerBandwidth += Width*Height*1*Feat; + LayerBandwidth += Width*Height*1*Feat; + + Kernel_T *Kernel = UserKernel(Name, + KernelIterSpace(2, IterParSpace(D0, Feat, 8), IterTiledSpace(T0)), + TileOrientation, + CArgs(4, + TCArg(CNN_ArgDataType(1,1,1), "In1"), + TCArg(CNN_ArgDataType(1,1,1), "In2"), + TCArg(CNN_ArgDataType(1,1,1), "Out"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(2, + Call(MatAddKerName, LOC_LOOP, + Bindings(8, + K_Arg("In1", KER_ARG_TILE), /* First input tile */ + K_Arg("In2", KER_ARG_TILE), /* Second input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("In1", KER_ARG_PARTILE_SIZE, D0), /* Number of Matrices involved */ + K_Arg("In1", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("In1", KER_ARG_TILE_H), /* Input tile height */ + Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Number of features in this tile */ + K_Arg("Out", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("Out", KER_ARG_TILE_H), /* Input tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KerArgs(4, + KerArg("In1", KerArgSpace(2,D0,T0), O_IN|O_DB, Width, Height, 1, 0, 0, 0, "In1"), + KerArg("In2", KerArgSpace(2,D0,T0), O_IN|O_DB, Width, Height, 1, 0, 0, 0, "In2"), + KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB, Width, Height, 1, 0, 0, 0, "Out"), + KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + ) + ); + 
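/* Illustrative use of this generator (hypothetical layer sizes, not part of the library): a
   residual addition of two 32x28x28 SQ8 tensors followed by a ReLU could be requested with

       CNN_MatAddAct_SQ8("ResAdd0", 0, 32, 28, 28, KOP_MATADD, KOP_RELU);

   which emits a user kernel with signature ResAdd0(In1, In2, Out, Infos); the tensor-centric
   input and output rescaling parameters are expected to be passed through the Infos buffer. */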
if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In1", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "In2", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Out", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, AddMatOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper); + } + return (Kernel!=0); +} + +/********************************************************************************************************************************************************************* + Generator for Tensor [CxHxW] by Vector [C] product with tensor centric scaling and optional Activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + InFeat: Number of input features + Width: Width of a In1 + Height: Height of a In1 + + MatOper: KOP_MATVECTMUL + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In1, In2, Out, Infos) + + CNN_TensorVectMultAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_TensorVectMultAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T MatOper, + KernelOper_T ActOper +) + +{ + Tile_Orientation_T TileOrientation = TILE_HOR; + int ParFeat = 1; + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; + } + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + char *MatOperKerName=0, *ActKerName=0; + int StandAloneAct=(ActOper!=KOP_NONE); + + MatOperKerName = CNN_FindMatchingKernel(MatOper, ActOper, ParFeat, 1, 1, 1, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (MatOperKerName) StandAloneAct=0; + else if (StandAloneAct) MatOperKerName = CNN_FindMatchingKernel(MatOper, KOP_NONE, ParFeat, 1, 1, 1, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (MatOperKerName==0) GenTilingError("CNN_TensorVectMultAct_SQ8 Kernel: %s, Can't find a matching basic kernel for MatVectMult", Name); + if (StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, ParFeat, 1, 1, 1, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (ActKerName==0) GenTilingError("CNN_TensorVectMultAct_SQ8 Kernel: %s, Can't find a matching basic kernel for Activation", Name); + } + + + LayerOp += Feat * Width * Height; + + LayerBandwidth += Width*Height*1*Feat; + LayerBandwidth += 1*Feat; + LayerBandwidth += Width*Height*1*Feat; + + Kernel_T *Kernel; + CKernel_Arg_T **KerCArgs; + int Ca=0; + Object_T **KArgs; + int Ka=0; + + KerCArgs = AllocateCArgs(4); + KerCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "In1"); + KerCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "In2"); + KerCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Out"); + KerCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1), "Infos"); + + KArgs = AllocateKerArgs(4); + KArgs[Ka++] = KerArg("In1", KerArgSpace(2,D0,T0), O_IN|O_DB, 1, 1, Width*Height*1, 0, 0, 0, "In1"); + KArgs[Ka++] = KerArg("In2", KerArgSpace(1,D0), O_IN|O_DB, 1, 1, 1, 0, 0, 0, "In2"); + KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB, 1, 1, Width*Height*1, 0, 0, 0, "Out"); + 
KArgs[Ka++] = KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos"); + + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + /* First try with entire feature in L1 */ + Kernel= UserKernel(Name, + KernelIterSpace(2, IterParSpace(D0, Feat, 8), IterTiledSpace(T0)), + TileOrientation, + KerCArgs, + Calls(2, + Call(MatOperKerName, LOC_LOOP, + Bindings(8, + K_Arg("In1", KER_ARG_TILE), /* First input tile */ + K_Arg("In2", KER_ARG_TILE), /* Second input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("In1", KER_ARG_PARTILE_SIZE, D0), /* Number of Matrices involved */ + Imm(Width), /* Input tile width */ + Imm(Height), /* Input tile height */ + Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Number of features in this tile */ + Imm(Width), /* Input tile width */ + Imm(Height), /* Input tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KArgs + ); + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + if (Kernel == 0) { + Ka=0; + KArgs[Ka++] = KerArg("In1", KerArgSpace(2,D0,T0), O_IN|O_DB, Width, Height, 1, 0, 0, 0, "In1"); + KArgs[Ka++] = KerArg("In2", KerArgSpace(1,D0), O_IN|O_DB, 1, 1, 1, 0, 0, 0, "In2"); + KArgs[Ka++] = KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB, Width, Height, 1, 0, 0, 0, "Out"); + KArgs[Ka++] = KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos"); + + Kernel= UserKernel(Name, + KernelIterSpace(2, IterParSpace(D0, Feat, 8), IterTiledSpace(T0)), + TileOrientation, + KerCArgs, + Calls(2, + Call(MatOperKerName, LOC_LOOP, + Bindings(8, + K_Arg("In1", KER_ARG_TILE), /* First input tile */ + K_Arg("In2", KER_ARG_TILE), /* Second input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("In1", KER_ARG_PARTILE_SIZE, D0), /* Number of Matrices involved */ + K_Arg("In1", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("In1", KER_ARG_TILE_H), /* Input tile height */ + Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("Out", KER_ARG_PARTILE_SIZE, D0), /* Number of features in this tile */ + K_Arg("Out", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("Out", KER_ARG_TILE_H), /* Input tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KArgs + ); + } + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In1", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "In2", 2, Feat, 1); + AddKernelArgDim(Name, "Out", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, MatOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper); + } + return (Kernel!=0); +} + +/********************************************************************************************************************************************************************* + Generator for Matrix Multiplication layers with channel centric scaling followed by an optional 
Activation. + + Can be used for 1x1 convolutions with Filters in In1 [OutFeat x InFeat] and Features in In2 [InFeat x W*H] + When non unit strides are used they apply to In2, produced output is [OutFeat x Floor((W+Scx-1)/Scx)*Floor((H+Scy-1)/Scy)] + Bias [OutFeat x 1] is added to each individual features + Line x Col sum of products are evaluated on 32 bits therefore, when used for 1x1 convolution, this generator is equivalent to KOP_CONV_DP + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options + + Bias_DataSize: 1: byte, 2: half word, + Scale_DataSize: 1: byte, 2: half word, 4: word + + ColM1: Number of colums for matrix In1, for 1x1 convolution this is InFeat + LineM1: Number of lines for matrix In1, for 1x1 convolution this is OutFeat + ColM2: Number of colums for matrix In2, for 1x1 convolution this is W*H + LineM2: Number of lines for matrix In2, for 1x1 convolution this is InFeat + + Width For 1x1 convolution, width of an input feature map + Height For 1x1 convolution, height of an input feature map + Scx: stride x dimension for In2 + Scy: stride y dimension for In2 + + MatMulOper: Should always be KOP_MATMUL + + ActOper: Optionnal Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + + Signature: Name(In2, In1, Bias, Out, Scale, ScaleN, Infos) + + CNN_MatMulAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatMulAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int Width, + int Height, + int Scx, + int Scy, + + KernelOper_T MatMulOper, + KernelOper_T ActOper + ) + +{ + int Log = 1; + Tile_Orientation_T TileOrientation = TILE_HOR; + int NeedScx, NeedScy; + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int LineO = LineM1, ColO = ColM2; + int ConsT0 = Scx; + int Nbuff; + int ColFirst = ((LineM1*ColM1)<(LineM2*ColM2)); + char *MatMulKerName=0, *ActKerName=0; + int StandAloneAct = (ActOper!=KOP_NONE); + + if (!(MatMulOper == KOP_MATMUL)) GenTilingError("CNN_MatMulAct_SQ8 Kernel: %s, MatMulOper should be KOP_MATMUL", Name); + + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + GenTilingError("CNN_MatMulAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + + if (ColM1 != LineM2) GenTilingError("CNN_MatMulAct_SQ8: %s, Incorrect input matrices dimensions for a matrix multiplication: [%d x %d]*[%d x %d] %s", Name, LineM1, ColM1, LineM2, ColM2); + if (Width==0||Height==0) { + Width = ColM2; Height=1; Scx = 1; Scy = 1; + } + if ((Width*Height) != ColM2) GenTilingError("CNN_MatMulAct_SQ8: %s, ColM2: %d should always be equal to Width: %d * Height: %d", Name, ColM2, Width, Height); + if (Scx==1 && Scy==1) Nbuff = 4; else Nbuff = 1; + + MatMulKerName = CNN_FindMatchingKernel(MatMulOper, ActOper, 1, 1, 1, Bias_DataSize, 0, 1, 0,0,0,0,Scx,Scy, 0,0,0,0, &NeedScx, &NeedScy, 0); + if (MatMulKerName) StandAloneAct = 0; + else MatMulKerName = CNN_FindMatchingKernel(MatMulOper, KOP_NONE, 1, 1, 1, Bias_DataSize, 0, 1, 0,0,0,0,Scx,Scy, 0,0,0,0, &NeedScx, &NeedScy, 0); + if (MatMulKerName==0) GenTilingError("CNN_MatMulAct_SQ8 Kernel: %s, Can't find a matching basic kernel for MatMul", Name); 
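/* Kernel selection pattern used throughout this file: first look for a basic kernel fusing the
   matrix multiply with the requested activation; if none exists, fall back to the KOP_NONE
   variant and keep StandAloneAct set so a separate activation kernel is scheduled after the
   reduction. When this generator implements a 1x1 convolution, In1 holds the filters
   [OutFeat x InFeat] and In2 the features [InFeat x W*H]; the output column count is
   ColO = ((Width+Scx-1)/Scx) * ((Height+Scy-1)/Scy), e.g. (illustrative numbers) W=H=28 with
   Scx=Scy=2 gives ColO = 14*14 = 196 output positions per output feature. */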
+ + if (StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, 0, 0, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0, 0, 0, 0); + if (ActKerName==0) GenTilingError("CNN_MatMulAct_SQ8 Kernel: %s, Can't find a matching basic kernel for Activation", Name); + } + + ColO = ((Width+Scx-1)/Scx) * ((Height+Scy-1)/Scy); + LayerOp += ColM1*ColO*LineM1; + LayerBandwidth += LineM1*(ColM1*ColM2*(1+1)); + LayerBandwidth += LineM1*ColM2*1; + LayerBandwidth += LineM1*Bias_DataSize; + + if (Scy!=1) ConsT0 = Width*Scy; else ConsT0 = 4; + + if (Log) { + printf("CNN_MatMul_SQ8: %s\n", Name); + printf("In1 => W: %4d, H: %4d\n", ColM1, LineM1); + printf("In2 => W: %4d, H: %4d, w: %4d, h: %4d, Sx: %1d, Sy: %1d\n", ColM2, LineM2, Width, Height, Scx, Scy); + printf("Out => W: %4d, H: %4d => %s\n", ColO, LineO, ColFirst?"Column first":"Line First"); + if (MatMulKerName) printf("%20s: %s\n", "MatMulKerName", MatMulKerName); + if (ActKerName) printf("%20s: %s\n", "ActKerName", ActKerName); + // printf("Nb Oper : %lld\n", LayerOp); + } + + Kernel_T *Kernel; + + /* First try buffering small objects */ + Kernel = UserKernel(Name, + KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)), + TILE_HOR, + CArgs(7, + TCArg(CNN_ArgDataType(1,1,1), "In2"), + TCArg(CNN_ArgDataType(1,1,1), "In1"), + TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"), + TCArg(CNN_ArgDataType(1,1,1), "Out"), + TCArg(CNN_ArgDataTypeUns(1,1,1),"Scale"), + TCArg(CNN_ArgDataTypeUns(1,1,1),"ScaleN"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(2, + Call(MatMulKerName, LOC_LOOP, + Bindings(19, + K_Arg("In1", KER_ARG_TILE), K_Arg("In1", KER_ARG_TILE_W), K_Arg("In1", KER_ARG_TILE_H), + K_Arg("In2", KER_ARG_TILE), K_Arg("In2", KER_ARG_TILE_W), + K_Arg("Bias", KER_ARG_TILE), + K_Arg("Scale", KER_ARG_TILE), K_Arg("ScaleN", KER_ARG_TILE), + K_Arg("Out", KER_ARG_TILE), K_Arg("Out", KER_ARG_TILE_W), K_Arg(ColFirst?"In1":"In2", KER_ARG_TILE_BASE), + K_Arg("KerBuff", KER_ARG_TILE), + K_TileOper("Infos", "char *", '@', AT_INF_BIASN), + Imm(ColFirst), + NeedScx?Imm(Scx):AT_IGNORE_ARG_BINDING, + NeedScy?Imm(Scy):AT_IGNORE_ARG_BINDING, + (NeedScx||NeedScy)?Imm(Width):AT_IGNORE_ARG_BINDING, + (NeedScx||NeedScy)?Imm(Height):AT_IGNORE_ARG_BINDING, + K_Arg("Infos", KER_ARG_TILE) + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, /* Uses feature par form with W: Out.W, H: Out.H given the fact that H is feat and W is w*h */ + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + Imm(1), /* Input of features in this tile */ + K_Arg("Out", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("Out", KER_ARG_TILE_H), /* Input tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + ColFirst? 
+ KerArgs(8, + KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0), + KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + KerArg("Bias", KerArgSpace(1, T0), O_BUFF|O_IN|O_CONST, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + KerArg("Scale", KerArgSpace(1, T0), O_BUFF|O_IN|O_CONST, 1, LineO, 1, 0, 0, 0, "Scale"), + KerArg("ScaleN", KerArgSpace(1, T0), O_BUFF|O_IN|O_CONST, 1, LineO, 1, 0, 0, 0, "ScaleN"), + KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + ): + KerArgs(8, + KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0), + KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + KerArg("Bias", KerArgSpace(1, T1), O_BUFF|O_IN|O_CONST, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + KerArg("Scale", KerArgSpace(1, T1), O_BUFF|O_IN|O_CONST, 1, LineO, 1, 0, 0, 0, "Scale"), + KerArg("ScaleN", KerArgSpace(1, T1), O_BUFF|O_IN|O_CONST, 1, LineO, 1, 0, 0, 0, "ScaleN"), + KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + ) + ); + /* If no soultion found roll back to tiled approach for small objects */ + if (Kernel==0) + Kernel = UserKernel(Name, + KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)), + TILE_HOR, + CArgs(7, + TCArg(CNN_ArgDataType(1,1,1), "In2"), + TCArg(CNN_ArgDataType(1,1,1), "In1"), + TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"), + TCArg(CNN_ArgDataType(1,1,1), "Out"), + TCArg(CNN_ArgDataTypeUns(1,1,1),"Scale"), + TCArg(CNN_ArgDataTypeUns(1,1,1),"ScaleN"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(2, + Call(MatMulKerName, LOC_LOOP, + Bindings(19, + K_Arg("In1", KER_ARG_TILE), K_Arg("In1", KER_ARG_TILE_W), K_Arg("In1", KER_ARG_TILE_H), + K_Arg("In2", KER_ARG_TILE), K_Arg("In2", KER_ARG_TILE_W), + K_Arg("Bias", KER_ARG_TILE), + K_Arg("Scale", KER_ARG_TILE), K_Arg("ScaleN", KER_ARG_TILE), + K_Arg("Out", KER_ARG_TILE), K_Arg("Out", KER_ARG_TILE_W), K_Arg(ColFirst?"In1":"In2", KER_ARG_TILE_BASE), + K_Arg("KerBuff", KER_ARG_TILE), + K_TileOper("Infos", "char *", '@', AT_INF_BIASN), + Imm(ColFirst), + NeedScx?Imm(Scx):AT_IGNORE_ARG_BINDING, + NeedScy?Imm(Scy):AT_IGNORE_ARG_BINDING, + (NeedScx||NeedScy)?Imm(Width):AT_IGNORE_ARG_BINDING, + (NeedScx||NeedScy)?Imm(Height):AT_IGNORE_ARG_BINDING, + K_Arg("Infos", KER_ARG_TILE) + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, /* Uses feature par form with W: Out.W, H: Out.H given the fact that H is feat and W is w*h */ + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + Imm(1), /* Input of features in this tile */ + K_Arg("Out", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("Out", KER_ARG_TILE_H), /* Input tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + ColFirst? 
+ KerArgs(8, + KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0), + KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + KerArg("Bias", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + KerArg("Scale", KerArgSpace(1, T0), OBJ_IN_DB|O_CONST, 1, LineO, 1, 0, 0, 0, "Scale"), + KerArg("ScaleN", KerArgSpace(1, T0), OBJ_IN_DB|O_CONST, 1, LineO, 1, 0, 0, 0, "ScaleN"), + KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + ): + KerArgs(8, + KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0), + KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + KerArg("Bias", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, 1, LineO, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"), + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + KerArg("Scale", KerArgSpace(1, T1), OBJ_IN_DB|O_CONST, 1, LineO, 1, 0, 0, 0, "Scale"), + KerArg("ScaleN", KerArgSpace(1, T1), OBJ_IN_DB|O_CONST, 1, LineO, 1, 0, 0, 0, "ScaleN"), + KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In1", 3, LineM1, ColM1, 1); + AddKernelArgDim(Name, "In2", 4, LineM2, Height, Width, 1); + AddKernelArgDim(Name, "Bias", 2, LineO, Bias_DataSize); + AddKernelArgDim(Name, "Out", 3, LineO, ColO, 1); + AddKernelArgDim(Name, "Scale", 2, LineO, 1); + AddKernelArgDim(Name, "ScaleN", 2, LineO, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, ColM1,LineM1,Width,Height, Bias_DataSize, MatMulOper, 1,1,1,1,Scx,Scy,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper); + } + return (Kernel!=0); +} + +/********************************************************************************************************************************************************************* + Generator for Matrix Multiplication layers with channel centric scaling followed by an optional Activation. + Special form to handle small form factor In1 (InFeat x OutFeat) + + Can be used for 1x1 convolutions with Filters in In1 [OutFeat x InFeat] and Features in In2 [InFeat x W*H] + When non unit strides are used they apply to In2, produced output is [OutFeat x Floor((W+Scx-1)/Scx)*Floor((H+Scy-1)/Scy)] + Bias [OutFeat x 1] is added to each individual features + Line x Col sum of products are evaluated on 32 bits therefore, when used for 1x1 convolution, this generator is equivalent to KOP_CONV_DP + This generator assumes that the whole In1 and Bias can be accomodated into shared L1. 
Expecting to be used for up to 32 InFeat x 32 OutFeat + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options (ReluN), Def=(6) + + Bias_DataSize: 1: byte, 2: half word, + Scale_DataSize: 1: byte, 2: half word, 4: word + + ColM1: Number of columns for matrix In1, for 1x1 convolution this is InFeat + LineM1: Number of lines for matrix In1, for 1x1 convolution this is OutFeat + ColM2: Number of columns for matrix In2, for 1x1 convolution this is W*H + LineM2: Number of lines for matrix In2, for 1x1 convolution this is InFeat + + Width For 1x1 convolution, width of an input feature map + Height For 1x1 convolution, height of an input feature map + Scx: stride x dimension for In2 + Scy: stride y dimension for In2 + + MatMulOper Should always be KOP_MATMUL_SM1 + + ActOper Optional Activation (KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + + Signature: Name(In2, In1, Bias, Out, Scale, ScaleN, Infos) + + CNN_MatMulSmallM1Act_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatMulSmallM1Act_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int Width, + int Height, + int Scx, + int Scy, + + KernelOper_T MatMulOper, + KernelOper_T ActOper + ) + +{ + int Log = 1; + Tile_Orientation_T TileOrientation = TILE_VER; + int NeedScx, NeedScy; + unsigned long long int LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int LineO = LineM1, ColO = ColM2; + int OutLB, OutUB, ReluN = 6; + int ConsT0 = Scx; + int TileCons = 8; + + char *MatMulKerName=0, *MatTransKerName=0, *ActKerName=0; + int StandAloneAct = (ActOper!=KOP_NONE); + + if (!(MatMulOper == KOP_MATMUL_SM1)) GenTilingError("CNN_MatMulSmallM1Act_SQ8 Kernel: %s, MatMulOper should be KOP_MATMUL_SM1", Name); + + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + GenTilingError("CNN_MatMulSmallM1Act_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + + if (ColM1 != LineM2) GenTilingError("CNN_MatMulSmallM1Act_SQ8: %s, Incorrect input matrices dimensions for a matrix multiplication: [%d x %d]*[%d x %d]", Name, LineM1, ColM1, LineM2, ColM2); + if (Width==0||Height==0) { + Width = ColM2; Height=1; Scx = 1; Scy = 1; + } else if (Scy != 1) TileCons = Width*Scy; + if ((Width*Height) != ColM2) GenTilingError("CNN_MatMulSmallM1Act_SQ8: %s, ColM2: %d should always be equal to Width: %d * Height: %d", Name, ColM2, Width, Height); + + MatMulKerName = CNN_FindMatchingKernel(MatMulOper, ActOper, 1, 1, 1, Bias_DataSize, 0, 1, 0,0,0,0,1,1, 0,0,0,0, &NeedScx, &NeedScy, 0); + if (MatMulKerName) StandAloneAct = 0; + else if (StandAloneAct) MatMulKerName = CNN_FindMatchingKernel(MatMulOper, KOP_NONE, 1, 1, 1, Bias_DataSize, 0, 1, 0,0,0,0,1,1, 0,0,0,0, &NeedScx, &NeedScy, 0); + if (MatMulKerName==0) GenTilingError("CNN_MatMulSmallM1Act_SQ8 Kernel: %s, Can't find a matrix multiplication matching basic kernel", Name); + + if (StandAloneAct) { + ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, 0, 0, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0, 0, 0, 0); + if (ActKerName==0) GenTilingError("CNN_MatMulSmallM1Act_SQ8 Kernel: %s, Can't find a matching basic kernel for Activation", Name); + } + + MatTransKerName =
CNN_FindMatchingKernel(KOP_MATTRANSP, KOP_NONE, 0, 1, 0, 0, 0, 1, 0,0,0,0,Scx,Scy, 0,0,0,0, &NeedScx, &NeedScy, 0); + if (MatTransKerName==0) GenTilingError("CNN_MatMulSmallM1Act_SQ8 Kernel: %s, Can't find a matrix transpose matching basic kernel", Name); + + ColO = ((Width+Scx-1)/Scx) * ((Height+Scy-1)/Scy); + LayerOp += ColM1*ColO*LineM1; + LayerBandwidth += LineM1*(ColM1*ColM2*(1+1)); + LayerBandwidth += LineM1*ColM2*1; + LayerBandwidth += LineM1*Bias_DataSize; + + if (Log) { + printf("CNN_MatMulSmallM1_SQ8: %s\n", Name); + printf("In1 => W: %4d, H: %4d\n", ColM1, LineM1); + printf("In2 => W: %4d, H: %4d, w: %4d, h: %4d, Sx: %1d, Sy: %1d, TileCons: %d\n", ColM2, LineM2, Width, Height, Scx, Scy, TileCons); + printf("Out => W: %4d, H: %4d\n", ColO, LineO); + if (MatMulKerName) printf("%20s: %s\n", "MatMulKerName", MatMulKerName); + if (MatTransKerName) printf("%20s: %s\n", "MatTransKerName", MatTransKerName); + if (ActKerName) printf("%20s: %s\n", "ActKerName", ActKerName); + printf("Act: %s\n", CNN_KernelOperImage(ActOper)); + printf("Nb Oper : %lld\n", LayerOp); + } + + Kernel_T *Kernel = UserKernel(Name, + KernelIterSpace(1, IterTiledSpace(T0)), + TILE_VER, + CArgs(7, + TCArg(CNN_ArgDataType(1,1,1), "In2"), + TCArg(CNN_ArgDataType(1,1,1), "In1"), + TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"), + TCArg(CNN_ArgDataType(1,1,1), "Out"), + TCArg(CNN_ArgDataTypeUns(1,1,1),"Scale"), + TCArg(CNN_ArgDataTypeUns(1,1,1),"ScaleN"), + TCArg(CNN_ArgDataType(1,1,1), "Infos") + ), + Calls(3, + Call(MatTransKerName, LOC_LOOP, + Bindings(7, + K_Arg("In2", KER_ARG_TILE), /* Input tile */ + K_Arg("TransIn2", KER_ARG_TILE), /* Transposed input tile */ + Imm(1), /* A single matrix */ + K_Arg("In2", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("In2", KER_ARG_TILE_H), /* Input tile height */ + NeedScx?Imm(Scx):AT_IGNORE_ARG_BINDING, + NeedScy?Imm(Scy):AT_IGNORE_ARG_BINDING + ) + ), + Call(MatMulKerName, LOC_LOOP, + Bindings(19, + K_Arg("In1", KER_ARG_TILE), Imm(ColM1), Imm(LineM1), + K_Arg("TransIn2", KER_ARG_TILE), K_Arg("TransIn2", KER_ARG_TILE_W), + K_Arg("Bias", KER_ARG_TILE), + K_Arg("Scale", KER_ARG_TILE), K_Arg("ScaleN", KER_ARG_TILE), + K_Arg("Out", KER_ARG_TILE), AT_IGNORE_ARG_BINDING, AT_IGNORE_ARG_BINDING, + AT_IGNORE_ARG_BINDING, + K_TileOper("Infos", "char *", '@', AT_INF_BIASN), + AT_IGNORE_ARG_BINDING, + AT_IGNORE_ARG_BINDING, + AT_IGNORE_ARG_BINDING, + AT_IGNORE_ARG_BINDING, + AT_IGNORE_ARG_BINDING, + K_Arg("Infos", KER_ARG_TILE) + ) + ), + (ActKerName==0)?AT_NO_CALL: + Call(ActKerName, LOC_LOOP, + Bindings(6, + K_Arg("Out", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + Imm(1), /* Number of features in this tile */ + K_Arg("Out", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("Out", KER_ARG_TILE_H), /* Input tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ + ) + ) + ), + KerArgs(8, + KerArg("In1", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED|O_CONST, ColM1, LineM1, 1, 0, 0, 0, "In1"), + KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, 0, TileCons, "In2"), + // KerArg("TransIn2", KerArgSpace(1, T0), O_BUFF|O_ONETILE, ColM2, LineM2, 1, 0, 0, 0, ""), + KerArg("TransIn2", KerArgSpace(1, T0), O_BUFF|O_ONETILE, ColO, LineM2, 1, 0, 0, 0, ""), + KerArg("Bias", KerArgSpace(1, T0), O_BUFF|O_IN|O_NTILED|O_CONST, 1, LineM1, Bias_DataSize, 0, 0, 0, "Bias"), + KerArg("Out", KerArgSpace(1, T0), O_OUT|O_DB, ColO, LineM1, 1, 0, 0, 0, "Out"), + KerArg("Scale", KerArgSpace(1, T0), O_BUFF|O_IN|O_NTILED|O_CONST, 1, LineM1, 1, 0, 0, 0, "Scale"), + 
KerArg("ScaleN", KerArgSpace(1, T0), O_BUFF|O_IN|O_NTILED|O_CONST, 1, LineM1, 1, 0, 0, 0, "ScaleN"), + KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In1", 3, LineM1, ColM1, 1); + AddKernelArgDim(Name, "In2", 4, LineM2, Height, Width, 1); + AddKernelArgDim(Name, "Bias", 2, LineO, Bias_DataSize); + AddKernelArgDim(Name, "Out", 3, LineO, ColO, 1); + AddKernelArgDim(Name, "Scale", 2, LineO, 1); + AddKernelArgDim(Name, "ScaleN", 2, LineO, 1); + AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1); + + AT_PrepareForTest_SQ8(Name, ColM1,LineM1,Width,Height, Bias_DataSize, MatMulOper, 1,1,1,1,Scx,Scy,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper); + } + return (Kernel!=0); + +} + +/********************************************************************************************************************************************************************* + Generator for Matrix Transposition + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat Number of matrices + Width For 1x1 convolution, width of an input feature map + Height For 1x1 convolution, height of an input feature map + + MatTransOper KOP_MATTRANSP + + Signature: Name(In, Out) + + CNN_MatTranspose_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatTranspose_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T MatTransOper +) + +{ + int Log = 1; + Tile_Orientation_T TileOrientation = TILE_HOR; + unsigned int OutTileOrientation; + int ParFeat = 1; + if (Ctrl) { + if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; + if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; + } + unsigned long long int LayerOp = Width*Height*Feat; + unsigned long long int LayerBandwidth = 0; + + if (!(MatTransOper == KOP_MATTRANSP)) GenTilingError("CNN_MatTranspose_SQ8 Kernel: %s, MatTransOper should be KOP_MATTRANSP", Name); + + char *MatTransKerName = CNN_FindMatchingKernel(MatTransOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0, 0, 0, 0); + if (MatTransKerName==0) GenTilingError("CNN_MatTranspose_SQ8 Kernel: %s, Can't find a matching basic kernel for Matrix Transpose", Name); + + if (TileOrientation==TILE_HOR) OutTileOrientation = OBJ_CONSTRAINTS_TILE_VER; else OutTileOrientation = OBJ_CONSTRAINTS_TILE_HOR; + LayerBandwidth += Width*Height*1; + LayerBandwidth += Width*Height*1; + + if (Log) { + printf("CNN_MatTranspose: %s %s\n", Name, ParFeat?"Par Feat":""); + printf("In => Feat: %4d, W: %4d, H: %4d\n", Feat, Width, Height); + printf("Out => Feat: %4d, W: %4d, H: %4d\n", Feat, Width, Height); + if (MatTransKerName) printf("%20s: %s\n", "MatTransKerName", MatTransKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + + Kernel_T *Kernel = + UserKernel(Name, + (ParFeat)? 
+ KernelIterSpace(2, IterParSpace(D0, Feat, 8), IterTiledSpace(T0)): + KernelIterSpace(2, IterFixedSpace(D0, Feat), IterTiledSpace(T0)), + TileOrientation, + CArgs(2, + TCArg(CNN_ArgDataType(1,1,1), "In"), + TCArg(CNN_ArgDataType(1,1,1), "Out") + ), + Calls(1, + Call(MatTransKerName, LOC_LOOP, + Bindings(7, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + (ParFeat)? + K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0): /* Number of Matrices involved */ + Imm(1), /* A single matrix */ + K_Arg("In", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("In", KER_ARG_TILE_H), /* Input tile height */ + AT_IGNORE_ARG_BINDING, /* StrideX */ + AT_IGNORE_ARG_BINDING /* StrideY */ + ) + ) + ), + KerArgs(2, + KerArg("In", KerArgSpace(2,D0,T0), O_IN|O_DB, Width, Height, 1, 0, 0, 0, "In"), + KerArg("Out", KerArgSpace(2,D0,T0), O_OUT|O_DB, Height, Width, 1, 0, OutTileOrientation, 0, "Out") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 4, Feat, Height, Width, 1); + AddKernelArgDim(Name, "Out", 4, Feat, Width, Height, 1); + AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, MatTransOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, 0); + } + return (Kernel!=0); +} + +/********************************************************************************************************************************************************************* + Generator for 3D Tensor permutations: CHW => {CWH, HWC, WHC, WCH, HCW} + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat Number of channels of the tensor + Width Tensor width + Height Tensor height + + MatPermOper Permutation oper: KOP_MATPERM_CHW2CWH, KOP_MATPERM_CHW2HWC, KOP_MATPERM_CHW2WHC, KOP_MATPERM_CHW2WCH, KOP_MATPERM_CHW2HCW + + Signature: Name(In, Out) + + CNN_3DTensorPermute_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_3DTensorPermute_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T MatPermOper +) + +{ + int Log = 1; + int ParFeat = 1; + unsigned long long int LayerOp = Width*Height*Feat; + unsigned long long int LayerBandwidth = 0; + + if (!(MatPermOper == KOP_MATPERM_CHW2CWH || MatPermOper == KOP_MATPERM_CHW2HWC || MatPermOper == KOP_MATPERM_CHW2WHC || + MatPermOper == KOP_MATPERM_CHW2WCH || MatPermOper == KOP_MATPERM_CHW2HCW)) + GenTilingError("CNN_3DTensorPermute_SQ8 Kernel: %s, MatTransOper should be KOP_MATPERM_CHW2CWH, KOP_MATPERM_CHW2HWC, KOP_MATPERM_CHW2WHC, KOP_MATPERM_CHW2WCH or KOP_MATPERM_CHW2HCW", Name); + char *MatPermKerName = CNN_FindMatchingKernel(MatPermOper, KOP_NONE, ParFeat, 1, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0, 0, 0, 0); + if (MatPermKerName==0) GenTilingError("CNN_3DTensorPermute_SQ8 Kernel: %s, Can't find a matching basic kernel for 3D Tensor Permutation", Name); + + LayerBandwidth += Feat*Width*Height*1; + LayerBandwidth += Feat*Width*Height*1; + if (Log) { + printf("CNN_MatPermute: %s %s\n", Name, ParFeat?"Par Feat":""); + printf("In => Feat: %4d, W: %4d, H: %4d\n", Feat, Width, Height); + printf("Out => Feat: %4d, W: %4d, H: %4d\n", Feat, Width, Height); + if (MatPermKerName) printf("%20s: %s\n", "MatPermKerName", MatPermKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + 
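+ /* Try a plain 2D tiled solution first with "no solution" errors disabled; if the tiler cannot find one, the code below falls back to a fully parametric 3D iteration space (D2: Feat, D1: Height, D0: Width) with the output dimensions reordered according to the requested permutation. */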
+ Object_T **PKerArgs = AllocateKerArgs(2); + PKerArgs[0] = KerArg("In", KerArgSpace(1,T0), O_IN|O_DB, Width, Height*Feat, 1, 0, 0, 0, "In"); + switch (MatPermOper) { + case KOP_MATPERM_CHW2CWH: + PKerArgs[1] = KerArg("Out", KerArgSpace(1,T0), O_OUT|O_DB, Width*Height, Feat, 1, 0, OBJ_CONSTRAINTS_TILE_VER, 0, "Out"); + break; + case KOP_MATPERM_CHW2HWC: + PKerArgs[1] = KerArg("Out", KerArgSpace(1,T0), O_OUT|O_DB, Width*Feat, Height, 1, 0, OBJ_CONSTRAINTS_TILE_VER, 0, "Out"); + break; + case KOP_MATPERM_CHW2WHC: + PKerArgs[1] = KerArg("Out", KerArgSpace(1,T0), O_OUT|O_DB, Height*Feat, Width, 1, 0, OBJ_CONSTRAINTS_TILE_HOR, 0, "Out"); + break; + case KOP_MATPERM_CHW2WCH: + PKerArgs[1] = KerArg("Out", KerArgSpace(1,T0), O_OUT|O_DB, Height*Feat, Width, 1, 0, OBJ_CONSTRAINTS_TILE_HOR, 0, "Out"); + break; + case KOP_MATPERM_CHW2HCW: + PKerArgs[1] = KerArg("Out", KerArgSpace(1,T0), O_OUT|O_DB, Width, Height*Feat, 1, 0, OBJ_CONSTRAINTS_TILE_VER, 0, "Out"); + break; + } + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Kernel_T *Kernel = UserKernel(Name, + KernelIterSpace(1, IterTiledSpace(T0)), + TILE_VER, + CArgs(2, TCArg(CNN_ArgDataType(1,1,1), "In"), TCArg(CNN_ArgDataType(1,1,1), "Out")), + Calls(1, + Call(MatPermKerName, LOC_LOOP, + Bindings(7, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + Imm(Feat), /* Number of Channels */ + K_Arg("In", KER_ARG_TILE_W), /* Input tile width */ + Imm(Height), /* Input tile height */ + AT_IGNORE_ARG_BINDING, /* StrideX */ + AT_IGNORE_ARG_BINDING /* StrideY */ + ) + ) + ), + PKerArgs + ); + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + if (Kernel==0) { + printf("NEW SCHEME\n"); + PKerArgs[0] = KerArg("In", KerArgSpace(3,D2,D1,D0), O_IN|O_DB, 1, 1, 1, 0, 0, 0, "In"); + switch (MatPermOper) { + case KOP_MATPERM_CHW2CWH: + PKerArgs[1] = KerArg("Out", KerArgSpace(3,D2,D0,D1), O_OUT|O_DB, 1, 1, 1, 0, 0, 0, "Out"); + break; + case KOP_MATPERM_CHW2HWC: + PKerArgs[1] = KerArg("Out", KerArgSpace(3,D1,D0,D2), O_OUT|O_DB, 1, 1, 1, 0, 0, 0, "Out"); + break; + case KOP_MATPERM_CHW2WHC: + PKerArgs[1] = KerArg("Out", KerArgSpace(3,D0,D1,D2), O_OUT|O_DB, 1, 1, 1, 0, 0, 0, "Out"); + break; + case KOP_MATPERM_CHW2WCH: + PKerArgs[1] = KerArg("Out", KerArgSpace(3,D0,D2,D1), O_OUT|O_DB, 1, 1, 1, 0, 0, 0, "Out"); + break; + case KOP_MATPERM_CHW2HCW: + PKerArgs[1] = KerArg("Out", KerArgSpace(3,D1,D2,D0), O_OUT|O_DB, 1, 1, 1, 0, 0, 0, "Out"); + break; + } + Kernel = UserKernel(Name, + KernelIterSpace(3, IterParSpace(D2, Feat, 1), IterParSpace(D1, Height, 1), IterParSpace(D0, Width, Width)), + TILE_HOR, + CArgs(2, TCArg(CNN_ArgDataType(1,1,1), "In"), TCArg(CNN_ArgDataType(1,1,1), "Out")), + Calls(1, + Call(MatPermKerName, LOC_LOOP, + Bindings(7, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_ArgPar("In", KER_ARG_PARTILE_SIZE, D2), /* Number of Channels */ + K_Arg("In", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("In", KER_ARG_TILE_H), /* Input tile height */ + AT_IGNORE_ARG_BINDING, /* StrideX */ + AT_IGNORE_ARG_BINDING /* StrideY */ + ) + ) + ), + PKerArgs + ); + } + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelArgDim(Name, "In", 4, Feat, Height, Width, 1); + switch (MatPermOper) { + case KOP_MATPERM_CHW2CWH: + AddKernelArgDim(Name, "Out", 4, Feat, Width, Height, 1); + break; + case KOP_MATPERM_CHW2HWC: + AddKernelArgDim(Name, "Out", 4, Height, Width, Feat, 
1); + break; + case KOP_MATPERM_CHW2WHC: + AddKernelArgDim(Name, "Out", 4, Width, Height, Feat, 1); + break; + case KOP_MATPERM_CHW2WCH: + AddKernelArgDim(Name, "Out", 4, Width, Feat, Height, 1); + break; + case KOP_MATPERM_CHW2HCW: + AddKernelArgDim(Name, "Out", 4, Height, Feat, Width, 1); + break; + } + AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, MatPermOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, 0); + } + return (Kernel!=0); +} diff --git a/tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.h b/tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.h new file mode 100644 index 000000000..4bbba9eb5 --- /dev/null +++ b/tools/autotiler_v3/generators/CNN/CNN_Generators_SQ8.h @@ -0,0 +1,605 @@ +#ifndef __CNN_GENERATORS_SQ8_H__ +#define __CNN_GENERATORS_SQ8_H__ +#include +#include "AutoTilerLib.h" + +void LoadCNN_SQ8_Library(); + +/********************************************************************************************************************************************************************* + Generator for Convolutions with channel centric scaling, followed by an optional pooling (Max or Average), + followed by an optional Activation. + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features, Use HWCE), Def=(TILE_HOR, 1, 0) + + Bias_DataSize: 1: byte, 2: half word, 4: word + Scale_DataSize: 1: byte, 2: half word, 4: word + + InFeat: Number of input feature's maps + OutFeat: Number of output feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + ConvOper: Type of convolution, Regular convolution: KOP_CONV, Depth wise convolution: KOP_CONV_DW + Fcx: Convolution filter x dimension + Fcy: Convolution filter y dimension + Dcx: Convolution filter dilation factor, x dimension + Dcy: Convolution filter dilation factor, y dimension + Scx: Convolution filter stride x dimension + Scy: Convolution filter stride y dimension + ConvPad: 0: No padding, 1: Zero padding + + PoolOper: Type of Pooling, KOP_NONE, Max Pooling: KOP_MAXPOOL, Average Pooling: KOP_AVGPOOL + Fpx: Pooling filter x dimension + Fpy: Pooling filter y dimension + Dpx: Pooling filter dilation factor, x dimension + Dpy: Pooling filter dilation factor, y dimension + Spx: Pooling filter stride x dimension + Spy: Pooling filter stride y dimension + PoolPad: 0: No padding, 1: Zero padding + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) + + CNN_ConvolutionPoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_ConvolutionPoolAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int InFeat, + int OutFeat, + int Width, + int Height, + + KernelOper_T ConvOper, + int Fcx, + int Fcy, + int Dcx, + int Dcy, + int Scx, + int Scy, + int ConvPad, + + KernelOper_T PoolOper, + int Fpx, + int Fpy, + int Dpx, + int Dpy, + int Spx, + int Spy, + int PoolPad, + + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Grouped Convolutions with channel centric scaling, followed by an optional pooling (Max or Average), + followed by an optional 
activation. + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features, Use double precision convolution, Use HWCE), Def=(TILE_HOR, 1, 0, 0) + + GroupIn: Size of the group for input features + GroupOut: Size of the group for output features + + Bias_DataSize: 1: byte, 2: half word, 4: word + Scale_DataSize: 1: byte, 2: half word, 4: word + + InFeat: Number of input feature's maps + OutFeat: Number of output feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + ConvOper: Type of convolution, Regular convolution: KOP_CONV, DepthWise convolution: KOP_CONV_DW + Fcx: Convolution filter x dimension + Fcy: Convolution filter y dimension + Dcx: Convolution filter dilation factor, x dimension + Dcy: Convolution filter dilation factor, y dimension + Scx: Convolution filter stride x dimension + Scy: Convolution filter stride y dimension + ConvPad: 0: No padding, 1: Zero padding + + PoolOper: Type of Pooling, KOP_NONE, Max Pooling: KOP_MAXPOOL, Average Pooling: KOP_AVGPOOL + Fpx: Pooling filter x dimension + Fpy: Pooling filter y dimension + Dpx: Pooling filter dilation factor, x dimension + Dpy: Pooling filter dilation factor, y dimension + Spx: Pooling filter stride x dimension + Spy: Pooling filter stride y dimension + PoolPad: 0: No padding, 1: Zero padding + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) + + CNN_GroupedConvolutionPoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_GroupedConvolutionPoolAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int GroupIn, + int GroupOut, + + int Bias_DataSize, + int Scale_DataSize, + + int InFeat, + int OutFeat, + int Width, + int Height, + + KernelOper_T ConvOper, + int Fcx, + int Fcy, + int Dcx, + int Dcy, + int Scx, + int Scy, + int ConvPad, + + KernelOper_T PoolOper, + int Fpx, + int Fpy, + int Dpx, + int Dpy, + int Spx, + int Spy, + int PoolPad, + + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Pooling (Max or Average) with tensor centric scaling followed by an optional activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + PoolOper: KOP_MAXPOOL or KOP_AVGPOOL + Fpx: Size of the pooling filter, x dimension + Fpy: Size of the pooling filter, y dimension + Dpx: Dilation factor, x dimension + Dpy: Dilation factor, y dimension + Spx: Pooling stride, x dimension + Spy: Pooling stride, y dimension + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Out, Infos) + + CNN_PoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_PoolAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + 
+ KernelOper_T PoolOper, + int Fpx, + int Fpy, + int Dpx, + int Dpy, + int Spx, + int Spy, + int PoolPad, + + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Activation with tensor centric scaling + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + ActOper: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Out, Infos) + + CNN_Act_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_Act_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T ActOper + ); + + +/********************************************************************************************************************************************************************* + Generator for Global Pooling (Max or Average) with tensor centric scaling and optional activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of feature's maps + Width: Number of columns of a given feature map + Height: Number of lines of a given feature map + + PoolOper: KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Out, Infos) + + + CNN_GlobalPoolAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_GlobalPoolAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T PoolOper, + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Linear layers followed with channel centric scaling followed by an optional activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 0) + + Bias_DataSize: 1: byte, 2: half word, 4: word + Scale_DataSize: 1: byte, 2: half word, 4: word + + InDim: Number of inputs + OutDim: Number of outputs + + LinearOper KOP_LINEAR + + ActOper Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) + + CNN_LinearAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_LinearAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int InDim, + int OutDim, + + KernelOper_T LinearOper, + KernelOper_T ActOper + ); + 
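For context, the generators declared in this header are called from a host-side AutoTiler model program, not from the GAP application itself. Below is a minimal sketch of such a model file, assuming the usual TilerParseOptions()/GenerateTilingCode() entry points from AutoTilerLib.h and omitting L1/L2 memory and output-file configuration; the layer name and the 128x10 dimensions are purely illustrative.

    #include "AutoTilerLib.h"
    #include "CNN_Generators_SQ8.h"

    int main(int argc, char **argv)
    {
            /* Parse the standard AutoTiler command line options (output directory, memory budgets, ...) */
            if (TilerParseOptions(argc, argv)) return 1;
            /* Register the SQ8 basic kernels used by the generators declared in this header */
            LoadCNN_SQ8_Library();
            /* One fully connected layer: 128 inputs -> 10 outputs, byte Bias and Scale, KOP_LINEAR followed by KOP_RELU */
            CNN_LinearAct_SQ8("Dense10", 0, 1, 1, 128, 10, KOP_LINEAR, KOP_RELU);
            /* Emit the generated user kernel C code */
            GenerateTilingCode();
            return 0;
    }
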
+/********************************************************************************************************************************************************************* + Generator for SoftMax layers, no scaling + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options (TileOrientation), Def=(TILE_HOR) + + Dim: Number of inputs + + SoftMaxOper: Should always be KOP_SOFTMAX + + Signature: Name(In, Out, Infos) + + CNN_SoftMax_SQ8 + Input and output are assumed to fit within given shared L1 memory. Dim is partitioned into subsets of inputs and each subset is given to + a different core. By definition Output contains values in the [0.0 .. 1.0] range with sum(Output)=1.0. Results are always represented in Q15 +*********************************************************************************************************************************************************************/ + +int CNN_SoftMax_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Dim, + + KernelOper_T SoftMaxOper + ); + +/********************************************************************************************************************************************************************* + Generator for Matrix Addition layers with input scale adjustment (tensor centric), output scaling (tensor centric) and optional activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat: Number of features + Width: Width of a given feature + Height: Height of a given feature + + AddMatOper: Should always be KOP_MATADD + ActOper: Optional activation + + Signature: Name(In1, In2, Out, Infos) + + CNN_MatAddAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatAddAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T AddMatOper, + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Tensor [CxHxW] by Vector [C] product with tensor centric scaling and optional Activation + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + InFeat: Number of input features + Width: Width of In1 + Height: Height of In1 + + MatOper: KOP_MATVECTMUL + + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + + Signature: Name(In1, In2, Out, Infos) + + CNN_TensorVectMultAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_TensorVectMultAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T MatOper, + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Matrix Multiplication layers with channel centric scaling followed by an optional Activation.
+ + Can be used for 1x1 convolutions with Filters in In1 [OutFeat x InFeat] and Features in In2 [InFeat x W*H] + When non-unit strides are used they apply to In2, produced output is [OutFeat x Floor((W+Scx-1)/Scx)*Floor((H+Scy-1)/Scy)] + Bias [OutFeat x 1] is added to each individual feature + Line x Col sum of products are evaluated on 32 bits therefore, when used for 1x1 convolution, this generator is equivalent to KOP_CONV_DP + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options + + Bias_DataSize: 1: byte, 2: half word, + Scale_DataSize: 1: byte, 2: half word, 4: word + + ColM1: Number of columns for matrix In1, for 1x1 convolution this is InFeat + LineM1: Number of lines for matrix In1, for 1x1 convolution this is OutFeat + ColM2: Number of columns for matrix In2, for 1x1 convolution this is W*H + LineM2: Number of lines for matrix In2, for 1x1 convolution this is InFeat + + Width For 1x1 convolution, width of an input feature map + Height For 1x1 convolution, height of an input feature map + Scx: stride x dimension for In2 + Scy: stride y dimension for In2 + + MatMulOper: Should always be KOP_MATMUL + + ActOper: Optional Activation (KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + + Signature: Name(In2, In1, Bias, Out, Scale, ScaleN, Infos) + + CNN_MatMulAct_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatMulAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int Width, + int Height, + int Scx, + int Scy, + + KernelOper_T MatMulOper, + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Matrix Multiplication layers with channel centric scaling followed by an optional Activation. + Special form to handle small form factor In1 (InFeat x OutFeat) + + Can be used for 1x1 convolutions with Filters in In1 [OutFeat x InFeat] and Features in In2 [InFeat x W*H] + When non-unit strides are used they apply to In2, produced output is [OutFeat x Floor((W+Scx-1)/Scx)*Floor((H+Scy-1)/Scy)] + Bias [OutFeat x 1] is added to each individual feature + Line x Col sum of products are evaluated on 32 bits therefore, when used for 1x1 convolution, this generator is equivalent to KOP_CONV_DP + This generator assumes that the whole In1 and Bias can be accommodated into shared L1.
Expecting to be used for up to 32 InFeat x 32 OutFeat + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options + + Bias_DataSize: 1: byte, 2: half word, + Scale_DataSize: 1: byte, 2: half word, 4: word + + ColM1: Number of columns for matrix In1, for 1x1 convolution this is InFeat + LineM1: Number of lines for matrix In1, for 1x1 convolution this is OutFeat + ColM2: Number of columns for matrix In2, for 1x1 convolution this is W*H + LineM2: Number of lines for matrix In2, for 1x1 convolution this is InFeat + + Width For 1x1 convolution, width of an input feature map + Height For 1x1 convolution, height of an input feature map + Scx: stride x dimension for In2 + Scy: stride y dimension for In2 + + MatMulOper Should always be KOP_MATMUL_SM1 + + ActOper Optional Activation (KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU) + + Signature: Name(In2, In1, Bias, Out, Scale, ScaleN, Infos) + + CNN_MatMulSmallM1Act_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatMulSmallM1Act_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int Width, + int Height, + int Scx, + int Scy, + + KernelOper_T MatMulOper, + KernelOper_T ActOper + ); + +/********************************************************************************************************************************************************************* + Generator for Matrix Transposition, no scaling + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat Number of matrices + Width For 1x1 convolution, width of an input feature map + Height For 1x1 convolution, height of an input feature map + + MatTransOper KOP_MATTRANSP + + Signature: Name(In, Out) + + CNN_MatTranspose_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_MatTranspose_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T MatTransOper + ); + +/********************************************************************************************************************************************************************* + Generator for 3D Tensor permutations: CHW => {CWH, HWC, WHC, WCH, HCW}, no scaling + + Template: + Name: Name of the generated user kernel + + Ctrl: Override generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + + Feat Number of channels of the tensor + Width Tensor width + Height Tensor height + + MatPermOper Permutation oper: KOP_MATPERM_CHW2CWH, KOP_MATPERM_CHW2HWC, KOP_MATPERM_CHW2WHC, KOP_MATPERM_CHW2WCH, KOP_MATPERM_CHW2HCW + + Signature: Name(In, Out) + + CNN_3DTensorPermute_SQ8 + +*********************************************************************************************************************************************************************/ + +int CNN_3DTensorPermute_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Feat, + int Width, + int Height, + + KernelOper_T MatPermOper + ); + +#endif diff --git a/tools/autotiler_v3/generators/CNN/CNN_MatAlgebra.c b/tools/autotiler_v3/generators/CNN/CNN_MatAlgebra.c index fdd5de34f..fd93c6ce3 100644 ---
a/tools/autotiler_v3/generators/CNN/CNN_MatAlgebra.c +++ b/tools/autotiler_v3/generators/CNN/CNN_MatAlgebra.c @@ -30,6 +30,70 @@ static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int return Chunk; } +#if 0 +void KerParMatAddScaleScalar_fps(KerMatAddScale_fps_T *Arg) + +{ + signed char * __restrict__ In1 = Arg->In1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int MI1 = *Arg->MulBiasIn1; + int MOut = *Arg->MulBiasOut; + int NormIn = Arg->NormIn; + int NormOut = Arg->NormOut; + int W = Arg->W; + int H = Arg->H; + int LB = Arg->LB; + int UB = Arg->UB; + + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(Arg->N); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Arg->N); + int i, j; + + for (i=First; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int pMI1 = *Arg->MulBiasIn1; + int pMOut = *Arg->MulBiasOut; + int NormIn = Arg->NormIn; + int NormOut = Arg->NormOut; + int W = Arg->W; + int H = Arg->H; + int LB = Arg->LB; + int UB = Arg->UB; + + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(Arg->N); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Arg->N); + int i, j; + + for (i=First; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *)Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = 
Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - - for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; - When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)] -*/ - short int * __restrict__ In1 = Arg->In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = 
Arg->Sx, Sy = Arg->Sy; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; int At, F=0, L = W_In2; unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; + v4s *VBuff = (v4s *) BufferColIn2; unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(H_In1); unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + while (L>0) { for (i=Fi;iIn1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned int OutFirstCol = Arg->OutFirstCol; @@ -794,7 +813,7 @@ static void KerParMatMul_4Col_fps(KerMatMul_fps_T *Arg) gap_waitbarrier(0); } if (W_In2&0x2) { - Col = W_In2/4; + Col = W_In2/2 - 1; for (i=F;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColH_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; @@ 
-988,9 +904,8 @@ void KerParMatMulSxSy_fps(KerMatMul_fps_T *Arg) unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; - + At=0; OffLine=0; Oo=0; + if (ColFirst) OffLine=Pi; else Oo=Pi; while (L>0) { for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; + int Acc; + switch (Oper) { + case 0: /* Channel Scale */ + Acc = Min(Max(AT_NORM(AT_NORM(P, Norm)*M, Norm), LB), UB); + break; + case 1: /* H Sigmoid */ + /* C1 = (1<<(2*Norm))-1; C2 = (1<>1; + Acc = gap_max(0, gap_min(C2, AT_NORM(Acc, Norm))); + break; + case 2: /* H Swish */ + /* C1 = 3<<(2*Norm); C2 = (1<<16)/6; 1/6 in Q16 */ + Acc = AT_NORM(AT_NORM(AT_NORM(gap_min(gap_max(P + C1, 0), UB), Norm) * P, Norm) * C2, 16); + break; + default: + Acc = Min(Max(AT_NORM(P, Norm), LB), UB); - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; + } + return Acc; +} - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); +static inline int __attribute__((always_inline)) MatMulReduct_fps(int P, int C1, int C2, int M, int Norm, int LB, int UB, int Oper) - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; +{ + int Acc; + switch (Oper) { + case 0: /* Channel Scale */ + /* Norm = 2*Norm */ + Acc = Min(Max(AT_NORM(P*M, Norm), LB), UB); + break; + case 1: /* H Sigmoid */ + /* C1 = (1<<(2*Norm))-1; C2 = (1<>1; + Acc = gap_max(0, gap_min(C2, AT_NORM(Acc, Norm))); + break; + case 2: /* H Swish */ + /* Norm = 2*Norm; C1 = 3<<(2*Norm); C2 = (1<<16)/6; 1/6 in Q16 */ + Acc = AT_NORM(AT_NORM(gap_min(gap_max(P + C1, 0), UB) * P, Norm) * C2, 16); + break; + default: + Acc = Min(Max(AT_NORM(P, Norm), LB), UB); - while (L>0) { - for (i=Fi;iIn1; + short int * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; + short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; + short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; + unsigned int NormMulBias = Arg->NormMulBias; int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; + int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; unsigned 
int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; + v2s *VBuff = (v2s *) BufferColIn2; unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(H_In1); @@ -1120,44 +1018,114 @@ void KerParMatMul_fp_fps(KerMatMul_fp_fps_T *Arg) for (Col=0; ColIn1; +/* + In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat] + In2 is [InFeat][Width*Height] + + When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)] +*/ + short int * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; + short int * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + short int *BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + unsigned int NormMulBias = Arg->NormMulBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; + int M = *Arg->MulBias; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v2s *VBuff = (v2s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At=0; OffLine=0; Oo=0; + if (ColFirst) OffLine=Pi; else Oo=Pi; + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; + short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + unsigned int NormMulBias = Arg->NormMulBias; + int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; + int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; + v2s *VBuff = (v2s *) BufferColIn2; unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(H_In1); @@ -1166,26 +1134,25 @@ void KerParMatMul_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg) int OffLine = 0, OffCol = 0; if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + short int * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = 
Arg->BufferColIn2; + short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; + unsigned int NormMulBias = Arg->NormMulBias; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; + int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -1219,7 +1188,7 @@ void KerParMatMulSxSy_fp_fps(KerMatMul_fp_fps_T *Arg) int At, F=0, L = W_In2; unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; + v2s *VBuff = (v2s *) BufferColIn2; unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(H_In1); @@ -1232,15 +1201,14 @@ void KerParMatMulSxSy_fp_fps(KerMatMul_fp_fps_T *Arg) for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; + /* + Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2 + */ + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + unsigned int Norm = Arg->Norm+Arg->NormMulBias; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; + int M = *Arg->MulBias; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; signed char *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; + unsigned int NormT = Arg->Norm+Arg->NormMulBias; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; + int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -1295,22 +1389,22 @@ void KerParMatMulSxSy_ReLUN_Vector_fp_fps(KerMatMul_fp_fps_T *Arg) unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + while (L>0) { for (i=Fi;i>1; - Acc = gap_max(0, gap_min(C2, AT_NORM(Acc, Norm))); - break; - case 2: /* H Swish */ - /* C1 = 3<<(2*Norm); C2 = (1<<16)/6; 1/6 in Q16 */ - Acc = 
AT_NORM(AT_NORM(AT_NORM(gap_min(gap_max(P + C1, 0), UB), Norm) * P, Norm) * C2, 16); - break; - default: - Acc = Min(Max(AT_NORM(P, Norm), LB), UB); - - } - return Acc; -} - -static inline int __attribute__((always_inline)) MatMulReduct_fps(int P, int C1, int C2, int M, int Norm, int LB, int UB, int Oper) - -{ - int Acc; - switch (Oper) { - case 0: /* Channel Scale */ - /* Norm = 2*Norm */ - Acc = Min(Max(AT_NORM(P*M, Norm), LB), UB); - break; - case 1: /* H Sigmoid */ - /* C1 = (1<<(2*Norm))-1; C2 = (1<>1; - Acc = gap_max(0, gap_min(C2, AT_NORM(Acc, Norm))); - break; - case 2: /* H Swish */ - /* Norm = 2*Norm; C1 = 3<<(2*Norm); C2 = (1<<16)/6; 1/6 in Q16 */ - Acc = AT_NORM(AT_NORM(gap_min(gap_max(P + C1, 0), UB) * P, Norm) * C2, 16); - break; - default: - Acc = Min(Max(AT_NORM(P, Norm), LB), UB); - - } - return Acc; -} - -/* Matrix multiplication with output scaling by the same scalar for all channels */ - -void KerParMatMulScaleScalar_fp(KerMatMul_fp_T *Arg) +void KerParMatMulScaleScalar_fp_fps(KerMatMul_fp_fps_T *Arg) { - short int * __restrict__ In1 = Arg->In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; + /* + Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2 + */ + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; + unsigned int Norm = Arg->Norm+Arg->NormMulBias; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; int M = *Arg->MulBias; - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s 
*VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; + signed char * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; + signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; + signed char *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; + unsigned int NormT = Norm+Arg->NormMulBias; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; int LB = Arg->LB, UB = Arg->UB; @@ -1513,7 +1578,7 @@ void KerParMatMulScaleScalarSxSy_fp(KerMatMul_fp_T *Arg) int At, F=0, L = W_In2; unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; + v4s *VBuff = (v4s *) BufferColIn2; unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(H_In1); @@ -1526,14 +1591,15 @@ void KerParMatMulScaleScalarSxSy_fp(KerMatMul_fp_T *Arg) for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + short int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ MulBias = Arg->MulBias; + short int * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + short int *BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + unsigned int NormMulBias = Arg->NormMulBias; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v2s *VBuff = (v2s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn2; unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ short int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ MulBias = Arg->MulBias; short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; @@ -1570,10 +1690,8 @@ void KerParMatMulScaleScalarSxSy_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) unsigned int NormMulBias = Arg->NormMulBias; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -1598,12 +1716,12 @@ void KerParMatMulScaleScalarSxSy_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) for (Line=First; LineIn1; @@ -1626,6 +1744,7 @@ void 
KerParMatMulScaleScalar_fpd_fp(KerMatMul_fpd_fp_T *Arg) short int * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ int * __restrict__ Bias = Arg->Bias; + int * __restrict__ MulBias = Arg->MulBias; short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned int OutFirstCol = Arg->OutFirstCol; @@ -1635,7 +1754,6 @@ void KerParMatMulScaleScalar_fpd_fp(KerMatMul_fpd_fp_T *Arg) unsigned int NormMulBias = Arg->NormMulBias; int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -1661,120 +1779,70 @@ void KerParMatMulScaleScalar_fpd_fp(KerMatMul_fpd_fp_T *Arg) S = gap_sumdotp2(VIn1[2*i+1], VBuff[2*i+1], S); } for (i=(W_In1/4)*4; iIn1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; short int * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ int * __restrict__ Bias = Arg->Bias; + int * __restrict__ MulBias = Arg->MulBias; short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; + int Pi = Arg->OutFirstCol; short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; unsigned int NormMulBias = Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + unsigned int Line, Col, i; v2s *VBuff = (v2s *) BufferColIn2; unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(H_In1); unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;i0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = 
Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ MulBias = Arg->MulBias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + unsigned int Norm = Arg->Norm+Arg->NormMulBias; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + signed char * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ MulBias = Arg->MulBias; + signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; + signed char *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; + unsigned int NormT = Norm+Arg->NormMulBias; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -1825,28 +2020,30 @@ void KerParMatMulScaleScalarSxSy_ReLUN_Vector_fpd_fp(KerMatMul_fpd_fp_T *Arg) int At, F=0, L = W_In2; unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; + v4s *VBuff = (v4s *) BufferColIn2; unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(H_In1); unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + while (L>0) { for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; + /* + Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2 + */ + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char 
* __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ MulBias = Arg->MulBias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Arg->Norm+Arg->NormMulBias; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; + unsigned int Norm = Arg->Norm+Arg->NormMulBias; + int LB = Arg->LB, UB = Arg->UB; + int ColFirst = Arg->ColFirst; - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Arg->Norm+Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColH_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ MulBias = Arg->MulBias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; signed char *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Arg->Norm+Arg->NormMulBias; + unsigned int NormT = Norm+Arg->NormMulBias; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -2004,9 +2227,8 @@ void KerParMatMulScaleScalarSxSy_fps(KerMatMul_fps_T *Arg) unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - At = 0; OffLine = 0; Oo = 0; - if 
(ColFirst) OffLine = Pi; else Oo = Pi; - + At=0; OffLine=0; Oo=0; + if (ColFirst) OffLine=Pi; else Oo=Pi; while (L>0) { for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + short int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + short int *BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + int NormOut = Arg->UB; + int ColFirst = Arg->ColFirst; + int C1 = 3<In1; + short int * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + short int * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; + short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Arg->Norm+Arg->NormMulBias; + int NormOut = Arg->UB; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; + int C1 = 3<0) { for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + int NormOut = Arg->UB; + int ColFirst = Arg->ColFirst; + int C1 = 3<In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; + int Pi = Arg->OutFirstCol; signed char *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int LB = Arg->LB, UB = Arg->UB; + int NormOut = Arg->UB; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; + int C1 = 3<0) { + for (i=Fi;iIn1; + short int * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; + short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; + short int 
*BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int NormOut = Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; + int C1 = 3<In1; + short int * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; + short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; + short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; + int NormOut = Arg->UB; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; + int C1 = 3<In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; + /* + Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2 + */ + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + int NormOut = Arg->UB; + int ColFirst = Arg->ColFirst; + int C1 = 3<In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; signed char *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; + int NormOut = Arg->UB; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; int ColFirst = Arg->ColFirst; - int M = *Arg->MulBias; + int C1 = 3<0) { for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned 
int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; @@ -2411,16 +2916,12 @@ void KerParMatMulScale_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) short int * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned int OutFirstCol = Arg->OutFirstCol; short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; int ColFirst = Arg->ColFirst; unsigned int H_In2 = W_In1; @@ -2438,24 +2939,27 @@ void KerParMatMulScale_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) for (Col=0; Col=0); + int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); + int Acc0 = gap_clip(Neg*Input1+Pos*Input, 15); + Out[(Line+OffLine)*W_Out+Col+OffCol] = Acc0; + } } gap_waitbarrier(0); } } -void KerParMatMulScaleSxSy_fp(KerMatMul_fp_T *Arg) +void KerParMatMulLeakyreluSxSy_fp(KerMatMul_fp_T *Arg) { /* @@ -2470,17 +2974,14 @@ void KerParMatMulScaleSxSy_fp(KerMatMul_fp_T *Arg) short int * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; short int * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; short int *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; int Wi = Arg->W, Hi = Arg->H; int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB, UB = Arg->UB; int ColFirst = Arg->ColFirst; unsigned int H_In2 = W_In1; @@ -2511,8 +3012,13 @@ void KerParMatMulScaleSxSy_fp(KerMatMul_fp_T *Arg) S = gap_sumdotp2(VIn1[2*i+1], VBuff[2*i+1], S); } for (i=(W_In1/4)*4; i=0); + int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); + int Acc0 = gap_clip(Neg*Input1+Pos*Input, 15); + Out[(Line+OffLine)*W_Out+Oo] = Acc0; + } } int nF = F+Sx; if (nFIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - 
unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - int * __restrict__ MulBias = Arg->MulBias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - int * __restrict__ MulBias = Arg->MulBias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - int * __restrict__ MulBias = Arg->MulBias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); 
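/* All of these parallel kernels split their work the same way: each cluster core takes a
   contiguous chunk of the H_In1 output lines (and, for the In2 column copy, of the H_In2
   input rows). A minimal sketch of that split, assuming ChunkSize(N) rounds N up over the
   number of active cores and that Min()/gap_coreid() come from the usual kernel headers;
   the helper name is illustrative only: */
static inline void KerChunk(unsigned int N, unsigned int *First, unsigned int *Last)

{
	unsigned int CoreId = gap_coreid();
	unsigned int Chunk = ChunkSize(N);	/* ceiling of N divided by the active core count */

	*First = CoreId*Chunk;			/* this core's first line */
	*Last  = Min(N, *First+Chunk);		/* clamped, so trailing cores may get a short or empty range */
}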
- - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - int * __restrict__ Bias = Arg->Bias; - int * __restrict__ MulBias = Arg->MulBias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormMulBias = Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for 
(Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; - - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; - - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else 
OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB, UB = Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - unsigned int NormT = Norm+Arg->NormMulBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At=0; OffLine=0; Oo=0; - if (ColFirst) 
OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int ColFirst = Arg->ColFirst; - int C1 = 3<In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - int C1 = 3<0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int ColFirst = Arg->ColFirst; - int C1 = 3<In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - int C1 = 3<0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int ColFirst = Arg->ColFirst; - int C1 = 3<In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - int C1 = 3<0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int 
H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int ColFirst = Arg->ColFirst; - int C1 = 3<In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int NormOut = Arg->UB; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - int C1 = 3<0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; Col=0); - int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); - int Acc0 = gap_clip(Neg*Input1+Pos*Input, 15); - Out[(Line+OffLine)*W_Out+Col+OffCol] = Acc0; - } - } - gap_waitbarrier(0); - } -} - -void KerParMatMulLeakyreluSxSy_fp(KerMatMul_fp_T *Arg) - -{ -/* - In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat] - In2 is [InFeat][Width*Height] - - When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)] -*/ - short int * __restrict__ In1 = Arg->In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - short int *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v2s *VBuff = (v2s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li 
= Min(H_In2, Fi+Ci); - - At=0; OffLine=0; Oo=0; - if (ColFirst) OffLine=Pi; else Oo=Pi; - while (L>0) { - for (i=Fi;i=0); - int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); - int Acc0 = gap_clip(Neg*Input1+Pos*Input, 15); - Out[(Line+OffLine)*W_Out+Oo] = Acc0; - } - } - int nF = F+Sx; - if (nFIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; Col=0); - int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); - int Acc0 = gap_clip(Neg*Input1+Pos*Input, 7); - Out[(Line+OffLine)*W_Out+Col+OffCol] = Acc0; - } - } - gap_waitbarrier(0); - } -} - -void KerParMatMulLeakyreluSxSy_fps(KerMatMul_fps_T *Arg) - -{ -/* - In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat] - In2 is [InFeat][Width*Height] - - When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)] -*/ - signed char * __restrict__ In1 = Arg->In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int Norm = Arg->Norm; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(H_In1); - unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; - - while (L>0) { - for (i=Fi;i=0); - int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); - int Acc0 = gap_clip(Neg*Input1+Pos*Input, 7); - Out[(Line+OffLine)*W_Out+Oo] = Acc0; - } - } - int nF = F+Sx; - if (nFIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + unsigned int Norm = Arg->Norm; + unsigned int NormBias = Arg->NormBias; + int 
ColFirst = Arg->ColFirst; + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); -static void KerParMatMulSmallFeat_Bis_fps(KerMatMul_fps_T *Arg) + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; -{ - signed char * __restrict__ In1 = Arg->In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - // for (int l2=0; l2=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S0 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S1, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S1 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S2, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S2 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S3, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S3 = gap_clip(Neg*Input1+Pos*Input, 7); } - Out[l1*H_In2 + l2+0] = gap_min(UB, gap_max(AT_NORM(Acc, Norm), LB)); + v4s R = gap_pack4(S0, S1, S2, S3); + *((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R; + } + gap_waitbarrier(0); + } + if (W_In2&0x2) { + Col = W_In2/2 - 1; + for (i=F;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - for (int l2=First; l2In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - int Off1 = W_In1-1; - for (int l2=First; l2=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S0 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S1, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S1 = gap_clip(Neg*Input1+Pos*Input, 7); } - char *p1 = In1+l1*W_In1+Off1, *p2 = In2+l2*W_In2+Off1; 
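/* The Leakyrelu kernels reduce every accumulator with the fixed-point pattern used just
   above: normalize by Norm, scale the negative branch by LEAK_CONSTANT expressed in
   Q(LEAK_CONSTANT_FORMAT), then clip to the output container. A minimal sketch for the
   8-bit (fps) case, assuming the AT_NORM/gap_clip/LEAK_CONSTANT definitions from
   CNN_BasicKernels.h; the helper name is illustrative only: */
static inline signed char KerLeakyReluReduct_fps(int Acc, unsigned int Norm)

{
	int Input  = AT_NORM(Acc, Norm);				/* back to the output fixed-point format */
	int Neg    = (Input<0), Pos = (Input>=0);
	int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT);	/* negative-side slope */

	return gap_clip(Neg*Input1+Pos*Input, 7);			/* 15 instead of 7 in the 16-bit (fp) kernels */
}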
- Acc += *p1-- * *p2--; - Out[l1*H_In2 + l2] = gap_min(UB, gap_max(AT_NORM(Acc, Norm), LB)); - } + Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = S0; + Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = S1; + } + gap_waitbarrier(0); } -} - -static void KerParMatMulSmallFeat4NP2_fps(KerMatMul_fps_T *Arg) - -{ - signed char * __restrict__ In1 = Arg->In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - int Off1 = W_In1-1; - for (int l2=First; l2In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - int Off1 = W_In1-1; - for (int l2=First; l2=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S0 = gap_clip(Neg*Input1+Pos*Input, 7); + } + Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = S0; + } + gap_waitbarrier(0); } } -void KerParMatMulSmallFeat_fps(KerMatMul_fps_T *Arg) +void KerParMatMulLeakyreluSxSy_fps(KerMatMul_fps_T *Arg) { - int OPT = 1; - - if (OPT) KerParMatMulSmallFeat_Bis_fps(Arg); - else { - int Sel = Arg->W_In1&0x3; - switch (Sel) { - case 0: KerParMatMulSmallFeat4N_fps(Arg); break; - case 1: KerParMatMulSmallFeat4NP1_fps(Arg); break; - case 2: KerParMatMulSmallFeat4NP2_fps(Arg); break; - default: KerParMatMulSmallFeat4NP3_fps(Arg); break; - } - gap_waitbarrier(0); - } -} - -static void KerParMatMulSmallFeat4N_ReLUN_Vector_fps(KerMatMul_fps_T *Arg) +/* + In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat] + In2 is [InFeat][Width*Height] -{ + When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)] +*/ signed char * __restrict__ In1 = Arg->In1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; + unsigned int W_In2 = Arg->W_In2; signed char * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int NormBias = Arg->NormBias; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; - int IterC = W_In1/4; - for (int l2=First; l20) { + for (i=Fi;i=0); + int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); + int Acc0 = gap_clip(Neg*Input1+Pos*Input, 7); + Out[(Line+OffLine)*W_Out+Oo] = Acc0; + } + } + int nF = F+Sx; + if (nFIn1; @@ -4391,116 +3259,82 @@ 
static void KerParMatMulSmallFeat4NP1_ReLUN_Vector_fps(KerMatMul_fps_T *Arg) unsigned int W_In2 = W_In1; signed char * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int LB = Arg->LB, UB = Arg->UB; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); - int IterC = W_In1/4; - int Off1 = W_In1-1; - for (int l2=First; l2In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - int Off1 = W_In1-1; - for (int l2=First; l2In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - int Off1 = W_In1-1; - for (int l2=First; l2W_In1&0x3; - switch (Sel) { - case 0: KerParMatMulSmallFeat4N_ReLUN_Vector_fps(Arg); break; - case 1: KerParMatMulSmallFeat4NP1_ReLUN_Vector_fps(Arg); break; - case 2: KerParMatMulSmallFeat4NP2_ReLUN_Vector_fps(Arg); break; - default: KerParMatMulSmallFeat4NP3_ReLUN_Vector_fps(Arg); break; - } gap_waitbarrier(0); } @@ -4524,31 +3358,79 @@ void KerParMatMulHswishSmallFeat_fps(KerMatMul_fps_T *Arg) int UB = 6<1) { - Acc += *p1-- * *p2--; - if (Rem1==3) Acc += *p1-- * *p2--; - } + for (int c=(W_In1/4)*4; c1) { - Acc += *p1-- * *p2--; - if (Rem1==3) Acc += *p1-- * *p2--; - } + v4s R = gap_pack4(AT_NORM(gap_max(0, gap_min(AT_NORM(S0, Norm)+C1, UB))*C2, 15), + AT_NORM(gap_max(0, gap_min(AT_NORM(S1, Norm)+C1, UB))*C2, 15), + AT_NORM(gap_max(0, gap_min(AT_NORM(S2, Norm)+C1, UB))*C2, 15), + AT_NORM(gap_max(0, gap_min(AT_NORM(S3, Norm)+C1, UB))*C2, 15)); + *((v4s *) (Out+l1*H_In2 + l2)) = R; + } + } + if (Iter&0x2) { + int l2 = (4*(Iter/4)) + First; + v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2); + v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2); + for (int l1=0; l1Bias; signed char * __restrict__ Out = Arg->Out; + int LB = Arg->LB; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); - int IterC = W_In1/4; - int Off1 = W_In1-1; - int Rem1 = W_In1&0x3; - for (int l2=First; l21) { - Acc += *p1-- * *p2--; - if (Rem1==3) Acc += *p1-- * *p2--; - } + for (int c=(W_In1/4)*4; c=0); - int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); - Out[l1*H_In2 + l2] = gap_clip(Neg*Input1+Pos*Input, 15); + { + int Input, Neg, Pos, Input1; + Input = AT_NORM(S0, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = 
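/* Illustration, not part of the patch: the rewritten SmallFeat kernels compute each
   output as a dot product over W_In1 8-bit terms, consuming 4 terms per step with the
   gap_sumdotp4() builtin and finishing the remainder scalarly. Minimal sketch, assuming
   "Gap.h" provides v4s and gap_sumdotp4(). */
static inline int DotP_fps(signed char *A, signed char *B, unsigned int N, int Acc)
{
        v4s *Va = (v4s *) A, *Vb = (v4s *) B;
        for (unsigned int c=0; c<N/4; c++) Acc = gap_sumdotp4(Va[c], Vb[c], Acc);  /* 4 MACs per iteration */
        for (unsigned int c=(N/4)*4; c<N; c++) Acc += A[c]*B[c];                   /* 0..3 leftover terms  */
        return Acc;
}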
AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S0 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S1, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S1 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S2, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S2 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S3, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S3 = gap_clip(Neg*Input1+Pos*Input, 7); + } + v4s R = gap_pack4(S0, S1, S2, S3); + *((v4s *) (Out+l1*H_In2 + l2)) = R; + } + } + if (Iter&0x2) { + int l2 = (4*(Iter/4)) + First; + v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2); + v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2); + for (int l1=0; l1=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S0 = gap_clip(Neg*Input1+Pos*Input, 7); + Input = AT_NORM(S1, Norm); Neg = (Input<0); Pos = (Input>=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S1 = gap_clip(Neg*Input1+Pos*Input, 7); + } + Out[l1*H_In2 + l2+0] = S0; + Out[l1*H_In2 + l2+1] = S1; + } + } + if (Iter&0x1) { + int l2 = Last-1; + v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2); + for (int l1=0; l1=0); Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); S0 = gap_clip(Neg*Input1+Pos*Input, 7); + } + Out[l1*H_In2 + l2+0] = S0; } } + gap_waitbarrier(0); } void KerParMatMulScaleScalarSmallFeat_fps(KerMatMul_fps_T *Arg) @@ -4658,119 +3648,82 @@ void KerParMatMulScaleScalarSmallFeat_fps(KerMatMul_fps_T *Arg) unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm+Arg->NormMulBias; unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); - int IterC = W_In1/4; - int Off1 = W_In1-1; - int Rem1 = W_In1&0x3; - for (int l2=First; l21) { - Acc += *p1-- * *p2--; - if (Rem1==3) Acc += *p1-- * *p2--; - } + for (int c=(W_In1/4)*4; cIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - int M = *Arg->MulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm+Arg->NormMulBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - int Off1 = W_In1-1; - int Rem1 = W_In1&0x3; - for (int l2=First; l21) { - Acc += *p1-- * *p2--; - if (Rem1==3) Acc += *p1-- * *p2--; - } + for (int c=(W_In1/4)*4; cIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - signed char * __restrict__ MulBias = Arg->MulBias; - int LB = Arg->LB, UB = Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm+Arg->NormMulBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - - int IterC = W_In1/4; - int Off1 = W_In1-1; - int Rem1 = W_In1&0x3; - for (int l2=First; l21) { - Acc += *p1-- * *p2--; - if (Rem1==3) Acc += *p1-- * *p2--; - } + if (W_In1&0x4) S0 = 
gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0); + for (int c=(W_In1/4)*4; cIn1; @@ -4782,156 +3735,93 @@ void KerParMatMulScaleSmallFeat_ReLUN_Vector_fps(KerMatMul_fps_T *Arg) signed char * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; signed char * __restrict__ MulBias = Arg->MulBias; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int LB = Arg->LB, UB = Arg->UB; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm+Arg->NormMulBias; unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); - int IterC = W_In1/4; - int Off1 = W_In1-1; - int Rem1 = W_In1&0x3; - for (int l2=First; l21) { - Acc += *p1-- * *p2--; - if (Rem1==3) Acc += *p1-- * *p2--; - } + for (int c=(W_In1/4)*4; cIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - // for (int l2=0; l2In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - // for (int l2=0; l2In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - short int * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - unsigned int NormBias = Arg->NormBias; - unsigned int Norm = Arg->Norm; - int NormOut = Arg->UB; - int C1 = 3<In1; @@ -4942,14 +3832,9 @@ void KerParMatMulHsigmoidSmallFeat_fp(KerMatMul_fp_T *Arg) unsigned int W_In2 = W_In1; short int * __restrict__ Bias = Arg->Bias; short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; + int LB = Arg->LB, UB = Arg->UB; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; - int NormOut = Arg->UB; - int C1 = 3<In1; @@ -4983,8 +3867,13 @@ void KerParMatMulLeakyreluSmallFeat_fp(KerMatMul_fp_T *Arg) unsigned int W_In2 = W_In1; short int * __restrict__ Bias = Arg->Bias; short int * __restrict__ Out = Arg->Out; + int LB = Arg->LB; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; + int NormOut = Arg->UB; + int C1 = 3<=0); - int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); - Out[l1*H_In2 + l2] = gap_clip(Neg*Input1+Pos*Input, 7); + Acc = AT_NORM(Acc, Norm); + Out[l1*H_In2 + l2] = AT_NORM(AT_NORM(gap_min(gap_max(Acc + C1, 0), UB) * Acc, NormOut) * C2, 15); } } gap_waitbarrier(0); } -void KerParMatMulScaleScalarSmallFeat_fp(KerMatMul_fp_T *Arg) +void KerParMatMulHsigmoidSmallFeat_fp(KerMatMul_fp_T *Arg) { short int * __restrict__ In1 = Arg->In1; @@ -5023,11 +3908,14 @@ void KerParMatMulScaleScalarSmallFeat_fp(KerMatMul_fp_T *Arg) unsigned int W_In2 = W_In1; 
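/* Illustration, not part of the patch: the Hswish kernels in this hunk evaluate
   x*ReLU6(x+3)/6 entirely in fixed point. Sketch of that expression, assuming C1 and UB
   encode 3 and 6 in the normalized accumulator format and C2 encodes 1/6 in Q15, as set
   up at the top of those kernels. */
static inline int HSwish_fp(int Acc, unsigned int Norm, int NormOut, int C1, int UB, int C2)
{
        Acc = AT_NORM(Acc, Norm);                               /* requantize the raw accumulator   */
        int R6 = gap_min(gap_max(Acc + C1, 0), UB);             /* ReLU6(x + 3) in fixed point      */
        return AT_NORM(AT_NORM(R6*Acc, NormOut)*C2, 15);        /* x * ReLU6(x+3), then divide by 6 */
}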
short int * __restrict__ Bias = Arg->Bias; short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB, UB = Arg->UB; - int M = *Arg->MulBias; + int LB = Arg->LB; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; - unsigned int NormMulBias = Arg->NormMulBias; + int NormOut = Arg->UB; + int C1 = 3<In1; @@ -5060,12 +3949,9 @@ void KerParMatMulScaleScalarSmallFeat_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) unsigned int W_In2 = W_In1; short int * __restrict__ Bias = Arg->Bias; short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - int M = *Arg->MulBias; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; - unsigned int NormMulBias = Arg->NormMulBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); // for (int l2=0; l2=0); + int Input1 = AT_NORM(Input*LEAK_CONSTANT, LEAK_CONSTANT_FORMAT); + Out[l1*H_In2 + l2] = gap_clip(Neg*Input1+Pos*Input, 7); } } gap_waitbarrier(0); } -void KerParMatMulScaleSmallFeat_fp(KerMatMul_fp_T *Arg) +void KerParMatMulScaleScalarSmallFeat_fp(KerMatMul_fp_T *Arg) { short int * __restrict__ In1 = Arg->In1; @@ -5098,9 +3988,9 @@ void KerParMatMulScaleSmallFeat_fp(KerMatMul_fp_T *Arg) unsigned int H_In2 = Arg->W_In2; unsigned int W_In2 = W_In1; short int * __restrict__ Bias = Arg->Bias; - short int * __restrict__ MulBias = Arg->MulBias; short int * __restrict__ Out = Arg->Out; int LB = Arg->LB, UB = Arg->UB; + int M = *Arg->MulBias; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; unsigned int NormMulBias = Arg->NormMulBias; @@ -5119,13 +4009,13 @@ void KerParMatMulScaleSmallFeat_fp(KerMatMul_fp_T *Arg) } if (W_In1&0x2) Acc = gap_sumdotp2(pIn1[W_In1/2-1], pIn2[W_In1/2-1], Acc); if (W_In1&0x1) Acc += In1[l1*W_In1+W_In1-1] * In2[l2*W_In2+W_In1-1]; - Out[l1*H_In2 + l2] = gap_min(UB, gap_max(AT_NORM(AT_NORM(Acc, Norm)*MulBias[l1], NormMulBias), LB)); + Out[l1*H_In2 + l2] = gap_min(UB, gap_max(AT_NORM(AT_NORM(Acc, Norm)*M, NormMulBias), LB)); } } gap_waitbarrier(0); } -void KerParMatMulScaleSmallFeat_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) +void KerParMatMulScaleSmallFeat_fp(KerMatMul_fp_T *Arg) { short int * __restrict__ In1 = Arg->In1; @@ -5137,8 +4027,7 @@ void KerParMatMulScaleSmallFeat_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) short int * __restrict__ Bias = Arg->Bias; short int * __restrict__ MulBias = Arg->MulBias; short int * __restrict__ Out = Arg->Out; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; + int LB = Arg->LB, UB = Arg->UB; unsigned int NormBias = Arg->NormBias; unsigned int Norm = Arg->Norm; unsigned int NormMulBias = Arg->NormMulBias; @@ -5149,7 +4038,6 @@ void KerParMatMulScaleSmallFeat_ReLUN_Vector_fp(KerMatMul_fp_T *Arg) for (int l1=0; l1In1; - short int * __restrict__ Scale = Arg->In2; - short int * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int Norm = Arg->Norm; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(Arg->N); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, Arg->N); - int i, j; - - for (i=First; iIn1; - int Scale = *Arg->ScaleScalar; - short int * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int Norm = Arg->Norm; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(Arg->N); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, Arg->N); - int i, j; 
- - for (i=First; iIn1; - short int * __restrict__ Scale = Arg->In2; - int ScaleScalar = *Arg->ScaleScalar; - short int * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int Norm = Arg->Norm; - int Scale_Q = Arg->Scale_Q; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(Arg->N); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, Arg->N); - int i, j; - - for (i=First; iIn1; - signed char * __restrict__ Scale= Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int Norm = Arg->Norm; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(Arg->N); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, Arg->N); - int i, j; - - for (i=First; iIn1; - int Scale = *Arg->ScaleScalar; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int Norm = Arg->Norm; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(Arg->N); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, Arg->N); - int i, j; - - for (i=First; iIn1; - signed char * __restrict__ Scale= Arg->In2; - int ScaleScalar = *Arg->ScaleScalar; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int Norm = Arg->Norm+Arg->Scale_Q; - int LB = Arg->LB; - char *pUB = (char *) Arg->UB; - - unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(Arg->N); - unsigned int First = Chunk*CoreId; - unsigned int Last = Min(First+Chunk, Arg->N); - int i, j; - - for (i=First; i +#include "CNN_BasicKernels_SQ8.h" + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); + +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + + +/************************************************************************************************************************************************* + Tensor Addition with Input1 and Output scaling followed by optional activation + Scaling is optional, no scaling is expressed using Scale=0 +*************************************************************************************************************************************************/ + +void KerParMatAdd_SQ8(KerMat3_SQ8_T *Arg) + +{ + signed char * __restrict__ In1 = Arg->In1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; + unsigned int OutScale = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALE], OutScaleN = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + unsigned int F = First*W*H, S = Max(0, Last*W*H-F); + signed char * __restrict__ I1 = In1 + F, *__restrict__ I2 = In2 + F, *__restrict__ O = Out + F; + if (In1Scale && OutScale) { + for (int i=0; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = 
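/* Illustration, not part of the patch: sketch of the per-element work in
   KerParMatAdd_SQ8 when both rescales are enabled. The scale pairs are assumed to mean
   "multiply by Scale, then arithmetic shift right by ScaleN", which is what
   AT_NORM(x*Scale, ScaleN) expresses; In1 is brought into In2's scale before the add
   and the sum is then rescaled to the output scale. */
static inline signed char MatAdd1_SQ8(int In1, int In2,
                                      unsigned int In1Scale, unsigned int In1ScaleN,
                                      unsigned int OutScale, unsigned int OutScaleN)
{
        int X = AT_NORM(In1*In1Scale, In1ScaleN) + In2;         /* In1 rescaled to In2's scale, then added */
        return gap_clip(AT_NORM(X*OutScale, OutScaleN), 7);     /* rescale the sum, clip to signed 8 bits  */
}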
Arg->W; + int H = Arg->H; + unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; + unsigned int OutScale = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALE], OutScaleN = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + unsigned int F = First*W*H, S = Max(0, Last*W*H-F); + signed char * __restrict__ I1 = In1 + F, *__restrict__ I2 = In2 + F, *__restrict__ O = Out + F; + if (In1Scale && OutScale) { + for (int i=0; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; + unsigned int OutScale = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALE], OutScaleN = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALEN]; + int A0 = Arg->Infos[AT_INF_A0]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + unsigned int F = First*W*H, S = Max(0, Last*W*H-F); + signed char * __restrict__ I1 = In1 + F, *__restrict__ I2 = In2 + F, *__restrict__ O = Out + F; + if (In1Scale && OutScale) { + for (int i=0; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; + unsigned int OutScale = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALE], OutScaleN = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALEN]; + unsigned int ActScale = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALEN]; + int A0 = Arg->Infos[AT_INF_A0], B0 = Arg->Infos[AT_INF_B0], C0 = Arg->Infos[AT_INF_C0]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + unsigned int F = First*W*H, S = Max(0, Last*W*H-F); + signed char * __restrict__ I1 = In1 + F, *__restrict__ I2 = In2 + F, *__restrict__ O = Out + F; + if (In1Scale && OutScale) { + for (int i=0; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; + unsigned int OutScale = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALE], OutScaleN = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALEN]; + unsigned int ActScale = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALEN]; + int A0 = Arg->Infos[AT_INF_A0], B0 = Arg->Infos[AT_INF_B0], C0 = Arg->Infos[AT_INF_C0]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + unsigned int F = First*W*H, S = Max(0, Last*W*H-F); + signed char * __restrict__ I1 = In1 + F, *__restrict__ I2 = In2 + F, *__restrict__ O = Out + F; + if (In1Scale && OutScale) { + for (int i=0; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char 
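/* Illustration, not part of the patch: the parallelization pattern shared by all the SQ8
   kernels in this file. Each cluster core derives its own [First, Last) slice of the
   feature dimension from ChunkSize() and gap_coreid(), processes only that slice, and
   re-synchronizes on the cluster barrier before the tile is handed back. The function
   name is hypothetical; a simple in-place ReLU is used as the per-feature body. */
void ExampleParallelReLU(signed char *InOut, int W, int H, int Feat)
{
        unsigned int CoreId = gap_coreid();
        unsigned int Chunk  = ChunkSize(Feat);                    /* features per core (rounded up)      */
        unsigned int First  = Chunk*CoreId;
        unsigned int Last   = Min(First+Chunk, Feat);             /* last core may get a partial slice   */

        for (unsigned int f=First; f<Last; f++) {
                signed char *P = InOut + f*W*H;
                for (int i=0; i<W*H; i++) P[i] = Max(0, P[i]);    /* work restricted to this core's maps */
        }
        gap_waitbarrier(0);                                       /* all cores leave the kernel together */
}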
*)Arg->Infos)[AT_INF_IN1SCALEN]; + unsigned int OutScale = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALE], OutScaleN = ((unsigned char *)Arg->Infos)[AT_INF_OUTSCALEN]; + unsigned int ActScale = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALEN]; + int A0 = Arg->Infos[AT_INF_A0], B0 = Arg->Infos[AT_INF_B0], C0 = Arg->Infos[AT_INF_C0]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + unsigned int F = First*W*H, S = Max(0, Last*W*H-F); + signed char * __restrict__ I1 = In1 + F, *__restrict__ I2 = In2 + F, *__restrict__ O = Out + F; + if (In1Scale && OutScale) { + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int 
NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + int A0 = Arg->Infos[AT_INF_A0]; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, 
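/* Illustration, not part of the patch: sketch of the column-buffering scheme used by the
   KerParMatMulB*_SQ8 kernels above. For every output column the cores first cooperatively
   gather that column of In2 into a contiguous buffer (each core copies its own row slice),
   synchronize, and only then compute their share of output rows as dot products against
   the buffered column. Simplified to one column per pass; the kernels above process four
   columns at a time through VBuff0..VBuff3, and they also add the bias and apply the
   per-channel Scale/ScaleN requantization, omitted here (Out keeps the raw accumulator). */
void ExampleMatMulColBuffered(signed char *In1, unsigned int W_In1, unsigned int H_In1,
                              signed char *In2, unsigned int W_In2,
                              int *Out, signed char *BufferColIn2)
{
        unsigned int H_In2 = W_In1;
        unsigned int CoreId = gap_coreid();
        unsigned int ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell);
        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C);

        for (unsigned int Col=0; Col<W_In2; Col++) {
                for (unsigned int i=F; i<L; i++) BufferColIn2[i] = In2[i*W_In2+Col];   /* gather this column   */
                gap_waitbarrier(0);                                                    /* column fully staged  */
                for (unsigned int Line=First; Line<Last; Line++) {
                        int S = 0;
                        for (unsigned int i=0; i<W_In1; i++) S += In1[Line*W_In1+i]*BufferColIn2[i];
                        Out[Line*W_In2+Col] = S;                                       /* raw 32-bit result    */
                }
                gap_waitbarrier(0);                                                    /* before buffer reuse  */
        }
}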
Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + int A0 = Arg->Infos[AT_INF_A0]; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = 
Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + int A0 = Arg->Infos[AT_INF_A0]; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = 
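/* Note on the *_SxSy_SQ8 variants above (illustration only): the output spatial size is
   the ceiling of the input size divided by the stride, Wo = (Wi+Sx-1)/Sx and
   Ho = (Hi+Sy-1)/Sy. For example Wi=13, Hi=7 with Sx=2, Sy=2 gives Wo=7 and Ho=4 output
   positions per feature map. */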
Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + int A0 = Arg->Infos[AT_INF_A0]; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * 
__restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; + int ColFirst = Arg->ColFirst; + int A0 = Arg->Infos[AT_INF_A0]; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = 
Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + int A0 = Arg->Infos[AT_INF_A0]; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; + + unsigned int Line, Col, i; + v4s *VBuff = (v4s *) BufferColIn2; + + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + + At = 0; OffLine = 0; Oo = 0; + if (ColFirst) OffLine = Pi; else Oo = Pi; + + while (L>0) { + for (i=Fi;iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + int A0 = Arg->Infos[AT_INF_A0]; + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + 
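/* Illustration, not part of the patch: sketch of the per-channel output requantization
   used by the *_SQ8 matrix-multiply kernels in this file. The bias is assumed to be
   pre-shifted into the accumulator scale through NormBias (the kernels initialize the
   accumulator with it, which is equivalent), and each output channel, i.e. each line of
   In1, carries its own Scale/ScaleN pair taken to mean multiply-then-shift. */
static inline signed char RequantChannel_SQ8(int Acc, int Bias, unsigned int NormBias,
                                             unsigned char Scale, unsigned char ScaleN)
{
        Acc += Bias << NormBias;                                /* bias brought to accumulator scale    */
        return gap_clip(AT_NORM(Acc*Scale, ScaleN), 7);         /* per-channel scale, shift, clip to 8b */
}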
unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + int A0 = Arg->Infos[AT_INF_A0]; + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + + for (int i=0; iIn1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned int NormBias = Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = Max(0, Last-First); + int A0 = Arg->Infos[AT_INF_A0]; + + for (int i=0; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int Scale = Arg->Infos[AT_INF_SCALE]; + unsigned int ScaleN = Arg->Infos[AT_INF_SCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + if (Scale) + for (int i=First; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int Scale = Arg->Infos[AT_INF_SCALE]; + unsigned int ScaleN = Arg->Infos[AT_INF_SCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + if (Scale) + for (int i=First; iIn1; + signed char * __restrict__ In2 = 
Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + int A0 = Arg->Infos[AT_INF_A0]; + unsigned int Scale = Arg->Infos[AT_INF_SCALE]; + unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + if (Scale) + for (int i=First; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int ActScale = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE]; + unsigned int ActScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN]; + int A0 = Arg->Infos[AT_INF_A0]; + int B0 = Arg->Infos[AT_INF_B0]; + int C0 = Arg->Infos[AT_INF_C0]; + unsigned int Scale = Arg->Infos[AT_INF_SCALE]; + unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + if (Scale) + for (int i=First; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int ActScale = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE]; + unsigned int ActScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN]; + int A0 = Arg->Infos[AT_INF_A0]; + int B0 = Arg->Infos[AT_INF_B0]; + int C0 = Arg->Infos[AT_INF_C0]; + unsigned int Scale = Arg->Infos[AT_INF_SCALE]; + unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + if (Scale) + for (int i=First; iIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int ActScale = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE]; + unsigned int ActScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN]; + int A0 = Arg->Infos[AT_INF_A0]; + int B0 = Arg->Infos[AT_INF_B0]; + int C0 = Arg->Infos[AT_INF_C0]; + unsigned int Scale = Arg->Infos[AT_INF_SCALE]; + unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + if (Scale) + for (int i=First; iIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Feat = Arg->Feat; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(Feat); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Feat); + + for (int f=First; fIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Feat = Arg->Feat; + unsigned int Sx = Arg->Sx; + unsigned int Sy = Arg->Sy; + unsigned int Wo = (W+Sx-1)/Sx; + unsigned int Ho = (H+Sy-1)/Sy; + unsigned int CoreId = gap_coreid(); + unsigned int Chunk = ChunkSize(Feat); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Feat); + + for (int f=First; fIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Wo_F, Wo_L, Ho_F, Ho_L; + unsigned int Feat = Arg->Feat; + unsigned int Chunk, CoreId = gap_coreid(); + + if (H>W) { + /* Tile horizontally */ + Chunk = ChunkSize(H); Ho_F = Chunk*CoreId; Ho_L = Min(Ho_F+Chunk, H); Wo_F = 0; Wo_L = W; + } else { + 
/* Tile vertically */ + Chunk = ChunkSize(W); Wo_F = Chunk*CoreId; Wo_L = Min(Wo_F+Chunk, W); Ho_F = 0; Ho_L = H; + } + if (Wo_FIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Wo_F, Wo_L, Ho_F, Ho_L; + unsigned int Sx = Arg->Sx; + unsigned int Sy = Arg->Sy; + unsigned int Wo = (W+Sx-1)/Sx; + unsigned int Ho = (H+Sy-1)/Sy; + unsigned int Feat = Arg->Feat; + unsigned int Chunk, CoreId = gap_coreid(); + + if (Ho>Wo) { + /* Tile horizontally */ + Chunk = ChunkSize(Ho); Ho_F = Chunk*CoreId; Ho_L = Min(Ho_F+Chunk, Ho); Wo_F = 0; Wo_L = Wo; + } else { + /* Tile vertically */ + Chunk = ChunkSize(Wo); Wo_F = Chunk*CoreId; Wo_L = Min(Wo_F+Chunk, Wo); Ho_F = 0; Ho_L = Ho; + } + if (Wo_FIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Feat = Arg->Feat, C = Feat; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (int c=First; cIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Feat = Arg->Feat, C = Feat; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (int c=First; cIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Feat = Arg->Feat, C = Feat; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (int c=First; cIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Feat = Arg->Feat, C = Feat; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (int c=First; cIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Feat = Arg->Feat, C = Feat; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (int c=First; c +#include "Gap.h" +#include "CNN_BasicKernels_SQ8.h" + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); + +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +static int FirstDefinedOutput(unsigned int F, unsigned int Pad, unsigned int Stride) + +{ + // k*S - (F-1)/2 >=0 => k >= (((F-1)/2) + S-1)/S + + return ((Pad+Stride-1)/Stride); +} + +static int LastDefinedOutput(unsigned int DimIn, unsigned int F, unsigned int PadL, unsigned int Stride) + +{ + // k*S + ((F-1)/2 - PadL + F/2) < Dim => k < (Dim-((F-1)/2 - PadL + (F/2)) + S-1)/S + + return ((DimIn - ((F-1)/2 - PadL + (F/2)) + Stride-1)/Stride); +} + +static int __attribute__ ((always_inline)) MinCond(int a, int b) + +{ +#ifdef DIM_ALWAYS_GREATER_THAN_FILTER + return a; +#else + return Max(0, Min(a, b)); +#endif +} + +/* + * Standalone activation, assuming contiguous tile (horizontal) +*/ +static void Ker_Activation_SQ8( + signed char * __restrict__ InOut, + unsigned int N, + CNN_ActivationOper_T Activation, + unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0 + ) + +{ + for (unsigned int i=0; 
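/* Worked example for FirstDefinedOutput()/LastDefinedOutput() above (illustration only):
   a 1-D dimension of DimIn=10 samples, filter F=3, left pad PadL=1, Stride=2 produces 5
   outputs, and output k reads inputs [2k-1 .. 2k+1]. */
static void ExampleDefinedOutputs(void)
{
        int First = FirstDefinedOutput(3, 1, 2);        /* = (1+2-1)/2 = 1: output 0 still needs the left pad */
        int Last  = LastDefinedOutput(10, 3, 1, 2);     /* = (10-(1-1+1)+1)/2 = 5: outputs 1..4 are pad-free  */
        /* The "body" kernels handle [First, Last); the border kernels handle the rest.                       */
        (void) First; (void) Last;
}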
iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*Stride; // iff Wi_L>Wi_F + + if (PadT) + for (unsigned int w=Wo_F; wHo_F, then in this case Fh_min is = 0 by construction */ + for (unsigned int h=Ho_L; h F by definition of Ho_L so we can remove and use ht only + int Acc = 0; + for (unsigned int i=Fh_min; i F by definition of Ho_L so we can remove and use ht only + for (unsigned int i=0; i F by definition of Ho_L so we can remove and use ht only. ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*StrideX; // iff Wi_L>Wi_F + + if (PadT) + for (unsigned int w=Wo_F; wHo_F, then in this case Fh_min is = 0 by construction */ + for (unsigned int h=Ho_L; h F by definition of Ho_L so we can remove and use ht only + int Acc = 0; + for (unsigned int i=Fh_min; i F by definition of Ho_L so we can remove and use ht only + for (unsigned int i=0; i F by definition of Ho_L so we can remove and use ht only. ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*Stride; // iff Wi_L>Wi_F + + + if (PadT) + for (unsigned int w=Wo_F; wHo_F, then in this case Fh_min is = 0 by construction */ + for (unsigned int h=Ho_L; h F by definition of Ho_L so we can remove and use ht only + int Acc = 0; + for (unsigned int i=Fh_min; i F by definition of Ho_L so we can remove and use ht only + for (unsigned int i=0; i F by definition of Ho_L so we can remove and use ht only. ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iHi_F + int Wi_F = (Fw-1)/2 - PadLOrg; + int Wi_L = Wi_F + (Wo_L-1)*StrideX; // iff Wi_L>Wi_F + + + if (PadT) + for (unsigned int w=Wo_F; wHo_F, then in this case Fh_min is = 0 by construction */ + for (unsigned int h=Ho_L; h F by definition of Ho_L so we can remove and use ht only + int Acc = 0; + for (unsigned int i=Fh_min; i F by definition of Ho_L so we can remove and use ht only + for (unsigned int i=0; i F by definition of Ho_L so we can remove and use ht only. ht Can't be > F by definition of Ho_L so we can remove and use ht only + int Wh_min = wl, Wh_max = MinCond(wr, Fw), Fh_min = ht, Fh_max = MinCond(Fh, hb); + for (unsigned int i=Fh_min; iPad) + + Arg->MaxPool Max Pooling, otherwise Average Pooling + + Input and output feature maps are bytes: + KerParPool2x2Stride2_SQ8 Special case: Stride=2 and PoolSize=2, Padding management is embedded + |------ KerMaxPool2x2Stride2_SQ8 + |------ KerAvgPool2x2Stride2_SQ8 + KerParPoolNxNStrideS_SQ8 General case. Padding management: KerAvgPoolNxNStrideS_Border_SQ8 or KerMaxPoolNxNStrideS_Border_SQ8 + |------ KerMaxPoolNxNStrideS_Body_SQ8 + |------ KerMaxPoolNxNStrideS_Border_SQ8 + |------ KerAvgPoolNxNStrideS_Body_SQ8 + |------ KerAvgPoolNxNStrideS_Border_SQ8 + KerParPoolNxMStrideSxSy_SQ8 General case. 
Padding management: KerAvgPoolNxMStrideSxSy_Border_SQ8 or KerMaxPoolNxMStrideSxSy_Border_SQ8 + |------ KerMaxPoolNxMStrideSxSy_Body_SQ8 + |------ KerMaxPoolNxMStrideSxSy_Border_SQ8 + |------ KerAvgPoolNxMStrideSxSy_Body_SQ8 + |------ KerAvgPoolNxMStrideSxSy_Border_SQ8 + KerParGlobalMaxPool_SQ8 + KerParGlobalAvgPool_SQ8 + KerParGlobalMaxPoolFullFeat_SQ8 + KerParGlobalAvgPoolFullFeat_SQ8 +*/ + + +static void KerParPoolActivation(signed char *__restrict__ InOut, int W, int H, int FirstFeat, int LastFeat, + signed char *__restrict__ Infos, CNN_ActivationOper_T Activation) + +{ + if (Infos[AT_INF_ACTSCALE]) { + unsigned int Off = W*H*FirstFeat, Size = W*H*(LastFeat-FirstFeat); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + switch (Activation) { + case ACT_NONE: Ker_Activation_SQ8(InOut+Off, Size, ACT_NONE, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_RELU: Ker_Activation_SQ8(InOut+Off, Size, ACT_RELU, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_RELUN: Ker_Activation_SQ8(InOut+Off, Size, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_HSIGMOID: Ker_Activation_SQ8(InOut+Off, Size, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_HSWISH: Ker_Activation_SQ8(InOut+Off, Size, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_LEAKYRELU: Ker_Activation_SQ8(InOut+Off, Size, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0); break; + } + } else if (Activation == ACT_RELU) { + unsigned int Off = W*H*FirstFeat, Size = W*H*(LastFeat-FirstFeat); + Ker_ActivationScale1_SQ8(InOut+Off, Size, ACT_RELU, 0); + } else if (Activation == ACT_RELUN) { + unsigned int Off = W*H*FirstFeat, Size = W*H*(LastFeat-FirstFeat); + int A0 = Infos[AT_INF_A0]; + Ker_ActivationScale1_SQ8(InOut+Off, Size, ACT_RELUN, A0); + } +} + +void KerParPool2x2Stride2_SQ8(KerPool_SQ8_T *Arg) + +{ + unsigned int FS=2,S=2; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + int PoolMax = Arg->PoolMax; + int DoScale = Arg->DoScale; + v4s PadIn = Arg->Pad; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + if (PoolMax) for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + int PoolMax = Arg->PoolMax; + v4s PadIn = Arg->Pad; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + if (PoolMax) for 
(unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + int PoolMax = Arg->PoolMax; + v4s PadIn = Arg->Pad; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + if (PoolMax) for (unsigned int of=First; ofFS, S=Arg->S; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + int PoolMax = Arg->PoolMax; + signed char * __restrict__ Infos = Arg->Infos; + v4s PadIn = Arg->Pad; + int DoScale = Arg->DoScale; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + if (PoolMax) { + for (unsigned int of=First; ofFS, S=Arg->S; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + int PoolMax = Arg->PoolMax; + signed char * __restrict__ Infos = Arg->Infos; + v4s PadIn = Arg->Pad; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + if (PoolMax) { + for (unsigned int of=First; ofFS, S=Arg->S; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + int PoolMax = Arg->PoolMax; + signed char * __restrict__ Infos = Arg->Infos; + v4s PadIn = Arg->Pad; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + if (PoolMax) { + for (unsigned int of=First; ofFS, Sx=Arg->S; + unsigned int FSy=Arg->FSy, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + int DoScale = Arg->DoScale; + int PoolMax = Arg->PoolMax; + + unsigned int CoreId = 
gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + if (PoolMax) { + for (unsigned int of=First; ofFS, Sx=Arg->S; + unsigned int FSy=Arg->FSy, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + int PoolMax = Arg->PoolMax; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + if (PoolMax) { + for (unsigned int of=First; ofFS, Sx=Arg->S; + unsigned int FSy=Arg->FSy, Sy=Arg->Sy; + signed char * __restrict__ In = Arg->In; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + int PoolMax = Arg->PoolMax; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + v4s PadIn = Arg->Pad; + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + if (PoolMax) { + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + int * __restrict__ Out = (int *__restrict) Arg->Out; + int FirstTile = Arg->FirstTile; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char *__restrict__ Infos; + int DoScale = Arg->DoScale; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + if (DoScale && ActScale) for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char *__restrict__ Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + if (ActScale) for (unsigned int of=First; ofIn; 
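+	/* Caution: the local Infos pointer declared just below is dereferenced for
+	   ActScale/ActScaleN, but no assignment is visible in this hunk; it presumably
+	   should be initialized from Arg->Infos as in the surrounding pooling kernels. */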
+ unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char *__restrict__ Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0]; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + if (ActScale) for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + int * __restrict__ Out = (int *__restrict__) Arg->Out; + int FirstTile = Arg->FirstTile; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char *__restrict__ Infos; + int DoScale = Arg->DoScale; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + if (DoScale && ActScale) for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char *__restrict__ Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + if (ActScale) for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char *__restrict__ Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0]; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + if (ActScale) for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + int DoScale = Arg->DoScale; + signed char * __restrict__ Infos = Arg->Infos; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + int DoScale = 
Arg->DoScale; + signed char * __restrict__ Infos = Arg->Infos; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofIn; + unsigned int W = Arg->W, H = Arg->H; + unsigned int Feat = Arg->Feat; + signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); + + for (unsigned int of=First; ofPad) + + Arg->MaxPool Max Pooling, otherwise Average Pool + + Input and output feature maps are bytes: + KerPool2x2Stride2_SQ8 Special case: Stride=2 and PoolSize=2, Padding management is embedded + |------ KerMaxPool2x2Stride2_SQ8 + |------ KerAvgPool2x2Stride2_SQ8 + KerPoolNxNStrideS_SQ8 General case. Padding management: KerAvgPoolNxNStrideS_Border_SQ8 or KerMaxPoolNxNStrideS_Border_SQ8 + |------ KerMaxPoolNxNStrideS_Body_SQ8 + |------ KerMaxPoolNxNStrideS_Border_SQ8 + |------ KerAvgPoolNxNStrideS_Body_SQ8 + |------ KerAvgPoolNxNStrideS_Border_SQ8 + KerPoolNxMStrideSxSy_SQ8 General case. Padding management: KerAvgPoolNxMStrideSxSy_Border_SQ8 or KerMaxPoolNxMStrideSxSy_Border_SQ8 + |------ KerMaxPoolNxMStrideSxSy_Body_SQ8 + |------ KerMaxPoolNxMStrideSxSy_Border_SQ8 + |------ KerAvgPoolNxMStrideSxSy_Body_SQ8 + |------ KerAvgPoolNxMStrideSxSy_Border_SQ8 +*/ + +static void KerPoolActivation(signed char *__restrict__ InOut, int W, int H, int First, int Last, + signed char *__restrict__ Infos, CNN_ActivationOper_T Activation, int Orientation) + +{ + if (Orientation) { // Horizontal + if (Infos[AT_INF_ACTSCALE]) { + unsigned int Off = W*First, Size = W*(Last-First); + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + + switch (Activation) { + case ACT_NONE: Ker_Activation_SQ8(InOut+Off, Size, ACT_NONE, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_RELU: Ker_Activation_SQ8(InOut+Off, Size, ACT_RELU, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_RELUN: Ker_Activation_SQ8(InOut+Off, Size, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_HSIGMOID: Ker_Activation_SQ8(InOut+Off, Size, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_HSWISH: Ker_Activation_SQ8(InOut+Off, Size, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_LEAKYRELU: Ker_Activation_SQ8(InOut+Off, Size, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0); break; + } + } else if (Activation == ACT_RELU) { + unsigned int Off = W*First, Size = W*(Last-First); + Ker_ActivationScale1_SQ8(InOut+Off, Size, ACT_RELU, 0); + } else if (Activation == ACT_RELUN) { + unsigned int Off = W*First, Size = W*(Last-First); + int A0 = Infos[AT_INF_A0]; + Ker_ActivationScale1_SQ8(InOut+Off, Size, ACT_RELUN, A0); + } + } else { + if (Infos[AT_INF_ACTSCALE]) { + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; + 
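+		/* Infos packs the per-layer activation parameters: ActScale/ActScaleN give
+		   the requantization multiplier and its normalization shift, and A0/B0/C0
+		   hold activation-specific constants (A0 is, for example, the clamp bound
+		   used by ACT_RELUN below). */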
switch (Activation) { + case ACT_NONE: Ker_Activation_Ver_SQ8(InOut+First, W, H, First, Last, ACT_NONE, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_RELU: Ker_Activation_Ver_SQ8(InOut+First, W, H, First, Last, ACT_RELU, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_RELUN: Ker_Activation_Ver_SQ8(InOut+First, W, H, First, Last, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_HSIGMOID: Ker_Activation_Ver_SQ8(InOut+First, W, H, First, Last, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_HSWISH: Ker_Activation_Ver_SQ8(InOut+First, W, H, First, Last, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0); break; + case ACT_LEAKYRELU: Ker_Activation_Ver_SQ8(InOut+First, W, H, First, Last, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0); break; + } + } else if (Activation == ACT_RELU) { + Ker_ActivationScale1_Ver_SQ8(InOut+First, W, H, First, Last, ACT_RELU, 0); + } else if (Activation == ACT_RELUN) { + int A0 = Infos[AT_INF_A0]; + Ker_ActivationScale1_Ver_SQ8(InOut+First, W, H, First, Last, ACT_RELUN, A0); + } + } +} + +void KerPool2x2Stride2_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FS = 2, S = 2; + v4s PadIn = Arg->Pad; + int DoScale = Arg->DoScale; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) KerMaxPool2x2Stride2_SQ8(In, W, H, Out, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, PadIn, PadOrg); + else KerAvgPool2x2Stride2_SQ8(In, W, H, Out, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, PadIn, PadOrg); + if (DoScale) KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_NONE, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) KerMaxPool2x2Stride2_SQ8(In, W, H, Out, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), PadIn, PadOrg); + else KerAvgPool2x2Stride2_SQ8(In, W, H, Out, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), PadIn, PadOrg); + if (DoScale) KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_NONE, 0); + } + gap_waitbarrier(0); +} + +void KerPool2x2Stride2_ReLU_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FS = 2, S = 2; + v4s PadIn = Arg->Pad; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, 
LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) KerMaxPool2x2Stride2_SQ8(In, W, H, Out, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, PadIn, PadOrg); + else KerAvgPool2x2Stride2_SQ8(In, W, H, Out, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, PadIn, PadOrg); + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELU, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) KerMaxPool2x2Stride2_SQ8(In, W, H, Out, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), PadIn, PadOrg); + else KerAvgPool2x2Stride2_SQ8(In, W, H, Out, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), PadIn, PadOrg); + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELU, 0); + } + gap_waitbarrier(0); +} + +void KerPool2x2Stride2_ReLUN_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FS = 2, S = 2; + v4s PadIn = Arg->Pad; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) KerMaxPool2x2Stride2_SQ8(In, W, H, Out, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, PadIn, PadOrg); + else KerAvgPool2x2Stride2_SQ8(In, W, H, Out, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, PadIn, PadOrg); + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELUN, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) KerMaxPool2x2Stride2_SQ8(In, W, H, Out, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), PadIn, PadOrg); + else KerAvgPool2x2Stride2_SQ8(In, W, H, Out, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), PadIn, PadOrg); + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELUN, 0); + } + gap_waitbarrier(0); +} + +void KerPoolNxNStrideS_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FS = Arg->FS, S = Arg->S; + v4s PadIn = Arg->Pad; + int DoScale = Arg->DoScale; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = 
(Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) { + KerMaxPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + if ((int) PadIn) KerMaxPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + } else { + KerAvgPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + if ((int) PadIn) KerAvgPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + } + if (DoScale) KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_NONE, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) { + KerMaxPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + if ((int) PadIn) KerMaxPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + } else { + KerAvgPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + if ((int) PadIn) KerAvgPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + } + if (DoScale) KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_NONE, 0); + } + gap_waitbarrier(0); +} + +void KerPoolNxNStrideS_ReLU_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FS = Arg->FS, S = Arg->S; + v4s PadIn = Arg->Pad; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) { + KerMaxPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + if ((int) PadIn) KerMaxPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + } else { + KerAvgPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + if ((int) PadIn) KerAvgPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), 
Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELU, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) { + KerMaxPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + if ((int) PadIn) KerMaxPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + } else { + KerAvgPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + if ((int) PadIn) KerAvgPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELU, 0); + } + gap_waitbarrier(0); +} + +void KerPoolNxNStrideS_ReLUN_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FS = Arg->FS, S = Arg->S; + v4s PadIn = Arg->Pad; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FS+PadIn[0]+PadIn[1])/S + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FS, PadIn[0], S)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FS, PadIn[0], S)); + int Ho = (Arg->UsedH-FS+PadIn[2]+PadIn[3])/S + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FS, PadIn[2], S)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FS, PadIn[2], S)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) { + KerMaxPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + if ((int) PadIn) KerMaxPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + } else { + KerAvgPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + if ((int) PadIn) KerAvgPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, S); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELUN, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) { + KerMaxPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + if ((int) PadIn) KerMaxPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + } else { + KerAvgPoolNxNStrideS_Body_SQ8(In, Out, FS, FS, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + if ((int) PadIn) KerAvgPoolNxNStrideS_Border_SQ8(In, Out, FS, FS, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), S); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELUN, 0); + } + gap_waitbarrier(0); +} + +void 
KerPoolNxMStrideSxSy_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FSx = Arg->FS, Sx = Arg->S; + unsigned int FSy = Arg->FSy, Sy = Arg->Sy; + v4s PadIn = Arg->Pad; + int DoScale = Arg->DoScale; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) { + KerMaxPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + if ((int) PadIn) KerMaxPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + } else { + KerAvgPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + if ((int) PadIn) KerAvgPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + } + if (DoScale) KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_NONE, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) { + KerMaxPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + if ((int) PadIn) KerMaxPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + } else { + KerAvgPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSx, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + if ((int) PadIn) KerAvgPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + } + if (DoScale) KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_NONE, 0); + } + gap_waitbarrier(0); +} + +void KerPoolNxMStrideSxSy_ReLU_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FSx = Arg->FS, Sx = Arg->S; + unsigned int FSy = Arg->FSy, Sy = Arg->Sy; + v4s PadIn = Arg->Pad; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned 
int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) { + KerMaxPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + if ((int) PadIn) KerMaxPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + } else { + KerAvgPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + if ((int) PadIn) KerAvgPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELU, 1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) { + KerMaxPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + if ((int) PadIn) KerMaxPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + } else { + KerAvgPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSx, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + if ((int) PadIn) KerAvgPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELU, 0); + } + gap_waitbarrier(0); +} + +void KerPoolNxMStrideSxSy_ReLUN_SQ8(KerPool_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + signed char * __restrict__ Out = Arg->Out; + signed char * __restrict__ Infos = Arg->Infos; + unsigned int W = Arg->W, H = Arg->H; + unsigned int FSx = Arg->FS, Sx = Arg->S; + unsigned int FSy = Arg->FSy, Sy = Arg->Sy; + v4s PadIn = Arg->Pad; + int PoolMax = Arg->PoolMax; + + int Wo = (Arg->UsedW-FSx+PadIn[0]+PadIn[1])/Sx + 1; + int Wo_F = Min(Wo, FirstDefinedOutput(FSx, PadIn[0], Sx)), Wo_L = Max(Wo_F, LastDefinedOutput(Arg->UsedW, FSx, PadIn[0], Sx)); + int Ho = (Arg->UsedH-FSy+PadIn[2]+PadIn[3])/Sy + 1; + int Ho_F = Min(Ho, FirstDefinedOutput(FSy, PadIn[2], Sy)), Ho_L = Max(Ho_F, LastDefinedOutput(Arg->UsedH, FSy, PadIn[2], Sy)); + + unsigned int CoreId = gap_coreid(); + + if (Arg->Orientation) { // Horizontal + unsigned int Chunk = ChunkSize(Wo); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Wo); + v4s PadOrg = PadIn; + PadIn[0] *= (First==0); PadIn[1] *= (Last==Wo); + if (PoolMax) { + KerMaxPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + if ((int) PadIn) KerMaxPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + } else { + KerAvgPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + if ((int) PadIn) KerAvgPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Max(First, Wo_F), Min(Last, Wo_L), Ho, Ho_F, Ho_L, Sx, Sy); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELUN, 
1); + } else { + unsigned int Chunk = ChunkSize(Ho); + unsigned int First = Chunk*CoreId; + unsigned int Last = Min(First+Chunk, Ho); + v4s PadOrg = PadIn; + PadIn[2] *= (First==0); PadIn[3] *= (Last==Ho); + if (PoolMax) { + KerMaxPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSy, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + if ((int) PadIn) KerMaxPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + } else { + KerAvgPoolNxMStrideSxSy_Body_SQ8(In, Out, FSx, FSx, PadOrg[0], PadOrg[2], W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + if ((int) PadIn) KerAvgPoolNxMStrideSxSy_Border_SQ8(In, Out, FSx, FSy, PadIn, PadOrg, W, H, Wo, Wo_F, Wo_L, Ho, Max(First, Ho_F), Min(Last, Ho_L), Sx, Sy); + } + KerPoolActivation(Out, Wo, Ho, First, Last, Infos, ACT_RELUN, 0); + } + gap_waitbarrier(0); +} diff --git a/tools/autotiler_v3/generators/CNN/CNN_SoftMax.c b/tools/autotiler_v3/generators/CNN/CNN_SoftMax.c index c7e9e6ab4..a1388b9bb 100644 --- a/tools/autotiler_v3/generators/CNN/CNN_SoftMax.c +++ b/tools/autotiler_v3/generators/CNN/CNN_SoftMax.c @@ -236,4 +236,5 @@ void KerParSoftMax_fps(KerSoftMax_fps_T *Arg) InvSum = ((FP2FIX(1.0, 15)<<15)/Sum); for (int i=First; i +#include +#include "CNN_BasicKernels_SQ8.h" + +#define Minu(a, b) (( ((unsigned int)a)<((unsigned int)b) )?((unsigned int)a):((unsigned int)b) ) + +#ifdef __pulp__ +#define Abs(a) __builtin_pulp_abs((a)) +#define Min(a, b) __builtin_pulp_minsi((a), (b)) +#define Max(a, b) __builtin_pulp_maxsi((a), (b)) +#else +#define Abs(a) (((int)(a)<0)?(-(a)):(a)) +#define Min(a, b) (((a)<(b))?(a):(b)) +#define Max(a, b) (((a)>(b))?(a):(b)) +#endif + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); + +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + + + +static unsigned short int IntegerExpLUT[] = +{ + 0x0001, 0x0002, 0x0007, 0x0014, 0x0036, 0x0094, 0x0193, 0x0448, 0x0BA4, 0x1FA7, 0x560A, 0xE9E2 +}; + +static unsigned short int FractionExpLUT[] = +{ + 0x0000, 0x5BF1, 0x31CD, 0x0AF3, 0x4C90, 0x34E2, 0x36E3, 0x510B, 0x7A9F, 0x0ABE, 0x3B9F, 0x1224 +}; + +/* 17.15 fixed point format */ +static unsigned short int ExpCoeffLUT[] = { + 0x7FFF, 0x7FFF, 0x4000, 0x1555, 0x0555, 0x0111, 0x002E, 0x0007, 0x0001 +}; + + +#define ARRAYSIZE(x) (sizeof(x) / sizeof(x[ 0 ])) + +/* X : fixed point, format Q17.15, returns in Q17.15 */ +static unsigned int Exp_fp_17_15(unsigned int X) + +{ + int Y, Result, IntX, FractX, ScaledInt; + short int Z_s, FractX_s; + unsigned short int ScaledFract; + + if (!X) return 0x8000; + Y = Abs(X); + IntX = (Y >> 15); + FractX = (Y & 0x7FFF); + if (gap_bitextractu(FractX, 1, 14)) { + /* Taylor series converges quickly only when | FractX | < 0.5 */ + FractX -= 0x8000; IntX++; + } + if (IntX >= (int) ARRAYSIZE (IntegerExpLUT)) { + if (Y==X) return 0x7FFFFFFF; else return 0; + } + + ScaledInt = IntegerExpLUT[IntX]; ScaledFract = FractionExpLUT[IntX]; + /* Taylor's series: exp(x) = 1 + x + x ^ 2 / 2 + x ^ 3 / 3! + x ^ 4 / 4! + x ^ 5 / 5! + x ^ 6 / 6! + x ^ 7 / 7! + x ^ 8 / 8! 
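+	   The argument is split into integer and fractional parts: exp(IntX) is taken
+	   from the IntegerExpLUT/FractionExpLUT pair, exp(FractX) is evaluated with the
+	   truncated series above using the Q17.15 coefficients in ExpCoeffLUT, and the
+	   two partial results are multiplied back together before returning.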
*/ + FractX_s = FractX; Z_s = FractX; Result = 0; + for (int i = 1; i < ARRAYSIZE (ExpCoeffLUT); i++) { + Result += Z_s*ExpCoeffLUT[i]; // gap_macs(Result, Z, ExpCoeffLUT[ i ]); + Z_s = gap_mulsRN(Z_s, FractX_s, 15); + } + Result = gap_roundnorm(Result, 15) + ExpCoeffLUT[0]; + unsigned short int U_Res = Result; + Result = gap_muluRN(U_Res, ScaledFract, 15) + U_Res * ScaledInt; + if (Result && (X > 0x7FFFFFFF)) + Result = ((0x7FFFFFFF / Result) >> 1); /* negative value */ + return (unsigned int) Result; +} + +#if WITH16B +void KerParSoftMax_fp(KerSoftMax_fp_T *Arg) + +{ + short int * __restrict__ In = Arg->In; + short int * __restrict__ Out = Arg->Out; + int N = Arg->N; + unsigned Norm = Arg->Norm; + static L1_CL_MEM int Reduct[8]; + int M, Sum, InvSum; + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(N); + unsigned int First = CoreId*ChunkCell; + unsigned int Last = Min(First+ChunkCell, N); + unsigned int *Red = &Reduct[CoreId]; + + /* Turns In into distribution */ + /* Find max */ + M = 0x80000000; + for (int i=First; iIn; + short int * __restrict__ Out = Arg->Out; + int N = Arg->N; + unsigned Norm = Arg->Norm; + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(N); + unsigned int First = CoreId*ChunkCell; + unsigned int Last = Min(First+ChunkCell, N); + + for (int i=First; i0)?-1:1; + unsigned int Exp = Exp_fp_17_15((NotSign*In[i])<<(16-Norm)); /* Exp(2x) or Exp(-2x) */ + int Y = ((int)Exp + FP2FIX(+1.0, 15)); + + if (In>0) Thx =((FP2FIX(1.0, 15) - Exp)<<15)/Y; + else { + int X = ((int)Exp + FP2FIX(-1.0, 15)); + Thx = ((X<<15)/(Y>>1))>>1; + } + Out[i] = gap_roundnorm_reg(Thx, (15-Norm)); + } +} + + +int Tanh_fp(int In, int Norm) + +{ + + int Thx, Out; + int NotSign = (In>0)?-1:1; + unsigned int Exp = Exp_fp_17_15((NotSign*In)<<(16-Norm)); /* Exp(2x) or Exp(-2x) */ + + int Y = ((int)Exp + FP2FIX(+1.0, 15)); + if (In>0) Thx =((FP2FIX(1.0, 15) - Exp)<<15)/Y; + else { + int X = ((int)Exp + FP2FIX(-1.0, 15)); + Thx = ((X<<15)/(Y>>1))>>1; + } + Out = gap_roundnorm_reg(Thx, (15-Norm)); + return Out; +} +#endif + +void KerParSoftMax_SQ8(KerSoftMax_SQ8_T *Arg) + +{ + signed char * __restrict__ In = Arg->In; + short int * __restrict__ Out = Arg->Out; + int N = Arg->N; + int Norm = Arg->Infos[AT_INF_BIASL_SM]; + static L1_CL_MEM int Reduct[8]; + int M, Sum, InvSum; + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(N); + unsigned int First = CoreId*ChunkCell; + unsigned int Last = Min(First+ChunkCell, N); + unsigned int *Red = &Reduct[CoreId]; + + /* Turns In into distribution */ + /* Find max */ + M = 0x80000000; + for (int i=First; i Pad Exec if several kernels are cascaded */ + v4s PadExec, /**< Left, Right, Top, Bottom amount of pad, actual pad to be used at kernel exec time */ + unsigned int ItemSize, /**< Data plane basic data type size in bytes */ + int TileOverlap, /**< Amount of overlap between 2 adjacent tiles */ + KernelArgConstraints_T Constraint, /**< Kernel argument constraints */ + unsigned int PreferedTileSize, /**< Tile variable dimension must be a multiple of PreferedTileSize if not 0 */ + char *CArgName /**< To which user kernel C argument this kernel argument is related to */ + ); + +/** +@brief Creates one user kernel argument, extra pad on variable dim for alignment sake. Kernel argument Space is explicitely described + +Creates one user kernel argument, extra pad on variable dim for alignment sake. 
Kernel argument Space is explicitely described +*/ +Object_T *KerArgPadAlign( + char *KerArgName, /**< Kernel argument name */ + KernelArgDimDescrT *KerArgSpace, /**< Kernel argument space descriptor */ + Object_Type_T ObjType, /**< Kernel argument type: logical OR of types (O_xxx) or pre defined types */ + unsigned int W, /**< Kernel argument Data plane width */ + unsigned int H, /**< Kernel argument Data plane height */ + unsigned int TileWPadAlign, /**< Add TilePadAlign to the width of the tile, use adjust tile alignment through tile expansion */ + unsigned int ItemSize, /**< Data plane basic data type size in bytes */ + unsigned int RawItemSize, /**< In case ItemSize has to be padded this is the ItemSize before padding */ + int TileOverlap, /**< Amount of overlap between 2 adjacent tiles */ + KernelArgConstraints_T Constraint, /**< Kernel argument constraints */ + unsigned int PreferedTileSize, /**< Tile variable dimension must be a multiple of PreferedTileSize if not 0 */ + char *CArgName /**< To which user kernel C argument this kernel argument is related to */ + ); /** @@ -998,6 +1052,27 @@ void AddKernelArgDim( ... /**< List of space dimensions from outer to inner, most inner is the item size */ ); +/** +@brief For 2D parametric const arg interleave each tile by group of TileLineInterleave Lines + +For 2D parametric const arg interleave each tile by group of TileLineInterleave Lines +*/ +void SetKernelArgInterleave( + char *Name, /**< Kernel Name */ + char *ArgName, /**< Argument Name */ + unsigned int TileLineInterleave /**< Number of lines of the interleaved group of lines */ + ); + +/** +@brief Set L2DB (in L3) property to Kernel Name, Kernel Argument ArgName + +Set L2DB (in L3) property to Kernel Name, Kernel Argument ArgName +*/ +void SetKerArgInL3( + char *Name, /**< Kernel Name */ + char *ArgName /**< Argument Name */ + ); + /** @brief Alter the behaviour of UserKernel processing @@ -1092,6 +1167,11 @@ StackedTensors_T *AT_StackedTensors( ... ); +void AddStackedTensors( + char *OutTensorName, + int Count, + ... + ); /** @brief Creates a list of stacked tensors @@ -1247,6 +1327,18 @@ ArgBindingDescr_T *GNodeArgImmOper( int Value /**< Offset value */ ); +/** +@brief Add a pair of symbols to a graph C arg to pass it's allocated address and memory location + +Add a pair of symbols to a graph C arg to pass it's allocated address and memory location +*/ +void AddGraphArgExportSymbols( + char *GraphCArgName, /**< Graph CArg name, should be CArg with scope=ARG_SCOPE_GLOBAL */ + char *ExportAddrName, /**< Legal C Name to store Graph CArg allocated address, type is unsigned int */ + char *ExportLocName /**< Legal C Name to store n which memort Graph CArg has been allocated, an int */ + ); + + /** @brief Binds a given Graph node arg, simplified form @@ -1412,15 +1504,27 @@ char *CNN_FindMatchingKernel( ); /** -@brief Returns a C type for an argument given it's size in byte +@brief Returns a signed C type for an argument given it's size in byte -Returns a C type for an argument given it's size in byte +Returns a signed C type for an argument given it's size in byte */ char *CNN_ArgDataType( int DataSize, /**< Argument size in byte (1,2 or 4) */ int Pointer, /**< Is this argument a pointer */ int Restrict /**< In case this argument is a pointer can it be restricted? 
*/ ); + +/** +@brief Returns an unsigned C type for an argument given it's size in byte + +Returns an unsigned C type for an argument given it's size in byte +*/ +char *CNN_ArgDataTypeUns( + int DataSize, /**< Argument size in byte (1,2 or 4) */ + int Pointer, /**< Is this argument a pointer */ + int Restrict /**< In case this argument is a pointer can it be restricted? */ + ); + /** @brief For merged CNN layers retrieves composite Layer operation from individual operations. @@ -1507,7 +1611,17 @@ extern void AT_PrepareForTest(char *Name, KernelOper_T KerOper, int Norm, int NormBias); +extern void AT_PrepareForTest_SQ8( + char *Name, + int InFeat, int OutFeat, int Width, int Height, + int BiasDataSize, + KernelOper_T OpC, int Fcx, int Fcy, int Dcx, int Dcy, int Scx, int Scy, v4s PadC, + KernelOper_T OpP, int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, v4s PadP, + KernelOper_T OpA + ); + extern void AT_TestFinalize(); +extern void AT_TestFinalize_SQ8(); extern void DecodeCNNOper( KernelOper_T Oper, diff --git a/tools/autotiler_v3/include/AutoTilerLibTypes.h b/tools/autotiler_v3/include/AutoTilerLibTypes.h index 9054ba670..856fffc4a 100644 --- a/tools/autotiler_v3/include/AutoTilerLibTypes.h +++ b/tools/autotiler_v3/include/AutoTilerLibTypes.h @@ -34,6 +34,7 @@ typedef enum { /* Primitive operations */ KOP_SETBIAS, KOP_SETBIAS_DP, + KOP_CONV_HWCE, KOP_CONV, KOP_CONV_DP, KOP_CONV_DW, @@ -52,7 +53,6 @@ typedef enum { KOP_GLOBAL_AVGPOOL_REDUCT, KOP_RELU, KOP_RELUN, - KOP_RELUN_VECTOR, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, @@ -70,6 +70,7 @@ typedef enum { KOP_MATSCALE_VECTOR, KOP_MATSCALE_SCALAR, KOP_MATSCALE_VECTOR_SCALAR, + KOP_MATVECTMUL, KOP_MATTRANSP, KOP_MATPERM_CHW2CWH, KOP_MATPERM_CHW2HWC, @@ -77,6 +78,8 @@ typedef enum { KOP_MATPERM_CHW2WCH, KOP_MATPERM_CHW2HCW, KOP_SOFTMAX, + KOP_EXPAND, + KOP_COLLAPSE, /* Grouped operations */ KOP_CONV_RELU, @@ -130,15 +133,16 @@ typedef enum { } KernelOper_T; +#if 0 typedef enum { KACT_NONE = 0, KACT_RELU, KACT_RELUN, - KACT_RELUN_VECTOR, KACT_HSIGMOID, KACT_HSWISH, KACT_LEAKY, } CNN_ActivationOper_T; +#endif typedef enum { PAD_LEFT, /* All padding elements are inserted on the left/top */ @@ -268,30 +272,6 @@ typedef enum { extern KernelCallLocationT IterCallLocation[][CALL_LAST]; -#define LOC_INNER_LOOP 0 -#define LOC_INNER_LOOP_PROLOG 1 -#define LOC_INNER_LOOP_EPILOG 2 - -#define LOC_INNER_LOOP1 3 -#define LOC_INNER_LOOP1_PROLOG 4 -#define LOC_INNER_LOOP1_EPILOG 5 - -#define LOC_INNER_LOOP2 6 -#define LOC_INNER_LOOP2_PROLOG 7 -#define LOC_INNER_LOOP2_EPILOG 8 - -#define LOC_IN_PLANE 9 -#define LOC_IN_PLANE_PROLOG 10 -#define LOC_IN_PLANE_EPILOG 11 - -#define LOC_OUT_PLANE 12 -#define LOC_OUT_PLANE_PROLOG 13 -#define LOC_OUT_PLANE_EPILOG 14 - -#define LOC_IN_OUT_PLANE 15 -#define LOC_IN_OUT_PLANE_PROLOG 16 -#define LOC_IN_OUT_PLANE_EPILOG 17 - /** @brief User kernel tiling orientation @@ -309,20 +289,17 @@ typedef enum { /** @brief User kernel argument constraints -User kernel argument constraints +User kernel argument constraints. 
Max 16 of them */ typedef enum { OBJ_CONSTRAINTS_NONE = 0, /**< No constraints on this user kernel argument */ - OBJ_CONSTRAINTS_EVEN = (1<<1), /**< Variable tile size generated for this user kernel should be even */ - OBJ_CONSTRAINTS_ODD = (1<<2), /**< Variable tile size generated for this user kernel should be odd */ - OBJ_CONSTRAINTS_ONEPREFTILE = (1<<3), /**< This user kernel argument has a prefered tile variable size, use only a single tile for it */ - OBJ_CONSTRAINTS_TILE_HOR = (1<<4), /**< Force this kernel argument to be tiled horizontaly */ - OBJ_CONSTRAINTS_TILE_VER = (1<<5), /**< Force this kernel argument to be tiled verticaly */ - OBJ_CONSTRAINTS_PAD_REM = (1<<6), /**< When argument has non integer dim ratio use last tile to recover missing elements if possible */ - OBJ_CONSTRAINTS_DROP_REM = (1<<7), /**< When argument has non integer dim ratio simply drop them */ - OBJ_CONSTRAINTS_DYNAMIC = (1<<8), /**< When argument has non integer dim ratio dynamically evaluate tile size using DimRatio */ - OBJ_CONSTRAINTS_2D = (1<<9), /**< Argument is 2D strided */ - OBJ_CONSTRAINTS_3D = (1<<10), + OBJ_CONSTRAINTS_ONEPREFTILE = (1<<0), /**< This user kernel argument has a prefered tile variable size, use only a single tile for it */ + OBJ_CONSTRAINTS_TILE_HOR = (1<<1), /**< Force this kernel argument to be tiled horizontaly */ + OBJ_CONSTRAINTS_TILE_VER = (1<<2), /**< Force this kernel argument to be tiled verticaly */ + OBJ_CONSTRAINTS_PAD_REM = (1<<3), /**< When argument has non integer dim ratio use last tile to recover missing elements if possible */ + OBJ_CONSTRAINTS_DROP_REM = (1<<4), /**< When argument has non integer dim ratio simply drop them */ + OBJ_CONSTRAINTS_DYNAMIC = (1<<5), /**< When argument has non integer dim ratio dynamically evaluate tile size using DimRatio */ + OBJ_CONSTRAINTS_2D = (1<<6), /**< Argument is 2D strided */ } KernelArgConstraints_T; /** @@ -346,23 +323,25 @@ typedef enum { KER_ARG_W = 12, /**< User kernel argument width */ KER_ARG_H = 13, /**< User kernel argument height */ KER_ARG_NTILES = 14, /**< Number of tiles for related user kernel argument */ - KER_ARG_TILEINDEX = 15, /**< Current tile index for related user kernel argument, starts at 0 */ - KER_ARG_TILE_BASE = 16, /**< Current tile base in line or column unit, when argument is dynamic it is computed at runtime */ - KER_ARG_IT_INDEX = 17, /**< Actual value of iterator attached to ItSpace */ - KER_ARG_PAD = 18, /**< Actual padding of a feature space associated to arg (left,right,top,bottom) as a v4s */ - KER_ARG_TILE_PAD = 19, /**< Actual padding of tile associated to arg (left,right,top,bottom) as a v4s */ - KER_ARG_PARTILE_DIM = 20, /**< Actual dimension of a parametric space */ - KER_ARG_PARTILE_SIZE = 21, /**< Size of a tile from a parametric space */ - KER_ARG_LOADEDPARTILE_SIZE = 22,/**< Size of a tile from a parametric space, in case the related subspace has been promoted to partial buffer returns the dimension of this subspace otherwise is equal to KER_ARG_PARTILE_SIZE */ - KER_IT_INDEX = 23, /**< Actual value of a given kernel iterator */ - - TC_ARG = 24, /**< A C argument */ - TC_IMM = 25, /**< An immediate int value */ - TC_USYMB = 26, /**< A user defined symbol */ - TC_KDIM = 27, /**< One of the user Kernel Dimensions */ - TC_ARG_IND = 28, /**< An indirection on a C argument */ - TC_ARG_IND_IT_INDEX = 29, /**< An indirection on a C argument with respect to actual value of ItSpace */ - TC_ARG_PLUS_IT_INDEX = 30, /**< A C argument added to actual value of ItSpace, ItSpace multiplied by a 
constant */ + KER_ARG_TILEFIRST = 15, /**< Predicate, != 0 if if current tile is the first one */ + KER_ARG_TILELAST = 16, /**< Predicate, != 0 if current tile is the last one */ + KER_ARG_TILEINDEX = 17, /**< Current tile index for related user kernel argument, starts at 0 */ + KER_ARG_TILE_BASE = 18, /**< Current tile base in line or column unit, when argument is dynamic it is computed at runtime */ + KER_ARG_IT_INDEX = 19, /**< Actual value of iterator attached to ItSpace */ + KER_ARG_PAD = 20, /**< Actual padding of a feature space associated to arg (left,right,top,bottom) as a v4s */ + KER_ARG_TILE_PAD = 21, /**< Actual padding of tile associated to arg (left,right,top,bottom) as a v4s */ + KER_ARG_PARTILE_DIM = 22, /**< Actual dimension of a parametric space */ + KER_ARG_PARTILE_SIZE = 23, /**< Size of a tile from a parametric space */ + KER_ARG_LOADEDPARTILE_SIZE = 24,/**< Size of a tile from a parametric space, in case the related subspace has been promoted to partial buffer returns the dimension of this subspace otherwise is equal to KER_ARG_PARTILE_SIZE */ + KER_IT_INDEX = 25, /**< Actual value of a given kernel iterator */ + + TC_ARG = 26, /**< A C argument */ + TC_IMM = 27, /**< An immediate int value */ + TC_USYMB = 28, /**< A user defined symbol */ + TC_KDIM = 29, /**< One of the user Kernel Dimensions */ + TC_ARG_IND = 30, /**< An indirection on a C argument */ + TC_ARG_IND_IT_INDEX = 31, /**< An indirection on a C argument with respect to actual value of ItSpace */ + TC_ARG_PLUS_IT_INDEX = 32, /**< A C argument added to actual value of ItSpace, ItSpace multiplied by a constant */ /* Deprecated */ @@ -529,7 +508,8 @@ typedef enum { BIND_OP_MOD=6, BIND_OP_LSHIFT=7, BIND_OP_RSHIFT=8, - BIND_OP_LAST=7, + BIND_OP_AT_INDEX=9, + BIND_OP_LAST=10, } ArgBindingOper; /* Internal tiler data structures */ @@ -689,6 +669,8 @@ typedef struct { ConstInit_T *Init; /* How to initialize in case Kernel argument is a constant */ Kernel_Arg_T *KerArg; /* In case C arg is referenced into a Ker Arg gives a straight access to this kernel arguement */ GraphEdgeWeb_T *GraphSymbol; /* Pointer to related CNN graph argument */ + NameT *ExportSymbolName; /* For Graph CArg allocated by the autotiler external name to be used to export CArg address to outside world */ + NameT *ExportSymbolLoc; /* For Graph CArg allocated by the autotiler external name to be used to export CArg mem location to outside world */ } CArg_Descriptor_T; #define HAS_ARG_INFO(Arg) ((Arg) && (Arg)->CArg && (Arg)->CArg->ArgInfo) @@ -706,6 +688,7 @@ typedef struct { NameT *ValueKernelArg; /* When a second C arg is needed */ KernelIteratorT ItSpace; /* In case an iterator name is needed */ CArg_Descriptor_T *ArgInfo; + NameT *KerArgAccessType; } CKernel_Arg_T; typedef enum {GNA_UNDEF, GNA_IN, GNA_OUT, GNA_INOUT} GraghNodeArgT; @@ -724,6 +707,7 @@ typedef struct { CKernel_Arg_T *AliasTargetArgDescr; CKernel_Arg_T *SourceArgDescr; KernelIteratorT ItSpace; /* In case an iterator name is needed */ + NameT *KerArgAccessType; } ArgBindingDescr_T; typedef struct { @@ -751,6 +735,8 @@ typedef struct { char UsedLength[2*CG_MAX_PIPE_DEPTH+1]; /* To tack 2D length of tiles (if arg is 2D) for proper variable declaration */ int ArgNDim; /* Number of dimensions of this argumentt */ int *ArgDim; /* Space dimension from outer to inner, most inner dim is item size */ + unsigned int TileLineInterleave; /* In case related arg is 2D parametric and constant interleave tile lines by group of TileLineInterleave lines, + remainder is kept non interleaved */ } 
KerArgInfos_T; #define TILE_PTR(PipeOff) ((PipeOff) + CG_MAX_PIPE_DEPTH) @@ -778,8 +764,9 @@ typedef struct A_Kernel_Arg_T { unsigned int Height; unsigned int UsedHeight; unsigned int UsedH; - int Overlap; + int TileOverlap; /* By how much 2 adjacent tiles should overlap, can be negative in case of non unit stride */ unsigned int DimRatio; + float FDimRatio; unsigned int DimOff; unsigned int DimRem; unsigned int Constraints; @@ -788,6 +775,7 @@ typedef struct A_Kernel_Arg_T { unsigned int Pad[4]; unsigned int ArgPad[4]; int ItemSize; + int RawItemSize; unsigned int MoveSize[4]; /* [D1][D0] or [D0][T] or [T] D1,D0 parameteric spaces, T tileable space. D1/D0/T=0 Std tile, D1/D0/T=1 Last Tile */ unsigned int MoveStride; unsigned int MoveStride1D[2]; @@ -827,8 +815,10 @@ typedef struct A_Object_T { unsigned int ArgStride; unsigned int BottomBuffer; unsigned int TopBuffer; + unsigned int TileWPadAlign; /* Number of points to be added to the width of the tile, object should be O_TILED */ int ItemSize; - int Overlap; + int RawItemSize; + int TileOverlap; /* By how much 2 adjacent tiles should overlap, can be negative in case of non unit stride */ unsigned int Alignment; unsigned int PreferedTileSize; unsigned int PrefRem; /* Tile size should be Ts = PrefRem + K * PreferedTileSize */ @@ -923,14 +913,20 @@ typedef struct A_Kernel_T { } Kernel_T; typedef struct { - int TileOrientation; /* Set Tiling orientation TILE_HOR TILE_VER */ - int ParallelFeatures; /* Parallelize along channels */ - int ForceDPconv; /* Forces double precision convolution*/ - int UseHwCE; /* Enable HW CE */ + char TileOrientation; /* Set Tiling orientation TILE_HOR TILE_VER */ + char ParallelFeatures; /* Parallelize along channels */ + char ForceDPconv; /* Forces double precision convolution*/ + char UseHwCE; /* Enable HW CE */ AT_PadType PadType; /* Control padding strategy */ - int EnableIm2Col; /* Enable mat mul based convolution when feasible */ + char EnableIm2Col; /* Enable mat mul based convolution when feasible */ int ReluN; /* if != -1 Overides 6 as a default value for ReLUN */ - int MulBiasScalar; /* if != -1 Overides default non scalar for MulBias convolutions */ + char MulBiasScalar; /* if != -1 Overides default non scalar for MulBias convolutions */ + char In_L3; /* if != 0 In (or In1) forced to be in L3 memory */ + char Filter_L3; /* if != 0 Filter (or In2) forced to be in L3 memory */ + char Bias_L3; /* if != 0 Bias forced to be in L3 memory */ + char Out_L3; /* if != 0 Out forced to be in L3 memory */ + char Scale_L3; /* if != 0 Scale forced to be in L3 memory */ + char ScaleN_L3; /* if != 0 ScaleN forced to be in L3 memory */ } CNN_GenControl_T; typedef struct { @@ -1144,8 +1140,12 @@ typedef struct { #define Q2F(V, N) ((float) (((float) (V))/((1<<(N))-0))) #define MultRndu(x,y, scale) ((unsigned int)(((x)*(y)) + (1<<((scale)-1)))>>(scale)) +#ifndef Max #define Max(a, b) (((a)>(b))?(a):(b)) +#endif +#ifndef Min #define Min(a, b) (((a)<(b))?(a):(b)) +#endif /* Return aligned value, alignment is 2^Size */ #define ALIGN(Value, Size) (((Value)&((1<<(Size))-1))?((((Value)>>(Size))+1)<<(Size)):(Value)) diff --git a/tools/autotiler_v3/include/GapBuiltins.h b/tools/autotiler_v3/include/GapBuiltins.h index eed969fcc..c3ef8eceb 100644 --- a/tools/autotiler_v3/include/GapBuiltins.h +++ b/tools/autotiler_v3/include/GapBuiltins.h @@ -173,6 +173,8 @@ static inline unsigned int ExtInsMaskSafe(unsigned int Size, unsigned int Offset #define gap_bitinsert_r(dst, src, size, off) __builtin_pulp_binsert_r((dst), (src), 
ExtInsMaskFast((size), (off))) #define gap_bitinsert_r_safe(dst, src, size, off) __builtin_pulp_binsert_r((dst), (src), ExtInsMaskSafe((size), (off))) +/* Bit clear */ +/* Bit set */ /* 1 bit rotation to the right, 32 bits input */ #define gap_rotr(x) __builtin_pulp_rotr((x)) diff --git a/tools/autotiler_v3/include/at_api_emul.h b/tools/autotiler_v3/include/at_api_emul.h index 5addaee98..f3593aac8 100644 --- a/tools/autotiler_v3/include/at_api_emul.h +++ b/tools/autotiler_v3/include/at_api_emul.h @@ -216,7 +216,7 @@ typedef int AT_HYPERFLASH_FS_CL_EVENT; #define AT_HYPERFLASH_FS_FC_COPY(file,ext,loc,size,dir,event) \ __at_hyperflash_fs_copy(*(file), ext, loc, size, dir) -#define AT_HYPERFLASH_FS_FC_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ +#define AT_HYPERFLASH_FS_FC_COPY2D(file, ext,loc,size,stride,len,dir,event) \ __at_hyperflash_fs_copy_2d(*(file), ext, loc, size, stride, len, dir) #define AT_HYPERFLASH_FS_FC_WAIT(file,event) @@ -224,7 +224,7 @@ typedef int AT_HYPERFLASH_FS_CL_EVENT; #define AT_HYPERFLASH_FS_CL_COPY(file,ext,loc,size,dir,event) \ __at_hyperflash_fs_copy(*(file), ext, loc, size, dir) -#define AT_HYPERFLASH_FS_CL_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ +#define AT_HYPERFLASH_FS_CL_COPY2D(file, ext,loc,size,stride,len,dir,event) \ __at_hyperflash_fs_copy_2d(*(file), ext, loc, size, stride, len, dir) #define AT_HYPERFLASH_FS_CL_WAIT(file,event) diff --git a/tools/autotiler_v3/include/at_api_pmsis.h b/tools/autotiler_v3/include/at_api_pmsis.h index 6ef177d15..fbb136742 100644 --- a/tools/autotiler_v3/include/at_api_pmsis.h +++ b/tools/autotiler_v3/include/at_api_pmsis.h @@ -37,33 +37,33 @@ static inline void gap_fc_starttimer() { pi_perf_conf(1<flash = &file->hyperflash; - if (is_write) +#ifdef __FLASH_FS_SEMIHOST__ + printf("Open in semi host mode\n"); conf->type = PI_FS_HOST; +#endif pi_open_from_conf(&file->fs, conf); if (pi_fs_mount(&file->fs)) { @@ -254,8 +256,8 @@ static inline void __at_hyperflash_fs_close(AT_HYPERFLASH_FS_T *file) #define AT_HYPERFLASH_FS_FC_COPY(fs,ext,loc,size,dir,event) \ pi_fs_copy_async((fs)->file, ext, loc, size, !(dir), pi_task_block(event)) -#define AT_HYPERFLASH_FS_FC_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ - pi_fs_copy_2d_async(file->file, ext, loc, size, stride, len, !(dir), pi_task_block(event)) +#define AT_HYPERFLASH_FS_FC_COPY2D(fs,ext,loc,size,stride,len,dir,event) \ + pi_fs_copy_2d_async((fs)->file, ext, loc, size, stride, len, !(dir), pi_task_block(event)) #define AT_HYPERFLASH_FS_FC_WAIT(file,event) \ pi_task_wait_on(event) @@ -263,8 +265,8 @@ static inline void __at_hyperflash_fs_close(AT_HYPERFLASH_FS_T *file) #define AT_HYPERFLASH_FS_CL_COPY(fs,ext,loc,size,dir,event) \ pi_cl_fs_copy((fs)->file, ext, loc, size, !(dir), event) -#define AT_HYPERFLASH_FS_CL_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ - pi_cl_fs_copy_2d(file->file, ext, loc, size, stride, len, !(dir), event) +#define AT_HYPERFLASH_FS_CL_COPY2D(fs,ext,loc,size,stride,len,dir,event) \ + pi_cl_fs_copy_2d((fs)->file, ext, loc, size, stride, len, !(dir), event) #define AT_HYPERFLASH_FS_CL_WAIT(file,event) \ pi_cl_fs_wait(event) diff --git a/tools/gap-configs/configs/chips/gap9_v2/gap9_v2.json b/tools/gap-configs/configs/chips/gap9_v2/gap9_v2.json index 06d6e60ad..53c7c6e9d 100644 --- a/tools/gap-configs/configs/chips/gap9_v2/gap9_v2.json +++ b/tools/gap-configs/configs/chips/gap9_v2/gap9_v2.json @@ -70,7 +70,7 @@ "base": "0x10000000", "alias": "0x00000000", "size": "0x00400000", - "core": 
"ri5ky_v2_6_sfloat_single_regfile", + "core": "ri5ky_v2_6_sfloat_single_regfile_int64", "version": 5, "json_file": "cluster_v6", "nb_cluster": 1, diff --git a/tools/gap-configs/configs/chips/gap9_v2/gap9_v2_rtl.json b/tools/gap-configs/configs/chips/gap9_v2/gap9_v2_rtl.json index 78c71329b..d1f3c668f 100644 --- a/tools/gap-configs/configs/chips/gap9_v2/gap9_v2_rtl.json +++ b/tools/gap-configs/configs/chips/gap9_v2/gap9_v2_rtl.json @@ -15,39 +15,144 @@ } }, - "components": { - "mic0": { - "@includes@": ["devices/microphone.json"] - }, - "mic1": { - "@includes@": ["devices/microphone.json"] - }, - "mic2": { - "@includes@": ["devices/microphone.json"] - }, - "mic3": { - "@includes@": ["devices/microphone.json"] - }, - "sink0": { - "@includes@": ["devices/speaker.json"] + "@cond@": { + "@os.environ.get('GVSOC_TESTBENCH') is None@": { + "components": { + "mic0": { + "@includes@": ["devices/microphone.json"] + }, + "mic1": { + "@includes@": ["devices/microphone.json"] + }, + "mic2": { + "@includes@": ["devices/microphone.json"] + }, + "mic3": { + "@includes@": ["devices/microphone.json"] + }, + "sink0": { + "@includes@": ["devices/speaker.json"] + }, + "uart": { + "@includes@": ["devices/uart_checker.json"] + } + }, + + "bindings": [ + [ "chip.uart0", "uart.input" ], + [ "mic0.i2s", "chip.i2s0" ], + [ "mic1.i2s", "chip.i2s0" ], + [ "mic2.i2s", "chip.i2s0" ], + [ "mic3.i2s", "chip.i2s0" ], + [ "mic0.ws_out", "mic1.ws_in" ], + [ "mic1.ws_out", "mic2.ws_in" ], + [ "mic2.ws_out", "mic3.ws_in" ], + [ "sink0.i2s", "chip.i2s0" ] + ] }, - "uart": { - "@includes@": ["devices/uart_checker.json"] + + "@os.environ.get('GVSOC_TESTBENCH') is not None@": { + "components": { + "testbench": { + "@includes@": ["devices/testbench.json"] + } + }, + "bindings": [ + [ "chip.uart0", "testbench.ctrl" ], + [ "chip.gpio0", "testbench.gpio0" ], + [ "chip.gpio1", "testbench.gpio1" ], + [ "chip.gpio2", "testbench.gpio2" ], + [ "chip.gpio3", "testbench.gpio3" ], + [ "chip.gpio4", "testbench.gpio4" ], + [ "chip.gpio5", "testbench.gpio5" ], + [ "chip.gpio6", "testbench.gpio6" ], + [ "chip.gpio7", "testbench.gpio7" ], + [ "chip.gpio8", "testbench.gpio8" ], + [ "chip.gpio9", "testbench.gpio9" ], + [ "chip.gpio10", "testbench.gpio10" ], + [ "chip.gpio11", "testbench.gpio11" ], + [ "chip.gpio12", "testbench.gpio12" ], + [ "chip.gpio13", "testbench.gpio13" ], + [ "chip.gpio14", "testbench.gpio14" ], + [ "chip.gpio15", "testbench.gpio15" ], + [ "chip.gpio16", "testbench.gpio16" ], + [ "chip.gpio17", "testbench.gpio17" ], + [ "chip.gpio18", "testbench.gpio18" ], + [ "chip.gpio19", "testbench.gpio19" ], + [ "chip.gpio20", "testbench.gpio20" ], + [ "chip.gpio21", "testbench.gpio21" ], + [ "chip.gpio22", "testbench.gpio22" ], + [ "chip.gpio23", "testbench.gpio23" ], + [ "chip.gpio24", "testbench.gpio24" ], + [ "chip.gpio25", "testbench.gpio25" ], + [ "chip.gpio26", "testbench.gpio26" ], + [ "chip.gpio27", "testbench.gpio27" ], + [ "chip.gpio28", "testbench.gpio28" ], + [ "chip.gpio29", "testbench.gpio29" ], + [ "chip.gpio30", "testbench.gpio30" ], + [ "chip.gpio31", "testbench.gpio31" ], + [ "chip.gpio32", "testbench.gpio32" ], + [ "chip.gpio33", "testbench.gpio33" ], + [ "chip.gpio34", "testbench.gpio34" ], + [ "chip.gpio35", "testbench.gpio35" ], + [ "chip.gpio36", "testbench.gpio36" ], + [ "chip.gpio37", "testbench.gpio37" ], + [ "chip.gpio38", "testbench.gpio38" ], + [ "chip.gpio39", "testbench.gpio39" ], + [ "chip.gpio40", "testbench.gpio40" ], + [ "chip.gpio41", "testbench.gpio41" ], + [ "chip.gpio42", "testbench.gpio42" ], + [ 
"chip.gpio43", "testbench.gpio43" ], + [ "chip.gpio44", "testbench.gpio44" ], + [ "chip.gpio45", "testbench.gpio45" ], + [ "chip.gpio46", "testbench.gpio46" ], + [ "chip.gpio47", "testbench.gpio47" ], + [ "chip.gpio48", "testbench.gpio48" ], + [ "chip.gpio49", "testbench.gpio49" ], + [ "chip.gpio50", "testbench.gpio50" ], + [ "chip.gpio51", "testbench.gpio51" ], + [ "chip.gpio52", "testbench.gpio52" ], + [ "chip.gpio53", "testbench.gpio53" ], + [ "chip.gpio54", "testbench.gpio54" ], + [ "chip.gpio55", "testbench.gpio55" ], + [ "chip.gpio56", "testbench.gpio56" ], + [ "chip.gpio57", "testbench.gpio57" ], + [ "chip.gpio58", "testbench.gpio58" ], + [ "chip.gpio59", "testbench.gpio59" ], + [ "chip.gpio60", "testbench.gpio60" ], + [ "chip.gpio61", "testbench.gpio61" ], + [ "chip.gpio62", "testbench.gpio62" ], + [ "chip.gpio63", "testbench.gpio63" ], + [ "chip.gpio64", "testbench.gpio64" ], + [ "chip.gpio65", "testbench.gpio65" ], + [ "chip.gpio66", "testbench.gpio66" ], + [ "chip.gpio67", "testbench.gpio67" ], + [ "chip.gpio68", "testbench.gpio68" ], + [ "chip.gpio69", "testbench.gpio69" ], + [ "chip.gpio70", "testbench.gpio70" ], + [ "chip.gpio71", "testbench.gpio71" ], + [ "chip.gpio72", "testbench.gpio72" ], + [ "chip.gpio73", "testbench.gpio73" ], + [ "chip.gpio74", "testbench.gpio74" ], + [ "chip.gpio75", "testbench.gpio75" ], + [ "chip.gpio76", "testbench.gpio76" ], + [ "chip.gpio77", "testbench.gpio77" ], + [ "chip.gpio78", "testbench.gpio78" ], + [ "chip.gpio79", "testbench.gpio79" ], + [ "chip.gpio80", "testbench.gpio80" ], + [ "chip.gpio81", "testbench.gpio81" ], + [ "chip.gpio82", "testbench.gpio82" ], + [ "chip.gpio83", "testbench.gpio83" ], + [ "chip.gpio84", "testbench.gpio84" ], + [ "chip.gpio85", "testbench.gpio85" ], + [ "chip.gpio86", "testbench.gpio86" ], + [ "chip.gpio87", "testbench.gpio87" ], + [ "chip.gpio88", "testbench.gpio88" ], + [ "chip.gpio89", "testbench.gpio89" ] + ] } }, - - "bindings": [ - [ "chip.uart0", "uart.input" ], - [ "mic0.i2s", "chip.i2s0" ], - [ "mic1.i2s", "chip.i2s0" ], - [ "mic2.i2s", "chip.i2s0" ], - [ "mic3.i2s", "chip.i2s0" ], - [ "mic0.ws_out", "mic1.ws_in" ], - [ "mic1.ws_out", "mic2.ws_in" ], - [ "mic2.ws_out", "mic3.ws_in" ], - [ "sink0.i2s", "chip.i2s0" ] - ], - + "chip": { "@includes@": ["ips/dpi/chip_wrapper.json"], @@ -177,7 +282,32 @@ "gpio61": { "type": "gpio", "is_master": true }, "gpio62": { "type": "gpio", "is_master": true }, "gpio63": { "type": "gpio", "is_master": true }, - "gpio64": { "type": "gpio", "is_master": true } + "gpio64": { "type": "gpio", "is_master": true }, + "gpio65": { "type": "gpio", "is_master": true }, + "gpio66": { "type": "gpio", "is_master": true }, + "gpio67": { "type": "gpio", "is_master": true }, + "gpio68": { "type": "gpio", "is_master": true }, + "gpio69": { "type": "gpio", "is_master": true }, + "gpio70": { "type": "gpio", "is_master": true }, + "gpio71": { "type": "gpio", "is_master": true }, + "gpio72": { "type": "gpio", "is_master": true }, + "gpio73": { "type": "gpio", "is_master": true }, + "gpio74": { "type": "gpio", "is_master": true }, + "gpio75": { "type": "gpio", "is_master": true }, + "gpio76": { "type": "gpio", "is_master": true }, + "gpio77": { "type": "gpio", "is_master": true }, + "gpio78": { "type": "gpio", "is_master": true }, + "gpio79": { "type": "gpio", "is_master": true }, + "gpio80": { "type": "gpio", "is_master": true }, + "gpio81": { "type": "gpio", "is_master": true }, + "gpio82": { "type": "gpio", "is_master": true }, + "gpio83": { "type": "gpio", "is_master": true }, + 
"gpio84": { "type": "gpio", "is_master": true }, + "gpio85": { "type": "gpio", "is_master": true }, + "gpio86": { "type": "gpio", "is_master": true }, + "gpio87": { "type": "gpio", "is_master": true }, + "gpio88": { "type": "gpio", "is_master": true }, + "gpio89": { "type": "gpio", "is_master": true } } } }, diff --git a/tools/gap-configs/configs/devices/testbench.json b/tools/gap-configs/configs/devices/testbench.json new file mode 100644 index 000000000..d0817cb85 --- /dev/null +++ b/tools/gap-configs/configs/devices/testbench.json @@ -0,0 +1,117 @@ +{ + "vp_comps": [ + "testbench", "clock" + ], + + "vp_bindings": [ + ["self->ctrl", "testbench->ctrl"], + ["self->gpio0", "testbench->gpio0"], + ["self->gpio1", "testbench->gpio1"], + ["self->gpio2", "testbench->gpio2"], + ["self->gpio3", "testbench->gpio3"], + ["self->gpio4", "testbench->gpio4"], + ["self->gpio5", "testbench->gpio5"], + ["self->gpio6", "testbench->gpio6"], + ["self->gpio7", "testbench->gpio7"], + ["self->gpio8", "testbench->gpio8"], + ["self->gpio9", "testbench->gpio9"], + ["self->gpio10", "testbench->gpio10"], + ["self->gpio11", "testbench->gpio11"], + ["self->gpio12", "testbench->gpio12"], + ["self->gpio13", "testbench->gpio13"], + ["self->gpio14", "testbench->gpio14"], + ["self->gpio15", "testbench->gpio15"], + ["self->gpio16", "testbench->gpio16"], + ["self->gpio17", "testbench->gpio17"], + ["self->gpio18", "testbench->gpio18"], + ["self->gpio19", "testbench->gpio19"], + ["self->gpio20", "testbench->gpio20"], + ["self->gpio21", "testbench->gpio21"], + ["self->gpio22", "testbench->gpio22"], + ["self->gpio23", "testbench->gpio23"], + ["self->gpio24", "testbench->gpio24"], + ["self->gpio25", "testbench->gpio25"], + ["self->gpio26", "testbench->gpio26"], + ["self->gpio27", "testbench->gpio27"], + ["self->gpio28", "testbench->gpio28"], + ["self->gpio29", "testbench->gpio29"], + ["self->gpio30", "testbench->gpio30"], + ["self->gpio31", "testbench->gpio31"], + ["self->gpio32", "testbench->gpio32"], + ["self->gpio33", "testbench->gpio33"], + ["self->gpio34", "testbench->gpio34"], + ["self->gpio35", "testbench->gpio35"], + ["self->gpio36", "testbench->gpio36"], + ["self->gpio37", "testbench->gpio37"], + ["self->gpio38", "testbench->gpio38"], + ["self->gpio39", "testbench->gpio39"], + ["self->gpio40", "testbench->gpio40"], + ["self->gpio41", "testbench->gpio41"], + ["self->gpio42", "testbench->gpio42"], + ["self->gpio43", "testbench->gpio43"], + ["self->gpio44", "testbench->gpio44"], + ["self->gpio45", "testbench->gpio45"], + ["self->gpio46", "testbench->gpio46"], + ["self->gpio47", "testbench->gpio47"], + ["self->gpio48", "testbench->gpio48"], + ["self->gpio49", "testbench->gpio49"], + ["self->gpio50", "testbench->gpio50"], + ["self->gpio51", "testbench->gpio51"], + ["self->gpio52", "testbench->gpio52"], + ["self->gpio53", "testbench->gpio53"], + ["self->gpio54", "testbench->gpio54"], + ["self->gpio55", "testbench->gpio55"], + ["self->gpio56", "testbench->gpio56"], + ["self->gpio57", "testbench->gpio57"], + ["self->gpio58", "testbench->gpio58"], + ["self->gpio59", "testbench->gpio59"], + ["self->gpio60", "testbench->gpio60"], + ["self->gpio61", "testbench->gpio61"], + ["self->gpio62", "testbench->gpio62"], + ["self->gpio63", "testbench->gpio63"], + ["self->gpio64", "testbench->gpio64"], + ["self->gpio65", "testbench->gpio65"], + ["self->gpio66", "testbench->gpio66"], + ["self->gpio67", "testbench->gpio67"], + ["self->gpio68", "testbench->gpio68"], + ["self->gpio69", "testbench->gpio69"], + ["self->gpio70", "testbench->gpio70"], 
+ ["self->gpio71", "testbench->gpio71"], + ["self->gpio72", "testbench->gpio72"], + ["self->gpio73", "testbench->gpio73"], + ["self->gpio74", "testbench->gpio74"], + ["self->gpio75", "testbench->gpio75"], + ["self->gpio76", "testbench->gpio76"], + ["self->gpio77", "testbench->gpio77"], + ["self->gpio78", "testbench->gpio78"], + ["self->gpio79", "testbench->gpio79"], + ["self->gpio80", "testbench->gpio80"], + ["self->gpio81", "testbench->gpio81"], + ["self->gpio82", "testbench->gpio82"], + ["self->gpio83", "testbench->gpio83"], + ["self->gpio84", "testbench->gpio84"], + ["self->gpio85", "testbench->gpio85"], + ["self->gpio86", "testbench->gpio86"], + ["self->gpio87", "testbench->gpio87"], + ["self->gpio88", "testbench->gpio88"], + ["self->gpio89", "testbench->gpio89"], + ["clock->out", "testbench->clock"], + ["testbench->clock_cfg", "clock->clock_in"] + ], + + "clock": { + "vp_component": "vp.clock_domain_impl", + "frequency": "50000000" + }, + + "testbench": { + "name": "Testbench", + + "vp_component": "devices.testbench.testbench", + + "verbose": false, + "ctrl_type": "uart", + "uart_baudrate": 115200, + "nb_gpio": 90 + } +} diff --git a/tools/gap-configs/configs/ips/riscv/ri5ky_v2_6_sfloat_single_regfile_int64.json b/tools/gap-configs/configs/ips/riscv/ri5ky_v2_6_sfloat_single_regfile_int64.json new file mode 100644 index 000000000..4c0089112 --- /dev/null +++ b/tools/gap-configs/configs/ips/riscv/ri5ky_v2_6_sfloat_single_regfile_int64.json @@ -0,0 +1,4 @@ +{ + "@includes@" : ["ips/riscv/ri5ky_v2_6_sfloat_single_regfile.json"], + "isa" : "rv32imfcXpulpv2Xf8Xf16XfvecXfauxXf16altXgap9Xint64" + } \ No newline at end of file diff --git a/tools/gap-configs/configs/ips/riscv/ri5ky_v2_sfloat_single_regfile_sec.json b/tools/gap-configs/configs/ips/riscv/ri5ky_v2_sfloat_single_regfile_sec.json index c7c2380de..059611a3a 100644 --- a/tools/gap-configs/configs/ips/riscv/ri5ky_v2_sfloat_single_regfile_sec.json +++ b/tools/gap-configs/configs/ips/riscv/ri5ky_v2_sfloat_single_regfile_sec.json @@ -1,4 +1,5 @@ { "@includes@" : ["ips/riscv/ri5ky_v2_sfloat_single_regfile.json"], - "defines" : [ "ARCHI_CORE_HAS_PULPV2", "ARCHI_CORE_HAS_CPLX", "ARCHI_CORE_HAS_SECURITY" ] + "defines" : [ "ARCHI_CORE_HAS_PULPV2", "ARCHI_CORE_HAS_CPLX", "ARCHI_CORE_HAS_SECURITY" ], + "isa" : "rv32imfcXpulpv2Xf8Xf16XfvecXfauxXf16altXgap9Xint64" } \ No newline at end of file diff --git a/tools/gap8-openocd-tools/tcl/fuser.tcl b/tools/gap8-openocd-tools/tcl/fuser.tcl index c2186e998..48a7cc09a 100644 --- a/tools/gap8-openocd-tools/tcl/fuser.tcl +++ b/tools/gap8-openocd-tools/tcl/fuser.tcl @@ -185,3 +185,64 @@ proc fuse_spiflash_boot {gap_tools_path} { # now close the flasher gap_fuse_terminate 0x1c000190 } + + +proc dump_fuse_array {gap_tools_path} { + reset + gap8_jtag_load_binary_and_start ${gap_tools_path}/gap_bins/gap_fuser@gapuino8.elf elf + sleep 100 + puts "${gap_tools_path}/gap_bins/gap_fuser@gapoc_a.elf" + gap_fuse_open 0x1c000190 + + array set fuse_array { + 0 0x0 + 1 0 + 2 0 + 3 0 + 4 0 + 5 0 + 6 0 + 7 0 + 8 0 + 9 0 + 10 0 + 11 0 + 12 0 + 13 0 + 14 0 + 15 0 + 16 0 + 17 0 + 18 0 + 19 0 + 20 0 + 21 0 + 22 0 + 23 0 + 24 0 + 25 0 + 26 0 + 27 0 + 28 0 + 29 0 + 30 0 + 31 0x0 + } + + gap_fuse_once 0x1c000190 0x0 0 1024 0xf 32 + + puts "dump array:" + puts "-------------------------" + set iter [expr 0x0] + while { [expr $iter != 32] } { + puts "|word\[$iter\] \t| [format 0x%x $fuse_array($iter)]\t|" + puts "-------------------------" + set iter [expr $iter + 1] + } + + # now close the flasher + gap_fuse_terminate 0x1c000190 + 
puts "fuse done" +} + + diff --git a/tools/gapy/runner/board/board_runner.py b/tools/gapy/runner/board/board_runner.py index 1c7a65207..5ae853462 100644 --- a/tools/gapy/runner/board/board_runner.py +++ b/tools/gapy/runner/board/board_runner.py @@ -110,11 +110,16 @@ def exec(self): chip_family = self.config.get_str('**/chip_family') - platform = self.config.get_str('runner/platform') - if chip_family == 'vega' or chip_family == 'gap9_v2': - cmd = '%s -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -c "script %s; script %s; load_and_start_binary %s 0x%x"' % (openocd, cable, script, binary, entry) + if chip_family == 'vega': + cmd = 'plpbridge --chip=vega --verbose 10 --cable=ftdi --binary %s reset load ioloop reqloop start wait' % (binary) + + else: - cmd = "%s -c 'gdb_port disabled; telnet_port disabled; tcl_port disabled' -f %s -f %s -f tcl/jtag_boot.tcl -c 'gap8_jtag_load_binary_and_start \"%s\" elf'" % (openocd, cable, script, binary) + platform = self.config.get_str('runner/platform') + if chip_family == 'vega' or chip_family == 'gap9_v2': + cmd = '%s -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -c "script %s; script %s; load_and_start_binary %s 0x%x"' % (openocd, cable, script, binary, entry) + else: + cmd = "%s -c 'gdb_port disabled; telnet_port disabled; tcl_port disabled' -f %s -f %s -f tcl/jtag_boot.tcl -c 'gap8_jtag_load_binary_and_start \"%s\" elf'" % (openocd, cable, script, binary) os.chdir(self.config.get_str('gapy/work_dir')) diff --git a/tools/nntool/.vscode/launch.json b/tools/nntool/.vscode/launch.json index 5d5717b58..448d976fa 100644 --- a/tools/nntool/.vscode/launch.json +++ b/tools/nntool/.vscode/launch.json @@ -13,16 +13,6 @@ ], "console": "integratedTerminal" }, - { - "name": "Python: start nntool with vww quantized", - "type": "python", - "request": "launch", - "program": "${workspaceFolder}/nntool", - "args": [ - "tests/graph/model_quantized.tflite" - ], - "console": "integratedTerminal" - }, { "name": "Python: start nntool with sample", "type": "python", @@ -34,16 +24,6 @@ "cwd": "${workspaceFolder}/../nntool_examples/mnist", "console": "integratedTerminal" }, - { - "name": "Python: start nntool with mobv1", - "type": "python", - "request": "launch", - "program": "${workspaceFolder}/nntool", - "args": [ - "tests/graph/mobilenet_v1_1_0_224.tflite" - ], - "console": "integratedTerminal" - }, { "name": "Python: start nntool with kws", "type": "python", @@ -135,7 +115,19 @@ "request": "launch", "program": "${workspaceFolder}/nntool", "args": [ - "tests/graph/mobv1_quant.tflite" + "tests/graph/mobv1_quant.tflite", + "-q" + ], + "cwd": "${workspaceFolder}", + "console": "integratedTerminal" + }, + { + "name": "Python: start nntool with mobilenet_v3 tflite (json)", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/nntool", + "args": [ + "tests/graph/v3-large_224_1.0_float.json" ], "cwd": "${workspaceFolder}", "console": "integratedTerminal" @@ -151,6 +143,18 @@ "cwd": "${workspaceFolder}", "console": "integratedTerminal" }, + { + "name": "Python: start nntool with vs mobilenet_v2 tflite", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/nntool", + "args": [ + "tests/graph/vergesense_mnv2.tflite", + "-q" + ], + "cwd": "${workspaceFolder}", + "console": "integratedTerminal" + }, { "name": "Python: start nntool with visual wake sample tflite", "type": "python", @@ -227,11 +231,12 @@ "console": "integratedTerminal" }, { - "name": "Python: Current File (Integrated Terminal)", + "name": 
"Python: Current File in dir (Integrated Terminal)", "type": "python", "request": "launch", "program": "${file}", - "console": "integratedTerminal" + "console": "integratedTerminal", + "cwd": "${fileDirname}", } ] } \ No newline at end of file diff --git a/tools/nntool/Makefile b/tools/nntool/Makefile index 71a8b4804..23cfbe7e6 100644 --- a/tools/nntool/Makefile +++ b/tools/nntool/Makefile @@ -6,7 +6,7 @@ PACKAGES=build/packages FLAT_SRC=$(PACKAGES)/flatbuffers FLAT_BUILD=$(FLAT_SRC)/build TFLITE_DIR=importer/tflite -FLAT_TAG=3c964e10ab4f97e2a3602a8d0a8f4c402806ef89 +FLAT_TAG=de89bd193370c8b33686f1f33edd63593e48cd3f all: flatbuffers tflite req diff --git a/tools/nntool/README.md b/tools/nntool/README.md index 810919a43..045d45845 100644 --- a/tools/nntool/README.md +++ b/tools/nntool/README.md @@ -1,5 +1,18 @@ # NNTOOL +## Table of contents +- [Overview](#overview) +- [Installation](#installation) +- [Model Conversion](#model-conversion) +- [Quantization](#quantization) +- [Nntool Execution](#nntool-execution) +- [Model Save](#model-save) +- [Autotiler Model Generation](#autotiler-model-generation) +- [Image Formatter](#image-formatter) +- [Input Options](#input-options) + +## Overview + NNTOOL helps to port NN graphs from various NN training packages to GAP8. It helps with: - Post training graph quantization @@ -22,6 +35,8 @@ The tool has a command interpreter mode where it provides an interface to: The tool also has a command line mode which takes a saved state file and directly genenerates the model and saves a parameters file; the two elements necessary for a GAP project build. +## Installation + To set up the tool install the packages in the requirements file python -m pip install -r requirements.txt @@ -40,55 +55,61 @@ When everything is installed, the nntool command line can be accessed with: These steps will automatically be executed by the GAP SDK setup procedure -## Visual-Wake-Words Example - -To show the **nntool** usage, we provide this detailed example of the 2019 Visual Wakeup Words Challenge Winner model porting on GAP8. -### Model Loading +## Model Conversion -The nntool takes as input network a non-quantized .tflite model. After downloading the model from the github repository (https://github.com/mit-han-lab/VWW) we can open the model into the nntool: +The nntool takes as input network a float or quantized .tflite model. You can find bunch of trained tflite models online in both integer-only and floating-point version online ([TF hosted models](https://www.tensorflow.org/lite/guide/hosted_models)). To start nntool and open the target model: nntool - open /path/to/model.tflite + open /path/to/model.tflite [-q] -TFLite format uses HxWxC activations and CoutxHxWxCin weights while Autotiler uses CxHxW activations and CoutxCinxHxW weights. Moreover to increase the efficiency of the kernels, the Autotiler uses fused layers (e.g. ConvReLUPool). To generate the compatible AT model, the nntool has to apply graph transformations and match the Autotiler features: +The -q is required if the target model has already been quantized in the tflite conversion process (i.e. inference_type=QUANTIZED_UINT8). + +TFLite execution kernels use HxWxC order for the activations and CoutxHxWxCin order for filters. On the other hand, the Autotiler, and therefore GAP execution kernels, use CxHxW activations and CoutxCinxHxW filters. Moreover to increase the efficiency of the kernels, the Autotiler uses fused layers (e.g. 
Convolution followed by a pooling and a ReLU can be performed by a single AT layer: ConvPoolRelu). For these reasons, to generate the compatible AT model, the nntool has to apply graph transformations and match the Autotiler features: adjust - fusions + fusions [--scale8 | --pow2] + +IMPORTANT: the _fusions_ type (scale8 vs pow2) must match the quantization scheme which will be performed (see the next section). To see the current nntool model topology use: show -### Quantization +## Quantization -Now we quantize the model to either 8 or 16 bit. The aquant command quantizes the original floating point model to a fixed point one. For the constant parameters, i.e. weights and biases, the number of integer and decimal bits is computed from their actual values distributions. On the other hand, non-constant values, i.e. activations, need a representative dataset to collect their distributions. Whenever you feed the nntool model with some inputs data you can do manipulation on them: +To run on GAP platforms the model must be quantized with one of the schemes supported by the Autotiler: +- 8-bit quantization: similar to [tensorflow lite quantization](https://www.tensorflow.org/lite/performance/quantization_spec) but targeting the symmetric computational kernels of the Autotiler. This scheme is automatically applied if the input tflite graph is already quantized: the quantization specs are simply translated to match the symmetric kernels. +- 16-bit quantization: this scheme targets a power-of-two quantization, i.e. each tensor is interpreted as a signed 16-bit Qm.n fixed-point vector with m integer bits and n fractional bits. This approach can give better accuracy but costs a 2x memory footprint and an almost 2x increase in latency. - aquant -f /path/to/images/direcotry/* [input-options: -T, -D, -O, ...] - ***IMPORTANT*** If the adjust command has been used and the input has more than one channel, the -T flag is necessary to match the new activations order +NOTE: the _fusions_ command above must match the quantization scheme that you want to apply (--scale8 for 8-bit quantization and --pow2 for 16-bit quantization). If you want to change the scheme for any reason, you will need to reopen the original graph and perform the _fusions_ step again. +NOTE2: the 16-bit quantization scheme is supported only if the input graph is not already quantized. +If the imported tflite graph targets floating point execution, nntool can perform the post-training quantization step, provided the network is fed a set of calibration data from which it can collect the min/max range statistics of the activations: -We can evaluate the signal to noise ratio (QSNR) after the quantization step by processing one or more input data: + aquant -f [8 | 16] /path/to/images/directory/* [input-options: -T, -D, -O, ...] + ***IMPORTANT*** If the adjust command has been used and the input has more than one channel, the -T (transpose) flag is necessary to match the new activations order (input image from HxWxC to CxHxW)
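A minimal sketch of how these commands fit together (the model path and the calibration image set are placeholders, and the input is assumed to be multichannel, hence the -T flag noted above):

    nntool
    open /path/to/model.tflite
    adjust
    fusions --scale8
    aquant -f 8 /path/to/calibration/images/* -T

For a 16-bit flow the last two steps would become fusions --pow2 and aquant -f 16 ..., which, as noted above, is only possible when the imported graph is still in floating point.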
+nntool can execute the graph in both floating point and quantized precision, hence once quantization is set we can evaluate the signal to noise ratio (QSNR) on one or more input samples by comparing the two results: - qerror /path/to/the/image.ppm [input-options: -T, -D, -O, -W, -H, ...] [-s] -It computes the model in the original FP32 version and then in the quantized version and compare the outputs of each layer. With the -s flag the comparison is done individually for each layer. Their output is evaluated from the FP32 input quantized instead of the output result of the quantized computation. -If for some layer a very low QSNR is reported, the user can change the bit precision with the qtune command. + qerror /path/to/the/image.ppm [input-options: -T, -D, -O, -W, -H, ...] [-s] +With the -s flag the comparison is done individually for each layer: each layer's quantized output is computed from the quantized version of that layer's FP32 input, instead of from the output of the preceding quantized computation. -#### Quantization Inspection +## Nntool Execution -The nntool provides utilities to inspect the quantization performance in details for specific tensors by comparing them side by side, for a given sample data like below: +The nntool provides utilities to inspect in detail the output activation tensors produced by the execution of a specific input: dump ./image.ppm -S tensors_1 dump ./image.ppm -S tensors_2 -q -d Usage: -S: store the tensors in the workspace with the given name - -q: compute the inference with the quantized graph - -d: export the dequantized version of the tensors to have the same format of the one computed with the FP32 graph + [-q]: compute the inference with the quantized graph (if not specified the network is run in floating point) + [-d]: together with -q, export the dequantized version of the tensors so that they have the same format as the ones computed with the FP32 graph + [-P file.npy]: save the list of activation tensors to a file -To compare them: +To compare them side by side or with QSNR (in this case tensors_1 comes from the float execution while tensors_2 comes from the quantized one, with dequantized (real-number) values instead of integer ones): tensors -t tensors_1 tensors_2 -s 2 -c 0 [-Q] @@ -98,7 +119,7 @@ To compare them: -c: layer channel [-Q]: if present outputs the QSNR between the tensors, otherwise the tensors elements are displayed side by side -### Saving the model +## Model Save To save the nntool graph with all the quantization information and constant parameters tensors in the .json format: @@ -108,14 +129,31 @@ To load back the saved model: open /path/to/nntool_model_state.json -### Autotiler Model Generation +## Autotiler Model Generation At this point the nntool graph is ready to be translated in an Autotiler Model format: nntool -g path/to/nntool_model_state.json -M /path/to/model/dir -m Autotiler_model_file.c -T path/to/tensors/dir +## Image Formatter + +To handle different types of input image formats you can add to your graph an input formatter which will generate the Autotiler optimized code for the proper conversion. It supports: +- rgb565 HxWxC input to rgb888 CxHxW (rgb565) +- rgb888 HxWxC input to rgb888 CxHxW (rgb888) +- grayscale8 input to grayscale8 (bw8) +- grayscale8 input to grayscale16 (bw16) + +It also handles the conversion from uint8 [0:255] values to the int8 [-128:127] values supported by the AT convolutional kernels. You will need to specify the desired technique: +- shift_int8: applies element-wise a right shift of 1 bit (>> 1) so that the values do not overflow the int8 range [0:127] (more efficient) +- offset_int8: applies element-wise a -128 offset to output [-128:127] values ready for the AT convolutional kernels (more accurate) +- for 16-bit graphs only: out_int16: takes the uint8 input and converts it to an int16 output by applying a left shift of 7 bits (<< 7) + +The command to run to introduce the formatter into your graph is: + imageformat input_x [bw8 | bw16 | rgb888 | rgb565] [shift_int8 | offset_int8] + +NOTE: in case of multichannel input the image formatter will also automatically handle the transposition when you run the network in nntool (i.e. the -T option is no longer needed)
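For example (an illustrative sketch only; the input name depends on your graph and is assumed here to be input_1), adding a formatter for an RGB565 camera image feeding an 8-bit graph could look like:

    imageformat input_1 rgb565 offset_int8

After this the raw uint8 RGB565 frame can be fed to the graph directly, and the HxWxC to CxHxW transposition is handled by the generated code, as described in the NOTE above.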
-## Input Images Options +## Input Options Whenever one or several images are given to the nntool (i.e. with commands like dump, aquant, qerror, ...) there are options you can set to perform preprocessing on them: @@ -136,4 +174,4 @@ Another option is to set some of this parameters by default using the set comman set input_offset set input_norm_func [lambda function]: is equivalent to set each time an image is provide -N [lambda function] -NOTE: if an input_norm_func is set the other operational settings (input_divisor or input_offset) will be ignored +NOTE: if an input_norm_func is set the other operational settings (input_divisor or input_offset) will be ignored \ No newline at end of file diff --git a/tools/nntool/_version.py b/tools/nntool/_version.py index a5d9167eb..42e4cc452 100644 --- a/tools/nntool/_version.py +++ b/tools/nntool/_version.py @@ -13,4 +13,4 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-__version__ = '3.0' +__version__ = '4.0' diff --git a/tools/nntool/autotiler/generators/nntool_extra_generators.c b/tools/nntool/autotiler/generators/nntool_extra_generators.c new file mode 100644 index 000000000..58163003f --- /dev/null +++ b/tools/nntool/autotiler/generators/nntool_extra_generators.c @@ -0,0 +1,280 @@ +#include +#include +#include "AutoTilerLib.h" +#include "nntool_extra_generators.h" +#include "Gap.h" + +#define D0 KER_ITER_D0 +#define D1 KER_ITER_D1 +#define D2 KER_ITER_D2 +#define D3 KER_ITER_D3 +#define T0 KER_ITER_TILE0 +#define T1 KER_ITER_TILE1 +#define T2 KER_ITER_TILE2 + +void LoadNNTools_Extra_Library() + +{ + LibKernel("CNN_NormRGB565_offset_fps", CALL_PARALLEL, + CArgs(6, + TCArg("unsigned short *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out0"), + TCArg("signed char *__restrict__", "Out1"), + TCArg("signed char *__restrict__", "Out2"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormRGB565_fps_T", NULL + ); + + LibKernel("CNN_NormRGB565_shift_fps", CALL_PARALLEL, + CArgs(6, + TCArg("unsigned short *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out0"), + TCArg("signed char *__restrict__", "Out1"), + TCArg("signed char *__restrict__", "Out2"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormRGB565_fps_T", NULL + ); + + LibKernel("CNN_NormRGB888_offset_fps", CALL_PARALLEL, + CArgs(6, + TCArg("unsigned char *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out0"), + TCArg("signed char *__restrict__", "Out1"), + TCArg("signed char *__restrict__", "Out2"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormRGB888_fps_T", NULL + ); + + LibKernel("CNN_NormRGB888_shift_fps", CALL_PARALLEL, + CArgs(6, + TCArg("unsigned char *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out0"), + TCArg("signed char *__restrict__", "Out1"), + TCArg("signed char *__restrict__", "Out2"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormRGB888_fps_T", NULL + ); + + LibKernel("CNN_NormRGB16_fp", CALL_PARALLEL, + CArgs(6, + TCArg("unsigned char *__restrict__", "In"), + TCArg("signed short int *__restrict__", "Out0"), + TCArg("signed short int *__restrict__", "Out1"), + TCArg("signed short int *__restrict__", "Out2"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormRGB16_fp_T", NULL + ); + + LibKernel("CNN_NormBW_offset_fps", CALL_PARALLEL, + CArgs(4, + TCArg("unsigned char *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormBW_fps_T", NULL + ); + + LibKernel("CNN_NormBW_shift_fps", CALL_PARALLEL, + CArgs(4, + TCArg("unsigned char *__restrict__", "In"), + TCArg("signed char *__restrict__", "Out"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormBW_fps_T", NULL + ); + + LibKernel("CNN_NormBW_fp", CALL_PARALLEL, + CArgs(4, + TCArg("unsigned char *__restrict__", "In"), + TCArg("signed short int *__restrict__", "Out"), + TCArg("unsigned short int", "W"), + TCArg("unsigned short int", "H") + ), + "KerNormBW_fp_T", NULL + ); +} + +/********************************************************************************************************************************************************************* + Generator for RGB565 image preprocessing: + +Template: +Name: Name of the generated user kernel + +Width Image width +Height 
Image height + +DoOffset If true offset pixel by -128 + +Signature: Name(In, Out) + +CNN_NormRGB565 + + *********************************************************************************************************************************************************************/ + +int CNN_NormRGB( + char *Name, + int Width, + int Height, + int DoOffset, + nntool_kop_t kop + ) + +{ + int Log = 1; + char *BodyName = AppendNames(Name, "Body"); + unsigned long long int LayerOp = (Width*Height*4) + (Width*Height*(DoOffset?2:1))/4; + unsigned long long int LayerBandwidth = 0; + + char *NormRGBKerName = (kop==NNTOOL_KOP_RGB16?"CNN_NormRGB16_fp":(kop==NNTOOL_KOP_RGB565?(DoOffset?"CNN_NormRGB565_offset_fps":"CNN_NormRGB565_shift_fps"):(DoOffset?"CNN_NormRGB888_offset_fps":"CNN_NormRGB888_shift_fps"))); + + LayerBandwidth += 2*Width*Height*1; + LayerBandwidth += 3*Width*Height*1; + + if (Log) { + printf("CNN_NormRGB%d: %s\n", (kop==NNTOOL_KOP_RGB16?16:(kop==NNTOOL_KOP_RGB565?565:888)), Name); + printf("In => Feat: %d W: %4d, H: %4d\n", (kop==NNTOOL_KOP_RGB565?1:3), Width, Height); + printf("Out => Feat: 3, W: %4d, H: %4d\n", Width, Height); + if (NormRGBKerName) printf("%20s: %s\n", "KerName", NormRGBKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + + Object_T **PKerArgs = AllocateKerArgs(4); + PKerArgs[0] = KerArg("In", KerArgSpace(1,T0), O_IN|O_DB, Width*(kop==NNTOOL_KOP_RGB565?1:3), Height, (kop==NNTOOL_KOP_RGB565?2:1), 0, 0, 0, "In"); + PKerArgs[1] = KerArg("Out0", KerArgSpace(1,T0), O_OUT|O_DB, Width, Height, (kop==NNTOOL_KOP_RGB16?2:1), 0, 0, 0, "Out0"); + PKerArgs[2] = KerArg("Out1", KerArgSpace(1,T0), O_OUT|O_DB, Width, Height, (kop==NNTOOL_KOP_RGB16?2:1), 0, 0, 0, "Out1"); + PKerArgs[3] = KerArg("Out2", KerArgSpace(1,T0), O_OUT|O_DB, Width, Height, (kop==NNTOOL_KOP_RGB16?2:1), 0, 0, 0, "Out2"); + OpenKernelGroup(Name); + UserKernel(BodyName, + KernelIterSpace(1, IterTiledSpace(T0)), + TILE_HOR, + CArgs(4, TCArg(CNN_ArgDataTypeUns((kop==NNTOOL_KOP_RGB565?2:1),1,1), "In"), + TCArg(CNN_ArgDataType((kop==NNTOOL_KOP_RGB16?2:1),1,1), "Out0"), + TCArg(CNN_ArgDataType((kop==NNTOOL_KOP_RGB16?2:1),1,1), "Out1"), + TCArg(CNN_ArgDataType((kop==NNTOOL_KOP_RGB16?2:1),1,1), "Out2") + ), + Calls(1, + Call(NormRGBKerName, LOC_LOOP, + Bindings(6, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Out0", KER_ARG_TILE), /* Output tile */ + K_Arg("Out1", KER_ARG_TILE), /* Output tile */ + K_Arg("Out2", KER_ARG_TILE), /* Output tile */ + K_Arg((kop==NNTOOL_KOP_RGB565?"In":"Out0"), KER_ARG_TILE_W), /* tile width */ + K_Arg((kop==NNTOOL_KOP_RGB565?"In":"Out0"), KER_ARG_TILE_H) /* tile height */ + ) + ) + ), + PKerArgs + ); + AddKernelInfos(BodyName, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(BodyName, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + AddKernelArgDim(BodyName, "In", 4, (kop==NNTOOL_KOP_RGB565?1:3), Height, Width, (kop==NNTOOL_KOP_RGB565?2:1)); + AddKernelArgDim(BodyName, "Out0", 4, 1, Height, Width, (kop==NNTOOL_KOP_RGB16?2:1)); + AddKernelArgDim(BodyName, "Out1", 4, 1, Height, Width, (kop==NNTOOL_KOP_RGB16?2:1)); + AddKernelArgDim(BodyName, "Out2", 4, 1, Height, Width, (kop==NNTOOL_KOP_RGB16?2:1)); + CloseKernelGroup(); + CKernel_Arg_T **KCArgs = AllocateCArgs(2); + int Ca=0; + KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns((kop==NNTOOL_KOP_RGB565?2:1),1,1), "In"); + KCArgs[Ca++] = TCArg(CNN_ArgDataType( (kop==NNTOOL_KOP_RGB16?2:1), 1,1), "Out"); + Object_T **KArgs = AllocateKerArgs(7); + int Ka=0; + KArgs[Ka++] = KerGroupArg("In", O_IN, Width*Height*(kop==NNTOOL_KOP_RGB565?1:3), 
(kop==NNTOOL_KOP_RGB565?2:1), "In"); + KArgs[Ka++] = KerGroupArg("Out", O_OUT, Width*Height*3, (kop==NNTOOL_KOP_RGB16?2:1), "Out"); + UserKernelGroup(Name, + KCArgs, + Calls(1, + UserKernelCall(BodyName, LOC_GROUP, + Bindings(4, + C_Arg("In"), + C_ArgPlusImmOffset("Out", 0), + C_ArgPlusImmOffset("Out", Height * Width), + C_ArgPlusImmOffset("Out", Height * Width * 2) + + ) + ) + ) + ); + return 0; + +} + +int CNN_NormBW( + char *Name, + int Width, + int Height, + int DoOffset, + nntool_kop_t kop + ) +{ + int Log = 1; + unsigned long long int LayerOp = Width*Height; + unsigned long long int LayerBandwidth = 0; + char *NormBWKerName = (kop==NNTOOL_KOP_BW16?"CNN_NormBW_fp":(DoOffset?"CNN_NormBW_offset_fps":"CNN_NormBW_shift_fps")); + + LayerBandwidth += Width*Height*1; + LayerBandwidth += Width*Height*1; + + if (Log) { + printf("CNN_NormBW: %s\n", Name); + printf("In => Feat: 1 W: %4d, H: %4d\n", Width, Height); + printf("Out => Feat: 1, W: %4d, H: %4d\n", Width, Height); + if (NormBWKerName) printf("%20s: %s\n", "KerName", NormBWKerName); + printf("Nb Oper : %lld\n", LayerOp); + } + + Object_T **PKerArgs = AllocateKerArgs(2); + PKerArgs[0] = KerArg("In", KerArgSpace(1,T0), O_IN|O_DB, Width, Height, 1, 0, 0, 0, "In"); + PKerArgs[1] = KerArg("Out", KerArgSpace(1,T0), O_OUT|O_DB, Width, Height, (kop==NNTOOL_KOP_BW16?2:1), 0, 0, 0, "Out"); + UserKernel(Name, + KernelIterSpace(1, IterTiledSpace(T0)), + TILE_HOR, + CArgs(2, TCArg(CNN_ArgDataTypeUns(1,1,1), "In"), TCArg(CNN_ArgDataType((kop==NNTOOL_KOP_BW16?2:1),1,1), "Out")), + Calls(1, + Call(NormBWKerName, LOC_LOOP, + Bindings(4, + K_Arg("In", KER_ARG_TILE), /* Input tile */ + K_Arg("Out", KER_ARG_TILE), /* Output tile */ + K_Arg("In", KER_ARG_TILE_W), /* Input tile width */ + K_Arg("In", KER_ARG_TILE_H) /* Input tile width */ + ) + ) + ), + PKerArgs + ); + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + AddKernelArgDim(Name, "In", 4, 1, Height, Width, 1); + AddKernelArgDim(Name, "Out", 4, 1, Height, Width, 1); + return 0; +} + +int CNN_Norm( + char *Name, + int Width, + int Height, + int DoOffset, + nntool_kop_t kop + ) +{ + if (kop == NNTOOL_KOP_BW || kop == NNTOOL_KOP_BW16) { + return CNN_NormBW(Name, Width, Height, DoOffset, kop); + } else { + return CNN_NormRGB(Name, Width, Height, DoOffset, kop); + } +} + + + diff --git a/tools/nntool/autotiler/generators/nntool_extra_generators.h b/tools/nntool/autotiler/generators/nntool_extra_generators.h new file mode 100644 index 000000000..3f6a05796 --- /dev/null +++ b/tools/nntool/autotiler/generators/nntool_extra_generators.h @@ -0,0 +1,24 @@ +#ifndef __NNTOOL_EXTRA_GENERATORS__ +#define __NNTOOL_EXTRA_GENERATORS__ +#include +#include "AutoTilerLib.h" + +typedef enum { + NNTOOL_KOP_RGB565, + NNTOOL_KOP_RGB888, + NNTOOL_KOP_RGB16, + NNTOOL_KOP_BW, + NNTOOL_KOP_BW16 +} nntool_kop_t; + +void LoadNNTools_Extra_Library(); +int CNN_Norm( + char *Name, + + int Width, + int Height, + int DoOffset, + nntool_kop_t kop +); + +#endif diff --git a/tools/nntool/autotiler/kernels/nntool_extra_kernels.h b/tools/nntool/autotiler/kernels/nntool_extra_kernels.h new file mode 100644 index 000000000..085360d9b --- /dev/null +++ b/tools/nntool/autotiler/kernels/nntool_extra_kernels.h @@ -0,0 +1,74 @@ +#ifndef __NNTOOL_EXTRA_KERNELS__ +#define __NNTOOL_EXTRA_KERNELS__ +#include "Gap.h" + +#ifdef __pulp__ +#ifndef Min +#define Min(a, b) __builtin_pulp_minsi((a), (b)) +#endif +#ifndef Max +#define Max(a, b) __builtin_pulp_maxsi((a), (b)) +#endif +#else 
+#define Min(a, b) (((a)<(b))?(a):(b)) +#define Max(a, b) (((a)>(b))?(a):(b)) +#endif + +#ifdef GENASM +#ifdef __EMUL__ +#define gap_ncore() 8 +#define gap_coreid() __builtin_pulp_CoreId() +#endif +#endif + +typedef struct { + unsigned short *__restrict__ In; /**< Input matrix */ + signed char *__restrict__ Out0; /**< Output matrix */ + signed char *__restrict__ Out1; /**< Output matrix */ + signed char *__restrict__ Out2; /**< Output matrix */ + unsigned int W; /**< Matrix width */ + unsigned int H; /**< Matrix height */ +} KerNormRGB565_fps_T; + +typedef struct { + unsigned char *__restrict__ In; /**< Input matrix */ + signed char *__restrict__ Out0; /**< Output matrix */ + signed char *__restrict__ Out1; /**< Output matrix */ + signed char *__restrict__ Out2; /**< Output matrix */ + unsigned int W; /**< Matrix width */ + unsigned int H; /**< Matrix height */ +} KerNormRGB888_fps_T; + +typedef struct { + unsigned char *__restrict__ In; /**< Input matrix */ + signed short int *__restrict__ Out0; /**< Output matrix */ + signed short int *__restrict__ Out1; /**< Output matrix */ + signed short int *__restrict__ Out2; /**< Output matrix */ + unsigned int W; /**< Matrix width */ + unsigned int H; /**< Matrix height */ +} KerNormRGB16_fp_T; + +typedef struct { + unsigned char *__restrict__ In; /**< Input matrix */ + signed char *__restrict__ Out; /**< Output matrix */ + unsigned int W; /**< Matrix width */ + unsigned int H; /**< Matrix height */ +} KerNormBW_fps_T; + +typedef struct { + unsigned char *__restrict__ In; /**< Input matrix */ + signed short int *__restrict__ Out; /**< Output matrix */ + unsigned int W; /**< Matrix width */ + unsigned int H; /**< Matrix height */ +} KerNormBW_fp_T; + +void CNN_NormRGB565_offset_fps(KerNormRGB565_fps_T *Arg); +void CNN_NormRGB565_shift_fps(KerNormRGB565_fps_T *Arg); +void CNN_NormRGB888_offset_fps(KerNormRGB888_fps_T *Arg); +void CNN_NormRGB888_shift_fps(KerNormRGB888_fps_T *Arg); +void CNN_NormRGB16_fp(KerNormRGB16_fp_T *Arg); +void CNN_NormBW_offset_fps(KerNormBW_fps_T *Arg); +void CNN_NormBW_shift_fps(KerNormBW_fps_T *Arg); +void CNN_NormBW_fp(KerNormBW_fp_T *Arg); + +#endif diff --git a/tools/nntool/autotiler/kernels/norm_transpose.c b/tools/nntool/autotiler/kernels/norm_transpose.c new file mode 100644 index 000000000..e2b1488e2 --- /dev/null +++ b/tools/nntool/autotiler/kernels/norm_transpose.c @@ -0,0 +1,204 @@ +#include "nntool_extra_kernels.h" + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); + +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +void CNN_NormRGB565_offset_fps(KerNormRGB565_fps_T *Arg) + +{ + unsigned short *__restrict__ In = Arg->In; + signed char *__restrict__ Out0 = Arg->Out0; + signed char *__restrict__ Out1 = Arg->Out1; + signed char *__restrict__ Out2 = Arg->Out2; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H), First = Chunk*CoreId, Last = Min(First+Chunk, H); + for (int h=First; hIn; + signed char *__restrict__ Out0 = Arg->Out0; + signed char *__restrict__ Out1 = Arg->Out1; + signed char *__restrict__ Out2 = Arg->Out2; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H), First = Chunk*CoreId, Last = 
Min(First+Chunk, H); + + for (int h=First; hIn; + signed char *__restrict__ Out0 = Arg->Out0; + signed char *__restrict__ Out1 = Arg->Out1; + signed char *__restrict__ Out2 = Arg->Out2; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Sz = W * H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Sz), First = Chunk*CoreId, Last = Min(First+Chunk, Sz); + + unsigned int Diff = Last-First; + for (int OutIdx=First; OutIdx<(First+((Diff*4)/4)); OutIdx+=4) { + int InIdx0 = OutIdx * 3, InIdx1 = InIdx0 + 6; + *((v4u *)&Out0[OutIdx]) = __builtin_shuffle(*((v4u *) &In[InIdx0++]), *((v4u *) &In[InIdx1++]), (v4u) {0, 3, 4, 7}) >> 1; + *((v4u *)&Out1[OutIdx]) = __builtin_shuffle(*((v4u *) &In[InIdx0++]), *((v4u *) &In[InIdx1++]), (v4u) {0, 3, 4, 7}) >> 1; + *((v4u *)&Out2[OutIdx]) = __builtin_shuffle(*((v4u *) &In[InIdx0++]), *((v4u *) &In[InIdx1++]), (v4u) {0, 3, 4, 7}) >> 1; + } + int Left = Diff&0x3; + for (int i=Last-Left, InIdx=i*3; i> 1; + Out1[i] = In[InIdx++] >> 1; + Out2[i] = In[InIdx++] >> 1; + } + gap_waitbarrier(0); +} + +void CNN_NormRGB888_offset_fps(KerNormRGB888_fps_T *Arg) +{ + unsigned char *__restrict__ In = Arg->In; + signed char *__restrict__ Out0 = Arg->Out0; + signed char *__restrict__ Out1 = Arg->Out1; + signed char *__restrict__ Out2 = Arg->Out2; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Sz = W * H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Sz), First = Chunk*CoreId, Last = Min(First+Chunk, Sz); + + unsigned int InIdx = First * 3; + for (int OutIdx=First; OutIdxIn; + signed short int *__restrict__ Out0 = Arg->Out0; + signed short int *__restrict__ Out1 = Arg->Out1; + signed short int *__restrict__ Out2 = Arg->Out2; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Sz = W * H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Sz), First = Chunk*CoreId, Last = Min(First+Chunk, Sz); + + unsigned int InIdx = First * 3; + for (int OutIdx=First; OutIdxIn; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Sz = W * H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Sz), First = Chunk*CoreId, Last = Min(First+Chunk, Sz); + + unsigned int Diff = Last-First; + for (int Idx=First; Idx> 1; + } + int Left = Diff&0x3; + for (int i=Last-Left; i> 1; + } + gap_waitbarrier(0); +} + +void CNN_NormBW_offset_fps(KerNormBW_fps_T *Arg) +{ + unsigned char *__restrict__ In = Arg->In; + signed char *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Sz = W * H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Sz), First = Chunk*CoreId, Last = Min(First+Chunk, Sz); + + for (int Idx=First; IdxIn; + signed short int *__restrict__ Out = Arg->Out; + unsigned int W = Arg->W; + unsigned int H = Arg->H; + unsigned int Sz = W * H; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Sz), First = Chunk*CoreId, Last = Min(First+Chunk, Sz); + + unsigned int Diff = Last-First; + for (int Idx=First; Idx +#include +#include "AutoTilerLib.h" +#include "nntool_extra_generators.h" + +#ifdef CONSTINIT +#define Cinit(a, b, c, d, e) ConstInfo((a), (b), (c), (d), (e)) +#else +#define Cinit(a, b, c, d, e) 0 +#endif + + +void testModel(unsigned int L1Memory, unsigned int L2Memory, unsigned int L3Memory, unsigned int L3Flash) +{ + // SetKernelOpts(KER_OPT_NONE, KER_OPT_BUFFER_PROMOTE); + SetSymbolDynamics(); + SetUsedFilesNames(0, 1, "nntool_extra_kernels.h"); + SetGeneratedFilesNames("testKernels.c", 
"testKernels.h"); + + SetMemoryDeviceInfos(4, + AT_MEM_L1, L1Memory, "vww_L1_Memory", 0, 0, + AT_MEM_L2, L2Memory, "vww_L2_Memory", 0, 0, + AT_MEM_L3_HRAM, L3Memory, "vww_L3_Memory", 0, 0, + AT_MEM_L3_HFLASH, L3Flash, "0", "vww_L3_Flash_Const.dat", 0 + ); + + LoadNNTools_Extra_Library(); + CNN_Norm("testfunc1", 320, 240, 1, NNTOOL_KOP_RGB565); + CNN_Norm("testfunc2", 320, 240, 0, NNTOOL_KOP_RGB565); + CNN_Norm("testfunc3", 320, 240, 1, NNTOOL_KOP_RGB888); + CNN_Norm("testfunc4", 320, 240, 0, NNTOOL_KOP_RGB888); + CNN_Norm("testfunc5", 320, 240, 1, NNTOOL_KOP_BW); + CNN_Norm("testfunc6", 320, 240, 0, NNTOOL_KOP_BW); + CNN_Norm("testfunc7", 320, 240, 0, NNTOOL_KOP_BW16); + CNN_Norm("testfunc8", 320, 240, 0, NNTOOL_KOP_RGB16); +} + +int main(int argc, char **argv) + +{ + if (TilerParseOptions(argc, argv)) { + printf("Failed to initialize or incorrect output arguments directory.\\n"); return 1; + } + testModel(52000, 300*1024, 8*1024*1024, 20*1024*1024); + GenerateTilingCode(); + return 0; +} + diff --git a/tools/nntool/autotiler/tests/testRun.c b/tools/nntool/autotiler/tests/testRun.c new file mode 100644 index 000000000..cb4f57759 --- /dev/null +++ b/tools/nntool/autotiler/tests/testRun.c @@ -0,0 +1,381 @@ +#include "Gap.h" +#include +#include "testKernels.h" + +#define SIZEHW 320 * 240 +#define SIZE888 320 * 240 * 3 + +int err_count = 0; +unsigned short *In565; +unsigned char *In888, *In8; +signed char *Out8, *Out888; + +#define assert(__test, __h, __w, __c, __got, __expect) \ + while (__got != __expect) \ + { \ + err_count++; \ + printf(" %s[%d, %d, %d] expected %d got %d", __test, __h, __w, __c, __expect, __got); \ + return 1; \ + } + +typedef struct +{ + void (*test_func)(void *In, signed char *Out, int offset); + void *In; + signed char *Out; + int offset; +} test_args_t; + +typedef struct +{ + void (*test_func)(void *In, signed char *Out, int offset); + void *In; + signed char *Out; +} test_args_16_t; + +void test565(void *In, signed char *Out, int offset) +{ + if (offset) + { + printf("testfunc1\n"); + testfunc1((unsigned short *)In, Out); + } + else + { + printf("testfunc2\n"); + testfunc2((unsigned short *)In, Out); + } +} + +void test888(void *In, signed char *Out, int offset) +{ + if (offset) + { + printf("testfunc3\n"); + testfunc3((unsigned char *)In, Out); + } + else + { + printf("testfunc4\n"); + testfunc4((unsigned char *)In, Out); + } +} + +void testBW(void *In, signed char *Out, int offset) +{ + if (offset) + { + printf("testfunc5\n"); + testfunc5((unsigned char *)In, Out); + } + else + { + printf("testfunc6\n"); + testfunc6((unsigned char *)In, Out); + } +} + +void prepare565(void *buf) +{ + unsigned short *sbuf = (unsigned short *)buf; + printf("prepare565\n"); + for (int h = 0; h < 240; h++) + { + for (int w = 0; w < 320; w++) + { + sbuf[h * 320 + w] = ((h % 3) << 11) | (((h % 3) + 1) << 6) | ((h % 3) + 2); //testfunc1-2 + } + } +} + +void prepare888(void *buf) +{ + unsigned char *cbuf = (unsigned char *)buf; + printf("prepare888\n"); + for (int h = 0; h < 240; h++) + { + for (int w = 0; w < 320; w++) + { + for (int c = 0; c < 3; c++) + { + cbuf[h * 320 * 3 + w * 3 + c] = h + c; + } + } + } +} + +void prepareBW(void *buf) +{ + unsigned char *cbuf = (unsigned char *)buf; + printf("prepareBW\n"); + for (int h = 0; h < 240; h++) + { + for (int w = 0; w < 320; w++) + { + cbuf[h * 320 + w] = h; + } + } +} + +int verify565(signed char *buf, int offset) +{ + printf("verify565 offset(%d)", offset); + for (int c = 0; c < 3; c++) + { + for (int h = 0; h < 240; h++) + { + for (int 
w = 0; w < 320; w++) + { + if (offset) + { + assert("testfunc1", h, w, c, buf[c * 240 * 320 + 320 * h + w], (((h % 3) + c) << 3) - 128); //testfunc1 + } + else + { + assert("testfunc2", h, w, c, buf[c * 240 * 320 + 320 * h + w], (((h % 3) + c) << 3) >> 1); //testfunc2 + } + } + } + } + return 0; +} + +int verify888(signed char *buf, int offset) +{ + printf("verify888 offset(%d)", offset); + for (int c = 0; c < 3; c++) + { + for (int h = 0; h < 240; h++) + { + for (int w = 0; w < 320; w++) + { + if (offset) + { + assert("testfunc3", h, w, c, buf[c * 240 * 320 + 320 * h + w], ((h + c) - 128)); //testfunc3 + } + else + { + assert("testfunc4", h, w, c, buf[c * 240 * 320 + 320 * h + w], ((h + c) >> 1)); //testfunc4 + } + } + } + } + return 0; +} + +int verifyBW(signed char *buf, int offset) +{ + printf("verifyBW offset(%d)", offset); + for (int h = 0; h < 240; h++) + { + for (int w = 0; w < 320; w++) + { + if (offset) + { + assert("testfunc5", h, w, 1, buf[320 * h + w], h - 128); //testfunc5 + } + else + { + assert("testfunc6", h, w, 1, buf[320 * h + w], h >> 1); //testfunc6 + } + } + } + return 0; +} + +void do_test_cluster(void *arg) +{ + test_args_t *targs = (test_args_t *)arg; + targs->test_func(targs->In, targs->Out, targs->offset); +} + +void do_test(void *pcluster_dev, void *In, signed char *Out, + void (*prepare)(void *), + int (*verify)(signed char *, int), + void (*test_func)(void *In, signed char *Out, int offset)) +{ + for (int o = 1; o >= 0; o--) + { + prepare(In); +#ifndef __EMUL__ + struct pi_device *cluster_dev = (struct pi_device *)pcluster_dev; + test_args_t *targs; + targs = pmsis_l2_malloc(sizeof(test_args_t)); + memset(targs, 0, sizeof(test_args_t)); + targs->test_func = test_func; + targs->In = In; + targs->Out = Out; + targs->offset = o; + + struct pi_cluster_task *task; + task = pmsis_l2_malloc(sizeof(struct pi_cluster_task)); + memset(task, 0, sizeof(struct pi_cluster_task)); + task->entry = &do_test_cluster; + task->stack_size = 4096; + task->slave_stack_size = 1024; + task->arg = targs; + + pi_cluster_send_task_to_cl(cluster_dev, task); + + pi_l2_free(task, sizeof(struct pi_cluster_task)); + pi_l2_free(targs, sizeof(test_args_t)); +#else + test_func(In, Out, o); +#endif + if (verify(Out, o)) + { + printf(" failed !!\n"); + } + else + { + printf(" passed\n"); + } + } +} + +void testBW16(void *In, signed short int *Out) +{ + printf("testfunc7\n"); + testfunc7((unsigned char *)In, Out); +} + +int verifyBW16(signed short int *buf) +{ + printf("verifyBW 16 bits"); + for (int h = 0; h < 240; h++) + { + for (int w = 0; w < 320; w++) + { + assert("testfunc7", h, w, 1, buf[320 * h + w], h << 7); //testfunc7 + } + } + return 0; +} + +void testRGB16(void *In, signed short int *Out) +{ + printf("testfunc8\n"); + testfunc8((unsigned char *)In, Out); +} + +int verifyRGB16(signed short int *buf) +{ + printf("verifyRGB16"); + for (int c = 0; c < 3; c++) + { + for (int h = 0; h < 240; h++) + { + for (int w = 0; w < 320; w++) + { + assert("testfunc8", h, w, c, buf[c * 240 * 320 + 320 * h + w], ((h + c) << 7)); //testfunc8 + } + } + } + return 0; +} + +void do_test16(void *pcluster_dev, void *In, signed short int *Out, + void (*prepare)(void *), + int (*verify)(signed short int *), + void (*test_func)(void *In, signed short int *Out)) +{ + prepare(In); +#ifndef __EMUL__ + struct pi_device *cluster_dev = (struct pi_device *)pcluster_dev; + test_args_16_t *targs; + targs = pmsis_l2_malloc(sizeof(test_args_16_t)); + memset(targs, 0, sizeof(test_args_16_t)); + targs->test_func = test_func; 
+ targs->In = In; + targs->Out = Out; + + struct pi_cluster_task *task; + task = pmsis_l2_malloc(sizeof(struct pi_cluster_task)); + memset(task, 0, sizeof(struct pi_cluster_task)); + task->entry = &do_test_cluster; + task->stack_size = 4096; + task->slave_stack_size = 1024; + task->arg = targs; + + pi_cluster_send_task_to_cl(cluster_dev, task); + + pi_l2_free(task, sizeof(struct pi_cluster_task)); + pi_l2_free(targs, sizeof(test_args_16_t)); +#else + test_func(In, Out); +#endif + if (verify(Out)) + { + printf(" failed !!\n"); + } + else + { + printf(" passed\n"); + } +} + +int start() +{ + + unsigned char *In = AT_L2_ALLOC(0, SIZE888 * sizeof(unsigned char)); + if (!In) + { + printf("unable to allocate In\n"); + exit(1); + } + signed char *Out = AT_L2_ALLOC(0, SIZE888 * sizeof(signed char)); + if (!Out) + { + printf("unable to allocate Out\n"); + exit(1); + } + signed short int *Out16 = AT_L2_ALLOC(0, SIZE888 * sizeof(signed short int)); + if (!Out16) + { + printf("unable to allocate Out16\n"); + exit(1); + } + + void *pcluster_dev; +#ifndef __EMUL__ + struct pi_device cluster_dev; + struct pi_cluster_conf conf; + pi_cluster_conf_init(&conf); + conf.id = 0; /* Set cluster ID. */ + pi_open_from_conf(&cluster_dev, (void *)&conf); + pi_cluster_open(&cluster_dev); + pi_freq_set(PI_FREQ_DOMAIN_CL, 50000000); + pi_freq_set(PI_FREQ_DOMAIN_FC, 50000000); + pcluster_dev = &cluster_dev; +#else + pcluster_dev = 0; +#endif + L1_Memory = (AT_L1_POINTER)AT_L1_ALLOC(0, _L1_Memory_SIZE * sizeof(signed char)); + if (!L1_Memory) + { + printf("unable to allocate L1"); + exit(1); + } + do_test(pcluster_dev, In, Out, prepare565, verify565, test565); + do_test(pcluster_dev, In, Out, prepare888, verify888, test888); + do_test(pcluster_dev, In, Out, prepareBW, verifyBW, testBW); + do_test16(pcluster_dev, In, Out16, prepareBW, verifyBW16, testBW16); + do_test16(pcluster_dev, In, Out16, prepare888, verifyRGB16, testRGB16); + + printf("test %s\n", (err_count ? "failed" : "passed")); + +#ifdef __EMUL__ + return err_count; +#else + pi_cluster_close(&cluster_dev); + pmsis_exit(err_count); +#endif +} +int main(void) +{ +#ifndef __EMUL__ + return pmsis_kickoff((void *)start); +#else + return start(); +#endif +} diff --git a/tools/nntool/execution/execution_progress.py b/tools/nntool/execution/execution_progress.py new file mode 100644 index 000000000..37f15dd69 --- /dev/null +++ b/tools/nntool/execution/execution_progress.py @@ -0,0 +1,54 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
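The testRun.c harness above drives every normalization kernel twice, once in the offset variant and once in the shift variant, and compares each output pixel against a closed-form expectation. As a rough guide only, the expected arithmetic reduces to the following Python sketch; the function names are invented for illustration and are not part of the patch.

def norm_offset_fps(pixel):      # unsigned 8-bit in, signed 8-bit out
    return pixel - 128           # recenter: 0 -> -128, 255 -> 127

def norm_shift_fps(pixel):       # unsigned 8-bit in, signed 8-bit out
    return pixel >> 1            # halve: 255 -> 127

def norm_fp16(pixel):            # unsigned 8-bit in, signed 16-bit out
    return pixel << 7            # scale up for the 16-bit kernels: 255 -> 32640

assert norm_offset_fps(200) == 72
assert norm_shift_fps(200) == 100
assert norm_fp16(200) == 25600

verifyBW and verify888 check exactly these values per channel; the RGB565 case effectively expands each colour field back to 8 bits first and then applies the same rule.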
+ + +class ExecutionProgress(object): + __instance = None + def __new__(cls): + if ExecutionProgress.__instance is None: + ExecutionProgress.__instance = object.__new__(cls) + return ExecutionProgress.__instance + + def __init__(self): + if not hasattr(self, 'listeners'): + self.listeners = [] + + @classmethod + def progress(cls, step_idx, name): + inst = cls() + for func in inst.listeners: + func(step_idx, name) + + @classmethod + def start(cls): + inst = cls() + for func in inst.listeners: + func(None, "start") + + @classmethod + def end(cls): + inst = cls() + for func in inst.listeners: + func(None, "end") + + @classmethod + def listen(cls, func): + inst = cls() + inst.listeners.append(func) + + @classmethod + def unlisten(cls, func): + inst = cls() + inst.listeners.remove(func) diff --git a/tools/nntool/execution/graph_executer.py b/tools/nntool/execution/graph_executer.py new file mode 100644 index 000000000..959c7d31f --- /dev/null +++ b/tools/nntool/execution/graph_executer.py @@ -0,0 +1,334 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from typing import Optional, Sequence, Mapping + +import numpy as np + +from utils.graph import Graph +from utils.node_id import NodeId +from graph.types import ConvFusionParameters, ActivationFusion, InputParameters, ConstantInputParameters, Parameters +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.kernels.kernel_switch import KernelSwitchBase, DefaultKernelSwitch +from quantization.float32.float_kernet_set import Float32KernelSet +from quantization.symmetric.symmetric_kernet_set import SymmetricKernelSet +from execution.quantization_mode import QuantizationMode +from execution.execution_progress import ExecutionProgress + +LOG = logging.getLogger('nntool.'+__name__) + + +class GraphExecuter(): + def __init__(self, + G: Graph, + qrecs: Optional[Mapping[NodeId, QuantizationRecordBase]] = None, + kernel_switch: Optional[KernelSwitchBase] = None, + quantized_kernel_switch: Optional[KernelSwitchBase] = None): + self._G = G + self._qrecs = qrecs + self._kernel_switch = DefaultKernelSwitch( + Float32KernelSet()) if kernel_switch is None else kernel_switch + self._quantized_kernel_switch = DefaultKernelSwitch( + SymmetricKernelSet()) if quantized_kernel_switch is None else quantized_kernel_switch + + def collect_outputs(self, saved_outputs, node): + # collect outputs from previous nodes + # InputNode is already set above + if isinstance(node, InputParameters): + output = None + else: + output = [None]*len(node.in_dims) + for edge in self._G.in_edges(node.name): + output[edge.to_idx] = saved_outputs[edge.from_node][edge.from_idx] + return output + + @staticmethod + def save_output(saved_outputs, node, outputs): + saved_outputs[node] = outputs + + def execute_qnoq_iterator(self, in_tensors, step_idx_limit=None, silent=False, yield_fusions=True): + + if not 
silent: + LOG.info("execute quantization comparison") + ExecutionProgress.start() + saved_outputs = {} + for step_idx, step in enumerate(self._G.graph_state.steps): + + if step_idx_limit is not None and step_idx > step_idx_limit: + break + + node = step['node'] + + if not silent: + ExecutionProgress.progress(step_idx, node.name) + + output = self.collect_outputs(saved_outputs, node) + nid = NodeId(node, None) + qrec = self._qrecs[nid] + + if isinstance(node, (ConvFusionParameters, ActivationFusion)): + for fusion_node in node.contained_nodes(): + fnid = NodeId(node, fusion_node) + fqrec = self._qrecs[fnid] + + qoutput = [] + for val_idx, val in enumerate(output): + qoutput.append(fqrec.in_qs[val_idx].quantize(val)) + + details = {} + output = self._kernel_switch.execute(fusion_node, output, + fqrec if self._G.has_quantized_parameters else None, + details=details) + qdetails = {} + qoutput = self._quantized_kernel_switch.execute( + fusion_node, qoutput, fqrec, details=qdetails) + qoutput = [fqrec.out_qs[i].dequantize(out) for i, out in enumerate(qoutput)] + if yield_fusions: + yield step_idx, node, output, details, qoutput, qdetails, fusion_node + else: + if isinstance(node, (InputParameters, ConstantInputParameters)): + details = {} + output = self._kernel_switch.execute(node, in_tensors, + qrec if self._G.has_quantized_parameters else None, + details=details) + qdetails = {} + qoutput = self._quantized_kernel_switch.execute( + node, in_tensors, qrec, details=qdetails) + else: + qoutput = [] + for val_idx, val in enumerate(output): + qoutput.append(qrec.in_qs[val_idx].quantize(val)) + details = {} + output = self._kernel_switch.execute(node, output, + qrec if self._G.has_quantized_parameters else None, + details=details) + qdetails = {} + qoutput = self._quantized_kernel_switch.execute( + node, qoutput, qrec, details=qdetails) + + qoutput = [qrec.out_qs[i].dequantize(out) for i, out in enumerate(qoutput)] + + yield step_idx, node, output, details, qoutput, qdetails, None + self.save_output(saved_outputs, node, output) + + if not silent: + ExecutionProgress.end() + + def execute_iterator(self, + in_tensors: Sequence[np.ndarray], + step_idx_limit: Optional[int] = None, + start_node: Optional[Parameters] = None, + qmode: Optional[QuantizationMode] = None, + yield_fusions=True, + yield_details=True, + only_yield_step=False, + record_inputs: Optional[Mapping] = None, + silent=False): + if qmode is None: + qmode = QuantizationMode.none() + + saved_outputs = {} + + if not silent: + LOG.info("execute uncached: quantization mode %s", qmode) + ExecutionProgress.start() + for step_idx, step in enumerate(self._G.graph_state.steps): + + if step_idx_limit is not None and step_idx > step_idx_limit: + break + + node = step['node'] + + if start_node and start_node != node: + continue + + # collect outputs from previous nodes + # InputNode is already set above + output_tensors = self.collect_outputs(saved_outputs, node) + + if not silent: + ExecutionProgress.progress(step_idx, node.name) + nid = NodeId(node, None) + if record_inputs is not None: + if output_tensors is None: + record_inputs[nid] = output_tensors + else: + record_inputs[nid] = [np.copy(output_tensor) + for output_tensor in output_tensors] + + qrec = self._qrecs[nid] if self._qrecs is not None else None + if qmode.get_quantized(node, step_idx): + switch = self._quantized_kernel_switch + if qmode.is_step and output_tensors: + output_tensors = [qrec.in_qs[i].quantize( + output_tensor) for i, output_tensor in enumerate(output_tensors)] + else: + 
switch = self._kernel_switch + + details = {} if yield_details and ( + not only_yield_step or step_idx == step_idx_limit) else None + if isinstance(node, (ConvFusionParameters, ActivationFusion)): + for fusion_node in node.contained_nodes(): + fnid = NodeId(node, fusion_node) + fqrec = None if not qrec else self._qrecs[fnid] + if record_inputs is not None: + record_inputs[nid] = [np.copy(output_tensor) + for output_tensor in output_tensors] + details = {} if yield_fusions and yield_details else None + output_tensors = switch.execute(fusion_node, output_tensors, fqrec, details) + if yield_fusions: + if qmode.dequantize: + qoutput_tensors = [fqrec.out_qs[i].dequantize(output_tensor) + for i, output_tensor + in enumerate(output_tensors)] + yield step_idx, node, fusion_node, qoutput_tensors, details + elif qmode.is_float_q_deq: + qoutput_tensors = [fqrec.out_qs[i].dequantize(fqrec.out_qs[i].quantize(output_tensor)) + for i, output_tensor + in enumerate(output_tensors)] + yield step_idx, node, fusion_node, qoutput_tensors, details + else: + yield step_idx, node, fusion_node, output_tensors, details + elif isinstance(node, InputParameters): + output_tensors = switch.execute(node, in_tensors, qrec, details) + else: + output_tensors = switch.execute(node, output_tensors, qrec, details) + + if qmode.dequantize: + qoutput_tensors = [qrec.out_qs[i].dequantize( + output_tensor) for i, output_tensor in enumerate(output_tensors)] + if not only_yield_step or step_idx == step_idx_limit: + yield step_idx, node, None, qoutput_tensors, details + if qmode.is_step and qmode.get_quantized(node, step_idx): + output_tensors = qoutput_tensors + elif qmode.is_float_q_deq: + if qmode.is_step and qmode.get_quantized(node, step_idx): + output_tensors = [qrec.out_qs[i].dequantize( + output_tensor) for i, output_tensor in enumerate(output_tensors)] + qoutput_tensors = [qrec.out_qs[i].dequantize(qrec.out_qs[i].quantize( + output_tensor)) for i, output_tensor in enumerate(output_tensors)] + if not only_yield_step or step_idx == step_idx_limit: + yield step_idx, node, None, qoutput_tensors, details + else: + if qmode.is_step and qmode.get_quantized(node, step_idx): + output_tensors = [qrec.out_qs[i].dequantize( + output_tensor) for i, output_tensor in enumerate(output_tensors)] + if not only_yield_step or step_idx == step_idx_limit: + yield step_idx, node, None, output_tensors, details + + self.save_output(saved_outputs, node, output_tensors) + + if not silent: + ExecutionProgress.end() + + def execute_qnoq(self, + in_tensors: Sequence[np.ndarray], + step_idx_limit=None, + all_details=None, + yield_fusions=False, + silent=False): + outputs = [] + if yield_fusions: + fusion_outputs = [] + if all_details is not None: + fusion_details = [] + for _, _, _, _, qoutput, qdetails, fnode in self.execute_qnoq_iterator(in_tensors, + step_idx_limit=step_idx_limit, + silent=silent): + if yield_fusions: + if fnode: + fusion_outputs.append([output_tensor.copy() + for output_tensor in qoutput]) + if all_details is not None: + fusion_details.append(qdetails) + else: + outputs.append({ + 'outputs': outputs.append([output_tensor.copy() for output_tensor in qoutput]), + 'fusion_outputs': fusion_outputs.copy(), + }) + fusion_outputs.clear() + if all_details is not None: + all_details.append({ + 'details': qdetails, + 'fusion_details': fusion_details.copy() + }) + fusion_details.clear() + elif fnode is None: + outputs.append([output_tensor.copy() for output_tensor in qoutput]) + if all_details is not None: + all_details.append(qdetails) + 
return outputs + + def execute(self, + in_tensors: Sequence[np.ndarray], + step_idx_limit=None, + only_yield_step=False, + qmode: QuantizationMode = None, + all_details=None, + yield_fusions=False, + silent=False): + + if qmode is None: + qmode = QuantizationMode.none() + + if qmode.is_step_all: + iterator = [(qoutput, qdetails, fnode) + for _, _, _, _, qoutput, qdetails, fnode + in self.execute_qnoq_iterator(in_tensors, + yield_fusions=yield_fusions, + step_idx_limit=step_idx_limit, + silent=silent)] + else: + iterator = [(output_tensors, details, fnode) + for _, _, fnode, output_tensors, details + in self.execute_iterator(in_tensors, step_idx_limit=step_idx_limit, + qmode=qmode, + yield_fusions=yield_fusions, + only_yield_step=only_yield_step, + yield_details=all_details is not None, + silent=silent)] + + outputs = [] + if yield_fusions: + fusion_outputs = [] + if all_details is not None: + fusion_details = [] + + for output_tensors, details, fnode in iterator: + if yield_fusions: + if fnode: + fusion_outputs.append([output_tensor.copy() + for output_tensor in output_tensors]) + if all_details is not None: + fusion_details.append(details) + else: + outputs.append({ + 'outputs': outputs.append([output_tensor.copy() for output_tensor in output_tensors]), + 'fusion_outputs': fusion_outputs.copy(), + }) + fusion_outputs.clear() + if all_details is not None: + all_details.append({ + 'details': details, + 'fusion_details': fusion_details.copy() + }) + fusion_details.clear() + else: + outputs.append([output_tensor.copy() for output_tensor in output_tensors]) + if all_details is not None: + all_details.append(details) + return outputs diff --git a/tools/nntool/execution/quantization_mode.py b/tools/nntool/execution/quantization_mode.py index 21c78dbfc..54176cecf 100644 --- a/tools/nntool/execution/quantization_mode.py +++ b/tools/nntool/execution/quantization_mode.py @@ -13,18 +13,32 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
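Taken together, GraphExecuter and QuantizationMode give a single entry point for running a graph at different quantization levels, with ExecutionProgress reporting per-step progress. A minimal usage sketch, assuming G is an already-imported and quantized nntool graph and input_tensor a numpy array (both assumptions, as is the G.quantization attribute name):

from execution.graph_executer import GraphExecuter
from execution.execution_progress import ExecutionProgress
from execution.quantization_mode import QuantizationMode

# Print each step as it executes (start/end are reported with step_idx None).
ExecutionProgress.listen(lambda step_idx, name: print(step_idx, name))

executer = GraphExecuter(G, qrecs=G.quantization)
float_outs = executer.execute([input_tensor], qmode=QuantizationMode.none())

details = []          # filled with per-step execution details
deq_outs = executer.execute([input_tensor],
                            qmode=QuantizationMode.all_dequantize(),
                            all_details=details)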
-from typing import Union +from typing import Union, Optional from graph.types import Parameters +from utils.node_id import NodeId class QuantizationMode(): - def __init__(self, qlevel: str = "all", qstep: Union[Parameters, int] = None): + def __init__(self, qlevel: str = "all", qstep: Optional[Union[int, NodeId]] = None, dequantize=False): self._qlevel = qlevel self._qstep = qstep + self._dequantize = dequantize @classmethod def all(cls): return cls() + @classmethod + def step_all(cls): + return cls(qlevel="step_all") + + @classmethod + def all_float_quantize_dequantize(cls): + return cls(qlevel="float_q_deq") + + @classmethod + def all_dequantize(cls): + return cls(dequantize=True) + @classmethod def none(cls): return cls(qlevel="none") @@ -34,7 +48,7 @@ def step(cls, qstep: Union[Parameters, int]): return cls(qlevel="step", qstep=qstep) def get_quantized(self, node: Parameters, step_idx: int): - if self._qlevel == "none": + if self._qlevel in ("none", "float_q_deq"): return False if self._qlevel == "all": return True @@ -42,10 +56,18 @@ def get_quantized(self, node: Parameters, step_idx: int): return node == self._qstep return step_idx == self._qstep + @property + def is_float_q_deq(self): + return self._qlevel == "float_q_deq" + @property def is_step(self): return self._qlevel == "step" + @property + def is_step_all(self): + return self._qlevel == "step_all" + @property def is_all(self): return self._qlevel == "all" @@ -54,6 +76,10 @@ def is_all(self): def is_none(self): return self._qlevel == "none" + @property + def dequantize(self): + return (self.is_step or self.is_all) and self._dequantize + def __str__(self): if self.is_none or self.is_all: return self._qlevel diff --git a/tools/nntool/generation/at_generators/__init__.py b/tools/nntool/generation/at_generators/__init__.py new file mode 100644 index 000000000..556c4e3e0 --- /dev/null +++ b/tools/nntool/generation/at_generators/__init__.py @@ -0,0 +1,21 @@ +from .cnn_3d_tensor_permute import (gen_3d_transpose_at_params, + gen_at_3d_transpose) +from .cnn_convolution_mulbias_pool_relu import gen_at_mulconv_pool_relu +from .cnn_convolution_pool_relu import (NO_ACTIVATION, NO_CONV, NO_POOL, + ActivationATParam, ConvATParam, + PoolATParam, gen_activation_op, + gen_active_at_params, + gen_at_conv_pool_relu, + gen_conv_at_params, gen_pool_at_params) +from .cnn_global_pool import gen_at_globalpool, gen_globalpool_at_params +from .cnn_grouped_convolution_mulbias_pool_relu import \ + gen_at_grouped_mulconv_pool_relu +from .cnn_grouped_convolution_pool_relu import (GroupedConvATParam, + gen_at_grouped_conv_pool_relu) +from .cnn_linear_relu import gen_at_linear_relu, gen_linear_at_params +from .cnn_matrix import (gen_at_matrixadd, gen_at_matrixadddyn, + gen_at_matscale, gen_matrixadd_at_params, + gen_matrixadddyn_at_params, gen_matscale_at_params) +from .cnn_pool_relu import gen_at_pool_relu +from .cnn_softmax import gen_at_softmax, gen_softmax_at_params +from .utils import at_bits, at_bits_and_q, at_q diff --git a/tools/nntool/generation/at_generators/cnn_3d_tensor_permute.py b/tools/nntool/generation/at_generators/cnn_3d_tensor_permute.py new file mode 100644 index 000000000..ddb76eac8 --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_3d_tensor_permute.py @@ -0,0 +1,47 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the 
+# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from collections import namedtuple +from .utils import at_bits + +GEN_3D_TRANSPOSE = "CNN_3DTensorPermute" + +ThreeDTensorTransposeATParam = namedtuple('ThreeDTensorTransposeATParam', [ + 'MatPermOper' +]) + +def gen_3d_transpose_at_params(params): + if params.transpose_dimension == 2: + perm = params.permute(['H', 'W']) + permop = "KOP_MATPERM_CHW2C{}".format("".join(perm)) + else: + perm = params.permute(['C', 'H', 'W']) + permop = "KOP_MATPERM_CHW2{}".format("".join(perm)) + return ThreeDTensorTransposeATParam( + MatPermOper=permop + ) + +def gen_at_3d_transpose(code_block, name, in_q, out_q, + in_shape, at_transpose_params, gen_ctrl=None, + at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('{}("{}", {}, {}, {}, {}, {}, 1, 1, {}, {}, {}, {});', + GEN_3D_TRANSPOSE, name, gen_ctrl, at_bits(in_q), at_bits(out_q), + in_q.q, out_q.q, in_shape[0], in_shape[1], in_shape[2], + at_transpose_params.MatPermOper) diff --git a/tools/nntool/generation/at_generators/cnn_convolution_mulbias_pool_relu.py b/tools/nntool/generation/at_generators/cnn_convolution_mulbias_pool_relu.py new file mode 100644 index 000000000..87292b01e --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_convolution_mulbias_pool_relu.py @@ -0,0 +1,46 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
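The permute operator string is derived purely from the node's permutation, so its behaviour can be pinned down with a stand-in object. A sketch, assuming the package is importable as generation.at_generators and using a fake node that only provides the two members the generator reads:

from generation.at_generators import gen_3d_transpose_at_params

class FakeTranspose:                       # illustrative stand-in, not a real nntool class
    def __init__(self, transpose_dimension, order):
        self.transpose_dimension = transpose_dimension
        self._order = order
    def permute(self, labels):             # reorder the ['C','H','W'] or ['H','W'] labels
        return [labels[i] for i in self._order]

# Full CHW -> HWC permutation: channels last.
assert gen_3d_transpose_at_params(FakeTranspose(3, [1, 2, 0])).MatPermOper == "KOP_MATPERM_CHW2HWC"
# 2D swap of H and W with channels untouched: CHW -> CWH.
assert gen_3d_transpose_at_params(FakeTranspose(2, [1, 0])).MatPermOper == "KOP_MATPERM_CHW2CWH"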
+ +from .utils import at_bits + +GEN_MULCONV_POOL_RELU = "CNN_ConvolutionMulBiasPoolReLU" + +def gen_at_mulconv_pool_relu(code_block, name, in_q, out_q, + filt_q, bias_q, mul_biases_q, in_dim, out_dim, + at_conv, at_pool, at_active, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + gen_ctrl = gen_ctrl.ctrl_name + + if at_ver < 3: + raise NotImplementedError("mulbias before ver 3 not supported") + + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, 1, 1, {}, {}, {}, {},', + GEN_MULCONV_POOL_RELU, name, gen_ctrl, + at_bits(in_q), at_bits(filt_q), at_bits( + bias_q), at_bits(mul_biases_q), at_bits(out_q), + in_q.q, filt_q.q, bias_q.q, mul_biases_q.q, out_q.q, + in_dim.c, out_dim.c, in_dim.w, in_dim.h) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + at_conv.ConvOper, at_conv.Fcx, at_conv.Fcy, + at_conv.Dcx, at_conv.Dcy, at_conv.Scx, at_conv.Scy, + at_conv.ConvPad) + code_block.write('{}, {}, {}, {}, {}, {}, {}, {}, {});', + at_pool.PoolOper, at_pool.Fpx, at_pool.Fpy, + at_pool.Dpx, at_pool.Dpy, at_pool.Spx, at_pool.Spy, + at_pool.PoolPad, at_active.ReLUOper) + code_block.deindent() diff --git a/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py b/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py new file mode 100644 index 000000000..e9dea6f6f --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py @@ -0,0 +1,214 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from collections import namedtuple + +from .cnn_grouped_convolution_pool_relu import GroupedConvATParam +from .utils import at_bits + +GEN_CONV_POOL_RELU = "CNN_ConvolutionPoolReLU" + +# extern void CNN_ConvolutionPoolReLU( +# char *Name, + +# CNN_GenControl_T *Ctrl, + +# int In_DataSize, +# int Filter_DataSize, +# int Bias_DataSize, +# int Out_DataSize, + +# int In_InL3, // 1 if In comes from L3, 0 if it comes from L2 +# int Filter_InL3, +# int Bias_InL3, +# int Out_InL3, + +# int InFeat, +# int OutFeat, +# int Width, +# int Height, + +# KernelOper_T ConvOper, +# int Fcx, +# int Fcy, +# int Dcx, +# int Dcy, +# int Scx, +# int Scy, +# int ConvPad, + +# KernelOper_T PoolOper, +# int Fpx, +# int Fpy, +# int Dpx, +# int Dpy, +# int Spx, +# int Spy, +# int PoolPad, + +# KernelOper_T ReLUOper +# ); + +# pylint: disable=too-many-arguments + +ConvATParam = namedtuple('ConvATParam', [ + "ConvOper", + "Fcx", + "Fcy", + "Dcx", + "Dcy", + "Scx", + "Scy", + "ConvPad" +]) + +NO_CONV = ConvATParam(ConvOper='KOP_NONE', Fcx=0, Fcy=0, Dcx=0, Dcy=0, Scx=0, Scy=0, ConvPad=0) + + +def is_dp(_): + # if conv_q.calc_q == conv_q.acc_q and\ + # conv_q.acc_q.bits > conv_q.out_qs[0].bits: + # cop = "KOP_CONV_DP" + # else: + # cop = "KOP_CONV" + return True + + +def gen_conv_at_params(params, conv_q, pad_compatibilities, do_dp=False): + if params.is_depthwise_conv(): + assert params.multiplier == 1, "Multiplier not supported" + assert not do_dp, "No DP output for DW convolution" + cop = is_dp(conv_q) and "KOP_CONV_DWDP" or "KOP_CONV_DW" + elif params.is_grouped_conv(): + cop = is_dp(conv_q) and "KOP_CONV_DP" or "KOP_CONV" + return GroupedConvATParam( + ConvOper=cop, + GroupIn=params.groups, + GroupOut=params.multiplier, + Fcx=params.filter.w, + Fcy=params.filter.h, + Dcx=params.dilation.w, + Dcy=params.dilation.h, + Scx=params.stride.w, + Scy=params.stride.h, + ConvPad=params.has_at_zero_pad() and 1 or 0 + ) + else: + cop = is_dp(conv_q) and "KOP_CONV_DP" or "KOP_CONV" + + pad_compatibilities.append(params.padding.pad_compatibility) + return ConvATParam( + ConvOper=cop, + Fcx=params.filter.w, + Fcy=params.filter.h, + Dcx=params.dilation.w, + Dcy=params.dilation.h, + Scx=params.stride.w, + Scy=params.stride.h, + ConvPad=params.has_at_zero_pad() and 1 or 0 + ) + + +PoolATParam = namedtuple('PoolATParam', [ + "PoolOper", + "Fpx", + "Fpy", + "Dpx", + "Dpy", + "Spx", + "Spy", + "PoolPad" +]) + +NO_POOL = PoolATParam(PoolOper='KOP_NONE', Fpx=0, Fpy=0, Dpx=0, Dpy=0, Spx=0, Spy=0, PoolPad=0) + + +def gen_pool_at_params(params, pad_compatibilities): + if params.pool_type == "average": + pop = "KOP_AVGPOOL" + elif params.pool_type == "max": + pop = "KOP_MAXPOOL" + else: + raise NotImplementedError() + + pad_compatibilities.append(params.padding.pad_compatibility) + return PoolATParam( + PoolOper=pop, + Fpx=params.filter.w, + Fpy=params.filter.h, + Dpx=1, + Dpy=1, + Spx=params.stride.w, + Spy=params.stride.h, + PoolPad=params.has_at_zero_pad() and 1 or 0 + ) + + +ActivationATParam = namedtuple('ActivationATParam', [ + "ReLUOper" +]) + +NO_ACTIVATION = ActivationATParam(ReLUOper='KOP_NONE') + + +def gen_activation_op(activation): + if activation is None or activation == "none": + aop = "KOP_NONE" + elif activation == "relu": + aop = "KOP_RELU" + elif activation == "relu6": + aop = "KOP_RELUN" + elif activation == "relun": + aop = "KOP_RELUN" + elif activation == "sigmoid" or activation == "hsigmoid": + aop = "KOP_HSIGMOID" + elif activation == "swish" or activation == "hswish": + aop = "KOP_HSWISH" + else: + raise 
NotImplementedError("activation type %s not implemented" % activation) + return aop + + +def gen_active_at_params(params): + return ActivationATParam( + ReLUOper=gen_activation_op(params.activation) + ) + + +def gen_at_conv_pool_relu(code_block, name, in_q, out_q, + filt_q, bias_q, in_dim, out_dim, + at_conv: ConvATParam, at_pool: PoolATParam, + at_active: ActivationATParam, gen_ctrl=None, at_ver=3): + del at_ver + if gen_ctrl is None: + gen_ctrl = "0" + else: + gen_ctrl = gen_ctrl.ctrl_name + + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, 1, {}, {}, {}, {},', + GEN_CONV_POOL_RELU, name, gen_ctrl, + at_bits(in_q), at_bits(filt_q), at_bits(bias_q), at_bits(out_q), + in_q.q, filt_q.q, bias_q.q, out_q.q, + in_dim.c, out_dim.c, in_dim.w, in_dim.h) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + at_conv.ConvOper, at_conv.Fcx, at_conv.Fcy, + at_conv.Dcx, at_conv.Dcy, at_conv.Scx, at_conv.Scy, + at_conv.ConvPad) + code_block.write('{}, {}, {}, {}, {}, {}, {}, {}, {});', + at_pool.PoolOper, at_pool.Fpx, at_pool.Fpy, + at_pool.Dpx, at_pool.Dpy, at_pool.Spx, at_pool.Spy, + at_pool.PoolPad, at_active.ReLUOper) + code_block.deindent() diff --git a/tools/nntool/generation/at_generators/cnn_global_pool.py b/tools/nntool/generation/at_generators/cnn_global_pool.py new file mode 100644 index 000000000..7919a716f --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_global_pool.py @@ -0,0 +1,70 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
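Several generators share this activation-name to operator translation, so it is worth spelling out what gen_activation_op returns; the mapping below is taken directly from the code above, only the import path is assumed.

from generation.at_generators import gen_activation_op

assert gen_activation_op(None) == "KOP_NONE"
assert gen_activation_op("relu") == "KOP_RELU"
assert gen_activation_op("relu6") == "KOP_RELUN"      # same for "relun"
assert gen_activation_op("hsigmoid") == "KOP_HSIGMOID"
assert gen_activation_op("hswish") == "KOP_HSWISH"
# any other name raises NotImplementedError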
+ +from collections import namedtuple +from .utils import at_bits + +GEN_GLOBALPOOL = "CNN_GlobalPool" +# /** \brief CNN_GlobalPool +# * Generator for Global Pooling (Max or Average) +# * + +# \param Name: Name of the generated user kernel + +# \param Ctrl: Overide generator default options (TileOrientation, Parallel Features), Def=(TILE_HOR, 1) + +# \param In_DataSize: 1: byte, 2: half word, 4: word +# \param Out_DataSize: 1: byte, 2: half word, 4: word + +# \param In_Q: In fixed point format +# \param Out_Q: Out fixed point format + +# \param In_InL3: 0: In is in L2, 1: In is in L3 memory +# \param Out_InL3: 0: Out is in L2, 1: Out is in L3 memory + +# \param InFeat: Number of input feature's maps +# \param OutFeat: Number of output feature's maps (InFeat has to be equal to OutFeat for these generators +# \param Width: Number of columns of a given feature map +# \param Height: Number of lines of a given feature map + +# \param PoolOper: KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL + +GlobalPoolATParam = namedtuple('GlobalPoolATParam', [ + "GlobalPoolOper" +]) + +def gen_globalpool_at_params(params): + return GlobalPoolATParam( + GlobalPoolOper="KOP_GLOBAL_AVGPOOL" if params.pool_type == "average" else "KOP_GLOBAL_MAXPOOL" + ) + +def gen_at_globalpool(code_block, name, in_q, out_q, + in_dim, out_dim, at_globalpool, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + if at_ver < 3: + code_block.write('{}("{}", {}, {}, {}, 1, 1, {}, {}, {}, {}, {});', + GEN_GLOBALPOOL, name, gen_ctrl, + at_bits(in_q), at_bits(out_q), in_dim.shape[0], out_dim.shape[0], + in_dim.shape[1], in_dim.shape[2], at_globalpool.GlobalPoolOper) + else: + code_block.write('{}("{}", {}, {}, {}, {}, {}, 1, 1, {}, {}, {}, {}, {});', + GEN_GLOBALPOOL, name, gen_ctrl, + at_bits(in_q), at_bits( + out_q), in_q.q, out_q.q, in_dim.shape[0], out_dim.shape[0], + in_dim.shape[1], in_dim.shape[2], at_globalpool.GlobalPoolOper) diff --git a/tools/nntool/generation/at_generators/cnn_grouped_convolution_mulbias_pool_relu.py b/tools/nntool/generation/at_generators/cnn_grouped_convolution_mulbias_pool_relu.py new file mode 100644 index 000000000..1a97f5470 --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_grouped_convolution_mulbias_pool_relu.py @@ -0,0 +1,48 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
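The global pooling generator only has to choose between the two KOP_GLOBAL_* operators, so the parameter helper is tiny. A sketch with a namedtuple standing in for the pooling node (the stand-in and the import path are assumptions):

from collections import namedtuple
from generation.at_generators import gen_globalpool_at_params

FakeGlobalPool = namedtuple('FakeGlobalPool', ['pool_type'])
assert gen_globalpool_at_params(FakeGlobalPool('average')).GlobalPoolOper == "KOP_GLOBAL_AVGPOOL"
assert gen_globalpool_at_params(FakeGlobalPool('max')).GlobalPoolOper == "KOP_GLOBAL_MAXPOOL"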
+ +from .utils import at_bits + +GEN_GROUPED_MULCONV_POOL_RELU = "CNN_GroupedConvolutionMulBiasPoolReLU" + +def gen_at_grouped_mulconv_pool_relu(code_block, name, in_q, out_q, + filt_q, bias_q, mul_biases_q, in_dim, out_dim, + at_conv, at_pool, at_active, gen_ctrl=None, + at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + gen_ctrl = gen_ctrl.ctrl_name + + if at_ver < 3: + raise NotImplementedError("mulbias before ver 3 not supported") + + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, 1, 1, {}, {}, {}, {},', + GEN_GROUPED_MULCONV_POOL_RELU, name, gen_ctrl, + at_conv.GroupIn, at_conv.GroupOut, + at_bits(in_q), at_bits(filt_q), at_bits(bias_q), + at_bits(mul_biases_q), at_bits(out_q), + in_q.q, filt_q.q, bias_q.q, mul_biases_q.q, out_q.q, + in_dim.c, out_dim.c, in_dim.w, in_dim.h) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + at_conv.ConvOper, at_conv.Fcx, at_conv.Fcy, + at_conv.Dcx, at_conv.Dcy, at_conv.Scx, at_conv.Scy, + at_conv.ConvPad) + code_block.write('{}, {}, {}, {}, {}, {}, {}, {}, {});', + at_pool.PoolOper, at_pool.Fpx, at_pool.Fpy, + at_pool.Dpx, at_pool.Dpy, at_pool.Spx, at_pool.Spy, + at_pool.PoolPad, at_active.ReLUOper) + code_block.deindent() diff --git a/tools/nntool/generation/at_generators/cnn_grouped_convolution_pool_relu.py b/tools/nntool/generation/at_generators/cnn_grouped_convolution_pool_relu.py new file mode 100644 index 000000000..20d5ed751 --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_grouped_convolution_pool_relu.py @@ -0,0 +1,126 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from collections import namedtuple + +from .utils import at_bits + +GEN_GROUPED_CONV_POOL_RELU = "CNN_GroupedConvolutionPoolReLU" + +# ConvOper: Type of convolution, Regular convolution: KOP_CONV, +# Regular convolution with double precision output: KOP_CONV_DP, +# Depth wise convolution: KOP_CONV_DW +# GroupIn: Size of the group for input features +# GroupOut: Size of the group for output features +# Fcx: Convolution filter x dimension +# Fcy: Convolution filter y dimension +# Dcx: Convolution filter dilation factor, x dimension +# Dcy: Convolution filter dilation factor, y dimension +# Scx: Convolution filter stride x dimension +# Scy: Convolution filter stride y dimension +# ConvPad: 0: No padding, 1: Zero padding + +GroupedConvATParam = namedtuple('GroupedConvATParam', [ + "ConvOper", + "GroupIn", + "GroupOut", + "Fcx", + "Fcy", + "Dcx", + "Dcy", + "Scx", + "Scy", + "ConvPad" +]) + +# extern void CNN_GroupedConvolutionPoolReLU( +# char *Name, + +# CNN_GenControl_T *Ctrl, + +# GroupIn: Size of the group for input features +# GroupOut: Size of the group for output features + +# int In_DataSize, +# int Filter_DataSize, +# int Bias_DataSize, +# int Out_DataSize, + +# int In_InL3, // 1 if In comes from L3, 0 if it comes from L2 +# int Filter_InL3, +# int Bias_InL3, +# int Out_InL3, + +# int InFeat, +# int OutFeat, +# int Width, +# int Height, + +# KernelOper_T ConvOper, +# int Fcx, +# int Fcy, +# int Dcx, +# int Dcy, +# int Scx, +# int Scy, +# int ConvPad, + +# KernelOper_T PoolOper, +# int Fpx, +# int Fpy, +# int Dpx, +# int Dpy, +# int Spx, +# int Spy, +# int PoolPad, + +# KernelOper_T ReLUOper +# ); + +# pylint: disable=too-many-arguments + + +def gen_at_grouped_conv_pool_relu(code_block, name, in_q, out_q, + filt_q, bias_q, in_dim, out_dim, + at_conv, at_pool, at_active, gen_ctrl=None, + at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + gen_ctrl = gen_ctrl.ctrl_name + + if at_ver < 3: + code_block.write('{}("{}", {}, {}, {}, {}, {}, 1, 1, 1, 1, {}, {}, {}, {},', + GEN_GROUPED_CONV_POOL_RELU, name, gen_ctrl, + at_conv.GroupIn, at_conv.GroupOut, + at_bits(in_q), at_bits(filt_q), at_bits(bias_q), at_bits(out_q), + in_dim.c, out_dim.c, in_dim.w, in_dim.h) + else: + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, 1, {}, {}, {}, {},', + GEN_GROUPED_CONV_POOL_RELU, name, gen_ctrl, + at_conv.GroupIn, at_conv.GroupOut, + at_bits(in_q), at_bits(filt_q), at_bits(bias_q), at_bits(out_q), + in_q.q, filt_q.q, bias_q.q, out_q.q, + in_dim.c, out_dim.c, in_dim.w, in_dim.h) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + at_conv.ConvOper, at_conv.Fcx, at_conv.Fcy, + at_conv.Dcx, at_conv.Dcy, at_conv.Scx, at_conv.Scy, + at_conv.ConvPad) + code_block.write('{}, {}, {}, {}, {}, {}, {}, {}, {});', + at_pool.PoolOper, at_pool.Fpx, at_pool.Fpy, + at_pool.Dpx, at_pool.Dpy, at_pool.Spx, at_pool.Spy, + at_pool.PoolPad, at_active.ReLUOper) + code_block.deindent() diff --git a/tools/nntool/generation/at_generators/cnn_linear_relu.py b/tools/nntool/generation/at_generators/cnn_linear_relu.py new file mode 100644 index 000000000..0c528504d --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_linear_relu.py @@ -0,0 +1,78 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from collections import namedtuple +from .utils import at_bits + +GEN_LINEAR_RELU = "CNN_LinearReLU" +# extern void CNN_LinearReLU( +# char *Name, + +# CNN_GenControl_T *Ctrl, + +# int In_DataSize, +# int Filter_DataSize, +# int Bias_DataSize, +# int Out_DataSize, + +# int In_InL3, +# int Filter_InL3, +# int Bias_InL3, +# int Out_InL3, + +# int InDim, +# int OutDim, + +# KernelOper_T LinearOper, +# KernelOper_T ReLUOper +# ); + +# pylint: disable=too-many-arguments + + +LinearATParam = namedtuple('LinearATParam', [ + "LinearOper" +]) + + +def gen_linear_at_params(_): + return LinearATParam( + LinearOper="KOP_LINEAR" + ) + + +def gen_at_linear_relu(code_block, name, in_q, out_q, + filt_q, bias_q, in_dim, out_dim, + at_linear, at_active, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + gen_ctrl = gen_ctrl.ctrl_name + + if at_ver < 3: + code_block.write('{}("{}", {}, {}, {}, {}, {}, 1, 1, 1, 1, {}, {},', + GEN_LINEAR_RELU, name, gen_ctrl, + at_bits(in_q), at_bits(filt_q), at_bits(bias_q), at_bits(out_q), + in_dim.size(), out_dim.size()) + else: + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, 1, {}, {},', + GEN_LINEAR_RELU, name, gen_ctrl, + at_bits(in_q), at_bits(filt_q), at_bits(bias_q), at_bits(out_q), + in_q.q, filt_q.q, bias_q.q, out_q.q, + in_dim.size(), out_dim.size()) + code_block.indent() + code_block.write('{}, {});', + at_linear.LinearOper, at_active.ReLUOper) + code_block.deindent() diff --git a/tools/nntool/generation/at_generators/cnn_matrix.py b/tools/nntool/generation/at_generators/cnn_matrix.py new file mode 100644 index 000000000..1a6632fe4 --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_matrix.py @@ -0,0 +1,109 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from collections import namedtuple +from .utils import at_bits, at_q +from .cnn_convolution_pool_relu import gen_activation_op + +GEN_MATADD = "CNN_MatAdd" +GEN_MATADDDYN = "CNN_MatAddDynAdjust" +GEN_MATSCALE = "CNN_MatScale" + + +MatrixAddATParam = namedtuple('MatrixAddATParam', [ + "MatrixAddOper" +]) + +def gen_matrixadd_at_params(_): + return MatrixAddATParam( + MatrixAddOper="KOP_MATADD" + ) + +def gen_at_matrixadd(code_block, name, in_q1, in_q2, out_q, + in_dim, out_dim, at_matrixadd, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + if at_ver < 3: + code_block.write('{}("{}", {}, {}, {}, {}, 1, 1, 1, {}, {}, {}, {}, {});', + GEN_MATADD, name, gen_ctrl, + at_bits(in_q1), at_bits(in_q2), at_bits( + out_q), in_dim.shape[0], out_dim.shape[0], + in_dim.shape[1], in_dim.shape[2], at_matrixadd.MatrixAddOper) + else: + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, {}, {}, {}, {}, {});', + GEN_MATADD, name, gen_ctrl, + at_bits(in_q1), at_bits(in_q2), at_bits(out_q), + in_q1.q, in_q2.q, out_q.q, in_dim.shape[0], out_dim.shape[0], + in_dim.shape[1], in_dim.shape[2], at_matrixadd.MatrixAddOper) + +# pylint: disable=too-many-arguments + +def gen_matrixadddyn_at_params(_): + return MatrixAddATParam( + MatrixAddOper="KOP_MATADD_DYNADJUST" + ) + +def gen_at_matrixadddyn(code_block, name, in_q1, in_q2, out_q, + in_dim, out_dim, at_matrixadd, gen_ctrl=None): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, {}, {}, {}, {}, {});', + GEN_MATADDDYN, name, gen_ctrl, + at_bits(in_q1), at_bits(in_q2), at_bits(out_q), + in_q1.q, in_q2.q, out_q.q, + in_dim.shape[0], out_dim.shape[0], + in_dim.shape[1], in_dim.shape[2], at_matrixadd.MatrixAddOper) + +# pylint: disable=too-many-arguments + +MatScaleATParam = namedtuple('MatScaleATParam', [ + "ScaleOper", + "ReLUOper" +]) + +def gen_matscale_at_params(params): + # KOP_MATSCALE_VECTOR, KOP_MATSCALE_SCALAR or KOP_MATSCALE_VECTOR_SCALAR + if params.fusion_type == "vec_scalar": + ms_op = 'KOP_MATSCALE_VECTOR_SCALAR' + elif params.fusion_type == "vector": + ms_op = 'KOP_MATSCALE_VECTOR' + elif params.fusion_type == "scalar": + ms_op = 'KOP_MATSCALE_SCALAR' + else: + raise NotImplementedError("unknown fusion type %s" % params.fusion_type) + return MatScaleATParam( + ScaleOper=ms_op, + ReLUOper=gen_activation_op(params.activation) + ) + +def gen_at_matscale(code_block, name, other_q, vector_q, scalar_q, out_q, + in_dim, out_dim, at_matscale, gen_ctrl=None): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {}, {}, {}, 1, 1, 1, 1, {}, {}, {}, {}, {}, {});', + GEN_MATSCALE, name, gen_ctrl, + at_bits(other_q), at_bits(vector_q), at_bits(scalar_q), at_bits(out_q), + at_q(other_q), at_q(vector_q), at_q(scalar_q), at_q(out_q), + in_dim.shape[0], out_dim.shape[0], + in_dim.shape[2], in_dim.shape[1], at_matscale.ScaleOper, at_matscale.ReLUOper) \ No newline at end of file diff --git a/tools/nntool/generation/at_generators/cnn_pool_relu.py b/tools/nntool/generation/at_generators/cnn_pool_relu.py new file mode 100644 index 000000000..015f378b1 --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_pool_relu.py @@ -0,0 +1,85 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can 
redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from .utils import at_bits + +GEN_POOL_RELU = "CNN_PoolReLU" + +# extern void CNN_PoolReLU( +# char *Name, + +# CNN_GenControl_T *Ctrl, + +# int In_DataSize, +# int Out_DataSize, + +# int In_InL3, // 1 if In comes from L3, 0 if it comes from L2 +# int Out_InL3, + +# int InFeat, +# int OutFeat, +# int Width, +# int Height, + +# KernelOper_T PoolOper, +# int Fpx, +# int Fpy, +# int Dpx, +# int Dpy, +# int Spx, +# int Spy, +# int PoolPad, + +# KernelOper_T ReLUOper +# ); + +# pylint: disable=too-many-arguments + +def gen_at_pool_relu(code_block, name, in_q, out_q, in_dim, + out_dim, at_pool, at_active, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + gen_ctrl = gen_ctrl.ctrl_name + + if at_pool.PoolOper == 'KOP_NONE': + if in_dim.is_named and in_dim.has_keys(['c', 'w', 'h']): + dims = [in_dim.c, in_dim.h, in_dim.w, in_dim.c] + else: + dims = in_dim.shape.copy() + dims = dims + [1] * (4 - len(dims)) + + if out_dim.is_named and out_dim.has_key('c'): + dims[3] = out_dim.c + else: + dims[3] = dims[0] + else: + dims = [in_dim.c, in_dim.h, in_dim.w, out_dim.c] + + if at_ver < 3: + code_block.write('{}("{}", {}, {}, {}, 1, 1, {}, {}, {}, {},', + GEN_POOL_RELU, name, gen_ctrl, at_bits(in_q), at_bits(out_q), + dims[0], dims[3], dims[2], dims[1]) + else: + code_block.write('{}("{}", {}, {}, {}, {}, {}, 1, 1, {}, {}, {}, {},', + GEN_POOL_RELU, name, gen_ctrl, at_bits( + in_q), at_bits(out_q), in_q.q, out_q.q, + dims[0], dims[3], dims[2], dims[1]) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {}, {});', + at_pool.PoolOper, at_pool.Fpx, at_pool.Fpy, + at_pool.Dpx, at_pool.Dpy, at_pool.Spx, at_pool.Spy, + at_pool.PoolPad, at_active.ReLUOper) + code_block.deindent() diff --git a/tools/nntool/generation/at_generators/cnn_softmax.py b/tools/nntool/generation/at_generators/cnn_softmax.py new file mode 100644 index 000000000..8097834c2 --- /dev/null +++ b/tools/nntool/generation/at_generators/cnn_softmax.py @@ -0,0 +1,53 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from collections import namedtuple +from .utils import at_bits + +GEN_SOFTMAX = "CNN_SoftMax" +# extern void CNN_SoftMax( +# char *Name, +# CNN_GenControl_T *Ctrl, +# int In_DataSize, +# int Out_DataSize, +# int In_InL3, +# int Out_InL3, +# int Dim, +# KernelOper_T SoftMaxOper +# ); + +# pylint: disable=too-many-arguments + +SoftMaxATParam = namedtuple('SoftMaxATParam', [ + "SoftMaxOper" +]) + + +def gen_softmax_at_params(_): + return SoftMaxATParam( + SoftMaxOper="KOP_SOFTMAX" + ) + + +def gen_at_softmax(code_block, name, in_q, out_q, + in_dim, at_softmax, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('{}("{}", {}, {}, {}, {}, {}, 1, 1, {}, {});', + GEN_SOFTMAX, name, gen_ctrl, + at_bits(in_q), at_bits(out_q), in_q.q, out_q.q, in_dim.size(), at_softmax.SoftMaxOper) diff --git a/tools/nntool/generation/at_generators/utils.py b/tools/nntool/generation/at_generators/utils.py new file mode 100644 index 000000000..a41412d98 --- /dev/null +++ b/tools/nntool/generation/at_generators/utils.py @@ -0,0 +1,36 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +def at_bits(qtype): + if qtype is None: + return 0 + # 1: byte, 2: half word, 4: word + if qtype.bits == 8: + return 1 + if qtype.bits == 16: + return 2 + if qtype.bits == 32: + return 4 + raise NotImplementedError("unsupported number of bits") + + +def at_q(qtype): + if qtype is None: + return 0 + return qtype.q + + +def at_bits_and_q(qtype): + return "{}, {}".format(at_bits(qtype), qtype.q) \ No newline at end of file diff --git a/tools/nntool/generation/at_types/__init__.py b/tools/nntool/generation/at_types/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/at_types/at_params.py b/tools/nntool/generation/at_types/at_params.py new file mode 100644 index 000000000..432d67fd1 --- /dev/null +++ b/tools/nntool/generation/at_types/at_params.py @@ -0,0 +1,214 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
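The utils helpers above convert an nntool quantization type into the two values every AutoTiler call needs: a data size in bytes and a fixed-point Q. A sketch using a namedtuple stand-in for the quantization type (only the bits and q fields are read; the stand-in is not a real nntool class):

from collections import namedtuple
from generation.at_generators.utils import at_bits, at_q, at_bits_and_q

QTypeStandIn = namedtuple('QTypeStandIn', ['bits', 'q'])

assert at_bits(QTypeStandIn(8, 7)) == 1           # byte
assert at_bits(QTypeStandIn(16, 12)) == 2         # half word
assert at_bits(QTypeStandIn(32, 24)) == 4         # word
assert at_bits(None) == 0 and at_q(None) == 0     # missing tensors map to 0
assert at_q(QTypeStandIn(16, 12)) == 12
assert at_bits_and_q(QTypeStandIn(32, 24)) == "4, 24"
# any other bit width raises NotImplementedError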
+ +from collections import namedtuple + +# ACTIVATION +ActivationATParam = namedtuple('ActivationATParam', [ + "ReLUOper" +]) + +NO_ACTIVATION = ActivationATParam(ReLUOper='KOP_NONE') + +def gen_activation_op(activation, force_relu=False): + if activation is None or activation == "none": + aop = "KOP_NONE" + elif activation == "relu": + aop = "KOP_RELU" + elif activation == "relu6": + aop = "KOP_RELUN" if not force_relu else "KOP_RELU" + elif activation == "relun": + aop = "KOP_RELUN" if not force_relu else "KOP_RELU" + elif activation == "sigmoid" or activation == "hsigmoid": + aop = "KOP_HSIGMOID" + elif activation == "swish" or activation == "hswish": + aop = "KOP_HSWISH" + else: + raise NotImplementedError("activation type %s not implemented" % activation) + return aop + +def gen_active_at_params(params, force_relu=False): + return ActivationATParam( + ReLUOper=gen_activation_op(params.activation, force_relu=force_relu) + ) + +# CONV + +ConvATParam = namedtuple('ConvATParam', [ + "ConvOper", + "Fcx", + "Fcy", + "Dcx", + "Dcy", + "Scx", + "Scy", + "ConvPad" +]) + +NO_CONV = ConvATParam(ConvOper='KOP_NONE', Fcx=0, Fcy=0, Dcx=0, Dcy=0, Scx=0, Scy=0, ConvPad=0) + +# GROUPED CONV +# ConvOper: Type of convolution, Regular convolution: KOP_CONV, +# Regular convolution with double precision output: KOP_CONV_DP, +# Depth wise convolution: KOP_CONV_DW +# GroupIn: Size of the group for input features +# GroupOut: Size of the group for output features +# Fcx: Convolution filter x dimension +# Fcy: Convolution filter y dimension +# Dcx: Convolution filter dilation factor, x dimension +# Dcy: Convolution filter dilation factor, y dimension +# Scx: Convolution filter stride x dimension +# Scy: Convolution filter stride y dimension +# ConvPad: 0: No padding, 1: Zero padding + +GroupedConvATParam = namedtuple('GroupedConvATParam', [ + "ConvOper", + "GroupIn", + "GroupOut", + "Fcx", + "Fcy", + "Dcx", + "Dcy", + "Scx", + "Scy", + "ConvPad" +]) + +def gen_conv_at_params(params, pad_compatibilities): + if params.is_depthwise_conv(): + assert params.multiplier == 1, "Multiplier not supported" + cop = "KOP_CONV_DW" + elif params.is_grouped_conv(): + cop = "KOP_CONV" + return GroupedConvATParam( + ConvOper=cop, + GroupIn=params.groups, + GroupOut=params.multiplier, + Fcx=params.filter.w, + Fcy=params.filter.h, + Dcx=params.dilation.w, + Dcy=params.dilation.h, + Scx=params.stride.w, + Scy=params.stride.h, + ConvPad=params.has_at_zero_pad() and 1 or 0 + ) + else: + cop = "KOP_CONV" + + pad_compatibilities.append(params.padding.pad_compatibility) + return ConvATParam( + ConvOper=cop, + Fcx=params.filter.w, + Fcy=params.filter.h, + Dcx=params.dilation.w, + Dcy=params.dilation.h, + Scx=params.stride.w, + Scy=params.stride.h, + ConvPad=params.has_at_zero_pad() and 1 or 0 + ) + +# POOL +PoolATParam = namedtuple('PoolATParam', [ + "PoolOper", + "Fpx", + "Fpy", + "Dpx", + "Dpy", + "Spx", + "Spy", + "PoolPad" +]) + +NO_POOL = PoolATParam(PoolOper='KOP_NONE', Fpx=0, Fpy=0, Dpx=0, Dpy=0, Spx=0, Spy=0, PoolPad=0) + +def gen_pool_at_params(params, pad_compatibilities): + if params.pool_type == "average": + pop = "KOP_AVGPOOL" + elif params.pool_type == "max": + pop = "KOP_MAXPOOL" + else: + raise NotImplementedError() + + pad_compatibilities.append(params.padding.pad_compatibility) + return PoolATParam( + PoolOper=pop, + Fpx=params.filter.w, + Fpy=params.filter.h, + Dpx=1, + Dpy=1, + Spx=params.stride.w, + Spy=params.stride.h, + PoolPad=params.has_at_zero_pad() and 1 or 0 + ) + +GlobalPoolATParam = 
namedtuple('GlobalPoolATParam', [ + "GlobalPoolOper" +]) + +def gen_globalpool_at_params(params): + return GlobalPoolATParam( + GlobalPoolOper="KOP_GLOBAL_AVGPOOL" if params.pool_type == "average" else "KOP_GLOBAL_MAXPOOL" + ) + +# LINEAR +LinearATParam = namedtuple('LinearATParam', [ + "LinearOper" +]) + +def gen_linear_at_params(_): + return LinearATParam( + LinearOper="KOP_LINEAR" + ) + +# MATRIX ADD +MatrixAddATParam = namedtuple('MatrixAddATParam', [ + "MatrixAddOper" +]) + +def gen_matrixadd_at_params(_): + return MatrixAddATParam( + MatrixAddOper="KOP_MATADD" + ) + +# MATRIX SCALE +MatScaleATParam = namedtuple('MatScaleATParam', [ + "ScaleOper", + "ReLUOper" +]) + +def gen_matscale_at_params(params): + # KOP_MATSCALE_VECTOR, KOP_MATSCALE_SCALAR or KOP_MATSCALE_VECTOR_SCALAR + if params.fusion_type == "vec_scalar": + ms_op = 'KOP_MATSCALE_VECTOR_SCALAR' + elif params.fusion_type == "vector": + ms_op = 'KOP_MATSCALE_VECTOR' + elif params.fusion_type == "scalar": + ms_op = 'KOP_MATSCALE_SCALAR' + else: + raise NotImplementedError("unknown fusion type %s" % params.fusion_type) + return MatScaleATParam( + ScaleOper=ms_op, + ReLUOper=gen_activation_op(params.activation) + ) + +# SOFTMAX +SoftMaxATParam = namedtuple('SoftMaxATParam', [ + "SoftMaxOper" +]) + +def gen_softmax_at_params(_): + return SoftMaxATParam( + SoftMaxOper="KOP_SOFTMAX" + ) diff --git a/tools/nntool/generation/at_types/constant_info.py b/tools/nntool/generation/at_types/constant_info.py new file mode 100644 index 000000000..952d1df2d --- /dev/null +++ b/tools/nntool/generation/at_types/constant_info.py @@ -0,0 +1,63 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
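+
+# Illustrative sketch: ConstantInfo (defined below) renders the ConstInit_T
+# descriptor passed to the AutoTiler. With a hypothetical 8-bit Q7 weights
+# tensor written to "tensors/S4_Weights.tensor":
+#
+#   str(ConstantInfo("tensors/S4_Weights.tensor", qtype))
+#     -> 'ConstInfo("tensors/S4_Weights.tensor", 1, 1, 1, 7)'
+#
+# i.e. fixed point format (1), binary content (1), 1-byte container, Q7.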
+ +# typedef struct { +# char *FileName; /* Name of the file containing the initial values */ +# int Format; /* Float or fixed point */ +# int Binary; /* 1: file content is binary, 0: file content is text */ +# int Size; /* When Format is Fract Size in bytes of the container */ +# int Fract; /* When format is fract position of the point */ +# } ConstInit_T; + +import numpy as np + +from quantization.qtype_base import QTypeBase + + +class ConstantInfo(): + FMT_TYPES = { + 'float': 0, + 'fixed': 1 + } + + def __init__(self, file_name: str, qtype: QTypeBase, numeric_format: str = "fixed", is_binary: bool = True, contents: np.ndarray = None): + self._file_name = file_name + self._qtype = qtype + assert numeric_format in self.FMT_TYPES, "invalid numeric format" + self._numeric_format = numeric_format + self._is_binary = is_binary + self._contents = contents + + @property + def contents(self): + return self._contents + + @contents.setter + def contents(self, val): + self._contents = val + + @property + def file_name(self): + return self._file_name + + @property + def qtype(self): + return self._qtype + + def __str__(self): + return 'ConstInfo("{0}", {1}, 1, {2}, {3})'.format(self._file_name, + self.FMT_TYPES[self._numeric_format], + self._qtype.bits//8, + self._qtype.q) diff --git a/tools/nntool/generation/at_types/gen_ctrl.py b/tools/nntool/generation/at_types/gen_ctrl.py new file mode 100644 index 000000000..d4b6d0a3c --- /dev/null +++ b/tools/nntool/generation/at_types/gen_ctrl.py @@ -0,0 +1,105 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
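+
+# Illustrative sketch: a GenCtrl (defined below) built from an option source
+# that sets PADTYPE=1 for a hypothetical node "S5_Conv" would declare and
+# initialise its CNN_GenControl_T roughly as:
+#
+#   CNN_GenControl_T gen_ctrl_S5_Conv;
+#   CNN_InitGenCtrl(&gen_ctrl_S5_Conv);
+#   CNN_SetGenCtrl(&gen_ctrl_S5_Conv, "PADTYPE", AT_OPT_VAL(1));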
+ +from utils.option_list import OptionList + +# int TileOrientation; /* Set Tiling orientation TILE_HOR TILE_VER */ +# int ParallelFeatures; /* Parallelize along channels */ +# int ForceDPconv; /* Forces double precision convolution*/ +# int UseHwCE; /* Enable HW CE */ +# AT_PadType PadType; /* Control padding strategy */ +# int EnableIm2Col; /* Enable mat mul based convolution when feasible */ +# int ReluN; /* if != -1 Overides 6 as a default value for ReLUN */ +# int MulBiasScalar; /* if != -1 Overides default non scalar for MulBias convolutions */ + + +def gen_ctrl_call(api, op, val, code_block): + if isinstance(val, str): + val = 'AT_OPT_VAL("%s")' % val + elif isinstance(val, bool): + val = val and 'AT_OPT_ON' or 'AT_OPT_OFF' + elif isinstance(val, int): + val = 'AT_OPT_VAL(%s)' % val + else: + raise ValueError() + + code_block.write('{}({}, {});', api, op, val) + + +def gen_kernel_ctrl(op, val, code_block): + gen_ctrl_call('AT_SetKernelCtrl', op, val, code_block) + + +def gen_graph_ctrl(op, val, code_block): + gen_ctrl_call('AT_SetGraphCtrl', op, val, code_block) + +CTRL_FEATURES = { + "TILEORIENTATION": int, + "PARALLELFEATURES": int, + "FORCEDPCONV": int, + "USEHWCE": int, + "PADTYPE": int, + "ENABLEIM2COL": int, + "RELUN": int, + "MULBIASSCALAR": int, + "RELUNNONORM": int +} + + +class GenCtrl(OptionList): + PREFIX = "gen_ctrl_" + + def __init__(self, options, *args, cname=None, **kwargs): + super(GenCtrl, self).__init__(*args, valid_options=CTRL_FEATURES, **kwargs) + if options is not None: + self.extend(options, name_filter=lambda name: name in CTRL_FEATURES) + self._cname = cname + + @property + def is_unmodified(self): + return len(self) == 0 + + @property + def set_features(self): + return self.set_options + + @property + def prefixed_cname(self): + return self.PREFIX + self._cname + + @property + def ctrl_name(self): + if self.is_unmodified: + return "0" + + return "&{}".format(self.prefixed_cname) + + @property + def cname(self): + return self._cname + + @cname.setter + def cname(self, val): + self._cname = val + + def gen_ctrl_decl(self, code_block): + code_block.write('CNN_GenControl_T {};', self.prefixed_cname) + code_block.write('CNN_InitGenCtrl({});', self.ctrl_name) + for name, val in self._options.items(): + if self.valid_options[name] == int: + code_block.write('CNN_SetGenCtrl({}, "{}", AT_OPT_VAL({}));', + self.ctrl_name, name.upper(), val) + else: + raise NotImplementedError() diff --git a/tools/nntool/generation/at_types/tc_arg_info.py b/tools/nntool/generation/at_types/tc_arg_info.py new file mode 100644 index 000000000..3fb80b5cf --- /dev/null +++ b/tools/nntool/generation/at_types/tc_arg_info.py @@ -0,0 +1,148 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
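+
+# Illustrative sketch: the argument descriptors below stringify directly to
+# AutoTiler TCArgInfo() calls. For a hypothetical 8-bit weights global named
+# "S4_Weights" homed in HyperFlash (wrapped here for readability; the real
+# string is a single line):
+#
+#   str(GlobalArgInfo("int8", "S4_Weights",
+#                     home_location="AT_MEM_L3_HFLASH",
+#                     exec_location="AT_MEM_UNDEF"))
+#     -> 'TCArgInfo("signed char * __restrict__", "S4_Weights",
+#                   ARG_SCOPE_GLOBAL, ARG_DIR_CONSTIN,
+#                   AT_MEM_L3_HFLASH, AT_MEM_UNDEF, 0)'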
+ +# CKernel_Arg_T *TCArgInfo( +# char *ArgType, /**< C argument type as a string */ +# char *ArgName, /**< C argument name as a string */ +# ArgScope_T Scope, /**< Argument scope, ARG_SCOPE_ARG: passed as an argument, ARG_SCOPE_GLOBAL: global symbol */ +# ArgDirection_T Dir, /**< Argument's direction (in, out or in out) */ +# AT_MemLocation_T HomeLoc, /**< Permanent location in the mem hierarch variants of L3 or L2 */ +# AT_MemLocation_T ExecLoc, /**< Location of the argument when it is used, can be != HomeLoc */ +# ConstInit_T *ConstInit /**< Descriptor for initialized constant */ +# ); + +from typing import Optional + +from generation.at_types.constant_info import ConstantInfo + + +class TCArgInfo(): + ARG_TYPES = { + "uint8": "unsigned char * __restrict__", + "int8": "signed char * __restrict__", + "uint16": "unsigned short * __restrict__", + "int16": "signed short * __restrict__", + "uint32": "unsigned int * __restrict__", + "int32": "signed int * __restrict__", + "unsigned char": "unsigned char * __restrict__", + "signed char": "signed char * __restrict__", + "unsigned short": "unsigned short int * __restrict__", + "signed short": "short int * __restrict__", + "short int": "short int * __restrict__", + "unsigned int": "unsigned int * __restrict__", + "signed int": "signed int * __restrict__", + } + ARG_SCOPES = [ + "ARG_SCOPE_UNDEF", "ARG_SCOPE_ARG", "ARG_SCOPE_ARG_ALLOC", "ARG_SCOPE_GLOBAL", "ARG_SCOPE_LOCAL" + ] + ARG_DIRECTIONS = [ + "ARG_DIR_UNDEF", "ARG_DIR_IN", "ARG_DIR_CONSTIN", "ARG_DIR_OUT", "ARG_DIR_INOUT" + ] + MEM_LOCATIONS = [ + "AT_MEM_UNDEF", + "AT_MEM_L3_HRAM", + "AT_MEM_L3_QSPIRAM", + "AT_MEM_L3_OSPIRAM", + "AT_MEM_L3_HFLASH", + "AT_MEM_L3_QSPIFLASH", + "AT_MEM_L3_OSPIFLASH", + "AT_MEM_L3_MRAMFLASH", + "AT_MEM_L2", + "AT_MEM_L1", + ] + + def __init__(self, + arg_type: str, arg_name: str, arg_scope: str, + arg_dir: str, home_location: Optional[str] = None, + exec_location: Optional[str] = None, + const_info: Optional[ConstantInfo] = None, + comment: Optional[str] = None): + assert arg_type in self.ARG_TYPES + self._arg_type = arg_type + self._arg_name = arg_name + assert arg_scope in self.ARG_SCOPES + self._arg_scope = arg_scope + assert arg_dir in self.ARG_DIRECTIONS + self._arg_dir = arg_dir + assert home_location is None or home_location in self.MEM_LOCATIONS + self._home_location = home_location + assert exec_location is None or exec_location in self.MEM_LOCATIONS + self._exec_location = exec_location + self._const_info = const_info + self._comment = comment + + @property + def comment(self): + return self._comment + + @comment.setter + def comment(self, val): + self._comment = val + + @property + def const_info(self): + return self._const_info + + @property + def arg_name(self): + return self._arg_name + + def __str__(self): + return str.format('TCArgInfo("{}", "{}", {}, {}, {}, {}, {})', + self.ARG_TYPES[self._arg_type], + self._arg_name, + self._arg_scope, + self._arg_dir, + self._home_location or self.MEM_LOCATIONS[0], + self._exec_location or self.MEM_LOCATIONS[0], + self._const_info or "0") + + +class GlobalArgInfo(TCArgInfo): + + def __init__(self, arg_type: str, arg_name: str, home_location: Optional[str] = None, + exec_location: Optional[str] = None, + const_info: Optional[ConstantInfo] = None, + comment=None): + super(GlobalArgInfo, self).__init__(arg_type, arg_name, "ARG_SCOPE_GLOBAL", + "ARG_DIR_CONSTIN", home_location=home_location, + exec_location=exec_location, + const_info=const_info, + comment=comment) + + +class LocalArgInfo(TCArgInfo): + def 
__init__(self, arg_type: str, arg_name: str, home_location: Optional[str] = None): + super(LocalArgInfo, self).__init__(arg_type, arg_name, "ARG_SCOPE_LOCAL", + "ARG_DIR_INOUT", home_location=home_location, + exec_location=home_location) + + +class OutputArgInfo(TCArgInfo): + def __init__(self, arg_type: str, arg_name: str, home_location: Optional[str] = None, + exec_location: Optional[str] = None, allocate=False): + scope = "ARG_SCOPE_ARG_ALLOC" if allocate else "ARG_SCOPE_ARG" + super(OutputArgInfo, self).__init__(arg_type, arg_name, scope, + "ARG_DIR_OUT", home_location=home_location, + exec_location=exec_location) + + +class InputArgInfo(TCArgInfo): + def __init__(self, arg_type: str, arg_name: str, home_location: Optional[str] = None, + exec_location: Optional[str] = None, allocate=False): + scope = "ARG_SCOPE_ARG_ALLOC" if allocate else "ARG_SCOPE_ARG" + super(InputArgInfo, self).__init__(arg_type, arg_name, scope, + "ARG_DIR_IN", home_location=home_location, + exec_location=exec_location) diff --git a/tools/nntool/generation/at_types/tensor_stack.py b/tools/nntool/generation/at_types/tensor_stack.py new file mode 100644 index 000000000..0bb7ec37f --- /dev/null +++ b/tools/nntool/generation/at_types/tensor_stack.py @@ -0,0 +1,38 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +# pylint: disable=line-too-long + +from typing import Sequence +import logging + +LOG = logging.getLogger("nntool." + __name__) + + +class TensorStack(): + def __init__(self, out_name: str, in_names: Sequence[str]): + self._out_name = out_name + self._in_names = in_names + + @staticmethod + def gen_str(name): + return '"%s"' % name + + def __str__(self): + return str.format('AddStackedTensors("{}", {}, {});', + self._out_name, + len(self._in_names), + ', '.join([self.gen_str(in_name) for in_name + in self._in_names])) diff --git a/tools/nntool/generation/autotiler_options.py b/tools/nntool/generation/autotiler_options.py new file mode 100644 index 000000000..a33b958a2 --- /dev/null +++ b/tools/nntool/generation/autotiler_options.py @@ -0,0 +1,118 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
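+
+# Illustrative sketch: every entry in AUTO_TILER_OPTIONS below is folded,
+# with its name lower-cased, into the generation option dictionaries, e.g.:
+#
+#   DEFAULT_GEN_OPTS['graph_monitor_cycles']
+#     -> False
+#   DEFAULT_GEN_OPTS_DESCRIPTIONS['graph_monitor_cycles']
+#     -> {'type': bool,
+#         'descr': 'Enable automatic cycle capture for each node of the graph'}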
+ +from .memory_device_info import MemoryDeviceInfos + +#pylint: disable=line-too-long + +AUTO_TILER_OPTIONS = [ + { + 'name': 'KERNEL_BUFFER_PROMOTE', 'type': 'kernel', 'var_type': bool, + 'descr': 'When all user kernel arguments can fit into given L1 memory promote them to buffer', 'default': True}, + { + 'name': 'KERNEL_PARTIAL_BUFFER_PROMOTE', 'type': 'kernel', 'var_type': bool, + 'descr': 'When all tile of a user kernel argument across Input Features can fit into given L1 memory promote them to partial buffer', 'default': True}, + { + 'name': 'KERNEL_NOSOLUTION_ERROR', 'type': 'kernel', 'var_type': bool, + 'descr': 'Report an error when no tiling solution is found', 'default': True}, + { + 'name': 'GRAPH_MONITOR_CYCLES', 'type': 'graph', 'var_type': bool, + 'descr': 'Enable automatic cycle capture for each node of the graph', 'default': False}, + { + 'name': 'GRAPH_MONITOR_CVAR_NAME', 'type': 'graph', 'var_type': str, + 'descr': 'When monitor cycles is on name of the C var array to receive results', 'default': 'AT_GraphPerf'}, + { + 'name': 'GRAPH_PRODUCE_NODE_NAMES', 'type': 'graph', 'var_type': bool, + 'descr': 'Enable production of an array containing the name of each graph node', 'default': False}, + { + 'name': 'GRAPH_PRODUCE_NODE_CVAR_NAME', 'type': 'graph', 'var_type': str, + 'descr': 'When producing node names is on name of the C array receiving the names as strings', 'default': 'AT_GraphNodeNames'}, + { + 'name': 'GRAPH_PRODUCE_OPERINFOS', 'type': 'graph', 'var_type': bool, + 'descr': 'Enable production of number of macs for each layer', 'default': False}, + { + 'name': 'GRAPH_PRODUCE_OPERINFOS_CVAR_NAME', 'type': 'graph', 'var_type': str, + 'descr': 'When Number of oper Infos is on name of the C array receiving mac infos for each node', 'default': 'AT_GraphOperInfosNames'}, + { + 'name': 'GRAPH_REORDER_CONSTANT_IN', 'type': 'graph', 'var_type': bool, + 'descr': 'Enable reodering of constant inputs in order to transform 2D accesses into 1D accesses', 'default': True}, + { + 'name': 'GRAPH_TRACE_EXEC', 'type': 'graph', 'var_type': bool, 'descr': 'Enable trace of activity', 'default': False}, + { + 'name': 'GRAPH_NOINLINE_NODE', 'type': 'graph', 'var_type': bool, + 'descr': 'If on, all user kernel function are marked as noinline', 'default': False}, + { + 'name': 'GRAPH_PREF_L3_EXEC', 'type': 'graph', 'var_type': str, + 'descr': 'In case a symbol must be allocated in L3 for execution this is the prefered memory', 'default': 'AT_MEM_L3_HRAM'}, + { + 'name': 'GRAPH_CONST_EXEC_FROM_FLASH', 'type': 'graph', 'var_type': bool, + 'descr': 'If on, constant symbols executes from home location', 'default': False}, + { + 'name': 'GRAPH_PREF_L3_HOME', 'type': 'graph', 'var_type': str, + 'descr': 'For constant symbols which L3 flash prefered memory', 'default': 'AT_MEM_L3_HFLASH' + }, + { + 'name': 'GRAPH_DUMP_TENSOR', 'type': 'graph', 'var_type': int, + 'descr': 'Trace selected tensors arguments at inference time, either all nodes or selected node', 'default': 0 + }, + { + 'name': 'GRAPH_DUMP_ONE_NODE', 'type': 'graph', 'var_type': str, + 'descr': 'Trace one specific graph node', 'default': None + } +] + +DEFAULT_GEN_OPTS = { + 'default_input_home_location': 'AT_MEM_L2', + 'default_input_exec_location': 'AT_MEM_L2', + 'default_output_home_location': 'AT_MEM_L2', + 'default_output_exec_location': 'AT_MEM_L2', + 'default_global_home_location': 'AT_MEM_L3_HFLASH', + 'default_global_exec_location': 'AT_MEM_UNDEF', + 'default_local_location': 'AT_MEM_UNDEF', + 'l2_ram_ext_managed': True, + 
'l3_ram_ext_managed': False, + 'l3_flash_ext_managed': False, + 'generate_checksums': False, + 'include_project_header': False, + 'checksum_file': '', + 'tensor_directory': '.', + 'model_directory': '.', + 'model_file': 'model.c', + 'at_ver': 3, + 'memory_devices': MemoryDeviceInfos.default() +} + +DEFAULT_GEN_OPTS.update({(elem['name'].lower()): elem['default'] for elem in AUTO_TILER_OPTIONS}) + +DEFAULT_GEN_OPTS_DESCRIPTIONS = { + 'default_input_home_location': {'type': str, 'descr': 'default home location for inputs for code generation'}, + 'default_input_exec_location': {'type': str, 'descr': 'default exec location for inputs for code generation'}, + 'default_output_home_location': {'type': str, 'descr': 'default home location for outputs for code generation'}, + 'default_output_exec_location': {'type': str, 'descr': 'default exec location for outputs for code generation'}, + 'default_global_home_location': {'type': str, 'descr': 'default home location for globals for code generation'}, + 'default_global_exec_location': {'type': str, 'descr': 'default exec location for globals for code generation'}, + 'default_local_location': {'type': str, 'descr': 'default location for locals for code generation'}, + 'l2_ram_ext_managed': {'type': bool, 'descr': 'Externally manage L2 RAM'}, + 'l3_ram_ext_managed': {'type': bool, 'descr': 'Externally manage L3 RAM'}, + 'l3_flash_ext_managed': {'type': bool, 'descr': 'Externally manage L3 flash'}, + 'include_project_header': {'type': bool, 'descr': 'Include a header file called "GraphName.h" in generated code'}, + 'tensor_directory': {'type': str, 'descr': 'directory to dump tensors to'}, + 'model_directory': {'type': str, 'descr': 'directory to dump model to'}, + 'model_file': {'type': str, 'descr': 'filename for model'}, + 'at_ver': {'type': int, 'descr': 'AutoTiler version'}, +} + +DEFAULT_GEN_OPTS_DESCRIPTIONS.update( + {elem['name'].lower(): {'type': elem['var_type'], 'descr': elem['descr']} for elem in AUTO_TILER_OPTIONS}) diff --git a/tools/nntool/generation/bindings.py b/tools/nntool/generation/bindings.py index c4f2b5d3f..344878f7b 100644 --- a/tools/nntool/generation/bindings.py +++ b/tools/nntool/generation/bindings.py @@ -15,9 +15,6 @@ from abc import ABC, abstractmethod -from .code_generators import (gen_gnode_arg, gen_imm_arg, gen_at_bindings, - gen_at_func_bindings, gen_g_node_c_arg) - TT_TENSOR_TYPES = { 'TT_INPUT': 0, 'TT_OUTPUT': 1, @@ -25,12 +22,44 @@ 'TT_BIASES': 3 } +def gen_gnode_arg(direction, name): + return 'GNodeArg({}, "{}", 0)'.format(direction, name) + + +def gen_g_arg(name): + return 'GArg("{}")'.format(name) + + +def gen_g_node_c_arg(name): + return 'GNodeCArg("{}")'.format(name) + + +def gen_imm_arg(symbol): + return "Imm({})".format(symbol) + + +def gen_at_bindings(name, binding_list, code_block): + code_block.write('AddNode("{0}", Bindings({1}, {2}));' + .format(name, len(binding_list), ", ".join(binding_list))) + + +def gen_at_func_bindings(name, func_name, where, binding_list, code_block): + code_block.write('AddCallToNode("{0}", {1}, "{2}", Bindings({3}, {4}));' + .format(name, where, func_name, len(binding_list), ", ".join(binding_list))) + class Binding(ABC): @abstractmethod def gen_binding(self, generator): pass # pylint: disable=abstract-method +class InfoListName(Binding): + def __init__(self, cname): + self.cname = cname + + def gen_binding(self, generator): + return "{}_infos".format(self.cname) + class GNodeArg(Binding): def __init__(self, direction): self.direction = direction @@ -79,6 +108,14 @@ class 
BindingList(ABC): def gen_bindings(self, generator, code_block): pass +class InfosList(BindingList): + def __init__(self, cname, infos): + self.cname = cname + self.infos = infos + + def gen_bindings(self, _, code_block): + code_block.write("char {}_infos[] = {{{}}};".format(self.cname, ", ".join(self.infos))) + class CommentBindingList(BindingList): def __init__(self, fmt, *args, **kwargs): self.comment = fmt.format(*args, **kwargs) @@ -108,5 +145,3 @@ def gen_bindings(self, generator, code_block): self.where, [binding.gen_binding(generator) for binding in self.binding_list], code_block) - - \ No newline at end of file diff --git a/tools/nntool/generation/code_generator.py b/tools/nntool/generation/code_generator.py index 03846e6a3..eac32e93c 100644 --- a/tools/nntool/generation/code_generator.py +++ b/tools/nntool/generation/code_generator.py @@ -14,135 +14,38 @@ # along with this program. If not, see . import logging -import os - -from graph.types import (ActivationParameters, ConcatParameters, - ConstantInputParameters, Conv2DParameters, - ConvFusionParameters, FcParameters, FilterParameters, - GlobalPoolParameters, InputParameters, - MatrixAddParameters, MatScaleFusionParameters, - OutputParameters, PoolingParameters, - ReshapeParameters, SoftMaxParameters, - TransposeParameters, MultiplicativeBiasParameters) + +from generation.generators import RegisteredGeneratorsMixin +from generation.name_cache import NameCache +from graph.types import (ConcatParameters, ConstantInputParameters, + FilterParameters, InputParameters, OutputParameters, + ReshapeParameters, TransposeParameters) from utils.node_id import NodeId -from .bindings import (TT_TENSOR_TYPES, CommentBindingList, - FunctionBindingList, GArgEdge, GArgNode, GNodeArgEdge, - GNodeArgNode, Imm, NodeBindingList) +from .at_types.gen_ctrl import gen_graph_ctrl, gen_kernel_ctrl +from .at_types.tc_arg_info import LocalArgInfo +from .at_types.tensor_stack import TensorStack +from .autotiler_options import AUTO_TILER_OPTIONS, DEFAULT_GEN_OPTS +from .bindings import (TT_TENSOR_TYPES, FunctionBindingList, GArgEdge, + GArgNode, Imm) from .checksums import calc_value_checksum, checksum_func from .code_block import CodeBlock -from .code_generators import (gen_3d_transpose, gen_const_info, - gen_conv_pool_relu, gen_global_decl, - gen_globalpool, gen_graph_ctrl, gen_input_decl, - gen_kernel_ctrl, gen_linear_relu, gen_local_decl, - gen_matrixadd, gen_matrixadddyn, gen_matscale, - gen_output_decl, gen_pool_relu, gen_softmax, - gen_stack_decl) -from .memory_device_info import MemoryDeviceInfos from .write_constants import write_constants +from generation.generators.globals.global_names import * LOG = logging.getLogger("nntool." 
+ __name__) -AUTO_TILER_OPTIONS = [ - { - 'name': 'KERNEL_BUFFER_PROMOTE', 'type': 'kernel', - 'descr': 'When all user kernel arguments can fit into given L1 memory promote them to buffer', 'default': True}, - { - 'name': 'KERNEL_PARTIAL_BUFFER_PROMOTE', 'type': 'kernel', - 'descr': 'When all tile of a user kernel argument across Input Features can fit into given L1 memory promote them to partial buffer', 'default': True}, - { - 'name': 'KERNEL_NOSOLUTION_ERROR', 'type': 'kernel', - 'descr': 'Report an error when no tiling solution is found', 'default': True}, - { - 'name': 'GRAPH_MONITOR_CYCLES', 'type': 'graph', - 'descr': 'Enable automatic cycle capture for each node of the graph', 'default': False}, - { - 'name': 'GRAPH_MONITOR_CVAR_NAME', 'type': 'graph', - 'descr': 'When monitor cycles is on name of the C var array to receive results', 'default': 'AT_GraphPerf'}, - { - 'name': 'GRAPH_PRODUCE_NODE_NAMES', 'type': 'graph', - 'descr': 'Enable production of an array containing the name of each graph node', 'default': False}, - { - 'name': 'GRAPH_PRODUCE_NODE_CVAR_NAME', 'type': 'graph', - 'descr': 'When producing node names is on name of the C array receiving the names as strings', 'default': 'AT_GraphNodeNames'}, - { - 'name': 'GRAPH_PRODUCE_OPERINFOS', 'type': 'graph', - 'descr': 'Enable production of number of macs for each layer', 'default': False}, - { - 'name': 'GRAPH_PRODUCE_OPERINFOS_CVAR_NAME', 'type': 'graph', - 'descr': 'When Number of oper Infos is on name of the C array receiving mac infos for each node', 'default': 'AT_GraphOperInfosNames'}, - { - 'name': 'GRAPH_REORDER_CONSTANT_IN', 'type': 'graph', - 'descr': 'Enable reodering of constant inputs in order to transform 2D accesses into 1D accesses', 'default': True}, - { - 'name': 'GRAPH_TRACE_EXEC', 'type': 'graph', 'descr': 'Enable trace of activity', 'default': True}, - { - 'name': 'GRAPH_NOINLINE_NODE', 'type': 'graph', - 'descr': 'If on, all user kernel function are marked as noinline', 'default': False}, - { - 'name': 'GRAPH_PREF_L3_EXEC', 'type': 'graph', - 'descr': 'In case a symbol must be allocated in L3 for execution this is the prefered memory', 'default': 'AT_MEM_L3_HRAM'}, - { - 'name': 'GRAPH_CONST_EXEC_FROM_FLASH', 'type': 'graph', - 'descr': 'If on, constant symbols executes from home location', 'default': False}, - { - 'name': 'GRAPH_PREF_L3_HOME', 'type': 'graph', - 'descr': 'For constant symbols which L3 flash prefered memory', 'default': 'AT_MEM_L3_HFLASH'}, -] - -DEFAULT_GEN_OPTS = { - 'default_input_home_location': 'AT_MEM_L2', - 'default_input_exec_location': 'AT_MEM_L2', - 'default_output_home_location': 'AT_MEM_L2', - 'default_output_exec_location': 'AT_MEM_L2', - 'default_global_home_location': 'AT_MEM_L3_HFLASH', - 'default_global_exec_location': 'AT_MEM_UNDEF', - 'default_local_location': 'AT_MEM_UNDEF', - 'l2_ram_ext_managed': True, - 'l3_ram_ext_managed': False, - 'l3_flash_ext_managed': False, - 'generate_checksums': False, - 'include_project_header': False, - 'checksum_file': '', - 'dump_tensors': False, - 'tensor_directory': '.', - 'model_directory': '.', - 'model_file': 'model.c', - 'at_ver': 3, - 'memory_devices': MemoryDeviceInfos.default() -} - -DEFAULT_GEN_OPTS.update({(elem['name'].lower()): elem['default'] for elem in AUTO_TILER_OPTIONS}) - -DEFAULT_GEN_OPTS_DESCRIPTIONS = { - 'default_input_home_location': 'default home location for inputs for code generation', - 'default_input_exec_location': 'default exec location for inputs for code generation', - 'default_output_home_location': 
'default home location for outputs for code generation', - 'default_output_exec_location': 'default exec location for outputs for code generation', - 'default_global_home_location': 'default home location for globals for code generation', - 'default_global_exec_location': 'default exec location for globals for code generation', - 'default_local_location': 'default location for locals for code generation', - 'l2_ram_ext_managed': 'Externally manage L2 RAM', - 'l3_ram_ext_managed': 'Externally manage L3 RAM', - 'l3_flash_ext_managed': 'Externally manage L3 flash', - 'include_project_header': 'Include a header file called "GraphName.h" in generated code', - 'tensor_directory': 'directory to dump tensors to', - 'model_directory': 'directory to dump model to', - 'model_file': 'filename for model', - 'at_ver': 'AutoTiler version', - 'dump_tensors': 'write the tensors to files. currently only works in emulation mode.', -} - -DEFAULT_GEN_OPTS_DESCRIPTIONS.update( - {elem['name'].lower(): elem['descr'] for elem in AUTO_TILER_OPTIONS}) - - -class CodeGenerator(): + +class CodeGenerator(RegisteredGeneratorsMixin): def __init__(self, G, naming_convension, opts=None): self.G = G self.naming_convension = naming_convension - self.name_cache = {} + self.name_cache = NameCache() self.bindings = [] + self.kernels = [] + self.globals = [] + self.stacked_tensors = [] + self.locals = [] self.inputs_by_name = {} self.func_bindings = [] self.include_files = ["CNN_Basic_Kernels.h"] @@ -151,16 +54,11 @@ def __init__(self, G, naming_convension, opts=None): self.opts.update(opts) if self.opts['include_project_header']: self.include_files.append(self.project_name + '.h') - has_dump = False has_vcd = False for step in G.graph_state.steps: node = step['node'] if node.at_options.vcd_trace_on is not None: has_vcd = True - if node.at_options.dump_tensors is not None: - has_dump = True - if self.opts['dump_tensors'] or has_dump: - self.include_files.append('helpers.h') if has_vcd: self.include_files.append('hal/gvsoc/gvsoc.h') @@ -168,6 +66,14 @@ def __init__(self, G, naming_convension, opts=None): def project_name(self): return self.naming_convension.get_project_name() + @property + def basic_ker_header(self): + if self.G.graph_identity.quantization_type == 'SQ8': + return "CNN_BasicKernels_SQ8.h" + if self.G.graph_identity.quantization_type == 'POW2': + return "CNN_BasicKernels.h" + return ValueError("Quantization type not known %s", self.G.graph_identity.quantization_type) + def get_edge_name(self, eparams): return self.name_cache[eparams]['edge'] @@ -226,8 +132,6 @@ def real_down_connection(G, eparams): return oedge def local_generator(self, indent=0): - code_block = CodeBlock(starting_indent=indent + 1) - num_locals = 0 edges = set(edge.params for edge in self.G.edges()) sorted_edges = list(edges) sorted_edges.sort(key=lambda eparams: eparams.creating_step) @@ -245,23 +149,30 @@ def local_generator(self, indent=0): eparams.creating_node.step_idx, eparams.creating_node.name, rout_eparams.creating_node.name, rout_eparams.creating_node.step_idx, rout_eparams.creating_step, cname) - self.name_cache[eparams] = {'edge': cname} + self.name_cache.set(eparams, 'edge', cname) continue rin_eparams, set_real = self.real_up_connection(self.G, eparams) - if set_real: - # Code will not be generated for reshape or empty transpose so the input to the - # following node is the input to this node - cname = self.naming_convension.get_edge_name(rin_eparams.creating_node.name, - rin_eparams.creating_step, - rin_eparams.edge_type, - 
rin_eparams.edge_order) - LOG.info("edge from step %s %s is not used and is replaced with edge from step %s:%s %s cname: %s", - eparams.creating_node.step_idx, eparams.creating_node.name, - rin_eparams.creating_node.name, rin_eparams.creating_node.step_idx, - rin_eparams.creating_step, cname) - self.name_cache[eparams] = {'edge': cname} + if rin_eparams.edge_type == "out": + # The edge was marked as an output so find the real edge down + rin_eparams = self.real_down_connection(self.G, rin_eparams).params + self.name_cache.set(eparams, 'edge', rin_eparams.name) continue + else: + if set_real: + # Code will not be generated for reshape or empty transpose so the input to the + # following node is the input to this node + cname = self.naming_convension.get_edge_name(rin_eparams.creating_node.name, + rin_eparams.creating_step, + rin_eparams.edge_type, + rin_eparams.edge_order) + LOG.info("edge from step %s %s is not used and is replaced with edge from step %s:%s %s cname: %s", + eparams.creating_node.step_idx, eparams.creating_node.name, + rin_eparams.creating_node.name, rin_eparams.creating_node.step_idx, + rin_eparams.creating_step, cname) + self.name_cache.set(eparams, 'edge', cname) + continue + cname = self.naming_convension.get_edge_name(eparams.creating_node.name, eparams.creating_step, eparams.edge_type, @@ -269,190 +180,129 @@ def local_generator(self, indent=0): out_q = self.G.quantization[NodeId(eparams.creating_node, None)]\ .out_qs[eparams.creating_node_idx] - self.name_cache[eparams] = {'edge': cname} + self.name_cache.set(eparams, 'edge', cname) if eparams.edge_type != "in_out" or eparams.is_alias: continue - if num_locals != 0: + self.locals.append(LocalArgInfo(out_q.ctype, eparams.name, + self.opts['default_local_location'])) + + code_block = CodeBlock(starting_indent=indent) + code_block.write_start("CArgs({},", len(self.locals)) + code_block.indent() + first = True + for local_def in self.locals: + if first: + first = False + else: code_block.append_last(',') - gen_local_decl(eparams, out_q, self.opts['default_local_location'], code_block) - num_locals += 1 + code_block.write(str(local_def)) code_block.deindent() - code_block.write_start("CArgs({},", num_locals) code_block.write(")") return str(code_block) def stack_generator(self, indent=0): - if self.opts['at_ver'] < 2: - LOG.warning("AutoTiler version is less than 3. 
Stacked tensors are not supported.") edges = set(edge.params for edge in self.G.edges()) sorted_edges = list(edges) sorted_edges.sort(key=lambda eparams: eparams.creating_step) concat_edges = list([eparams for eparams in sorted_edges if isinstance( eparams.creating_node, ConcatParameters)]) - code_block = CodeBlock(starting_indent=indent) - if len(concat_edges) == 0: - code_block.comment("no concats in graph so not stacked tensors created") for eparams in concat_edges: node = eparams.creating_node cname_out = self.name_cache[eparams]['edge'] in_edge_names = [self.name_cache[edge.params]['edge'] for edge in self.G.in_edges(node.name)] - gen_stack_decl(cname_out, in_edge_names, code_block) + self.stacked_tensors.append(TensorStack(cname_out, in_edge_names)) + + code_block = CodeBlock(starting_indent=indent) + if len(self.stacked_tensors) == 0: + code_block.comment("no concats in graph so not stacked tensors created") + else: + for stacked_tensor in self.stacked_tensors: + code_block.write(str(stacked_tensor)) return str(code_block) def global_generator(self, indent=0): + self.generate_inputs() + self.generate_constants() + self.generate_outputs() code_block = CodeBlock(starting_indent=indent + 1) - - num_globals = self.generate_inputs(code_block) - num_globals = self.generate_constants(num_globals, code_block) - num_globals = self.generate_outputs(num_globals, code_block) + code_block.write("CArgs({}", len(self.globals)) + code_block.indent() + first = False + for global_def in self.globals: + if first: + first = False + else: + code_block.append_last(',') + if global_def.comment is not None: + code_block.comment(global_def.comment) + code_block.write(str(global_def)) code_block.deindent() - code_block.write_start("CArgs({},", num_globals) code_block.write(")") return str(code_block) - def generate_outputs(self, num_globals, code_block): + def generate_outputs(self): outputs = set() for node in self.G.output_nodes(): - in_qs = self.G.quantization[NodeId(node)].in_qs + qrec = self.G.quantization[NodeId(node)] for edge in self.G.in_edges(node.name): - eparams = edge.params + eparams, _ = self.real_up_connection(self.G, edge.params) if eparams in outputs: continue + eparams.edge_type = "out" outputs.add(eparams) - if num_globals != 0: - code_block.append_last(',') - gen_output_decl(eparams, - in_qs[edge.to_idx], - self.opts['default_output_home_location'], - self.opts['default_output_exec_location'], - code_block, - allocate=node.at_options.allocate) - num_globals += 1 - return num_globals - - def generate_constants(self, num_globals, code_block): - for step_idx, pnode, _, fnode in self.G.nodes_iterator(): + self.execute_phase("outputs", node, qrec, edge) + + def generate_constants(self): + for _, pnode, _, fnode in self.G.nodes_iterator(): anode = pnode if not fnode else fnode - if isinstance(anode, FilterParameters): - qrec = self.G.quantization[NodeId(pnode, fnode)] - cname = self.naming_convension.\ - get_global_name(pnode.name, step_idx, pnode, "weights") - c_entry = self.name_cache.get(anode) - if not c_entry: - c_entry = {} - self.name_cache[anode] = c_entry - c_entry['weights'] = cname - if num_globals != 0: - code_block.append_last(',') - const_info = gen_const_info(os.path.join(self.opts['tensor_directory'], - cname+".tensor"), qrec.weights_q) - gen_global_decl(cname, qrec.weights_q, - self.opts['default_global_home_location'], - self.opts['default_global_exec_location'], - code_block, - const_info=const_info) - num_globals += 1 - - # biases are always generated even if they are 0 - 
if anode.has_bias: - biases_q = qrec.biases_q - else: - biases_q = qrec.out_q - - cname = self.naming_convension.\ - get_global_name(pnode.name, step_idx, pnode, "biases") - c_entry['biases'] = cname - if num_globals != 0: - code_block.append_last(',') - const_info = gen_const_info(os.path.join(self.opts['tensor_directory'], - cname+".tensor"), biases_q) - gen_global_decl(cname, biases_q, - self.opts['default_global_home_location'], - self.opts['default_global_exec_location'], - code_block, - const_info=const_info) - num_globals += 1 - - if isinstance(anode, MultiplicativeBiasParameters) and anode.has_mul_bias: - mul_biases_q = qrec.mul_biases_q - - cname = self.naming_convension.get_global_name(pnode.name, step_idx, - pnode, "mul_biases") - c_entry['mul_biases'] = cname - if num_globals != 0: - code_block.append_last(',') - const_info = gen_const_info(os.path.join(self.opts['tensor_directory'], - cname+".tensor"), mul_biases_q) - gen_global_decl(cname, mul_biases_q, - self.opts['default_global_home_location'], - self.opts['default_global_exec_location'], - code_block, - const_info=const_info) - num_globals += 1 - elif isinstance(anode, ConstantInputParameters): - qrec = self.G.quantization[NodeId(pnode, fnode)] - # the name cache will be updated when all the edges are analysed by local_generator - # the name of the constant is attached to the output edge so find it - out_edge = self.G.out_edges(pnode.name)[0] - eparams = out_edge.params - cname = self.naming_convension.get_edge_name(eparams.creating_node.name, - eparams.creating_step, - eparams.edge_type, - eparams.edge_order) - if num_globals != 0: - code_block.append_last(',') - const_info = gen_const_info(os.path.join(self.opts['tensor_directory'], - cname+".tensor"), qrec.out_qs[0]) - gen_global_decl(cname, qrec.out_qs[0], - self.opts['default_global_home_location'], - self.opts['default_global_exec_location'], - code_block, - const_info=const_info) - num_globals += 1 - return num_globals - - def generate_inputs(self, code_block): - num_globals = 0 + qrec = self.G.quantization.get(NodeId(pnode, fnode)) + self.execute_phase("globals", anode, qrec, pnode, fnode) + + def generate_inputs(self): inputs = set() for node in self.G.input_nodes(): - out_qs = self.G.quantization[NodeId(node)].out_qs + qrec = self.G.quantization[NodeId(node)] for edge in self.G.out_edges(node.name): eparams = edge.params if eparams in inputs: continue - if num_globals != 0: - code_block.append_last(',') inputs.add(eparams) - gen_input_decl(eparams, - out_qs[edge.from_idx], - self.opts['default_input_home_location'], - self.opts['default_input_exec_location'], - code_block, - allocate=node.at_options.allocate) - num_globals += 1 - return num_globals + self.execute_phase("inputs", node, qrec, edge) + + def cnn_generators(self): + if self.G.graph_identity.quantization_type == 'SQ8': + return "\"CNN_Generators_SQ8.h\"" + if self.G.graph_identity.quantization_type == 'POW2': + return "\"CNN_Generators.h\"" + raise ValueError() + + def cnn_kernels(self): + if self.G.graph_identity.quantization_type == 'SQ8': + return "\"CNN_BasicKernels_SQ8.h\"" + if self.G.graph_identity.quantization_type == 'POW2': + return "\"CNN_BasicKernels.h\"" + raise ValueError() def extra_includes_generator(self, indent=0): code_block = CodeBlock(starting_indent=indent) + code_block.write("#include \"nntool_extra_generators.h\"") + return str(code_block) + + def extra_includes_kernels(self, indent=0): + code_block = CodeBlock(starting_indent=indent) + 
code_block.write("\"nntool_extra_kernels.h\"") return str(code_block) def kernel_generator(self, indent=0): code_block = CodeBlock(starting_indent=indent) - dump_input = False - at_ver = self.opts['at_ver'] - for step_idx, node, _, fnode in self.G.nodes_iterator(): - if fnode: - continue + for step_idx, node, _, _ in self.G.nodes_iterator(yield_fusions=False): name = node.name cname = self.naming_convension.get_node_name(name, step_idx, node) if node.at_options.vcd_trace_on is not None: self.add_vcd_trace_binding(cname, node.at_options.vcd_trace_on) - self.name_cache[node] = {'node': cname} - if not isinstance(node, (InputParameters, OutputParameters, - ConcatParameters, ReshapeParameters)): - code_block.comment("generator for {}", name) + self.name_cache.set(node, 'node', cname) in_eparams = self.G.get_in_params(name) out_eparams = self.G.get_out_params(name) try: @@ -461,123 +311,27 @@ def kernel_generator(self, indent=0): LOG.error("Quantization record not found for node %s", node.name) raise err - if isinstance(node, Conv2DParameters): - self.set_conv_bindings(step_idx, in_eparams, out_eparams, cname, node, qrec, - dump_tensors=self.opts['dump_tensors'] or node.at_options.dump_tensors) - gen_conv_pool_relu(cname, node, qrec, None, - None, None, None, code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - elif isinstance(node, PoolingParameters): - self.set_in_out_bindings(in_eparams, out_eparams, cname, node, qrec) - gen_conv_pool_relu(cname, None, None, - node, qrec, None, None, code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - elif isinstance(node, ActivationParameters): - self.set_in_out_bindings(in_eparams, out_eparams, cname, node, qrec) - gen_conv_pool_relu(cname, None, None, - None, None, node, qrec, code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - elif isinstance(node, FcParameters): - self.set_fc_bindings(step_idx, in_eparams, out_eparams, cname, node, qrec, - dump_tensors=self.opts['dump_tensors'] or node.at_options.dump_tensors) - gen_linear_relu(cname, node, qrec, None, None, code_block=code_block, - at_ver=at_ver, gen_ctrl=node.get_gen_ctrl()) - elif isinstance(node, GlobalPoolParameters): - self.set_in_out_bindings(in_eparams, out_eparams, cname, node, qrec) - gen_globalpool(cname, node, qrec, code_block=code_block, at_ver=at_ver) - elif isinstance(node, SoftMaxParameters): - self.set_softmax_bindings(in_eparams, out_eparams, cname, node, qrec) - gen_softmax(cname, node, qrec, code_block=code_block, at_ver=at_ver) - elif isinstance(node, ConvFusionParameters): - cnodes = node.contained_nodes() - quants = [self.G.quantization[NodeId(node, fnode)] for fnode in cnodes] - if node.fusion_type == "conv_active_pool": - self.set_conv_bindings(step_idx, in_eparams, out_eparams, - cname, cnodes[0], quants[0], out_q=quants[1], - dump_tensors=self.opts['dump_tensors'] or node.at_options.dump_tensors) - gen_conv_pool_relu(cname, cnodes[0], quants[0], cnodes[2], quants[2], - cnodes[1], quants[1], code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - elif node.fusion_type == "conv_pool_active": - self.set_conv_bindings(step_idx, in_eparams, out_eparams, - cname, cnodes[0], quants[0], out_q=quants[2], - dump_tensors=self.opts['dump_tensors'] or node.at_options.dump_tensors) - gen_conv_pool_relu(cname, cnodes[0], quants[0], cnodes[1], quants[1], - cnodes[2], quants[2], code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - elif node.fusion_type == "conv_active": - 
self.set_conv_bindings(step_idx, in_eparams, out_eparams, - cname, cnodes[0], quants[0], out_q=quants[1], - dump_tensors=self.opts['dump_tensors'] or node.at_options.dump_tensors) - gen_conv_pool_relu(cname, cnodes[0], quants[0], None, None, cnodes[1], - quants[1], code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - elif node.fusion_type == "conv_pool": - self.set_conv_bindings(step_idx, in_eparams, out_eparams, - cname, cnodes[0], quants[0], out_q=quants[1], - dump_tensors=self.opts['dump_tensors'] or node.at_options.dump_tensors) - gen_conv_pool_relu(cname, cnodes[0], quants[0], cnodes[1], quants[1], None, - None, code_block=code_block, at_ver=at_ver, gen_ctrl=node.get_gen_ctrl()) - elif node.fusion_type == "linear_active": - self.set_fc_bindings(step_idx, in_eparams, out_eparams, - cname, cnodes[0], quants[0], out_q=quants[1], - dump_tensors=self.opts['dump_tensors'] or node.at_options.dump_tensors) - gen_linear_relu(cname, cnodes[0], quants[0], - cnodes[1], quants[1], code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - elif node.fusion_type == "pool_active": - self.set_in_out_bindings(in_eparams, out_eparams, cname, - node, qrec, out_q=quants[1]) - gen_pool_relu(cname, cnodes[0], quants[0], - cnodes[1], quants[1], code_block=code_block, at_ver=at_ver, - gen_ctrl=node.get_gen_ctrl()) - else: - raise NotImplementedError("this fusion type is not implemented") - elif isinstance(node, MatScaleFusionParameters): - if at_ver < 3: - raise NotImplementedError( - "matscale not imlemented before version 3 of AUtoTiler") - self.set_matscale_bindings(in_eparams, out_eparams, cname, node, qrec) - gen_matscale(cname, node, qrec, code_block=code_block) - elif isinstance(node, MatrixAddParameters): - self.set_matrixadd_bindings(in_eparams, out_eparams, cname, node, qrec) - if qrec.in_qs[0].q == qrec.in_qs[1].q and qrec.in_qs[0].q == qrec.out_qs[0].q: - gen_matrixadd(cname, node, qrec, code_block=code_block, at_ver=at_ver) - else: - gen_matrixadddyn(cname, node, qrec, code_block=code_block, at_ver=at_ver) - elif isinstance(node, ReshapeParameters): + if isinstance(node, ReshapeParameters): if node.transpose_in is not None or node.transpose_out is not None: LOG.error("Don't know how to generate kernel \ for a reshape that has a transpose.") return "" continue - elif isinstance(node, TransposeParameters): - if node.transpose_dimension == 1: - continue - if node.transpose_dimension < 2 or node.transpose_dimension > 3: - raise NotImplementedError("only 2D or 3D transposes are currently supported") - code_block.comment("transpose from {} to {}", node.in_dims[0], node.out_dims[0]) - self.set_in_out_bindings(in_eparams, out_eparams, cname, node, qrec) - gen_3d_transpose(cname, node, qrec, code_block=code_block) - elif isinstance(node, (InputParameters, ConstantInputParameters)): - if self.opts['dump_tensors'] or node.at_options.dump_tensors: - dump_input = True - continue - elif isinstance(node, (OutputParameters)): + elif isinstance(node, (InputParameters, OutputParameters, ConstantInputParameters)): continue elif not isinstance(node, (ConcatParameters)): - LOG.error("Don't know how to generate kernel for parameter type %s. \ - Perhaps you need to run some fusions.", node.__class__.__name__) - return "" + self.execute_phase("bindings", node, qrec, in_eparams, out_eparams, cname) + if not self.execute_phase("kernels", node, qrec, in_eparams, out_eparams, cname): + raise NotImplementedError(("Don't know how to generate kernel for parameter type %s %s. 
" + + "Perhaps you need to run some fusions.") % (node.name, + node.__class__.__name__)) + # if self.opts['generate_checksums']: # if last_node_was_input: # self.add_checksum_binding(cname, name, step_idx, in_eparams, True) # self.add_checksum_binding(cname, name, step_idx, out_eparams, False) - if dump_input: - self.add_dump_tensors_binding(cname, name, step_idx, in_eparams, qrec, True) - dump_input = False - if self.opts['dump_tensors'] or node.at_options.dump_tensors: - self.add_dump_tensors_binding(cname, name, step_idx, out_eparams, qrec, False) + for kernel in self.kernels: + kernel.code(code_block) return str(code_block) def add_vcd_trace_binding(self, cname, enable): @@ -588,60 +342,6 @@ def add_vcd_trace_binding(self, cname, enable): Imm(0), before=True)) - def add_dump_tensors_binding(self, cname, name, step_idx, eparams, qrec, is_input): - node = self.G[name] - if is_input: - dims = node.in_dims[0] - qtype = qrec.in_qs[0] - tensor_type = TT_TENSOR_TYPES['TT_INPUT'] - step_idx = self.G.in_edges(name)[0].from_node.step_idx - else: - dims = node.out_dims[0] - qtype = qrec.out_qs[0] - tensor_type = TT_TENSOR_TYPES['TT_OUTPUT'] - - self.func_bindings.append( - FunctionBindingList(cname, - "dt_write_tensor", - GArgEdge(eparams[0]), - Imm(step_idx), - Imm(tensor_type), - Imm(dims.size()), - Imm(qtype.bits), - Imm(len(dims.shape)), - *[Imm(v) for v in dims.shape], - before=is_input)) - - def add_dump_params_binding(self, cname, node: FilterParameters, qrec, step_idx): - dims = node.filter - qtype = qrec.weights_q - tensor_type = TT_TENSOR_TYPES['TT_WEIGHTS'] - self.func_bindings.append( - FunctionBindingList(cname, - "dt_write_tensor", - GArgNode(node, 'weights'), - Imm(step_idx), - Imm(tensor_type), - Imm(dims.size()), - Imm(qtype.bits), - Imm(len(dims.actual_shape)), - *[Imm(v) for v in dims.actual_shape], - before=True)) - if node.has_bias: - qtype = qrec.biases_q - tensor_type = TT_TENSOR_TYPES['TT_BIASES'] - self.func_bindings.append( - FunctionBindingList(cname, - "dt_write_tensor", - GArgNode(node, 'biases'), - Imm(step_idx), - Imm(tensor_type), - Imm(node.out_dims[0].c), - Imm(qtype.bits), - Imm(1), - Imm(node.out_dims[0].c), - before=True)) - def add_checksum_binding(self, cname, name, step_idx, eparams, before): node = self.G[name] if before: @@ -658,130 +358,34 @@ def add_checksum_binding(self, cname, name, step_idx, eparams, before): before=before) ) - def set_in_out_bindings(self, in_eparams, out_eparams, cname, node, node_q, out_q=None): - if out_q is None: - out_q = node_q - self.bindings.append( - CommentBindingList("Node {} inq {} outq {}", node.name, - node_q.in_qs[0].q, out_q.out_qs[0].q) - ) - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - - def set_matrixadd_bindings(self, in_eparams, out_eparams, cname, params, node_q): - self.bindings.append( - CommentBindingList("Node {} inq1 {} inq2 {} outq {}", params.name, - node_q.in_qs[0].q, node_q.in_qs[1].q, node_q.out_qs[0].q) - ) - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgEdge(in_eparams[1]), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - - def set_matscale_bindings(self, in_eparams, out_eparams, cname, params, node_q): - if self.opts['at_ver'] < 3: - raise NotImplementedError("matscale is only implemented in AutoTiler v3") - if params.fusion_type == "vec_scalar": - self.bindings.append( - CommentBindingList("Node {} inq1 {} inq2 {} inq3 {} outq {}", params.name, - node_q.in_qs[0].q, node_q.in_qs[1].q, 
node_q.in_qs[2].q, node_q.out_qs[0].q) - ) - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgEdge(in_eparams[1]), GNodeArgEdge(in_eparams[2]), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - else: - self.bindings.append( - CommentBindingList("Node {} inq1 {} inq2 {} outq {}", params.name, - node_q.in_qs[0].q, node_q.in_qs[1].q, node_q.out_qs[0].q) - ) - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgEdge(in_eparams[1]), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - - def set_softmax_bindings(self, in_eparams, out_eparams, cname, params, node_q): - self.bindings.append( - CommentBindingList("Node {} inq {} outq {}", params.name, - node_q.in_qs[0].q, node_q.out_qs[0].q) - ) - if self.opts['at_ver'] > 2: - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - else: - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), - GNodeArgEdge(out_eparams[0], "GNA_OUT"), - Imm(node_q.in_qs[0].q))) - - def set_conv_bindings(self, step_idx, in_eparams, out_eparams, cname, params, conv_q, - out_q=None, dump_tensors=False): - if out_q is None: - out_q = conv_q - self.bindings.append( - CommentBindingList("Node {} inq {} weightsq {} outq {} biasesq {}", cname, - conv_q.in_qs[0].q, conv_q.weights_q.q, out_q.out_qs[0].q, conv_q.biases_q.q) - ) - if self.opts['at_ver'] > 2: - if params.has_mul_bias: - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, 'weights'), - GNodeArgNode(params, 'biases'), GNodeArgNode(params, 'mul_biases'), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - else: - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, 'weights'), - GNodeArgNode(params, 'biases'), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - else: - norm = conv_q.in_qs[0].q + conv_q.weights_q.q - out_q.out_qs[0].q - normbias = conv_q.in_qs[0].q + conv_q.weights_q.q - \ - out_q.out_qs[0].q * 2 + conv_q.biases_q.q - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, 'weights'), - GNodeArgNode(params, 'biases'), - GNodeArgEdge(out_eparams[0], "GNA_OUT"), - Imm(norm), - Imm(normbias))) - if dump_tensors: - self.add_dump_params_binding(cname, params, conv_q, step_idx) - - def set_fc_bindings(self, step_idx, in_eparams, out_eparams, cname, - params, linear_q, out_q=None, dump_tensors=False): - if out_q is None: - out_q = linear_q - self.bindings.append( - CommentBindingList("Node {} inq {} weightsq {} outq {}", params.name, - linear_q.in_qs[0].q, linear_q.weights_q.q, out_q.out_qs[0].q) - ) - if self.opts['at_ver'] > 2: - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, 'weights'), - GNodeArgNode(params, 'biases'), - GNodeArgEdge(out_eparams[0], "GNA_OUT"))) - else: - self.bindings.append( - NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, 'weights'), - GNodeArgNode(params, 'biases'), - GNodeArgEdge(out_eparams[0], "GNA_OUT"), - Imm(linear_q.in_qs[0].q + - linear_q.weights_q.q - linear_q.out_qs[0].q), - Imm(linear_q.in_qs[0].q + linear_q.weights_q.q - linear_q.biases_q.q))) - if dump_tensors: - self.add_dump_params_binding(cname, params, linear_q, step_idx) - def write_constants(self): - write_constants(self.G, self.naming_convension, self.opts['tensor_directory']) + write_constants(self.globals, tensor_directory=self.opts['tensor_directory']) + + def 
load_basic_kernel_library(self, indent=0): + code_block = CodeBlock(starting_indent=indent) + if self.G.graph_identity.quantization_type == 'SQ8': + code_block.write("LoadCNN_SQ8_Library();") + return str(code_block) + if self.G.graph_identity.quantization_type == 'POW2': + code_block.write("LoadCNNLibrary();") + return str(code_block) + return ValueError("Quantization type not known %s", self.G.graph_identity.quantization_type) def header_generator(self, indent=0): code_block = CodeBlock(starting_indent=indent) - for step_idx, node, _, fnode in self.G.nodes_iterator(): + for _, node, _, fnode in self.G.nodes_iterator(): if fnode: continue - if not isinstance(node, InputParameters) and not isinstance(node, OutputParameters): - continue - name = node.name - cname = self.naming_convension.get_node_name(name, step_idx, node) + cname = self.name_cache[node]['node'] qrec = self.G.quantization[NodeId(node)] - for i, out_q in enumerate(qrec.out_qs): - code_block.write("#define {}_Q\t{}".format(cname, out_q.q)) - return str(code_block) \ No newline at end of file + code_block.comment(cname) + if self.G.graph_identity.quantization_type == 'SQ8': + code_block.write("#define {}_OUT_SCALE\t{}".format(cname, qrec.out_qs[0].scale[0])) + qscales, qnorms = qrec.out_qs[0].get_quantized_scale() + code_block.write("#define {}_OUT_QSCALE\t{}".format(cname, qscales[0])) + code_block.write("#define {}_OUT_QNORM\t{}".format(cname, qnorms[0])) + + elif self.G.graph_identity.quantization_type == 'POW2': + for out_q in qrec.out_qs: + code_block.write("#define {}_Q\t{}".format(cname, out_q.q)) + return str(code_block) diff --git a/tools/nntool/generation/default_template.py b/tools/nntool/generation/default_template.py index 74c59fa5e..99c451cbe 100644 --- a/tools/nntool/generation/default_template.py +++ b/tools/nntool/generation/default_template.py @@ -11,7 +11,7 @@ def generator_template(G, gen): #include #include #include "AutoTilerLib.h" -#include "CNN_Generators.h" +#include ${gen.cnn_generators()} ${gen.extra_includes_generator(indent=0)} void ${gen.project_name}Model(unsigned int L1Memory, unsigned int L2Memory, unsigned int L3Memory, unsigned int L3Flash) @@ -21,13 +21,13 @@ def generator_template(G, gen): // SetKernelOpts(KER_OPT_NONE, KER_OPT_BUFFER_PROMOTE); SetSymbolDynamics(); - SetUsedFilesNames(0, 2, "CNN_BasicKernels.h", "${gen.project_name}.h"); + SetUsedFilesNames(0, 3, ${gen.extra_includes_kernels(indent=0)}, ${gen.cnn_kernels()}, "${gen.project_name}.h"); SetGeneratedFilesNames("${gen.project_name}Kernels.c", "${gen.project_name}Kernels.h"); ${gen.options_generator(indent=1)} ${gen.memory_device_generator(indent=1)} - LoadCNNLibrary(); +${gen.load_basic_kernel_library(indent=1)} ${gen.kernel_generator(indent=1)} @@ -67,7 +67,7 @@ def generator_template_v3(G, gen): #include #include #include "AutoTilerLib.h" -#include "CNN_Generators.h" +#include ${gen.cnn_generators()} ${gen.extra_includes_generator(indent=0)} void ${gen.project_name}Model(unsigned int L1Memory, unsigned int L2Memory, unsigned int L3Memory, unsigned int L3Flash) @@ -77,13 +77,14 @@ def generator_template_v3(G, gen): // SetKernelOpts(KER_OPT_NONE, KER_OPT_BUFFER_PROMOTE); SetSymbolDynamics(); - SetUsedFilesNames(0, 2, "CNN_BasicKernels.h", "${gen.project_name}.h"); + SetUsedFilesNames(0, 3, ${gen.extra_includes_kernels(indent=0)}, ${gen.cnn_kernels()}, "${gen.project_name}.h"); SetGeneratedFilesNames("${gen.project_name}Kernels.c", "${gen.project_name}Kernels.h"); ${gen.options_generator(indent=1)} 
${gen.memory_device_generator(indent=1)} - LoadCNNLibrary(); +${gen.load_basic_kernel_library(indent=1)} + LoadNNTools_Extra_Library(); ${gen.kernel_generator(indent=1)} @@ -121,7 +122,12 @@ def generator_template_v3(G, gen): # pylint: disable=unused-argument def generator_template_header(G, gen): ''' +#ifndef ${gen.project_name.upper()}_GRAPHINFO_H +#define ${gen.project_name.upper()}_GRAPHINFO_H +// Quantized scales can be used round_norm(val * QSCALE, QNORM) giving the real value in Q8 + ${gen.header_generator(indent=0)} +#endif ${gen.project_name.upper()}_GRAPHINFO_H ''' def execute_template(template_function, G, naming_convension=None, code_generator=None): diff --git a/tools/nntool/generation/generators/__init__.py b/tools/nntool/generation/generators/__init__.py new file mode 100644 index 000000000..776be0c7e --- /dev/null +++ b/tools/nntool/generation/generators/__init__.py @@ -0,0 +1,69 @@ +from .bindings.pow2.softmax_bindings_generator import \ + softmax_bindings_generator as pow2_softmax_bindings_generator +from .bindings.mult8.softmax_bindings_generator import \ + softmax_bindings_generator as mult8_softmax_bindings_generator +from .bindings.mult8.conv_bindings_generator import \ + conv_bindings_generator as mult8_conv_bondings_generator +from .bindings.mult8.fc_binding_generator import \ + fc_bindings_generator as mult8_fc_bindigns_generator +from .bindings.mult8.inout_bindings_generator import \ + in_out_bindings_generator as mult8_in_out_bindings_generator +from .bindings.mult8.inout_infos_bindings_generator import \ + in_out_infos_bindings_generator +from .bindings.mult8.matadd_bindings_generator import \ + matadd_bindings_generator as mult8_matadd_bindings_generator +from .bindings.pow2.conv_bindings_generator import \ + conv_bindings_generator as pow2_conv_bindings_generator +from .bindings.pow2.fc_bindings_generator import \ + fc_bindings_generator as pow2_fc_bindings_generator +from .bindings.pow2.inout_bindings_generator import \ + in_out_bindings_generator as pow2_in_out_bindings_generator +from .bindings.pow2.matadd_bindings_generator import \ + matadd_bindings_generator as pow2_matadd_bindings_generator +from .bindings.pow2.matscale_bindings_generator import \ + matscale_bindings_generator as pow2_matscale_bindings_generator +from .bindings.pow2.transpose_bindings_generator import \ + transpose_bindings_generator +from .generator_decorators import (RegisteredGeneratorsMixin, + generation_function) +from .globals.constant_input_generator import constant_input_globals_generator +from .globals.filter_generator import filter_globals_generator +from .globals.input_generator import inputs_input_generator +from .globals.mult8_filter_generator import mult8_filter_globals_generator +from .globals.mult8_infos_generator import mult8_infos_generator +from .globals.output_generator import outputs_output_generator +from .kernels.general.imageformat_kernels_generator import \ + imageformat_kernels_generator +from .kernels.mult8.conv_pool_relu_kernels_generator import \ + conv_pool_relu_kernels_generator as mult8_conv_pool_relu_kernels_generator +from .kernels.mult8.global_pool_kernels_generator import \ + global_pool_kernels_generator as mult8_global_pool_kernels_generator +from .kernels.mult8.linear_relu_kernels_generator import \ + linear_relu_kernels_generator +from .kernels.mult8.matadd_kernels_generator import matadd_kernel_generator +from .kernels.mult8.pool_relu_kernels_generator import \ + pool_act_kernels_generator as mult8_pool_act_kernels_generatoor +from 
.kernels.mult8.softmax_kernels_generator import \ + softmax_kernels_generator as mult8_softmax_kernels_generator +from .kernels.mult8.three_d_transpose_kernels_generator import \ + three_d_transpose_kernels_generator as \ + mult8_three_d_transpose_kernels_generator +from .kernels.mult8.mat_vect_mult_kernels_generator import \ + mat_vect_mult_kernel_generator +from .kernels.pow2.conv_pool_relu_kernels_generator import \ + conv_pool_relu_kernels_generator as pow2_conv_pool_relu_kernels_generator +from .kernels.pow2.global_pool_kernels_generator import \ + global_pool_kernels_generator as pow2_global_pool_kernels_generator +from .kernels.pow2.linear_relu_kernels_generator import \ + linear_relu_kernels_generator as pow2_linear_relu_kernels_generator +from .kernels.pow2.matadd_kernels_generator import \ + matadd_kernels_generator as pow2_matadd_kernels_generator +from .kernels.pow2.matscale_kernels_generator import \ + matscale_kernels_generator as pow2_matscale_kernels_generator +from .kernels.pow2.pool_relu_kernels_generator import \ + pool_kernels_generator as pow2_pool_kernels_generator +from .kernels.pow2.softmax_kernels_generator import \ + softmax_kernels_generator as pow2_softmax_kernels_generator +from .kernels.pow2.three_d_transpose_kernels_generator import \ + three_d_transpose_kernels_generator as \ + pow2_three_d_transpose_kernels_generator diff --git a/tools/nntool/generation/generators/bindings/__init__.py b/tools/nntool/generation/generators/bindings/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/bindings/mult8/__init__.py b/tools/nntool/generation/generators/bindings/mult8/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/bindings/mult8/conv_bindings_generator.py b/tools/nntool/generation/generators/bindings/mult8/conv_bindings_generator.py new file mode 100644 index 000000000..5f6581d1a --- /dev/null +++ b/tools/nntool/generation/generators/bindings/mult8/conv_bindings_generator.py @@ -0,0 +1,79 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
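The graph-info header emitted by header_generator earlier in this patch exposes each node output both as a float scale (OUT_SCALE) and as an integer pair (OUT_QSCALE, OUT_QNORM), to be consumed as round_norm(val * QSCALE, QNORM) per the template comment. A minimal sketch of that usage, assuming round_norm is a rounding right shift (an assumption; the helper itself is not in this patch) and using made-up macro values:

def round_norm(val, qnorm):
    # assumed behaviour: right shift with round-to-nearest
    return (val + (1 << (qnorm - 1))) >> qnorm

# hypothetical values taken from a generated <project>_graphinfo.h
OUT_QSCALE = 24
OUT_QNORM = 9

raw = 57                                           # int8 value read from the node's output tensor
val_q8 = round_norm(raw * OUT_QSCALE, OUT_QNORM)   # per the template comment: the real value in Q8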
+ +from generation.bindings import (CommentBindingList, GNodeArgEdge, + GNodeArgNode, NodeBindingList) +from generation.generators.generator_decorators import (QREC_MULT8, + generation_function) +from generation.generators.globals.global_names import (BIASES, INFOS, + MULSCALE, MULSHIFT, + WEIGHTS) +from graph.types import Conv2DParameters, ConvFusionParameters, ActivationParameters +from utils.node_id import NodeId + + +@generation_function("bindings", (Conv2DParameters, ConvFusionParameters, ActivationParameters), qrec_types=(QREC_MULT8,)) +def conv_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + step_idx = node.step_idx + if isinstance(node, ActivationParameters): + set_act_bindings(gen, step_idx, in_eparams, out_eparams, cname, node, qrec) + elif isinstance(node, Conv2DParameters): + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, cname, node, qrec) + elif isinstance(node, ConvFusionParameters): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if node.fusion_type in ("conv_active_pool", "conv_active", "conv_pool"): + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[1]) + elif node.fusion_type == "conv_pool_active": + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[2]) + else: + return False + else: + return False + return True + + +def set_conv_bindings(gen, step_idx, in_eparams, out_eparams, cname, + conv_params, conv_q, out_q=None): + del step_idx + if out_q is None: + out_q = conv_q + gen.bindings.append( + CommentBindingList("Node {} inq {} weightsq {} outq {} biasesq {}", cname, + conv_q.in_qs[0], conv_q.weights_q, out_q.out_qs[0], conv_q.biases_q) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgNode(conv_params, WEIGHTS), + GNodeArgNode(conv_params, BIASES), + GNodeArgEdge(out_eparams[0], "GNA_OUT"), + GNodeArgNode(conv_params, MULSCALE), + GNodeArgNode(conv_params, MULSHIFT), + GNodeArgNode(conv_params, INFOS) + )) + +def set_act_bindings(gen, step_idx, in_eparams, out_eparams, cname, + act_params, act_qrec): + del step_idx + gen.bindings.append( + CommentBindingList("Node {} inq {} outq {}", cname, + act_qrec.in_qs[0], act_qrec.out_qs[0]) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"), + GNodeArgNode(act_params, INFOS) + )) diff --git a/tools/nntool/generation/generators/bindings/mult8/fc_binding_generator.py b/tools/nntool/generation/generators/bindings/mult8/fc_binding_generator.py new file mode 100644 index 000000000..c790b1f1e --- /dev/null +++ b/tools/nntool/generation/generators/bindings/mult8/fc_binding_generator.py @@ -0,0 +1,57 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
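All of the bindings generators added in this patch share the shape seen in conv_bindings_generator above: a module-level function decorated with generation_function that declares the node classes and quantization flavours it handles, returns False when it does not claim the node, and otherwise appends CommentBindingList / NodeBindingList entries to gen.bindings. A minimal hypothetical sketch for a single-input node (MyCustomParameters is a placeholder class, not part of nntool):

from generation.bindings import (CommentBindingList, GNodeArgEdge,
                                 NodeBindingList)
from generation.generators.generator_decorators import (QREC_MULT8,
                                                         generation_function)


class MyCustomParameters:
    # placeholder node class, for illustration only
    pass


@generation_function("bindings", (MyCustomParameters,), qrec_types=(QREC_MULT8,))
def my_custom_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool:
    # a comment line documenting the quantization, then the kernel arguments in call order
    gen.bindings.append(
        CommentBindingList("Node {} inq {} outq {}", node.name,
                           qrec.in_qs[0], qrec.out_qs[0]))
    gen.bindings.append(
        NodeBindingList(cname, GNodeArgEdge(in_eparams[0]),
                        GNodeArgEdge(out_eparams[0], "GNA_OUT")))
    return True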
+ + +from generation.bindings import (CommentBindingList, GNodeArgEdge, + GNodeArgNode, NodeBindingList) +from generation.generators.generator_decorators import (QREC_MULT8, + generation_function) +from generation.generators.globals.global_names import WEIGHTS, BIASES, MULSCALE, MULSHIFT, INFOS +from graph.types import ConvFusionParameters, FcParameters +from utils.node_id import NodeId + + +@generation_function("bindings", (ConvFusionParameters, FcParameters), qrec_types=(QREC_MULT8, )) +def fc_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + step_idx = node.step_idx + if isinstance(node, FcParameters): + set_fc_bindings(gen, step_idx, in_eparams, out_eparams, cname, node, qrec) + elif isinstance(node, ConvFusionParameters) and node.fusion_type == "linear_active": + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + set_fc_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[1]) + else: + return False + return True + + +def set_fc_bindings(gen, step_idx, in_eparams, out_eparams, cname, + params, linear_q, out_q=None): + del step_idx + if out_q is None: + out_q = linear_q + gen.bindings.append( + CommentBindingList("Node {} inq {} weightsq {} outq {}", params.name, + linear_q.in_qs[0], linear_q.weights_q, out_q.out_qs[0]) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgNode(params, WEIGHTS), + GNodeArgNode(params, BIASES), + GNodeArgEdge(out_eparams[0], "GNA_OUT"), + GNodeArgNode(params, MULSCALE), + GNodeArgNode(params, MULSHIFT), + GNodeArgNode(params, INFOS) + )) diff --git a/tools/nntool/generation/generators/bindings/mult8/inout_bindings_generator.py b/tools/nntool/generation/generators/bindings/mult8/inout_bindings_generator.py new file mode 100644 index 000000000..f50a654ae --- /dev/null +++ b/tools/nntool/generation/generators/bindings/mult8/inout_bindings_generator.py @@ -0,0 +1,40 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from generation.bindings import (CommentBindingList, GNodeArgEdge, + NodeBindingList) +from generation.generators.generator_decorators import (QREC_MULT8, + generation_function) +from graph.types import ImageFormatParameters, TransposeParameters + + +@generation_function("bindings", (TransposeParameters, ImageFormatParameters), qrec_types=(QREC_MULT8, )) +def in_out_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + if isinstance(node, TransposeParameters): + _, real_transpose = node.real_shape() + if len(real_transpose) <= 1: + return True + set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + + +def set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q=None): + if out_q is None: + out_q = node_q + gen.bindings.append( + CommentBindingList("Node {} inq {} outq {}", node.name, + str(node_q.in_qs[0]), str(out_q.out_qs[0])) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) diff --git a/tools/nntool/generation/generators/bindings/mult8/inout_infos_bindings_generator.py b/tools/nntool/generation/generators/bindings/mult8/inout_infos_bindings_generator.py new file mode 100644 index 000000000..2e55a53fb --- /dev/null +++ b/tools/nntool/generation/generators/bindings/mult8/inout_infos_bindings_generator.py @@ -0,0 +1,47 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from generation.bindings import (CommentBindingList, GNodeArgEdge, GNodeArgNode, + NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import (ActivationParameters, ConvFusionParameters, + GlobalPoolParameters, PoolingParameters, ActivationFusion) + + +@generation_function("bindings", (PoolingParameters, ConvFusionParameters, ActivationParameters, + GlobalPoolParameters, ActivationFusion), qrec_types=(QREC_MULT8, )) +def in_out_infos_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + if isinstance(node, ActivationFusion): + cnodes = node.contained_nodes() + if isinstance(cnodes[0], (GlobalPoolParameters, PoolingParameters)): + set_in_out_bindings(gen, in_eparams, out_eparams, cname, cnodes[0], qrec) + return True + return False + if isinstance(node, (GlobalPoolParameters, PoolingParameters)): + set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + else: + return False + return True + + +def set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q=None): + if out_q is None: + out_q = node_q + gen.bindings.append( + CommentBindingList("Node {} inq {} outq {}", node.name, + str(node_q.in_qs[0]), str(out_q.out_qs[0])) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"), GNodeArgNode(node, "infos"))) diff --git a/tools/nntool/generation/generators/bindings/mult8/mat_vect_mult_bindings_generator.py b/tools/nntool/generation/generators/bindings/mult8/mat_vect_mult_bindings_generator.py new file mode 100644 index 000000000..b14b9189d --- /dev/null +++ b/tools/nntool/generation/generators/bindings/mult8/mat_vect_mult_bindings_generator.py @@ -0,0 +1,50 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from generation.bindings import (CommentBindingList, GNodeArgEdge, + GNodeArgNode, NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import MatrixMulParameters, ActivationFusion +from utils.node_id import NodeId + + +@generation_function("bindings", (MatrixMulParameters, ActivationFusion), qrec_types=(QREC_MULT8,)) +def mat_vect_mul_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + step_idx = node.step_idx + if isinstance(node, ActivationFusion): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if isinstance(cnodes[0], MatrixMulParameters): + set_mat_vect_mul_bindings(gen, cnodes[0], step_idx, in_eparams, out_eparams, + cname, quants[0], out_q=quants[1]) + return True + return False + set_mat_vect_mul_bindings(gen, node, step_idx, in_eparams, out_eparams, cname, qrec) + return True + + +def set_mat_vect_mul_bindings(gen, node, step_idx, in_eparams, out_eparams, cname, qrec, out_q=None): + del step_idx + if out_q is None: + out_q = qrec + gen.bindings.append( + CommentBindingList("Node {} in1q {} in2q {} outq {}", cname, + qrec.in_qs[0], qrec.in_qs[1], out_q.out_qs[0]) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(in_eparams[1]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"), + GNodeArgNode(node, 'infos') + )) diff --git a/tools/nntool/generation/generators/bindings/mult8/matadd_bindings_generator.py b/tools/nntool/generation/generators/bindings/mult8/matadd_bindings_generator.py new file mode 100644 index 000000000..aa6fab4b7 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/mult8/matadd_bindings_generator.py @@ -0,0 +1,52 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from generation.bindings import (CommentBindingList, GNodeArgEdge, + GNodeArgNode, NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import MatrixAddParameters, ActivationFusion +from utils.node_id import NodeId + + +@generation_function("bindings", (MatrixAddParameters, ActivationFusion), qrec_types=(QREC_MULT8,)) +def matadd_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + step_idx = node.step_idx + if isinstance(node, ActivationFusion): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if isinstance(cnodes[0], MatrixAddParameters): + set_matadd_bindings(gen, cnodes[0], step_idx, in_eparams, out_eparams, + cname, quants[0], out_q=quants[1]) + return True + return False + set_matadd_bindings(gen, node, step_idx, in_eparams, out_eparams, cname, qrec) + return True + + +def set_matadd_bindings(gen, node, step_idx, in_eparams, out_eparams, cname, qrec, out_q=None): + del step_idx + if out_q is None: + out_q = qrec + scaled_idx = qrec.scaled_idx + not_scaled_idx = 0 if scaled_idx else 1 + gen.bindings.append( + CommentBindingList("Node {} in1q {} in2q {} outq {}", cname, + qrec.in_qs[scaled_idx], qrec.in_qs[not_scaled_idx], out_q.out_qs[0]) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[scaled_idx]), + GNodeArgEdge(in_eparams[not_scaled_idx]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"), + GNodeArgNode(node, 'infos') + )) diff --git a/tools/nntool/generation/generators/bindings/mult8/softmax_bindings_generator.py b/tools/nntool/generation/generators/bindings/mult8/softmax_bindings_generator.py new file mode 100644 index 000000000..246ae1ba1 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/mult8/softmax_bindings_generator.py @@ -0,0 +1,39 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
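matadd_bindings_generator above orders its edges so that the input selected by qrec.scaled_idx comes first: with two int8 inputs at different scales, one of them is requantized before the addition, and the factors for that are emitted as In1Scale/In1ScaleN by the infos generator later in this patch. A rough sketch of what that rescaling amounts to, assuming a plain multiply-and-shift (the real SQ8 kernels are not reproduced here); all values are made up:

import numpy as np

def rescale(x, qscale, qnorm):
    # assumed requantization: widen to int32, multiply by the integer scale, arithmetic shift right
    return (x.astype(np.int32) * qscale) >> qnorm

in1 = np.array([10, -25, 4], dtype=np.int8)    # the "scaled" input (scaled_idx)
in2 = np.array([3, 7, -12], dtype=np.int8)     # the other input
acc = rescale(in1, qscale=45, qnorm=7) + in2   # hypothetical In1Scale / In1ScaleN values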
+import numpy as np + +from generation.bindings import (CommentBindingList, GNodeArgEdge, + GNodeArgNode, NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import SoftMaxParameters + + +@generation_function("bindings", (SoftMaxParameters,), qrec_types=(QREC_MULT8,)) +def softmax_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + set_softmax_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + return True + + +def set_softmax_bindings(gen, in_eparams, out_eparams, cname, params, node_q): + in_q = -np.ceil(np.log2(node_q.in_qs[0].scale)) + out_q = -np.ceil(np.log2(node_q.out_qs[0].scale)) + gen.bindings.append( + CommentBindingList("Node {} inq {} outq {}", + params.name, int(in_q[0]), int(out_q[0])) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"), + GNodeArgNode(params, 'infos') + )) diff --git a/tools/nntool/generation/generators/bindings/pow2/__init__.py b/tools/nntool/generation/generators/bindings/pow2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/bindings/pow2/conv_bindings_generator.py b/tools/nntool/generation/generators/bindings/pow2/conv_bindings_generator.py new file mode 100644 index 000000000..a11610e98 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/pow2/conv_bindings_generator.py @@ -0,0 +1,68 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
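The softmax binding above only needs the nearest power-of-two Q format for its comment line, which it derives from the float scale with -ceil(log2(scale)); a scale of 1/128, for example, maps to Q7. A tiny self-contained sketch of that conversion:

import numpy as np

scale = np.array([1 / 128.0])                 # hypothetical int8 tensor scale
q_bits = int(-np.ceil(np.log2(scale))[0])     # -> 7, i.e. roughly a Q7 tensor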
+ +from generation.bindings import (CommentBindingList, GNodeArgEdge, + GNodeArgNode, NodeBindingList) +from generation.generators.generator_decorators import (QREC_POW2, + generation_function) +from generation.generators.globals.global_names import (BIASES, MULSCALE, + WEIGHTS) +from graph.types import Conv2DParameters, ConvFusionParameters +from utils.node_id import NodeId + + +@generation_function("bindings", (Conv2DParameters, ConvFusionParameters), qrec_types=(QREC_POW2, )) +def conv_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + step_idx = node.step_idx + if isinstance(node, Conv2DParameters): + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, cname, node, qrec) + elif isinstance(node, ConvFusionParameters): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if node.fusion_type == "conv_active_pool": + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[1]) + elif node.fusion_type == "conv_pool_active": + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[2]) + elif node.fusion_type == "conv_active": + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[1]) + elif node.fusion_type == "conv_pool": + set_conv_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[1]) + else: + return False + else: + return False + return True + +def set_conv_bindings(gen, step_idx, in_eparams, out_eparams, cname, params, conv_q, + out_q=None): + if out_q is None: + out_q = conv_q + gen.bindings.append( + CommentBindingList("Node {} inq {} weightsq {} outq {} biasesq {}", cname, + conv_q.in_qs[0].q, conv_q.weights_q.q, out_q.out_qs[0].q, conv_q.biases_q.q) + ) + if params.has_mul_bias: + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, WEIGHTS), + GNodeArgNode(params, BIASES), GNodeArgNode(params, MULSCALE), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) + else: + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, WEIGHTS), + GNodeArgNode(params, BIASES), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) diff --git a/tools/nntool/generation/generators/bindings/pow2/fc_bindings_generator.py b/tools/nntool/generation/generators/bindings/pow2/fc_bindings_generator.py new file mode 100644 index 000000000..56478d7a8 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/pow2/fc_bindings_generator.py @@ -0,0 +1,50 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ + +from generation.bindings import (CommentBindingList, GNodeArgEdge, + GNodeArgNode, NodeBindingList) +from generation.generators.generator_decorators import (QREC_POW2, + generation_function) +from generation.generators.globals.global_names import WEIGHTS, BIASES +from graph.types import ConvFusionParameters, FcParameters +from utils.node_id import NodeId + + +@generation_function("bindings", (FcParameters,), qrec_types=(QREC_POW2, )) +def fc_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + step_idx = node.step_idx + if isinstance(node, FcParameters): + set_fc_bindings(gen, step_idx, in_eparams, out_eparams, cname, node, qrec) + elif isinstance(node, ConvFusionParameters) and node.fusion_type == "linear_active": + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + set_fc_bindings(gen, step_idx, in_eparams, out_eparams, + cname, cnodes[0], quants[0], out_q=quants[1]) + else: + return False + return True + +def set_fc_bindings(gen, step_idx, in_eparams, out_eparams, cname, + params, linear_q, out_q=None): + if out_q is None: + out_q = linear_q + gen.bindings.append( + CommentBindingList("Node {} inq {} weightsq {} outq {}", params.name, + linear_q.in_qs[0].q, linear_q.weights_q.q, out_q.out_qs[0].q) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgNode(params, WEIGHTS), + GNodeArgNode(params, BIASES), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) diff --git a/tools/nntool/generation/generators/bindings/pow2/inout_bindings_generator.py b/tools/nntool/generation/generators/bindings/pow2/inout_bindings_generator.py new file mode 100644 index 000000000..fe2b01bb5 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/pow2/inout_bindings_generator.py @@ -0,0 +1,49 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
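In the POW2 flavour every tensor carries a plain Qx format, so a linear (or conv) layer accumulates at Q(in.q + weights.q) and only needs a right shift to reach the output format; the AutoTiler v2 bindings removed earlier in this patch passed exactly that shift as an immediate (norm = in.q + weights.q - out.q). A small sketch of the idea with made-up Q formats, assuming a simple truncating shift:

in_q, weights_q, out_q = 12, 14, 12   # hypothetical Q formats

norm = in_q + weights_q - out_q       # shift that brings the accumulator back to out_q
acc = 57 * 33                         # one multiply term, expressed in Q(in_q + weights_q)
out = acc >> norm                     # value now expressed in Q(out_q)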
+ +from generation.bindings import (CommentBindingList, GNodeArgEdge, + NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import (ActivationParameters, ConvFusionParameters, + GlobalPoolParameters, PoolingParameters, TransposeParameters, ImageFormatParameters) +from utils.node_id import NodeId + + +@generation_function("bindings", (PoolingParameters, + ConvFusionParameters, + ActivationParameters, + GlobalPoolParameters, TransposeParameters, ImageFormatParameters), qrec_types=(QREC_POW2, )) +def in_out_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + if isinstance(node, (PoolingParameters, ActivationParameters, GlobalPoolParameters, TransposeParameters, ImageFormatParameters)): + set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + elif isinstance(node, ConvFusionParameters) and node.fusion_type == "pool_active": + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + set_in_out_bindings(gen, in_eparams, out_eparams, cname, + node, qrec, out_q=quants[1]) + else: + return False + return True + + +def set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q=None): + if out_q is None: + out_q = node_q + gen.bindings.append( + CommentBindingList("Node {} inq {} outq {}", node.name, + str(node_q.in_qs[0]), str(out_q.out_qs[0])) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) diff --git a/tools/nntool/generation/generators/bindings/pow2/matadd_bindings_generator.py b/tools/nntool/generation/generators/bindings/pow2/matadd_bindings_generator.py new file mode 100644 index 000000000..6dc854933 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/pow2/matadd_bindings_generator.py @@ -0,0 +1,35 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ + +from generation.bindings import (CommentBindingList, GNodeArgEdge, + NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import MatrixAddParameters + + +@generation_function("bindings", (MatrixAddParameters,), qrec_types=(QREC_POW2, )) +def matadd_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + set_matrixadd_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + return True + + +def set_matrixadd_bindings(gen, in_eparams, out_eparams, cname, params, node_q): + gen.bindings.append( + CommentBindingList("Node {} inq1 {} inq2 {} outq {}", params.name, + node_q.in_qs[0].q, node_q.in_qs[1].q, node_q.out_qs[0].q) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgEdge(in_eparams[1]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) diff --git a/tools/nntool/generation/generators/bindings/pow2/matscale_bindings_generator.py b/tools/nntool/generation/generators/bindings/pow2/matscale_bindings_generator.py new file mode 100644 index 000000000..c550b8e23 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/pow2/matscale_bindings_generator.py @@ -0,0 +1,46 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ + +from generation.bindings import (CommentBindingList, GNodeArgEdge, + NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import MatScaleFusionParameters + + +@generation_function("bindings", (MatScaleFusionParameters,), qrec_types=(QREC_POW2, )) +def matscale_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + set_matscale_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + return True + + +def set_matscale_bindings(gen, in_eparams, out_eparams, cname, params, node_q): + if params.fusion_type == "vec_scalar": + gen.bindings.append( + CommentBindingList("Node {} inq1 {} inq2 {} inq3 {} outq {}", params.name, + node_q.in_qs[0].q, node_q.in_qs[1].q, + node_q.in_qs[2].q, node_q.out_qs[0].q) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(in_eparams[1]), GNodeArgEdge(in_eparams[2]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) + else: + gen.bindings.append( + CommentBindingList("Node {} inq1 {} inq2 {} outq {}", params.name, + node_q.in_qs[0].q, node_q.in_qs[1].q, node_q.out_qs[0].q) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), GNodeArgEdge(in_eparams[1]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) diff --git a/tools/nntool/generation/generators/bindings/pow2/softmax_bindings_generator.py b/tools/nntool/generation/generators/bindings/pow2/softmax_bindings_generator.py new file mode 100644 index 000000000..8dd5aa170 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/pow2/softmax_bindings_generator.py @@ -0,0 +1,35 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ + +from generation.bindings import (CommentBindingList, GNodeArgEdge, + NodeBindingList) +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import SoftMaxParameters + + +@generation_function("bindings", (SoftMaxParameters,), qrec_types=(QREC_POW2,)) +def softmax_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + set_softmax_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + return True + + +def set_softmax_bindings(gen, in_eparams, out_eparams, cname, params, node_q): + gen.bindings.append( + CommentBindingList("Node {} inq {} outq {}", params.name, + node_q.in_qs[0].q, node_q.out_qs[0].q) + ) + gen.bindings.append( + NodeBindingList(cname, GNodeArgEdge(in_eparams[0]), + GNodeArgEdge(out_eparams[0], "GNA_OUT"))) diff --git a/tools/nntool/generation/generators/bindings/pow2/transpose_bindings_generator.py b/tools/nntool/generation/generators/bindings/pow2/transpose_bindings_generator.py new file mode 100644 index 000000000..975c1bf54 --- /dev/null +++ b/tools/nntool/generation/generators/bindings/pow2/transpose_bindings_generator.py @@ -0,0 +1,24 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from generation.generators.bindings.pow2.inout_bindings_generator import \ + set_in_out_bindings +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import TransposeParameters + + +@generation_function("bindings", (TransposeParameters, ), qrec_types=(QREC_POW2,)) +def transpose_bindings_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, qrec) + return True diff --git a/tools/nntool/generation/generators/generator_decorators.py b/tools/nntool/generation/generators/generator_decorators.py new file mode 100644 index 000000000..dd787714e --- /dev/null +++ b/tools/nntool/generation/generators/generator_decorators.py @@ -0,0 +1,102 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import logging +from abc import ABC, abstractmethod + +from quantization.multiplicative.mult_quantization import \ + MultQuantizationRecordBase +from quantization.symmetric.symmetric_quantization import \ + SymmetricQuantizationBase + +LOG = logging.getLogger("nntool." + __name__) + +GENERATION_PHASES = [ + "inputs", + "outputs", + "globals", + "bindings", + "kernels" +] + +QREC_POW2 = "qrec_pow2" +QREC_MULT8 = "qrec_mult8" +QREC_MULT32 = "qrec_mult32" + + +class GeneratorMatcher(ABC): + @abstractmethod + def match(self, param, qrec): + pass + + +class OldMatcher(GeneratorMatcher): + def __init__(self, params_matched, qrecs_matched): + self._params_matched = params_matched + self._qrecs_matched = qrecs_matched + + @staticmethod + def match_qrec(qrec_types, qrec): + if qrec_types is None: + return True + if isinstance(qrec, SymmetricQuantizationBase): + return any([qrec_type == "qrec_pow2" for qrec_type in qrec_types]) + if isinstance(qrec, MultQuantizationRecordBase): + return any([qrec_type == "qrec_mult8" for qrec_type in qrec_types]) + return False + + def match(self, param, qrec): + return isinstance(param, self._params_matched) and self.match_qrec(self._qrecs_matched, qrec) + + def __str__(self): + return "OldMatcher: nodes:({}) qrecs:({})".format(",".join(param.__name__ for param in self._params_matched), + "All" if self._qrecs_matched is None + else ",".join(self._qrecs_matched)) + + +class RegisteredGeneratorsMixin(): + Generators = {phase: [] for phase in GENERATION_PHASES} + + def execute_phase(self, phase_name, param, qrec, *args, break_on_true=False, **kwargs): + gens = self.Generators[phase_name] + res = False + for gen in gens: + if gen['matcher'].match(param, qrec): + LOG.debug("gen phase %s: matched function %s", + phase_name, + gen['func'].__name__) + this_res = gen['func'](self, param, qrec, *args, **kwargs) + res = this_res or res + if res and break_on_true: + break + return res + +def generation_match(phase_name, matcher): + def phase_func(func): + gens = RegisteredGeneratorsMixin.Generators[phase_name] + gens.append({'matcher': matcher, 'func': func}) + LOG.debug("registering function %s with matcher %s", + func.__name__, matcher) + return func + return phase_func + +def generation_function(phase_name, params, qrec_types=None): + def phase_func(func): + gens = RegisteredGeneratorsMixin.Generators[phase_name] + matcher = OldMatcher(params, qrec_types) + gens.append({'matcher': matcher, 'func': func}) + LOG.debug("registering function %s with matcher %s", + func.__name__, matcher) + return func + return phase_func diff --git a/tools/nntool/generation/generators/globals/__init__.py b/tools/nntool/generation/generators/globals/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/globals/constant_input_generator.py b/tools/nntool/generation/generators/globals/constant_input_generator.py new file mode 100644 index 000000000..ac63dbf5e --- /dev/null +++ b/tools/nntool/generation/generators/globals/constant_input_generator.py @@ -0,0 +1,41 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import os + +from generation.at_types.constant_info import ConstantInfo +from generation.at_types.tc_arg_info import GlobalArgInfo +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import ConstantInputParameters + + +@generation_function("globals", (ConstantInputParameters,), qrec_types=(QREC_POW2, )) +def constant_input_globals_generator(gen, node, qrec, pnode, fnode) -> bool: + del node, fnode + # the name cache will be updated when all the edges are analysed by local_generator + # the name of the constant is attached to the output edge so find it + out_edge = gen.G.out_edges(pnode.name)[0] + eparams = out_edge.params + cname = gen.naming_convension.get_edge_name(eparams.creating_node.name, + eparams.creating_step, + eparams.edge_type, + eparams.edge_order) + file_name = os.path.join(gen.opts['tensor_directory'], + cname+".tensor") + const_info = ConstantInfo(file_name, qrec.out_qs[0], contents=qrec.out_qs[0].quantize(node.value)) + gen.globals.append(GlobalArgInfo(qrec.out_qs[0].ctype, cname, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info)) + return True diff --git a/tools/nntool/generation/generators/globals/filter_generator.py b/tools/nntool/generation/generators/globals/filter_generator.py new file mode 100644 index 000000000..c676e3564 --- /dev/null +++ b/tools/nntool/generation/generators/globals/filter_generator.py @@ -0,0 +1,93 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
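generator_decorators.py above replaces the old per-layer methods with a registry: generation_function appends (matcher, function) pairs to RegisteredGeneratorsMixin.Generators, and execute_phase walks them for a node until one claims it. A toy, self-contained restatement of that mechanism (the names and the use of int as a stand-in node class are purely illustrative):

GENERATORS = {"bindings": []}

def generation_function(phase, params, qrec_types=None):
    # register func as a handler for the given node classes in this phase
    def wrap(func):
        GENERATORS[phase].append((params, func))
        return func
    return wrap

@generation_function("bindings", (int,))       # toy "node class": int
def int_bindings_generator(gen, node, qrec) -> bool:
    print("bound", node)
    return True

def execute_phase(phase, gen, node, qrec, break_on_true=False):
    res = False
    for params, func in GENERATORS[phase]:
        if isinstance(node, params):
            res = func(gen, node, qrec) or res
            if res and break_on_true:
                break
    return res

execute_phase("bindings", None, 3, None, break_on_true=True)   # prints "bound 3"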
+ +import os + +import numpy as np + +from generation.at_types.constant_info import ConstantInfo +from generation.at_types.tc_arg_info import GlobalArgInfo +from generation.generators.generator_decorators import (QREC_POW2, + generation_function) +from generation.generators.globals.global_names import WEIGHTS, BIASES, MULSCALE +from graph.types import FilterParameters, MultiplicativeBiasParameters + + +@generation_function("globals", (FilterParameters,), qrec_types=(QREC_POW2,)) +def filter_globals_generator(gen, node, qrec, pnode, fnode) -> bool: + del fnode + cname = gen.naming_convension.get_global_name(pnode.name, pnode.step_idx, + pnode, WEIGHTS) + gen.name_cache.set(node, WEIGHTS, cname) + + file_name = os.path.join(gen.opts['tensor_directory'], + cname+".tensor") + weights_q = qrec.weights_q + contents = weights_q.quantize(node.weights).astype(weights_q.dtype, + order='C', + casting='no', + copy=True) + + const_info = ConstantInfo(file_name, qrec.weights_q, contents=contents) + + gen.globals.append(GlobalArgInfo(qrec.weights_q.ctype, cname, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info)) + + # biases are always generated even if they are 0 + if node.has_bias: + biases_q = qrec.biases_q + contents = biases_q.quantize(node.biases).astype(biases_q.dtype, + order='C', + casting='no', + copy=True) + else: + biases_q = qrec.out_q + contents = biases_q.quantize(np.zeros((node.out_dims[0].c))).astype(biases_q.dtype, + order='C', + casting='no', + copy=True) + + cname = gen.naming_convension.get_global_name(pnode.name, pnode.step_idx, + pnode, BIASES) + + gen.name_cache.set(node, BIASES, cname) + file_name = os.path.join(gen.opts['tensor_directory'], + cname+".tensor") + const_info = ConstantInfo(file_name, biases_q, contents=contents) + + gen.globals.append(GlobalArgInfo(biases_q.ctype, cname, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info)) + + if isinstance(node, MultiplicativeBiasParameters) and node.has_mul_bias: + mul_biases_q = qrec.mul_biases_q + + cname = gen.naming_convension.get_global_name(pnode.name, pnode.step_idx, + pnode, MULSCALE) + gen.name_cache.set(node, MULSCALE, cname) + + contents = mul_biases_q.quantize(node.mul_biases).astype(mul_biases_q.dtype, + order='C', + casting='no', + copy=True) + const_info = ConstantInfo(file_name, mul_biases_q, contents=contents) + + gen.globals.append(GlobalArgInfo(mul_biases_q.ctype, cname, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info)) + return True diff --git a/tools/nntool/generation/generators/globals/global_names.py b/tools/nntool/generation/generators/globals/global_names.py new file mode 100644 index 000000000..41bd32714 --- /dev/null +++ b/tools/nntool/generation/generators/globals/global_names.py @@ -0,0 +1,21 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +WEIGHTS = "weights" +BIASES = "biases" +MULSCALE = "mul_scale" +MULSHIFT = "mul_shift" +INFOS = "infos" + +__all__ = ['WEIGHTS', 'BIASES', 'MULSCALE', 'MULSHIFT', 'INFOS'] diff --git a/tools/nntool/generation/generators/globals/input_generator.py b/tools/nntool/generation/generators/globals/input_generator.py new file mode 100644 index 000000000..372808126 --- /dev/null +++ b/tools/nntool/generation/generators/globals/input_generator.py @@ -0,0 +1,26 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from generation.at_types.tc_arg_info import InputArgInfo +from generation.generators.generator_decorators import generation_function +from graph.types import InputParameters + + +@generation_function("inputs", (InputParameters,)) +def inputs_input_generator(gen, node, qrec, edge) -> bool: + gen.globals.append(InputArgInfo(qrec.out_qs[edge.from_idx].ctype, edge.params.name, + gen.opts['default_input_home_location'], + gen.opts['default_input_exec_location'], + allocate=node.at_options.allocate)) + return True diff --git a/tools/nntool/generation/generators/globals/mult8_filter_generator.py b/tools/nntool/generation/generators/globals/mult8_filter_generator.py new file mode 100644 index 000000000..067e92b23 --- /dev/null +++ b/tools/nntool/generation/generators/globals/mult8_filter_generator.py @@ -0,0 +1,94 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
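The SQ8 filter globals generated below attach a per-channel integer scale (mul_scale) and shift (mul_shift) to every filter; these are the MULSCALE and MULSHIFT arguments that the conv/linear bindings earlier in this patch wire into the kernel call, standing in for the float rescaling from the int32 accumulator down to the int8 output. A rough sketch of the arithmetic this encodes, assuming truncating shifts and saturation (the actual CNN_*_SQ8 kernels are not part of this excerpt); values are made up:

import numpy as np

acc = np.array([12345, -6789], dtype=np.int32)     # per-channel int32 accumulators
mul_scale = np.array([113, 97], dtype=np.int32)    # hypothetical per-channel scales
mul_shift = np.array([14, 13], dtype=np.int32)     # hypothetical per-channel shifts

out = np.clip((acc * mul_scale) >> mul_shift, -128, 127).astype(np.int8)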
+ +import os + +import numpy as np + +from generation.at_types.constant_info import ConstantInfo +from generation.at_types.tc_arg_info import GlobalArgInfo +from generation.generators.generator_decorators import (QREC_MULT8, + generation_function) +from graph.types import (ConvFusionParameters, FilterParameters) + +from utils.node_id import NodeId + +from .mult8_infos_generator import gen_constant +from .global_names import * + +@generation_function("globals", (FilterParameters, ConvFusionParameters), qrec_types=(QREC_MULT8,)) +def mult8_filter_globals_generator(gen, node, qrec, pnode, fnode) -> bool: + if fnode is not None: + return False + if isinstance(pnode, FilterParameters): + gen_filter_globals(gen, pnode, pnode, qrec) + elif isinstance(pnode, ConvFusionParameters): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if node.fusion_type in ("conv_active_pool", "conv_active", "linear_active", "conv_pool_active", "conv_pool"): + gen_filter_globals(gen, pnode, cnodes[0], quants[0]) + else: + return False + else: + return False + return True + + +def gen_filter_globals(gen, pnode, fnode, fqrec): + cname, file_name = gen_constant(gen, pnode, fnode, WEIGHTS) + weights_q = fqrec.weights_q + const_info = ConstantInfo(file_name, weights_q, contents=fqrec.gen_weights(fnode, fnode.weights)) + + gen.globals.append(GlobalArgInfo(weights_q.ctype, cname, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info)) + + # biases are always generated even if they are 0 + if fnode.has_bias: + biases_q = fqrec.biases_q + biases = fnode.biases + else: + biases_q = fqrec.out_qs[0] + biases = np.zeros((fnode.out_dims[0].c)) + + contents = fqrec.gen_biases(fnode, biases, fnode.weights) + + cname, file_name = gen_constant(gen, pnode, fnode, BIASES) + const_info = ConstantInfo(file_name, biases_q, contents=contents) + + gen.globals.append(GlobalArgInfo(biases_q.ctype, cname, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info)) + + cname_mul_scale, file_name_mul_scale = gen_constant(gen, pnode, fnode, MULSCALE) + cname_mul_shift, file_name_mul_shift = gen_constant(gen, pnode, fnode, MULSHIFT) + + mul_biases_q = fqrec.mul_biases_q + + const_info_mul_scale = ConstantInfo( + file_name_mul_scale, mul_biases_q, contents=fqrec.gen_mul_biases(fnode)) + const_info_mul_shift = ConstantInfo( + file_name_mul_shift, mul_biases_q.shift_qtype, contents=fqrec.mul_biases_q.qnorms) + + gen.globals.append(GlobalArgInfo(mul_biases_q.ctype, cname_mul_scale, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info_mul_scale)) + + gen.globals.append(GlobalArgInfo(mul_biases_q.shift_ctype, cname_mul_shift, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info_mul_shift)) diff --git a/tools/nntool/generation/generators/globals/mult8_infos_generator.py b/tools/nntool/generation/generators/globals/mult8_infos_generator.py new file mode 100644 index 000000000..6b0453c03 --- /dev/null +++ b/tools/nntool/generation/generators/globals/mult8_infos_generator.py @@ -0,0 +1,182 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your 
option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import os + +import numpy as np + +from generation.at_types.constant_info import ConstantInfo +from generation.at_types.tc_arg_info import GlobalArgInfo +from generation.generators.generator_decorators import (QREC_MULT8, + generation_function) +from graph.types import (ConvFusionParameters, FilterParameters, + GlobalPoolParameters, HSigmoidActivationParameters, + HSwishActivationParameters, PoolingParameters, + SoftMaxParameters, ActivationFusion, MatrixMulParameters, + ReluActivationParameters, MatrixAddParameters, ActivationParameters) +from quantization.qtype import QType +from quantization.symmetric.kernels.activations import ( + hsigmoid_mult_gen_factors, hswish_mult_gen_factors) +from utils.node_id import NodeId +from .global_names import * + + +@generation_function("globals", + (FilterParameters, ConvFusionParameters, ActivationParameters, + GlobalPoolParameters, MatrixAddParameters, MatrixMulParameters, + ActivationFusion, PoolingParameters, SoftMaxParameters), + qrec_types=(QREC_MULT8,)) +def mult8_infos_generator(gen, node, qrec, pnode, fnode) -> bool: + if fnode is not None: + return False + if isinstance(pnode, FilterParameters): + if pnode.has_bias: + bias_q = qrec.biases_q.q + else: + bias_q = qrec.biases_q.q + act_infos(gen, pnode, pnode, None, None, extra1=bias_q) + elif isinstance(pnode, (GlobalPoolParameters, PoolingParameters)): + act_infos(gen, pnode, pnode, None, None) + elif isinstance(pnode, ActivationParameters): + act_infos(gen, pnode, pnode, pnode, gen.G.quantization[NodeId(pnode)]) + elif isinstance(pnode, ConvFusionParameters): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if node.fusion_type.startswith('linear') or node.fusion_type.startswith('conv'): + if cnodes[0].has_bias: + bias_q = quants[0].biases_q.q + else: + bias_q = quants[0].out_qs[0].q + if node.fusion_type in ("conv_active_pool", "conv_active", "linear_active"): + act_infos(gen, pnode, cnodes[0], cnodes[1], quants[1], extra1=bias_q) + elif node.fusion_type == "conv_pool_active": + act_infos(gen, pnode, cnodes[0], cnodes[2], quants[2], extra1=bias_q) + elif node.fusion_type == "conv_pool": + act_infos(gen, pnode, cnodes[0], None, None, extra1=bias_q) + elif isinstance(pnode, MatrixAddParameters): + qrec.set_add_scale() + act_infos(gen, pnode, pnode, None, None, + extra1=qrec.scale_in_mul_biases_q.qbiases[0], + extra2=qrec.scale_in_mul_biases_q.qnorms[0], + extra3=qrec.scale_mul_biases_q.qbiases[0], + extra4=qrec.scale_mul_biases_q.qnorms[0]) + elif isinstance(pnode, MatrixMulParameters): + qrec.set_scale() + act_infos(gen, pnode, pnode, None, None, + extra1=qrec.scale_mul_biases_q.qbiases[0], + extra2=qrec.scale_mul_biases_q.qnorms[0]) + elif isinstance(pnode, SoftMaxParameters): + act_infos(gen, pnode, pnode, pnode, qrec) + elif isinstance(pnode, ActivationFusion): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if isinstance(cnodes[0], (GlobalPoolParameters, PoolingParameters)): + act_infos(gen, pnode, cnodes[0], cnodes[1], quants[1]) + elif isinstance(cnodes[0], 
MatrixAddParameters): + quants[0].set_add_scale() + act_infos(gen, pnode, cnodes[0], cnodes[1], quants[1], + extra1=quants[0].scale_in_mul_biases_q.qbiases[0], + extra2=quants[0].scale_in_mul_biases_q.qnorms[0], + extra3=quants[0].scale_mul_biases_q.qbiases[0], + extra4=quants[0].scale_mul_biases_q.qnorms[0]) + elif isinstance(cnodes[0], MatrixMulParameters): + qrec.set_scale() + act_infos(gen, pnode, cnodes[0], cnodes[1], quants[1], + extra1=qrec.scale_mul_biases_q.qbiases[0], + extra2=qrec.scale_mul_biases_q.qnorms[0]) + else: + return False + return True + else: + return False + return True + + +def gen_constant(gen, pnode, cache_node, const_type): + cname = gen.naming_convension.get_global_name(pnode.name, pnode.step_idx, + pnode, const_type) + gen.name_cache.set(cache_node, const_type, cname) + file_name = os.path.join(gen.opts['tensor_directory'], + cname+".tensor") + return cname, file_name + + +def act_infos(gen, pnode, fnode, act_params, act_q, extra1=0, extra2=0, extra3=0, extra4=0): + if isinstance(pnode, FilterParameters): + comment = str.format("BiasQ: {}", extra1) + elif isinstance(pnode, MatrixAddParameters): + comment = str.format("In1Scale: {} In1ScaleN: {} OutScale: {} OutScaleN: {}", + extra1, extra2, extra3, extra4) + else: + comment = "" + + if act_params is None: + contents = np.array([0, 0, 0, 0, 0, extra1, extra2, extra3, extra4], dtype=np.int8) + elif isinstance(act_params, ReluActivationParameters): + if act_params.upper_bound is None or fnode is not None: + contents = np.array([0, 0, 0, 0, 0, extra1, extra2, extra3, extra4], dtype=np.int8) + if len(comment) == 0: + comment = "all 0" + else: + fac_1 = act_q.in_qs[0].quantize(act_params.upper_bound) + contents = np.array([0, 0, fac_1, 0, 0, extra1, extra2, extra3, extra4], + dtype=np.int8) + comment += str.format("in: {:05f} out: {:05f} A0: {} B0: 0 C0: 0", + act_q.in_qs[0].scale[0], + act_q.out_qs[0].scale[0], + fac_1[0]) + elif isinstance(act_params, HSigmoidActivationParameters): + # currently combines all scaling factors into one scale and shift + fac_1, upper_bound, _ = hsigmoid_mult_gen_factors(act_params, act_q) + contents = np.array([act_q.scale_mul_biases_q.qbiases[0], + act_q.scale_mul_biases_q.qnorms[0], + upper_bound, fac_1, 1, extra1, extra2, extra3, extra4], + dtype=np.int8) + comment += str.format("in: {:05f} out: {:05f} qbias: {} qnorm: {} A0: {} B0: {} C0: 1", + act_q.in_qs[0].scale[0], + act_q.out_qs[0].scale[0], + act_q.scale_mul_biases_q.qbiases[0], + act_q.scale_mul_biases_q.qnorms[0], + fac_1[0], upper_bound[0]) + elif isinstance(act_params, HSwishActivationParameters): + # currently combines all scaling factors into one scale and shift + fac_1, upper_bound, _ = hswish_mult_gen_factors(act_q) + contents = np.array([act_q.scale_mul_biases_q.qbiases[0], + act_q.scale_mul_biases_q.qnorms[0], + upper_bound, fac_1, 1, extra1, extra2, extra3, extra4], + dtype=np.int8) + comment += str.format("in: {:05f} out: {:05f} qbias: {} qnorm: {} A0: {} B0: {} C0: 1", + act_q.in_qs[0].scale[0], + act_q.out_qs[0].scale[0], + act_q.scale_mul_biases_q.qbiases[0], + act_q.scale_mul_biases_q.qnorms[0], + fac_1[0], upper_bound[0]) + elif isinstance(act_params, SoftMaxParameters): + norm = 15 + np.ceil(np.log2(act_q.in_qs[0].scale)) + contents = np.array([norm, 0, 0, 0, 0, extra1, extra2, extra3, extra4], dtype=np.int8) + comment += str.format("in: {:05f} out: {:05f} NORM: {}", + act_q.in_qs[0].scale[0], + act_q.out_qs[0].scale[0], + int(norm[0])) + else: + raise NotImplementedError("activation tye not implemented") + 
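+    # Store the assembled parameter vector as an int8 "infos" constant tensor
+    # and register it as a graph global with the comment built above.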
+ cname, file_name = gen_constant(gen, pnode, fnode, INFOS) + const_info = ConstantInfo(file_name, QType(bits=8, q=0, signed=True), contents=contents) + + gen.globals.append(GlobalArgInfo("int8", cname, + gen.opts['default_global_home_location'], + gen.opts['default_global_exec_location'], + const_info=const_info, + comment=comment)) diff --git a/tools/nntool/generation/generators/globals/output_generator.py b/tools/nntool/generation/generators/globals/output_generator.py new file mode 100644 index 000000000..2869b0c87 --- /dev/null +++ b/tools/nntool/generation/generators/globals/output_generator.py @@ -0,0 +1,27 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from generation.at_types.tc_arg_info import OutputArgInfo +from generation.generators.generator_decorators import generation_function +from graph.types import OutputParameters + + +@generation_function("outputs", (OutputParameters,)) +def outputs_output_generator(gen, node, qrec, edge) -> bool: + eparams = edge.params + gen.globals.append(OutputArgInfo(qrec.in_qs[edge.to_idx].ctype, eparams.name, + gen.opts['default_output_home_location'], + gen.opts['default_output_exec_location'], + allocate=node.at_options.allocate)) + return True diff --git a/tools/nntool/generation/generators/kernels/__init__.py b/tools/nntool/generation/generators/kernels/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/kernels/autotiler_kernel.py b/tools/nntool/generation/generators/kernels/autotiler_kernel.py new file mode 100644 index 000000000..94ed4402e --- /dev/null +++ b/tools/nntool/generation/generators/kernels/autotiler_kernel.py @@ -0,0 +1,21 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
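+# Abstract base class for the kernel generators in this package: each concrete
+# kernel implements code(), appending its AutoTiler generator call to a
+# CodeBlock and returning it. Illustrative use, with names taken from the SQ8
+# generators below:
+#
+#     kernel = SoftmaxKernel(cname, node, qrec, at_ver=3)
+#     code_block = kernel.code()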
+ + +from abc import ABC, abstractmethod + +class AutotilerKernel(ABC): + @abstractmethod + def code(self, code_block=None): + pass diff --git a/tools/nntool/generation/generators/kernels/general/__init__.py b/tools/nntool/generation/generators/kernels/general/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/kernels/general/imageformat_kernels_generator.py b/tools/nntool/generation/generators/kernels/general/imageformat_kernels_generator.py new file mode 100644 index 000000000..d55931d00 --- /dev/null +++ b/tools/nntool/generation/generators/kernels/general/imageformat_kernels_generator.py @@ -0,0 +1,60 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function +from graph.types import ImageFormatParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + +NNTOOL_KOP = {"RGB565_RGB888": "NNTOOL_KOP_RGB565", + "RGB888": "NNTOOL_KOP_RGB888", + "RGB16": "NNTOOL_KOP_RGB16", + "BW8": "NNTOOL_KOP_BW", + "BW16": "NNTOOL_KOP_BW16"} + +def gen_at_imageformat(code_block, name, in_dim, do_offset, nntool_kop): + code_block.write('CNN_Norm("{}", {}, {}, {}, {});', + name, in_dim.w, in_dim.h, do_offset and "1" or "0", nntool_kop) + + +@generation_function("kernels", (ImageFormatParameters, )) +def imageformat_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams, qrec + gen.kernels.append(ImageFormatKernel(cname, node)) + return True + + +class ImageFormatKernel(AutotilerKernel): + def __init__(self, cname, params): + self.in_dim = params.in_dims[0] + self.cname = cname + self.node_name = params.name + assert params.format_change in ("RGB565_RGB888", "RGB888", "RGB16", "BW8", "BW16"), "unknown format change" + assert params.norm_func in ("OFFSET_INT8", "SHIFT_INT8", "OUT_INT16"), "unknown normalization" + self.in_format = params.format_change + self.do_offset = params.norm_func == "OFFSET_INT8" + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + gen_at_imageformat(code_block, self.cname, self.in_dim, self.do_offset, NNTOOL_KOP[self.in_format]) + return code_block diff --git a/tools/nntool/generation/generators/kernels/mult8/__init__.py b/tools/nntool/generation/generators/kernels/mult8/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/kernels/mult8/conv_pool_relu_kernels_generator.py b/tools/nntool/generation/generators/kernels/mult8/conv_pool_relu_kernels_generator.py new file mode 100644 index 000000000..d05f4ded5 --- /dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/conv_pool_relu_kernels_generator.py @@ -0,0 +1,257 @@ +# 
Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_types.at_params import (NO_ACTIVATION, NO_CONV, NO_POOL, + ConvATParam, GroupedConvATParam, + gen_active_at_params, gen_conv_at_params, + gen_pool_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.dim import PadDim +from graph.types import (ActivationParameters, Conv2DParameters, + ConvFusionParameters, PoolingParameters) +from utils.node_id import NodeId + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", + (Conv2DParameters, + ConvFusionParameters, + ActivationParameters), + qrec_types=(QREC_MULT8,)) +def conv_pool_relu_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + del in_eparams, out_eparams + if isinstance(node, Conv2DParameters): + gen.kernels.append(ConvPoolReluKernel(node.name, cname, node, qrec, None, + None, None, None, at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + # We want to match the pool_act generator for PoolingParameters + # elif isinstance(node, PoolingParameters): + # gen.kernels.append(ConvPoolReluKernel(node.name, cname, None, None, + # node, qrec, None, None, at_ver=gen.opts['at_ver'], + # gen_ctrl=node.get_gen_ctrl())) + elif isinstance(node, ActivationParameters): + # self.set_in_out_bindings(in_eparams, out_eparams, cname, node, qrec) + gen.kernels.append(ConvPoolReluKernel(node.name, cname, None, None, + None, None, node, qrec, at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif isinstance(node, ConvFusionParameters): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if node.fusion_type == "conv_active_pool": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], cnodes[2], quants[2], + cnodes[1], quants[1], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif node.fusion_type == "conv_pool_active": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], cnodes[1], quants[1], + cnodes[2], quants[2], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif node.fusion_type == "conv_active": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], None, None, cnodes[1], + quants[1], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif node.fusion_type == "conv_pool": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], cnodes[1], quants[1], None, + None, at_ver=gen.opts['at_ver'], gen_ctrl=node.get_gen_ctrl())) + else: + return False + else: + return False + return True + +def gen_cnn_conv_pool_act_qs8(code_block, cname, + in_feat, out_feat, width, height, bias_size, + 
conv_oper, fcx, fcy, dcx, dcy, scx, scy, conv_pad, + pool_oper, fpx, fpy, dpx, dpy, spx, spy, pool_pad, + act_oper, gen_ctrl, at_ver=3): + del at_ver + code_block.write('CNN_ConvolutionPoolAct_SQ8("{}", {}, {}, {}, {}, {}, {}, {},', + cname, + gen_ctrl, + bias_size, + 1, + in_feat, + out_feat, + width, + height) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + conv_oper, fcx, fcy, dcx, dcy, scx, scy, conv_pad) + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + pool_oper, fpx, fpy, dpx, dpy, spx, spy, pool_pad) + code_block.write('{});', act_oper) + code_block.deindent() + + +def gen_cnn_grp_conv_pool_act_qs8(code_block, cname, + grp_in, grp_out, in_feat, out_feat, width, height, bias_size, + conv_oper, fcx, fcy, dcx, dcy, scx, scy, conv_pad, + pool_oper, fpx, fpy, dpx, dpy, spx, spy, pool_pad, + act_oper, gen_ctrl, at_ver=3): + del at_ver + code_block.write('CNN_GroupedConvolutionPoolAct_SQ8("{}", {}, {}, {}, {}, {}, {}, {}, {}, {},', + cname, + gen_ctrl, + grp_in, + grp_out, + bias_size, + 1, + in_feat, + out_feat, + width, + height) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + conv_oper, fcx, fcy, dcx, dcy, scx, scy, conv_pad) + code_block.write('{}, {}, {}, {}, {}, {}, {}, {},', + pool_oper, fpx, fpy, dpx, dpy, spx, spy, pool_pad) + code_block.write('{});', act_oper) + code_block.deindent() + + +class ConvPoolReluKernel(AutotilerKernel): + def __init__(self, node_name, cname, conv_params, conv_q, + pool_params, pool_q, act_params, act_q, at_ver=3, gen_ctrl=None): + if gen_ctrl is None: + self.gen_ctrl = gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + in_q = filter_q = out_q = bias_q = mul_biases_q = None + in_dim = out_dim = None + pad_compatibilities = [] + if conv_params is not None: + at_conv_params = gen_conv_at_params(conv_params, pad_compatibilities) + in_dim = conv_params.in_dims[0] + out_dim = conv_params.out_dims[0] + # Set ENABLEIM2COL on 1x1 filters by default + if conv_params.filter.h == 1 and conv_params.filter.w == 1 and gen_ctrl.enableim2col is None: + gen_ctrl.enableim2col = 1 + filter_q = conv_q.weights_q + in_q = conv_q.in_qs[0] + out_q = conv_q.out_qs[0] + bias_q = conv_q.biases_q + if conv_params.has_mul_bias: + mul_biases_q = conv_q.mul_biases_q + else: + at_conv_params = NO_CONV + + if pool_params is not None: + at_pool_params = gen_pool_at_params(pool_params, pad_compatibilities) + if in_dim is None: + in_dim = pool_params.in_dims[0] + out_dim = pool_params.out_dims[0] + if in_q is None: + in_q = pool_q.in_qs[0] + out_q = pool_q.out_qs[0] + else: + at_pool_params = NO_POOL + + if act_params is not None: + at_act_params = gen_active_at_params(act_params, force_relu=True) + if in_dim is None: + in_dim = act_params.in_dims[0] + if out_dim is None: + out_dim = act_params.out_dims[0] + if in_q is None: + in_q = act_q.in_qs[0] + out_q = act_q.out_qs[0] + + else: + at_act_params = NO_ACTIVATION + + if pad_compatibilities: + reduction = PadDim.pad_compatibility_reduce(*pad_compatibilities, + "convolution padding is not compatible with pool padding") + if not reduction[2]: # default is balanced pad left + at_pad_ctrl = next(i for i, v in enumerate(reduction) if v) + LOG.debug("%s: generating pad control block", node_name) + self.gen_ctrl.PadType = at_pad_ctrl + self.in_dim = in_dim + self.out_dim = out_dim + self.in_q = in_q + self.bias_q = bias_q + self.out_q = out_q + self.filter_q = filter_q + self.mul_biases_q = mul_biases_q + self.at_act_params = 
at_act_params + self.at_pool_params = at_pool_params + self.at_conv_params = at_conv_params + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + gen_ctrl = self.gen_ctrl.ctrl_name + else: + gen_ctrl = "0" + + if self.at_conv_params == NO_CONV: + pp = self.at_pool_params + ap = self.at_act_params + gen_cnn_conv_pool_act_qs8(code_block, self.cname, self.in_dim.c, + self.out_dim.c, self.in_dim.w, self.in_dim.h, + self.bias_q.bits//8 if self.bias_q is not None else 0, + "KOP_NONE", 0, 0, 0, 0, 0, 0, 0, + pp.PoolOper, pp.Fpx, pp.Fpy, pp.Dpx, pp.Dpy, + pp.Spx, pp.Spy, pp.PoolPad, + ap.ReLUOper, gen_ctrl, + at_ver=self.at_ver) + else: + cp = self.at_conv_params + pp = self.at_pool_params + ap = self.at_act_params + if isinstance(self.at_conv_params, ConvATParam): + LOG.debug("%s: conv pool relu inq %s outq %s control block", + self.node_name, self.in_q, self.out_q) + gen_cnn_conv_pool_act_qs8(code_block, self.cname, self.in_dim.c, + self.out_dim.c, self.in_dim.w, self.in_dim.h, + self.bias_q.bits//8, + cp.ConvOper, cp.Fcx, cp.Fcy, cp.Dcx, cp.Dcy, + cp.Scx, cp.Scy, cp.ConvPad, + pp.PoolOper, pp.Fpx, pp.Fpy, pp.Dpx, pp.Dpy, + pp.Spx, pp.Spy, pp.PoolPad, + ap.ReLUOper, gen_ctrl, + at_ver=self.at_ver) + elif isinstance(self.at_conv_params, GroupedConvATParam): + LOG.debug("%s: grouped mulconv pool relu inq %s outq %s control block", + self.node_name, self.in_q, self.out_q) + gen_cnn_grp_conv_pool_act_qs8(code_block, self.cname, cp.GroupIn, cp.GroupOut, + self.in_dim.c, + self.out_dim.c, self.in_dim.w, self.in_dim.h, + self.bias_q.bits//8, + cp.ConvOper, cp.Fcx, cp.Fcy, cp.Dcx, cp.Dcy, + cp.Scx, cp.Scy, cp.ConvPad, + pp.PoolOper, pp.Fpx, pp.Fpy, pp.Dpx, pp.Dpy, + pp.Spx, pp.Spy, pp.PoolPad, + ap.ReLUOper, gen_ctrl, + at_ver=self.at_ver) + else: + raise ValueError('Internal error') + + return code_block diff --git a/tools/nntool/generation/generators/kernels/mult8/global_pool_kernels_generator.py b/tools/nntool/generation/generators/kernels/mult8/global_pool_kernels_generator.py new file mode 100644 index 000000000..458e8b06a --- /dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/global_pool_kernels_generator.py @@ -0,0 +1,85 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
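+# Emits CNN_GlobalPoolAct_SQ8 generator calls for SQ8 global pooling nodes,
+# including ActivationFusion nodes whose first contained node is a global pool.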
+ +import logging + +from generation.at_types.at_params import (NO_ACTIVATION, gen_active_at_params, + gen_globalpool_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import GlobalPoolParameters, ActivationFusion + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (GlobalPoolParameters, ActivationFusion), qrec_types=(QREC_MULT8, )) +def global_pool_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams, qrec + if isinstance(node, ActivationFusion): + cnodes = node.contained_nodes() + if isinstance(cnodes[0], GlobalPoolParameters): + gen.kernels.append(GlobalPoolKernel(node.name, cname, cnodes[0], cnodes[1], at_ver=gen.opts['at_ver'])) + return True + return False + gen.kernels.append(GlobalPoolKernel(node.name, cname, node, None, at_ver=gen.opts['at_ver'])) + return True + + +def gen_cnn_globalpool_sq8(code_block, cname, ctrl, feat, width, height, pooloper, actoper): + code_block.write('CNN_GlobalPoolAct_SQ8("{}", {}, {}, {}, {}, {}, {});'.format(cname, ctrl, + feat, width, + height, pooloper, + actoper)) + + +class GlobalPoolKernel(AutotilerKernel): + def __init__(self, node_name, cname, pool_params, act_params, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + if act_params is not None: + self.at_act_params = gen_active_at_params(act_params, force_relu=True) + else: + self.at_act_params = NO_ACTIVATION + + self.at_globalpool_params = gen_globalpool_at_params(pool_params) + self.in_dim = pool_params.in_dims[0] + self.out_dim = pool_params.out_dims[0] + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + gen_ctrl = self.gen_ctrl.ctrl_name + else: + gen_ctrl = "0" + + gen_cnn_globalpool_sq8(code_block, self.cname, gen_ctrl, self.in_dim.c, + self.in_dim.w, self.in_dim.h, + self.at_globalpool_params.GlobalPoolOper, + self.at_act_params.ReLUOper) + return code_block diff --git a/tools/nntool/generation/generators/kernels/mult8/linear_relu_kernels_generator.py b/tools/nntool/generation/generators/kernels/mult8/linear_relu_kernels_generator.py new file mode 100644 index 000000000..5acd71c3a --- /dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/linear_relu_kernels_generator.py @@ -0,0 +1,121 @@ + # Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
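+# Emits CNN_LinearAct_SQ8 generator calls for SQ8 fully connected layers and
+# linear_active fusions; any fused activation is generated with force_relu.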
+ +import logging + +from generation.at_types.at_params import (NO_ACTIVATION, + gen_active_at_params, gen_linear_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import (FcParameters, ConvFusionParameters) +from utils.node_id import NodeId + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (ConvFusionParameters, FcParameters), qrec_types=(QREC_MULT8, )) +def linear_relu_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + if isinstance(node, FcParameters): + gen.kernels.append(LinearReluKernel(node.name, cname, node, qrec, None, None, + at_ver=gen.opts['at_ver'], gen_ctrl=node.get_gen_ctrl())) + elif isinstance(node, ConvFusionParameters) and node.fusion_type == "linear_active": + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + gen.kernels.append(LinearReluKernel(node.name, cname, cnodes[0], quants[0], + cnodes[1], quants[1], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + else: + return False + return True + +def gen_at_linear_relu(code_block, cname, biases_ds, mulbiases_ds, + in_dim, out_dim, linear_oper, act_oper, gen_ctrl, at_ver=3): + del at_ver + code_block.write('CNN_LinearAct_SQ8("{}", {}, {}, {}, {}, {}, {}, {});', + cname, + gen_ctrl, + biases_ds, + mulbiases_ds, + in_dim, + out_dim, + linear_oper, + act_oper) + + +class LinearReluKernel(AutotilerKernel): + def __init__(self, node_name, cname, linear_params, linear_q, act_params, act_q, at_ver=3, gen_ctrl=None): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + assert linear_params is not None, "linear should always be included" + at_linear_params = gen_linear_at_params(linear_params) + in_dim = linear_params.in_dims[0] + out_dim = linear_params.out_dims[0] + filter_q = linear_q.weights_q + in_q = linear_q.in_qs[0] + out_q = linear_q.out_qs[0] + bias_q = linear_q.biases_q + mulbiases_q = linear_q.mul_biases_q + + if act_params is not None: + at_act_params = gen_active_at_params(act_params, force_relu=True) + if in_dim is None: + in_dim = act_params.in_dims[0] + if out_dim is None: + out_dim = act_params.out_dims[0] + if in_q is None: + in_q = act_q.in_qs[0] + out_q = act_q.out_qs[0] + else: + at_act_params = NO_ACTIVATION + + self.at_linear_params = at_linear_params + self.in_dim = in_dim.size() + self.out_dim = out_dim.size() + self.in_q = in_q + self.bias_q = bias_q + self.mulbiases_q = mulbiases_q + self.out_q = out_q + self.filter_q = filter_q + self.at_act_params = at_act_params + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + gen_ctrl = self.gen_ctrl.ctrl_name + else: + gen_ctrl = "0" + + gen_at_linear_relu(code_block, self.cname, self.bias_q.bits//8, self.mulbiases_q.bits//8, + self.in_dim, self.out_dim, + self.at_linear_params.LinearOper, + self.at_act_params.ReLUOper, + at_ver=self.at_ver, gen_ctrl=gen_ctrl) + + return code_block diff --git a/tools/nntool/generation/generators/kernels/mult8/mat_vect_mult_kernels_generator.py 
b/tools/nntool/generation/generators/kernels/mult8/mat_vect_mult_kernels_generator.py new file mode 100644 index 000000000..90e135a3e --- /dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/mat_vect_mult_kernels_generator.py @@ -0,0 +1,86 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_types.at_params import (NO_ACTIVATION, gen_active_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import MatrixMulParameters, ActivationFusion + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + +MAT_VECT_MUL_OPER = "KOP_MATVECTMUL" + +@generation_function("kernels", (MatrixMulParameters, ActivationFusion), qrec_types=(QREC_MULT8, )) +def mat_vect_mult_kernel_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams, qrec + if isinstance(node, ActivationFusion): + cnodes = node.contained_nodes() + if isinstance(cnodes[0], MatrixMulParameters): + gen.kernels.append(MatVectMulKernel(node.name, cname, cnodes[0], cnodes[1], at_ver=gen.opts['at_ver'])) + return True + return False + gen.kernels.append(MatVectMulKernel(node.name, cname, node, None, at_ver=gen.opts['at_ver'])) + return True + +def gen_mat_vect_mul_sq8(code_block, cname, ctrl, feat, width, height, act_oper): + code_block.write('CNN_TensorVectMultAct_SQ8("{}", {}, {}, {}, {}, {}, {});'.format(cname, ctrl, + feat, width, + height, + MAT_VECT_MUL_OPER, + act_oper)) + +class MatVectMulKernel(AutotilerKernel): + def __init__(self, node_name, cname, tens_vect_mul_params, act_params, at_ver=3, gen_ctrl=None): + if gen_ctrl is None: + self.gen_ctrl = gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + if act_params is not None: + self.at_act_params = gen_active_at_params(act_params, force_relu=True) + else: + self.at_act_params = NO_ACTIVATION + + self.tens_vect_mul_params = tens_vect_mul_params + dimensions = tens_vect_mul_params.in_dims[0] + self.feat_dim = dimensions[0] + self.width = dimensions[1] + self.height = dimensions[2] + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + gen_ctrl = self.gen_ctrl.ctrl_name + else: + gen_ctrl = "0" + + gen_mat_vect_mul_sq8(code_block, self.cname, gen_ctrl, self.feat_dim, + self.width, self.height, self.at_act_params.ReLUOper) + + return code_block diff --git a/tools/nntool/generation/generators/kernels/mult8/matadd_kernels_generator.py 
b/tools/nntool/generation/generators/kernels/mult8/matadd_kernels_generator.py new file mode 100644 index 000000000..27500412a --- /dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/matadd_kernels_generator.py @@ -0,0 +1,86 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_types.at_params import (NO_ACTIVATION, gen_active_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import MatrixAddParameters, ActivationFusion + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + +MAT_ADD_OPER = "KOP_MATADD" + +@generation_function("kernels", (MatrixAddParameters, ActivationFusion), qrec_types=(QREC_MULT8, )) +def matadd_kernel_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams, qrec + if isinstance(node, ActivationFusion): + cnodes = node.contained_nodes() + if isinstance(cnodes[0], MatrixAddParameters): + gen.kernels.append(MatAddKernel(node.name, cname, cnodes[0], cnodes[1], at_ver=gen.opts['at_ver'])) + return True + return False + gen.kernels.append(MatAddKernel(node.name, cname, node, None, at_ver=gen.opts['at_ver'])) + return True + +def gen_mat_add_sq8(code_block, cname, ctrl, feat, width, height, act_oper): + code_block.write('CNN_MatAddAct_SQ8("{}", {}, {}, {}, {}, {}, {});'.format(cname, ctrl, + feat, width, + height, + MAT_ADD_OPER, + act_oper)) + +class MatAddKernel(AutotilerKernel): + def __init__(self, node_name, cname, matrixadd_params, act_params, at_ver=3, gen_ctrl=None): + if gen_ctrl is None: + self.gen_ctrl = gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + if act_params is not None: + self.at_act_params = gen_active_at_params(act_params, force_relu=True) + else: + self.at_act_params = NO_ACTIVATION + + self.matrixadd_params = matrixadd_params + dimensions = matrixadd_params.in_dims[0] + self.feat_dim = dimensions[0] + self.width = dimensions[1] + self.height = dimensions[2] + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + gen_ctrl = self.gen_ctrl.ctrl_name + else: + gen_ctrl = "0" + + gen_mat_add_sq8(code_block, self.cname, gen_ctrl, self.feat_dim, + self.width, self.height, self.at_act_params.ReLUOper) + + return code_block diff --git a/tools/nntool/generation/generators/kernels/mult8/pool_relu_kernels_generator.py b/tools/nntool/generation/generators/kernels/mult8/pool_relu_kernels_generator.py new file mode 100644 index 000000000..0c4c9ed91 --- 
/dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/pool_relu_kernels_generator.py @@ -0,0 +1,93 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_types.at_params import (gen_pool_at_params, gen_active_at_params, NO_ACTIVATION) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import PoolingParameters, ActivationFusion + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (PoolingParameters, ActivationFusion), qrec_types=(QREC_MULT8, )) +def pool_act_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams, qrec + if isinstance(node, ActivationFusion): + cnodes = node.contained_nodes() + if isinstance(cnodes[0], PoolingParameters): + gen.kernels.append(PoolKernel(node.name, cname, cnodes[0], cnodes[1], at_ver=gen.opts['at_ver'])) + return True + return False + gen.kernels.append(PoolKernel(node.name, cname, node, None, at_ver=gen.opts['at_ver'])) + return True + + +def gen_cnn_pool_act_sq8(code_block, cname, ctrl, feat, width, height, at_pool_params, actoper): + code_block.write('CNN_PoolAct_SQ8("{}", {}, {}, {}, {},'.format(cname, ctrl, feat, width, height)) + code_block.indent() + code_block.write('{}, {}, {}, {}, {}, {}, {}, {}, {});'.format(at_pool_params.PoolOper, + at_pool_params.Fpx, + at_pool_params.Fpy, + at_pool_params.Dpx, + at_pool_params.Dpy, + at_pool_params.Spx, + at_pool_params.Spy, + at_pool_params.PoolPad, + actoper)) + code_block.deindent() + + +class PoolKernel(AutotilerKernel): + def __init__(self, node_name, cname, pool_params, act_params, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + if act_params is not None: + self.at_act_params = gen_active_at_params(act_params, force_relu=True) + else: + self.at_act_params = NO_ACTIVATION + + pad_compatibilities = [] + self.at_pool_params = gen_pool_at_params(pool_params, pad_compatibilities) + self.in_dim = pool_params.in_dims[0] + self.out_dim = pool_params.out_dims[0] + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + gen_ctrl = self.gen_ctrl.ctrl_name + else: + gen_ctrl = "0" + + gen_cnn_pool_act_sq8(code_block, self.cname, gen_ctrl, self.in_dim.c, + self.in_dim.w, self.in_dim.h, + self.at_pool_params, + self.at_act_params.ReLUOper) + return code_block diff --git 
a/tools/nntool/generation/generators/kernels/mult8/softmax_kernels_generator.py b/tools/nntool/generation/generators/kernels/mult8/softmax_kernels_generator.py new file mode 100644 index 000000000..0277852ed --- /dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/softmax_kernels_generator.py @@ -0,0 +1,79 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_types.at_params import gen_softmax_at_params +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import SoftMaxParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + +GEN_SOFTMAX = "CNN_SoftMax_SQ8" +# extern void CNN_SoftMax( +# char *Name, +# CNN_GenControl_T *Ctrl, +# int Dim, +# KernelOper_T SoftMaxOper +# ); + +def gen_at_softmax(code_block, name, in_dim, at_softmax_params, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('{}("{}", {}, {}, {});', + GEN_SOFTMAX, name, gen_ctrl, + in_dim.size(), at_softmax_params) + + +@generation_function("kernels", (SoftMaxParameters, ), qrec_types=(QREC_MULT8, )) +def softmax_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + gen.kernels.append(SoftmaxKernel(cname, node, qrec, at_ver=gen.opts['at_ver'])) + return True + + +class SoftmaxKernel(AutotilerKernel): + def __init__(self, cname, params, qrec, gen_ctrl=None, at_ver=3): + del qrec + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.at_softmax_params = gen_softmax_at_params(params) + self.in_dim = params.in_dims[0] + self.cname = cname + self.node_name = params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_softmax(code_block, self.cname, self.in_dim, + self.at_softmax_params.SoftMaxOper, at_ver=self.at_ver) + return code_block diff --git a/tools/nntool/generation/generators/kernels/mult8/three_d_transpose_kernels_generator.py b/tools/nntool/generation/generators/kernels/mult8/three_d_transpose_kernels_generator.py new file mode 100644 index 000000000..d22c74c4e --- /dev/null +++ b/tools/nntool/generation/generators/kernels/mult8/three_d_transpose_kernels_generator.py @@ -0,0 +1,152 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software 
Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_MULT8 +from graph.types import TransposeParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (TransposeParameters, ), qrec_types=(QREC_MULT8, )) +def three_d_transpose_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + LOG.info("generating for transpose in %s out %s trans %s", + node.in_dims[0], node.out_dims[0], node.transpose_in) + real_in_shape, real_transpose = node.real_shape() + if len(real_transpose) <= 1: + return True + if len(real_transpose) == 2: + gen.kernels.append(TwoDTransposeKernelSq8(cname, node, real_in_shape, + real_transpose, qrec, at_ver=gen.opts['at_ver'])) + elif len(real_transpose) == 3: + gen.kernels.append(ThreeDTransposeKernelSq8(cname, node, real_in_shape, + real_transpose, qrec, at_ver=gen.opts['at_ver'])) + else: + raise NotImplementedError("only 2D or 3D transposes are currently supported") + return True + +# int CNN_MatTranspose_SQ8( +# char *Name, + +# CNN_GenControl_T *Ctrl, + +# int Feat, +# int Width, +# int Height, + +# KernelOper_T MatTransOper +# ) + + +def gen_at_2d_transpose(code_block, name, + in_shape, gen_ctrl=None, + at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('CNN_MatTranspose_SQ8("{}", {}, 1, {}, {}, KOP_MATTRANSP);', + name, gen_ctrl, in_shape[1], in_shape[0]) + + +class TwoDTransposeKernelSq8(AutotilerKernel): + def __init__(self, cname, params, real_in_shape, real_transpose, qrec, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.in_q = qrec.in_qs[0] + self.out_q = qrec.out_qs[0] + self.in_shape = real_in_shape + self.in_dim = params.in_dims[0] + self.out_dim = params.out_dims[0] + self.real_transpose = real_transpose + self.cname = cname + self.node_name = params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + code_block.comment("transpose from {} to {} ({})", self.in_dim, + self.out_dim, self.real_transpose) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_2d_transpose(code_block, self.cname, + self.in_shape) + return code_block + + +def gen_at_3d_transpose(code_block, name, + in_shape, permop, gen_ctrl=None, + at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('CNN_3DTensorPermute_SQ8("{}", {}, {}, {}, {}, {});', + name, gen_ctrl, in_shape[0], in_shape[2], in_shape[1], + permop) + + +class ThreeDTransposeKernelSq8(AutotilerKernel): + def __init__(self, cname, params, real_in_shape, 
real_transpose, qrec, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.in_shape = real_in_shape + dim_names = ['C', 'H', 'W'] + perm = [dim_names[i] for i in real_transpose] + self.permop = "KOP_MATPERM_CHW2{}".format("".join(perm)) + self.real_transpose = real_transpose + + self.in_q = qrec.in_qs[0] + self.out_q = qrec.out_qs[0] + self.in_dim = params.in_dims[0] + self.out_dim = params.out_dims[0] + self.cname = cname + self.node_name = params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + code_block.comment("transpose from {} to {} ({})", self.in_dim, + self.out_dim, self.real_transpose) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_3d_transpose(code_block, self.cname, + self.in_shape, self.permop) + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/__init__.py b/tools/nntool/generation/generators/kernels/pow2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py new file mode 100644 index 000000000..640eaafff --- /dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py @@ -0,0 +1,214 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_generators import (NO_ACTIVATION, NO_CONV, NO_POOL, + ConvATParam, GroupedConvATParam, + gen_active_at_params, + gen_at_conv_pool_relu, + gen_at_grouped_conv_pool_relu, + gen_at_grouped_mulconv_pool_relu, + gen_at_mulconv_pool_relu, + gen_at_pool_relu, gen_conv_at_params, + gen_pool_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.dim import PadDim +from graph.types import (ActivationParameters, Conv2DParameters, + ConvFusionParameters, PoolingParameters) +from utils.node_id import NodeId + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." 
+ __name__) + + +@generation_function("kernels", + (Conv2DParameters, + ConvFusionParameters, + PoolingParameters, + ActivationParameters), + qrec_types=(QREC_POW2,)) +def conv_pool_relu_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname) -> bool: + del in_eparams, out_eparams + if isinstance(node, Conv2DParameters): + gen.kernels.append(ConvPoolReluKernel(node.name, cname, node, qrec, None, + None, None, None, at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif isinstance(node, PoolingParameters): + gen.kernels.append(ConvPoolReluKernel(node.name, cname, None, None, + node, qrec, None, None, at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif isinstance(node, ActivationParameters): + # self.set_in_out_bindings(in_eparams, out_eparams, cname, node, qrec) + gen.kernels.append(ConvPoolReluKernel(node.name, cname, None, None, + None, None, node, qrec, at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif isinstance(node, ConvFusionParameters): + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + if node.fusion_type == "conv_active_pool": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], cnodes[2], quants[2], + cnodes[1], quants[1], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif node.fusion_type == "conv_pool_active": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], cnodes[1], quants[1], + cnodes[2], quants[2], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif node.fusion_type == "conv_active": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], None, None, cnodes[1], + quants[1], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + elif node.fusion_type == "conv_pool": + gen.kernels.append(ConvPoolReluKernel(node.name, cname, cnodes[0], quants[0], cnodes[1], quants[1], None, + None, at_ver=gen.opts['at_ver'], gen_ctrl=node.get_gen_ctrl())) + else: + return False + else: + return False + return True + + +class ConvPoolReluKernel(AutotilerKernel): + def __init__(self, node_name, cname, conv_params, conv_q, + pool_params, pool_q, act_params, act_q, at_ver=3, gen_ctrl=None): + if gen_ctrl is None: + self.gen_ctrl = gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + in_q = filter_q = out_q = bias_q = mul_biases_q = None + in_dim = out_dim = None + pad_compatibilities = [] + if conv_params is not None: + at_conv_params = gen_conv_at_params(conv_params, conv_q, pad_compatibilities) + in_dim = conv_params.in_dims[0] + out_dim = conv_params.out_dims[0] + filter_q = conv_q.weights_q + in_q = conv_q.in_qs[0] + out_q = conv_q.out_qs[0] + bias_q = conv_q.biases_q + if conv_params.has_mul_bias: + mul_biases_q = conv_q.mul_biases_q + else: + at_conv_params = NO_CONV + + if pool_params is not None: + at_pool_params = gen_pool_at_params(pool_params, pad_compatibilities) + if in_dim is None: + in_dim = pool_params.in_dims[0] + out_dim = pool_params.out_dims[0] + if in_q is None: + in_q = pool_q.in_qs[0] + out_q = pool_q.out_qs[0] + else: + at_pool_params = NO_POOL + + if act_params is not None: + at_act_params = gen_active_at_params(act_params) + if in_dim is None: + in_dim = act_params.in_dims[0] + if out_dim is None: + out_dim = act_params.out_dims[0] + if in_q is None: + in_q = act_q.in_qs[0] + out_q = act_q.out_qs[0] + if at_ver < 3: + if act_params.activation == "relu6" and out_q.q != 0: + self.gen_ctrl.ReluN = 
6 << out_q.q + self.gen_ctrl.ReluNNoNorm = 1 + else: + if act_params.activation == "relun": + self.gen_ctrl.ReluN = act_params.activation_params + + else: + at_act_params = NO_ACTIVATION + + if pad_compatibilities: + reduction = PadDim.pad_compatibility_reduce(*pad_compatibilities, + "convolution padding is not compatible with pool padding") + if not reduction[2]: # default is balanced pad left + at_pad_ctrl = next(i for i, v in enumerate(reduction) if v) + LOG.debug("%s: generating pad control block", node_name) + self.gen_ctrl.PadType = at_pad_ctrl + self.in_dim = in_dim + self.out_dim = out_dim + self.in_q = in_q + self.bias_q = bias_q + self.out_q = out_q + self.filter_q = filter_q + self.mul_biases_q = mul_biases_q + self.at_act_params = at_act_params + self.at_pool_params = at_pool_params + self.at_conv_params = at_conv_params + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + if self.at_conv_params == NO_CONV: + if self.in_q.bits != self.out_q.bits: + raise NotImplementedError("only homogenious operations are supported at present") + LOG.debug("%s: pool relu inq %s outq %s control block", + self.node_name, self.in_q, self.out_q) + gen_at_pool_relu(code_block, self.cname, self.in_q, self.out_q, + self.in_dim, self.out_dim, self.at_pool_params, self.at_act_params, gen_ctrl=self.gen_ctrl, + at_ver=self.at_ver) + else: + if isinstance(self.at_conv_params, ConvATParam): + if self.mul_biases_q is not None: + LOG.debug("%s: mulconv pool relu inq %s outq %s control block", + self.node_name, self.in_q, self.out_q) + gen_at_mulconv_pool_relu(code_block, self.cname, self.in_q, self.out_q, + self.filter_q, self.bias_q, self.mul_biases_q, + self.in_dim, self.out_dim, self.at_conv_params, self.at_pool_params, + self.at_act_params, gen_ctrl=self.gen_ctrl, at_ver=self.at_ver) + else: + LOG.debug("%s: conv pool relu inq %s outq %s control block", + self.node_name, self.in_q, self.out_q) + gen_at_conv_pool_relu(code_block, self.cname, self.in_q, self.out_q, + self.filter_q, self.bias_q, + self.in_dim, self.out_dim, self.at_conv_params, self.at_pool_params, + self.at_act_params, gen_ctrl=self.gen_ctrl, at_ver=self.at_ver) + elif isinstance(self.at_conv_params, GroupedConvATParam): + if self.mul_biases_q is not None: + LOG.debug("%s: grouped conv pool relu inq %s outq %s control block", + self.node_name, self.in_q, self.out_q) + gen_at_grouped_mulconv_pool_relu(code_block, self.cname, self.in_q, self.out_q, + self.filter_q, self.bias_q, self.mul_biases_q, + self.in_dim, self.out_dim, self.at_conv_params, + self.at_pool_params, + self.at_act_params, gen_ctrl=self.gen_ctrl, at_ver=self.at_ver) + else: + LOG.debug("%s: grouped mulconv pool relu inq %s outq %s control block", + self.node_name, self.in_q, self.out_q) + gen_at_grouped_conv_pool_relu(code_block, self.cname, self.in_q, self.out_q, + self.filter_q, self.bias_q, + self.in_dim, self.out_dim, self.at_conv_params, self.at_pool_params, + self.at_act_params, gen_ctrl=self.gen_ctrl, at_ver=self.at_ver) + else: + raise ValueError('Internal error') + + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/global_pool_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/global_pool_kernels_generator.py new file mode 100644 index 000000000..93a413f7f --- 
/dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/global_pool_kernels_generator.py @@ -0,0 +1,66 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_generators import (gen_globalpool_at_params, gen_at_globalpool) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import GlobalPoolParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", + (GlobalPoolParameters, ), + qrec_types=(QREC_POW2, )) +def global_pool_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + gen.kernels.append(GlobalPoolKernel(node.name, cname, node, qrec, at_ver=gen.opts['at_ver'])) + return True + + +class GlobalPoolKernel(AutotilerKernel): + def __init__(self, node_name, cname, params, qrec, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.at_globalpool_params = gen_globalpool_at_params(params) + self.in_dim = params.in_dims[0] + self.out_dim = params.out_dims[0] + self.in_q = qrec.in_qs[0] + self.out_q = qrec.out_qs[0] + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_globalpool(code_block, self.cname, self.in_q, self.out_q, + self.in_dim, self.out_dim, self.at_globalpool_params, + at_ver=self.at_ver) + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/linear_relu_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/linear_relu_kernels_generator.py new file mode 100644 index 000000000..fdd44d318 --- /dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/linear_relu_kernels_generator.py @@ -0,0 +1,103 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
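Note: the generators above, and every other pow2 kernel generator added by this patch, follow the same plug-in pattern: a module-level function decorated with @generation_function registers itself for a tuple of graph parameter classes and quantization record types, appends an AutotilerKernel subclass to gen.kernels and returns True when it has handled the node (False otherwise). The sketch below only illustrates that pattern; MyOpParameters, MyOpKernel and gen_at_myop are hypothetical placeholders for a node class, its kernel wrapper and an AutoTiler call writer, and are not part of this patch.

    import logging

    from generation.at_types.gen_ctrl import GenCtrl
    from generation.code_block import CodeBlock
    from generation.generators.generator_decorators import (QREC_POW2,
                                                             generation_function)

    from ..autotiler_kernel import AutotilerKernel

    LOG = logging.getLogger("nntool." + __name__)


    # MyOpParameters is a placeholder for a real graph.types parameters class
    @generation_function("kernels", (MyOpParameters, ), qrec_types=(QREC_POW2, ))
    def myop_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname):
        del in_eparams, out_eparams  # not needed to emit this kernel call
        gen.kernels.append(MyOpKernel(cname, node, qrec, at_ver=gen.opts['at_ver']))
        return True  # node handled


    class MyOpKernel(AutotilerKernel):
        def __init__(self, cname, params, qrec, gen_ctrl=None, at_ver=3):
            # create a default GenCtrl when the node carries no generator controls
            if gen_ctrl is None:
                gen_ctrl = GenCtrl(None, cname=cname)
            else:
                gen_ctrl.cname = cname
            self.gen_ctrl = gen_ctrl
            self.in_q, self.out_q = qrec.in_qs[0], qrec.out_qs[0]
            self.in_dim, self.out_dim = params.in_dims[0], params.out_dims[0]
            self.cname, self.node_name, self.at_ver = cname, params.name, at_ver

        def code(self, code_block=None):
            if code_block is None:
                code_block = CodeBlock()
            code_block.comment("generator for {}", self.node_name)
            if not self.gen_ctrl.is_unmodified:
                self.gen_ctrl.gen_ctrl_decl(code_block)
            # gen_at_myop stands in for the gen_at_* helpers from
            # generation.at_generators that write the AutoTiler user-kernel call
            gen_at_myop(code_block, self.cname, self.in_q, self.out_q,
                        self.in_dim, self.out_dim, at_ver=self.at_ver)
            return code_block
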
+ +import logging + +from generation.at_generators import (NO_ACTIVATION, + gen_active_at_params, gen_linear_at_params, gen_at_linear_relu) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import (FcParameters, ConvFusionParameters) +from utils.node_id import NodeId + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (ConvFusionParameters, FcParameters), qrec_types=(QREC_POW2, )) +def linear_relu_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + if isinstance(node, FcParameters): + gen.kernels.append(LinearReluKernel(node.name, cname, node, qrec, None, None, + at_ver=gen.opts['at_ver'], gen_ctrl=node.get_gen_ctrl())) + elif isinstance(node, ConvFusionParameters) and node.fusion_type == "linear_active": + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + gen.kernels.append(LinearReluKernel(node.name, cname, cnodes[0], quants[0], + cnodes[1], quants[1], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + else: + return False + return True + + +class LinearReluKernel(AutotilerKernel): + def __init__(self, node_name, cname, linear_params, linear_q, act_params, act_q, at_ver=3, gen_ctrl=None): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + assert linear_params is not None, "linear should always be included" + at_linear_params = gen_linear_at_params(linear_params) + in_dim = linear_params.in_dims[0] + out_dim = linear_params.out_dims[0] + filter_q = linear_q.weights_q + in_q = linear_q.in_qs[0] + out_q = linear_q.out_qs[0] + bias_q = linear_q.biases_q + + if act_params is not None: + at_act_params = gen_active_at_params(act_params) + out_q = act_q.out_qs[0] + if at_ver < 3: + if act_params.activation == "relu6" and out_q.q != 0: + self.gen_ctrl.ReluN = 6 << out_q.q + self.gen_ctrl.ReluNNoNorm = 1 + else: + if act_params.activation == "relun": + self.gen_ctrl.ReluN = act_params.activation_params + else: + at_act_params = NO_ACTIVATION + + self.at_linear_params = at_linear_params + self.in_dim = in_dim + self.out_dim = out_dim + self.in_q = in_q + self.bias_q = bias_q + self.out_q = out_q + self.filter_q = filter_q + self.at_act_params = at_act_params + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_linear_relu(code_block, self.cname, self.in_q, self.out_q, + self.filter_q, self.bias_q, + self.in_dim, self.out_dim, self.at_linear_params, self.at_act_params, + at_ver=self.at_ver, gen_ctrl=self.gen_ctrl) + + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/matadd_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/matadd_kernels_generator.py new file mode 100644 index 000000000..1267fd8ab --- /dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/matadd_kernels_generator.py @@ -0,0 +1,147 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero 
General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_generators import (NO_ACTIVATION, gen_active_at_params, + gen_at_matrixadd, gen_at_matrixadddyn, + gen_matrixadd_at_params, + gen_matrixadddyn_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import MatrixAddParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (MatrixAddParameters, ), qrec_types=(QREC_POW2, )) +def matadd_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + if qrec.in_qs[0].q == qrec.in_qs[1].q and qrec.in_qs[0].q == qrec.out_qs[0].q: + gen.kernels.append(MatrixAddKernel(cname, node, qrec, None, None, at_ver=gen.opts['at_ver'])) + else: + gen.kernels.append(MatrixAddDynKernel(cname, node, qrec, None, None, at_ver=gen.opts['at_ver'])) + return True + + +class MatrixAddKernel(AutotilerKernel): + def __init__(self, cname, matrixadd_params, matrixadd_q, act_params, act_q, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + at_matrixadd_params = gen_matrixadd_at_params(matrixadd_params) + in_dim = matrixadd_params.in_dims[0] + out_dim = matrixadd_params.out_dims[0] + in_q1 = matrixadd_q.in_qs[0] + in_q2 = matrixadd_q.in_qs[1] + out_q = matrixadd_q.out_qs[0] + + if act_params is not None: + at_act_params = gen_active_at_params(act_params) + out_q = act_q.out_qs[0] + if at_ver < 3: + if act_params.activation == "relu6" and out_q.q != 0: + self.gen_ctrl.ReluN = 6 << out_q.q + self.gen_ctrl.ReluNNoNorm = 1 + else: + if act_params.activation == "relun": + self.gen_ctrl.ReluN = act_params.activation_params + else: + at_act_params = NO_ACTIVATION + + self.at_matrixadd_params = at_matrixadd_params + self.in_dim = in_dim + self.out_dim = out_dim + self.in_q1 = in_q1 + self.in_q2 = in_q2 + self.out_q = out_q + self.at_act_params = at_act_params + self.cname = cname + self.node_name = matrixadd_params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_matrixadd(code_block, self.cname, self.in_q1, self.in_q2, self.out_q, + self.in_dim, self.out_dim, self.at_matrixadd_params, + at_ver=self.at_ver, gen_ctrl=self.gen_ctrl) + + return code_block + + +class MatrixAddDynKernel(AutotilerKernel): + def __init__(self, cname, matrixadd_params, matrixadd_q, act_params, act_q, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + self.gen_ctrl.cname = cname + + at_matrixadd_params = gen_matrixadddyn_at_params(matrixadd_params) + in_dim = matrixadd_params.in_dims[0] + out_dim = 
matrixadd_params.out_dims[0] + in_q1 = matrixadd_q.in_qs[0] + in_q2 = matrixadd_q.in_qs[1] + out_q = matrixadd_q.out_qs[0] + + if act_params is not None: + at_act_params = gen_active_at_params(act_params) + out_q = act_q.out_qs[0] + if at_ver < 3: + if act_params.activation == "relu6" and out_q.q != 0: + self.gen_ctrl.ReluN = 6 << out_q.q + self.gen_ctrl.ReluNNoNorm = 1 + else: + if act_params.activation == "relun": + self.gen_ctrl.ReluN = act_params.activation_params + else: + at_act_params = NO_ACTIVATION + + self.at_matrixadd_params = at_matrixadd_params + self.in_dim = in_dim + self.out_dim = out_dim + self.in_q1 = in_q1 + self.in_q2 = in_q2 + self.out_q = out_q + self.at_act_params = at_act_params + self.cname = cname + self.node_name = matrixadd_params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_matrixadddyn(code_block, self.cname, self.in_q1, self.in_q2, self.out_q, + self.in_dim, self.out_dim, self.at_matrixadd_params, + gen_ctrl=self.gen_ctrl) + + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/matscale_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/matscale_kernels_generator.py new file mode 100644 index 000000000..471d81fa7 --- /dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/matscale_kernels_generator.py @@ -0,0 +1,85 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_generators import (gen_matscale_at_params, gen_at_matscale) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import MatScaleFusionParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." 
+ __name__) + + +@generation_function("kernels", (MatScaleFusionParameters, ), qrec_types=(QREC_POW2, )) +def matscale_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + gen.kernels.append(MatrixScaleKernel(cname, node, qrec, at_ver=gen.opts['at_ver'])) + return True + + +class MatrixScaleKernel(AutotilerKernel): + def __init__(self, cname, params, qrec, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + at_matscale_params = gen_matscale_at_params(params) + in_dim = params.in_dims[0] + out_dim = params.out_dims[0] + assert in_dim.shape[0] == out_dim.shape[0] + if params.fusion_type == "vec_scalar": + otherq = qrec.in_qs[0] + vectorq = qrec.in_qs[1] + scalarq = qrec.in_qs[2] + elif params.fusion_type == "vector": + otherq = qrec.in_qs[1] + vectorq = qrec.in_qs[2] + scalarq = None + elif params.fusion_type == "scalar": + otherq = qrec.in_qs[0] + vectorq = None + scalarq = qrec.in_qs[1] + else: + raise NotImplementedError("unknown fusion type %s" % params.fusion_type) + + self.at_matscale_params = at_matscale_params + self.in_dim = in_dim + self.out_dim = out_dim + self.otherq = otherq + self.vectorq = vectorq + self.scalarq = scalarq + self.out_q = qrec.out_qs[0] + self.cname = cname + self.node_name = params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_matscale(code_block, self.cname, self.otherq, self.vectorq, self.scalarq, self.out_q, + self.in_dim, self.out_dim, self.at_matscale_params) + + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/pool_relu_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/pool_relu_kernels_generator.py new file mode 100644 index 000000000..206ac18e6 --- /dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/pool_relu_kernels_generator.py @@ -0,0 +1,128 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_generators import (NO_ACTIVATION, NO_POOL, + gen_active_at_params, gen_at_pool_relu, + gen_pool_at_params) +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.dim import PadDim +from graph.types import ConvFusionParameters +from utils.node_id import NodeId + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." 
+ __name__) + + +@generation_function("kernels", (ConvFusionParameters,), qrec_types=(QREC_POW2, )) +def pool_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams, qrec + if isinstance(node, ConvFusionParameters) and node.fusion_type == "pool_active": + cnodes = node.contained_nodes() + quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] + gen.kernels.append(PoolReluKernel(node.name, cname, cnodes[0], quants[0], + cnodes[1], quants[1], at_ver=gen.opts['at_ver'], + gen_ctrl=node.get_gen_ctrl())) + return True + return False + + +class PoolReluKernel(AutotilerKernel): + def __init__(self, node_name, cname, pool_params, pool_q, + act_params, act_q, code_block=None, at_ver=3, gen_ctrl=None): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + in_q = out_q = None + in_dim = out_dim = None + pad_compatibilities = [] + + if pool_params is not None: + at_pool_params = gen_pool_at_params(pool_params, pad_compatibilities) + if in_dim is None: + in_dim = pool_params.in_dims[0] + out_dim = pool_params.out_dims[0] + if in_q is None: + in_q = pool_q.in_qs[0] + out_q = pool_q.out_qs[0] + else: + at_pool_params = NO_POOL + + if act_params is not None: + at_act_params = gen_active_at_params(act_params) + if in_dim is None: + in_dim = act_params.in_dims[0] + if out_dim is None: + out_dim = act_params.out_dims[0] + if in_q is None: + in_q = act_q.in_qs[0] + out_q = act_q.out_qs[0] + if at_ver < 3: + if act_params.activation == "relu6" and out_q.q != 0: + self.gen_ctrl.ReluN = 6 << out_q.q + self.gen_ctrl.ReluNNoNorm = 1 + else: + if act_params.activation == "relun": + self.gen_ctrl.ReluN = act_params.activation_params + else: + at_act_params = NO_ACTIVATION + + if code_block is None: + code_block = CodeBlock() + + if pad_compatibilities: + reduction = PadDim.pad_compatibility_reduce(*pad_compatibilities, + "convolution padding is not compatible with pool padding") + if not reduction[2]: # default is balanced pad left + at_pad_ctrl = next(i for i, v in enumerate(reduction) if v) + self.gen_ctrl.PadType = at_pad_ctrl + + if in_q.bits != out_q.bits: + raise NotImplementedError("only homogenious operations are supported at present") + if at_pool_params == NO_POOL: + raise NotImplementedError( + "activation layer on its own should not be matched by this kernel") + + self.at_pool_params = at_pool_params + self.in_dim = in_dim + self.out_dim = out_dim + self.in_q = in_q + self.out_q = out_q + self.at_act_params = at_act_params + self.cname = cname + self.node_name = node_name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_pool_relu(code_block, self.cname, self.in_q, self.out_q, + self.in_dim, self.out_dim, self.at_pool_params, + self.at_act_params, gen_ctrl=self.gen_ctrl, + at_ver=self.at_ver) + + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/softmax_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/softmax_kernels_generator.py new file mode 100644 index 000000000..7e7ebaf5d --- /dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/softmax_kernels_generator.py @@ -0,0 +1,63 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or 
modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from generation.at_generators import gen_at_softmax +from generation.at_types.at_params import gen_softmax_at_params +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import generation_function, QREC_POW2 +from graph.types import SoftMaxParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (SoftMaxParameters, ), qrec_types=(QREC_POW2, )) +def softmax_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + gen.kernels.append(SoftmaxKernel(cname, node, qrec, at_ver=gen.opts['at_ver'])) + return True + + +class SoftmaxKernel(AutotilerKernel): + def __init__(self, cname, params, qrec, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.at_softmax_params = gen_softmax_at_params(params) + self.in_dim = params.in_dims[0] + self.in_q = qrec.in_qs[0] + self.out_q = qrec.out_qs[0] + self.cname = cname + self.node_name = params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_softmax(code_block, self.cname, self.in_q, self.out_q, + self.in_dim, self.at_softmax_params, at_ver=self.at_ver) + return code_block diff --git a/tools/nntool/generation/generators/kernels/pow2/three_d_transpose_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/three_d_transpose_kernels_generator.py new file mode 100644 index 000000000..c32077347 --- /dev/null +++ b/tools/nntool/generation/generators/kernels/pow2/three_d_transpose_kernels_generator.py @@ -0,0 +1,144 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
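Note: several of the kernel wrappers above (ConvPoolReluKernel, LinearReluKernel, MatrixAddKernel and PoolReluKernel) map a relu6 activation onto the ReluN/ReluNNoNorm generator controls when at_ver < 3. The clamp threshold is written in the raw fixed-point format of the layer output, i.e. 6 shifted left by the output's number of fractional bits. A short worked example, assuming a hypothetical output QType with 12 fractional bits (out_q.q == 12):

    # assumption for illustration only: the layer output is Q12, so out_q.q == 12
    out_q_q = 12
    relu_n = 6 << out_q_q      # 6.0 expressed as a raw Q12 integer
    assert relu_n == 24576
    # the kernel classes above then set
    #   gen_ctrl.ReluN = 24576     clamp threshold in raw output units
    #   gen_ctrl.ReluNNoNorm = 1   the threshold is already in the output format,
    #                              so (per the flag name) no renormalisation applies
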
+ +import logging + +from generation.at_generators.utils import at_bits +from generation.at_types.gen_ctrl import GenCtrl +from generation.code_block import CodeBlock +from generation.generators.generator_decorators import (QREC_POW2, + generation_function) +from graph.types import TransposeParameters + +from ..autotiler_kernel import AutotilerKernel + +LOG = logging.getLogger("nntool." + __name__) + + +@generation_function("kernels", (TransposeParameters, ), qrec_types=(QREC_POW2, )) +def three_d_transpose_kernels_generator(gen, node, qrec, in_eparams, out_eparams, cname): + del in_eparams, out_eparams + real_in_shape, real_transpose = node.real_shape() + if len(real_transpose) <= 1: + return True + if len(real_transpose) == 2: + gen.kernels.append(TwoDTransposeKernelPow2(cname, node, real_in_shape, + real_transpose, qrec, + at_ver=gen.opts['at_ver'])) + elif len(real_transpose) == 3: + gen.kernels.append(ThreeDTransposeKernelPow2(cname, node, real_in_shape, + real_transpose, qrec, + at_ver=gen.opts['at_ver'])) + else: + raise NotImplementedError("only 2D or 3D transposes are currently supported") + return True + + +def gen_at_2d_transpose(code_block, name, in_q, out_q, + in_shape, gen_ctrl=None, + at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('CNN_MatTranspose("{}", {}, {}, {}, {}, {}, 1, 1, 1, {}, {}, KOP_MATTRANSP);', + name, gen_ctrl, at_bits(in_q), at_bits(out_q), + in_q.q, out_q.q, in_shape[1], in_shape[0]) + + +class TwoDTransposeKernelPow2(AutotilerKernel): + def __init__(self, cname, params, real_in_shape, real_transpose, qrec, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.in_q = qrec.in_qs[0] + self.out_q = qrec.out_qs[0] + self.in_shape = real_in_shape + self.in_dim = params.in_dims[0] + self.out_dim = params.out_dims[0] + self.real_transpose = real_transpose + self.cname = cname + self.node_name = params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + code_block.comment("transpose from {} to {} ({})", self.in_dim, + self.out_dim, self.real_transpose) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_2d_transpose(code_block, self.cname, self.in_q, self.out_q, + self.in_shape) + return code_block + + +def gen_at_3d_transpose(code_block, name, in_q, out_q, + in_shape, permop, gen_ctrl=None, + at_ver=3): + if gen_ctrl is None: + gen_ctrl = "0" + else: + raise NotImplementedError("genctrl is not yet implemented") + + code_block.write('CNN_3DTensorPermute("{}", {}, {}, {}, {}, {}, 1, 1, {}, {}, {}, {});', + name, gen_ctrl, at_bits(in_q), at_bits(out_q), + in_q.q, out_q.q, in_shape[0], in_shape[1], in_shape[2], + permop) + + +class ThreeDTransposeKernelPow2(AutotilerKernel): + def __init__(self, cname, params, real_in_shape, real_transpose, qrec, gen_ctrl=None, at_ver=3): + if gen_ctrl is None: + self.gen_ctrl = GenCtrl(None, cname=cname) + else: + gen_ctrl.cname = cname + self.gen_ctrl = gen_ctrl + + self.in_shape = real_in_shape + dim_names = ['C', 'H', 'W'] + perm = [dim_names[i] for i in real_transpose] + self.permop = "KOP_MATPERM_CHW2{}".format("".join(perm)) + self.real_transpose = real_transpose + + self.in_q = qrec.in_qs[0] + self.out_q = qrec.out_qs[0] + self.in_dim = params.in_dims[0] + 
self.out_dim = params.out_dims[0] + self.cname = cname + self.node_name = params.name + self.at_ver = at_ver + + def code(self, code_block=None): + if code_block is None: + code_block = CodeBlock() + + code_block.comment("generator for {}", self.node_name) + code_block.comment("transpose from {} to {} ({})", self.in_dim, + self.out_dim, self.real_transpose) + + if not self.gen_ctrl.is_unmodified: + self.gen_ctrl.gen_ctrl_decl(code_block) + + gen_at_3d_transpose(code_block, self.cname, self.in_q, self.out_q, + self.in_shape, self.permop) + return code_block diff --git a/tools/nntool/generation/name_cache.py b/tools/nntool/generation/name_cache.py new file mode 100644 index 000000000..45c6c8269 --- /dev/null +++ b/tools/nntool/generation/name_cache.py @@ -0,0 +1,41 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from typing import Any + +from graph.types import Parameters + + +class NameCache(): + def __init__(self): + self._cache = {} + + def set(self, node: Parameters, name: str, val: Any): + entry = self._cache.get(node) + if entry is None: + entry = {} + self._cache[node] = entry + entry[name] = val + + def get(self, node: Parameters, name: str) -> Any: + entry = self._cache.get(node) + if entry is not None: + entry = entry.get(name) + return entry + + def __getitem__(self, param: Parameters): + if param in self._cache: + return self._cache[param] + raise KeyError("%s not found"%param.name) diff --git a/tools/nntool/generation/naming_convension.py b/tools/nntool/generation/naming_convension.py index 325f8faf2..f73bc6614 100644 --- a/tools/nntool/generation/naming_convension.py +++ b/tools/nntool/generation/naming_convension.py @@ -16,7 +16,8 @@ from abc import ABC, abstractmethod from graph.types import (ConcatParameters, Conv2DParameters, FcParameters, SoftMaxParameters, ConvFusionParameters, PoolingParameters, - ActivationParameters) + ActivationParameters, MatrixAddParameters, ActivationFusion, + MatrixMulParameters, GlobalPoolParameters) class NamingConvension(ABC): @@ -46,7 +47,7 @@ def get_project_name(self): return self.G.name def get_global_name(self, name, step_idx, params, gtype): - return "Step{}{}".format(step_idx, gtype.capitalize()) + return "S{}_{}".format(step_idx, gtype.capitalize()) # pylint: disable=too-many-return-statements def get_node_name(self, node_name, step_idx, params): @@ -81,10 +82,28 @@ def get_node_name(self, node_name, step_idx, params): if isinstance(params, PoolingParameters): return "S{}_{}Pool_{}".format(step_idx, params.pool_type.capitalize(), params.filter) if isinstance(params, ActivationParameters): - return "S{}_{}".format(step_idx, params.activation.capitalize()) - return node_name - - def get_edge_name(self, node_name, step_idx, edge_type, edge_order=None): + return "S{}_Act_{}".format(step_idx, params.activation.capitalize()) + if isinstance(params, MatrixAddParameters): + return "S{}_MatAdd_{}".format(step_idx, 
str(params.out_dims[0])) + if isinstance(params, MatrixMulParameters): + return "S{}_MatMul_{}".format(step_idx, str(params.out_dims[0])) + if isinstance(params, ActivationFusion): + nodes = params.contained_nodes() + if isinstance(nodes[0], MatrixAddParameters): + return "S{}_MatAdd_{}_{}".format(step_idx, str(nodes[0].out_dims[0]), + nodes[1].activation.capitalize()) + if isinstance(nodes[0], (PoolingParameters)): + return "S{}_{}Pool_{}_{}".format(step_idx, nodes[0].pool_type.capitalize(), + nodes[0].filter, nodes[1].activation.capitalize()) + if isinstance(nodes[0], (GlobalPoolParameters)): + return "S{}_{}Pool_{}_{}".format(step_idx, nodes[0].pool_type.capitalize(), + nodes[0].out_dims[0], nodes[1].activation.capitalize()) + if isinstance(nodes[0], MatrixMulParameters): + return "S{}_MatMul_{}_{}".format(step_idx, str(nodes[0].out_dims[0]), + nodes[1].activation.capitalize()) + return "S{}_Op_{}".format(step_idx, node_name) + + def get_edge_name(self, node_name, step_idx, edge_type, edge_order=None, edge_params=None): if edge_type == "in": return node_name.capitalize() if edge_type == "out": @@ -92,7 +111,7 @@ def get_edge_name(self, node_name, step_idx, edge_type, edge_order=None): return self.G.out_edges(node_name)[0].to_node.name.capitalize() return node_name.capitalize() if edge_type == "in_out": - ename = "OutputStep{}".format(step_idx) + ename = "S{}_Output".format(step_idx) return ename assert False, "unknown edge type" return None diff --git a/tools/nntool/generation/write_constants.py b/tools/nntool/generation/write_constants.py index eaa1534c9..3369dab90 100644 --- a/tools/nntool/generation/write_constants.py +++ b/tools/nntool/generation/write_constants.py @@ -21,52 +21,15 @@ from graph.types import FilterParameters, ConstantInputParameters, MultiplicativeBiasParameters -def write_constants(G, naming_convension, tensor_directory=None): +def write_constants(global_recs, tensor_directory=None): if tensor_directory is None: tensor_directory = "." 
else: os.makedirs(tensor_directory, mode=0o750, exist_ok=True) - for step_idx, pnode, _, fnode in G.nodes_iterator(): - anode = pnode if not fnode else fnode - if isinstance(anode, FilterParameters): - cname = naming_convension.get_global_name(pnode.name, step_idx, pnode, "weights") - qrec = G.quantization[NodeId(pnode, fnode)] - weights_q = qrec.weights_q - with open(os.path.join(tensor_directory, cname + ".tensor"), 'wb') as t_fp: - weights_q.quantize(anode.weights)\ - .astype(weights_q.dtype, order='C', casting='no', copy=True)\ - .tofile(t_fp) - - # biases are always generated even if they are 0 - if anode.has_bias: - biases_q = qrec.biases_q - biases = biases_q.quantize(anode.biases)\ - .astype(biases_q.dtype, order='C', casting='no', copy=True) - else: - biases = np.zeros(weights_q.dtype, dtype=np.float32, order='C') - - cname = naming_convension.get_global_name(pnode.name, step_idx, pnode, "biases") - with open(os.path.join(tensor_directory, cname + ".tensor"), 'wb') as t_fp: - biases.tofile(t_fp) - - if isinstance(anode, MultiplicativeBiasParameters) and anode.has_mul_bias: - mul_biases_q = qrec.mul_biases_q - mul_biases = mul_biases_q.quantize(anode.mul_biases)\ - .astype(mul_biases_q.dtype, order='C', casting='no', copy=True) - - cname = naming_convension.get_global_name(pnode.name, step_idx, pnode, "mul_biases") - with open(os.path.join(tensor_directory, cname + ".tensor"), 'wb') as t_fp: - mul_biases.tofile(t_fp) - elif isinstance(anode, ConstantInputParameters): - out_edge = G.out_edges(anode.name)[0] - eparams = out_edge.params - cname = naming_convension.get_edge_name(eparams.creating_node.name, - eparams.creating_step, - eparams.edge_type, - eparams.edge_order) - qrec = G.quantization[NodeId(pnode, fnode)] - constant_q = qrec.out_qs[0] - with open(os.path.join(tensor_directory, cname + ".tensor"), 'wb') as t_fp: - weights_q.quantize(anode.value)\ - .astype(constant_q.dtype, order='C', casting='no', copy=True)\ - .tofile(t_fp) + for global_rec in global_recs: + if global_rec.const_info is None: + continue + const_info = global_rec.const_info + with open(const_info.file_name, 'wb') as t_fp: + const_info.contents.astype(const_info.qtype.dtype, order='C', casting='no', copy=True)\ + .tofile(t_fp) diff --git a/tools/nntool/graph/dim.py b/tools/nntool/graph/dim.py index 9ffe59f38..f7dbccf25 100644 --- a/tools/nntool/graph/dim.py +++ b/tools/nntool/graph/dim.py @@ -21,42 +21,52 @@ from functools import reduce from math import ceil, floor + class DimError(Exception): pass + class NoSizeError(DimError): pass + class DimUnknownError(DimError): pass + class DimHasNoOrderError(DimError): pass + class DimHasNoNamesError(DimError): pass + class DimMissingKeyError(DimError): pass + class DimIncorrectKeyError(DimError): pass + class MissMatchedInputsError(DimError): pass + class MoreThanOneInputError(DimError): pass + class Dim(): def __init__(self, shape=None, names=None, is_ordered=False, is_unknown=False): set_shape = shape if shape is not None else [] if names is None else [None] * len(names) super().__setattr__('_shape', set_shape) super().__setattr__('_names', names) super().__setattr__('_is_ordered', is_ordered) - super().__setattr__('_is_unknown', is_unknown or\ - (shape is None) or\ - any(elem is None for elem in set_shape)) + super().__setattr__('_is_unknown', is_unknown or + (shape is None) or + any(elem is None for elem in set_shape)) super().__setattr__('_is_named', names is not None) @classmethod @@ -175,6 +185,11 @@ def order(self) -> list: def is_single_channel(self) -> 
bool: return self.is_named and self.has_key('c') and self.c == 1 + @property + def layout_shape(self): + self._verify_is_ordered() + return tuple(sz for sz in self.shape if sz > 1) + def transpose(self, order): '''transpose dimension in order which is a list of indexes or list of names''' self._verify_is_ordered() @@ -189,11 +204,26 @@ def transpose(self, order): object.__setattr__(self, '_names', [self._names[i] for i in order]) return self + def calc_transpose(self, order): + '''transpose dimension in order which is a list of indexes or list of names''' + self._verify_is_ordered() + if len(order) != len(self.shape): + raise MissMatchedInputsError() + # if the order is names then convert to indices + if isinstance(order[0], str): + self._verify_is_named() + order = [self.keys.index(k) for k in order] + res = self.clone() + object.__setattr__(res, '_shape', [self._shape[i] for i in order]) + if self.is_named: + object.__setattr__(res, '_names', [self._names[i] for i in order]) + return res + def move_last_to_first(self): self._verify_is_ordered() - self._shape.append(self._shape.pop(0)) + self._shape.insert(0, self._shape.pop()) if self.is_named: - self._names.append(self._names.pop(0)) + self._names.insert(0, self._names.pop()) def apply_naming_hints(self, hint): self._verify_is_ordered() @@ -342,8 +372,8 @@ def combine(dims: Iterable, axis) -> 'Dim': for i in range(1, len(dims)): dim = dims[i] if len(dim.shape) != len(base.shape) or\ - not all(dim.shape[j] == base.shape[j]\ - for j in range(len(base.shape)) if j != axis): + not all(dim.shape[j] == base.shape[j] + for j in range(len(base.shape)) if j != axis): raise MissMatchedInputsError() cnt += dim.shape[axis] base[axis] = cnt @@ -493,16 +523,18 @@ def __str__(self): return "unknown" return 'x'.join([str(v) for v in self._shape]) + PAD_DIMS = ['t', 'b', 'l', 'r'] PAD_VERT_DIMS = ['t', 'b'] PAD_HORIZ_DIMS = ['l', 'r'] + class PadDim(Dim): - def __init__(self, *args, is_same=False): + def __init__(self, *args, same_type=None): if not args: super().__init__(names=PAD_DIMS.copy(), is_ordered=True, is_unknown=True) - object.__setattr__(self, '_same', is_same) + object.__setattr__(self, '_same_type', same_type) else: if not all(isinstance(i, int) for i in args): raise TypeError("incorrect type for PadDim") @@ -516,12 +548,23 @@ def __init__(self, *args, is_same=False): super().__init__([args[0], args[1], args[2], args[3]], PAD_DIMS, is_ordered=True) else: raise ValueError("incorrect pad argument length") - object.__setattr__(self, '_same', False) + object.__setattr__(self, '_same_type', same_type) def height_width(self) -> Dim: '''return a dim representing the width and height''' return Dim.named_ordered(h=self.h, w=self.w) + @property + def has_padding(self): + return self.t > 0 or self.b > 0 or self.l > 0 or self.r > 0 + + PadCompatibilityTypes = [ + "left", + "right", + "balanced_left", + "balanced_right" + ] + @classmethod def compute_pad_compatibility(cls, l, r): # left, right, balanced_left, balanced_right @@ -563,7 +606,6 @@ def pad_compatibility_reduce(cls, *pad_compatibilities, err_msg=None): return None return reduction - @property def pad_compatibility(self): return self.pad_compatibility_reduce( @@ -584,17 +626,21 @@ def h(self) -> int: self._verify_is_known() return self.t + self.b + @property + def same_type(self): + return self._same_type + def clone(self, keys=None): '''clone the paddim''' assert not keys if self.is_unknown: - return PadDim(is_same=self.is_same) - return PadDim(self.t, self.b, self.l, self.r, 
is_same=self.is_same) + return PadDim(same_type=self.same_type) + return PadDim(self.t, self.b, self.l, self.r, same_type=self.same_type) @classmethod - def same(cls) -> 'PadDim': + def same(cls, same_type="balanced_right") -> 'PadDim': '''return a same padding''' - return cls(is_same=True) + return cls(same_type=same_type) @classmethod def valid(cls) -> 'PadDim': @@ -618,28 +664,62 @@ def numpy_pad_shape(self, in_dim: Dim) -> list: @property def is_same(self) -> bool: '''checks if PadDim is set same''' - return self._same + return self._same_type is not None - def calculate_same(self, in_dim, filt, stride) -> Dim: + def calculate_same(self, in_dim, filt, stride, dilation=None) -> Dim: '''calculates the actual padding from the input dimension''' out_height = ceil(float(in_dim.h) / float(stride.h)) out_width = ceil(float(in_dim.w) / float(stride.w)) - - pad_along_height = max( - (out_height - 1) * stride.h + filt.h - in_dim.h, - 0) - pad_along_width = max( - (out_width - 1) * stride.w + filt.w - in_dim.w, - 0) + if dilation is None: + pad_along_height = max( + (out_height - 1) * stride.h + filt.h - in_dim.h, + 0) + pad_along_width = max( + (out_width - 1) * stride.w + filt.w - in_dim.w, + 0) + else: + pad_along_height = max( + (out_height - 1) * stride.h + filt.h + (filt.h - 1)*(dilation.h - 1) - in_dim.h, + 0) + pad_along_width = max( + (out_width - 1) * stride.w + filt.w + (filt.w - 1)*(dilation.w - 1) - in_dim.w, + 0) + if self._same_type == "left": + self.set( + t=pad_along_height, + b=0, + l=pad_along_width, + r=0 + ) + return self + elif self._same_type == "right": + self.set( + t=0, + b=pad_along_height, + l=0, + r=pad_along_width + ) + return self pad_top = pad_along_height // 2 pad_left = pad_along_width // 2 - self.set( - t=pad_top, - b=pad_along_height - pad_top, - l=pad_left, - r=pad_along_width - pad_left - ) - return self + if self._same_type == "balanced_right": + self.set( + t=pad_top, + b=pad_along_height - pad_top, + l=pad_left, + r=pad_along_width - pad_left + ) + return self + elif self._same_type == "balanced_left": + self.set( + t=pad_along_height - pad_top, + b=pad_top, + l=pad_along_width - pad_left, + r=pad_left + ) + return self + else: + raise ValueError("same padding is not set") @property def has_end_h_pad(self) -> bool: @@ -655,12 +735,16 @@ def has_at_pad(self) -> bool: '''checks if padding is compatible with autotiler''' if self.t == 0 and self.b == 0 and self.l == 0 and self.r == 0: return False - if self.has_end_h_pad and self.has_end_w_pad and self.t == self.l: + if self._same_type is not None: return True - raise AttributeError("Padding is probably not compatible with AutoTiler") + if any(pad_type for pad_type in self.pad_compatibility): + return True + raise AttributeError("Padding is not same so not compatible with AutoTiler") + DEFAULT_CONVFILTER_DIMS = ['out_c', 'in_c', 'h', 'w'] + class Conv2DFilterDim(Dim): def __init__(self, h, w, out_c, in_c=None, order=None): @@ -676,8 +760,10 @@ def clone(self, keys=None) -> 'Conv2DFilterDim': assert not keys return Conv2DFilterDim(self.h, self.w, self.out_c, self.in_c, order=self.keys) + DEFAULT_FCFILTER_DIMS = ['out_c', 'in_c', 'h', 'w'] + class FcFilterDim(Dim): def __init__(self, h, w, out_c, in_c=None, order=None): @@ -721,8 +807,10 @@ def clone(self, keys=None) -> 'FcFilterDim': assert not keys return FcFilterDim(self.h, self.w, self.out_c, self.in_c, order=self.keys) + DEFAULT_2DDIMS = ['h', 'w'] + class Dim2D(Dim): def __init__(self, *args, order=None): @@ -739,14 +827,18 @@ def clone(self, keys=None) -> 
'Dim2D': assert not keys return self.__class__(self.h, self.w, order=self.keys) + class StrideDim(Dim2D): pass + class PoolFilterDim(Dim2D): pass + class ScaleDim(Dim2D): pass + class DilationDim(Dim2D): pass diff --git a/tools/nntool/graph/graph_identity.py b/tools/nntool/graph/graph_identity.py index ae6209a0b..911bac28a 100644 --- a/tools/nntool/graph/graph_identity.py +++ b/tools/nntool/graph/graph_identity.py @@ -14,6 +14,7 @@ # along with this program. If not, see . import json +import os import xxhash @@ -21,6 +22,8 @@ # This class tracks any changes to the graph that render it incompatible with a value cache entry class GraphIdentity(): def __init__(self, filename): + if filename is not None: + filename = os.path.abspath(filename) self._identity = {'filename': filename, 'fusions': []} @property @@ -74,6 +77,22 @@ def is_equalized(self): def set_equalized(self, threshold): self._identity['equalization'] = threshold + @property + def tflite_quantization(self): + return self._identity.get('tflite_quantization') + + @tflite_quantization.setter + def tflite_quantization(self, val: bool): + self._identity['tflite_quantization'] = val + + @property + def quantization_type(self): + return self._identity.get('quantization_type') + + @quantization_type.setter + def quantization_type(self, val: str): + self._identity['quantization_type'] = val + @property def hexdigest(self): h = xxhash.xxh64() diff --git a/tools/nntool/graph/manipulations/adjust_order.py b/tools/nntool/graph/manipulations/adjust_order.py index c7f87f1d2..63b3569f8 100644 --- a/tools/nntool/graph/manipulations/adjust_order.py +++ b/tools/nntool/graph/manipulations/adjust_order.py @@ -19,7 +19,7 @@ from ..types import (ConcatParameters, ConstantInputParameters, Conv2DParameters, FcParameters, InputBaseParameters, OutputParameters, ReshapeParameters, - UnconvertedOpParameters) + UnconvertedOpParameters, ImageFormatParameters) from .dimensions import add_dimensions from .eliminate_transposes import eliminate_transposes @@ -42,26 +42,29 @@ def maybe_transpose(cur, desired_order, tensor, reshape=None): def adjust_dims(step_idx, node, dims, hint, direction="input"): for idx, dim in enumerate(dims): if dim.just_has_keys(AT_ACTIVATION_ORD): - LOG.info("step %s: %s adjust %s %s %s => %s", - step_idx, node.name, direction, idx, dim, " x ".join(AT_ACTIVATION_ORD)) + LOG.debug("step %s: %s adjust %s %s %s => %s", + step_idx, node.name, direction, idx, dim, " x ".join(AT_ACTIVATION_ORD)) dim.impose_order(AT_ACTIVATION_ORD) if hint and hint[idx]: hint[idx] = deepcopy(AT_ACTIVATION_ORD) elif len(dim) == 1: - LOG.info("step %s: %s %s is one dimensional so no adjustment", - step_idx, node.name, direction) + LOG.debug("step %s: %s %s is one dimensional so no adjustment", + step_idx, node.name, direction) else: dim.move_last_to_first() -def adjust_order(G, reshape_weights=True): +def adjust_order(G, reshape_weights=True, postprocess=True): for step_idx, node, fusion_idx, _ in G.nodes_iterator(): assert not fusion_idx, "order must be adjusted before fusing" if isinstance(node, InputBaseParameters): if node.fixed_order: - node.transpose_out = node.last_first(node.dims) - if node.out_dims_hint and node.out_dims_hint[0]: - node.out_dims_hint[0] = deepcopy(AT_ACTIVATION_ORD) + # Check if followed by an ImageFormat node in which case reordering will + # happen there and there should be no transform on the input + if not isinstance(G.out_edges(node.name)[0].to_node, ImageFormatParameters): + node.transpose_out = node.last_first(node.dims) + if 
node.out_dims_hint and node.out_dims_hint[0]: + node.out_dims_hint[0] = deepcopy(AT_ACTIVATION_ORD) else: if isinstance(node, ConstantInputParameters) and node.value is not None and reshape_weights: node.value = maybe_transpose(node.dims, AT_ACTIVATION_ORD, node.value) @@ -76,15 +79,15 @@ def adjust_order(G, reshape_weights=True): node.dims = node.in_dims[0] continue elif isinstance(node, Conv2DParameters): - LOG.info("step %s: %s adjust weights %s => %s", - step_idx, node.name, node.filter, " x ".join(AT_CONVFILTER_ORD)) + LOG.debug("step %s: %s adjust weights %s => %s", + step_idx, node.name, node.filter, " x ".join(AT_CONVFILTER_ORD)) if node.weights is not None and reshape_weights: node.weights = maybe_transpose(node.filter, AT_CONVFILTER_ORD, node.weights) node.filter.impose_order(AT_CONVFILTER_ORD) elif isinstance(node, FcParameters): - LOG.info("step %s: %s adjust weights %s => %s", - step_idx, node.name, node.filter, " x ".join(AT_FCFILTER_EXP_ORD)) + LOG.debug("step %s: %s adjust weights %s => %s", + step_idx, node.name, node.filter, " x ".join(AT_FCFILTER_EXP_ORD)) if node.weights is not None and reshape_weights: exp_weights = node.weights.reshape(node.filter.shape) node.weights = maybe_transpose(node.filter, AT_FCFILTER_EXP_ORD, @@ -95,23 +98,34 @@ def adjust_order(G, reshape_weights=True): node.in_dims[0].impose_order(AT_ACTIVATION_ORD) continue elif isinstance(node, ConcatParameters): - if node.axis == len(node.in_dims[0].shape) - 1: - node.axis = 0 - elif node.axis == 0: - node.transpose_in = node.first_last(node.in_dims[0]) - node.transpose_out = node.last_first(node.out_dims[0]) - else: - raise NotImplementedError("this needs to be implemented") + # if axis is last it will become first so this concat is valid without change + if node.axis != len(node.out_dims[0]) - 1: + # real axis will be one more since last axis will move to first + node.axis += 1 + trans_length = len(node.out_dims[0]) + # move concat axis first + node.transpose_in = [node.axis] + [i for i in range(trans_length) + if i != node.axis] + # move concat axis back into original position + node.transpose_out = node.transpose_in.copy() + # axis is 0 in all cases + node.axis = 0 elif isinstance(node, ReshapeParameters): in_dim = node.in_dims[0] out_dim = node.out_dims[0] - if (in_dim.shape[-1] == out_dim.shape[-1] or node.does_nothing() or - (len(in_dim.shape) == 1 and out_dim.is_named and out_dim.c == 1)): - node.shape.move_last_to_first() - node.old_shape.move_last_to_first() + if in_dim.layout_shape != out_dim.layout_shape: + # These two tests look at whether the last dimension is 1 + # in which case moving it first does not change the reshape + # or whether the layout_shape (shape with all 1 dimensions removed) + # has a single dimension in which case the reshape will also not + # change with the axis move + if in_dim.shape[-1] != 1 and len(in_dim.layout_shape) > 1: + node.transpose_in = node.first_last(in_dim) + if out_dim.shape[-1] != 1 and len(out_dim.layout_shape) > 1: + node.transpose_out = node.last_first(out_dim) else: - node.transpose_in = node.first_last(node.in_dims[0]) - node.transpose_out = node.last_first(node.out_dims[0]) + node.old_shape.move_last_to_first() + node.shape.move_last_to_first() elif isinstance(node, UnconvertedOpParameters): if node.indicated_outputs: for out in node.indicated_outputs: @@ -121,5 +135,6 @@ def adjust_order(G, reshape_weights=True): adjust_dims(step_idx, node, node.out_dims, node.out_dims_hint, direction="output") add_dimensions(G) - eliminate_transposes(G) - 
add_dimensions(G) + if postprocess: + eliminate_transposes(G) + add_dimensions(G) diff --git a/tools/nntool/graph/manipulations/eliminate_transposes.py b/tools/nntool/graph/manipulations/eliminate_transposes.py index 7e85b74cb..8d2e50cb9 100644 --- a/tools/nntool/graph/manipulations/eliminate_transposes.py +++ b/tools/nntool/graph/manipulations/eliminate_transposes.py @@ -15,173 +15,268 @@ import logging -from graph.types.others import ReshapeParameters from graph.types.base import SensitiveToOrder, Transposable +from graph.types.others import ConcatParameters, ReshapeParameters LOG = logging.getLogger("nntool." + __name__) -def add_sequence(trans_seqs, trans_nodes): - if trans_nodes and len(trans_nodes) > 1: - trans_seq = trans_seqs.get(trans_nodes[-1]) - if not trans_seq: - trans_seq = [] - trans_seqs[trans_nodes[-1]] = trans_seq - trans_seq.append(trans_nodes) - -def find_last_transpose(G, node, trans_seqs, trans_nodes=None): - if isinstance(node, str): - node = G.node(node) - - if isinstance(node, SensitiveToOrder): - add_sequence(trans_seqs, trans_nodes) - trans_nodes = None - elif isinstance(node, Transposable): - if trans_nodes is None: - # new sequence - trans_nodes = [] - trans_nodes.append(node) - - out_edges = G.out_edges(node.name) - - if len(out_edges) == 0: - add_sequence(trans_seqs, trans_nodes) - return - - # Edges are visited in a repeatable order - out_edges.sort(key=lambda x: str(x.from_idx) + x.to_node.name + str(x.to_idx)) - - for edge in out_edges: - if trans_nodes: - if len(out_edges) > 1: - trans_nodes_copy = trans_nodes.copy() + +class Shape(): + def __init__(self, shape): + self.shape = shape + self.idx = 0 + self.inc = True + self.cur = 1 + + +def reverse_reshape(trans, from_shape, to_shape): + """reverses the effect of this reshape on the transpose""" + # The reshape goes from shape -> to shape. Find the equivalent transpose + # that can be examined for things in to shape or return None if the transpose + # cannot be converted. from shape may have smaller larger or the same dimensions + # as to shape. + shapes = [Shape(to_shape.shape), + Shape(from_shape.shape)] + # Build a mask containing the indexes of the from_shape in the + # shape of to_shape. 
Here we are looking for continuous sequences of combinations + # of the two masks + trans_mask = [[] for _ in shapes[0].shape] + filling_shape = None + while all(shape.idx < len(shape.shape) for shape in shapes): + # multiply the shapes of the indexes that have incremented + for shape in shapes: + if shape.inc: + shape.cur *= shape.shape[shape.idx] + shape.inc = False + + # add the transpose index to the mask + trans_mask[shapes[0].idx].append(trans[shapes[1].idx]) + if shapes[0].cur == shapes[1].cur: + # the shapes match so increment both indexes + for shape in shapes: + shape.idx += 1 + shape.cur = 1 + shape.inc = True + filling_shape = None + elif shapes[0].cur < shapes[1].cur: + if filling_shape is None or filling_shape == 0: + # look for a combination of axes in the shape before the reshape + shapes[0].idx += 1 + shapes[0].inc = True + filling_shape = 0 else: - trans_nodes_copy = trans_nodes - find_last_transpose(G, edge.to_node, trans_seqs, trans_nodes_copy) + return None else: - find_last_transpose(G, edge.to_node, trans_seqs) - -def find_last_transposes(G): - """Does a depth first search in the graph to discover transposable - nodes with no SensitiveToOrder nodes between them""" - LOG.info("finding transpose sequences") - trans_seqs = {} - for node in G.inputs_and_constants(): - find_last_transpose(G, node, trans_seqs) - return trans_seqs - -def reverses_transpose(trans1, trans2): - """Checks if one transpose reverses another""" + if filling_shape is None or filling_shape == 1: + # look for a combination of axes in the shape after the reshape + shapes[1].idx += 1 + shapes[1].inc = True + filling_shape = 1 + else: + return None + + # Either the mask will be complete or one of the two shapes will not have been + # consumed. Make sure that both shapes are fully used + for i in [0, 1]: + if shapes[i].idx < len(shapes[i].shape): + # can only add shapes that are 1 in length + if shapes[i].shape[shapes[i].idx] == 1: + idxes = [shape.idx if shape.idx < len(shape.shape) else -1 for shape in shapes] + trans_mask[idxes[0]].append(trans[idxes[1]]) + shapes[i].idx += 1 + else: + # no solution found transpose is modified by the reshape + return None + # Make sure the mask is in ascending order + trans_mask = [sorted(mask) for mask in trans_mask] + + # now we have a mask of the form [[1], [0], [0]] or [[2], [0, 1]] + # turn this into [2, 0, 1] or [1, 0] + # old in this case is the shape after reshape + cur_old_idx = 0 + mask_idx = 0 + cur_new_idx = 0 + new_trans = [] + found_elem = False + while len(new_trans) < len(shapes[0].shape): + # if this mask element has not been consumed and its first element + # matches the index after reshape then consume it + if len(trans_mask[mask_idx]) > 0 and trans_mask[mask_idx][0] == cur_old_idx: + new_trans.append(mask_idx) + # the new old index is the last one in the mask + cur_old_idx = trans_mask[mask_idx][-1] + # consume the mask + trans_mask[mask_idx] = [] + cur_new_idx += 1 + # continue to loop + found_elem = True + mask_idx += 1 + if mask_idx >= len(trans_mask): + # if we didn't find anything then the reshape modifies the transpose + mask_idx = 0 + cur_old_idx += 1 + if cur_old_idx >= len(shapes[1].shape): + if not found_elem: + return None + found_elem = False + cur_old_idx = 0 + + + return new_trans + + +def reverses_transpose(trans1, trans2, dim=None): + """Checks if one transpose reverses another. 
If a dim is provided then + look if the transpose sequence produces an equivalent dim to cope with 1s in + dimensions.""" if trans1 is None or trans2 is None: return False + if dim and dim.layout_shape == dim.calc_transpose(trans1).calc_transpose(trans2).layout_shape: + return True for idx, val in enumerate(trans1): if trans2[val] != idx: return False return True -def get_first_transposable(rseq, idx): - """Looks back in the string of transposables for a vlid transposable. Reshapes that are - not transposing are skipped but returned in an array""" - reshapes = [] - while idx < len(rseq): - node = rseq[idx] - if isinstance(node, ReshapeParameters) and not node.has_transpose: - reshapes.append(rseq[idx]) - elif isinstance(node, Transposable): - return node, reshapes, idx - idx += 1 - return None, reshapes, idx - -def apply_reshape(trans, reshape): - """Create a new transpose if there are 1 sized dimensions in the reshape""" - if not reshape.does_nothing(): - return trans - - old_shape = reshape.old_shape.shape.copy() - trans = trans.copy() - while True: - change = False - idx = 0 - while idx < len(trans): - dim_idx = trans[idx] - if old_shape[dim_idx] == 1: - change = True - del old_shape[dim_idx] - del trans[idx] - for jdx, dim_jdx in enumerate(trans): - if dim_jdx > dim_idx: - trans[jdx] -= 1 - change = True - break - idx += 1 - if not change: - break - return trans - -def apply_reshapes(trans, reshapes): - for reshape in reversed(reshapes): - trans = apply_reshape(trans, reshape) - return trans - -def process(seq, switchable): - rseq = seq[::-1] - idx = 0 - while idx < len(rseq) - 1: - node = rseq[idx] - pnode, reshapes, idx = get_first_transposable(rseq, idx + 1) - sw_node = switchable.get(node) - if reverses_transpose(node.transpose_in, apply_reshapes(pnode.transpose_out, reshapes)): - if not sw_node: - switchable[node] = { - 'can_switch': True, - 'segments': {pnode: reshapes} - } - elif sw_node['can_switch']: - sw_node['segments'][pnode] = reshapes +def search_up_for_reverse(G, visited_edges, node, out_idx, transpose, edge_list): + """Search up the graph for transpose sequences""" + if len(G.out_edges(node.name)) > 1 or isinstance(node, SensitiveToOrder): + return [] + + if isinstance(node, Transposable) and node.transpose_out: + if reverses_transpose(node.transpose_out, transpose, node.out_dims[out_idx]): + return [(node, edge_list, 'out')] else: - if not sw_node: - # This node cannot be switched so all the nodes that could - # switched cannot be - switchable[node] = {'can_switch': False, 'segments': {}} - elif sw_node['can_switch']: - sw_node['can_switch'] = False - sw_node['segments'].clear() - -def process_sequences(trans_seqs): - """Extracts nodes that are valid for transpose elimination""" - LOG.info("processing transpose sequences") - switchable = {} - for seqs in trans_seqs.values(): - for seq in seqs: - process(seq, switchable) - return switchable - -def update_switchable(switchable): - """Updates the node transposes""" - LOG.info("updating nodes") - updated_reshapes = set() - for node, switch in switchable.items(): - if not switch['can_switch']: - continue - for pnode, reshapes in switch['segments'].items(): - for reshape in reshapes: - if reshape not in updated_reshapes: - updated_reshapes.add(reshape) - reshape.old_shape.transpose(pnode.transpose_out) - reshape.shape.transpose(node.transpose_in) - LOG.info("reshape %s modified", reshape.name) - pnode.transpose_out = None - LOG.info("transpose eliminated %s => %s", pnode.name, node.name) - - node.transpose_in = None + return 
[] + + if isinstance(node, ReshapeParameters): + new_transpose = reverse_reshape(transpose, node.shape, node.old_shape) + if new_transpose is None: + return [] + transpose = new_transpose + if node.transpose_in and reverses_transpose(node.transpose_in, transpose): + return [(node, edge_list, "in")] + + if isinstance(node, Transposable) and node.transpose_in: + return [] + + return search_up_edges(G, visited_edges, node, transpose, edge_list) + + +def search_up_edges(G, visited_edges, node, transpose, edge_list): + all_nodes = [] + for edge in G.in_edges(node.name): + if edge in visited_edges: + return [] + next_res = search_up_for_reverse( + G, visited_edges | {edge}, edge.from_node, edge.from_idx, transpose, edge_list + [edge]) + if not next_res: + return [] + all_nodes += next_res + return all_nodes + + +def search_down_for_reverse(G, visited_edges, node, in_idx, transpose, edge_list=None): + """Search down the graph for transpose sequences""" + if len(G.in_edges(node.name)) > 1 or isinstance(node, SensitiveToOrder): + return [] + + if edge_list is None: + edge_list = [] + + if isinstance(node, Transposable) and node.transpose_in: + if reverses_transpose(transpose, node.transpose_in, node.in_dims[in_idx]): + return [(node, edge_list, "in")] + else: + return [] + + # if the node is a concat then we cannot proceed further since the + # concat must happen on axis 0 and the transposes were already set up for + # this to happen + if isinstance(node, ConcatParameters): + return [] + + # if there is a reshape then the dimensionality of the transpose + # that we are examining may change and that may or may not be compatible + # with reversing the transpose + if isinstance(node, ReshapeParameters): + new_transpose = reverse_reshape(transpose, node.old_shape, node.shape) + if new_transpose is None: + return [] + transpose = new_transpose + if node.transpose_out and reverses_transpose(transpose, node.transpose_out): + return [(node, edge_list, "out")] + + if isinstance(node, Transposable) and node.transpose_out: + return [] + + return search_down_edges(G, visited_edges, node, transpose, edge_list) + + +def search_down_edges(G, visited_edges, node, transpose, edge_list): + all_nodes = [] + for edge in G.out_edges(node.name): + if edge in visited_edges: + return [] + next_res = search_down_for_reverse( + G, visited_edges | {edge}, edge.to_node, edge.to_idx, transpose, edge_list + [edge]) + if not next_res: + return [] + all_nodes += next_res + return all_nodes + + +def search_for_reverses(G): + results = [] + # visited edges contains all edges included in found transpose pairs + visited_edges = set() + for transpose_node in [node for node in G.nodes() if isinstance(node, Transposable)]: + # for each transpose node we look up and down from the transpose in and transpose out + # respectively to see if another transpose reverses this one with nothing + # inbetween that is transpose sensitive + if transpose_node.transpose_in: + result = search_up_edges(G, visited_edges, transpose_node, + transpose_node.transpose_in, []) + for r in result: + visited_edges |= set(r[1]) + results.append(((r[0], r[2]), (transpose_node, 'in'), r[1] + [::-1], getattr(r[0], "transpose_" + r[2]))) + if transpose_node.transpose_out: + result = search_down_edges(G, visited_edges, transpose_node, + transpose_node.transpose_out, []) + for r in result: + visited_edges |= set(r[1]) + results.append( + ((transpose_node, 'out'), (r[0], r[2]), r[1], transpose_node.transpose_out)) + return results + + +def process_result(res): + 
LOG.info("eliminating transpose between %s[%s] and %s[%s]", + res[0][0].name, res[0][1], res[1][0].name, res[1][1] + ) + transpose = res[3] + for edge in res[2]: + to_node = edge.to_node + if isinstance(to_node, ReshapeParameters) and not to_node.transpose_in: + LOG.info("eliminating input transpose on %s", to_node.name) + transpose = reverse_reshape(transpose, to_node.old_shape, to_node.shape) + to_node.shape.transpose(transpose) + + for node, direction in [res[idx] for idx in range(2)]: + setattr(node, "transpose_"+direction, None) + def eliminate_transposes(G): """Eliminates unnecessary transposes from the graph. Valid transposes are those that have no nodes that are sensitive to order between them and where one reverses the other""" LOG.info("eliminating unnecessary transposes") - trans_seqs = find_last_transposes(G) - if not trans_seqs: - LOG.info("no transpose sequences found") - return - switchable = process_sequences(trans_seqs) - update_switchable(switchable) + while True: + results = search_for_reverses(G) + if not results: + LOG.info("no further transpose sequences found") + break + for result in results: + process_result(result) + G.add_dimensions() diff --git a/tools/nntool/graph/matches/equalize_sym_mult_concats.py b/tools/nntool/graph/matches/equalize_sym_mult_concats.py new file mode 100644 index 000000000..a95435101 --- /dev/null +++ b/tools/nntool/graph/matches/equalize_sym_mult_concats.py @@ -0,0 +1,76 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +import logging +from copy import deepcopy + +from graph.matches.matcher import Matcher +from graph.types import ConcatParameters, ReshapeParameters, TransposeParameters +from quantization.multiplicative.mult_quantization import MultQuantizationRecord, MultScalableFilterQuantizationRecord +from quantization.multiplicative.symmetric.symmetric_mult_qtype import SymmetricMultQType +from utils.graph import Edge, GraphView +from utils.node_id import NodeId + +LOG = logging.getLogger("nntool." 
+ __name__) + +CAN_PASS = ( + ReshapeParameters, + TransposeParameters +) + +def set_in_scale(qrec, index, scale): + in_q = qrec.in_qs[index] + assert isinstance(in_q, SymmetricMultQType), "not supported on other quantization types" + in_q.scale = scale + +def set_out_scale(qrec, index, scale): + out_q = qrec.out_qs[index] + assert isinstance(out_q, SymmetricMultQType), "not supported on other quantization types" + if isinstance(qrec, MultScalableFilterQuantizationRecord): + assert index == 0, "trying to set strange index on filter quantization record" + out_q.scale = scale + qrec.mul_biases_q.scale = qrec.in_qs[0].scale * qrec.weights_q.scale / out_q.scale + else: + out_q.scale = scale + +def propagate_qtype_up(G, qtype, edge: Edge): + LOG.info("propagating scale up from node %s to node %s", edge.to_node.name, edge.from_node.name) + qrec_out = G.quantization[NodeId(edge.from_node)] + set_out_scale(qrec_out, edge.from_idx, qtype.scale) + qrec_in = G.quantization[NodeId(edge.to_node)] + set_in_scale(qrec_in, edge.to_idx, qtype.scale) + if isinstance(edge.from_node, CAN_PASS): + for edge in G.in_edges(edge.from_node.name): + propagate_qtype_up(G, qtype, edge) + +class EqualizeSymmetricMultiplicativeQuantivedConcats(Matcher): + NAME = "equalize_sm_concats" + DESCRIPTION = """Equalize input quantization of concats with symmetric multiplicative quantization""" + + def match(self, G: GraphView, set_identity: bool = True): + if not G.quantization: + return + concats = [node for node in G.nodes() if isinstance(node, ConcatParameters)] + qrecs = [G.quantization[NodeId(node)] for node in concats] + if not all(isinstance(qrec, MultQuantizationRecord) for qrec in qrecs): + return + for concat, qrec in zip(concats, qrecs): + out_q = qrec.out_qs[0] + for edge in G.in_edges(concat.name): + in_q = qrec.in_qs[edge.to_idx] + if in_q != out_q: + propagate_qtype_up(G, out_q, edge) + + if set_identity: + self.set_identity(G) diff --git a/tools/nntool/graph/matches/expand_transposes.py b/tools/nntool/graph/matches/expand_transposes.py index 5495d2e38..dd33a2410 100644 --- a/tools/nntool/graph/matches/expand_transposes.py +++ b/tools/nntool/graph/matches/expand_transposes.py @@ -14,18 +14,19 @@ # along with this program. If not, see . 
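A toy, standalone view of what the concat scale equalization above does; plain floats stand in for the SymmetricMultQType scales and the node names are invented purely for illustration.

def equalize_concat_inputs(concat_out_scale, producer_out_scales):
    # Every branch feeding the concat is given the concat's output scale, so the
    # concat itself needs no per-input rescaling.  The real pass also walks up
    # through reshape/transpose nodes via propagate_qtype_up().
    return {name: concat_out_scale for name in producer_out_scales}

branches = {'conv1_out': 0.13, 'conv2_out': 0.5}
assert equalize_concat_inputs(0.25, branches) == {'conv1_out': 0.25, 'conv2_out': 0.25}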
from utils.graph import GraphView +from utils.node_id import NodeId +from graph.types.base import Transposable +from graph.types.others import TransposeParameters -from ..types.base import Transposable -from ..types.others import TransposeParameters - -from .matcher import Matcher +from graph.matches.matcher import Matcher def apply_reverse_transpose_to_hint(hint, transpose): - reverse_transpose = {transpose[i] : v for i, v in enumerate(hint)} + reverse_transpose = {transpose[i]: v for i, v in enumerate(hint)} reversed_hint = [reverse_transpose[idx] for idx in range(len(hint))] return reversed_hint + class ExpandTransposesMatcher(Matcher): NAME = "expand_transposes" DESCRIPTION = "Extract transposes from Transposable nodes for model generation" @@ -33,8 +34,8 @@ class ExpandTransposesMatcher(Matcher): def match(self, G: GraphView, set_identity: bool = True): # get a list of all the nodes that are transposable but not transposes # Need to do this first to avoid mutating it when doing the modifications - tnodes = list(filter(lambda n: isinstance(n, Transposable) and\ - not isinstance(n, TransposeParameters), + tnodes = list(filter(lambda n: isinstance(n, Transposable) and + not isinstance(n, TransposeParameters), G.nodes())) for node in tnodes: if node.transpose_in: @@ -47,6 +48,8 @@ def match(self, G: GraphView, set_identity: bool = True): in_params.in_dims_hint = [in_hint.copy()] in_params.out_dims_hint = [out_hint.copy()] node.in_dims_hint[edge.to_idx] = out_hint + if G.quantization: + G.quantization.copy_to_node(node, in_params) G.insert_node(in_params, edge.from_node.name, edge.to_node.name, from_idx=edge.from_idx, to_idx=edge.to_idx) node.transpose_in = None @@ -60,6 +63,8 @@ def match(self, G: GraphView, set_identity: bool = True): out_params.in_dims_hint = [in_hint.copy()] out_params.out_dims_hint = [out_hint.copy()] node.out_dims_hint[edge.from_idx] = in_hint + if G.quantization: + G.quantization.copy_to_node(node, out_params) G.insert_node(out_params, edge.from_node.name, edge.to_node.name, from_idx=edge.from_idx, to_idx=edge.to_idx) node.transpose_out = None diff --git a/tools/nntool/graph/matches/find_asymmetric_quantization.py b/tools/nntool/graph/matches/find_asymmetric_quantization.py new file mode 100644 index 000000000..e21d72436 --- /dev/null +++ b/tools/nntool/graph/matches/find_asymmetric_quantization.py @@ -0,0 +1,203 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
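As a quick reference for the two permutation checks used by the transpose elimination and expand_transposes changes above, here is a minimal standalone sketch; composes_to_identity and reverse_hint are illustrative helpers, not part of the nntool API, and the sample axis names are made up.

def composes_to_identity(trans1, trans2):
    # True when applying trans2 after trans1 restores the original axis order,
    # i.e. the simple case handled by reverses_transpose() when no dim is given.
    return all(trans2[axis] == idx for idx, axis in enumerate(trans1))

def reverse_hint(hint, transpose):
    # Same idea as apply_reverse_transpose_to_hint(): recover the axis-name hint
    # that existed before 'transpose' was applied.
    reverse = {transpose[i]: name for i, name in enumerate(hint)}
    return [reverse[idx] for idx in range(len(hint))]

assert composes_to_identity((2, 0, 1), (1, 2, 0))       # [c,h,w] -> [w,c,h] -> [c,h,w]
assert not composes_to_identity((2, 0, 1), (2, 0, 1))
assert reverse_hint(['c', 'h', 'w'], (1, 2, 0)) == ['w', 'c', 'h']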
+import logging + +from graph.matches.matcher import Matcher +from graph.types import (ActivationParameters, ConcatParameters, + ConstantInputParameters, Conv2DParameters, + ConvFusionParameters, FcParameters, + GlobalPoolParameters, InputParameters, + MatrixAddParameters, OutputParameters, + PoolingParameters, ReshapeParameters) +from quantization.multiplicative.symmetric.symmetric_mult_qtype_wrapper import \ + SymmetricMultQTypeWrapper +from utils.graph import GraphView +from utils.node_id import NodeId + +LOG = logging.getLogger("nntool." + __name__) + +CAN_CHANGE_OUTPUT = ( + InputParameters, ConstantInputParameters, Conv2DParameters, + ConvFusionParameters, FcParameters, MatrixAddParameters +) + +CAN_CHANGE_INPUT = ( + OutputParameters, Conv2DParameters, ConvFusionParameters, + FcParameters, MatrixAddParameters +) + +CAN_PROPAGATE_INPUT = ( + GlobalPoolParameters, ReshapeParameters, ConcatParameters, ActivationParameters, PoolingParameters +) + +ARE_MULTI_INPUT = ( + ConcatParameters +) + +class FindAsymmetricQuantization(Matcher): + NAME = "find_asymmetric_quantization" + DESCRIPTION = """Find nodes that can have asymmetric quantization. Must run after padding has been fused.""" + + def can_change_input(self, G, node, exclude=None): + """Returns None or a list of tuples of (node, multi_input_node) where node is an + input of multi_input_node. An empty list is a confirmed string. A list that contains + multi input nodes needs to be reconciled. An empty list means that this node + cannot be changed.""" + + if isinstance(node, CAN_PROPAGATE_INPUT): + if exclude and node in exclude: + return None + nodes = [] + for succ in [succ + for succs in G.successors(node.name) + for succ in succs]: + can_change = self.can_change_input(G, succ, exclude=exclude) + if can_change is None: + return None + nodes += can_change + if isinstance(succ, ARE_MULTI_INPUT): + nodes.append((node, succ)) + return nodes + if not isinstance(node, CAN_CHANGE_INPUT): + return None + if isinstance(node, ConvFusionParameters): + filters = node.contained_filters() + if len(filters) == 1 and not filters[0].padding.has_padding: + return [] + else: + return None + if isinstance(node, Conv2DParameters): + return None if node.padding.has_padding else [] + return [] + + def can_change_output(self, node): + return isinstance(node, CAN_CHANGE_OUTPUT) + + def validate_multi_input(self, G, input_dict): + # {start_node: [(pred, mi_node), ..]} + mi_nodes = {} + # index all of the predecessor nodes by mi node + for pr_node, mi_node in [match for matches in input_dict.values() for match in matches]: + pr_node_set = mi_nodes.get(mi_node) + if pr_node_set is None: + pr_node_set = set() + mi_nodes[mi_node] = pr_node_set + pr_node_set.add(pr_node) + bad_mi_nodes = [] + # check that all the predecessors were OK + for mi_node, pr_nodes in mi_nodes.items(): + if not all(node in pr_nodes for node in G.predecessors(mi_node)): + bad_mi_nodes.append(mi_node) + start_nodes = [] + # find the records that have bad nodes in them + if bad_mi_nodes: + for start_node, matches in input_dict.items(): + if any(mi_node in bad_mi_nodes for _, mi_node in matches): + start_nodes.append(start_nodes) + for start_node in start_nodes: + del input_dict[start_node] + matches = self.can_change_input(G, start_node, exclude=bad_mi_nodes) + if matches is not None: + assert len(matches) == 0 + input_dict[start_node] = [] + return input_dict + + def change_output_to_async(self, G, node, idx): + if isinstance(node, ConvFusionParameters): + changing = False + for fnode in 
node.contained_nodes(): + if changing: + nid = NodeId(node, fnode) + qrec = G.quantization[nid] + if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper): + qrec.in_qs[0] = qrec.in_qs[0].wrapped + if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): + qrec.out_qs[0] = qrec.out_qs[0].wrapped + elif isinstance(fnode, (Conv2DParameters, FcParameters)): + changing = True + nid = NodeId(node, fnode) + qrec = G.quantization[nid] + if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): + qrec.out_qs[0] = qrec.out_qs[0].wrapped + + nid = NodeId(node) + qrec = G.quantization[nid] + if isinstance(qrec.out_qs[idx], SymmetricMultQTypeWrapper): + qrec.out_qs[idx] = qrec.out_qs[idx].wrapped + + def change_input_to_async(self, G, node, idx): + if isinstance(node, ConvFusionParameters): + for fnode in node.contained_nodes(): + nid = NodeId(node, fnode) + qrec = G.quantization[nid] + if isinstance(fnode, (Conv2DParameters, FcParameters)): + if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper): + qrec.in_qs[0] = qrec.in_qs[0].wrapped + qrec.biases_q.link(qrec.weights_q, qrec.in_qs[0]) + return + if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper): + qrec.in_qs[0] = qrec.in_qs[0].wrapped + if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): + qrec.out_qs[0] = qrec.out_qs[0].wrapped + + nid = NodeId(node) + qrec = G.quantization[nid] + if isinstance(qrec.in_qs[idx], SymmetricMultQTypeWrapper): + qrec.in_qs[idx] = qrec.in_qs[idx].wrapped + if isinstance(node, (Conv2DParameters, FcParameters)): + qrec.biases_q.link(qrec.weights_q, qrec.in_qs[idx]) + if isinstance(node, OutputParameters) and isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): + qrec.out_qs[0] = qrec.out_qs[0].wrapped + + def do_change(self, G, node, idx=0): + self.change_output_to_async(G, node, idx) + for edge in G.out_edges(node.name): + if isinstance(edge.to_node, CAN_PROPAGATE_INPUT): + self.change_input_to_async(G, edge.to_node, edge.to_idx) + self.do_change(G, edge.to_node, edge.from_idx) + else: + assert isinstance(edge.to_node, CAN_CHANGE_INPUT) + if isinstance(edge.to_node, ConvFusionParameters): + filters = edge.to_node.contained_filters() + assert len(filters) == 1 and not filters[0].padding.has_padding + if isinstance(edge.to_node, Conv2DParameters): + assert not edge.to_node.padding.has_padding + self.change_input_to_async(G, edge.to_node, edge.to_idx) + + def match(self, G: GraphView, set_identity: bool = True): + if not G.quantization: + return + input_dict = {} + for node in G.nodes(): + if not self.can_change_output(node): + continue + all_matches = [] + for succ in [succ for succs in G.successors(node.name) for succ in succs]: + matches = self.can_change_input(G, succ) + if matches is None: + all_matches = None + break + all_matches += matches + if all_matches is None: + continue + input_dict[node] = all_matches + + input_dict = self.validate_multi_input(G, input_dict) + for node in input_dict: + # all nodes that can currently change output have one output + self.do_change(G, node) + + if set_identity: + self.set_identity(G) diff --git a/tools/nntool/graph/matches/find_hsigmoid.py b/tools/nntool/graph/matches/find_hsigmoid.py new file mode 100644 index 000000000..1c2a1e7c6 --- /dev/null +++ b/tools/nntool/graph/matches/find_hsigmoid.py @@ -0,0 +1,208 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 
of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging +import math +from graph.types import MatrixMulParameters, ConvFusionParameters, FcParameters, HSigmoidActivationParameters, ConstantInputParameters, FilterParameters, ReluActivationParameters, MatrixBroadcastedLinearOpParameters, MatrixAddParameters, MatrixSubParameters, MatrixDivParameters, MatrixMulParameters + +from utils.graph import GraphView, Edge, Node +from utils.graph_matcher import NodeMatch, MatchNodeByClass, EdgeMatch, GraphMatcher, MatchFinishSuccess +from utils.node_id import NodeId +from quantization.multiplicative.mult_quantization import MultQuantizationRecordBase +from .matcher import Matcher, MatchNode, DefaultMatcher +from quantization.symmetric.symmetric_quantization import ( + SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord) +from quantization.multiplicative.symmetric.symmetric_mult_qtype_wrapper import SymmetricMultQTypeWrapper +from quantization.multiplicative.mult_quantization import ( + MultQuantizationRecord, MultScalableFilterQuantizationRecord) +from quantization.float32.float32_quantization import ( + Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord) + +LOG = logging.getLogger("nntool." + __name__) + + +def check_equals(G, node, val): + if node.value is None or len(node.value) != 1: + return False + + if G.has_quantized_parameters: + qrec = G.quantization[NodeId(node)] + if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): + node_val = qrec.out_qs[0].wrapped.dequantize(node.value) + else: + node_val = qrec.out_qs[0].dequantize(node.value) + else: + node_val = node.value + node_val = node_val.reshape((1,))[0] + if val < 0: + node_val = 1.0/node_val + val = 1.0/val + return math.floor(0.5 + node_val) == math.floor(0.5 + val) + +# Matches filter -> mul with 1/6th constant + + +class MatchCloseHSigmoid(DefaultMatcher): + NAME = 'match_close_hsigmoid' + DESCRIPTION = 'Match relu6 followed by matmul with 1/6 constant and replaces with hsigmoid activation' + + def match_function(self, G: GraphView): + sub = GraphView() + sub.add_node(MatchNode('0', matcher=lambda node: + isinstance(node, ReluActivationParameters) and node.upper_bound == 6)) + sub.add_node(MatchNode('1', matcher=lambda node: + isinstance(node, MatrixMulParameters))) + sub.add_node(MatchNode('2', matcher=lambda node: + isinstance(node, ConstantInputParameters) and check_equals(G, node, 1.0/6.0))) + sub.add_edge(Edge('0', '1', to_idx=0)) + sub.add_edge(Edge('2', '1', to_idx=1)) + + return G.match_fragment(sub) + + def replace_function(self, G: GraphView, subgraph: GraphView): + relu_node = None + constant_node = None + mul_node = None + for node in subgraph.nodes(): + if isinstance(node, ReluActivationParameters): + relu_node = node + elif isinstance(node, ConstantInputParameters): + constant_node = node + elif isinstance(node, MatrixMulParameters): + mul_node = node + + activation = HSigmoidActivationParameters(mul_node.name + "_fused_close_hsigmoid", offset=0) + + if G.quantization: + reluqrec = G.quantization[NodeId(relu_node)] + mulqrec = G.quantization[NodeId(mul_node)] + del 
G.quantization[NodeId(constant_node)] + if isinstance(reluqrec, (SymmetricQuantizationRecord)): + pqrec = SymmetricQuantizationRecord( + in_qs=reluqrec.in_qs, out_qs=mulqrec.out_qs) + elif isinstance(reluqrec, (MultQuantizationRecord)): + pqrec = MultQuantizationRecord(in_qs=reluqrec.in_qs, out_qs=mulqrec.out_qs) + elif isinstance(reluqrec, (Float32QuantizationRecord)): + pqrec = Float32QuantizationRecord(in_qs=reluqrec.in_qs, out_qs=mulqrec.out_qs) + else: + raise NotImplementedError() + G.quantization[NodeId(activation)] = pqrec + return activation + + +def look_back(G, node, state=None): + # TODO - Pass through nodes that don't modify the tensor contents + if state is None: + state = {'relu1': None, 'add': None, 'relu2': None, 'mul': None, 'relu3': None} + qrec = G.quantization.get(NodeId(node)) + if not isinstance(qrec, MultQuantizationRecordBase): + return None + if isinstance(node, ReluActivationParameters): + if state['add']: + state['relu1'] = None # (node, qrec) + elif node.upper_bound == 6: + state['relu2'] = (node, qrec) + else: + return None + return look_back(G, G.in_edges(node.name)[0].from_node, state=state) + elif isinstance(node, MatrixBroadcastedLinearOpParameters): + edges = G.in_edges(node.name) + if isinstance(edges[0].from_node, ConstantInputParameters): + const_edge_idx = 0 + nonconst_edge = edges[1] + elif isinstance(edges[1].from_node, ConstantInputParameters): + const_edge_idx = 1 + nonconst_edge = edges[0] + else: + return None + const_node = edges[const_edge_idx].from_node + if len(const_node.value) != 1: + return None + if isinstance(node, MatrixMulParameters): + if state['mul']: + return None + state['mul'] = (node, qrec, const_node) + elif isinstance(node, MatrixAddParameters): + if state['add'] or not check_equals(G, const_node, 3): + return None + state['add'] = (node, qrec, const_node) + else: + return None + return look_back(G, nonconst_edge.from_node, state=state) + else: + if state['add'] and state['relu2'] and state['mul']: + return state + return None + + +def process_rec(G, oprec): + mul_node = oprec['mul'][0] + activation = HSigmoidActivationParameters(mul_node.name + "_fused_far_hsigmoid") + G.add_node(activation) + mulqrec = G.quantization[NodeId(mul_node)] + G.quantization[NodeId(activation)] = mulqrec + if oprec['relu1'] is not None: + mulqrec.in_qs = oprec['relu1'][1].in_qs + del G.quantization[NodeId(oprec['relu1'][0])] + for edge in G.in_edges(oprec['relu1'][0].name): + G.add_edge(Edge(from_node=edge.from_node, from_idx=edge.from_idx, to_node=activation.name)) + G.remove(oprec['relu1'][0]) + else: + mulqrec.in_qs = oprec['add'][1].in_qs + for edge in G.in_edges(oprec['add'][0].name): + G.add_edge(Edge(from_node=edge.from_node, from_idx=edge.from_idx, to_node=activation.name)) + if oprec['relu3'] is not None: + mulqrec.out_qs = oprec['relu3'][1].out_qs + del G.quantization[NodeId(oprec['relu3'][0])] + for edge in G.out_edges(oprec['relu3'][0].name): + G.add_edge(Edge(to_node=edge.to_node, to_idx=edge.to_idx, from_node=activation.name)) + G.remove(oprec['relu3'][0]) + else: + for edge in G.out_edges(oprec['mul'][0].name): + G.add_edge(Edge(to_node=edge.to_node, to_idx=edge.to_idx, from_node=activation.name)) + + del G.quantization[NodeId(oprec['relu2'][0])] + G.remove(oprec['relu2'][0]) + for node_type in ('add', 'mul'): + del G.quantization[NodeId(oprec[node_type][0])] + G.remove(oprec[node_type][0]) + del G.quantization[NodeId(oprec[node_type][2])] + G.remove(oprec[node_type][2]) + + +class MatchFarHSigmoid(Matcher): + NAME = 
'match_far_hsigmoid' + DESCRIPTION = 'Looks for quantized HSigmoid - [Relu] -> Add 3 -> Relu6 -> Mul 1/6 -> [Relu]' + + def match(self, G: GraphView, set_identity: bool = True): + const_ops = [node for node in G.nodes() + if isinstance(node, MatrixMulParameters) + and any([isinstance(edge.from_node, ConstantInputParameters) + and check_equals(G, edge.from_node, 1.0/6.0) + for edge in G.in_edges(node.name)])] + + oprecs = [oprec for oprec in (look_back(G, op) + for op in const_ops) + if oprec is not None] + for oprec in oprecs: + mul_edge = G.out_edges(oprec['mul'][0].name) + if len(mul_edge) == 1: + mul_edge = mul_edge[0] + if isinstance(mul_edge.to_node, ReluActivationParameters): + oprec['relu3'] = (mul_edge.to_node, G.quantization[NodeId(mul_edge.to_node)]) + process_rec(G, oprec) + + if set_identity: + self.set_identity(G) diff --git a/tools/nntool/graph/matches/find_missing_quantization.py b/tools/nntool/graph/matches/find_missing_quantization.py new file mode 100644 index 000000000..a0d0b846d --- /dev/null +++ b/tools/nntool/graph/matches/find_missing_quantization.py @@ -0,0 +1,98 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +import logging +from copy import deepcopy + +from graph.matches.matcher import Matcher +from quantization.multiplicative.mult_quantization import MultQuantizationRecord +from utils.graph import Edge, GraphView +from utils.node_id import NodeId + +LOG = logging.getLogger("nntool." 
+ __name__) + +def reduce_qtypes(qtypes): + max_idx = max(idx for idx, _ in qtypes) + res = [None] * (max_idx + 1) + for idx, qtype in qtypes: + if res[idx] is None: + res[idx] = qtype + elif qtype != res[idx]: + raise ValueError("qtypes are not compatible on index %s" % idx) + return res + + +class FindMissingQuantization(Matcher): + NAME = "find_missing_quantization" + DESCRIPTION = """Propagate quantization to nodes that have none""" + + + def match(self, G: GraphView, set_identity: bool = True): + if not G.quantization: + return + for nid in [nid for nid, qrec in G.quantization.sorted_iterator(G) if qrec is None or not (qrec.in_qs and qrec.out_qs)]: + if nid.fnode_name: + LOG.warning("can't add quantization to fused node %s", nid.fnode_name) + continue + if nid.node_name not in G: + # previous fusions may have removed nodes from the graph + continue + + node = nid.get_node(G) + predecessors = [NodeId(pred) for pred in G.predecessors(node.name)] + successors = [NodeId(succ) for succs in G.successors(node.name) for succ in succs] + go_back = not successors or (predecessors and all(pred in G.quantization for pred in predecessors)) + go_forward = not predecessors or (successors and all(succ in G.quantization for succ in successors)) + + if not (go_back or go_forward): + LOG.warning("node %s is not connected to anything and has no quantization", node.name) + continue + + if go_forward: + out_qrecs = set(G.quantization[nid] for nid in successors) + if not all(isinstance(out_qrec, MultQuantizationRecord) for out_qrec in out_qrecs): + continue + out_qtypes = reduce_qtypes([(edge.from_idx, G.quantization[NodeId(edge.to_node)].in_qs[edge.to_idx]) + for edge in G.out_edges(node.name)]) + else: + out_qtypes = None + if go_back: + in_qrecs = set(G.quantization[nid] for nid in predecessors) + if not all(isinstance(in_qrec, MultQuantizationRecord) for in_qrec in in_qrecs): + continue + in_qtypes = reduce_qtypes([(edge.to_idx, G.quantization[NodeId(edge.from_node)].out_qs[edge.from_idx]) + for edge in G.in_edges(node.name)]) + else: + in_qtypes = None + + if not in_qtypes: + if not predecessors: + LOG.info("setting quantization on input node %s", node.name) + qrec = MultQuantizationRecord(in_qs=deepcopy(out_qtypes), out_qs=deepcopy(out_qtypes)) + else: + raise NotImplementedError("propagating qrecs not implemented") + elif not out_qtypes: + if not successors: + LOG.info("setting quantization on output node %s", node.name) + qrec = MultQuantizationRecord(in_qs=deepcopy(in_qtypes), out_qs=deepcopy(in_qtypes)) + else: + raise NotImplementedError("propagating qrecs not implemented") + else: + LOG.info("setting quantization on node %s", node.name) + qrec = MultQuantizationRecord(in_qs=deepcopy(in_qtypes), out_qs=deepcopy(out_qtypes)) + + G.quantization[nid] = qrec + + if set_identity: + self.set_identity(G) diff --git a/tools/nntool/graph/matches/fuse_pad.py b/tools/nntool/graph/matches/fuse_pad.py index 68b9aad49..03c5eab7e 100644 --- a/tools/nntool/graph/matches/fuse_pad.py +++ b/tools/nntool/graph/matches/fuse_pad.py @@ -51,5 +51,6 @@ def replace_function(self, G: GraphView, subgraph: GraphView): filter_like_node.padding = pad_node.padding filter_like_node.pad_type = "zero" - + if G.quantization: + G.quantization.remove_node(pad_node) return filter_like_node diff --git a/tools/nntool/graph/matches/match_external_bias.py b/tools/nntool/graph/matches/match_external_bias.py index 05383fe46..872b2428c 100644 --- a/tools/nntool/graph/matches/match_external_bias.py +++ 
b/tools/nntool/graph/matches/match_external_bias.py @@ -17,6 +17,7 @@ from graph.types import FilterParameters, MatrixAddParameters, ConstantInputParameters from utils.graph import MatchNode, GraphView, Edge +from utils.node_id import NodeId from .matcher import DefaultMatcher, DontReplaceError @@ -58,4 +59,65 @@ def replace_function(self, G: GraphView, subgraph: GraphView): filter_node.biases = flattened_constant else: raise DontReplaceError() + if G.quantization: + fnid = NodeId(filter_node) + cnid = NodeId(constant_node) + if fnid in G.quantization and cnid in G.quantization: + G.quantization[fnid].biases_q = G.quantization[cnid].out_qs[0] + return filter_node + +class MatchExternalBiasSQ8(DefaultMatcher): + NAME = 'fuse_external_bias_sq8' + DESCRIPTION = 'Fuse bias addition after filter with filter bias' + + def match_function(self, G: GraphView): + sub = GraphView() + sub.add_node(MatchNode('0', matcher=lambda node:\ + isinstance(node, FilterParameters))) + sub.add_node(MatchNode('1', matcher=lambda node:\ + isinstance(node, MatrixAddParameters))) + sub.add_node(MatchNode('2', matcher=lambda node:\ + isinstance(node, ConstantInputParameters))) + sub.add_edge(Edge('0', '1', to_idx=0)) + sub.add_edge(Edge('2', '1', to_idx=1)) + + return G.match_fragment(sub) + + def replace_function(self, G: GraphView, subgraph: GraphView): + filter_node = None + constant_node = None + for node in subgraph.nodes(): + if isinstance(node, FilterParameters): + filter_node = node + elif isinstance(node, ConstantInputParameters): + constant_node = node + flattened_constant = constant_node.value.flatten() + if G.quantization: + fnid = NodeId(filter_node) + cnid = NodeId(constant_node) + if fnid in G.quantization and cnid in G.quantization: + biases_q = G.quantization[fnid].biases_q + const_q = G.quantization[cnid].out_qs[0] + + # shape needs to match + if flattened_constant.shape[0] == filter_node.filter.out_c: + if filter_node.has_bias: + assert filter_node.biases is not None, "can't absorb bias into filter. 
maybe weights are not loaded" + if G.quantization: + #dequantize the constants + flattened_constant_dq = const_q.get_dequantized(flattened_constant) + biases_dq = biases_q.get_dequantized(filter_node.biases) + #sum the floats and requantize at biases_q scale + filter_node.biases = biases_q.quantize(flattened_constant_dq + biases_dq) + else: + filter_node.biases += flattened_constant + else: + if G.quantization: + #dequantize the constants + flattened_constant_dq = const_q.get_dequantized(flattened_constant) + filter_node.biases = biases_q.get_quantized(flattened_constant_dq) + else: + filter_node.biases = flattened_constant + else: + raise DontReplaceError() return filter_node diff --git a/tools/nntool/graph/matches/match_gap_conv.py b/tools/nntool/graph/matches/match_gap_conv.py index bddb563ca..486f0324d 100644 --- a/tools/nntool/graph/matches/match_gap_conv.py +++ b/tools/nntool/graph/matches/match_gap_conv.py @@ -15,17 +15,26 @@ import logging -from graph.types import Conv2DParameters, ConvFusionParameters, PoolingParameters, ActivationParameters -from utils.graph import MatchNode, GraphView, Edge - -from .matcher import DefaultMatcher, MatchGroup, DontReplaceError +from graph.types import (ActivationParameters, Conv2DParameters, + ConvFusionParameters, PoolingParameters) +from quantization.symmetric.symmetric_quantization import ( + SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord) +from quantization.multiplicative.mult_quantization import ( + MultQuantizationRecord, MultScalableFilterQuantizationRecord) +from quantization.float32.float32_quantization import ( + Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord) +from utils.graph import Edge, GraphView, MatchNode +from utils.node_id import NodeId + +from .matcher import DefaultMatcher, DontReplaceError, MatchGroup LOG = logging.getLogger("nntool." 
+ __name__) class MatchGapConv(DefaultMatcher): - def __init__(self, match_activation=True, match_pool=False, pool_after_activation=False): + def __init__(self, *args, match_activation=True, match_pool=False, pool_after_activation=False, **kwargs): + super(MatchGapConv, self).__init__(*args, **kwargs) assert match_activation or match_pool, "not very interesting to just match conv" self.match_activation = match_activation self.match_pool = match_pool @@ -118,7 +127,21 @@ def replace_function(self, G: GraphView, subgraph: GraphView): LOG.debug("fused nodes %s", ",".join((node.name for node in subgraph.nodes()))) # simple node order is necessary because nodes() will not necessarily # be in order - return ConvFusionParameters(conv_name, self.fusion_type, subgraph) + pnode = ConvFusionParameters(conv_name, self.fusion_type, subgraph) + if G.quantization: + qrecs = G.quantization.get_all(subgraph.nodes()) + if qrecs: + if isinstance(qrecs[0], (SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord)): + prec = SymmetricQuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (MultQuantizationRecord, MultScalableFilterQuantizationRecord)): + prec = MultQuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord)): + prec = Float32QuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + for node in subgraph.nodes(): + G.quantization.move_to_fusion(node, pnode) + G.quantization[NodeId(pnode)] = prec + return pnode + class MatchAllGapConv(MatchGroup): NAME = 'fuse_gap_convs' diff --git a/tools/nntool/graph/matches/match_gap_linear.py b/tools/nntool/graph/matches/match_gap_linear.py index 663ca87ee..24d52a151 100644 --- a/tools/nntool/graph/matches/match_gap_linear.py +++ b/tools/nntool/graph/matches/match_gap_linear.py @@ -15,8 +15,17 @@ import logging -from graph.types import FcParameters, ActivationParameters, ConvFusionParameters -from utils.graph import MatchNode, GraphView, Edge +from graph.nngraph import NNGraph +from graph.types import (ActivationParameters, ConvFusionParameters, + FcParameters) +from quantization.symmetric.symmetric_quantization import ( + SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord) +from quantization.multiplicative.mult_quantization import ( + MultQuantizationRecord, MultScalableFilterQuantizationRecord) +from quantization.float32.float32_quantization import ( + Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord) +from utils.graph import Edge, GraphView, MatchNode +from utils.node_id import NodeId from .matcher import DefaultMatcher @@ -49,7 +58,7 @@ def match_function(self, G: GraphView): sub.add_edge(Edge('0', '1')) return G.match_fragment(sub) - def replace_function(self, G: GraphView, subgraph: GraphView): + def replace_function(self, G: NNGraph, subgraph: GraphView): step = 0 for node in subgraph.nodes(): node.step_idx = step @@ -61,4 +70,18 @@ def replace_function(self, G: GraphView, subgraph: GraphView): (node.name for node in subgraph.nodes()))) # simple node order is necessary because nodes() will not necessarily # be in order - return ConvFusionParameters(linear_name, "linear_active", subgraph) + pnode = ConvFusionParameters(linear_name, "linear_active", subgraph) + if G.quantization: + qrecs = G.quantization.get_all(subgraph.nodes()) + if qrecs: + if isinstance(qrecs[0], (SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord)): + prec = 
SymmetricQuantizationRecord( + in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (MultQuantizationRecord, MultScalableFilterQuantizationRecord)): + prec = MultQuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord)): + prec = Float32QuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + for node in subgraph.nodes(): + G.quantization.move_to_fusion(node, pnode) + G.quantization[NodeId(pnode)] = prec + return pnode diff --git a/tools/nntool/graph/matches/match_gap_pool.py b/tools/nntool/graph/matches/match_gap_pool.py index 4ea4cf2f8..4b252e2bc 100644 --- a/tools/nntool/graph/matches/match_gap_pool.py +++ b/tools/nntool/graph/matches/match_gap_pool.py @@ -15,8 +15,16 @@ import logging -from graph.types import PoolingParameters, ActivationParameters, ConvFusionParameters -from utils.graph import MatchNode, GraphView, Edge +from graph.types import (ActivationParameters, ConvFusionParameters, + PoolingParameters) +from quantization.symmetric.symmetric_quantization import ( + SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord) +from quantization.multiplicative.mult_quantization import ( + MultQuantizationRecord, MultScalableFilterQuantizationRecord) +from quantization.float32.float32_quantization import ( + Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord) +from utils.graph import Edge, GraphView, MatchNode +from utils.node_id import NodeId from .matcher import DefaultMatcher @@ -61,4 +69,18 @@ def replace_function(self, G: GraphView, subgraph: GraphView): (node.name for node in subgraph.nodes()))) # simple node order is necessary because nodes() will not necessarily # be in order - return ConvFusionParameters(pool_name, "pool_active", subgraph) + pnode = ConvFusionParameters(pool_name, "pool_active", subgraph) + if G.quantization: + qrecs = G.quantization.get_all(subgraph.nodes()) + if qrecs: + if isinstance(qrecs[0], (SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord)): + prec = SymmetricQuantizationRecord( + in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (MultQuantizationRecord, MultScalableFilterQuantizationRecord)): + prec = MultQuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord)): + prec = Float32QuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + for node in subgraph.nodes(): + G.quantization.move_to_fusion(node, pnode) + G.quantization[NodeId(pnode)] = prec + return pnode diff --git a/tools/nntool/graph/matches/match_op_activation.py b/tools/nntool/graph/matches/match_op_activation.py new file mode 100644 index 000000000..e72bc4de6 --- /dev/null +++ b/tools/nntool/graph/matches/match_op_activation.py @@ -0,0 +1,89 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging +from abc import abstractclassmethod + +from graph.nngraph import NNGraph +from graph.types import (ActivationFusion, ActivationParameters, + GlobalPoolParameters, MatrixAddParameters, + MatrixMulParameters, PoolingParameters) +from quantization.float32.float32_quantization import ( + Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord) +from quantization.multiplicative.mult_quantization import ( + MultQuantizationRecord, MultScalableFilterQuantizationRecord) +from quantization.symmetric.symmetric_quantization import ( + SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord) +from utils.graph import Edge, GraphView, MatchNode +from utils.node_id import NodeId + +from .matcher import DefaultMatcher + +LOG = logging.getLogger("nntool." + __name__) + + +class MatchOpActivation(DefaultMatcher): + + @abstractclassmethod + def valid_node_classes(cls): + pass + + def match_function(self, G: GraphView): + sub = GraphView() + sub.add_node(MatchNode('0', + matcher=lambda node: + isinstance(node, self.valid_node_classes()))) + sub.add_node(MatchNode('1', matcher=lambda node: + isinstance(node, ActivationParameters))) + sub.add_edge(Edge('0', '1')) + return G.match_fragment(sub) + + def replace_function(self, G: NNGraph, subgraph: GraphView): + nodes = list(subgraph.nodes()) + pnode = ActivationFusion(nodes[0].name + "fusion", nodes[0].op_name + "_active", subgraph) + nodes[0].step_idx = 0 + nodes[1].step_idx = 1 + LOG.debug("fused nodes %s", ",".join( + (node.name for node in nodes))) + if G.quantization: + qrecs = G.quantization.get_all(subgraph.nodes()) + if qrecs: + if isinstance(qrecs[0], (SymmetricQuantizationRecord, SymmetricScalableFilterQuantizationRecord)): + prec = SymmetricQuantizationRecord( + in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (MultQuantizationRecord, MultScalableFilterQuantizationRecord)): + prec = MultQuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + elif isinstance(qrecs[0], (Float32QuantizationRecord, Float32ScalableFilterQuantizationRecord)): + prec = Float32QuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) + for node in subgraph.nodes(): + G.quantization.move_to_fusion(node, pnode) + G.quantization[NodeId(pnode)] = prec + return pnode + + +class MatchOpActivationScaleKernels(MatchOpActivation): + NAME = 'fuse_op_activation_scale8' + DESCRIPTION = 'Fuse non-filter nodes and activations to match GAP AutoTiler SQ8 kernels' + @classmethod + def valid_node_classes(cls): + return (PoolingParameters, GlobalPoolParameters, MatrixAddParameters, MatrixMulParameters) + + +class MatchOpActivationPow2Kernels(MatchOpActivation): + NAME = 'fuse_op_activation_pow2' + DESCRIPTION = 'Fuse non-filter nodes and activations to match GAP AutoTiler POW2 kernels' + @classmethod + def valid_node_classes(cls): + return (PoolingParameters, MatrixAddParameters, MatrixMulParameters) diff --git a/tools/nntool/graph/matches/matches.py b/tools/nntool/graph/matches/matches.py index c3c8b2f62..0b8193d08 100644 --- a/tools/nntool/graph/matches/matches.py +++ b/tools/nntool/graph/matches/matches.py @@ -13,24 +13,43 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
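The fusion matchers above all build the fused node's quantization record the same way: input qtypes taken from the first contained node, output qtypes from the last. A schematic sketch of that construction, where QRec is an illustrative stand-in for the real quantization record classes:

from dataclasses import dataclass, field
from typing import List

@dataclass
class QRec:
    in_qs: List[str] = field(default_factory=list)
    out_qs: List[str] = field(default_factory=list)

def fuse_qrecs(qrecs):
    # Mirrors e.g. MultQuantizationRecord(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
    return QRec(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)

conv = QRec(in_qs=['in_q8'], out_qs=['mid_q32'])
act = QRec(in_qs=['mid_q32'], out_qs=['out_q8'])
assert fuse_qrecs([conv, act]) == QRec(in_qs=['in_q8'], out_qs=['out_q8'])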
-from .remove_unused_concats import RemoveUnusedConcats -from .match_gap_conv import MatchAllGapConv -from .fuse_pad import MatchFusePad +from .equalize_sym_mult_concats import \ + EqualizeSymmetricMultiplicativeQuantivedConcats from .expand_transposes import ExpandTransposesMatcher -from .move_activation import MoveActivationsMatcher +from .find_missing_quantization import FindMissingQuantization +from .fuse_pad import MatchFusePad +from .match_external_bias import MatchExternalBias, MatchExternalBiasSQ8 +from .match_gap_conv import MatchAllGapConv from .match_gap_linear import MatchGapLinear from .match_gap_pool import MatchGapPool -from .match_external_bias import MatchExternalBias -from .matscale import FuseMatScalePair, FuseMatScale - from .matcher import MatchGroup +from .matscale import FuseMatScale, FuseMatScalePair +from .move_activation import MoveActivationsMatcherScale8, MoveActivationsMatcherPow2 +from .propagate_softmax_sym_mult_qrec import PropagateSoftmaxSymQrec +from .remove_noops import RemoveNoOPs +from .remove_unused_concats import RemoveUnusedConcats +from .find_asymmetric_quantization import FindAsymmetricQuantization +from .match_op_activation import MatchOpActivationPow2Kernels, MatchOpActivationScaleKernels +from .find_hsigmoid import MatchCloseHSigmoid, MatchFarHSigmoid +from .remove_relus import RemoveRelusMatch -ALL_MATCH_CLASSES = [MatchExternalBias, MatchFusePad, RemoveUnusedConcats, - MoveActivationsMatcher, MatchAllGapConv, MatchGapPool, - MatchGapLinear, ExpandTransposesMatcher, FuseMatScalePair, FuseMatScale] -STD_MATCH_CLASSES = [MatchExternalBias, MatchFusePad, RemoveUnusedConcats, - MoveActivationsMatcher, MatchAllGapConv, ExpandTransposesMatcher, +ALL_MATCH_CLASSES = [RemoveRelusMatch, RemoveNoOPs, MatchExternalBias, MatchFusePad, RemoveUnusedConcats, + FindMissingQuantization, MatchFarHSigmoid, MatchCloseHSigmoid, MoveActivationsMatcherScale8, + MoveActivationsMatcherPow2, + EqualizeSymmetricMultiplicativeQuantivedConcats, + MatchAllGapConv, MatchGapPool, MatchOpActivationScaleKernels, + MatchOpActivationPow2Kernels, + MatchGapLinear, ExpandTransposesMatcher, FindAsymmetricQuantization, FuseMatScalePair, FuseMatScale] +POW2_MATCH_CLASSES = [RemoveRelusMatch, RemoveNoOPs, MatchExternalBias, MatchFusePad, + RemoveUnusedConcats, FindMissingQuantization, MatchCloseHSigmoid, + MoveActivationsMatcherPow2, ExpandTransposesMatcher, MatchAllGapConv, MatchGapLinear, + EqualizeSymmetricMultiplicativeQuantivedConcats] +SCALE8_MATCH_CLASSES = [RemoveRelusMatch, RemoveNoOPs, MatchExternalBiasSQ8, MatchFusePad, + RemoveUnusedConcats, FindMissingQuantization, + MatchFarHSigmoid, MatchCloseHSigmoid, MoveActivationsMatcherScale8, ExpandTransposesMatcher, + MatchAllGapConv, MatchGapLinear, MatchOpActivationScaleKernels, PropagateSoftmaxSymQrec, + EqualizeSymmetricMultiplicativeQuantivedConcats] FUSION_LIST = [((match_class.NAME, match_class.DESCRIPTION), match_class()) for match_class in ALL_MATCH_CLASSES] @@ -40,16 +59,25 @@ def get_fusions(): return [(match_class.NAME, match_class.DESCRIPTION) for match_class in ALL_MATCH_CLASSES] -def get_std_match_group(): +def get_pow2_match_group(): + return MatchGroup( + *[match_class() for match_class in POW2_MATCH_CLASSES], + identity="pow2_match_group" + ) + + +def get_scale8_match_group(): return MatchGroup( - *[match_class() for match_class in STD_MATCH_CLASSES], + *[match_class() for match_class in SCALE8_MATCH_CLASSES], identity="std_match_group" ) def get_fusion(name): - if name == "std_match_group": - return 
get_std_match_group() + if name in ["pow2_match_group"]: + return get_pow2_match_group() + if name in ["std_match_group", "scale8_match_group"]: + return get_scale8_match_group() match_class = next((match_class for match_class in ALL_MATCH_CLASSES if match_class.NAME == name), None) if match_class is not None: diff --git a/tools/nntool/graph/matches/move_activation.py b/tools/nntool/graph/matches/move_activation.py index 37ebf0627..29ef7bfa0 100644 --- a/tools/nntool/graph/matches/move_activation.py +++ b/tools/nntool/graph/matches/move_activation.py @@ -13,19 +13,17 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import logging +from copy import deepcopy +from graph.types import (ActivationParameters, ConcatParameters, + Conv2DParameters, FcParameters, GlobalPoolParameters, + MatrixAddParameters, MatrixMulParameters, + PoolingParameters, ReshapeParameters, + TransposeParameters) +from utils.graph import Edge, GraphView +from utils.node_id import NodeId -from utils.graph import GraphView, Edge - -from ..types.conv2d import Conv2DParameters -from ..types.linear import FcParameters -from ..types.others import (ActivationParameters, ConcatParameters, - ReshapeParameters, TransposeParameters) -from ..types.pooling import PoolingParameters from .matcher import Matcher -VALID_FUSIONS = (Conv2DParameters, FcParameters, PoolingParameters) -VALID_NODES_TO_PASS = (ReshapeParameters, TransposeParameters) - LOG = logging.getLogger("nntool." + __name__) @@ -34,9 +32,9 @@ class LocationNotFoundError(Exception): class MoveActivationsMatcher(Matcher): - NAME = "move_activations" - DESCRIPTION = "Tries to move activations so they are after layers that they can be fused with. \ - Should be run before match_gap_* fusions." 
+ + ValidNodesToPass = None + ValidFusions = None def find_home_for_activation(self, G, @@ -52,18 +50,20 @@ def find_home_for_activation(self, yield from self.find_home_for_activation(G, activation, edge=in_edge) - elif isinstance(edge.from_node, VALID_NODES_TO_PASS): + elif isinstance(edge.from_node, self.ValidNodesToPass): in_edge = G.in_edges(edge.from_node.name)[0] yield from self.find_home_for_activation(G, activation, edge=in_edge) - elif isinstance(edge.from_node, VALID_FUSIONS): + elif isinstance(edge.from_node, self.ValidFusions): yield edge else: raise LocationNotFoundError() @staticmethod def move_activation(G, activation, edges): + nid = NodeId(activation) + qrec = G.quantization[nid] if G.quantization and nid in G.quantization else None ain_edge = G.in_edges(activation.name)[0] aout_edge = G.out_edges(activation.name)[0] G.remove(activation) @@ -83,12 +83,19 @@ def move_activation(G, activation, edges): new_activation.out_dims = [edge.to_node.in_dims[edge.to_idx].clone()] G.insert_node(new_activation, edge.from_node, edge.to_node, from_idx=edge.from_idx, to_idx=edge.to_idx) + if qrec: + from_qrec = G.quantization[NodeId(edge.from_node)] + new_qrec = deepcopy(qrec) + new_qrec.in_qs[0] = deepcopy(from_qrec.out_qs[edge.from_idx]) + G.quantization[NodeId(new_activation)] = new_qrec + G.quantization.propagate( + G, new_activation, new_edge.from_node, qtype=new_qrec.out_qs[0]) def match(self, G: GraphView, set_identity: bool = True): activations = [node for node in G.nodes( ) if isinstance(node, ActivationParameters)] activations = filter(lambda n: not isinstance( - G.in_edges(n.name)[0].from_node, VALID_FUSIONS), activations) + G.in_edges(n.name)[0].from_node, self.ValidFusions), activations) can_be_moved = [] for activation in activations: try: @@ -104,6 +111,20 @@ def match(self, G: GraphView, set_identity: bool = True): self.set_identity(G) -# Find activation -# check node in front -# if it isn't conv, linear or pool +class MoveActivationsMatcherScale8(MoveActivationsMatcher): + NAME = "move_activations_scale8" + DESCRIPTION = "Tries to move activations so they are after layers that they can be fused with. \ + Should be run before match_gap_* fusions. Compatible with AutoTiler SQ8 kernels." + + ValidNodesToPass = (ReshapeParameters, TransposeParameters) + ValidFusions = (Conv2DParameters, FcParameters, PoolingParameters, PoolingParameters, + GlobalPoolParameters, MatrixAddParameters, MatrixMulParameters) + + +class MoveActivationsMatcherPow2(MoveActivationsMatcher): + NAME = "move_activations_pow2" + DESCRIPTION = "Tries to move activations so they are after layers that they can be fused with. \ + Should be run before match_gap_* fusions. Compatible with AutoTiler POW2 kernels." + + ValidNodesToPass = (ReshapeParameters, TransposeParameters) + ValidFusions = (Conv2DParameters, FcParameters, PoolingParameters) diff --git a/tools/nntool/graph/matches/propagate_softmax_sym_mult_qrec.py b/tools/nntool/graph/matches/propagate_softmax_sym_mult_qrec.py new file mode 100644 index 000000000..d1b535f71 --- /dev/null +++ b/tools/nntool/graph/matches/propagate_softmax_sym_mult_qrec.py @@ -0,0 +1,45 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +from graph.matches.matcher import Matcher +from graph.types import SoftMaxParameters, OutputParameters +from quantization.multiplicative.mult_quantization import MultQuantizationRecord +from utils.graph import GraphView +from utils.node_id import NodeId +from .equalize_sym_mult_concats import propagate_qtype_up + +class PropagateSoftmaxSymQrec(Matcher): + NAME = "propagate_softmax_sym_qrec" + DESCRIPTION = """Set input qrec of softmaxes to pow2 and propagate up""" + + def match(self, G: GraphView, set_identity: bool = True): + if not G.quantization: + return + softmaxes = [node for node in G.nodes() if isinstance(node, SoftMaxParameters)] + qrecs = [G.quantization[NodeId(node)] for node in softmaxes] + if not all(isinstance(qrec, MultQuantizationRecord) for qrec in qrecs): + return + for softmax, qrec in zip(softmaxes, qrecs): + in_q = qrec.in_qs[0] + in_q.scale_to_pow2() + for edge in G.in_edges(softmax.name): + propagate_qtype_up(G, in_q, edge) + for edge in G.out_edges(softmax.name): + assert isinstance(edge.to_node, OutputParameters), "Softmax is supported only at the end of the graph" + out_qrec = G.quantization[NodeId(edge.to_node)] + out_qrec.in_qs[0] = qrec.out_qs[0] + out_qrec.out_qs[0] = qrec.out_qs[0] + + if set_identity: + self.set_identity(G) diff --git a/tools/nntool/graph/matches/remove_noops.py b/tools/nntool/graph/matches/remove_noops.py new file mode 100644 index 000000000..e1bd5cd68 --- /dev/null +++ b/tools/nntool/graph/matches/remove_noops.py @@ -0,0 +1,33 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
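A small sketch of the scale handling behind PropagateSoftmaxSymQrec above; snap_to_pow2 is an illustrative stand-in for SymmetricMultQType.scale_to_pow2() and assumes it snaps to the nearest power of two, which may differ from the real implementation, and the node names are invented.

import math

def snap_to_pow2(scale):
    # Nearest power-of-two scale (assumption; the real method may round differently).
    return 2.0 ** round(math.log2(scale))

def propagate_up(out_scales, path, scale):
    # Give every producer on the path up from the softmax the same pow2 scale,
    # in the spirit of propagate_qtype_up() used above.
    for name in path:
        out_scales[name] = scale

scales = {'reshape_out': 0.03, 'conv_out': 0.03}
propagate_up(scales, ['reshape_out', 'conv_out'], snap_to_pow2(0.03))
assert scales == {'reshape_out': 0.03125, 'conv_out': 0.03125}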
+ +from utils.graph import GraphView, MatchNode, Node, Edge +from graph.types import NoOPParameters +from .matcher import DefaultMatcher + +class NoOPMatcher(MatchNode): + def _match(self, G: GraphView, node: Node, edge: Edge): + return isinstance(node, NoOPParameters) + +class RemoveNoOPs(DefaultMatcher): + NAME = "remove_noops" + DESCRIPTION = "Remove noop nodes" + def match_function(self, G: GraphView): + sub = GraphView() + sub.add_node(NoOPMatcher('0')) + return G.match_fragment(sub) + + def replace_function(self, G: GraphView, subgraph: GraphView): + return None diff --git a/tools/nntool/graph/matches/remove_relus.py b/tools/nntool/graph/matches/remove_relus.py new file mode 100644 index 000000000..e80571267 --- /dev/null +++ b/tools/nntool/graph/matches/remove_relus.py @@ -0,0 +1,121 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from graph.types import (ConstantInputParameters, HSigmoidActivationParameters, + MatrixAddParameters, MatrixDivParameters, + MatrixMulParameters, ReluActivationParameters, + ReshapeParameters, TransposeParameters, PoolingParameters, + ConcatParameters) +from utils.graph import Edge, GraphView +from utils.node_id import NodeId +from .matcher import Matcher + +def reduce_edges(in_edges, visited_edges): + status = [None, None] + for edge in in_edges: + edge_rec = visited_edges.get(edge) + if edge_rec is None: + return None + if edge_rec[0] is False: + return [False, False] + status[0] = True + if edge_rec[1] is False: + status[1] = False + elif status[1] is None or (status[1] is not False and edge_rec[1] > status[1]): + status[1] = edge_rec[1] + return status + + +def find_redundant_relus(G, node, visited_edges): + # status 0 is relued + # status 1 is upper bound + status = reduce_edges(G.in_edges(node.name), visited_edges) + if status is None: + return [] + nodes_to_remove = [] + if isinstance(node, ReluActivationParameters): + if status[0]: + # this relu has an upper bound + if node.upper_bound is not None: + # if we are already relued less than or equal to that bound + if status[1] is not False: + if status[1] <= node.upper_bound: + # remove this relu + nodes_to_remove.append(node) + else: + status[1] = node.upper_bound + else: + # new bound + status[1] = node.upper_bound + else: + # we're already relued so this is redundant + nodes_to_remove.append(node) + if node.upper_bound is not None: + if status[1] is False or status[1] > node.upper_bound: + status[1] = node.upper_bound + else: + status[0] = True + if node.upper_bound is not None: + status[1] = node.upper_bound + elif isinstance(node, HSigmoidActivationParameters): + status[0] = True + if status[1] is None or status[1] > 1: + status[1] = 1 + elif isinstance(node, (MatrixAddParameters, MatrixDivParameters, MatrixMulParameters)): + status[1] = False + elif not isinstance(node, (ConstantInputParameters, ReshapeParameters, TransposeParameters, PoolingParameters, 
ConcatParameters)): + status = [False, False] + for edge in G.out_edges(node.name): + visited_edges[edge] = status + nodes_to_remove += find_redundant_relus(G, edge.to_node, visited_edges) + return nodes_to_remove + +class RemoveRelusMatch(Matcher): + NAME = 'remove_relus' + DESCRIPTION = 'Finds redundant relus in graph' + def match(self, G: GraphView, set_identity: bool = True): + visited_edges = {} + nodes_to_remove = [] + for node in G.inputs(): + # check if constantinput. if is then check if positive and check max value + if isinstance(node, ConstantInputParameters): + if node.value is not None: + if G.has_quantized_parameters: + qrec = G.quantization[NodeId(node)] + qtype = qrec.out_qs[0] + if hasattr(qtype, 'wrapped'): + qtype = qtype.wrapped + val = qtype.dequantize(node.value) + else: + val = node.value + if val.min() >= 0: + status = (True, val.max()) + else: + status = (False, False) + else: + status = (False, False) + + for edge in G.out_edges(node.name): + visited_edges[edge] = status + nodes_to_remove += find_redundant_relus(G, edge.to_node, visited_edges) + for node in nodes_to_remove: + # Only relus so only one in edge + in_edge = G.in_edges(node.name)[0] + for edge in G.out_edges(node.name): + G.add_edge(Edge(from_node=in_edge.from_node, + from_idx=in_edge.from_idx, + to_node=edge.to_node, + to_idx=edge.to_idx)) + G.remove(node) diff --git a/tools/nntool/graph/nngraph.py b/tools/nntool/graph/nngraph.py index af4d63e5f..2042aa008 100644 --- a/tools/nntool/graph/nngraph.py +++ b/tools/nntool/graph/nngraph.py @@ -17,40 +17,55 @@ import os from typing import Generator, Sequence, Union +from graph.dim import Dim +from graph.dump_tensor import PrintDumper, dump_tensor +from graph.graph_identity import GraphIdentity +from graph.manipulations import (add_dimensions, adjust_order, + balance_all_filters, balance_filter, + calculate_liveness) +from graph.types import (ConstantInputParameters, ConvFusionParameters, + InputBaseParameters, InputParameters, + MultiplicativeBiasParameters, OutputParameters) +from quantization.quantization_set import QuantizationSet from utils.graph import Graph, Node from utils.json_serializable import JsonSerializable from utils.node_id import NodeId - -from .dim import Dim -from .dump_tensor import PrintDumper, dump_tensor -from .graph_identity import GraphIdentity -from .manipulations import add_dimensions, adjust_order, calculate_liveness, balance_filter, balance_all_filters -from .types import (ConstantInputParameters, FilterParameters, - ConvFusionParameters, InputBaseParameters, InputParameters, - OutputParameters, MultiplicativeBiasParameters) +from interpreter.commands.imageformat import insert_formatter LOG = logging.getLogger("nntool." 
+ __name__) + class NNGraphError(Exception): pass + class GraphStepsNotCalculatedError(NNGraphError): pass -class NNGraphAttributeChanges(JsonSerializable): + +class NNGraphChanges(JsonSerializable): def __init__(self, init=None): if init is not None: self._changes = init['changes'] + self._image_format = init.get('image_format') or {} return self._changes = [] + self._image_format = {} def _encapsulate(self): - return {'changes': self._changes} + return {'changes': self._changes, 'image_format': self._image_format} @classmethod def _dencapsulate(cls, val): return cls(init=val) + def image_format(self, input_node_name, formatter, normalizer): + if formatter is None and normalizer is None: + if input_node_name in self._image_format: + del self._image_format[input_node_name] + return + self._image_format[input_node_name] = {"formatter": formatter, "normalizer": normalizer} + def modify(self, node, attr, val, fnode=None): nid = NodeId(node, fnode) self._changes.append({ @@ -66,6 +81,14 @@ def replay(self, G): for change in self._changes: node = change['nid'].get_node(G) setattr(node, change['attr'], change['val']) + graph_changed = False + for input_node_name, params in self._image_format.items(): + graph_changed = True + out_edge = G.out_edges(input_node_name)[0] + insert_formatter(G, out_edge, params["formatter"], params["normalizer"]) + if graph_changed: + G.add_dimensions() + class NNGraphState(): def __init__(self): @@ -103,12 +126,13 @@ def has_quantization_info(self): def has_quantization_info(self, val): self._state['quantization'] = val + class NNGraph(Graph): - def __init__(self, model=None, name=None, - filename=None, value_cache=None, + def __init__(self, + model=None, + name=None, + filename=None, constant_store=None): - # TODO - Value caching disabled - del value_cache super().__init__() self.model = model @@ -122,14 +146,11 @@ def __init__(self, model=None, name=None, self.load_function = None self.graphname = name - # disable value cache for now -# self.value_cache = value_cache - self.value_cache = None self.constant_store = constant_store self.graph_identity = GraphIdentity(filename) self._info = { 'quantization': None, - 'changes': NNGraphAttributeChanges() + 'changes': NNGraphChanges() } @property @@ -141,13 +162,21 @@ def info(self, val): self._info = val @property - def quantization(self): - return self._info['quantization'] + def quantization(self) -> QuantizationSet: + return self._info.get('quantization') @quantization.setter - def quantization(self, val): + def quantization(self, val: QuantizationSet): self._info['quantization'] = val + @property + def has_quantized_parameters(self) -> bool: + return self._info.get('has_quantized_parameters') + + @has_quantized_parameters.setter + def has_quantized_parameters(self, val: bool): + self._info['has_quantized_parameters'] = val + @property def changes(self): return self._info['changes'] @@ -244,8 +273,8 @@ def nodes_iterator(self, yield_fusions=True): else: yield (step_idx, node, None, None) - def adjust_order(self, reshape_weights=True): - adjust_order(self, reshape_weights) + def adjust_order(self, reshape_weights=True, postprocess=True): + adjust_order(self, reshape_weights=reshape_weights, postprocess=postprocess) LOG.info("adjusted order") self.graph_identity.is_adjusted = True @@ -263,14 +292,16 @@ def balance_filters(self, step_idx=None, precision_threshold=0.20): if isinstance(pnode, ConvFusionParameters): fnode = pnode.contained_filters() if len(fnode) > 1: - raise NotImplementedError("fusions with more than one 
contained filter is not supported") + raise NotImplementedError( + "fusions with more than one contained filter is not supported") fnode = fnode[0] node = fnode else: node = pnode fnode = None if not isinstance(node, MultiplicativeBiasParameters): - raise ValueError("weights can only be balanced on nodes that support multiplicative bias") + raise ValueError( + "weights can only be balanced on nodes that support multiplicative bias") balance_filter(pnode, fnode=fnode, G=self) else: balance_all_filters(self, precision_threshold=precision_threshold) @@ -282,11 +313,12 @@ def print_step(step, outs): print(node.name) for out_idx, out in enumerate(outs): dims = node.out_dims[out_idx] - if order is not None and order != dims.order: + if order is not None and dims.is_named and order != dims.order and all(k in dims.order + for k in order): transpose = dims.transpose_to_order(order) out = out.transpose(transpose) if channel is not None: - out = out[channel].reshape((1, dims.h, dims.w)) + out = out[channel:channel+1:1, ...] dump_tensor(out, PrintDumper(out, width=width, precision=precision)) if limit is not None: diff --git a/tools/nntool/graph/types/__init__.py b/tools/nntool/graph/types/__init__.py index acbc03696..831c25164 100644 --- a/tools/nntool/graph/types/__init__.py +++ b/tools/nntool/graph/types/__init__.py @@ -13,21 +13,31 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from .base import (EdgeParameters, FilterLikeParameters, FilterParameters, - MultiplicativeBiasParameters, NNEdge, NodeOptions, - Parameters, SameNumberOfDimensionsForInputs, - SingleInputAndOutput) -from .conv2d import Conv2DParameters -from .linear import FcParameters -from .others import (ActivationParameters, ConcatParameters, - ConstantInputParameters, ConvFusionParameters, FusionBase, - GlobalPoolParameters, GroupParameters, - InputBaseParameters, InputParameters, - MatScaleFusionParameters, MatrixAddParameters, - MatrixBroadcastedLinearOpParameters, MatrixDivParameters, - MatrixMulParameters, MatrixSubParameters, - OutputParameters, PadParameters, ReshapeParameters, - SoftMaxParameters, Transposable, TransposeParameters, - UnconvertedOpParameters, UnexecutableOpParameters, - UnknownOpParameters, UpsampleParameters, YoloParameters) -from .pooling import PoolingParameters +from graph.types.activations import (ActivationParameters, + HSigmoidActivationParameters, + HSwishActivationParameters, + LeakyActivationParameters, + ReluActivationParameters) +from graph.types.base import (EdgeParameters, FilterLikeParameters, + FilterParameters, MultiplicativeBiasParameters, + NNEdge, NodeOptions, Parameters, + SameNumberOfDimensionsForInputs, + SingleInputAndOutput) +from graph.types.conv2d import Conv2DParameters +from graph.types.fusions import (ActivationFusion, ConvFusionParameters, + FusionBase, MatScaleFusionParameters) +from graph.types.linear import FcParameters +from graph.types.others import (ConcatParameters, ConstantInputParameters, + GlobalPoolParameters, GroupParameters, + ImageFormatParameters, InputBaseParameters, + InputParameters, MatrixAddParameters, + MatrixBroadcastedLinearOpParameters, + MatrixDivParameters, MatrixMulParameters, + MatrixSubParameters, NoOPParameters, + OutputParameters, PadParameters, + ReshapeParameters, SoftMaxParameters, + Transposable, TransposeParameters, + UnconvertedOpParameters, + UnexecutableOpParameters, UnknownOpParameters, + UpsampleParameters, YoloParameters) +from graph.types.pooling import 
PoolingParameters diff --git a/tools/nntool/graph/types/activations.py b/tools/nntool/graph/types/activations.py new file mode 100644 index 000000000..fcd873bbf --- /dev/null +++ b/tools/nntool/graph/types/activations.py @@ -0,0 +1,156 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from .base import NoSizeChangeParameters, SingleInputAndOutput + +LOG = logging.getLogger("nntool." + __name__) + +#pylint: disable=abstract-method +class ActivationParameters(NoSizeChangeParameters, SingleInputAndOutput): + + def __init__(self, name): + super(ActivationParameters, self).__init__(name) + + @classmethod + def get_activation(cls, activation_type: str, name: str): + if activation_type == "hsigmoid": + return HSigmoidActivationParameters(name) + if activation_type == "relu": + return ReluActivationParameters(name) + if activation_type == "relu6": + return ReluActivationParameters(name, upper_bound=6) + if activation_type == "hswish": + return HSwishActivationParameters(name) + raise ValueError("don't know how to create %s"%activation_type) + + @property + def activation(self): + return self.op_name + + def get_parameter_size(self): + return 0 + + def compute_load(self): + return 0 + + def __str__(self): + return "Activation {} {}".format( + self.op_name, + self.at_options + ) + +class ReluActivationParameters(ActivationParameters): + def __init__(self, name, lower_bound=0, upper_bound=None): + super(ReluActivationParameters, self).__init__(name) + self._lower_bound = lower_bound + self._upper_bound = upper_bound + + @property + def op_name(self): + if self._lower_bound == 0: + if self._upper_bound == 6: + return "relu6" + if self._upper_bound is None: + return "relu" + return "relun" + return "relunm" + + @property + def lower_bound(self): + return self._lower_bound + + @lower_bound.setter + def lower_bound(self, val): + self._lower_bound = val + + @property + def upper_bound(self): + return self._upper_bound + + @upper_bound.setter + def upper_bound(self, val): + self._upper_bound = val + + def clone(self, name, groupn=None): + return ReluActivationParameters(name, self._lower_bound, self._upper_bound) + + @property + def can_equalize(self): + return self.op_name == "relu" + +class LeakyActivationParameters(ActivationParameters): + def __init__(self, name, leak_factor=0.01): + super(LeakyActivationParameters, self).__init__(name) + self._leak_factor = leak_factor + + @property + def leak_factor(self): + return self._leak_factor + + @property + def op_name(self): + return "leaky" + + def clone(self, name, groupn=None): + return LeakyActivationParameters(name, self._leak_factor) + + @property + def can_equalize(self): + return False + +class HSigmoidActivationParameters(ActivationParameters): + def __init__(self, name, offset=3): + super(HSigmoidActivationParameters, self).__init__(name) + self._offset = offset + + @property + def offset(self): + 
return self._offset + + @offset.setter + def offset(self, val): + self._offset = val + + @property + def op_name(self): + return "hsigmoid" + + def clone(self, name, groupn=None): + return HSigmoidActivationParameters(name) + + @property + def can_equalize(self): + return False + + def __str__(self): + return "Activation {} offset={} {}".format( + self.op_name, + self.offset, + self.at_options + ) + +class HSwishActivationParameters(ActivationParameters): + @property + def op_name(self): + return "hswish" + + def clone(self, name, groupn=None): + return HSwishActivationParameters(name) + + @property + def can_equalize(self): + return False diff --git a/tools/nntool/graph/types/base.py b/tools/nntool/graph/types/base.py index ecf6a97dd..26b5295cb 100644 --- a/tools/nntool/graph/types/base.py +++ b/tools/nntool/graph/types/base.py @@ -19,7 +19,7 @@ from utils.graph import Edge, Node from utils.option_list import OptionList -from generation.kernel_parameters import GenCtrl, CTRL_FEATURES +from generation.at_types.gen_ctrl import GenCtrl, CTRL_FEATURES LOG = logging.getLogger("nntool." + __name__) @@ -65,7 +65,7 @@ def get_gen_ctrl(self): @property def valid_at_options(self): - return self.valid_at_options + return self._valid_at_options @property def at_options(self): @@ -148,6 +148,10 @@ def out_dims(self, value): def get_parameter_size(self): pass + @abstractmethod + def get_output_size(self, in_dims): + pass + @property @abstractmethod def can_equalize(self): @@ -166,7 +170,7 @@ def clone_dim_with_hints(self, dims, hint_dir="in"): assert hints is None or len(dims) == len(hints), "incorrect dimensions length" cloned_dims = [] for dim_idx, dim in enumerate(dims): - if dim.is_named: + if dim.is_named and all(k in dim.keys for k in ['c', 'h', 'w']): cloned_dims.append(dim.clone(['c', 'h', 'w'])) else: cloned_dim = dim.clone() diff --git a/tools/nntool/graph/types/conv2d.py b/tools/nntool/graph/types/conv2d.py index 91a8e5814..0f0604a08 100644 --- a/tools/nntool/graph/types/conv2d.py +++ b/tools/nntool/graph/types/conv2d.py @@ -112,6 +112,10 @@ def get_parameter_size(self): return 0 return self.get_weights_count() + self.get_bias_count() + @property + def at_options(self): + return self._at_options + def get_output_size(self, in_dims): assert len(in_dims) == 1,\ @@ -123,7 +127,7 @@ def get_output_size(self, in_dims): "The number of groups cannot be larger than the amount of input channels" self.filter.in_c = in_dims.c // self.groups if self.padding.is_same: - self.padding.calculate_same(in_dims, self.filter, self.stride) + self.padding.calculate_same(in_dims, self.filter, self.stride, dilation=self.dilation) filter_d = self.filter + (self.filter - 1) * (self.dilation - 1) pad = self.padding.height_width() diff --git a/tools/nntool/graph/types/fusions.py b/tools/nntool/graph/types/fusions.py new file mode 100644 index 000000000..93f94f4d4 --- /dev/null +++ b/tools/nntool/graph/types/fusions.py @@ -0,0 +1,116 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from ..dim import Dim +from .base import (Parameters, NodeOptions, FilterParameters, SingleInputAndOutput) + +LOG = logging.getLogger("nntool." + __name__) + +class FusionBase(Parameters): + fusion_op_name = "!!NOT SET!!" + + def __init__(self, name, fusion_type, subgraph): + super(FusionBase, self).__init__(name) + self._subgraph = subgraph + nodes = self.contained_nodes() + self.in_dims_hint = nodes[0].in_dims_hint + self.out_dims_hint = nodes[-1].out_dims_hint + self.fusion_type = fusion_type + + @property + def op_name(self): + return self.fusion_op_name + '_' + self.fusion_type + + @property + def subgraph(self): + return self._subgraph + + def contained_nodes(self): + return [node for node in self.subgraph.dfs()] + + def get_contained_node(self, name): + return next((n for n in self.contained_nodes() if n.name == name), None) + + @property + def can_equalize(self): + return all([param.can_equalize for param in self.contained_nodes()]) + + def clone(self, name, groupn=None): + return self.__class__(name, self.fusion_type, self._subgraph) + + def get_parameter_size(self): + return 0 + + def get_output_size(self, in_dims): + + out_dims = in_dims + + for node in self.contained_nodes(): + out_dims = node.get_output_size(out_dims) + + return out_dims + + def __str__(self): + return "{}".format(", ".join([str(node).strip() for node in self.contained_nodes()])) + + +class MatScaleFusionParameters(FusionBase): + fusion_op_name = "matscale" + + def __init__(self, *args, activation=None, **kwargs): + self.activation = activation + super(MatScaleFusionParameters, self).__init__(*args, **kwargs) + + def get_output_size(self, in_dims): + return [Dim.broadcast(in_dims)] + +class ConvFusionParameters(FusionBase, SingleInputAndOutput): + '''Fusion of operators. At present restricted to single input and output but + this could be removed perhaps''' + + fusion_op_name = "conv_fusion" + + def _init_at_options(self): + if self._at_options is None: + self._at_options = NodeOptions(None) + self._at_options.extend(*[node.at_options for node in self.contained_nodes()]) + + @property + def at_options(self): + self._init_at_options() + return self._at_options + + @at_options.setter + def gen_ctrl(self, val): + self._init_at_options() + self._at_options = val + + def contained_filters(self): + return [x for x in self.contained_nodes() if isinstance(x, FilterParameters)] + + def get_parameter_size(self): + return sum([node.get_parameter_size() for node in self.contained_nodes()]) + + def __str__(self): + return "{} {}".format(", ".join([str(node).strip() for node in self.contained_nodes()]), self.gen_ctrl or "") + + def compute_load(self): + return sum([load if load else 0 for load in [node.compute_load() + for node in self.contained_nodes()]]) + +class ActivationFusion(FusionBase): + fusion_op_name = "activation_fusion" diff --git a/tools/nntool/graph/types/linear.py b/tools/nntool/graph/types/linear.py index 39518d81f..91547810c 100644 --- a/tools/nntool/graph/types/linear.py +++ b/tools/nntool/graph/types/linear.py @@ -16,16 +16,15 @@ import logging from ..dim import Dim -from .base import FilterParameters, SingleInputAndOutput +from .base import MultiplicativeBiasParameters, SingleInputAndOutput LOG = logging.getLogger("nntool." 
+ __name__) -class FcParameters(FilterParameters, SingleInputAndOutput): +class FcParameters(MultiplicativeBiasParameters, SingleInputAndOutput): op_name = "linear" - def __init__(self, name, **kwargs): + def __init__(self, *args, **kwargs): - super(FcParameters, self).__init__(name, - **kwargs) + super(FcParameters, self).__init__(*args, **kwargs) LOG.debug("created LINEAR %s", str(self)) def get_parameter_size(self): @@ -55,7 +54,7 @@ def clone(self, name, groupn=None): return FcParameters(name, filt=self.filter.clone(), has_bias=self.has_bias) def compute_load(self): - return self.in_dims[0].size() * self.filter.size() + return self.in_dims[0].size() * self.out_dims[0].c def __str__(self): return "F {} {}".format(self.filter, self.at_options or "") diff --git a/tools/nntool/graph/types/others.py b/tools/nntool/graph/types/others.py index e88a05ad6..245aaf04b 100644 --- a/tools/nntool/graph/types/others.py +++ b/tools/nntool/graph/types/others.py @@ -15,12 +15,15 @@ import logging import sys -from functools import reduce -from ..dim import Dim -from .base import (FilterParameters, NodeOptions, NoSizeChangeParameters, - Parameters, SameNumberOfDimensionsForInputs, - SensitiveToOrder, SingleInputAndOutput, Transposable) +import numpy as np + +from graph.dim import Dim +from utils.formatters import FORMAT_CHANGES, NORMALIZATIONS + +from .base import (NoSizeChangeParameters, Parameters, + SameNumberOfDimensionsForInputs, SensitiveToOrder, + SingleInputAndOutput, Transposable) LOG = logging.getLogger("nntool." + __name__) @@ -127,6 +130,105 @@ def clone(self, name, groupn=None): # self.out_q = get_quantization(self.activation_stats, None, self.out_q.bits * 2) # return True +class ImageFormatParameters(Parameters, SingleInputAndOutput, SensitiveToOrder): + op_name = "image_format" + NORMALIZATIONS = NORMALIZATIONS + FORMAT_CHANGES = FORMAT_CHANGES + + def __init__(self, *args, norm_func=None, format_change=None, **kwargs): + self._norm_func = None + self._format_change = None + super(ImageFormatParameters, self).__init__(*args, **kwargs) + self.norm_func = norm_func + self.format_change = format_change + + @property + def input_channels(self): + if self.format_change in ("RGB565_RGB888", "BW8", "BW16"): + return 1 + if self.format_change in ("RGB888", "RGB16"): + return 3 + return None + + @property + def input_dtype(self): + if self.format_change == "RGB565_RGB888": + return np.uint16 + if self.format_change in ("RGB888", "BW8", "BW16", "RGB16"): + return np.uint8 + return None + + @property + def output_channels(self): + if self.format_change in ("RGB565_RGB888", "RGB888", "RGB16"): + return 3 + if self.format_change in ("BW8", "BW16"): + return 1 + return None + + @property + def output_dtype(self): + if self.norm_func in ("SHIFT_INT8", "OFFSET_INT8"): + return np.int8 + if self.norm_func in "OUT_INT16": + return np.int16 + return None + + @property + def format_change(self): + # RGB565_RGB888 + return self._format_change + + @format_change.setter + def format_change(self, val): + val = val and val.upper() + if val is not None and val not in self.FORMAT_CHANGES: + raise ValueError("format change is not valid") + self._format_change = val + + @property + def norm_func(self): + # None, "shift", "offset" + return self._norm_func + + @norm_func.setter + def norm_func(self, val): + val = val and val.upper() + if val is not None and val not in self.NORMALIZATIONS: + raise ValueError("normalization is not valid") + self._norm_func = val + + def get_parameter_size(self): + return 0 + + def 
get_output_size(self, in_dims): + assert len(in_dims) == 1 + self.in_dims = self.clone_dim_with_hints(in_dims, hint_dir='in') + out_dim = self.clone_dim_with_hints(in_dims, hint_dir='out')[0] + if self.format_change == "RGB565_RGB888": + assert out_dim.is_named and out_dim.c == 1 + out_dim.impose_order(self.out_dims_hint[0]) + out_dim.c = 3 + elif self.format_change in ("BW8", "BW16"): + assert out_dim.is_named and out_dim.c == 1 + out_dim.impose_order(self.out_dims_hint[0]) + elif self.format_change in ("RGB888", "RGB16"): + assert out_dim.is_named and out_dim.c == 3 + out_dim.impose_order(self.out_dims_hint[0]) + else: + raise ValueError("unknow format change") + + return [out_dim] + + @property + def can_equalize(self): + return False + + def clone(self, name, groupn=None): + raise NotImplementedError() + + def __str__(self): + return "FORMAT_CHANGE Fmt: {} Norm: {}".format(self.format_change, self.norm_func) class ConstantInputParameters(InputBaseParameters): op_name = "constant" @@ -186,45 +288,6 @@ def clone(self, name, groupn=None): raise NotImplementedError() -class ActivationParameters(NoSizeChangeParameters, SingleInputAndOutput): - - def __init__(self, name, activation="relu", activation_params=None): - super(ActivationParameters, self).__init__(name) - self.activation = activation - self.activation_params = activation_params - - @property - def op_name(self): - return self.activation - - def get_parameter_size(self): - return 0 - - def clone(self, name, groupn=None): - return ActivationParameters(name, self.activation, self.activation_params) - - @property - def can_equalize(self): - # Is leaky usable? Looks like it. - return self.activation == "relu" or self.activation == "leaky"\ - or self.activation == "relu6" or self.activation == "relun" - - def compute_load(self): - # TODO - Be more accurate with different activation types - return self.out_dims[0].size() - - def activation_to_string(self): - if self.activation == "relun": - return "relun({})".format(self.activation_params) - return self.activation - - def __str__(self): - return "Activation {} {}".format( - self.activation_to_string(), - self.at_options - ) - - class TransposeParameters(Transposable, SingleInputAndOutput): op_name = "transpose" @@ -242,24 +305,13 @@ def permute(self, val): def can_equalize(self): return False - def transpose_elements(self): - tin = self.transpose_in - elems = [] - cur = [] - for i in tin: - if len(cur) == 0 or cur[-1] + 1 == i: - cur.append(i) - else: - elems.append(cur) - cur = [i] - if len(cur) > 0: - elems.append(cur) - return elems - - @property - def transpose_size(self): - dim = self.in_dims[0].shape - return [reduce(lambda x, y: x * dim[y], telem, 1) for telem in self.transpose_elements()] + def real_shape(self): + input_shape = self.in_dims[0].shape + cond_input_idx = [i for i, sz in enumerate(self.in_dims[0].shape) if sz != 1] + real_transpose = [i for i in self.transpose_in if i in cond_input_idx] + cond_input_shape = [input_shape[i] for i in cond_input_idx] + cond_transpose = [cond_input_idx.index(i) for i in real_transpose] + return tuple(cond_input_shape), tuple(cond_transpose) @property def transpose_dimension(self): @@ -276,7 +328,7 @@ def transpose_out(self, val): self._transpose_in = val def get_output_size(self, in_dims): - self.in_dims = in_dims + self.in_dims = self.clone_dim_with_hints(in_dims) out_dim = in_dims[0].clone() if self.transpose_in: out_dim = out_dim.transpose(self.transpose_in) @@ -318,7 +370,7 @@ def can_equalize(self): def get_output_size(self, 
in_dims): if in_dims[0].is_named and self._axis_hint: self._axis = in_dims[0].get_order_idx(self._axis_hint) - self.in_dims = in_dims + self.in_dims = self.clone_dim_with_hints(in_dims) if self.transpose_in: in_dims = [in_dim.clone().transpose(self.transpose_in) for in_dim in in_dims] out_dim = Dim.combine([in_dim for in_dim in in_dims], self.axis) @@ -336,103 +388,6 @@ def __str__(self): self.at_options ) -class FusionBase(Parameters): - fusion_op_name = "!!NOT SET!!" - - def __init__(self, name, fusion_type, subgraph): - super(FusionBase, self).__init__(name) - self._subgraph = subgraph - nodes = self.contained_nodes() - self.in_dims_hint = nodes[0].in_dims_hint - self.out_dims_hint = nodes[-1].out_dims_hint - self.fusion_type = fusion_type - - @property - def op_name(self): - return self.fusion_op_name + '_' + self.fusion_type - - @property - def subgraph(self): - return self._subgraph - - def contained_nodes(self): - return [node for node in self.subgraph.dfs()] - - def get_contained_node(self, name): - return next((n for n in self.contained_nodes() if n.name == name), None) - - @property - def can_equalize(self): - return all([param.can_equalize for param in self.contained_nodes()]) - - def clone(self, name, groupn=None): - return self.__class__(name, self.fusion_type, self._subgraph) - - def get_parameter_size(self): - return 0 - - def get_output_size(self, in_dims): - - out_dims = in_dims - - for node in self.contained_nodes(): - out_dims = node.get_output_size(out_dims) - - return out_dims - - def __str__(self): - return "{}".format(", ".join([str(node).strip() for node in self.contained_nodes()])) - - -class MatScaleFusionParameters(FusionBase): - fusion_op_name = "matscale" - - def __init__(self, *args, activation=None, **kwargs): - self.activation = activation - super(MatScaleFusionParameters, self).__init__(*args, **kwargs) - - def get_output_size(self, in_dims): - return [Dim.broadcast(in_dims)] - -class ConvFusionParameters(FusionBase, SingleInputAndOutput): - '''Fusion of operators. 
At present restricted to single input and output but - this could be removed perhaps''' - - fusion_op_name = "conv_fusion" - - def _init_at_options(self): - if self._at_options is None: - self._at_options = NodeOptions(None) - self._at_options.extend(*[node.at_options for node in self.contained_nodes()]) - - @property - def at_options(self): - self._init_at_options() - return self._at_options - - @at_options.setter - def gen_ctrl(self, val): - self._init_at_options() - self._at_options = val - - def contained_filters(self): - return [x for x in self.contained_nodes() if isinstance(x, FilterParameters)] - - def get_parameter_size(self): - return sum([node.get_parameter_size() for node in self.contained_nodes()]) - - def __str__(self): - return "{} {}".format(", ".join([str(node).strip() for node in self.contained_nodes()]), self.gen_ctrl or "") - - # # Needs to be refactored out - # @property - # def params(self): - # return self._nodes - - def compute_load(self): - return sum([load if load else 0 for load in [node.compute_load() - for node in self.contained_nodes()]]) - class GroupParameters(Parameters, SensitiveToOrder): @@ -450,8 +405,8 @@ def get_parameter_size(self): def get_output_size(self, in_dims): assert len(in_dims) == 1 - self.in_dims = in_dims - in_dims = in_dims[0] + self.in_dims = self.clone_dim_with_hints(in_dims) + in_dims = self.in_dims[0] assert in_dims.c % self.groups == 0 out_edges = in_dims.c // self.groups out_c = in_dims.c // out_edges @@ -482,6 +437,7 @@ def __init__(self, name, padding, in_dims_hint=None, out_dims_hint=None): in_dims_hint=in_dims_hint, out_dims_hint=out_dims_hint) self.padding = padding + self.pad_type = "zero" def get_parameter_size(self): return 0 @@ -555,7 +511,7 @@ def get_parameter_size(self): def get_output_size(self, in_dims): assert len(in_dims) == 1 - self.in_dims = in_dims + self.in_dims = self.clone_dim_with_hints(in_dims) in_dims = in_dims[0] out_dim = in_dims.clone() @@ -590,16 +546,17 @@ def __init__(self, *args, old_shape=None, shape=None, **kwargs): self._old_shape = old_shape def does_nothing(self): - return self.shape.shape == list(filter(lambda x: x != 1, self.old_shape.shape)) + return self.shape.layout_shape == self.old_shape.layout_shape def get_parameter_size(self): return 0 def get_output_size(self, in_dims): assert len(in_dims) == 1 - self.in_dims = in_dims - in_dims = in_dims[0] - assert in_dims.size() == self.shape.size() + self.in_dims = self.clone_dim_with_hints(in_dims) + in_dim = in_dims[0] + self._old_shape = in_dim + assert in_dim.size() == self.shape.size(), "in shape does not match in size" out = self.shape.clone() if self.transpose_out: out.transpose(self.transpose_out) @@ -677,8 +634,9 @@ def compute_load(self): return self.out_dims[0].size() * 2 def get_output_size(self, in_dims): - max_idx, _ = max(enumerate(in_dims), key=lambda x: x[1].size()) - return [in_dims[max_idx]] + self.in_dims = self.clone_dim_with_hints(in_dims) + max_idx, _ = max(enumerate(self.in_dims), key=lambda x: x[1].size()) + return [self.in_dims[max_idx]] def __str__(self): return "{} {}".format(self.op_name, self.at_options) @@ -730,6 +688,32 @@ def __str__(self): # pylint: disable=abstract-method +class NoOPParameters(NoSizeChangeParameters, SingleInputAndOutput): + op_name = "noop" + + def __init__(self, name, desc=""): + super(NoOPParameters, self).__init__(name) + self._desc = desc + + def get_parameter_size(self): + return 0 + + @property + def can_equalize(self): + return False + + def clone(self, name, groupn=None): + raise 
NotImplementedError() + + def compute_load(self): + return 0 + + def __str__(self): + return "NOOP {}".format( + self._desc + ) + + class UnexecutableOpParameters(Parameters): pass @@ -750,8 +734,9 @@ def op_name(self): def get_output_size(self, in_dims): if self.indicated_outputs: return self.indicated_outputs - if len(in_dims) == 1: - return [in_dims[0]] + self.in_dims = self.clone_dim_with_hints(in_dims) + if len(self.in_dims) == 1: + return [self.in_dims[0]] return [Dim.unknown()] @property @@ -775,8 +760,9 @@ def __init__(self, name, info): self.info = info def get_output_size(self, in_dims): - if len(in_dims) == 1: - return [in_dims[0]] + self.in_dims = self.clone_dim_with_hints(in_dims) + if len(self.in_dims) == 1: + return [self.in_dims[0]] return [Dim.unknown()] @property diff --git a/tools/nntool/importer/tflite/new_tflite_graph_all.py b/tools/nntool/importer/tflite/new_tflite_graph_all.py index 75786d85a..8b98f979c 100644 --- a/tools/nntool/importer/tflite/new_tflite_graph_all.py +++ b/tools/nntool/importer/tflite/new_tflite_graph_all.py @@ -32,6 +32,8 @@ # }; import logging +import os +from copy import deepcopy from functools import reduce import numpy as np @@ -44,11 +46,25 @@ Conv2DParameters, FcParameters, GlobalPoolParameters, MatrixAddParameters, MatrixDivParameters, MatrixMulParameters, MatrixSubParameters, NNEdge, - PadParameters, PoolingParameters, ReshapeParameters, - SoftMaxParameters, UnconvertedOpParameters, - UnknownOpParameters) -from quantization.quantization_record import (FilterQuantizationRecord, - QuantizationRecord) + NoOPParameters, PadParameters, PoolingParameters, + ReshapeParameters, SoftMaxParameters, + UnconvertedOpParameters, UnknownOpParameters) +from quantization.multiplicative.asymmetric.asymmetric_mult_qtype import \ + AsymmetricMultQType +from quantization.multiplicative.mult_quantization import ( + MultAddQuantizationRecord, MultConstantQuantizationRecord, + MultQuantizationRecord, MultQuantizationRecordBase, + MultScalableFilterQuantizationRecord) +from quantization.multiplicative.symmetric.mult_mulbias_qtype_new import \ + MultMulBiasScaleQType +from quantization.multiplicative.symmetric.symmetric_mult_biases_qtype import \ + SymmetricMultBiasesQType +from quantization.multiplicative.symmetric.symmetric_mult_qtype import \ + SymmetricMultQType +from quantization.multiplicative.symmetric.symmetric_mult_qtype_wrapper import \ + SymmetricMultQTypeWrapper +from quantization.quantization_set import QuantizationSet +from utils.add_sys_path import add_sys_path from utils.graph import Node from utils.node_id import NodeId from utils.sparse_list import SparseList @@ -56,7 +72,6 @@ from ..importer_base import ImporterBase from . 
import utils from .propagate_hints import propagate_hints -from .tflite_qtype import TfliteQType from .tflite_schema_head import (ActivationFunctionType, AddOptions, ConcatenationOptions, Conv2DOptions, DepthwiseConv2DOptions, DivOptions, @@ -84,13 +99,19 @@ class TFLiteImportException(Exception): } TF_ACTIVATION_OPERATORS = { - "LOGISTIC": "sigmoid", + "LOGISTIC": "hsigmoid", "RELU": "relu", "RELU6": "relu6", "TANH": "tanh", "HARD_SWISH": "hswish" } +UNDIGNED_TO_SIGNED = { + np.uint8: np.int8, + np.uint16: np.int16, + np.uint32: np.int32 +} + def check(condition, message): if not condition: @@ -289,12 +310,12 @@ def get_fin_cput_size(subgraph, elem, idx): class TfliteTensorWrapper(): TF_TO_NUMPY_TYPE = { - TensorType.TensorType.FLOAT32: np.dtype(' may be an omitted relu + if node_qrec.out_qs[0].min_val == 0: + if np.all(np.round(node_qrec.out_qs[0].max_val) == 6): + anode = ActivationParameters.get_activation('relu6', aname(name)) + else: + anode = ActivationParameters.get_activation('relu', aname(name)) + else: + return add_node(self.G, node) + else: + return add_node(self.G, node) + else: + anode = ActivationParameters.get_activation(TF_ACTIVATIONS[tfl_opts.FusedActivationFunction()], + aname(name)) - activation = TF_ACTIVATIONS[tfl_opts.FusedActivationFunction()] - anode = ActivationParameters(aname(name), activation) if self.load_quantization: + # In between the fused operation and activation the + # transfer is in int32 representation node_qrec = self.qrecs[NodeId(node)] - self.qrecs[NodeId(anode)] = QuantizationRecord( - in_qs=[node_qrec.out_qs[0]], out_qs=[node_qrec.out_qs[0]]) + outa_qtype = deepcopy(node_qrec.out_qs[0]) + #node_qrec.out_qs[0].dtype = np.int32 + ina_qtype = deepcopy(node_qrec.out_qs[0]) + self.qrecs[NodeId(anode)] = MultQuantizationRecord( + in_qs=[ina_qtype], out_qs=[outa_qtype]) return add_node(self.G, node, anode=anode) def add_unconverted(self, name, subgraph, op_name, op): + LOG.warning("graph has unknown operator %s and cannot be properly processed", op_name) node = add_node(self.G, UnconvertedOpParameters( name, @@ -409,6 +495,182 @@ def add_unconverted(self, name, subgraph, op_name, op): )) return node + def make_weights_symmetric(self, node, input_tensors): + biases_scales = input_tensors[2].scale if node.has_bias else np.array([1], dtype=np.int32) + # already symmetric or something we don't know + if input_tensors[1].dtype != np.uint8: + return input_tensors[1].scale, biases_scales, None, None + weights_scales = input_tensors[1].scale + # symmetric unsigned. just change zero point scale stays the same + if np.all(input_tensors[1].zero_point == 128): + node.weights = (node.weights.astype(np.int64) - 128).astype(np.int8) + return weights_scales, biases_scales, None, None + # asymmetric unsigned. 
change zero point and rescale + if self.rescale_perchannel: + return self.scale_weights_by_channel(node, weights_scales, biases_scales, + input_tensors[0].qtype.scale, + zero_point=input_tensors[1].zero_point) + else: + return self.scale_weights_by_tensor(node, weights_scales, biases_scales, + input_tensors[0].qtype.scale, + zero_point=input_tensors[1].zero_point) + + def scale_weights_by_tensor(self, node, weights_scales, biases_scales, in_scale, zero_point=None): + if zero_point is None: + zero_point = np.array([0]) + if node.has_bias: + dq_biases = node.biases * biases_scales + else: + dq_biases = np.array([0] * node.filter.out_c, dtype=np.float32) + + if len(weights_scales) > 1: + raise ValueError('You should not rescale perchannel weights to pertensor format') + + dq_weights = (node.weights.astype(np.float32) - zero_point) * weights_scales + w_min = min(np.min(dq_weights), 0) + w_max = max(np.max(dq_weights), 0) + w_max = w_max if w_min != w_max and w_max == 0 else 1 + + w_abs_max = max(w_max, np.abs(w_min)) + new_weights_scale = w_abs_max / 127 + int8_iinfo = np.iinfo(np.int8) + int32_iinfo = np.iinfo(np.int32) + new_biases_scale = new_weights_scale * in_scale + node.weights = np.clip(np.floor(dq_weights / new_weights_scale + 0.5), + int8_iinfo.min, + int8_iinfo.max).astype(np.int8) + node.biases = np.clip(np.floor(dq_biases / new_biases_scale + 0.5), + int32_iinfo.min, + int32_iinfo.max).astype(np.int32) + return np.array([new_weights_scale]), np.array([new_biases_scale]),\ + np.array([w_min]), np.array([w_max]) + + def scale_weights_by_channel(self, node, weights_scales, biases_scales, in_scale, zero_point=None): + # scale weights by channel optionally correcting zero point + if zero_point is None: + zero_point = np.array([0]) + + out_idx = node.filter.get_order_idx('out_c') + actual_len = len(node.filter.actual_shape) + ones_shape = tuple(node.filter.out_c if idx == out_idx else 1 for idx in range(actual_len)) + filter_axis = tuple(idx for idx in range(actual_len) if idx != out_idx) + + if node.has_bias: + dq_biases = node.biases * biases_scales + else: + dq_biases = np.array([0] * node.filter.out_c, dtype=np.float32) + + if len(weights_scales) > 1: + weights_scales = weights_scales.reshape(ones_shape) + if len(zero_point) > 1: + zero_point = zero_point.reshape(ones_shape) + dq_weights = (node.weights.astype(np.float32) - zero_point) * weights_scales + + w_mins = np.minimum(np.min(dq_weights, axis=filter_axis), 0) + w_maxes = np.maximum(np.max(dq_weights, axis=filter_axis), 0) + + w_zero_cond = np.logical_and(w_mins == w_maxes, w_maxes == 0) + w_maxes = np.where(w_zero_cond, 1, w_maxes) + + w_abs_maxes = np.maximum(np.abs(w_mins), w_maxes) + new_weights_scales = w_abs_maxes / 127 + int8_iinfo = np.iinfo(np.int8) + int32_iinfo = np.iinfo(np.int32) + new_biases_scales = new_weights_scales * in_scale + np.seterr(all='raise') + node.weights = np.clip(np.floor(dq_weights / new_weights_scales.reshape(ones_shape) + 0.5), + int8_iinfo.min, + int8_iinfo.max).astype(np.int8) + node.biases = np.clip(np.floor(dq_biases / new_biases_scales + 0.5), + int32_iinfo.min, + int32_iinfo.max).astype(np.int32) + return new_weights_scales, new_biases_scales, w_mins, w_maxes + + def detect_small_scales(self, node, weights_scales, biases_scales, in_scale): + # at this point all tensors are in expected formats + # weights int8 biases int32 channel scaled + tiny_weight_scales = weights_scales < SymmetricMultQType.kNearZeroTolerance + if np.count_nonzero(tiny_weight_scales) == 0: + return weights_scales, 
biases_scales + + out_idx = node.filter.get_order_idx('out_c') + shape = tuple(slice(None) if idx != + out_idx else tiny_weight_scales for idx in range(len(node.weights.shape))) + + node.weights[shape] = 0 + dq_biases = node.biases * biases_scales + weights_scales = np.where(tiny_weight_scales, 1, weights_scales) + biases_scales = in_scale * weights_scales + int32_iinfo = np.iinfo(np.int32) + node.biases = np.clip(np.floor(dq_biases / biases_scales + 0.5), + int32_iinfo.min, + int32_iinfo.max).astype(np.int32) + return weights_scales, biases_scales + + def fix_weights_and_biases(self, node, input_tensors): + weights_scales, biases_scales, w_mins, w_maxes = self.make_weights_symmetric( + node, input_tensors) + if self.rescale_perchannel: + if len(weights_scales) != node.filter.out_c: + weights_scales, biases_scales, w_mins, w_maxes = self.scale_weights_by_channel( + node, weights_scales, biases_scales, input_tensors[0].qtype.scale) + weights_scales, biases_scales = self.detect_small_scales( + node, weights_scales, biases_scales, input_tensors[0].scale) + if w_mins is None: + w_mins = input_tensors[1].min_val + w_maxes = input_tensors[1].max_val + return weights_scales, biases_scales, w_mins, w_maxes + + def load_filter_parameters(self, node, input_tensors, output_tensors, converted_to_conv=False): + if self.load_tensors or self.load_quantization: + node.weights = input_tensors[1].get_value(self.model) + if converted_to_conv: + node.weights = node.weights.transpose(TF_LITE_DW_FILTER_TRANSPOSE) + if node.has_bias: + node.biases = input_tensors[2].get_value(self.model) + + if self.load_quantization: + if input_tensors[0].qtype is None: + raise NoQuantizationError("quantization not present in tflite file") + weights_scales, biases_scales, w_mins, w_maxes = self.fix_weights_and_biases( + node, input_tensors) + biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=biases_scales) + weights_q = SymmetricMultQType( + dtype=np.int8, narrow_range=True, scale=weights_scales, min_val=w_mins, max_val=w_maxes) + in_q = input_tensors[0].qtype + out_q = output_tensors[0].qtype + mulbiases_q = MultMulBiasScaleQType.from_filter(in_q, weights_q, out_q, node) + qrec = MultScalableFilterQuantizationRecord(in_qs=[in_q], + out_qs=[out_q], + mul_biases_q=mulbiases_q, + weights_q=weights_q, + biases_q=biases_q) + self.qrecs[NodeId(node)] = qrec + + def load_dequantized_filter_parameters(self, node, input_tensors, converted_to_conv=False, is_dw=False): + weights_scales = input_tensors[1].scale + in_scale = input_tensors[0].scale + weights_quant = input_tensors[1].get_value(self.model) + # save in the node the dequantized values + if len(weights_scales) > 1: # tf2 conv and dw (fully connected should be per-tensor) + if is_dw: + # depthwise + shape_pc = tuple(size if idx == 3 else 1 # always along axis 3 from tflite quantization spec + for idx, size in enumerate(weights_quant.shape)) + else: + # normal convolution + shape_pc = tuple(size if idx == 0 else 1 # always along axis 0 from tflite quantization spec + for idx, size in enumerate(weights_quant.shape)) + node.weights = (weights_quant.astype(np.int64) - input_tensors[1].zero_point.reshape(shape_pc)) \ + * weights_scales.reshape(shape_pc) + else: + node.weights = (weights_quant - input_tensors[1].zero_point) * weights_scales + if converted_to_conv: + node.weights = node.weights.transpose(TF_LITE_DW_FILTER_TRANSPOSE) + if node.has_bias: + biases_scales = weights_scales * in_scale + node.biases = input_tensors[2].get_value(self.model) * biases_scales + def 
add_convolution(self, name, subgraph, _, op): del subgraph conv_opts = Conv2DOptions.Conv2DOptions() @@ -441,17 +703,10 @@ def add_convolution(self, name, subgraph, _, op): out_dims_hint=SparseList([['h', 'w', 'c']]), constant_store=self.G.constant_store) - if self.load_quantization: - qrec = FilterQuantizationRecord(in_qs=[input_tensors[0].qtype], - out_qs=[output_tensors[0].qtype], - weights_q=input_tensors[1].qtype, - biases_q=input_tensors[2].qtype if len(input_tensors) > 2 else None) - self.qrecs[NodeId(node)] = qrec - - if self.load_tensors: - node.weights = input_tensors[1].get_value(self.model) - if has_bias: - node.biases = input_tensors[2].get_value(self.model) + if self.load_dequantized: + self.load_dequantized_filter_parameters(node, input_tensors) + else: + self.load_filter_parameters(node, input_tensors, output_tensors) return self.fuse_activation(conv_opts, name, node) @@ -515,19 +770,12 @@ def add_depthwise_convolution(self, name, subgraph, _, op): out_dims_hint=SparseList([['h', 'w', 'c']]), constant_store=self.G.constant_store) - if self.load_quantization: - qrec = FilterQuantizationRecord(in_qs=[input_tensors[0].qtype], - out_qs=[output_tensors[0].qtype], - weights_q=input_tensors[1].qtype, - biases_q=input_tensors[2].qtype if len(input_tensors) > 2 else None) - self.qrecs[NodeId(node)] = qrec - if self.load_tensors: - node.weights = input_tensors[1].get_value(self.model) - # If we've converted to a normal conv then change the weight order - if convert_to_conv: - node.weights = node.weights.transpose(TF_LITE_DW_FILTER_TRANSPOSE) - if has_bias: - node.biases = input_tensors[2].get_value(self.model) + if self.load_dequantized: + self.load_dequantized_filter_parameters( + node, input_tensors, convert_to_conv, is_dw=True) + else: + self.load_filter_parameters(node, input_tensors, output_tensors, + converted_to_conv=convert_to_conv) return self.fuse_activation(conv_opts, name, node) @@ -572,17 +820,10 @@ def add_fully_connected(self, name, subgraph, _, op): out_dims_hint=SparseList([['c']]), constant_store=self.G.constant_store) - if self.load_quantization: - qrec = FilterQuantizationRecord(in_qs=[input_tensors[0].qtype], - out_qs=[output_tensors[0].qtype], - weights_q=input_tensors[1].qtype, - biases_q=input_tensors[2].qtype if len(input_tensors) > 2 else None) - self.qrecs[NodeId(node)] = qrec - - if self.load_tensors: - node.weights = input_tensors[1].get_value(self.model) - if has_bias: - node.biases = input_tensors[2].get_value(self.model) + if self.load_dequantized: + self.load_dequantized_filter_parameters(node, input_tensors) + else: + self.load_filter_parameters(node, input_tensors, output_tensors) return self.fuse_activation(fc_opts, name, node) @@ -592,10 +833,12 @@ def add_fully_connected(self, name, subgraph, _, op): "MAX_POOL_2D": "max" } - def load_tf_quantization(self, input_tensors, output_tensors, node): - qrec = QuantizationRecord(in_qs=[tensor.qtype for tensor in input_tensors], - out_qs=[tensor.qtype for tensor in output_tensors]) - self.qrecs[NodeId(node)] = qrec + def load_tf_quantization(self, input_tensors, output_tensors, qrec_class=None): + if qrec_class is None: + qrec_class = MultQuantizationRecord + qrec = qrec_class(in_qs=[tensor.qtype for tensor in input_tensors], + out_qs=[tensor.qtype for tensor in output_tensors]) + return qrec # pylint: disable=unused-argument @@ -633,7 +876,8 @@ def add_pool(self, name, subgraph, op_name, op): out_dims_hint=SparseList([['h', 'w', 'c']])) if self.load_quantization: - self.load_tf_quantization(input_tensors, 
get_output_tensors(self.tensors, op), node) + self.qrecs[NodeId(node)] = self.load_tf_quantization( + input_tensors, get_output_tensors(self.tensors, op)) return self.fuse_activation(pool_opts, name, node) @@ -644,8 +888,21 @@ def add_softmax(self, name, subgraph, _, op): softmax_opts.Init(op.BuiltinOptions().Bytes, op.BuiltinOptions().Pos) node = SoftMaxParameters(name, softmax_opts.Beta()) if self.load_quantization: - self.load_tf_quantization(get_input_tensors(self.tensors, op), - get_output_tensors(self.tensors, op), node) + input_tensors = get_input_tensors(self.tensors, op) + iqtype = input_tensors[0].qtype + iqtype.scale_to_pow2() + oqtype = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15)) + qrec = MultQuantizationRecord(in_qs=[iqtype], + out_qs=[oqtype]) + self.qrecs[NodeId(node)] = qrec + + return add_node(self.G, node) + + def add_noop(self, name, subgraph, op_name, op): + node = NoOPParameters(name, desc=op_name) + if self.load_quantization: + self.qrecs[NodeId(node)] = self.load_tf_quantization(get_input_tensors(self.tensors, op), + get_output_tensors(self.tensors, op)) return add_node(self.G, node) # pylint: disable=unused-argument @@ -657,28 +914,44 @@ def add_concatenation(self, name, subgraph, _, op): input_tensors = get_input_tensors(self.tensors, op) output_tensors = get_output_tensors(self.tensors, op) + buffer_idxes = [tensor.buffer_idx for tensor in input_tensors] + if len(set(buffer_idxes)) != len(buffer_idxes): + raise NotImplementedError("concats with multiple versions of the same input are not supported. This is normally a graph design problem.") + axis_hint = None + axis = None # nasty hack to try to figure out how the axis relates to our # internal axis representation if concat_opts.Axis() == 0: if len(output_tensors[0].shape) == 2: axis_hint = 'c' + axis = 0 elif len(output_tensors[0].shape) == 4: axis_hint = 'h' + axis = 0 elif concat_opts.Axis() == 1: if len(output_tensors[0].shape) == 2: axis_hint = 'c' + axis = 0 + elif len(output_tensors[0].shape) == 3: + axis = 0 elif len(output_tensors[0].shape) == 4: axis_hint = 'h' + axis = 0 + elif concat_opts.Axis() == 2: + if all(tensor.shape[1] == 1 for tensor in input_tensors): + axis_hint = 'w' + axis = 1 elif concat_opts.Axis() == 3: if len(output_tensors[0].shape) == 4: axis_hint = 'c' - - node = ConcatParameters(name, axis=max(concat_opts.Axis() - 1, 0), axis_hint=axis_hint) + axis = 2 + if axis is None: + axis = concat_opts.Axis() - 1 + node = ConcatParameters(name, axis=axis, axis_hint=axis_hint) if self.load_quantization: - self.load_tf_quantization(input_tensors, - output_tensors, - node) + self.qrecs[NodeId(node)] = self.load_tf_quantization(input_tensors, + output_tensors) return self.fuse_activation(concat_opts, name, node) # pylint: disable=unused-argument @@ -703,7 +976,8 @@ def add_reshape(self, name, subgraph, _, op): new_shape = Dim.unnamed(remove_batch_dim(new_shape), is_ordered=True) node = ReshapeParameters(name, old_shape=old_shape, shape=new_shape) if self.load_quantization: - self.load_tf_quantization(input_tensors, get_output_tensors(self.tensors, op), node) + self.qrecs[NodeId(node)] = self.load_tf_quantization( + [input_tensors[0]], get_output_tensors(self.tensors, op)) return add_node(self.G, node) # pylint: disable=unused-argument @@ -711,11 +985,10 @@ def add_reshape(self, name, subgraph, _, op): def add_activation(self, name, subgraph, op_name, op): check(op.InputsLength() == 1, "Very odd " + str(op.InputsAsNumpy())) - activation = 
TF_ACTIVATION_OPERATORS[op_name] - node = ActivationParameters(name, activation) + node = ActivationParameters.get_activation(TF_ACTIVATION_OPERATORS[op_name], name) if self.load_quantization: - self.load_tf_quantization(get_input_tensors(self.tensors, op), - get_output_tensors(self.tensors, op), node) + self.qrecs[NodeId(node)] = self.load_tf_quantization(get_input_tensors(self.tensors, op), + get_output_tensors(self.tensors, op)) return add_node(self.G, node) def add_pad(self, name, subgraph, op_name, op): @@ -727,44 +1000,53 @@ def add_pad(self, name, subgraph, op_name, op): node = PadParameters(name, PadDim(*pad_dim)) if self.load_quantization: - self.load_tf_quantization(get_input_tensors(self.tensors, op), - get_output_tensors(self.tensors, op), node) + self.qrecs[NodeId(node)] = self.load_tf_quantization(get_input_tensors(self.tensors, op), + get_output_tensors(self.tensors, op)) return add_node(self.G, node) - def add_broadcasted_op(self, name, subgraph, op_name, op, tf_opts, params): + def add_broadcasted_op(self, name, subgraph, op_name, op, tf_opts, params, qrec_class=None): tf_opts.Init(op.BuiltinOptions().Bytes, op.BuiltinOptions().Pos) inputs = get_all_const_broadcasted_inputs( self.G, self.model, self.tensors, subgraph, op, load_tensors=self.load_tensors) check(len(inputs) == 2, - "Very odd " + str(op.InputsAsNumpy())) - node_pair = self.fuse_activation(tf_opts, name, params) + "broadcasted ops should only have 2 inputs " + str(op.InputsAsNumpy())) if self.load_quantization: - self.load_tf_quantization(get_input_tensors(self.tensors, op), - get_output_tensors(self.tensors, op), node_pair[0]) + self.qrecs[NodeId(params)] = self.load_tf_quantization(get_input_tensors(self.tensors, op), + get_output_tensors( + self.tensors, op), + qrec_class=qrec_class) + node_pair = self.fuse_activation(tf_opts, name, params) for idx, input_node in enumerate(inputs): if input_node[1] is not None: if self.load_quantization: node_qrec = self.qrecs[NodeId(params)] - self.qrecs[NodeId(input_node[1])] = QuantizationRecord( - in_qs=[], out_qs=[node_qrec.in_qs[idx]]) + self.qrecs[NodeId(input_node[1])] = MultConstantQuantizationRecord( + in_qs=[node_qrec.in_qs[idx]], + out_qs=[node_qrec.in_qs[idx]]) self.G.add_edge(NNEdge(input_node[1], node_pair[0], to_idx=idx)) return node_pair def add_add(self, name, subgraph, op_name, op): return self.add_broadcasted_op(name, subgraph, op_name, op, - AddOptions.AddOptions(), MatrixAddParameters(name)) + AddOptions.AddOptions(), + MatrixAddParameters(name), + MultAddQuantizationRecord) def add_div(self, name, subgraph, op_name, op): return self.add_broadcasted_op(name, subgraph, op_name, op, - DivOptions.DivOptions(), MatrixDivParameters(name)) + DivOptions.DivOptions(), + MatrixDivParameters(name)) def add_mul(self, name, subgraph, op_name, op): return self.add_broadcasted_op(name, subgraph, op_name, op, - MulOptions.MulOptions(), MatrixMulParameters(name)) + MulOptions.MulOptions(), + MatrixMulParameters(name)) def add_sub(self, name, subgraph, op_name, op): return self.add_broadcasted_op(name, subgraph, op_name, op, - SubOptions.SubOptions(), MatrixSubParameters(name)) + SubOptions.SubOptions(), + MatrixSubParameters(name), + MultAddQuantizationRecord) def add_mean(self, name, subgraph, op_name, op): check(op.InputsLength() == 2, @@ -785,8 +1067,8 @@ def add_mean(self, name, subgraph, op_name, op): in_dims_hint=SparseList([['h', 'w', 'c']]), out_dims_hint=SparseList([['h', 'w', 'c']])) if self.load_quantization: - 
self.load_tf_quantization(get_input_tensors(self.tensors, op), - get_output_tensors(self.tensors, op), node) + self.qrecs[NodeId(node)] = self.load_tf_quantization(get_input_tensors(self.tensors, op), + get_output_tensors(self.tensors, op)) return add_node(self.G, node) @@ -817,11 +1099,13 @@ def add_custom(self, name, subgraph, op_name, op): "MUL": add_mul, "SUB": add_sub, "DIV": add_div, - "MEAN": add_mean + "MEAN": add_mean, + "QUANTIZE": add_noop, + "DEQUANTIZE": add_noop } - for __op in TF_ACTIVATION_OPERATORS: - SWITCH_ADD_FUNCTIONS[__op] = add_activation + for operator in TF_ACTIVATION_OPERATORS: + SWITCH_ADD_FUNCTIONS[operator] = add_activation def add_operator(self, subgraph, subgraph_idx, op, op_idx): op_name, is_custom = utils.get_operator_name(self.model, op.OpcodeIndex()) @@ -848,15 +1132,16 @@ def create_subgraph(self, graph_index): node = self.G.add_input(Dim.unnamed(remove_batch_dim(dims))) tensor = self.tensors[graph.Inputs(i)] tensor.output = node.name - if self.load_quantization: - self.qrecs[NodeId(node)] = QuantizationRecord(in_qs=[], out_qs=[tensor.qtype]) + if self.load_quantization and tensor.qtype: + self.qrecs[NodeId(node)] = MultQuantizationRecord(in_qs=[], out_qs=[tensor.qtype]) for i in range(graph.OutputsLength()): node = self.G.add_output() tensor = self.tensors[graph.Outputs(i)] tensor.inputs.append((node.name, 0)) - if self.load_quantization: - self.qrecs[NodeId(node)] = QuantizationRecord(in_qs=[tensor.qtype], out_qs=[]) + if self.load_quantization and tensor.qtype: + self.qrecs[NodeId(node)] = MultQuantizationRecord( + in_qs=[tensor.qtype], out_qs=[tensor.qtype]) for i in range(graph.OperatorsLength()): op = graph.Operators(i) @@ -876,16 +1161,22 @@ def create_subgraph(self, graph_index): LOG.warning("unused tensors in graph") def create_graph(self, filename, opts): + add_sys_path(os.path.dirname(__file__)) buf = open(filename, "rb").read() self.model = Model.Model.GetRootAsModel(buf, 0) self.load_quantization = opts.get('load_quantization') self.load_tensors = opts.get('load_tensors') + self.load_dequantized = opts.get('load_dequantized') LOG.info("Importing TFLITE model version %s", self.model.Version()) check(self.model.Version() == 3, "Only support version 3 graphs at present") check(self.model.SubgraphsLength() == 1, "Only supports one subgraph at present") self.G = NNGraph(model=self.model, filename=filename, name=opts.get('name'), - value_cache=opts.get('value_cache'), constant_store=ConstantStore()) + constant_store=ConstantStore()) self.create_subgraph(0) - self.G.quantization = self.qrecs + if self.load_quantization: + self.G.quantization = self.qrecs + self.G.has_quantized_parameters = True + self.G.graph_identity.quantization_type = 'SQ8' + propagate_hints(self.G) return self.G diff --git a/tools/nntool/importer/tflite/propagate_hints.py b/tools/nntool/importer/tflite/propagate_hints.py index abdcecbb6..36391dfee 100644 --- a/tools/nntool/importer/tflite/propagate_hints.py +++ b/tools/nntool/importer/tflite/propagate_hints.py @@ -20,7 +20,7 @@ from graph.types import (ConstantInputParameters, MatrixBroadcastedLinearOpParameters, ReshapeParameters, - InputParameters) + InputParameters, ConcatParameters) from utils.sparse_list import SparseList @@ -31,8 +31,8 @@ def propagate_downwards(G: NNGraph): if node.in_dims_hint is not None: if isinstance(node, ReshapeParameters): - assert len(node.old_shape) == len(node.in_dims_hint[0]), "reshape doesn't match input" - node.old_shape.apply_naming_hints(node.in_dims_hint[0]) + if len(node.old_shape) == 
len(node.in_dims_hint[0]): + node.old_shape.apply_naming_hints(node.in_dims_hint[0]) elif isinstance(node, MatrixBroadcastedLinearOpParameters): max_hint = None for hint in node.in_dims_hint: @@ -40,6 +40,14 @@ def propagate_downwards(G: NNGraph): max_hint = hint if max_hint is not None: node.out_dims_hint = [max_hint] + elif isinstance(node, ConcatParameters): + # if any incoming edge of the concat doesn't have a hint + # set it the same as the others + any_in_hint = next((hint for hint in node.in_dims_hint if hint is not None), None) + if any_in_hint: + for edge in G.in_edges(node.name): + if not node.in_dims_hint[edge.to_idx]: + node.in_dims_hint[edge.to_idx] = any_in_hint else: if node.out_dims_hint is None: node.out_dims_hint = deepcopy(node.in_dims_hint) diff --git a/tools/nntool/interpreter/commands/__init__.py b/tools/nntool/interpreter/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/interpreter/commands/adjust.py b/tools/nntool/interpreter/commands/adjust.py new file mode 100644 index 000000000..ea83de8cc --- /dev/null +++ b/tools/nntool/interpreter/commands/adjust.py @@ -0,0 +1,33 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from interpreter.nntool_shell_base import NNToolShellBase + +class AdjustCommand(NNToolShellBase): + # ADJUST COMMAND + # parser_adjust = Cmd2ArgumentParser("display statistics on globals") + + # @with_argparser(parser_adjust) + def do_adjust(self, _): + """ +Adjust activation and parameter tensors to match AutoTiler order. +Must be run before generating code.""" + self._check_graph() + if self.is_adjusted: + self.perror("graph is already adjusted") + return + self.G.adjust_order() + self.G.add_dimensions() + \ No newline at end of file diff --git a/tools/nntool/interpreter/commands/aquant.py b/tools/nntool/interpreter/commands/aquant.py new file mode 100644 index 000000000..acbc3115f --- /dev/null +++ b/tools/nntool/interpreter/commands/aquant.py @@ -0,0 +1,92 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
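For context on the SQ8 scheme that the aquant command added below selects by default: a minimal, hypothetical sketch of symmetric scale quantization to int8 (the helper name and shape are assumptions, not the MultQuantizer implementation, which additionally supports per-channel scales and narrow-weight handling):

import numpy as np

def quantize_sq8(tensor, narrow=True):
    # pick one scale so that the largest magnitude maps onto 127
    max_abs = float(np.max(np.abs(tensor)))
    scale = max_abs / 127.0 if max_abs else 1.0
    qmin = -127 if narrow else -128   # narrow range keeps the +/- ranges symmetric
    q = np.clip(np.round(tensor / scale), qmin, 127).astype(np.int8)
    return q, scale                   # dequantize as q * scale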
+
+import argparse
+import logging
+
+from cmd2 import Cmd2ArgumentParser, with_argparser
+
+from graph.matches.propagate_softmax_sym_mult_qrec import PropagateSoftmaxSymQrec
+from graph.matches.equalize_sym_mult_concats import EqualizeSymmetricMultiplicativeQuantivedConcats
+from interpreter.nntool_shell_base import NNToolShellBase
+from interpreter.shell_utils import (glob_input_files, input_options)
+from quantization.multiplicative.mult_quantizer import MultQuantizer
+from quantization.symmetric.symmetric_quantizer import SymmetricQuantizer
+from stats.activation_stats_collector import ActivationStatsCollector
+from stats.filter_stats_collector import FilterStatsCollector
+from utils.data_importer import import_data
+from utils.stats_funcs import STATS_BITS
+
+LOG = logging.getLogger('nntool.'+__name__)
+
+QUANTIZATION_SCHEMES = ['SQ8', 'POW2']
+
+class AquantCommand(NNToolShellBase):
+    # AQUANT COMMAND
+    parser_aquant = Cmd2ArgumentParser()
+    parser_aquant_group = parser_aquant.add_mutually_exclusive_group(required=False)
+    parser_aquant_group.add_argument('-q', '--qsnr',
+                                     type=float, default=50.0, help='QSNR threshold in case of POW2 scheme')
+    parser_aquant_group.add_argument('-f', '--force_width',
+                                     choices=STATS_BITS, type=int, default=16,
+                                     help='force all layers to this bit-width in case of POW2 scheme, ' +
+                                     'SQ8 will automatically force 8-bits')
+    parser_aquant.add_argument('-s', '--scheme',
+                               type=str, choices=QUANTIZATION_SCHEMES, default='SQ8',
+                               help='quantize with scaling factors (TFlite quantization-like) [default] or POW2')
+    parser_aquant.add_argument('-d', '--quant_dimension',
+                               choices=['tensor', 'channel'], default='channel')
+    parser_aquant.add_argument('-r', '--relun_threshold',
+                               type=int, default=1, help='Threshold above floored max value to adjust relun\'s to.')
+    parser_aquant.add_argument('-n', '--no_narrow_weights',
+                               action='store_true', help='Don\'t quantize weights uniformly over negative/positive ' +
+                               'range. i.e. Avoid -128 vs 127')
+    input_options(parser_aquant)
+
+    @with_argparser(parser_aquant)
+    def do_aquant(self, args: argparse.Namespace):
+        """
+Attempt to calculate quantization for graph using one or more sample input files."""
+        self._check_graph()
+        input_args = self._get_input_args(args)
+        processed_input = False
+        stats_collector = ActivationStatsCollector()
+        for file_per_input in glob_input_files(args.input_files, self.G.num_inputs):
+            LOG.info("input file %s", file_per_input)
+            processed_input = True
+            data = [import_data(input_file, **input_args) for input_file in file_per_input]
+            stats_collector.collect_stats(self.G, data)
+        if not processed_input:
+            self.perror("No input files found")
+            return
+        astats = stats_collector.reduce_stats()
+        if args.scheme == 'SQ8':
+            quantizer = MultQuantizer(astats, 8,
+                                      quantized_dimension=args.quant_dimension,
+                                      narrow_weights=not args.no_narrow_weights)
+        else:
+            stats_collector = FilterStatsCollector()
+            fstats = stats_collector.collect_stats(self.G)
+            quantizer = SymmetricQuantizer(astats, fstats,
+                                           force_width=args.force_width,
+                                           min_qsnr=args.qsnr)
+        qrecs = quantizer.quantize(self.G)
+        self.G.quantization = qrecs
+        if args.scheme == 'SQ8':
+            concats_matcher = EqualizeSymmetricMultiplicativeQuantivedConcats()
+            concats_matcher.match(self.G, set_identity=False)
+            softmax_qrec_matcher = PropagateSoftmaxSymQrec()
+            softmax_qrec_matcher.match(self.G, set_identity=False)
+        LOG.info("Quantization set. 
Use qshow command to see it.") diff --git a/tools/nntool/interpreter/commands/astats.py b/tools/nntool/interpreter/commands/astats.py new file mode 100644 index 000000000..484113663 --- /dev/null +++ b/tools/nntool/interpreter/commands/astats.py @@ -0,0 +1,69 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import argparse +import logging +from cmd2 import Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import (output_table, table_options, input_options, + glob_input_files) +from reports.activation_reporter import ActivationReporter +from stats.activation_stats_collector import ActivationStatsCollector +from utils.data_importer import import_data + +LOG = logging.getLogger("nntool") + +class AstatsCommand(NNToolShellBase): + # ASTATS COMMAND + parser_astats = Cmd2ArgumentParser() + parser_astats.add_argument('-q', '--qsnr', + type=float, default=30.0, help='QSNR threshold') + parser_astats.add_argument('-d', '--detail', + action="store_true", help='Show fusions detail') + parser_astats.add_argument('-s', + '--step', + type=int, + nargs=(1, 2), + help='display information by channel for step. You can indicate a fusion step with two values. 
The step_idx and the idx of the node in the fusion.')
+    table_options(parser_astats, default_width=180)
+    input_options(parser_astats)
+
+    @with_argparser(parser_astats)
+    def do_astats(self, args: argparse.Namespace):
+        """
+Calculate activation statistics on one or more input files."""
+        self._check_graph()
+        input_args = self._get_input_args(args)
+        stats_collector = ActivationStatsCollector()
+        step_idx = args.step
+        if step_idx is not None:
+            if len(step_idx) == 1:
+                step_idx = step_idx[0]
+            else:
+                step_idx = tuple(step_idx)
+        if len(args.input_files) == 0:
+            self.perror("You must enter some files to process")
+            return
+        for file_per_input in glob_input_files(args.input_files, self.G.num_inputs):
+            LOG.info("input file %s", file_per_input)
+            data = [import_data(input_file, **input_args) for input_file in file_per_input]
+            stats_collector.collect_stats(self.G, data)
+
+        fmt = ('tab' if args.output is None else args.output['fmt'])
+        tab = ActivationReporter(do_totals=(fmt != "csv"),
+                                 threshold=args.qsnr,
+                                 yield_fusions=args.detail or isinstance(step_idx, tuple)).report(self.G,
+                                                                                                  stats_collector.reduce_stats())
+        output_table(tab, args)
diff --git a/tools/nntool/interpreter/commands/dump.py b/tools/nntool/interpreter/commands/dump.py
new file mode 100644
index 000000000..fc31d70fb
--- /dev/null
+++ b/tools/nntool/interpreter/commands/dump.py
@@ -0,0 +1,229 @@
+# Copyright (C) 2020 GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+ +import argparse +import logging +import pickle + +import numpy as np +from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser + +from execution.graph_executer import GraphExecuter +from execution.quantization_mode import QuantizationMode +from graph.types import ConvFusionParameters, FilterParameters +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import (glob_input_files, + input_options) +from utils.data_importer import import_data +from utils.node_id import NodeId +from utils.at_norm import set_do_rounding, get_do_rounding + +LOG = logging.getLogger('nntool.'+__name__) + + +def format_dump_file(G, outputs, quantized, dequantize, quantize_step): + # simplify the output since we only have one for now and add weights + foutputs = [] + for idx, out in enumerate(outputs): + if quantize_step == idx: + step_is_quantized = True + step_is_dequantized = True + elif quantized: + step_is_quantized = True + step_is_dequantized = dequantize + else: + step_is_quantized = False + step_is_dequantized = False + + tensors = [out[0]] + node = G.graph_state.steps[idx]['node'] + if isinstance(node, ConvFusionParameters): + for filt in node.contained_filters(): + if step_is_quantized: + qrec = G.quantization[NodeId(node, filt)] + if G.has_quantized_parameters: + if step_is_dequantized: + qrec = G.quantization[NodeId(node, filt)] + tensors.append(qrec.weights_q.get_dequantized(filt.weights)) + tensors.append(qrec.biases_q.get_dequantized(filt.biases)) + else: + tensors.append(np.copy(filt.weights)) + tensors.append(qrec.biases_q.get_quantized(filt.biases)) + else: + if step_is_dequantized: + tensors.append(np.copy(filt.weights)) + tensors.append(np.copy(filt.biases)) + else: + tensors.append(qrec.weights_q.quantize(filt.weights)) + tensors.append(qrec.biases_q.quantize(filt.biases)) + else: + if G.has_quantized_parameters: + qrec = G.quantization[NodeId(node, filt)] + tensors.append(qrec.weights_q.get_dequantized(filt.weights)) + tensors.append(qrec.biases_q.get_dequantized(filt.biases)) + else: + tensors.append(np.copy(filt.weights)) + tensors.append(np.copy(filt.biases)) + elif isinstance(node, FilterParameters): + if step_is_quantized: + qrec = G.quantization[NodeId(node, None)] + if G.has_quantized_parameters: + if step_is_dequantized: + tensors.append(qrec.weights_q.get_dequantized(node.weights)) + tensors.append(qrec.biases_q.get_dequantized(node.biases)) + else: + tensors.append(np.copy(node.weights)) + tensors.append(qrec.biases_q.get_quantized(node.biases)) + else: + if step_is_dequantized: + tensors.append(np.copy(node.weights)) + tensors.append(np.copy(node.biases)) + else: + tensors.append(qrec.weights_q.quantize(node.weights)) + tensors.append(qrec.biases_q.quantize(node.biases)) + else: + if G.has_quantized_parameters: + qrec = G.quantization[NodeId(node, None)] + tensors.append(qrec.weights_q.dequantize(node.weights)) + tensors.append(qrec.biases_q.dequantize(node.biases)) + else: + tensors.append(np.copy(node.weights)) + tensors.append(np.copy(node.biases)) + else: + tensors.append(None) + tensors.append(None) + foutputs.append(tuple(tensors)) + return foutputs + + +class DumpCommand(NNToolShellBase): + # DUMP COMMAND + parser_dump = Cmd2ArgumentParser() + parser_dump.add_argument('-s', '--step', + type=int, help='step to dump output of', default=None) + parser_dump.add_argument('-w', '--number_width', + type=int, help='width of numbers', default=8) + parser_dump.add_argument('-p', '--precision', + type=int, help='number of decimal places', 
default=4)
+    parser_dump.add_argument('-c', '--channel',
+                             type=int, help='channel to dump', default=None)
+    parser_dump.add_argument('-d', '--dequantize',
+                             action='store_true', help='dequantize result')
+    parser_dump.add_argument('--quantize_and_dequantize',
+                             action='store_true', help='quantize and dequantize float results')
+    parser_dump_group = parser_dump.add_mutually_exclusive_group(required=False)
+    parser_dump_group.add_argument('-q', '--quantize', action='store_true',
+                                   help='quantize the graph (must have already set quantization)')
+    parser_dump_group.add_argument('-Q', '--quantize_step', type=int,
+                                   help='quantize a step of the graph (must have already' +
+                                   ' set quantization)',
+                                   default=None)
+    parser_dump_group.add_argument('-A', '--quantize_all_steps',
+                                   action='store_true',
+                                   help='quantize all steps of the graph feeding' +
+                                   ' unquantized float data into each step')
+    parser_dump.add_argument('-P', '--pickle',
+                             completer_method=Cmd.path_complete,
+                             help='pickle all the output tensors to this file')
+    parser_dump.add_argument('-S', '--save',
+                             help='save the tensor to the tensors list')
+    input_options(parser_dump)
+
+    @with_argparser(parser_dump)
+    def do_dump(self, args: argparse.Namespace):
+        """
+Dump the activations resulting from running an input file through the graph.
+You can use the current quantization settings and can also just quantize one
+specific step of the graph."""
+        self._check_graph()
+        dequantize = args.dequantize if args.dequantize is not None\
+            else not (args.pickle or args.save)
+        if args.quantize or args.quantize_step or args.quantize_all_steps:
+            self._check_quantized()
+            if args.quantize:
+                if dequantize:
+                    qmode = QuantizationMode.all_dequantize()
+                else:
+                    qmode = QuantizationMode.all()
+            elif args.quantize_all_steps:
+                qmode = QuantizationMode.step_all()
+                dequantize = True
+            else:
+                qmode = QuantizationMode.step(args.quantize_step)
+        elif args.quantize_and_dequantize:
+            qmode = QuantizationMode.all_float_quantize_dequantize()
+        else:
+            qmode = QuantizationMode.none()
+        if args.step is not None:
+            step = args.step
+            num_steps = len(self.G.graph_state.steps)
+            if step < 0:
+                step = num_steps + step
+            if step < 0 or step > num_steps:
+                self.perror("step must be from {} to {}".format(-num_steps, num_steps))
+                return
+        else:
+            step = None
+
+        input_args = self._get_input_args(args)
+
+        pickles = []
+
+        for file_per_input in glob_input_files(args.input_files, self.G.num_inputs):
+            LOG.info("input file %s", file_per_input)
+            data = [import_data(input_file, **input_args) for input_file in file_per_input]
+            executer = GraphExecuter(self.G, qrecs=self.G.quantization)
+            outputs = executer.execute(data, step_idx_limit=step,
+                                       qmode=qmode)
+
+            if args.pickle or self._in_py or args.save:
+                pickles.append(format_dump_file(self.G, outputs, not qmode.is_none,
+                                                args.dequantize, args.quantize_step))
+            else:
+                self.G.print_intermediates(outputs, limit=step, width=args.number_width,
+                                           precision=args.precision, channel=args.channel,
+                                           order=['c', 'h', 'w'])
+
+        if args.pickle or args.save or self._in_py:
+            if not pickles:
+                self.perror("no input files found")
+                return
+            if len(args.input_files) == 1:
+                pickles = pickles[0]
+            if args.pickle:
+                with open(args.pickle, 'wb') as pickle_fp:
+                    pickle.dump(pickles, pickle_fp)
+            if args.save:
+                self.tensor_store[args.save] = pickles
+
+        if self._in_py:
+            self.last_result = pickles
+
+
+class RoundingCommand(NNToolShellBase):
+    # ROUNDING COMMAND
+    parser_round = Cmd2ArgumentParser()
+
parser_round.add_argument('round', + choices=['on', 'off'], + nargs=(0, 1), + help='switch rounding on or off') + + @with_argparser(parser_round) + def do_rounding(self, args: argparse.Namespace): + """ +Switch rounding on and off in quantized calculations.""" + if args.round is not None: + set_do_rounding(args.round == 'on') + LOG.info("rounding is %s", 'on' if get_do_rounding() else 'off') diff --git a/tools/nntool/interpreter/commands/extract.py b/tools/nntool/interpreter/commands/extract.py new file mode 100644 index 000000000..ce781d1ee --- /dev/null +++ b/tools/nntool/interpreter/commands/extract.py @@ -0,0 +1,34 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from cmd2 import Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from graph.manipulations.extract import extract_node + +class ExtractCommand(NNToolShellBase): + # EXTRACT COMMAND + parser_extract = Cmd2ArgumentParser() + parser_extract.add_argument('step', + type=int, + help='step number to extract') + + @with_argparser(parser_extract) + def do_extract(self, args): + """ +Extracts a single step out of a graph and forms a new graph with inputs and outputs to this step.""" + self._check_graph() + if args.step < 0 or args.step > len(self.G.graph_state.steps): + self.perror("step must be between 0 and {}".format(len(self.G.graph_state.steps))) + extract_node(self.G, self.G.graph_state.steps[args.step]['node']) diff --git a/tools/nntool/interpreter/commands/fquant.py b/tools/nntool/interpreter/commands/fquant.py new file mode 100644 index 000000000..03bf62594 --- /dev/null +++ b/tools/nntool/interpreter/commands/fquant.py @@ -0,0 +1,55 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
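The fquant command added below drives the POW2 (power-of-two, fixed-point) quantizer using random activations. A minimal sketch of the POW2 idea, with a hypothetical helper name (the real SymmetricQuantizer also uses QSNR and filter statistics): choose a Qx.y format whose integer bits cover the observed dynamic range, and give the remaining bits to the fraction.

import math

def pow2_qformat(max_abs, width=8):
    # integer bits needed to cover max_abs; the rest go to the fractional part
    int_bits = max(0, math.ceil(math.log2(max_abs))) if max_abs > 1.0 else 0
    frac_bits = width - 1 - int_bits   # one bit reserved for the sign
    return int_bits, frac_bits         # store x as round(x * 2**frac_bits)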
+
+import argparse
+import numpy as np
+from cmd2 import Cmd2ArgumentParser, with_argparser
+from interpreter.nntool_shell_base import NNToolShellBase
+from interpreter.shell_utils import output_table, table_options
+from quantization.symmetric.symmetric_quantizer import SymmetricQuantizer
+from reports.quantization_reporter import QuantizationReporter
+from stats.activation_stats_collector import ActivationStatsCollector
+from stats.fake_filter_stats_collector import FakeFilterStatsCollector
+from utils.stats_funcs import STATS_BITS
+
+class FquantCommand(NNToolShellBase):
+    #FQUANT COMMAND
+    parser_fquant = Cmd2ArgumentParser()
+    parser_fquant.add_argument('-f', '--force_width',
+                               choices=STATS_BITS, default=8, type=int, help='force all layers to this width')
+    table_options(parser_fquant, default_width=140)
+
+    @with_argparser(parser_fquant)
+    def do_fquant(self, args: argparse.Namespace):
+        """
+Attempt to calculate a fake quantization for graph using random tensors and parameters.
+This is intended to allow code generation for performance testing even if no real
+weights and input data are available."""
+        self._check_graph()
+        self.G.constant_store.fake = True
+        stats_collector = ActivationStatsCollector()
+        input_tensors = [np.random.normal(0, 0.2, input.dims.shape)
+                         for input in self.G.input_nodes()]
+        stats_collector.collect_stats(self.G, input_tensors)
+        astats = stats_collector.reduce_stats()
+        stats_collector = FakeFilterStatsCollector()
+        fstats = stats_collector.collect_stats(self.G)
+        quantizer = SymmetricQuantizer(astats, fstats,
+                                       force_width=args.force_width)
+        qrecs = quantizer.quantize(self.G)
+        self.G.quantization = qrecs
+        tab = QuantizationReporter().report(self.G, qrecs)
+        output_table(tab, args)
+        self.G.constant_store.fake = False
diff --git a/tools/nntool/interpreter/commands/freeze.py b/tools/nntool/interpreter/commands/freeze.py
new file mode 100644
index 000000000..89e8ce514
--- /dev/null
+++ b/tools/nntool/interpreter/commands/freeze.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2020 GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+ +import logging +import argparse +from itertools import chain +from cmd2 import Cmd2ArgumentParser, with_argparser +from utils.node_id import NodeId +from interpreter.nntool_shell_base import NNToolShellBase +from graph.types.others import InputOutputParameters + + +LOG = logging.getLogger("nntool") + + +class FreezeCommand(NNToolShellBase): + def inputs_and_outputs(self): + if self.G is None: + return [] + return [node.name for node in chain(self.G.inputs_and_constants(), self.G.outputs())] + + # FREEZE COMMAND + parser_freeze = Cmd2ArgumentParser("toggle freezing of channel order of inputs or outputs") + parser_freeze.add_argument('node_names', + nargs='+', + choices_method=inputs_and_outputs, + help='input or output node names to toggle freeze') + + @with_argparser(parser_freeze) + def do_freeze(self, args: argparse.Namespace): + """ +Toggle freezing of channel order on inputs and outputs. When graph is adjusted frozen nodes + will not change channel order.""" + self._check_graph() + nodes = [self.G.node(node_name) for node_name in args.node_names] + if not all([isinstance(node, InputOutputParameters) for node in nodes]): + self.perror("all nodes should be inputs or outputs") + return + + for node in nodes: + if node.fixed_order: + LOG.info("node %s is unfrozen", node.name) + node.fixed_order = False + else: + LOG.info("node %s is frozen", node.name) + node.fixed_order = True + self.G.node_options[NodeId(node)] = node.at_options diff --git a/tools/nntool/interpreter/commands/fusions.py b/tools/nntool/interpreter/commands/fusions.py new file mode 100644 index 000000000..77a16e5db --- /dev/null +++ b/tools/nntool/interpreter/commands/fusions.py @@ -0,0 +1,68 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from cmd2 import Cmd2ArgumentParser, with_argparser + +from graph.matches.matches import get_fusion, get_fusions, get_pow2_match_group, get_scale8_match_group +from interpreter.nntool_shell_base import NNToolShellBase + + +class FusionsCommand(NNToolShellBase): + # FUSIONS COMMAND + def fusions_list(self): + return [elem[0] for elem in get_fusions()] + + parser_fusions = Cmd2ArgumentParser("apply fusions to graph") + parser_fustions_exclusive = parser_fusions.add_mutually_exclusive_group() + parser_fustions_exclusive.add_argument('-l', '--list', + action='store_true', + help='list available fusions') + parser_fustions_exclusive.add_argument('-a', '--apply', + type=str, + nargs='+', + choices_method=fusions_list, + help='apply a fusion') + parser_fustions_exclusive.add_argument('--pow2', + action='store_true', + help='apply standard fusions for AutoTiler POW2 kernels') + parser_fustions_exclusive.add_argument('--scale8', + action='store_true', + help='apply standard fusions for AutoTiler SQ8 kernels') + + @with_argparser(parser_fusions) + def do_fusions(self, args): + """ +Carry out the default set of fusions on the graph""" + self._check_graph() + if args.list: + self.ppaged("\n".join(["%s - %s" % (name, desc) for name, desc in get_fusions()])) + return + if args.apply: + fusions = [get_fusion(name) for name in args.apply] + if not fusions: + self.perror('fusion %s not found' % args.apply) + return + elif args.pow2: + fusions = [get_pow2_match_group()] + elif args.scale8: + fusions = [get_scale8_match_group()] + else: + self.perror("No fusion set selected. Nothing to do. Select --pow2 or --scale8.") + return + for fusion in fusions: + fusion.match(self.G) + self.G.add_dimensions() + if self.G.quantization and not self.G.quantization.verify_quantization(self.G): + self.G.quantization = None diff --git a/tools/nntool/interpreter/commands/gen.py b/tools/nntool/interpreter/commands/gen.py new file mode 100644 index 000000000..01c2eeef4 --- /dev/null +++ b/tools/nntool/interpreter/commands/gen.py @@ -0,0 +1,97 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+
+import argparse
+import logging
+import os
+from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser
+from interpreter.nntool_shell_base import NNToolShellBase
+from utils.data_importer import import_data
+from execution.graph_executer import GraphExecuter
+from execution.quantization_mode import QuantizationMode
+from generation.default_template import default_template, dynamic_template, header_template
+from generation.naming_convension import DefaultNamingConvension
+from generation.code_generator import CodeGenerator
+
+LOG = logging.getLogger("nntool")
+
+class GenCommand(NNToolShellBase):
+    # GEN COMMAND
+    parser_gen = Cmd2ArgumentParser()
+    parser_gen.add_argument('model_file',
+                            completer_method=Cmd.path_complete,
+                            nargs=argparse.OPTIONAL,
+                            help='file to write to, otherwise output to terminal')
+    parser_gen.add_argument('-T', '--tensor_directory',
+                            completer_method=Cmd.path_complete,
+                            help='path to tensor directory. full path will be created' +
+                            ' if it doesn\'t exist. If this parameter is given it will' +
+                            ' update the settings saved with the graph state.')
+    parser_gen.add_argument('-M', '--model_directory',
+                            completer_method=Cmd.path_complete,
+                            help='path to model directory. full path will be created' +
+                            ' if it doesn\'t exist. If this parameter is given it will' +
+                            ' update the settings saved with the graph state.')
+    parser_gen.add_argument('-t', '--output_tensors',
+                            action='store_true',
+                            help='write constants (weights, biases)')
+    parser_gen.add_argument('-c', '--checksums',
+                            completer_method=Cmd.path_complete,
+                            help='generate checksum tests in code for the given file')
+    parser_gen.add_argument('--header_file',
+                            completer_method=Cmd.path_complete,
+                            help='generate header file with layers information')
+
+    @with_argparser(parser_gen)
+    def do_gen(self, args):
+        """
+Generate AutoTiler model C code and optionally dump tensors. If no destination file is
+given the generated code will be output to the screen. 
Check the 'set' command for +settings related to code generation.""" + self._check_graph() + self._check_quantized() + self._check_adjusted() + if args.checksums: + input_args = self._get_input_args(None) + LOG.info("input file %s", args.checksums) + data = import_data(args.checksums, **input_args) + executer = GraphExecuter(self.G, qrecs=self.G.quantization) + executer.execute([data], qmode=QuantizationMode.all()) + self.settings['checksum_file'] = args.checksums + self.settings['generate_checksums'] = True + + if args.tensor_directory: + self.settings['tensor_directory'] = args.tensor_directory + if args.model_directory: + self.settings['model_directory'] = args.model_directory + code_gen = CodeGenerator(self.G, DefaultNamingConvension(self.G), self.settings) + + if self.settings['template_file']: + code_template = dynamic_template(self.settings['template_file']) + else: + code_template = default_template + + if args.model_file: + with open(os.path.join(self.settings['model_directory'], + args.model_file), "w") as output_fp: + output_fp.write(code_template(self.G, code_generator=code_gen)) + else: + self.ppaged(code_template(self.G, code_generator=code_gen)) + if args.output_tensors: + code_gen.write_constants() + + if args.header_file: + with open(os.path.join(self.settings['model_directory'], args.header_file), "w") as output_fp: + output_fp.write(header_template(self.G, code_generator=code_gen)) diff --git a/tools/nntool/interpreter/commands/graph.py b/tools/nntool/interpreter/commands/graph.py new file mode 100644 index 000000000..a8310a828 --- /dev/null +++ b/tools/nntool/interpreter/commands/graph.py @@ -0,0 +1,77 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+
+import argparse
+from cmd2 import Cmd2ArgumentParser, with_argparser, CompletionItem
+from interpreter.shell_utils import output_table, table_options
+from interpreter.nntool_shell_base import NNToolShellBase
+from reports.graph_reporter import GraphReporter
+
+class GraphCommand(NNToolShellBase):
+    # GRAPH COMMAND
+
+    def other_open_graphs(self, only_open=False):
+        items = []
+        for graph_idx, graph in enumerate(self._graphs):
+            if graph_idx == self._graph_idx:
+                continue
+            if graph['G'] is None:
+                if only_open:
+                    continue
+                name = "No Graph"
+            else:
+                name = graph['G'].name
+            items.append(CompletionItem(graph_idx, name))
+        return items
+
+    parser_graph = Cmd2ArgumentParser("display graph")
+    parser_graph.add_argument('graph_number',
+                              nargs=argparse.OPTIONAL,
+                              type=int,
+                              choices_method=other_open_graphs,
+                              help='graph to select or nothing to show open graphs')
+
+    @with_argparser(parser_graph)
+    def do_graph(self, args: argparse.Namespace):
+        """
+Select active graphs"""
+        if args.graph_number is not None:
+            if args.graph_number < 0 or args.graph_number >= len(self._graphs):
+                self.perror("graph number is invalid")
+                return
+            self._graph_idx = args.graph_number
+            self.pfeedback("selected graph {}".format(self._graph_idx))
+            self._update_prompt()
+            self.py_locals['G'] = self.G
+        else:
+            for idx, rec in enumerate(self._graphs):
+                self.poutput("{:d} - {}".format(idx, rec['graph_file']))
+
+    # SHOW COMMAND
+    parser_show = Cmd2ArgumentParser("display graph")
+    table_options(parser_show, default_width=180)
+    parser_show.add_argument('step', type=int, nargs=(0, 1), help='Limit to step number')
+
+    @with_argparser(parser_show)
+    def do_show(self, args: argparse.Namespace):
+        """
+Display the structure of the graph"""
+        self._check_graph()
+        fmt = ('tab' if args.output is None else args.output['fmt'])
+        split_dims = fmt == "xls"
+        do_totals = fmt != "csv"
+        tab = GraphReporter(split_dims=split_dims, do_totals=do_totals,
+                            step=args.step).report(self.G, None)
+        output_table(tab, args)
diff --git a/tools/nntool/interpreter/commands/imageformat.py b/tools/nntool/interpreter/commands/imageformat.py
new file mode 100644
index 000000000..bf7df2bf8
--- /dev/null
+++ b/tools/nntool/interpreter/commands/imageformat.py
@@ -0,0 +1,141 @@
+# Copyright (C) 2020 GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+ +import argparse +from copy import deepcopy + +from cmd2 import Cmd2ArgumentParser, with_argparser +from quantization.multiplicative.mult_quantization import MultQuantizationRecordBase +from quantization.multiplicative.asymmetric.asymmetric_mult_qtype import AsymmetricMultQType +from quantization.symmetric.symmetric_quantization import SymmetricQuantizationBase +from quantization.qtype import QType +from graph.types import ImageFormatParameters +from interpreter.nntool_shell_base import NNToolShellBase +from utils.graph import Edge +from utils.node_id import NodeId + + +class ImageFormatCommand(NNToolShellBase): + def inputs_choices(self): + if self.G is None: + return [] + return [node.name for node in self.G.inputs()] + + def format_choices(self): + return [fmt.lower() for fmt in ImageFormatParameters.FORMAT_CHANGES] + ['none'] + + def norm_choices(self): + return [fmt.lower() for fmt in ImageFormatParameters.NORMALIZATIONS] + ['none'] + + # IMAGEFORMAT COMMAND + parser_imageformat = Cmd2ArgumentParser("inserts image format node into graphs") + parser_imageformat.add_argument('input_node', + choices_method=inputs_choices, + help='input node name to format') + parser_imageformat.add_argument('image_formatter', + choices_method=format_choices, + help='input node name to format') + parser_imageformat.add_argument('image_normalizer', + choices_method=norm_choices, + help='input node name to format') + + @with_argparser(parser_imageformat) + def do_imageformat(self, args: argparse.Namespace): + """ Add or modify image format options.""" + self._check_graph() + if args.input_node not in self.G: + self.perror("input node not found") + return + input_node = self.G[args.input_node] + out_edge = self.G.out_edges(input_node.name)[0] + if isinstance(out_edge.to_node, ImageFormatParameters): + self.G.changes.image_format(input_node.name, None, None) + remove_formatter(self.G, out_edge) + self.G.add_dimensions() + return + if args.image_formatter == "none" and args.image_normalizer == "none": + self.pfeedback("no formatting set") + self.G.add_dimensions() + return + self.G.changes.image_format(input_node.name, args.image_formatter, args.image_normalizer) + insert_formatter(self.G, out_edge, args.image_formatter, args.image_normalizer) + self.G.add_dimensions() + + +def insert_formatter(G, out_edge, formatter, normalizer): + input_node = out_edge.from_node + format_node = ImageFormatParameters(input_node.name + "_formatter", + norm_func=normalizer.upper(), + format_change=formatter.upper()) + # dims updated to reflect formatter + if format_node.output_channels is not None and format_node.input_channels is not None: + out_dim = input_node.get_output_size(None)[0] + if not out_dim.is_named or out_dim.c != format_node.output_channels: + raise ValueError( + "current graph input is not named or does not match formatter output channels") + if formatter.upper() in ("RGB16", "BW16") and normalizer.upper() != "OUT_INT16": + raise ValueError( + "rgb16 and bw16 formatters must have out_int16 as normalization function") + in_dim = out_dim.clone() + in_dim.c = format_node.input_channels + in_dim.impose_order(("h", "w", "c")) + format_node.out_dims_hint = input_node.out_dims_hint + input_node.out_dims_hint = [["h", "w", "c"]] + format_node.in_dims_hint = [["h", "w", "c"]] + input_node.dims = in_dim + input_node.fixed_order = True + # qrec updated to reflect formatter + input_qrec = G.quantization and G.quantization.get(NodeId(input_node)) + if input_qrec and format_node.input_dtype and format_node.output_dtype: + 
formatter_qrec = G.quantization.get(NodeId(format_node)) + if not formatter_qrec: + if input_qrec.out_qs[0].dtype != format_node.output_dtype: + raise ValueError( + "current graph input output quantization does not match formatter output") + formatter_qrec = deepcopy(input_qrec) + formatter_qrec.out_qs[0] = deepcopy(formatter_qrec.out_qs[0]) + if isinstance(formatter_qrec, MultQuantizationRecordBase): + formatter_in_q = AsymmetricMultQType(scale=1, dtype=format_node.input_dtype) + elif isinstance(formatter_qrec, SymmetricQuantizationBase): + formatter_in_q = QType(q=0, dtype=format_node.input_dtype) + else: + raise NotImplementedError("quantization has unknown type") + if len(formatter_qrec.in_qs) > 0: + formatter_qrec.in_qs[0] = formatter_in_q + input_qrec.in_qs[0] = formatter_in_q + else: + formatter_qrec.in_qs.append(formatter_in_q) + input_qrec.in_qs.append(formatter_in_q) + input_qrec.out_qs[0] = formatter_in_q + G.quantization[NodeId(format_node)] = formatter_qrec + + G.insert_node(format_node, out_edge.from_node.name, + out_edge.to_node.name, to_idx=out_edge.to_idx) + +def remove_formatter(G, out_edge): + input_node = out_edge.from_node + fmt_node = out_edge.to_node + fmt_edge = G.out_edges(fmt_node.name)[0] + fmt_qrec = G.quantization and G.quantization.get(NodeId(fmt_node)) + G.remove(fmt_node) + input_node.dims = fmt_node.out_dims[0] + input_node.out_dims_hint = fmt_node.out_dims_hint + + G.add_edge(Edge(input_node, fmt_edge.to_node, to_idx=fmt_edge.to_idx)) + if fmt_qrec: + input_qrec = G.quantization[NodeId(input_node)] + input_qrec.out_qs = fmt_qrec.out_qs + input_qrec.in_qs = fmt_qrec.out_qs + G.quantization.remove_node(fmt_node) diff --git a/tools/nntool/interpreter/commands/nodeoption.py b/tools/nntool/interpreter/commands/nodeoption.py new file mode 100644 index 000000000..7ebc5fd37 --- /dev/null +++ b/tools/nntool/interpreter/commands/nodeoption.py @@ -0,0 +1,89 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from cmd2 import Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from utils.node_id import NodeId + +def nodeoption_choices_method(self, arg_tokens): + step_num = arg_tokens['step'][0] + if step_num == '*': + keys = [] + for step in self.G.graph_state.steps: + node = step['node'] + keys.extend(node.at_options.valid_options.keys()) + return keys + try: + step_num = int(step_num) + node = self.G.graph_state.steps[step_num]['node'] + return node.at_options.valid_options.keys() + except ValueError: + return [] + +def nodename_choices_method(self, arg_tokens): + step_start = arg_tokens['step'][0] + try: + _ = int(step_start) + return [] + except ValueError: + return [step['node'].name for step in self.G.graph_state.steps if step['node'].name.startswith(step_start)] + ["*"] + +class NodeoptionCommand(NNToolShellBase): + # nodeoption COMMAND + parser_nodeoption = Cmd2ArgumentParser() + parser_nodeoption.add_argument('step', nargs=(0, 1), choices_method=nodename_choices_method, help='Set this step number or name') + parser_nodeoption.add_argument('parameter', nargs=( + 0, 1), choices_method=nodeoption_choices_method, help='Set this parameter') + parser_nodeoption.add_argument('value', nargs=(0, 1), help='Set the parameter to this value') + + @with_argparser(parser_nodeoption) + def do_nodeoption(self, args): + """ Allows setting of autotiler generator control parameters and other code generation +options such as the location of inputs and outputs. For a complete set of the parameters that +can be set refer to the autotiler documentation.""" + self._check_graph() + if args.step is None or (args.step == '*' and args.parameter is None): + for nodeid, elem in self.G.node_options.items(): + print("{}: {}".format(nodeid, elem)) + return + + if args.step == '*': + nodes = [step['node'] for step in self.G.graph_state.steps] + else: + try: + try: + step = int(args.step) + nodes = [self.G.graph_state.steps[step]['node']] + except ValueError: + nodes = [self.G[args.step]] + except (IndexError): + self.perror("%s is not a valid step or node to set %s"%(args.step, args.parameter)) + return + + if args.parameter is None: + node_options = self.G.node_options.get(NodeId(nodes[0])) + if node_options: + print(node_options) + else: + print("nothing set") + return + if args.value is None: + val = None + else: + val = int(args.value) + for node in nodes: + node_options = node.at_options + setattr(node_options, args.parameter, val) + self.G.node_options[NodeId(node)] = node_options \ No newline at end of file diff --git a/tools/nntool/interpreter/commands/open.py b/tools/nntool/interpreter/commands/open.py new file mode 100644 index 000000000..7fe5d4857 --- /dev/null +++ b/tools/nntool/interpreter/commands/open.py @@ -0,0 +1,100 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import argparse +import os +import logging +from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser +from utils.new_param_state import STATE_EXTENSION, load_state +from importer.importer import create_graph +from interpreter.nntool_shell_base import NNToolShellBase +from quantization.cross_layer_range_eq import weight_equalization + +LOG = logging.getLogger("nntool") + +NO_GRAPH = { + 'G': None, + 'graph_file': "", + 'tensor_file': "" +} + +class OpenCommand(NNToolShellBase): +# OPEN COMMAND + parser_open = Cmd2ArgumentParser("open a graph file") + parser_open.add_argument('nnfile', + completer_method=Cmd.path_complete, + help='graph or state file', + metavar="INPUT_GRAPH or STATE_FILE") + parser_open.add_argument('tensor_file', + nargs=argparse.OPTIONAL, + completer_method=Cmd.path_complete, + help='optional tensor file') + parser_open.add_argument('-q', '--load_quantization', + help='load TFLite quantization information', action='store_true') + parser_open.add_argument('-d', '--load_dequantized', + help='load dequantized constant values from TFLite quantized graph', action='store_true') + parser_open.add_argument('-n', '--new', + help='open as new graph - keep existing graph open', + action='store_true') + + def __open_graph(self, graph_file, tensor_file, load_quantization, load_dequantized): + + graph_file = os.path.expanduser(graph_file) + + _, ext = os.path.splitext(graph_file) + + if ext == STATE_EXTENSION: + LOG.info("opening state file %s", graph_file) + self.graph_file = graph_file + self.G, extra = load_state(graph_file, return_extra=True) + self.settings.update(extra) + else: + LOG.info("opening graph file %s", graph_file) + opts = { + 'load_tensors': True, + 'load_quantization': load_quantization, + 'load_dequantized': load_dequantized + } + + G = create_graph(graph_file, opts=opts) + G.add_dimensions() + if tensor_file: + G.load_tensors(tensor_file) + self.G = G + self.graph_file = graph_file + if tensor_file is not None: + self.tensor_file = tensor_file + self.settings['load_quantization'] = bool(load_quantization) + if self.settings['adjust_order']: + LOG.info("adjusting order") + self.execute_adjust_order() + if self.settings['weight_equalization']: + LOG.info("equalizing weights") + weight_equalization(self.G, self.settings['equalization_threshold']) + + @with_argparser(parser_open) + def do_open(self, args: argparse.Namespace): + """ +Open a graph or state file""" + if args.new: + # reset the current graph + self._graphs.append(NO_GRAPH.copy()) + self._graph_idx = len(self._graphs) - 1 + else: + # reset the current graph + self._graphs[self._graph_idx] = NO_GRAPH.copy() + self.__open_graph(args.nnfile, args.tensor_file, args.load_quantization, args.load_dequantized) + self._update_prompt() + self.py_locals['G'] = self.G diff --git a/tools/nntool/interpreter/commands/qerror.py b/tools/nntool/interpreter/commands/qerror.py new file mode 100644 index 000000000..389ebf8ad --- /dev/null +++ b/tools/nntool/interpreter/commands/qerror.py @@ -0,0 +1,69 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from cmd2 import Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import (glob_input_files, input_options, + output_table, table_options) +from utils.data_importer import import_data +from stats.step_error_stats_collector import StepErrorStatsCollector +from stats.error_stats_collector import ErrorStatsCollector +from reports.error_reporter import ErrorReporter + + +class QerrorCommand(NNToolShellBase): + # QERROR COMMAND + parser_qerror = Cmd2ArgumentParser() + parser_qerror.add_argument('-s', '--step', + action='store_true', + help='evaluate quantization per step. i.e.\ + individually quantize each layer') + parser_qerror.add_argument('--compare_quantized', + action='store_true', + help='quantize and dequantize the float output \ + to give it the same error as the quantized output of the layer') + parser_qerror.add_argument('-r', '--report_lowest', + type=int, help='QSNR threshold below which to report filename') + table_options(parser_qerror, default_width=140) + input_options(parser_qerror) + + @with_argparser(parser_qerror) + def do_qerror(self, args): + """ +Show quantization error introduced by processing one or more input files.""" + self._check_graph() + self._check_quantized() + fmt = ('tab' if args.output is None else args.output['fmt']) + input_args = self._get_input_args(args) + if args.step: + stats_collector = StepErrorStatsCollector(quant_compare=args.compare_quantized) + else: + stats_collector = ErrorStatsCollector(quant_compare=args.compare_quantized) + cnt = 0 + for file_per_input in glob_input_files(args.input_files, self.G.num_inputs): + cnt += 1 + + data = [import_data(input_file, **input_args) for input_file in file_per_input] + stat = stats_collector.collect_stats(self.G, data) + if args.report_lowest is not None: + lowest = min((elem['qsnr'] for elem in stat.values())) + if lowest < args.report_lowest: + self.pfeedback("{} had QSNR below threshold".format(file_per_input)) + if not cnt: + self.perror("no files to process") + return + tab = ErrorReporter(do_totals=(fmt != "csv"), one_input=cnt <= 1, with_chan=args.step)\ + .report(self.G, stats_collector.reduce_stats()) + output_table(tab, args) diff --git a/tools/nntool/interpreter/commands/qshow.py b/tools/nntool/interpreter/commands/qshow.py new file mode 100644 index 000000000..a3b4c85b5 --- /dev/null +++ b/tools/nntool/interpreter/commands/qshow.py @@ -0,0 +1,44 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
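The QSNR thresholds used by qerror above (and by astats and aquant earlier) are expressed in dB. A minimal sketch of the usual definition, written as a standalone helper; the project's own routine in utils.stats_funcs is assumed to be broadly equivalent, though its edge-case handling may differ:

import numpy as np

def qsnr(ref, test):
    # ratio of signal power to quantization-error power, in dB
    ref = np.asarray(ref, dtype=np.float64)
    err = ref - np.asarray(test, dtype=np.float64)
    noise = np.sum(err ** 2)
    if noise == 0:
        return float('inf')
    return 10.0 * np.log10(np.sum(ref ** 2) / noise)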
+ +import logging + +from cmd2 import Cmd2ArgumentParser, with_argparser + +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import output_table, table_options +from reports.quantization_reporter import QuantizationReporter + +LOG = logging.getLogger('nntool.'+__name__) + +class QshowCommand(NNToolShellBase): + # QSHOW COMMAND + parser_qshow = Cmd2ArgumentParser() + table_options(parser_qshow) + parser_qshow.add_argument('step', type=int, nargs=(0, 1), help='Limit to step number') + parser_qshow.add_argument('-s', '--show_wrapped', + action='store_true', + help='show original quantization parameters on multiplicative quantization') + + @with_argparser(parser_qshow) + def do_qshow(self, args): + """ +Show current quantization settings.""" + self._check_graph() + self._check_quantized() + tab = QuantizationReporter(step=args.step, + emit_wrapped=args.show_wrapped).report(self.G, + self.G.quantization) + output_table(tab, args) diff --git a/tools/nntool/interpreter/commands/qtune.py b/tools/nntool/interpreter/commands/qtune.py new file mode 100644 index 000000000..5b7f381da --- /dev/null +++ b/tools/nntool/interpreter/commands/qtune.py @@ -0,0 +1,53 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from cmd2 import Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from quantization.tuneq import tuneq + +class QtuneCommand(NNToolShellBase): + # QTUNE COMMAND + parser_tune = Cmd2ArgumentParser() + parser_tune.add_argument('step', + type=int, help='step to tune') + parser_tune.add_argument('parameter', + choices=['acc', 'calc', 'weights', 'biases', 'dp', 'out'], + help='which parameter to tune') + parser_tune.add_argument('X', + nargs='?', + default=0, + type=int, help='X of QX.Y') + parser_tune.add_argument('Y', + nargs='?', + default=0, + type=int, help='Y of QX.Y') + parser_tune.add_argument('index', + nargs='?', + default=0, + type=int, help='edge index') + parser_tune.add_argument('-f', + '--sub_step_fusion', + type=int, + help='index of the subnode for qtune inside of a fused one') + + @with_argparser(parser_tune) + def do_qtune(self, args): + """ +Tune quantization of graph.""" + self._check_graph() + self._check_quantized() + + tuneq(self.G, self.G.quantization, args.step, + args.parameter, args.X, args.Y, index=args.index) diff --git a/tools/nntool/interpreter/commands/range_equalization.py b/tools/nntool/interpreter/commands/range_equalization.py new file mode 100644 index 000000000..b65e01702 --- /dev/null +++ b/tools/nntool/interpreter/commands/range_equalization.py @@ -0,0 +1,81 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import argparse +from cmd2 import Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import glob_input_files, input_options +from utils.data_importer import import_data +from stats.step_error_stats_collector import StepErrorStatsCollector +from quantization.cross_layer_range_eq import adjust_biases, weight_equalization + +class WeightEqualizationCommand(NNToolShellBase): + # WEIGHT_EQUALIZATION COMMAND + parser_we = Cmd2ArgumentParser() + parser_we.add_argument('threshold', + type=float, default=0.1, + help='convergence threshold') + + def execute_weight_equalization(self, threshold): + if not (threshold > 0 and threshold < 10): + self.perror("threshold should be 10 > x > 0") + weight_equalization(self.G, threshold=threshold) + + @with_argparser(parser_we) + def do_weight_equalization(self, args: argparse.Namespace): + """ +Run weight equalization on graph. This reduces variance between weight +channels and may improve quantization accuracy.""" + self._check_graph() + self.execute_weight_equalization(args.threshold) + +class BalanceFiltersCommand(NNToolShellBase): + # BALANCE_FILTERS COMMAND + parser_bf = Cmd2ArgumentParser() + parser_bf.add_argument('-s', '--step', + type=int, help='step to balance. 
should be a convolution')
+    parser_bf.add_argument('-t', '--threshold',
+                           default=0.20,
+                           type=float, help='precision threshold of weights below which a layer should be balanced')
+
+    @with_argparser(parser_bf)
+    def do_balance_filters(self, args: argparse.Namespace):
+        """
+Balance filter weights. This will reduce variance in weights and will result in
+a more balanced quantization at the expense of a multiplicative bias calculation."""
+        self._check_graph()
+        self.G.balance_filters(step_idx=args.step, precision_threshold=args.threshold)
+        self.G.quantization = None
+
+class BcorrCommand(NNToolShellBase):
+    # BCORR COMMAND
+    parser_bcorr = Cmd2ArgumentParser()
+    input_options(parser_bcorr)
+
+    @with_argparser(parser_bcorr)
+    def do_bcorr(self, args):
+        """
+Correct biases with average quantization error."""
+        self._check_graph()
+        self._check_quantized()
+        stats_collector = StepErrorStatsCollector()
+        input_args = self._get_input_args(args)
+        cnt = 0
+        for file_per_input in glob_input_files(args.input_files, self.G.num_inputs):
+            cnt += 1
+            data = [import_data(filename, **input_args) for filename in file_per_input]
+            stats_collector.collect_stats(self.G, data)
+
+        adjust_biases(self.G, stats_collector.reduce_stats())
diff --git a/tools/nntool/interpreter/commands/save_state.py b/tools/nntool/interpreter/commands/save_state.py
new file mode 100644
index 000000000..cf8b7765b
--- /dev/null
+++ b/tools/nntool/interpreter/commands/save_state.py
@@ -0,0 +1,47 @@
+# Copyright (C) 2020 GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+import argparse
+import configparser
+import os
+from functools import partial
+from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser
+from interpreter.nntool_shell_base import NNToolShellBase
+from interpreter.shell_utils import filter_dirs
+from generation.autotiler_options import DEFAULT_GEN_OPTS
+from utils.new_param_state import dump_state
+
+class SaveStateCommand(NNToolShellBase):
+    # SAVE_STATE COMMAND
+    parser_save_state = Cmd2ArgumentParser()
+    parser_save_state.add_argument('output',
+                                   completer_method=Cmd.path_complete,
+                                   nargs=argparse.OPTIONAL,
+                                   help='file to write to')
+
+    @with_argparser(parser_save_state)
+    def do_save_state(self, args):
+        """
+Save the state of the transforms and quantization of the graph.
+This state file can be used to generate the model file as part of
+a build script. If no argument is given then the state files
+will be saved in the same directory as the graph. If a directory is
+given then the state files will be saved in it with the graph
+basename.
If a filename is given, its basename will be used to +save the state files.""" + self._check_graph() + self._check_quantized() + gen_opts = {k: self.settings[k] for k in DEFAULT_GEN_OPTS} + dump_state(self.G, state_path=args.output, extra=gen_opts) diff --git a/tools/nntool/interpreter/commands/stats.py b/tools/nntool/interpreter/commands/stats.py new file mode 100644 index 000000000..03738d014 --- /dev/null +++ b/tools/nntool/interpreter/commands/stats.py @@ -0,0 +1,59 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import argparse +from cmd2 import Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import output_table, table_options +from reports.filter_reporter import (FilterDetailedStatsReporter, + FilterStatsReporter) +from stats.filter_stats_collector import (FilterDetailedStatsCollector, + FilterStatsCollector) + +class StatsCommand(NNToolShellBase): + # STATS COMMAND + parser_stats = Cmd2ArgumentParser("display statistics on globals") + parser_stats.add_argument('-d', '--detailed', + action="store_true", help='Dump detailed statistics') + parser_stats.add_argument('-q', '--qsnr', + type=float, default=30.0, help='QSNR threshold') + parser_stats.add_argument('-s', '--step', + type=int, + nargs=(1, 2), + help='display information by channel for step') + table_options(parser_stats, default_width=180) + + @with_argparser(parser_stats) + def do_stats(self, args: argparse.Namespace): + """ +Display statistics on weights and biases""" + self._check_graph() + fmt = ('tab' if args.output is None else args.output['fmt']) + if args.detailed: + stats_collector = FilterDetailedStatsCollector() + stats = stats_collector.collect_stats(self.G) + tab = FilterDetailedStatsReporter().report(self.G, stats) + else: + step_idx = args.step + if step_idx is not None: + if len(step_idx) == 1: + step_idx = step_idx[0] + else: + step_idx = tuple(step_idx) + stats_collector = FilterStatsCollector() + stats = stats_collector.collect_stats(self.G, step_idx=step_idx) + tab = FilterStatsReporter(do_totals=(fmt != "csv"), threshold=args.qsnr, step_idx=step_idx)\ + .report(self.G, stats) + output_table(tab, args) diff --git a/tools/nntool/interpreter/commands/temps.py b/tools/nntool/interpreter/commands/temps.py new file mode 100644 index 000000000..7c312d428 --- /dev/null +++ b/tools/nntool/interpreter/commands/temps.py @@ -0,0 +1,36 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import table_options, output_table +from stats.temps_stats_collector import TempsStatsCollector +from reports.temps_reporter import TempsReporter + +class TempsCommand(NNToolShellBase): + # TEMPS COMMAND + parser_temps = Cmd2ArgumentParser() + table_options(parser_temps, default_width=140) + + @with_argparser(parser_temps) + def do_temps(self, args): + """ +Show statistics on activations.""" + self._check_graph() + fmt = ('tab' if args.output is None else args.output['fmt']) + stats_collector = TempsStatsCollector() + stats = stats_collector.collect_stats(self.G) + tab = TempsReporter(do_totals=(fmt != "csv")).report(self.G, stats) + output_table(tab, args) diff --git a/tools/nntool/interpreter/commands/tensors.py b/tools/nntool/interpreter/commands/tensors.py new file mode 100644 index 000000000..025710c72 --- /dev/null +++ b/tools/nntool/interpreter/commands/tensors.py @@ -0,0 +1,197 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np +from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser + +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import print_comparison +from utils.gap_tensor_file import read_gap_tensors, write_gap_tensor +from utils.stats_funcs import qsnr +from utils.at_tensor_loader import at_map_tensors, at_tensor_loader + +LOG = logging.getLogger('nntool.'+__name__) + + +class TensorsCommand(NNToolShellBase): + # TENSORS_COMMAND + parser_tensors = Cmd2ArgumentParser() + parser_tensors.add_argument('-c', '--channel', + nargs=(1, 2), + type=int, + help='channel to compare') + parser_tensors.add_argument('-s', '--step', + type=int, + help='step to compare') + parser_outexclu = parser_tensors.add_mutually_exclusive_group() + parser_outexclu.add_argument('-Q', '--compare_qsnr', + action='store_true', + help='compare two tensors QSNR') + parser_outexclu.add_argument('-E', '--compare_error', + action='store_true', + help='compare two tensors error (first - second)') + parser_tensors.add_argument('-n', '--name', + type=str, + choices_method=lambda x: x.tensor_store_names, + help='name to use for the tensor in the tensor store') + parser_tensors.add_argument('--write_numpy', + type=str, + completer_method=Cmd.path_complete, + help='write a tensor in npy format. you must select a step. ' + + 'the output of this step is written. 
specify a single tensor with ' + + 'the -t option.') + parser_tensors.add_argument('-m', '--make_filename', + type=str, + completer_method=Cmd.path_complete, + help='write a makefile including the dimensions of the tensor written ' + + 'and the dimensions of the input to the node that produced it.') + parser_texclu1 = parser_tensors.add_mutually_exclusive_group() + parser_texclu1.add_argument('-W', '--weights', + action='store_true', + help='compare weights') + parser_texclu1.add_argument('-B', '--biases', + action='store_true', + help='compare biases') + parser_texclu2 = parser_tensors.add_mutually_exclusive_group() + parser_texclu2.add_argument('-t', '--tensors', + nargs=(1, 2), + type=str, + choices_method=lambda x: x.tensor_store_names, + help='compare two tensors') + parser_texclu2.add_argument('-g', '--gap_load', + completer_method=Cmd.path_complete, + help='load tensors dumped by autotiler code. ' + + 'Supply the filename and' + + ' an optional tensor store name. If none is given' + + ' the filename will be used.') + parser_texclu2.add_argument('-X', '--clear', + action='store_true', + help='clears the tensor store') + + @with_argparser(parser_tensors) + def do_tensors(self, args): + """ +Load and manipulate tensors. If no option is supplied the saved tensors will be listed. +All the tensors in the store are available in dictionary 'tensors' in the python console +accessed by the command 'py'. Tensors can be displayed side by side or the average absolute +error or QSNR displayed. If a step is selected then the error by channel will be displayed.""" + if args.clear: + self.pfeedback('tensor store cleared') + self.tensor_store.clear() + return + if args.gap_load: + store_name = args.gap_load if not args.name else args.name + self.tensor_store[store_name] = at_map_tensors(self.G, at_tensor_loader(args.gap_load)) + return + if args.tensors: + if len(args.tensors) == 1: + tensor_name = args.tensors[0] + tensors = self.tensor_store.get(tensor_name) + if tensors is None: + self.perror("{} not in store".format(tensor_name)) + return + if args.step is None: + self.perror("you must select a step") + return + if args.step >= len(tensors): + self.perror("{} doesn't have that step".format(tensor_name)) + return + if tensors[args.step] is None: + self.perror("{} doesn't have this tensor for that step".format(tensor_name)) + return + tensor = tensors[args.step] + + if args.weights: + tensor = tensor[1] + elif args.biases: + tensor = tensor[2] + else: + tensor = tensor[0] + if args.write_numpy: + np.save(args.write_numpy, tensor) + else: + self.perror("not sure what to do with this single tensor") + return + + compare = args.tensors + tensors = [None]*2 + for i in range(2): + tensors[i] = self.tensor_store.get(compare[i]) + if tensors[i] is None: + self.perror("{} not in store".format(compare[i])) + return + if args.weights: + tensors[i] = [t[1] for t in tensors[i]] + elif args.biases: + tensors[i] = [t[2] for t in tensors[i]] + else: + tensors[i] = [t[0] for t in tensors[i]] + + if args.step is not None: + for i in range(2): + if args.step >= len(tensors[i]): + self.perror("{} doesn't have that step".format(compare[i])) + return + if tensors[i][args.step] is None: + self.perror( + "{} doesn't have this tensor for that step".format(compare[i])) + return + tensors[i] = [tensors[i][args.step]] + + if args.channel is not None: + for i in range(2): + for j, tensor in enumerate(tensors[i]): + if len(tensor.shape) <= len(args.channel): + self.perror("selected too many channels for this tensor") + for c 
in args.channel: + tensor = tensor[c] + tensors[i][j] = tensor + + if args.compare_qsnr or args.compare_error: + if args.compare_qsnr: + def func(x, y): + if x is not None and y is not None: + return qsnr(x.astype(np.float), y.astype(np.float)) + return float('nan') + else: + def func(x, y): + if x is not None and y is not None: + return np.abs(x - y) + return float('nan') + + if args.step is not None: + print("error for step %s" % args.step) + if args.channel is not None: + print("error for dimensions [%s]" % + (",".join([str(chan) for chan in args.channel]))) +#pylint: disable=unsubscriptable-object + out = [func(tensors[0][0][i], tensors[1][0][i]) + for i in range(len(tensors[0][0]))] + else: + out = [func(t1, t2) + for t1, t2 in zip(*tensors)] + for idx, val in enumerate(out): + if idx % 10 == 0: + print("\n{:03d} {:03d}: ".format(idx, idx+9), end='') + print('{}{}'.format(val, "" if (idx + 1) % 10 == 0 else ", "), end='') + print() + else: + self.ppaged("\n".join(print_comparison(tensors))) + return + + for idx, k in enumerate(self.tensor_store): + print("{:3d}) {}".format(idx, k)) diff --git a/tools/nntool/interpreter/commands/validation.py b/tools/nntool/interpreter/commands/validation.py new file mode 100644 index 000000000..37fa53f1d --- /dev/null +++ b/tools/nntool/interpreter/commands/validation.py @@ -0,0 +1,144 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import argparse +import logging + +import numpy as np +from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser + +from execution.graph_executer import GraphExecuter +from execution.quantization_mode import QuantizationMode +from execution.execution_progress import ExecutionProgress +from interpreter.nntool_shell_base import NNToolShellBase +from interpreter.shell_utils import (glob_input_files, + input_options) +from utils.data_importer import import_data +from utils.validation_utils import ValidateFromJSON, ValidateFromName, ValidateFromClass, ValidateFromVWWInstances + +LOG = logging.getLogger('nntool.'+__name__) + + +class ValidationCommand(NNToolShellBase): + # VAL COMMAND + parser_val = Cmd2ArgumentParser() + parser_val.add_argument('-q', '--quantize', action='store_true', + help='quantize the graph (must have already set quantization)') + parser_val.add_argument('-s', '--silent', action='store_true', + help='do not print progress for each input') + parser_val.add_argument('--dataset_dir', + completer_method=Cmd.path_complete, + help='path to the directory of samples for test') + parser_val_group = parser_val.add_mutually_exclusive_group(required=False) + parser_val_group.add_argument('--label_json', + default=None, + completer_method=Cmd.path_complete, + help='path to the .json object containing labels annotation \ + { "filename0" : label0, "filename1": label1, ... 
}')
+    parser_val_group.add_argument('--class_number',
+                                  default=None,
+                                  type=int,
+                                  help='Number of a single class that all should match')
+    parser_val.add_argument('--progress_every',
+                            default=100,
+                            type=int,
+                            help='print accuracy every n computed predictions')
+    parser_val_group.add_argument('--vww_instances_file',
+                                  default=None,
+                                  completer_method=Cmd.path_complete,
+                                  help='path to the .json object containing labels instances\
+                                      with the visualwakewords format:\
+                                      instances = {images, annotations, categories}\
+                                      instances["images"] = { file_name:.., image_id:.. }\
+                                      instances["annotations"] = { image_id:.., label:..}')
+    input_options(parser_val)
+
+    @with_argparser(parser_val)
+    def do_validate(self, args: argparse.Namespace):
+        """
+Validate the model (quantized [-q] or not) in terms of prediction accuracy rate on a given dataset (images
+folder). Ground truth labels can be embedded in file names ("filename_03.[png, ppm, pgm]"; the number of
+digits must be consistent with the number of network outputs: e.g. in a 1000-class problem the label must
+use 3 digits, so "file_45.png" will raise an error) or can be written in a .json object (example: {'file0':label0,
+'file1':label1, ...}) and given to the function with --label_json
+"""
+        self._check_graph()
+        if args.quantize:
+            qmode = QuantizationMode.all_dequantize()
+        else:
+            qmode = QuantizationMode.none()
+
+        LOG.info("quantization mode - %s", qmode)
+        input_args = self._get_input_args(args)
+
+        good_predictions = []
+        good_margin = 0
+        bad_margin = 0
+
+        number_samples = sum(1 for _ in glob_input_files(args.input_files))
+
+        if args.vww_instances_file:
+            validation = ValidateFromVWWInstances(args.vww_instances_file)
+        elif args.label_json:
+            validation = ValidateFromJSON(args.label_json)
+        elif args.class_number is not None:
+            validation = ValidateFromClass(args.class_number)
+        else:
+            validation = ValidateFromName()
+
+        try:
+            ExecutionProgress.start()
+            for i, file_per_input in enumerate(glob_input_files(args.input_files, self.G.num_inputs)):
+                if not args.silent:
+                    LOG.info("input file %s", file_per_input)
+                data = [import_data(input_file, **input_args) for input_file in file_per_input]
+
+                executer = GraphExecuter(self.G, qrecs=self.G.quantization)
+                outputs = executer.execute(data, qmode=qmode, silent=args.silent)
+
+                good_prediction, class_predicted, real_class, margin = validation.validate(
+                    file_per_input[0], np.asarray(outputs[-1]))
+                good_predictions.append(good_prediction)
+                if good_prediction:
+                    good_margin += margin
+                else:
+                    bad_margin += margin
+
+                if not args.silent:
+                    LOG.info('Prediction is %s predicted %s correct %s margin %s',
+                             good_prediction, class_predicted, real_class, margin)
+                if not i % args.progress_every and i > 0:
+                    LOG.info('ACCURACY: %.3f %%', 100*sum(good_predictions)/len(good_predictions))
+
+                ExecutionProgress.progress(i, number_samples)
+            ExecutionProgress.end()
+
+        except (KeyboardInterrupt, SystemExit):
+            pass
+
+        self.py_locals['labels'] = validation.labels
+        self.py_locals['predictions'] = validation.predictions
+        cnt = len(good_predictions)
+        if cnt:
+            ngood = sum(good_predictions)
+            nbad = cnt - ngood
+            if nbad:
+                LOG.info("%s out of %s predicted falsely with %s average margin",
+                         nbad, cnt, bad_margin / nbad)
+            if ngood:
+                LOG.info("%s out of %s predicted correctly with %s average margin",
+                         ngood, cnt, good_margin / ngood)
+            accuracy_rate = 100*sum(good_predictions)/len(good_predictions)
+            LOG.info('Total accuracy: %.3f %%', accuracy_rate)
diff --git
a/tools/nntool/interpreter/generator.py b/tools/nntool/interpreter/generator.py index c66a95fc2..1de1571d3 100644 --- a/tools/nntool/interpreter/generator.py +++ b/tools/nntool/interpreter/generator.py @@ -20,7 +20,7 @@ from generation.code_generator import CodeGenerator from generation.naming_convension import DefaultNamingConvension -from generation.default_template import default_template, dynamic_template +from generation.default_template import default_template, dynamic_template, header_template from utils.new_param_state import load_state @@ -72,6 +72,9 @@ def generate_code(args): sys.exit(1) with open(model_path, "w") as output_fp: output_fp.write(model) + if args.header_file: + with open(os.path.join(opts['model_directory'], args.header_file), "w") as output_fp: + output_fp.write(header_template(G, code_generator=code_gen)) if not args.dont_dump_tensors: LOG.info("Writing constants to %s", opts['model_directory']) code_gen.write_constants() diff --git a/tools/nntool/interpreter/nntool_shell.py b/tools/nntool/interpreter/nntool_shell.py index 1328ff0ca..9f7bd0470 100644 --- a/tools/nntool/interpreter/nntool_shell.py +++ b/tools/nntool/interpreter/nntool_shell.py @@ -13,75 +13,33 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -import argparse -import configparser import logging -import os -import pickle -from functools import partial -from itertools import chain -import json -import numpy as np -from cmd2 import (Cmd, Cmd2ArgumentParser, CompletionItem, EmptyStatement, - with_argparser) -from cmd2.utils import cast as cmd2_cast - -from execution.execute_graph import ExecutionProgress, execute, execute_validation -from execution.quantization_mode import QuantizationMode -from generation.code_generator import (DEFAULT_GEN_OPTS, - DEFAULT_GEN_OPTS_DESCRIPTIONS, - CodeGenerator) -from generation.default_template import default_template, dynamic_template, header_template -from generation.naming_convension import DefaultNamingConvension -from graph.matches.matches import get_fusion, get_fusions, get_std_match_group -from graph.types.others import InputOutputParameters -from graph.manipulations.extract import extract_node -from importer.importer import create_graph -from quantization.cross_layer_range_eq import (adjust_biases, - weight_equalization) -from quantization.simple_auto_quantify import SimpleQuantizer -from quantization.tuneq import tuneq -from quantization.adjust_relun import adjust_relun -from reports.activation_reporter import ActivationReporter -from reports.error_reporter import ErrorReporter -from reports.filter_reporter import (FilterDetailedStatsReporter, - FilterStatsReporter) -from reports.graph_reporter import GraphReporter -from reports.quantization_reporter import QuantizationReporter -from reports.temps_reporter import TempsReporter -from stats.activation_stats_collector import ActivationStatsCollector -from stats.error_stats_collector import ErrorStatsCollector -from stats.fake_filter_stats_collector import FakeFilterStatsCollector -from stats.filter_stats_collector import (FilterDetailedStatsCollector, - FilterStatsCollector) -from stats.step_error_stats_collector import StepErrorStatsCollector -from stats.temps_stats_collector import TempsStatsCollector -from utils.data_importer import MODES, import_data -from utils.gap_tensor_file import read_gap_tensors, write_gap_tensor -from utils.intermediate_cache import IntermediateCache -from utils.new_param_state import STATE_EXTENSION, dump_state, 
load_state -from utils.stats_funcs import STATS_BITS, qsnr -from utils.node_id import NodeId -from utils.validation_utils import ValidateFromName, ValidateFromJSON - -from .shell_utils import (NNToolShellLogHandler, filter_dirs, find_choice, - format_dump_file, glob_input_files, input_options, - output_table, print_comparison, table_options) +from interpreter.nntool_shell_base import NNToolShellBase +from .commands.dump import DumpCommand, RoundingCommand +from .commands.tensors import TensorsCommand +from .commands.validation import ValidationCommand +from .commands.gen import GenCommand +from .commands.nodeoption import NodeoptionCommand +from .commands.adjust import AdjustCommand +from .commands.freeze import FreezeCommand +from .commands.fusions import FusionsCommand +from .commands.graph import GraphCommand +from .commands.imageformat import ImageFormatCommand +from .commands.open import OpenCommand +from .commands.save_state import SaveStateCommand +from .commands.aquant import AquantCommand +from .commands.fquant import FquantCommand +from .commands.qerror import QerrorCommand +from .commands.qtune import QtuneCommand +from .commands.range_equalization import ( + BalanceFiltersCommand, BcorrCommand, WeightEqualizationCommand) +from .commands.qshow import QshowCommand +from .commands.astats import AstatsCommand +from .commands.temps import TempsCommand LOG = logging.getLogger("nntool") -CHECK_GRAPH_ERROR = """ -A graph must be opened to use this command. Use the open command to open a graph. -""" - -CHECK_QUANTIZED_ERROR = """ -The opened graph must be quantized to use this command. Run the aquant command. -""" - -CHECK_ADJUSTED_ERROR = """ -The opened graph must be adjusted to use this command. Run the adjust command. -""" VALID_LOG_LEVELS = [ "INFO", @@ -89,1627 +47,38 @@ "WARNING" ] -EXTRA_PROPERTIES = { - 'log_level': 'set logging level (one of {} or number)'.format(", ".join(VALID_LOG_LEVELS)), - 'enable_cache': 'enable value caching', - 'load_quantization': 'load TFLITE quantization information', - 'fusions': 'run standard graph fusions on graph load', - 'adjust_order': 'adjust activation and parameter dimension order\ - to match autotiler on graph load', - 'weight_equalization': 'equalize weights on graph load', - 'equalization_threshold': 'threshold for weight equalization convergence', - 'adjust_image': 'adjust image input size and channels', - 'image_width': 'input image width', - 'image_height': 'input image height', - 'image_mode': 'input image mode (one of {})'.format(", ".join(MODES.keys())), - 'input_divisor': "divide input tensor values by this value", - 'input_offset': "add this value to input tensor values", - 'input_norm_func': "lambda function in the form x: fn(x) where x is any input", - 'graph_name': 'name of the graph used for code generation', - 'template_file': 'template file used for code generation' -} +COMMANDS = [ + AquantCommand, + QshowCommand, + DumpCommand, + TensorsCommand, + ValidationCommand, + WeightEqualizationCommand, + OpenCommand, + SaveStateCommand, + GenCommand, + NodeoptionCommand, + AdjustCommand, + FreezeCommand, + FusionsCommand, + GraphCommand, + FquantCommand, + QerrorCommand, + BalanceFiltersCommand, + BcorrCommand, + AstatsCommand, + TempsCommand, + RoundingCommand, + QtuneCommand, + ImageFormatCommand +] -NO_GRAPH = { - 'G': None, - 'graph_file': "", - 'tensor_file': "" -} -# pylint: disable=too-many-public-methods +class CommandMixer(type): + def __new__(cls, name, base, ns): + return type.__new__(cls, name, tuple(COMMANDS + 
list(base)), ns) -class NNToolShell(Cmd): +class NNToolShell(NNToolShellBase, metaclass=CommandMixer): intro = 'Welcome to NNTOOL. Type help or ? to list commands.\n' prompt = '(NNT) ' - - def __init__(self, args, nntool_workdir, *rest, **kwargs): - super().__init__(*rest, **kwargs) - self._nntool_workdir = nntool_workdir - self.settable.update(EXTRA_PROPERTIES) - self.settable.update(DEFAULT_GEN_OPTS_DESCRIPTIONS) - - self.settings = { - 'enable_cache': True, - 'cache_dir': './.value_cache', - 'load_quantization': False, - 'fusions': False, - 'adjust_order': False, - 'weight_equalization': False, - 'equalization_threshold': 0.1, - 'adjust_image': False, - 'image_width': -1, - 'image_height': -1, - 'image_mode': "", - 'image_transpose': False, - 'input_norm_func': "", - 'input_divisor': 128, - 'input_offset': -1, - 'input_shift': 0, - 'log_level': 'INFO', - 'graph_file': "", - 'tensor_file': "", - 'template_file': "" - } - - self.settings.update(DEFAULT_GEN_OPTS) - - self.execute_load_settings() - - if args and args.log_level is not None: - self.settings['log_level'] = args.log_level.upper() - - self._graph_idx = 0 - self._tensor_store = {} - self.py_locals['tensors'] = self._tensor_store - - self.astats_collector = None - - # settings overide graph file - graph_file = self.settings['graph_file'] - tensor_file = self.settings['tensor_file'] - - # command line overides that - if args: - if args.graph_file: - graph_file = args.graph_file - - if args.tensor_file: - tensor_file = args.tensor_file - - if args.template_file: - self.settings['template_file'] = args.template_file - - if args.tf_quant: - self.settings['load_quantization'] = args.tf_quant - - if 'log_level' not in self.settings: - self.settings['log_level'] = "INFO" - - if graph_file: - self._graphs = [] - self._startup_commands.append(self.__build_open_graph( - graph_file, - tensor_file, - self.load_quantization - )) - else: - self._graphs = [ - NO_GRAPH.copy() - ] - - ExecutionProgress().listen(partial(NNToolShell.progress, self)) - LOG.propagate = False - handler = NNToolShellLogHandler(self) - formatter = logging.Formatter('%(module)s - %(message)s') - handler.setFormatter(formatter) - LOG.addHandler(handler) - LOG.setLevel(self.settings['log_level']) - - @property - def tensor_store_names(self): - return self._tensor_store.keys() - - def progress(self, step_idx, name, is_cached=False): - del is_cached - if not step_idx: - return - print("{}\r{} {}\r".format(" " * 70, step_idx, name), end="") - - def __getattr__(self, k): - if k in DEFAULT_GEN_OPTS: - return self.settings[k] - - def __setattr__(self, k, val): - if k in DEFAULT_GEN_OPTS: - self.settings[k] = val - super(NNToolShell, self).__setattr__(k, val) - - def run_script(self, script_path): - expanded_path = os.path.abspath(os.path.expanduser(script_path)) - - # Make sure the path exists and we can access it - if not os.path.exists(expanded_path): - self.perror("'{}' does not exist or cannot be accessed".format(expanded_path)) - return - - # Make sure expanded_path points to a file - if not os.path.isfile(expanded_path): - self.perror("'{}' is not a file".format(expanded_path)) - return - - # An empty file is not an error, so just return - if os.path.getsize(expanded_path) == 0: - return - - try: - # Read all lines of the script - with open(expanded_path, encoding='utf-8') as target: - script_commands = target.read().splitlines() - except OSError as ex: # pragma: no cover - self.pexcept("Problem accessing script from '{}': {}".format(expanded_path, ex)) - return - - 
orig_script_dir_count = len(self._script_dir) - - try: - self.runcmds_plus_hooks(self._startup_commands) - self._startup_commands.clear() - self._script_dir.append(os.path.dirname(expanded_path)) - return self.runcmds_plus_hooks(script_commands) - - finally: - with self.sigint_protection: - # Check if a script dir was added before an exception occurred - if orig_script_dir_count != len(self._script_dir): - self._script_dir.pop() - - # HELPERS / Properties - - @property - def G(self): - return self._graphs[self._graph_idx]['G'] - - @G.setter - def G(self, val): - self._graphs[self._graph_idx]['G'] = val - - @property - def graph_name(self): - if self._graph_idx is None: - return "" - return self._graphs[self._graph_idx]['G'].name - - @graph_name.setter - def graph_name(self, val): - if self._graph_idx is None: - return - self._graphs[self._graph_idx]['G'].name = val - - @property - def is_quantized(self): - return self._graphs[self._graph_idx]['G'].quantization is not None - - @property - def is_adjusted(self): - return self._graphs[self._graph_idx]['G'].graph_identity.is_adjusted - - @property - def is_equalized(self): - return self._graphs[self._graph_idx]['G'].graph_identity.is_equalized - - @property - def graph_file(self): - return self._graphs[self._graph_idx]['graph_file'] - - @graph_file.setter - def graph_file(self, val): - self._graphs[self._graph_idx]['graph_file'] = val - - @property - def tensor_file(self): - return self._graphs[self._graph_idx]['tensor_file'] - - @tensor_file.setter - def tensor_file(self, val): - self._graphs[self._graph_idx]['tensor_file'] = val - - def inputs_and_outputs(self): - if self.G is None: - return [] - return [node.name for node in chain(self.G.inputs_and_constants(), self.G.outputs())] - - def other_open_graphs(self, only_open=False): - items = [] - for graph_idx, graph in enumerate(self._graphs): - if graph_idx == self._graph_idx: - continue - if graph['G'] is None: - if only_open: - continue - name = "No Graph" - else: - name = graph['G'].name - items.append(CompletionItem(graph_idx, name)) - return items - - def _check_graph(self): - if self.G is None: - self.perror(CHECK_GRAPH_ERROR) - raise EmptyStatement() - - def _check_quantized(self): - if not self.is_quantized: - self.perror(CHECK_QUANTIZED_ERROR) - raise EmptyStatement() - - def _update_prompt(self): - self.prompt = "(NNT {} {}) ".format(os.path.basename(self.graph_file), - self._graph_idx) - - def _check_adjusted(self): - if not self.is_adjusted: - self.perror(CHECK_ADJUSTED_ERROR) - raise EmptyStatement() - - def _get_input_args(self, args): - res = {} - if self.settings['adjust_image']: - res['width'] = self.settings['image_width'] - res['height'] = self.settings['image_height'] - res['mode'] = self.settings['image_mode'] - else: - res['width'] = res['height'] = -1 - res['mode'] = None - if args: - if args.width is not None: - res['width'] = args.width - if args.height is not None: - res['height'] = args.height - if args.mode is not None: - res['mode'] = args.mode - -# res['shift'] = self.settings['input_shift'] if args.bit_shift -# is None else args.bit_shift - res['divisor'] = self.settings['input_divisor']\ - if args.divisor is None else args.divisor - res['offset'] = self.settings['input_offset']\ - if args.offset is None else args.offset - res['transpose'] = self.settings['image_transpose'] if args.transpose is None\ - else args.transpose - res['norm_func'] = self.settings['input_norm_func'] if args.norm_func is None\ - else args.norm_func - else: - # res['shift'] = 
self.settings['input_shift'] - res['divisor'] = self.settings['input_divisor'] - res['offset'] = self.settings['input_offset'] - res['transpose'] = self.settings['image_transpose'] - res['norm_func'] = self.settings['input_norm_func'] - - if args.nptype: - res['nptype'] = args.nptype - - return res - - # STATUS COMMAND - def do_status(self, _): - """ -Show current graph status -""" - if self.G is None: - self.poutput("No graph loaded") - return - self.poutput("graph file: {}".format(self.graph_file)) - if self.tensor_file: - self.poutput("tensor file: {}".format(self.tensor_file)) - self.poutput("is_quantized: {}".format(self.is_quantized)) - for k in ['is_adjusted', 'is_equalized', 'is_fused']: - self.poutput("{}: {}".format(k, getattr(self.G.graph_identity, k))) - - # SAVING AND LOADING SETTINGS - def execute_save_settings(self, dirname=None): - config = configparser.ConfigParser() - config['NNTOOL'] = {k: str(getattr(self, k)) - for k in self.settable if k != "prompt"} - config['GRAPH'] = {k: getattr(self, k) for k in ['graph_file', 'tensor_file']} - if dirname is None: - dirname = self._nntool_workdir - with open(os.path.join(dirname, 'nntool.ini'), 'w') as config_fp: - config.write(config_fp) - - def execute_load_settings(self, filepath=None): - nntool_home = os.path.join(self._nntool_workdir, 'nntool.ini') - if filepath is None: - if os.path.isfile('nntool.ini'): - filepath = 'nntool.ini' - elif os.path.isfile(nntool_home): - filepath = nntool_home - else: - return - config = configparser.ConfigParser() - config.read(filepath) - for key in config['NNTOOL']: - if key == "prompt": - continue - elif key in self.settings: - orig_val = self.settings[key] - self.settings[key] = cmd2_cast(orig_val, config['NNTOOL'][key]) - else: - orig_val = getattr(self, key) - setattr(self, key, cmd2_cast(orig_val, config['NNTOOL'][key])) - - self.settings["graph_file"] = config['GRAPH']['graph_file'] - self.settings["tensor_file"] = config['GRAPH']['tensor_file'] - - # CACHE PROPERTIES - def update_cache(self): - if self.G is None: - return - if self.settings['enable_cache']: - self.G.value_cache = IntermediateCache(self.settings['cache_dir']) - else: - self.G.value_cache = None - - @property - def enable_cache(self): - return self.settings['enable_cache'] - - @enable_cache.setter - def enable_cache(self, val): - self.settings['enable_cache'] = val - self.update_cache() - - # CACHE_DIR COMMAND - cache_dir_parser = Cmd2ArgumentParser(description="set value cache directory") - cache_dir_parser.add_argument('value', nargs=argparse.OPTIONAL, - help='the directory for the cache dir', - completer_method=partial(Cmd.path_complete, - path_filter=filter_dirs)) - - @property - def cache_dir(self): - return self.settings['cache_dir'] - - @cache_dir.setter - def cache_dir(self, val): - if self.settings['cache_dir'] != val: - self.settings['cache_dir'] = val - self.update_cache() - - @with_argparser(cache_dir_parser) - def do_cache_dir(self, args: argparse.Namespace): - """ -Sets the directory for the value cache -""" - self.cache_dir = args.value - - # SAVE_SETTINGS COMMAND - save_settings_parser = Cmd2ArgumentParser(description="set value cache directory") - save_settings_parser.add_argument('value', nargs=argparse.OPTIONAL, - help='the directory for the settings file', - completer_method=partial(Cmd.path_complete, - path_filter=filter_dirs)) - - @with_argparser(save_settings_parser) - def do_save_settings(self, args: argparse.Namespace): - """ -Save the current settings. 
If no directory is given then they are saved -in the ~/.nntool directory""" - self.execute_save_settings(args.value) - - # LOG_LEVEL PROPERTY - - @property - def log_level(self): - return self.settings['log_level'] - - @log_level.setter - def log_level(self, val): - try: - val = int(val) - self.settings['log_level'] = val - except ValueError: - val = val.upper() - val = find_choice(VALID_LOG_LEVELS, val) - - self.settings['log_level'] = val - LOG.setLevel(self.settings['log_level']) - LOG.info("set log level to %s", val) - - # load_quantization PROPERTY - - @property - def load_quantization(self): - return self.settings['load_quantization'] - - @load_quantization.setter - def load_quantization(self, val): - self.settings['load_quantization'] = bool(val) - - # FUSIONS PROPERTY - - @property - def fusions(self): - return self.settings['fusions'] - - @fusions.setter - def fusions(self, val): - self.settings['fusions'] = bool(val) - - # ADJUST_ORDER PROPERTY - - @property - def adjust_order(self): - return self.settings['adjust_order'] - - @adjust_order.setter - def adjust_order(self, val): - self.settings['adjust_order'] = bool(val) - - # WEIGHT_EQUALIZATION PROPERTY - - @property - def weight_equalization(self): - return self.settings['weight_equalization'] - - @weight_equalization.setter - def weight_equalization(self, val): - self.settings['weight_equalization'] = bool(val) - - # EQUALIZATION_THRESHOLD PROPERTY - - @property - def equalization_threshold(self): - return self.settings['equalization_threshold'] - - @equalization_threshold.setter - def equalization_threshold(self, val): - self.settings['equalization_threshold'] = float(val) - - # IMAGE ADJUSTMENT SETTINGS - - # ADJUST_IMAGE PROPERTY - - @property - def adjust_image(self): - return self.settings['adjust_image'] - - @adjust_image.setter - def adjust_image(self, val): - self.settings['adjust_image'] = bool(val) - - # IMAGE_WIDTH PROPERTY - - @property - def image_width(self): - return self.settings['image_width'] - - @image_width.setter - def image_width(self, val): - try: - val = int(val) - if val <= 0: - raise ValueError() - except ValueError: - raise ValueError("value should be positive integer") - self.settings['image_width'] = bool(val) - - # IMAGE_HEIGHT PROPERTY - - @property - def image_height(self): - return self.settings['image_height'] - - @image_height.setter - def image_height(self, val): - try: - val = int(val) - if val <= 0: - raise ValueError() - except ValueError: - raise ValueError("value should be positive integer") - self.settings['image_height'] = bool(val) - - # IMAGE_MODE PROPERTY - - @property - def image_mode(self): - return self.settings['image_mode'] - - @image_mode.setter - def image_mode(self, val): - val = find_choice(MODES.keys(), val) - self.settings['image_mode'] = bool(val) - - # INPUT_DIVISOR PROPERTY - - @property - def input_divisor(self): - return self.settings['input_divisor'] - - @input_divisor.setter - def input_divisor(self, val): - self.settings['input_divisor'] = int(val) - - # INPUT_NORM_FUNC PROPERTY - - @property - def input_norm_func(self): - return self.settings['input_norm_func'] - - @input_norm_func.setter - def input_norm_func(self, val): - self.settings['input_norm_func'] = str(val) - - # INPUT_OFFSET PROPERTY - - @property - def input_offset(self): - return self.settings['input_offset'] - - @input_offset.setter - def input_offset(self, val): - self.settings['input_offset'] = int(val) - - @property - def template_file(self): - return self.settings['template_file'] - - 
@template_file.setter - def template_file(self, val): - self.settings['template_file'] = val - - # OPEN COMMAND - parser_open = Cmd2ArgumentParser("open a graph file") - parser_open.add_argument('nnfile', - completer_method=Cmd.path_complete, - help='graph or state file', - metavar="INPUT_GRAPH or STATE_FILE") - parser_open.add_argument('tensor_file', - nargs=argparse.OPTIONAL, - completer_method=Cmd.path_complete, - help='optional tensor file') - parser_open.add_argument('-q', '--load_quantization', - help='load TFLite quantization information', action='store_true') - parser_open.add_argument('-n', '--new', - help='open as new graph - keep existing graph open', - action='store_true') - - @staticmethod - def __build_open_graph(graph_file, tensor_file, load_quantization): - command = ["open", graph_file, "-n"] - if tensor_file: - command.append("-t {}".format(tensor_file)) - if load_quantization: - command.append("-q") - return " ".join(command) - - def __open_graph(self, graph_file, tensor_file, load_quantization): - - value_cache = IntermediateCache(self.settings['cache_dir'])\ - if self.settings['enable_cache'] else None - - graph_file = os.path.expanduser(graph_file) - - _, ext = os.path.splitext(graph_file) - - if ext == STATE_EXTENSION: - LOG.info("opening state file %s", graph_file) - self.graph_file = graph_file - self.G, extra = load_state(graph_file, value_cache=value_cache, return_extra=True) - self.settings.update(extra) - else: - LOG.info("opening graph file %s", graph_file) - opts = { - 'load_tensors': True, - 'load_quantization': load_quantization, - 'value_cache': value_cache, - } - - G = create_graph(graph_file, opts=opts) - G.add_dimensions() - if tensor_file: - G.load_tensors(tensor_file) - self.G = G - self.graph_file = graph_file - if tensor_file is not None: - self.tensor_file = tensor_file - self.settings['load_quantization'] = bool(load_quantization) - if self.settings['adjust_order']: - LOG.info("adjusting order") - self.execute_adjust_order() - if self.settings['fusions']: - LOG.info("applying standard fusions") - self.apply_standard_fusions() - if self.settings['weight_equalization']: - LOG.info("equalizing weights") - weight_equalization(self.G, self.settings['equalization_threshold']) - - @with_argparser(parser_open) - def do_open(self, args: argparse.Namespace): - """ -Open a graph or state file""" - if args.new: - # reset the current graph - self._graphs.append(NO_GRAPH.copy()) - self._graph_idx = len(self._graphs) - 1 - else: - # reset the current graph - self._graphs[self._graph_idx] = NO_GRAPH.copy() - self.__open_graph(args.nnfile, args.tensor_file, args.load_quantization) - self._update_prompt() - self.py_locals['G'] = self.G - - # GRAPH COMMAND - parser_graph = Cmd2ArgumentParser("display graph") - parser_graph.add_argument('graph_number', - nargs=argparse.OPTIONAL, - type=int, - choices_method=other_open_graphs, - help='graph to select or nothing to show open graphs') - - @with_argparser(parser_graph) - def do_graph(self, args: argparse.Namespace): - """ -Select actuve graphs""" - if args.graph_number is not None: - if args.graph_number < 0 or args.graph_number >= len(self._graphs): - self.perror("graph number is invalid") - return - self._graph_idx = args.graph_number - self.pfeedback("selected graph {}".format(self._graph_idx)) - self._update_prompt() - self.py_locals['G'] = self.G - else: - for idx, rec in enumerate(self._graphs): - self.poutput("{:d} - {}".format(idx, rec['graph_file'])) - - # SHOW COMMAND - parser_show = Cmd2ArgumentParser("display 
graph") - table_options(parser_show, default_width=180) - parser_show.add_argument('step', type=int, nargs=(0, 1), help='Limit to step number') - - @with_argparser(parser_show) - def do_show(self, args: argparse.Namespace): - """ -Display the structure of the graph""" - self._check_graph() - fmt = ('tab' if args.output is None else args.output['fmt']) - split_dims = fmt == "xls" - do_totals = fmt != "csv" - tab = GraphReporter(split_dims=split_dims, do_totals=do_totals, - step=args.step).report(self.G, None) - output_table(tab, args) - - # STATS COMMAND - parser_stats = Cmd2ArgumentParser("display statistics on globals") - parser_stats.add_argument('-d', '--detailed', - action="store_true", help='Dump detailed statistics') - parser_stats.add_argument('-q', '--qsnr', - type=float, default=30.0, help='QSNR threshold') - parser_stats.add_argument('-s', '--step', - type=int, - nargs=(1, 2), - help='display information by channel for step') - table_options(parser_stats, default_width=180) - - @with_argparser(parser_stats) - def do_stats(self, args: argparse.Namespace): - """ -Display statistics on weights and biases""" - self._check_graph() - fmt = ('tab' if args.output is None else args.output['fmt']) - if args.detailed: - stats_collector = FilterDetailedStatsCollector() - stats = stats_collector.collect_stats(self.G) - tab = FilterDetailedStatsReporter().report(self.G, stats) - else: - step_idx = args.step - if step_idx is not None: - if len(step_idx) == 1: - step_idx = step_idx[0] - else: - step_idx = tuple(step_idx) - stats_collector = FilterStatsCollector() - stats = stats_collector.collect_stats(self.G, step_idx=step_idx) - tab = FilterStatsReporter(do_totals=(fmt != "csv"), threshold=args.qsnr, step_idx=step_idx)\ - .report(self.G, stats) - output_table(tab, args) - - # FREEZE COMMAND - parser_freeze = Cmd2ArgumentParser("toggle freezing of channel order of inputs or outputs") - parser_freeze.add_argument('node_names', - nargs='+', - choices_method=inputs_and_outputs, - help='input or output node names to toggle freeze') - - @with_argparser(parser_freeze) - def do_freeze(self, args: argparse.Namespace): - """ -Toggle freezing of channel order on inputs and outputs. 
When graph is adjusted frozen nodes - will not change channel order.""" - self._check_graph() - nodes = [self.G.node(node_name) for node_name in args.node_names] - if not all([isinstance(node, InputOutputParameters) for node in nodes]): - self.perror("all nodes should be inputs or outputs") - return - - for node in nodes: - if node.fixed_order: - LOG.info("node %s is unfrozen", node.name) - node.fixed_order = False - else: - LOG.info("node %s is frozen", node.name) - node.fixed_order = True - self.G.node_options[NodeId(node)] = node.at_options - - # FUSIONS COMMAND - def fusions_list(self): - return [elem[0] for elem in get_fusions()] - - parser_fusions = Cmd2ArgumentParser("apply fusions to graph") - parser_fustions_exclusive = parser_fusions.add_mutually_exclusive_group() - parser_fustions_exclusive.add_argument('-l', '--list', - action='store_true', - help='list available fusions') - parser_fustions_exclusive.add_argument('-a', '--apply', - type=str, - choices_method=fusions_list, - help='apply a fusion') - - def apply_standard_fusions(self): - get_std_match_group().match(self.G) - self.G.add_dimensions() - - @with_argparser(parser_fusions) - def do_fusions(self, args): - """ -Carry out the default set of fusions on the graph""" - self._check_graph() - if args.list: - self.ppaged("\n".join(["%s - %s" % (name, desc) for name, desc in get_fusions()])) - return - if args.apply: - fusion = get_fusion(args.apply) - if not fusion: - self.perror('fusion %s not found' % args.apply) - return - else: - fusion = get_std_match_group() - - fusion.match(self.G) - self.G.add_dimensions() - self.G.quantization = None - - # ADJUST COMMAND - # parser_adjust = Cmd2ArgumentParser("display statistics on globals") - - def execute_adjust_order(self): - self.G.adjust_order() - self.G.add_dimensions() - - # @with_argparser(parser_adjust) - def do_adjust(self, _): - """ -Adjust activation and parameter tensors to match AutoTiler order. -Must be run before generating code.""" - self._check_graph() - if self.is_adjusted: - self.perror("graph is already adjusted") - return - self.execute_adjust_order() - - # WEIGHT_EQUALIZATION COMMAND - parser_we = Cmd2ArgumentParser() - parser_we.add_argument('threshold', - type=float, default=0.1, - help='convergence threshold') - parser_we.add_argument('-n', '--relun', - action='store_true', help='process relun activations. not currently supported \ - by autotiler kernels') - - def execute_weight_equalization(self, threshold, do_relun=False): - if not (threshold > 0 and threshold < 10): - self.perror("threshold should be 10 > x > 0") - weight_equalization(self.G, threshold=threshold, do_relun=do_relun) - - @with_argparser(parser_we) - def do_weight_equalization(self, args: argparse.Namespace): - """ -Run weight equalization on graph. This reduces variance between weight -channels and may improve quantization accuracy.""" - self._check_graph() - self.execute_weight_equalization(args.threshold, args.relun) - - # BALANCE_FILTERS COMMAND - parser_bf = Cmd2ArgumentParser() - parser_bf.add_argument('-s', '--step', - type=int, help='step to balance. should be a convolution') - parser_bf.add_argument('-t', '--threshold', - default=0.20, - type=float, help='precision threshold of weights below which a layer should be balanced') - - @with_argparser(parser_bf) - def do_balance_filters(self, args: argparse.Namespace): - """ -Balance filter weights. 
THis will reduce variance in weights and will result in -a more balanced quantization at the expense of a multiplicative bias calculation.""" - self._check_graph() - self.G.balance_filters(step_idx=args.step, precision_threshold=args.threshold) - self.G.quantization = None - - # ASTATS COMMAND - parser_astats = Cmd2ArgumentParser() - parser_astats.add_argument('-q', '--qsnr', - type=float, default=30.0, help='QSNR threshold') - parser_astats.add_argument('-d', '--detail', - action="store_true", help='Show fusions detail') - parser_astats.add_argument('-s', - '--step', - type=int, - nargs=(1, 2), - help='display information by channel for step. You can indicate a fusion step with two values. The step_idx and the idx of the node in the fusion.') - table_options(parser_astats, default_width=180) - input_options(parser_astats) - - @with_argparser(parser_astats) - def do_astats(self, args: argparse.Namespace): - """ -Calculate activation statistics on one or more imput files.""" - self._check_graph() - input_args = self._get_input_args(args) - stats_collector = ActivationStatsCollector() - step_idx = args.step - if step_idx is not None: - if len(step_idx) == 1: - step_idx = step_idx[0] - else: - step_idx = tuple(step_idx) - if len(args.input_files) == 0: - self.perror("You must enter some files to process") - return - for input_file in glob_input_files(args.input_files): - LOG.info("input file %s", input_file) - data = import_data(input_file, **input_args) - data = stats_collector.collect_stats(self.G, [data], step_idx=step_idx) - - fmt = ('tab' if args.output is None else args.output['fmt']) - tab = ActivationReporter(do_totals=(fmt != "csv"), - threshold=args.qsnr, - yield_fusions=args.detail or isinstance(step_idx, tuple)).report(self.G, - stats_collector.reduce_stats()) - output_table(tab, args) - - # FQUANT COMMAND - parser_fquant = Cmd2ArgumentParser() - parser_fquant.add_argument('-f', '--force_width', - choices=STATS_BITS, default=8, type=int, help='force all layers to this width') - table_options(parser_fquant, default_width=140) - - @with_argparser(parser_fquant) - def do_fquant(self, args: argparse.Namespace): - """ -Attempt to calculate a fake quantization for graph using random tensors and parameters. 
-This is intended to allow code generation for performance testing even if no real -weights and input data are avalaible.""" - self._check_graph() - self.G.constant_store.fake = True - stats_collector = ActivationStatsCollector() - input_tensors = [np.random.normal(0, 0.2, input.dims.shape) - for input in self.G.input_nodes()] - stats_collector.collect_stats(self.G, input_tensors) - astats = stats_collector.reduce_stats() - stats_collector = FakeFilterStatsCollector() - fstats = stats_collector.collect_stats(self.G) - quantizer = SimpleQuantizer(astats, fstats, - force_width=args.force_width) - qrecs = quantizer.quantize(self.G) - self.G.quantization = qrecs - tab = QuantizationReporter().report(self.G, qrecs) - output_table(tab, args) - self.G.constant_store.fake = False - - # AQUANT COMMAND - parser_aquant = Cmd2ArgumentParser() - parser_aquant_group = parser_aquant.add_mutually_exclusive_group(required=True) - parser_aquant_group.add_argument('-q', '--qsnr', - type=float, default=50.0, help='QSNR threshold') - parser_aquant_group.add_argument('-f', '--force_width', - choices=STATS_BITS, type=int, help='force all layers to this width') - parser_aquant.add_argument('-a', '--adjust_relun', - action='store_true', help='Adjust relu N activations to match dynamic in test data.') - parser_aquant.add_argument('-i', '--init', - action='store_true', help='Initialize activations statistics') - parser_aquant.add_argument('-r', '--relun_threshold', - type=int, default=1, help='Threshold above floored max value to adjust relun\'s to.') - table_options(parser_aquant, default_width=140) - input_options(parser_aquant) - - @with_argparser(parser_aquant) - def do_aquant(self, args: argparse.Namespace): - """ -Attempt to calculate quantization for graph using one or more sample imput files.""" - self._check_graph() - input_args = self._get_input_args(args) - processed_input = False - if self.astats_collector is None or args.init: - self.astats_collector = ActivationStatsCollector() - - for input_file in glob_input_files(args.input_files): - LOG.info("input file %s", input_file) - processed_input = True - data = import_data(input_file, **input_args) - self.astats_collector.collect_stats(self.G, [data]) - if not processed_input: - self.perror("No imput files found") - return - astats = self.astats_collector.reduce_stats() - if args.adjust_relun: - adjust_relun(self.G, astats, threshold=args.relun_threshold) - stats_collector = FilterStatsCollector() - fstats = stats_collector.collect_stats(self.G) - quantizer = SimpleQuantizer(astats, fstats, - force_width=args.force_width, - min_qsnr=args.qsnr) - qrecs = quantizer.quantize(self.G) - self.G.quantization = qrecs - tab = QuantizationReporter().report(self.G, qrecs) - output_table(tab, args) - - # DUMP COMMAND - parser_dump = Cmd2ArgumentParser() - parser_dump.add_argument('-s', '--step', - type=int, help='step to dump output of', default=None) - parser_dump.add_argument('-w', '--number_width', - type=int, help='width of numbers', default=8) - parser_dump.add_argument('-p', '--precision', - type=int, help='number of decimal places', default=4) - parser_dump.add_argument('-c', '--channel', - type=int, help='channel to dump', default=None) - parser_dump.add_argument('-d', '--dequantize', - action='store_true', help='dequantize result') - parser_dump_group = parser_dump.add_mutually_exclusive_group(required=False) - parser_dump_group.add_argument('-q', '--quantize', action='store_true', - help='quantize the graph (must have already set quantization)') - 
parser_dump_group.add_argument('-Q', '--quantize_step', type=int, - help='quantize a step of the graph (must have already' + - ' set quantization)', - default=None) - parser_dump.add_argument('-P', '--pickle', - completer_method=Cmd.path_complete, - help='pickle all the outputed tensors to this file') - parser_dump.add_argument('-S', '--save', - help='save the tensor to the tensors list') - input_options(parser_dump) - - @with_argparser(parser_dump) - def do_dump(self, args: argparse.Namespace): - """ -Dump the activations resulting from running an input file through the graph. -You can use the current quantization settings and can also just quantify one -specific step of the graph.""" - self._check_graph() - if args.quantize or args.quantize_step: - self._check_quantized() - if args.quantize: - qmode = QuantizationMode.all() - else: - qmode = QuantizationMode.step(args.quantize_step) - else: - qmode = QuantizationMode.none() - if args.step is not None: - step = args.step - num_steps = len(self.G.graph_state.steps) - if step < 0: - step = num_steps + step - if step < 0 or step > num_steps: - self.perror("step must be from {} to {}".format(-num_steps, num_steps)) - return - else: - step = None - - input_args = self._get_input_args(args) - - pickles = [] - dequantize = args.dequantize if args.dequantize is not None\ - else not (args.pickle or args.save) - - for input_file in glob_input_files(args.input_files): - LOG.info("input file %s", input_file) - - data = import_data(input_file, **input_args) - outputs = execute(self.G, [data], limit=step, qrecs=self.G.quantization, - qmode=qmode, - dequantize=dequantize) - - if args.pickle or self._in_py or args.save: - pickles.append(format_dump_file(self.G, outputs, not qmode.is_none)) - else: - self.G.print_intermediates(outputs, limit=step, width=args.number_width, - precision=args.precision, channel=args.channel, - order=['c', 'h', 'w']) - - if args.pickle or args.save or self._in_py: - if not pickles: - self.perror("no input files found") - return - if len(args.input_files) == 1: - pickles = pickles[0] - if args.pickle: - with open(args.pickle, 'wb') as pickle_fp: - pickle.dump(pickles, pickle_fp) - if args.save: - self._tensor_store[args.save] = pickles - - if self._in_py: - self.last_result = pickles - - # VAL COMMAND - parser_val = Cmd2ArgumentParser() - parser_val.add_argument('-q', '--quantize', action='store_true', - help='quantize the graph (must have already set quantization)') - parser_val.add_argument('-s', '--silent', action='store_true', - help='do not print progress for each input') - parser_val.add_argument('-P', '--pickle', - completer_method=Cmd.path_complete, - help='pickle all the outputed tensors to this file') - parser_val.add_argument('-S', '--save', - help='save the tensor to the tensors list') - parser_val.add_argument('--dataset_dir', - completer_method=Cmd.path_complete, - help='path to the directory of samples for test') - parser_val.add_argument('--label_json', - default=None, - completer_method=Cmd.path_complete, - help='path to the .json object containing labels annotation \ - { "filename0" : label0, "filename1": label1, ... 
}') - #parser_val.add_argument('--num_classes', - # default=None, - # type=int, - # help='number of classes of the dataset') - #parser_val.add_argument('-E', '--emulation_mode', - # action='store_true', - # help='do the validation with the GAP emulator running on host') - #parser_val.add_argument('--AT_model_file', - # completer_method=Cmd.path_complete, - # help='path to the AT model to compile and run if emulation mode is on') - #parser_val.add_argument('--extra_flags', - # default='', - # help='extra flag for AT_model compiler') - #parser_val.add_argument('--AT_exe', - # default='Gentile', - # help='name of the autotiler executable file generated after model compile') - #parser_val.add_argument('--gen_files_dir', - # completer_method=Cmd.path_complete, - # default='./', - # help='path to the directory to store AT generated files') - input_options(parser_val) - - @with_argparser(parser_val) - def do_validate(self, args: argparse.Namespace): - """ -Validate the model (quantized [-q] or not) in terms of prediction accuracy rate on a given dataset (images -folder). Ground truth labels can be embedded in files names ("filename_03.[png, ppm, pgm]", the number of -digits must be coherent with the number of networks outputs: e.g. in a 1000 classes problem the last digits -must be 3, "file_45.png" will raise an error) or can be written in a .json object (example: {'file0':label0, -'file1':label1, ...}) and given to the function with --label_json -""" - self._check_graph() - if args.quantize: - self._check_quantized() - qmode = QuantizationMode.all() - else: - qmode = QuantizationMode.none() - - LOG.info("quantization mode - %s", qmode) - input_args = self._get_input_args(args) - - # TODO - compile and run ATmodel (ok) - # - generate mainfile for validation or use AT generated functions in .so - # - compile and run mainfile for the validation - #if args.emulation_mode: - # assert args.AT_model_file is not None - # compile_and_run_AT_model(model=args.AT_model_file, output_exe=args.AT_exe, - # output_dir=args.gen_files_dir, extra_flags=args.extra_flags) - # test_dataset = test_dataset_4_eval(folder_path=args.dataset_dir, num_classes=args.num_classes) - # write_test_main_template(self.G, test_dataset, (238,208,3), args.gen_files_dir+'/validation_main.c') - # compile_and_run_test_main(model_name=self.G.name, mainfile=args.gen_files_dir+'/validation_main.c', - # output_exe=self.G.name+'_emul', - # output_dir=args.gen_files_dir, extra_flags=args.extra_flags) - # return - - good_predictions = [] - - input_dir_paths = [] - for path in args.input_files: - input_dir_paths.append(os.path.split(path)[0]) - - number_samples = 0 - for input_dir_path in input_dir_paths: - number_samples += len([name for name in os.listdir(input_dir_path) if os.path.isfile(input_dir_path + '/' + name)]) - - if args.label_json: - validation = ValidateFromJSON(args.label_json) - else: - validation = ValidateFromName() - - ExecutionProgress.start() - for i,input_file in enumerate(glob_input_files(args.input_files)): - if not args.silent: - LOG.info("input file %s", input_file) - - data = import_data(input_file, **input_args) - outputs = execute_validation(self.G, [data], qrecs=self.G.quantization, - qmode=qmode, validation=True, silent=args.silent) - - good_prediction, label = validation.validate(input_file, np.asarray(outputs[-1])) - good_predictions.append(good_prediction) - - if not args.silent: - LOG.info('Prediction is %s', good_prediction) - if not i % 100 and i > 0: - LOG.info('ACCURACY: %.3f %%', 
100*sum(good_predictions)/len(good_predictions)) - - ExecutionProgress.progress(i, number_samples) - ExecutionProgress.end() - - self.py_locals['labels'] = validation.labels - self.py_locals['predictions'] = validation.predictions - accuracy_rate = 100*sum(good_predictions)/len(good_predictions) - LOG.info('ACCURACY: %.3f %%', accuracy_rate) - - - # TENSORS_COMMAND - parser_tensors = Cmd2ArgumentParser() - parser_tensors.add_argument('-c', '--channel', - nargs=(1, 2), - type=int, - help='channel to compare') - parser_tensors.add_argument('-s', '--step', - type=int, - help='step to compare') - parser_outexclu = parser_tensors.add_mutually_exclusive_group() - parser_outexclu.add_argument('-Q', '--compare_qsnr', - action='store_true', - help='compare two tensors QSNR') - parser_outexclu.add_argument('-E', '--compare_error', - action='store_true', - help='compare two tensors error (first - second)') - parser_tensors.add_argument('-n', '--name', - type=str, - choices_method=lambda x: x.tensor_store_names, - help='name to use for the tensor in the tensor store') - parser_tensors.add_argument('-f', '--write_filename', - type=str, - completer_method=Cmd.path_complete, - help='write a tensor in gap helpers format. you must select a step. ' + - 'the output of this step is written. specify a single tensor with ' + - 'the -t option.') - parser_tensors.add_argument('-m', '--make_filename', - type=str, - completer_method=Cmd.path_complete, - help='write a makefile including the dimensions of the tensor written ' + - 'and the dimensions of the input to the node that produced it.') - parser_texclu1 = parser_tensors.add_mutually_exclusive_group() - parser_texclu1.add_argument('-W', '--weights', - action='store_true', - help='compare weights') - parser_texclu1.add_argument('-B', '--biases', - action='store_true', - help='compare biases') - parser_texclu2 = parser_tensors.add_mutually_exclusive_group() - parser_texclu2.add_argument('-t', '--tensors', - nargs=(1, 2), - type=str, - choices_method=lambda x: x.tensor_store_names, - help='compare two tensors') - parser_texclu2.add_argument('-g', '--gap_load', - completer_method=Cmd.path_complete, - help='load tensors dumped by autotiler code. ' + - 'Supply the filename and' + - ' an optional tensor store name. If none is given' + - ' the filename will be used.') - parser_texclu2.add_argument('-X', '--clear', - action='store_true', - help='clears the tensor store') - - @with_argparser(parser_tensors) - def do_tensors(self, args): - """ -Load and manipulate tensors. If no option is supplied the saved tensors will be listed. -All the tensors in the store are available in dictionary 'tensors' in the python console -accessed by the command 'py'. Tensors can be displayed side by side or the average absolute -error or QSNR displayed. 
If a step is selected then the error by channel will be displayed.""" - if args.clear: - self.pfeedback('tensor store cleared') - self._tensor_store.clear() - return - if args.gap_load: - store_name = args.gap_load if not args.name else args.name - self._tensor_store[store_name] = read_gap_tensors(args.gap_load) - return - if args.tensors: - if len(args.tensors) == 1: - tensor_name = args.tensors[0] - tensors = self._tensor_store.get(tensor_name) - if tensors is None: - self.perror("{} not in store".format(tensor_name)) - return - if args.step is None: - self.perror("you must select a step") - return - if args.step >= len(tensors): - self.perror("{} doesn't have that step".format(tensor_name)) - return - if tensors[args.step] is None: - self.perror("{} doesn't have this tensor for that step".format(tensor_name)) - return - tensor = tensors[args.step] - - if args.weights: - tensor = tensor[1] - elif args.biases: - tensors = tensor[2] - else: - tensors = tensor[0] - if args.write_filename: - if args.make_filename: - node = self.G.graph_state.steps[args.step]['node'] - in_edge = self.G.in_edges(node.name)[0] - in_step = in_edge.from_node.step_idx - all_tensors = self._tensor_store.get(tensor_name) - write_gap_tensor(args.write_filename, tensor, step=args.step, - output_tensor=all_tensors[in_step][0], make_file=args.make_filename) - else: - write_gap_tensor(args.write_filename, tensor, step=args.step) - else: - self.perror("not sure what to do with this single tensor") - return - - compare = args.tensors - tensors = [None]*2 - for i in range(2): - tensors[i] = self._tensor_store.get(compare[i]) - if tensors[i] is None: - self.perror("{} not in store".format(compare[i])) - return - if args.weights: - tensors[i] = [t[1] for t in tensors[i]] - elif args.biases: - tensors[i] = [t[2] for t in tensors[i]] - else: - tensors[i] = [t[0] for t in tensors[i]] - - if args.step is not None: - for i in range(2): - if args.step >= len(tensors[i]): - self.perror("{} doesn't have that step".format(compare[i])) - return - if tensors[i][args.step] is None: - self.perror( - "{} doesn't have this tensor for that step".format(compare[i])) - return - tensors[i] = [tensors[i][args.step]] - - if args.channel is not None: - for i in range(2): - for j, tensor in enumerate(tensors[i]): - if len(tensor.shape) <= len(args.channel): - self.perror("selected too many channels for this tensor") - for c in args.channel: - tensor = tensor[c] - tensors[i][j] = tensor - - if args.compare_qsnr or args.compare_error: - if args.compare_qsnr: - def func(x, y): - return qsnr(x.astype(np.float), y.astype(np.float)) - else: - def func(x, y): - return np.average(np.abs(x - y)) - if args.step is not None: - print("error for step %s" % args.step) - if args.channel is not None: - print("error for dimensions [%s]" % - (",".join([str(chan) for chan in args.channel]))) -#pylint: disable=unsubscriptable-object - out = [func(tensors[0][0][i], tensors[1][0][i]) - for i in range(len(tensors[0][0]))] - else: - out = [func(t1, t2) - for t1, t2 in zip(*tensors)] - for idx, val in enumerate(out): - if idx % 10 == 0: - print("\n{:03d} {:03d}: ".format(idx, idx+9), end='') - print('{:3.0f}{}'.format(val, "" if (idx + 1) % 10 == 0 else ", "), end='') - print() - else: - self.ppaged("\n".join(print_comparison(tensors))) - return - - for idx, k in enumerate(self._tensor_store): - print("{:3d}) {}".format(idx, k)) - - def nodeoption_choices_method(self, arg_tokens): - step_num = arg_tokens['step'][0] - if step_num == '*': - keys = [] - for step in 
self.G.graph_state.steps: - node = step['node'] - keys.extend(node.at_options.valid_options.keys()) - return keys - try: - step_num = int(step_num) - node = self.G.graph_state.steps[step_num]['node'] - return node.at_options.valid_options.keys() - except ValueError: - return [] - - # nodeoption COMMAND - parser_nodeoption = Cmd2ArgumentParser() - parser_nodeoption.add_argument('step', nargs=(0, 1), help='Set this step number') - parser_nodeoption.add_argument('parameter', nargs=( - 0, 1), choices_method=nodeoption_choices_method, help='Set this parameter') - parser_nodeoption.add_argument('value', nargs=(0, 1), help='Set the parameter to this value') - - @with_argparser(parser_nodeoption) - def do_nodeoption(self, args): - """ Allows setting of autotiler generator control parameters and other code generation -options such as the location of inputs and outputs. For a complete set of the parameters that -can be set refer to the autotiler documentation.""" - self._check_graph() - if args.step is None or (args.step == '*' and args.parameter is None): - for nodeid, elem in self.G.node_options.items(): - print("{}: {}".format(nodeid, elem)) - return - - if args.step == '*': - nodes = [step['node'] for step in self.G.graph_state.steps] - else: - try: - step = int(args.step) - nodes = [self.G.graph_state.steps[step]['node']] - except ValueError: - self.perror("that's not a valid step") - - if args.parameter is None: - node_options = self.G.node_options.get(NodeId(nodes[0])) - if node_options: - print(node_options) - else: - print("nothing set") - return - if args.value is None: - val = None - else: - val = int(args.value) - for node in nodes: - node_options = node.at_options - setattr(node_options, args.parameter, val) - self.G.node_options[NodeId(node)] = node_options - - # QSHOW COMMAND - parser_qshow = Cmd2ArgumentParser() - table_options(parser_qshow) - parser_qshow.add_argument('step', type=int, nargs=(0, 1), help='Limit to step number') - - @with_argparser(parser_qshow) - def do_qshow(self, args): - """ -Show current quantization settings.""" - self._check_graph() - self._check_quantized() - tab = QuantizationReporter(step=args.step).report(self.G, self.G.quantization) - output_table(tab, args) - - # EXTRACT COMMAND - parser_extract = Cmd2ArgumentParser() - parser_extract.add_argument('step', - type=int, - help='step number to extract') - - @with_argparser(parser_extract) - def do_extract(self, args): - """ -Extracts a single step out of a graph and forms a new graph with inputs and outputs to this step.""" - self._check_graph() - if args.step < 0 or args.step > len(self.G.graph_state.steps): - self.perror("step must be between 0 and {}".format(len(self.G.graph_state.steps))) - extract_node(self.G, self.G.graph_state.steps[args.step]['node']) - - # GEN COMMAND - parser_gen = Cmd2ArgumentParser() - parser_gen.add_argument('model_file', - completer_method=Cmd.path_complete, - nargs=argparse.OPTIONAL, - help='file to write to, otherwise output to terminal') - parser_gen.add_argument('-T', '--tensor_directory', - completer_method=Cmd.path_complete, - help='path to tensor directory. full path will be created' + - ' if it doesn\'t exist. If this parameter is given it will' + - 'update the settings saved with the graph state.') - parser_gen.add_argument('-M', '--model_directory', - completer_method=Cmd.path_complete, - help='path to model directory. full path will be created' + - ' if it doesn\'t exist. 
If this parameter is given it will' + - 'update the settings saved with the graph state.') - parser_gen.add_argument('-t', '--output_tensors', - action='store_true', - help='write constants (weights, biases)') - parser_gen.add_argument('-c', '--checksums', - completer_method=Cmd.path_complete, - help='generate checksum tests in code for the given file') - parser_gen.add_argument('--header_file', - action='store_true', - help='generate header file with quantization information for each layer') - - @with_argparser(parser_gen) - def do_gen(self, args): - """ -Generate AutoTiler model C code and optionally dump tensors. If no destination file is -given the generated code will be outputed to the screen. Check the 'set' command for -settings related to code generation.""" - self._check_graph() - self._check_quantized() - self._check_adjusted() - if args.checksums: - input_args = self._get_input_args(None) - LOG.info("input file %s", args.checksums) - data = import_data(args.checksums, **input_args) - execute(self.G, [data], qrecs=self.G.quantization, qmode=QuantizationMode.all()) - self.settings['checksum_file'] = args.checksums - self.settings['generate_checksums'] = True - - if args.tensor_directory: - self.settings['tensor_directory'] = args.tensor_directory - if args.model_directory: - self.settings['model_directory'] = args.model_directory - code_gen = CodeGenerator(self.G, DefaultNamingConvension(self.G), self.settings) - - if self.settings['template_file']: - code_template = dynamic_template(self.settings['template_file']) - else: - code_template = default_template - - if args.model_file: - with open(os.path.join(self.settings['model_directory'], - args.model_file), "w") as output_fp: - output_fp.write(code_template(self.G, code_generator=code_gen)) - else: - self.ppaged(code_template(self.G, code_generator=code_gen)) - if args.output_tensors: - code_gen.write_constants() - - if args.header_file: - if args.model_file: - with open(os.path.join(self.settings['model_directory'], os.path.splitext(args.model_file)[0]+'.h'), "w") as output_fp: - output_fp.write(header_template(self.G, code_generator=code_gen)) - else: - self.ppaged(header_template(self.G, code_generator=code_gen)) - - # SAVE_STATE COMMAND - parser_save_state = Cmd2ArgumentParser() - parser_save_state.add_argument('output', - completer_method=Cmd.path_complete, - nargs=argparse.OPTIONAL, - help='file to write to') - - @with_argparser(parser_save_state) - def do_save_state(self, args): - """ -Save the state of the transforms and quantization of the graph. -This state file can be used to generate the model file as part of -a build script. If no argument is given then the state files -will be saved in the same directory as the graph. If a directory is -given then the state files will be saved in it with the graph -basename. 
If a filename is given, its basename will be used to -save the state files.""" - self._check_graph() - self._check_quantized() - gen_opts = {k: self.settings[k] for k in DEFAULT_GEN_OPTS} - dump_state(self.G, state_path=args.output, extra=gen_opts) - - # BCORR COMMAND - parser_bcorr = Cmd2ArgumentParser() - input_options(parser_bcorr) - - @with_argparser(parser_bcorr) - def do_bcorr(self, args): - """ -Correct biases with average quantization error.""" - self._check_graph() - self._check_quantized() - stats_collector = StepErrorStatsCollector() - input_args = self._get_input_args(args) - cnt = 0 - for filename in glob_input_files(args.input_files): - cnt += 1 - data = import_data(filename, **input_args) - stats_collector.collect_stats(self.G, [data]) - - adjust_biases(self.G, stats_collector.reduce_stats()) - - # QERROR COMMAND - parser_qerror = Cmd2ArgumentParser() - parser_qerror.add_argument('-s', '--step', - action='store_true', - help='evaluate quantization per step. i.e.\ - individually quantize each layer') - parser_qerror.add_argument('-r', '--report_lowest', - type=int, help='QSNR threshold below which to report filename') - table_options(parser_qerror, default_width=140) - input_options(parser_qerror) - - @with_argparser(parser_qerror) - def do_qerror(self, args): - """ -Show quantization error introduced by processing one or more input files.""" - self._check_graph() - self._check_quantized() - fmt = ('tab' if args.output is None else args.output['fmt']) - input_args = self._get_input_args(args) - if args.step: - stats_collector = StepErrorStatsCollector() - else: - stats_collector = ErrorStatsCollector() - cnt = 0 - for filename in glob_input_files(args.input_files): - cnt += 1 - data = import_data(filename, **input_args) - stat = stats_collector.collect_stats(self.G, [data]) - if args.report_lowest is not None: - lowest = min((elem['qsnr'] for elem in stat.values())) - if lowest < args.report_lowest: - self.pfeedback("{} had QSNR below threshold".format(filename)) - - tab = ErrorReporter(do_totals=(fmt != "csv"), one_input=cnt <= 1, with_chan=args.step)\ - .report(self.G, stats_collector.reduce_stats()) - output_table(tab, args) - - # QTUNE COMMAND - parser_tune = Cmd2ArgumentParser() - parser_tune.add_argument('step', - type=int, help='step to tune') - parser_tune.add_argument('parameter', - choices=['acc', 'calc', 'weights', 'biases', 'dp', 'out'], - help='which parameter to tune') - parser_tune.add_argument('X', - nargs='?', - default=0, - type=int, help='X of QX.Y') - parser_tune.add_argument('Y', - nargs='?', - default=0, - type=int, help='Y of QX.Y') - parser_tune.add_argument('index', - nargs='?', - default=0, - type=int, help='edge index') - parser_tune.add_argument('-f', - '--sub_step_fusion', - type=int, - help='index of the subnode for qtune inside of a fused one') - - @with_argparser(parser_tune) - def do_qtune(self, args): - """ -Tune quantization of graph.""" - self._check_graph() - self._check_quantized() - - tuneq(self.G, self.G.quantization, args.step, - args.parameter, args.X, args.Y, index=args.index) - - # TEMPS COMMAND - parser_temps = Cmd2ArgumentParser() - table_options(parser_temps, default_width=140) - - @with_argparser(parser_temps) - def do_temps(self, args): - """ -Show statistics on activations.""" - self._check_graph() - fmt = ('tab' if args.output is None else args.output['fmt']) - stats_collector = TempsStatsCollector() - stats = stats_collector.collect_stats(self.G) - tab = TempsReporter(do_totals=(fmt != "csv")).report(self.G, stats) - 
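# --------------------------------------------------------------------------
# Illustration only (not part of this patch): the QSNR figures thresholded by the
# astats, aquant and qerror commands above are, under the conventional definition,
# the ratio of reference-signal energy to quantization-error energy expressed in dB.
# `qsnr_db` is a hypothetical stand-alone helper; nntool ships its own implementation.
import numpy as np

def qsnr_db(ref, quantized):
    ref = ref.astype(np.float64)
    err = ref - quantized.astype(np.float64)
    noise = np.sum(err ** 2)
    if noise == 0.0:
        return float('inf')                            # identical tensors, no quantization error
    return 10.0 * np.log10(np.sum(ref ** 2) / noise)

# An ideal 8-bit uniform quantizer yields on the order of 50 dB, which is consistent
# with the 30 dB (astats) and 50 dB (aquant) default thresholds above.
# --------------------------------------------------------------------------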
output_table(tab, args) diff --git a/tools/nntool/interpreter/nntool_shell_base.py b/tools/nntool/interpreter/nntool_shell_base.py new file mode 100644 index 000000000..583cd035f --- /dev/null +++ b/tools/nntool/interpreter/nntool_shell_base.py @@ -0,0 +1,256 @@ +import os +import logging +from itertools import chain +from cmd2 import Cmd, CompletionItem +from execution.execution_progress import ExecutionProgress +from .shell_utils import NNToolShellLogHandler +from .settings import NNToolShellSettings + +CHECK_GRAPH_ERROR = """ +A graph must be opened to use this command. Use the open command to open a graph. +""" + +CHECK_QUANTIZED_ERROR = """ +The opened graph must be quantized to use this command. Run the aquant command. +""" + +CHECK_ADJUSTED_ERROR = """ +The opened graph must be adjusted to use this command. Run the adjust command. +""" + +LOG = logging.getLogger("nntool") + +NO_GRAPH = { + 'G': None, + 'graph_file': "", + 'tensor_file': "" +} + + +def progress(step_idx, name): + if not step_idx: + return + print("{}\r{} {}\r".format(" " * 70, step_idx, name), end="") + return + + +class GraphNotReadyException(Exception): + pass + + +class NNToolShellBase(NNToolShellSettings, Cmd): + def __init__(self, args, nntool_workdir, *rest, **kwargs): + self._nntool_workdir = nntool_workdir + self._graph_idx = 0 + self._graphs = [] + self._settings = [] + self._tensor_store = {} + super(NNToolShellBase, self).__init__(*rest, **kwargs) + self.py_locals['tensors'] = self._tensor_store + + if args and args.log_level is not None: + self.settings['log_level'] = args.log_level.upper() + + self._graph_idx = 0 + + # settings overide graph file + graph_file = self.settings['graph_file'] + tensor_file = self.settings['tensor_file'] + + # command line overides that + if args: + if args.graph_file: + graph_file = args.graph_file + + if args.tensor_file: + tensor_file = args.tensor_file + + if args.template_file: + self.settings['template_file'] = args.template_file + + if args.tf_quant: + self.settings['load_quantization'] = args.tf_quant + + if args.dequant_tf: + self.settings['load_dequantized'] = args.dequant_tf + + if 'log_level' not in self.settings: + self.settings['log_level'] = "INFO" + + if graph_file: + self._graphs = [] + self._startup_commands.append( + self.__build_open_graph(graph_file, + tensor_file, + self.load_quantization, + load_dequantized=self.settings.get('load_dequantized')) + ) + else: + self._graphs = [ + NO_GRAPH.copy() + ] + + ExecutionProgress().listen(progress) + LOG.propagate = False + handler = NNToolShellLogHandler(self) + formatter = logging.Formatter('%(module)s - %(message)s') + handler.setFormatter(formatter) + LOG.addHandler(handler) + LOG.setLevel(self.settings['log_level']) + + def run_script(self, script_path): + expanded_path = os.path.abspath(os.path.expanduser(script_path)) + + # Make sure the path exists and we can access it + if not os.path.exists(expanded_path): + self.perror("'{}' does not exist or cannot be accessed".format(expanded_path)) + return + + # Make sure expanded_path points to a file + if not os.path.isfile(expanded_path): + self.perror("'{}' is not a file".format(expanded_path)) + return + + # An empty file is not an error, so just return + if os.path.getsize(expanded_path) == 0: + return + + try: + # Read all lines of the script + with open(expanded_path, encoding='utf-8') as target: + script_commands = target.read().splitlines() + except OSError as ex: # pragma: no cover + self.pexcept("Problem accessing script from '{}': 
{}".format(expanded_path, ex)) + return + + orig_script_dir_count = len(self._script_dir) + + try: + self.runcmds_plus_hooks(self._startup_commands) + self._startup_commands.clear() + self._script_dir.append(os.path.dirname(expanded_path)) + return self.runcmds_plus_hooks(script_commands) + + finally: + with self.sigint_protection: + # Check if a script dir was added before an exception occurred + if orig_script_dir_count != len(self._script_dir): + self._script_dir.pop() + + def other_open_graphs(self, only_open=False): + items = [] + for graph_idx, graph in enumerate(self._graphs): + if graph_idx == self._graph_idx: + continue + if graph['G'] is None: + if only_open: + continue + name = "No Graph" + else: + name = graph['G'].name + items.append(CompletionItem(graph_idx, name)) + return items + + def inputs_and_outputs(self): + if self.G is None: + return [] + return [node.name for node in chain(self.G.inputs_and_constants(), self.G.outputs())] + + def _check_adjusted(self): + if not self.is_adjusted: + raise GraphNotReadyException(CHECK_ADJUSTED_ERROR) + + def _check_graph(self): + if self.G is None: + raise GraphNotReadyException(CHECK_GRAPH_ERROR) + + def _check_quantized(self): + if not self.is_quantized: + raise GraphNotReadyException(CHECK_QUANTIZED_ERROR) + + @staticmethod + def __build_open_graph(graph_file, tensor_file, load_quantization, load_dequantized=False): + command = ["open", graph_file, "-n"] + if tensor_file: + command.append("-t {}".format(tensor_file)) + if load_quantization: + command.append("-q") + if load_dequantized: + command.append("-d") + return " ".join(command) + + def execute_adjust_order(self): + self.G.adjust_order() + self.G.add_dimensions() + + def _update_prompt(self): + self.prompt = "(NNT {} {}) ".format(os.path.basename(self.graph_file), + self._graph_idx) + + @property + def settings(self): + return self._settings + + @settings.setter + def settings(self, val): + self._settings = val + + @property + def G(self): + return self._graphs[self._graph_idx]['G'] + + @G.setter + def G(self, val): + self._graphs[self._graph_idx]['G'] = val + + @property + def graph_name(self): + if self._graph_idx is None: + return "" + return self._graphs[self._graph_idx]['G'].name + + @graph_name.setter + def graph_name(self, val): + if self._graph_idx is None: + return + self._graphs[self._graph_idx]['G'].name = val + + @property + def is_quantized(self): + return self._graphs[self._graph_idx]['G'].quantization is not None + + @property + def is_adjusted(self): + return self._graphs[self._graph_idx]['G'].graph_identity.is_adjusted + + @property + def is_equalized(self): + return self._graphs[self._graph_idx]['G'].graph_identity.is_equalized + + @property + def graph_file(self): + return self._graphs[self._graph_idx]['graph_file'] + + @graph_file.setter + def graph_file(self, val): + self._graphs[self._graph_idx]['graph_file'] = val + + @property + def tensor_file(self): + return self._graphs[self._graph_idx]['tensor_file'] + + @tensor_file.setter + def tensor_file(self, val): + self._graphs[self._graph_idx]['tensor_file'] = val + + @property + def tensor_store(self): + return self._tensor_store + + @tensor_store.setter + def tensor_store(self, val): + self._tensor_store = val + + @property + def tensor_store_names(self): + return self._tensor_store.keys() diff --git a/tools/nntool/interpreter/settings.py b/tools/nntool/interpreter/settings.py new file mode 100644 index 000000000..aaa6a4c4d --- /dev/null +++ b/tools/nntool/interpreter/settings.py @@ -0,0 +1,278 @@ + 
+import logging +from cmd2 import Cmd, Settable +from generation.autotiler_options import DEFAULT_GEN_OPTS, DEFAULT_GEN_OPTS_DESCRIPTIONS +from utils.data_importer import MODES +from .shell_utils import find_choice + +LOG = logging.getLogger("nntool") + +VALID_LOG_LEVELS = [ + "INFO", + "DEBUG", + "WARNING" +] + +DEFAULT_OPT_DESCRIPTIONS = { + 'log_level': {'type': str, 'descr': 'set logging level (one of {} or number)'.format(", ".join(VALID_LOG_LEVELS))}, + 'load_quantization': {'type': bool, 'descr': 'load TFLITE quantization information'}, + 'load_dequantized': {'type': bool, 'descr': 'load the dequantized constant values from tflite quantized graph'}, + 'fusions': {'type': bool, 'descr': 'run standard graph fusions on graph load'}, + 'adjust_order': {'type': bool, 'descr': 'adjust activation and parameter dimension order\ + to match autotiler on graph load'}, + 'weight_equalization': {'type': bool, 'descr': 'equalize weights on graph load'}, + 'equalization_threshold': {'type': float, 'descr': 'threshold for weight equalization convergence'}, + 'adjust_image': {'type': bool, 'descr': 'adjust image input size and channels'}, + 'image_width': {'type': int, 'descr': 'input image width'}, + 'image_height': {'type': int, 'descr': 'input image height'}, + 'image_mode': {'type': str, 'descr': 'input image mode (one of {})'.format(", ".join(MODES.keys()))}, + 'input_divisor': {'type': float, 'descr': 'divide input tensor values by this value'}, + 'input_offset': {'type': float, 'descr': 'add this value to input tensor values'}, + 'input_norm_func': {'type': str, 'descr': 'lambda function in the form x: fn(x) where x is any input'}, + 'graph_name': {'type': str, 'descr': 'name of the graph used for code generation'}, + 'template_file': {'type': str, 'descr': 'template file used for code generation'}, +} + + +class NNToolShellSettings(Cmd): + ''' + This class have all the settings and properties that can be set up from the NNToolShell + To see the Code Generation settings, please refer to generation/autotiler_options.py + ''' + def __init__(self, *args, **kwargs): + super(NNToolShellSettings, self).__init__(*args, **kwargs) + for k, v in DEFAULT_OPT_DESCRIPTIONS.items(): + self.add_settable(Settable(k, v['type'], v['descr'])) + for k, v in DEFAULT_GEN_OPTS_DESCRIPTIONS.items(): + self.add_settable(Settable(k, v['type'], v['descr'])) + self.settings = { + 'load_quantization': False, + 'fusions': False, + 'adjust_order': False, + 'weight_equalization': False, + 'equalization_threshold': 0.1, + 'adjust_image': False, + 'image_width': -1, + 'image_height': -1, + 'image_mode': "", + 'image_transpose': False, + 'input_norm_func': "", + 'input_divisor': 1, + 'input_offset': 0, + 'input_shift': 0, + 'log_level': 'INFO', + 'graph_file': "", + 'tensor_file': "", + 'template_file': "" + } + self.settings.update(DEFAULT_GEN_OPTS) + + # LOG_LEVEL PROPERTY + + @property + def log_level(self): + return self.settings['log_level'] + + @log_level.setter + def log_level(self, val): + try: + val = int(val) + self.settings['log_level'] = val + except ValueError: + val = val.upper() + val = find_choice(VALID_LOG_LEVELS, val) + + self.settings['log_level'] = val + LOG.setLevel(self.settings['log_level']) + LOG.info("set log level to %s", val) + + def __getattr__(self, k): + if k in DEFAULT_GEN_OPTS: + return self.settings[k] + + def __setattr__(self, k, val): + if k in DEFAULT_GEN_OPTS: + self.settings[k] = val + super(NNToolShellSettings, self).__setattr__(k, val) + + # load_quantization PROPERTY + + @property + def 
load_quantization(self): + return self.settings['load_quantization'] + + @load_quantization.setter + def load_quantization(self, val): + self.settings['load_quantization'] = bool(val) + + # FUSIONS PROPERTY + + @property + def fusions(self): + return self.settings['fusions'] + + @fusions.setter + def fusions(self, val): + self.settings['fusions'] = bool(val) + + # ADJUST_ORDER PROPERTY + + @property + def adjust_order(self): + return self.settings['adjust_order'] + + @adjust_order.setter + def adjust_order(self, val): + self.settings['adjust_order'] = bool(val) + + # WEIGHT_EQUALIZATION PROPERTY + + @property + def weight_equalization(self): + return self.settings['weight_equalization'] + + @weight_equalization.setter + def weight_equalization(self, val): + self.settings['weight_equalization'] = bool(val) + + # EQUALIZATION_THRESHOLD PROPERTY + + @property + def equalization_threshold(self): + return self.settings['equalization_threshold'] + + @equalization_threshold.setter + def equalization_threshold(self, val): + self.settings['equalization_threshold'] = float(val) + + # IMAGE ADJUSTMENT SETTINGS + + # ADJUST_IMAGE PROPERTY + + @property + def adjust_image(self): + return self.settings['adjust_image'] + + @adjust_image.setter + def adjust_image(self, val): + self.settings['adjust_image'] = bool(val) + + # IMAGE_WIDTH PROPERTY + + @property + def image_width(self): + return self.settings['image_width'] + + @image_width.setter + def image_width(self, val): + try: + val = int(val) + if val <= 0: + raise ValueError() + except ValueError: + raise ValueError("value should be positive integer") + self.settings['image_width'] = bool(val) + + # IMAGE_HEIGHT PROPERTY + + @property + def image_height(self): + return self.settings['image_height'] + + @image_height.setter + def image_height(self, val): + try: + val = int(val) + if val <= 0: + raise ValueError() + except ValueError: + raise ValueError("value should be positive integer") + self.settings['image_height'] = bool(val) + + # IMAGE_MODE PROPERTY + + @property + def image_mode(self): + return self.settings['image_mode'] + + @image_mode.setter + def image_mode(self, val): + val = find_choice(MODES.keys(), val) + self.settings['image_mode'] = str(val) + + # INPUT_DIVISOR PROPERTY + + @property + def input_divisor(self): + return self.settings['input_divisor'] + + @input_divisor.setter + def input_divisor(self, val): + self.settings['input_divisor'] = int(val) + + # INPUT_NORM_FUNC PROPERTY + + @property + def input_norm_func(self): + return self.settings['input_norm_func'] + + @input_norm_func.setter + def input_norm_func(self, val): + self.settings['input_norm_func'] = str(val) + + # INPUT_OFFSET PROPERTY + + @property + def input_offset(self): + return self.settings['input_offset'] + + @input_offset.setter + def input_offset(self, val): + self.settings['input_offset'] = int(val) + + @property + def template_file(self): + return self.settings['template_file'] + + @template_file.setter + def template_file(self, val): + self.settings['template_file'] = val + + def _get_input_args(self, args): + res = {} + if self.settings['adjust_image']: + res['width'] = self.settings['image_width'] + res['height'] = self.settings['image_height'] + res['mode'] = self.settings['image_mode'] + else: + res['width'] = res['height'] = -1 + res['mode'] = None + if args: + if args.width is not None: + res['width'] = args.width + if args.height is not None: + res['height'] = args.height + if args.mode is not None: + res['mode'] = args.mode + +# res['shift'] = 
self.settings['input_shift'] if args.bit_shift +# is None else args.bit_shift + res['divisor'] = self.settings['input_divisor']\ + if args.divisor is None else args.divisor + res['offset'] = self.settings['input_offset']\ + if args.offset is None else args.offset + res['transpose'] = self.settings['image_transpose'] if args.transpose is None\ + else args.transpose + res['norm_func'] = self.settings['input_norm_func'] if args.norm_func is None\ + else args.norm_func + else: + # res['shift'] = self.settings['input_shift'] + res['divisor'] = self.settings['input_divisor'] + res['offset'] = self.settings['input_offset'] + res['transpose'] = self.settings['image_transpose'] + res['norm_func'] = self.settings['input_norm_func'] + + if args.nptype: + res['nptype'] = args.nptype + + return res diff --git a/tools/nntool/interpreter/shell_utils.py b/tools/nntool/interpreter/shell_utils.py index 018944f75..45875e372 100644 --- a/tools/nntool/interpreter/shell_utils.py +++ b/tools/nntool/interpreter/shell_utils.py @@ -96,10 +96,16 @@ def output_table(table, args): def filter_dirs(path: str) -> bool: return os.path.isdir(path) -def glob_input_files(input_files): +def glob_input_files(input_files, graph_inputs=1): + input_files_list = [] for file in input_files: for globbed_file in glob(file): - yield globbed_file + input_files_list.append(globbed_file) + if len(input_files_list) % graph_inputs: + return ValueError("input files number is not divisible for graph inputs {}".format(graph_inputs)) + shard = int(len(input_files_list) / graph_inputs) + return [[input_files_list[i+j] for i in range(0, len(input_files_list), shard)] \ + for j in range(shard)] def find_choice(choices, val): hits = [p for p in choices if p.startswith(val)] @@ -121,7 +127,7 @@ def emit(self, record: logging.LogRecord): else: self.__shell.pfeedback(ansi.style_success(output)) -def format_dump_file(G, outputs, quantized): +def format_dump_file(G, outputs, quantized, dequantize): # simplify the output since we only have one for now and add weights foutputs = [] for idx, out in enumerate(outputs): @@ -131,19 +137,54 @@ def format_dump_file(G, outputs, quantized): for filt in node.contained_filters(): if quantized: qrec = G.quantization[NodeId(node, filt)] - tensors.append(qrec.weights_q.quantize(filt.weights)) - tensors.append(qrec.biases_q.quantize(filt.biases)) + if G.has_quantized_parameters: + if dequantize: + qrec = G.quantization[NodeId(node, filt)] + tensors.append(qrec.weights_q.get_dequantized(filt.weights)) + tensors.append(qrec.biases_q.get_dequantized(filt.biases)) + else: + tensors.append(np.copy(filt.weights)) + tensors.append(qrec.biases_q.get_quantized(filt.biases)) + else: + if dequantize: + tensors.append(np.copy(filt.weights)) + tensors.append(np.copy(filt.biases)) + else: + tensors.append(qrec.weights_q.quantize(filt.weights)) + tensors.append(qrec.biases_q.quantize(filt.biases)) else: - tensors.append(np.copy(filt.weights)) - tensors.append(np.copy(filt.biases)) + if G.has_quantized_parameters: + qrec = G.quantization[NodeId(node, filt)] + tensors.append(qrec.weights_q.get_dequantized(filt.weights)) + tensors.append(qrec.biases_q.get_dequantized(filt.biases)) + else: + tensors.append(np.copy(filt.weights)) + tensors.append(np.copy(filt.biases)) elif isinstance(node, FilterParameters): if quantized: qrec = G.quantization[NodeId(node, None)] - tensors.append(qrec.weights_q.quantize(node.weights)) - tensors.append(qrec.biases_q.quantize(node.biases)) + if G.has_quantized_parameters: + if dequantize: + 
tensors.append(qrec.weights_q.get_dequantized(node.weights)) + tensors.append(qrec.biases_q.get_dequantized(node.biases)) + else: + tensors.append(np.copy(node.weights)) + tensors.append(qrec.biases_q.get_quantized(node.biases)) + else: + if dequantize: + tensors.append(np.copy(node.weights)) + tensors.append(np.copy(node.biases)) + else: + tensors.append(qrec.weights_q.quantize(node.weights)) + tensors.append(qrec.biases_q.quantize(node.biases)) else: - tensors.append(np.copy(node.weights)) - tensors.append(np.copy(node.biases)) + if G.has_quantized_parameters: + qrec = G.quantization[NodeId(node, None)] + tensors.append(qrec.weights_q.dequantize(node.weights)) + tensors.append(qrec.biases_q.dequantize(node.biases)) + else: + tensors.append(np.copy(node.weights)) + tensors.append(np.copy(node.biases)) else: tensors.append(None) tensors.append(None) @@ -158,7 +199,7 @@ def print_comparison(tensors): out = [[printt(t) for t in tensors[i]] for i in range(2)] max_len = max((len(l) for i in range(2) for o in out[i] for l in o)) make_len = lambda a: a + " "*(max_len - len(a)) - combine = lambda a, b: a if b is None else " "*max_len+1 + b if a is None\ + combine = lambda a, b: a if b is None else " "*(max_len+1) + b if a is None\ else make_len(a) + " " + b all_outs = [combine(l0, l1) for (o0, o1) in zip_longest(*out, fillvalue=[])\ for (l0, l1) in zip_longest(o0, o1)] diff --git a/tools/nntool/nntool b/tools/nntool/nntool index 697bf357f..7398b1163 100755 --- a/tools/nntool/nntool +++ b/tools/nntool/nntool @@ -27,7 +27,7 @@ def create_parser(): parser = argparse.ArgumentParser(prog='nntool') parser.add_argument('graph_file', - help='graph file - Darknet .cfg file, TFLite file or JSON state file', + help='graph file - TFLite file or JSON state file', metavar="INPUT_GRAPH or STATE_FILE", nargs=argparse.OPTIONAL, default="") @@ -45,6 +45,8 @@ def create_parser(): parser.add_argument('-s', '--script_file', help='run script and exit', metavar="SCRIPT_FILE") + parser.add_argument('-H', '--header_file', + help='write graph information to header file') parser.add_argument('-m', '--model_file', help='override model file in state file') parser.add_argument('-M', '--model_directory', @@ -62,6 +64,9 @@ def create_parser(): parser.add_argument('-q', '--tf_quant', action='store_true', help='load tflite quantization parameters') + parser.add_argument('--dequant_tf', + action='store_true', + help='dequantize the tflite quantizated parameters') return parser diff --git a/tools/nntool/quantization/cross_layer_range_eq.py b/tools/nntool/quantization/cross_layer_range_eq.py index d74c2ca55..67adf25b6 100644 --- a/tools/nntool/quantization/cross_layer_range_eq.py +++ b/tools/nntool/quantization/cross_layer_range_eq.py @@ -28,7 +28,25 @@ LOG = logging.getLogger('nntool.'+__name__) -def discover_groups(G, do_relun=False): + +def process_node(node, last_neuron, group, groups, neurons): + if not node.can_equalize: + group = add_group(group, groups, neurons) + return True, None, group + + if isinstance(node, FilterParameters): + last_neuron = add_neuron(node.name, node, last_neuron, neurons, group) + return True, last_neuron, group + + if isinstance(node, ActivationParameters) and\ + last_neuron is not None and node.activation == 'relu': + assert 'activation' not in last_neuron, "weird 2 activations after conv" + last_neuron['activation'] = node + return True, last_neuron, group + return False, last_neuron, group + + +def discover_groups(G): groups = [] group = [] neurons = [] @@ -40,41 +58,23 @@ def discover_groups(G, 
do_relun=False): last_neuron = None group = add_group(group, groups, neurons) continue - # can equalize indicates that the node can be included in the group - if not node.can_equalize: - last_neuron = None - group = add_group(group, groups, neurons) - continue - - if isinstance(node, FilterParameters): - last_neuron = add_neuron(node.name, node, last_neuron, neurons, group) - continue - if isinstance(node, ActivationParameters) and\ - last_neuron is not None and\ - (node.activation == 'relu6' or node.activation == 'relun'): - # To implement for RELU6 requires a RELUN with a per channel N - # which doesn't have a generator as yet so this is just for testing - # at present - if not do_relun: - last_neuron = None - group = add_group(group, groups, neurons) - continue - assert 'activation' not in last_neuron, "weird 2 activations after conv" - last_neuron['activation'] = node + should_continue, last_neuron, group = process_node(node, last_neuron, group, + groups, neurons) + if should_continue: continue if isinstance(node, ConvFusionParameters): - # TODO - Add reluN support for fusions - filters = node.contained_filters() - if len(filters) == 1: - last_neuron = add_neuron(node.name, filters[0], last_neuron, neurons, group) + for fnode in node.contained_nodes(): + _, last_neuron, group = process_node(fnode, last_neuron, group, + groups, neurons) if group: add_group(group, groups, neurons) return groups, neurons + def add_group(group, groups, neurons): if group: LOG.info("Adding group with %d neuron pairs", len(group)) @@ -83,6 +83,7 @@ def add_group(group, groups, neurons): group = [] return group + def add_neuron(node_name, node, last_neuron, neurons, group): new_neuron = {'name': node_name, 'node': node, 'weights': None, 'biases': None} @@ -93,14 +94,17 @@ def add_neuron(node_name, node, last_neuron, neurons, group): last_neuron = new_neuron return last_neuron + def calculate_s(range_1, range_2): assert len(range_1) == len(range_2) # note: the paper is wrong. 
It should be 1/range2 not 1/range1 return [(1/range_2[i]) * math.sqrt(range_1[i] * range_2[i]) for i in range(len(range_1))] + class QuantizationError(Exception): pass + def calculate_precisions(step): nn_0 = step[0] nn_1 = step[1] @@ -110,6 +114,7 @@ def calculate_precisions(step): prec_1 = ranges_1/max_1 return prec_0, prec_1 + def process_group(group, threshold): total_precision = 0 cycles = 0 @@ -138,40 +143,28 @@ def process_group(group, threshold): ranges_0, _ = Ranges.range_output(nn_0['node'], weights=nn_0['weights']) ranges_1, _ = Ranges.range_input(nn_1['node'], weights=nn_1['weights']) scale = calculate_s(ranges_0, ranges_1) - if 'activation' in nn_0: - if 'relun' not in nn_0: - if nn_0['activation'].activation == "relu6": - nn_0['relun'] = [6.0] * len(scale) - elif nn_0['activation'].activation == "relun": - if isinstance(nn_0['activation'].activation_params, list): - nn_0['relun'] = copy(nn_0['activation'].activation_params) - else: - nn_0['relun'] = [nn_0['activation'].activation_params] * len(scale) - nn_0['relun'] = [relun/s for relun, s in zip(nn_0['relun'], scale)] # now apply the scale to the output and input channels nn_0['weights'], nn_0['biases'] =\ Scales.scale_output(nn_0['node'], scale, nn_0['weights'], nn_0['biases']) nn_1['weights'] = Scales.scale_input(nn_1['node'], scale, nn_1['weights']) + def process_groups(groups, threshold=0.01): for group in groups: LOG.info("processing group") process_group(group, float(threshold)) + def update_parameters(neurons): for neuron in neurons: params = neuron['node'] params.weights = neuron['weights'] if neuron['biases'] is not None: params.biases = neuron['biases'] - if 'relun' in neuron: - act = neuron['activation'] - act.activation = 'relun' - act.activation_params = neuron['relun'] -def weight_equalization(G, threshold=0.01, do_relun=False): +def weight_equalization(G, threshold=0.01): LOG.info("discovering groups") - groups, neurons = discover_groups(G, do_relun=do_relun) + groups, neurons = discover_groups(G) if groups and neurons: LOG.info("found %d groups and %d neurons", len(groups), len(neurons)) process_groups(groups, threshold) @@ -180,6 +173,7 @@ def weight_equalization(G, threshold=0.01, do_relun=False): else: LOG.warning("no groups to equalize found") + def adjust_biases(G, stats): for nid, stat in stats.items(): node = nid.get_node(G) diff --git a/tools/nntool/quantization/float32/__init__.py b/tools/nntool/quantization/float32/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/float32/float32_quantization.py b/tools/nntool/quantization/float32/float32_quantization.py new file mode 100644 index 000000000..baf1cb693 --- /dev/null +++ b/tools/nntool/quantization/float32/float32_quantization.py @@ -0,0 +1,112 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
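# --------------------------------------------------------------------------
# Worked example (illustration only, not part of this patch) of the cross-layer
# range equalization scale computed by calculate_s above: s_i = sqrt(r1_i * r2_i) / r2_i,
# where r1_i is the per-channel range at the output of the first filter and r2_i the
# matching per-channel range at the input of the next one. Dividing channel i of the
# first layer (weights and bias) by s_i and multiplying the corresponding input channel
# of the second layer by s_i leaves the composed ReLU network unchanged while making
# both ranges equal to sqrt(r1_i * r2_i). The numbers below are made up.
import math

r1 = [8.0, 0.5]        # hypothetical output-channel ranges of the first filter
r2 = [0.5, 2.0]        # matching input-channel ranges of the second filter
s = [(1.0 / r2[i]) * math.sqrt(r1[i] * r2[i]) for i in range(len(r1))]
# channel 0: s = sqrt(8 * 0.5) / 0.5 = 4.0 -> equalized ranges 8/4 = 2 and 0.5*4 = 2
# channel 1: s = sqrt(0.5 * 2) / 2  = 0.5 -> equalized ranges 0.5/0.5 = 1 and 2*0.5 = 1
assert [r1[i] / s[i] for i in range(2)] == [2.0, 1.0]
assert [r2[i] * s[i] for i in range(2)] == [2.0, 1.0]
# --------------------------------------------------------------------------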
+ +#pylint: disable=no-self-use +#pylint: disable=too-many-ancestors + +from typing import Sequence + +import numpy as np + +from graph.types import (Conv2DParameters, MultiplicativeBiasParameters, + Parameters) +from quantization.quantization_record_base import (QuantizationRecordBase, + FilterQuantizationRecordBase, InputOutputQuantizationRecordBase, + ScalableFilterQuantizationRecordBase) + + +class ShouldNotBeCalledError(Exception): + pass + + +class QuantizationMixin(QuantizationRecordBase): + def prepare_inputs(self, + params: Parameters, + input_tensors: Sequence[np.ndarray], ktype: str = None) -> Sequence[np.ndarray]: + del params + assert ktype == "float32", "incorrect kernel functions for this qrec type" + return input_tensors + + def get_outputs(self, + params: Parameters, + output_tensors: Sequence[np.ndarray], ktype: str = None) -> Sequence[np.ndarray]: + del params + assert ktype == "float32", "incorrect kernel functions for this qrec type" + return output_tensors + + @property + def auto_dequantize_outputs(self): + return False + + @auto_dequantize_outputs.setter + def auto_dequantize_outputs(self, val): + raise ShouldNotBeCalledError() + + @property + def auto_quantize_inputs(self): + return False + + @auto_quantize_inputs.setter + def auto_quantize_inputs(self, val): + raise ShouldNotBeCalledError() + + @property + def constants_are_quantized(self) -> bool: + return False + + +class Float32QuantizationRecord(QuantizationMixin, InputOutputQuantizationRecordBase): + pass + + +class FilterQuantizationMixin(QuantizationMixin): + + def gen_weights(self, params: Parameters, weights: np.ndarray) -> np.ndarray: + raise ShouldNotBeCalledError() + + def gen_biases(self, biases: np.ndarray) -> np.ndarray: + raise ShouldNotBeCalledError() + + def prepare_weights(self, params: Parameters, weights: np.ndarray, ktype: str = None) -> np.ndarray: + del params + assert ktype == "float32", "incorrect kernel functions for this qrec type" + return weights.astype(np.float32) + + def prepare_biases(self, params: Parameters, biases: np.ndarray, + weights: np.ndarray, ktype: str = None) -> np.ndarray: + del params, weights + assert ktype == "float32", "incorrect kernel functions for this qrec type" + return biases.astype(np.float32) + + +class Float32FilterQuantizationRecord(FilterQuantizationMixin, FilterQuantizationRecordBase): + pass + + +class Float32ScalableFilterQuantizationRecord(FilterQuantizationMixin, ScalableFilterQuantizationRecordBase): + + def gen_mul_biases(self, params: MultiplicativeBiasParameters) -> np.ndarray: + raise ShouldNotBeCalledError() + + def apply_multiplicative_bias(self, + params: Conv2DParameters, + input_tensor: np.ndarray, + axis: int, + ktype: str = None): + assert ktype == "float32", "incorrect kernel functions for this qrec type" + if params.has_mul_bias: + shape = [params.filter.out_c if idx == axis else 1 for idx in range(3)] + input_tensor *= params.mul_biases.reshape(shape) + return input_tensor diff --git a/tools/nntool/quantization/float32/float_kernet_set.py b/tools/nntool/quantization/float32/float_kernet_set.py new file mode 100644 index 000000000..a841d6ae0 --- /dev/null +++ b/tools/nntool/quantization/float32/float_kernet_set.py @@ -0,0 +1,128 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later 
version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from quantization.float32.kernels.activations import relu, leaky, hsigmoid, hswish +from quantization.float32.kernels.fast_conv import faster_conv +from quantization.float32.kernels.linear import linear +from quantization.float32.kernels.matrix_operations import matscale, piecewise +from quantization.float32.kernels.pad import pad +from quantization.float32.kernels.pool import (av_global_pool, av_pool, + max_global_pool, max_pool) +from quantization.float32.kernels.softmax import softmax +from quantization.float32.kernels.tensor_functions import (concat, + constant_input, + graph_input, graph_output, + reshape, transpose) +from quantization.float32.kernels.image_format import image_format +from quantization.kernels.kernel_function import (KernelFunction, + KernelFunctionSetBase) + + +class Float32KernelSet(KernelFunctionSetBase): + @property + def graph_input(self) -> KernelFunction: + return graph_input + + @property + def graph_output(self) -> KernelFunction: + return graph_output + + @property + def constant_input(self) -> KernelFunction: + return constant_input + + @property + def leaky(self) -> KernelFunction: + return leaky + + @property + def hswish(self) -> KernelFunction: + return hswish + + @property + def hsigmoid(self) -> KernelFunction: + return hsigmoid + + @property + def relu(self) -> KernelFunction: + return relu + + @property + def matscale(self) -> KernelFunction: + return matscale + + @property + def matadd(self) -> KernelFunction: + return piecewise + + @property + def matsub(self) -> KernelFunction: + return piecewise + + @property + def matdiv(self) -> KernelFunction: + return piecewise + + @property + def matmul(self) -> KernelFunction: + return piecewise + + @property + def conv2d(self) -> KernelFunction: + return faster_conv + + @property + def linear(self) -> KernelFunction: + return linear + + @property + def softmax(self) -> KernelFunction: + return softmax + + @property + def reshape(self) -> KernelFunction: + return reshape + + @property + def transpose(self) -> KernelFunction: + return transpose + + @property + def concat(self) -> KernelFunction: + return concat + + @property + def av_pool(self) -> KernelFunction: + return av_pool + + @property + def av_global_pool(self) -> KernelFunction: + return av_global_pool + + @property + def max_pool(self) -> KernelFunction: + return max_pool + + @property + def max_global_pool(self) -> KernelFunction: + return max_global_pool + + @property + def pad(self) -> KernelFunction: + return pad + + @property + def image_format(self) -> KernelFunction: + return image_format diff --git a/tools/nntool/quantization/float32/kernels/__init__.py b/tools/nntool/quantization/float32/kernels/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/float32/kernels/activations.py b/tools/nntool/quantization/float32/kernels/activations.py new file mode 100644 index 000000000..e0200f3a4 --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/activations.py @@ -0,0 +1,80 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the 
terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np + +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.float32.float32_quantization import Float32QuantizationRecord +from graph.types import ReluActivationParameters + + +def hswish(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + return qrec.get_outputs(params, [in_tensor * np.minimum(np.maximum(in_tensor + 3, 0), 6) / 6], ktype="float32") + + +def hsigmoid(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + return qrec.get_outputs(params, [np.minimum(np.maximum(in_tensor + params.offset, 0), 6) / 6], ktype="float32") + + +def sigmoid(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + return qrec.get_outputs(params, [1/(1 + np.exp(-in_tensor))], ktype="float32") + + +def relu(params: ReluActivationParameters, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + if params.upper_bound == None: + return qrec.get_outputs(params, + [np.maximum(in_tensor, + params.lower_bound)], + ktype="float32") + return qrec.get_outputs(params, + [np.minimum(np.maximum(in_tensor, + params.lower_bound), + params.upper_bound)], + ktype="float32") + + +def leaky(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + raise NotImplementedError() diff --git a/tools/nntool/quantization/float32/kernels/fast_conv.py b/tools/nntool/quantization/float32/kernels/fast_conv.py new file mode 100644 index 000000000..833d8b13d --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/fast_conv.py @@ -0,0 +1,130 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
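# --------------------------------------------------------------------------
# Stand-alone numeric sanity check (illustration only, not part of this patch) of the
# float32 activation kernels defined above: hswish computes x * relu6(x + 3) / 6 and
# hsigmoid computes relu6(x + offset) / 6 (the offset is usually 3). The values below
# are made up and only verify the hard-swish formula.
import numpy as np

x = np.array([-4.0, -1.0, 0.0, 1.0, 4.0], dtype=np.float32)
hswish_ref = x * np.minimum(np.maximum(x + 3.0, 0.0), 6.0) / 6.0
# hswish(-4) = 0, hswish(-1) = -1/3, hswish(0) = 0, hswish(1) = 2/3, hswish(4) = 4
assert np.allclose(hswish_ref, [0.0, -1.0 / 3.0, 0.0, 2.0 / 3.0, 4.0])
# --------------------------------------------------------------------------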
+ +import logging + +import numpy as np + +from quantization.quantization_record_base import ScalableFilterQuantizationRecordBase +from quantization.float32.float32_quantization import Float32ScalableFilterQuantizationRecord +# pylint: disable=invalid-name + +LOG = logging.getLogger("nntool." + __name__) + +def faster_conv(params, + in_tensors, + qrec: ScalableFilterQuantizationRecordBase, + details=None): + '''3D convolution by sub-matrix summing. + ''' + if qrec is None: + qrec = Float32ScalableFilterQuantizationRecord() + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + weights = qrec.prepare_weights(params, params.weights, ktype="float32") + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + + if details is not None: + details['min_acc'] = float("Infinity") + details['max_acc'] = float("-Infinity") + details['pre_mul_bias_min'] = float("Infinity") + details['pre_mul_bias_max'] = float("-Infinity") + + in_tensor = in_tensor.transpose(in_dims.transpose_to_order(['h', 'w', 'c'])).astype(np.float32) + if params.padding.h + params.padding.w > 0: + in_tensor = np.pad(in_tensor, + ([params.padding.t, + params.padding.b], + [params.padding.l, + params.padding.r]) + + ([0, 0], ) * (np.ndim(in_tensor)-2), + mode='constant', + constant_values=0.0) + pad_w = params.padding.w + pad_h = params.padding.h + else: + pad_w = pad_h = 0 + + weights = weights.transpose(params.filter.transpose_to_order(['out_c', 'h', 'w', 'in_c'])).astype(np.float32) + + filt_w = params.filter.w + filt_h = params.filter.h + + in_w = in_dims.w + in_h = in_dims.h + out_c = params.filter.out_c + + in_c_per_group = in_dims.c // params.groups + out_c_per_group = out_c // params.groups + in_c_off = 0 + out_c_cnt = 0 + + + dillated_filter_w = filt_w if params.dilation.w == 1 else filt_w * params.dilation.w - 1 + dillated_filter_h = filt_h if params.dilation.h == 1 else filt_w * params.dilation.h - 1 + + out_w = ((in_w - dillated_filter_w + pad_w)) // params.stride.w + 1 + out_h = ((in_h - dillated_filter_h + pad_h)) // params.stride.h + 1 + + if params.has_bias: + biases = qrec.prepare_biases(params, params.biases, params.weights, ktype="float32") + result = np.ones((out_c, out_h, out_w), + dtype=np.float32) * biases.reshape(out_c, 1, 1) + else: + result = np.zeros((out_c, out_h, out_w), + dtype=np.float32) + + const_h = pad_h + in_h - dillated_filter_h + 1 + const_w = pad_w + in_w - dillated_filter_w + 1 + for out_c_i in range(out_dims.c): + for cur_h in range(filt_h): + for cur_w in range(filt_w): + + # selects all elements that the filter element needs to multiply + slabhw = np.multiply(in_tensor[cur_h * params.dilation.h: + const_h + cur_h * params.dilation.h: + params.stride.h, + cur_w * params.dilation.w: + const_w + cur_w * params.dilation.w: + params.stride.w, + in_c_off: + in_c_off + in_c_per_group: + 1], + weights[out_c_i, cur_h, cur_w], + dtype=np.float32) + + # add depthwise + slabhw = slabhw.sum(axis=-1) + # add to the previous filter elements + result[out_c_i] += slabhw + + if details is not None: + details['min_acc'] = min(np.min(result[out_c_i]), details['min_acc']) + details['max_acc'] = max(np.max(result[out_c_i]), details['max_acc']) + + out_c_cnt += 1 + if out_c_cnt >= out_c_per_group: + out_c_cnt = 0 + in_c_off += in_c_per_group + + if details is not None: + details['pre_mul_bias_min'] = min(np.min(result), details['pre_mul_bias_min']) + details['pre_mul_bias_max'] = max(np.max(result), details['pre_mul_bias_max']) + + result = qrec.apply_multiplicative_bias(params, result, 
axis=0, ktype="float32") + + result = result.transpose(out_dims.transpose_from_order(['c', 'h', 'w'])) + + return qrec.get_outputs(params, [result], ktype="float32") diff --git a/tools/nntool/quantization/float32/kernels/image_format.py b/tools/nntool/quantization/float32/kernels/image_format.py new file mode 100644 index 000000000..d4415360d --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/image_format.py @@ -0,0 +1,25 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from utils.formatters import FORMAT_CHANGES, NORMALIZATIONS + +def image_format(params, in_tensors, qrec, details): + del details + in_dim = params.in_dims[0] + out_dim = params.out_dims[0] + res = in_tensors[0] + res = FORMAT_CHANGES[params.format_change](res, in_dim, out_dim) + res = NORMALIZATIONS[params.norm_func](res) + return [qrec.out_qs[0].dequantize(res)] diff --git a/tools/nntool/quantization/float32/kernels/linear.py b/tools/nntool/quantization/float32/kernels/linear.py new file mode 100644 index 000000000..ad3ad9b5a --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/linear.py @@ -0,0 +1,75 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np + +from quantization.quantization_record_base import ScalableFilterQuantizationRecordBase +from quantization.float32.float32_quantization import Float32ScalableFilterQuantizationRecord + +LOG = logging.getLogger("nntool." 
+ __name__) + + +def linear(params, + in_tensors, + qrec: ScalableFilterQuantizationRecordBase, + details=None): + + if qrec is None: + qrec = Float32ScalableFilterQuantizationRecord() + + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + weights = qrec.prepare_weights(params, params.weights, ktype="float32") + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + + if details is not None: + details['min_acc'] = float("Infinity") + details['max_acc'] = float("-Infinity") + + if params.has_bias: + biases = qrec.prepare_biases(params, params.biases, params.weights, ktype="float32") + acc_tensor = np.ones((out_dims.c, out_dims.h, out_dims.w), + dtype=np.float32) * biases.reshape((out_dims.c, out_dims.h, out_dims.w)) + acc_tensor = acc_tensor.transpose(out_dims.transpose_from_order(('c', 'h', 'w'))) + else: + acc_tensor = np.zeros(out_dims.shape, + dtype=np.float32) + + in_tensor = in_tensor.reshape((in_dims.size())) + filt = params.filter.get_filter_dims() + for out_c in range(out_dims.c): + # Expand and normalize the accumulator + + w_slice = weights[filt.srange(out_c=out_c)].reshape((in_dims.size())) + + res = np.dot(in_tensor, w_slice) + + if details is not None: + details['min_acc'] = min(np.sum(res[res < 0]), details['min_acc']) + details['max_acc'] = min(np.sum(res[res > 0]), details['max_acc']) + + acc_slice = acc_tensor[out_dims.srange(c=out_c, h=0, w=0)] + acc_slice += res + + if details is not None: + details['min_acc'] = min(np.min(acc_slice), details['min_acc']) + details['max_acc'] = max(np.max(acc_slice), details['max_acc']) + + acc_tensor = qrec.apply_multiplicative_bias( + params, acc_tensor, out_dims.get_order_idx('c'), ktype="float32") + + return qrec.get_outputs(params, [acc_tensor], ktype="float32") diff --git a/tools/nntool/quantization/float32/kernels/matrix_operations.py b/tools/nntool/quantization/float32/kernels/matrix_operations.py new file mode 100644 index 000000000..b07b0d5a3 --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/matrix_operations.py @@ -0,0 +1,54 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
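
The float32 linear kernel above accumulates one output channel at a time with `np.dot` over the flattened input and the corresponding weight slice, then adds the bias. The sketch below shows the same reduction in isolation; `weights`, `biases` and `x` are hypothetical stand-ins for `params.weights`, `params.biases` and the prepared input tensor, and the real kernel additionally handles dimension ordering, accumulator statistics and the multiplicative bias correction.

```python
import numpy as np

out_c, in_size = 4, 8
rng = np.random.default_rng(0)
weights = rng.standard_normal((out_c, in_size)).astype(np.float32)
biases = rng.standard_normal(out_c).astype(np.float32)
x = rng.standard_normal(in_size).astype(np.float32)

# Kernel-style loop: one dot product per output channel, bias added afterwards.
per_channel = np.array([np.dot(x, weights[c]) for c in range(out_c)]) + biases
# Equivalent single matrix-vector product.
vectorised = weights @ x + biases

assert np.allclose(per_channel, vectorised)
```
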
+ +import numpy as np + +from graph.types import (MatrixAddParameters, MatrixDivParameters, + MatrixMulParameters, MatrixSubParameters) +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.float32.float32_quantization import Float32QuantizationRecord + +PIECEWISE_OPS = { + MatrixAddParameters: {'op': lambda x, y: x + y}, + MatrixMulParameters: {'op': lambda x, y: np.multiply(x, y, dtype=np.float)}, + MatrixSubParameters: {'op': lambda x, y: x - y}, + MatrixDivParameters: {'op': lambda x, y: x / y}, +} + + +def piecewise(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + func = PIECEWISE_OPS[params.__class__] + op = func['op'] + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="float32") + return qrec.get_outputs(params, [op(in_tensors[0], in_tensors[1])], ktype="float32") + + +def matscale(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="float32") + if len(params.in_dims) == 3: + return qrec.get_outputs(params, [in_tensors[0] * in_tensors[1] * in_tensors[2]], ktype="float32") + return qrec.get_outputs(params, [in_tensors[0] * in_tensors[1]], ktype="float32") diff --git a/tools/nntool/quantization/float32/kernels/pad.py b/tools/nntool/quantization/float32/kernels/pad.py new file mode 100644 index 000000000..1a339671d --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/pad.py @@ -0,0 +1,29 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np + +from quantization.quantization_record_base import QuantizationRecordBase + + +def pad(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del qrec, details + if params.pad_type == "zero": + return [np.pad(in_tensors[0], params.padding.numpy_pad_shape(params.in_dims[0]), + 'constant', constant_values=0)] + raise NotImplementedError() diff --git a/tools/nntool/quantization/float32/kernels/pool.py b/tools/nntool/quantization/float32/kernels/pool.py new file mode 100644 index 000000000..429bc0751 --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/pool.py @@ -0,0 +1,153 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np + +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.float32.float32_quantization import Float32QuantizationRecord + +LOG = logging.getLogger("nntool." + __name__) + +# pylint: disable=too-many-arguments, too-many-locals + + +def av_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + + if qrec is None: + qrec = Float32QuantizationRecord() + + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + filter_sz = params.filter.h * params.filter.w + + pool_factor = 1.0/filter_sz + + out_tensor = np.zeros(out_dims.shape, dtype=np.float32) + + if params.padding.h + params.padding.w > 0: + in_tensor = np.pad(in_tensor, + params.padding.numpy_pad_shape(in_dims), + mode='constant', + constant_values=0.0) + pad_w = params.padding.w + pad_h = params.padding.h + else: + pad_w = pad_h = 0 + + for in_c in range(out_dims.c): + + out_h = 0 + for h_idx in range(0, in_dims.h - params.filter.h + pad_h + 1, + params.stride.h): + out_w = 0 + for w_idx in range(0, in_dims.w - params.filter.w + pad_w + 1, + params.stride.w): + # accumulate - potentially with different Q + in_slice_args = in_dims.srange(c=[in_c, in_c + 1, 1], + h=[h_idx, h_idx + params.filter.h, 1], + w=[w_idx, w_idx + params.filter.w, 1]) + + sum_filter = np.sum(in_tensor[in_slice_args], dtype=np.float32) + sum_filter = np.multiply(sum_filter, pool_factor, dtype=np.float32) + out_tensor[out_dims.srange(c=in_c, h=out_h, w=out_w)] = sum_filter + out_w += 1 + out_h += 1 + + return qrec.get_outputs(params, [out_tensor], ktype="float32") + + +def max_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + + del details + if qrec is None: + qrec = Float32QuantizationRecord() + + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + + out_tensor = np.zeros(out_dims.shape, dtype=np.float32) + + if params.padding.h + params.padding.w > 0: + in_tensor = np.pad(in_tensor, + params.padding.numpy_pad_shape(in_dims), + mode='constant', + constant_values=0.0) + pad_w = params.padding.w + pad_h = params.padding.h + else: + pad_w = pad_h = 0 + + for in_c in range(out_dims.c): + out_h = 0 + for h_idx in range(0, in_dims.h - params.filter.h + pad_h + 1, + params.stride.h): + out_w = 0 + for w_idx in range(0, in_dims.w - params.filter.w + pad_w + 1, + params.stride.w): + # accumulate - potentially with different Q + out_slice_args = out_dims.srange(c=in_c, h=out_h, w=out_w) + in_slice_args = in_dims.srange(c=[in_c, in_c + 1, 1], + h=[h_idx, h_idx + params.filter.h, 1], + w=[w_idx, w_idx + params.filter.w, 1]) + + out_tensor[out_slice_args] = np.max(in_tensor[in_slice_args].view(np.ndarray)) + out_w += 1 + out_h += 1 + + return qrec.get_outputs(params, [out_tensor], ktype="float32") + + +def av_global_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + + sum_by_chan = np.sum(in_tensor, dtype=np.float32, axis=( + params.in_dims[0].get_order_idx('w'), params.in_dims[0].get_order_idx('h'))) + + return qrec.get_outputs(params, + [(sum_by_chan / (params.in_dims[0].h * 
params.in_dims[0].w) + ).reshape(params.out_dims[0].shape)], + ktype="float32") + + +def max_global_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + + return qrec.get_outputs(params, [np.max(in_tensor, + axis=(params.in_dims[0].get_order_idx('w'), + params.in_dims[0].get_order_idx('h')), + keepdims=True)], ktype="float32") diff --git a/tools/nntool/quantization/float32/kernels/readme.md b/tools/nntool/quantization/float32/kernels/readme.md new file mode 100644 index 000000000..d7b516efb --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/readme.md @@ -0,0 +1,7 @@ +# Float32 Kernels + +These kernels expect float32s as inputs and calculate float32s as outputs in float32 precision. + +They can be used with values quantized with other schemes by passing the appropriate quantization record. + +By default they expect parameters and constant inputs to be in float32 format. \ No newline at end of file diff --git a/tools/nntool/quantization/float32/kernels/softmax.py b/tools/nntool/quantization/float32/kernels/softmax.py new file mode 100644 index 000000000..28c24a371 --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/softmax.py @@ -0,0 +1,34 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np +import scipy.special + +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.float32.float32_quantization import Float32QuantizationRecord + +def softmax_func(v): + return scipy.special.softmax(v) + +def softmax(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + np.seterr(over='raise') + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + return qrec.get_outputs(params, [softmax_func(in_tensor)], ktype="float32") diff --git a/tools/nntool/quantization/float32/kernels/tensor_functions.py b/tools/nntool/quantization/float32/kernels/tensor_functions.py new file mode 100644 index 000000000..01c7265c4 --- /dev/null +++ b/tools/nntool/quantization/float32/kernels/tensor_functions.py @@ -0,0 +1,104 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np +from skimage.transform import resize + +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.float32.float32_quantization import Float32QuantizationRecord + + +def graph_input(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = in_tensors[params.index] + if in_tensor.size == params.dims.size(): + in_tensor = in_tensor.reshape(params.dims.shape) + else: + in_tensor = resize(in_tensor, params.dims.shape) + if params.transpose_out: + in_tensor = np.transpose(in_tensor, params.transpose_out) + return qrec.get_outputs(params, [in_tensor], ktype="float32") + + +def graph_output(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details, qrec + in_tensor = in_tensors[0] + if params.transpose_in: + in_tensor = np.transpose(in_tensor, params.transpose_in) + return [in_tensor] + + + +def constant_input(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del in_tensors, details + if qrec is None: + qrec = Float32QuantizationRecord() + return qrec.get_outputs(params, [params.value], ktype="float32") + + +def concat(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="float32") + if params.transpose_in: + in_tensors = [np.transpose(qrec.in_tensor, params.transpose_in) for in_tensor in in_tensors] + out_tensor = np.concatenate(in_tensors, params.axis) + if params.transpose_out: + out_tensor = np.transpose(out_tensor, params.transpose_out) + return qrec.get_outputs(params, [out_tensor], ktype="float32") + + +def reshape(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + if params.transpose_in: + in_tensor = np.transpose(in_tensor, params.transpose_in) + in_tensor = np.reshape(in_tensor, params.shape) + if params.transpose_out: + in_tensor = np.transpose(in_tensor, params.transpose_out) + return qrec.get_outputs(params, [in_tensor], ktype="float32") + + +def transpose(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if qrec is None: + qrec = Float32QuantizationRecord() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="float32")[0] + if params.transpose_in: + in_tensor = np.transpose(in_tensor, params.transpose_in) + return qrec.get_outputs(params, [in_tensor], ktype="float32") diff --git a/tools/nntool/quantization/kernels/__init__.py b/tools/nntool/quantization/kernels/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/kernels/kernel_function.py b/tools/nntool/quantization/kernels/kernel_function.py new file mode 100644 index 000000000..b1cce89d7 --- /dev/null +++ b/tools/nntool/quantization/kernels/kernel_function.py @@ -0,0 +1,129 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from abc import ABC, abstractproperty +from typing import Callable, Iterable, NewType, Mapping, Any, Optional + +import numpy as np + +from graph.types import Parameters +from quantization.quantization_record_base import QuantizationRecordBase + +KernelFunction = NewType('KernelFunction', + Callable[ + [Parameters, + Iterable[np.ndarray], + QuantizationRecordBase, + Optional[Mapping[Any, Any]]], + Iterable[np.ndarray] + ]) + +class KernelFunctionSetBase(ABC): + + @abstractproperty + def graph_input(self) -> KernelFunction: + pass + + @abstractproperty + def graph_output(self) -> KernelFunction: + pass + + @abstractproperty + def constant_input(self) -> KernelFunction: + pass + + @abstractproperty + def relu(self) -> KernelFunction: + pass + + @abstractproperty + def leaky(self) -> KernelFunction: + pass + + @abstractproperty + def hswish(self) -> KernelFunction: + pass + + @abstractproperty + def hsigmoid(self) -> KernelFunction: + pass + + @abstractproperty + def matadd(self) -> KernelFunction: + pass + + @abstractproperty + def matsub(self) -> KernelFunction: + pass + + @abstractproperty + def matdiv(self) -> KernelFunction: + pass + + @abstractproperty + def matmul(self) -> KernelFunction: + pass + + @abstractproperty + def matscale(self) -> KernelFunction: + pass + + @abstractproperty + def conv2d(self) -> KernelFunction: + pass + + @abstractproperty + def linear(self) -> KernelFunction: + pass + + @abstractproperty + def softmax(self) -> KernelFunction: + pass + + @abstractproperty + def reshape(self) -> KernelFunction: + pass + + @abstractproperty + def transpose(self) -> KernelFunction: + pass + + @abstractproperty + def concat(self) -> KernelFunction: + pass + + @abstractproperty + def av_pool(self) -> KernelFunction: + pass + + @abstractproperty + def av_global_pool(self) -> KernelFunction: + pass + + @abstractproperty + def max_pool(self) -> KernelFunction: + pass + + @abstractproperty + def max_global_pool(self) -> KernelFunction: + pass + + @abstractproperty + def pad(self) -> KernelFunction: + pass + + @abstractproperty + def image_format(self) -> KernelFunction: + pass diff --git a/tools/nntool/quantization/kernels/kernel_switch.py b/tools/nntool/quantization/kernels/kernel_switch.py new file mode 100644 index 000000000..cf5c9ca8e --- /dev/null +++ b/tools/nntool/quantization/kernels/kernel_switch.py @@ -0,0 +1,99 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
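
Every entry exposed by a kernel set is expected to match the `KernelFunction` signature declared above: it receives the node parameters, the input tensors, an optional quantization record and an optional details mapping, and returns a sequence of output tensors. A minimal, purely illustrative kernel following that shape is sketched below (the `identity_kernel` name is hypothetical and not part of nntool); a concrete set such as `Float32KernelSet` simply returns functions like this from its properties.

```python
import numpy as np

def identity_kernel(params, in_tensors, qrec, details=None):
    # Trivial kernel conforming to the KernelFunction signature:
    # it ignores params/qrec/details and passes the first input through.
    del params, qrec, details
    return [np.asarray(in_tensors[0])]

out = identity_kernel(None, [np.ones((2, 3), dtype=np.float32)], qrec=None)
print(out[0].shape)  # (2, 3)
```
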
+ +from abc import ABC, abstractmethod +from typing import Sequence + +import numpy as np + +from graph.types import (ActivationParameters, ConcatParameters, + ConstantInputParameters, Conv2DParameters, + FcParameters, GlobalPoolParameters, InputParameters, + MatrixAddParameters, + MatrixDivParameters, MatrixMulParameters, + MatrixSubParameters, MatScaleFusionParameters, + OutputParameters, PadParameters, Parameters, + PoolingParameters, ReshapeParameters, + SoftMaxParameters, TransposeParameters, + ReluActivationParameters, LeakyActivationParameters, + HSwishActivationParameters, HSigmoidActivationParameters, + ImageFormatParameters) +from quantization.quantization_record_base import QuantizationRecordBase + + +class KernelSwitchBase(ABC): + @abstractmethod + def execute(self, params: Parameters, input_tensors: Sequence[np.ndarray], + qrec: QuantizationRecordBase, details: str = None) -> Sequence[np.ndarray]: + pass + + +class DefaultKernelSwitch(KernelSwitchBase): + FUNCTION_MAPPING = { + Conv2DParameters: "conv2d", + FcParameters: "linear", + PoolingParameters: "pool_switch", + InputParameters: "graph_input", + OutputParameters: "graph_output", + ReluActivationParameters: "relu", + LeakyActivationParameters: "leaky", + HSigmoidActivationParameters: "hsigmoid", + HSwishActivationParameters: "hswish", + PadParameters: "pad", + SoftMaxParameters: "softmax", + ReshapeParameters: "reshape", + MatrixAddParameters: "matadd", + MatrixDivParameters: "matdiv", + MatrixMulParameters: "matmul", + MatrixSubParameters: "matsub", + ConcatParameters: "concat", + TransposeParameters: "transpose", + ConstantInputParameters: "constant_input", + MatScaleFusionParameters: "matscale", + GlobalPoolParameters: "globalpool_switch", + ImageFormatParameters: "image_format" + } + + def __init__(self, kernel_functions): + super(DefaultKernelSwitch, self).__init__() + self._kernel_functions = kernel_functions + + def execute(self, params: Parameters, input_tensors: Sequence[np.ndarray], + qrec: QuantizationRecordBase, details: str = None) -> Sequence[np.ndarray]: + if params.__class__ in self.FUNCTION_MAPPING: + func = self.FUNCTION_MAPPING[params.__class__] + if hasattr(self, func): + return getattr(self, func)(params, input_tensors, qrec, details=details) + if hasattr(self._kernel_functions, func): + return getattr(self._kernel_functions, func)(params, input_tensors, + qrec, details=details) + raise NotImplementedError("Implementation for %s not found" % func) + raise NotImplementedError("Unknown parameter type %s" % params.__class__.name) + + def pool_switch(self, params: Parameters, input_tensors: Sequence[np.ndarray], + qrec: QuantizationRecordBase, details: str = None) -> Sequence[np.ndarray]: + if params.pool_type == "average": + return self._kernel_functions.av_pool(params, input_tensors, qrec, details=details) + if params.pool_type == "max": + return self._kernel_functions.max_pool(params, input_tensors, qrec, details=details) + raise NotImplementedError("unknown pool type %s" % params.pool_type) + + def globalpool_switch(self, params: Parameters, input_tensors: Sequence[np.ndarray], + qrec: QuantizationRecordBase, details: str = None) -> Sequence[np.ndarray]: + if params.pool_type == "average": + return self._kernel_functions.av_global_pool(params, input_tensors, qrec, details=details) + if params.pool_type == "max": + return self._kernel_functions.max_global_pool(params, input_tensors, qrec, details=details) + raise NotImplementedError("unknown pool type %s" % params.pool_type) diff --git 
a/tools/nntool/quantization/multiplicative/__init__.py b/tools/nntool/quantization/multiplicative/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/multiplicative/asymmetric/__init__.py b/tools/nntool/quantization/multiplicative/asymmetric/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/multiplicative/asymmetric/asymmetric_mult_qtype.py b/tools/nntool/quantization/multiplicative/asymmetric/asymmetric_mult_qtype.py new file mode 100644 index 000000000..d13f0f697 --- /dev/null +++ b/tools/nntool/quantization/multiplicative/asymmetric/asymmetric_mult_qtype.py @@ -0,0 +1,196 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import math + +import numpy as np + +from quantization.multiplicative.mult_qtype_base import MultQTypeBase + +VALID_DTYPES = [ + np.int8, + np.int16, + np.int32, + np.uint8, + np.uint16, + np.uint32 +] + +SIGNED_DTYPES = [ + np.int8, + np.int16, + np.int32, +] +# template +# QuantizationPoints GetQuantizationPoints() { +# QuantizationPoints qp; +# using Integer = DataType; +# qp.min_value = std::numeric_limits::min(); +# qp.max_value = std::numeric_limits::max(); +# // eg [-128,127]... +# qp.central_value = (qp.min_value / 2 + // -128 -> -64. +# (qp.max_value - 1) / 2 + // 127 -> 63. 
+# 1); +# return qp; +# } + + +class AsymmetricMultQType(MultQTypeBase): + def __init__(self, *args, zero_point=0, init=None, **kwargs): + super(AsymmetricMultQType, self).__init__(*args, init=init, **kwargs) + if init is None: + self.zero_point = self.init_array(zero_point) + + @classmethod + def from_tflite(cls, tf_qps, dtype): + res = cls(zero_point=tf_qps.ZeroPointAsNumpy() if tf_qps.ZeroPointLength() > 0 else None) + res.min_val = tf_qps.MinAsNumpy() if tf_qps.MinLength() > 0 else None + res.max_val = tf_qps.MaxAsNumpy() if tf_qps.MaxLength() > 0 else None + res.scale = tf_qps.ScaleAsNumpy() if tf_qps.ScaleLength() > 0 else None + res.zero_point = tf_qps.ZeroPointAsNumpy() if tf_qps.ZeroPointLength() > 0 else None + res.quantized_dimension = tf_qps.QuantizedDimension() + res.dtype = dtype + return res + + @classmethod + def from_array(cls, arr: np.ndarray, dtype=np.uint8, + quantized_dimension=None, narrow_range=False): + rmin = np.min(arr) + rmax = np.max(arr) + return cls.from_min_max(rmin, rmax, dtype=dtype, quantized_dimension=quantized_dimension, + narrow_range=narrow_range) + + @classmethod + def from_min_max(cls, rmin, rmax, dtype=np.uint8, + quantized_dimension=None, narrow_range=False): + iinfo = np.iinfo(dtype) + qmin = iinfo.min + (1 if narrow_range else 0) + qmax = iinfo.max + if rmin == rmax: + if rmin == 0: + return cls(min_val=0, max_val=0, scale=0, zero_point=0, + quantized_dimension=quantized_dimension, dtype=dtype) + elif rmin < 0: + rmax = -rmin + else: + rmin = -rmax + # we must represent 0 + if rmin > 0: + rmin = 0 + scale = (rmax - rmin) / (qmax - qmin) + zero_point_from_min = qmin - rmin / scale + zero_point_from_max = qmax - rmax / scale + zero_point_from_min_error = qmin + abs(rmin / scale) + zero_point_from_max_error = qmax + abs(rmax / scale) + if zero_point_from_min_error < zero_point_from_max_error: + zero_point = zero_point_from_min + else: + zero_point = zero_point_from_max + + nudged_zero_point = 0 + if zero_point < qmin: + nudged_zero_point = qmin + elif zero_point > qmax: + nudged_zero_point = qmax + else: + nudged_zero_point = math.floor(zero_point + 0.5) + + return cls(min_val=rmin, max_val=rmax, scale=scale, zero_point=nudged_zero_point, + quantized_dimension=quantized_dimension, dtype=dtype) + + @property + def central(self): + iinfo = np.iinfo(self.dtype) + if iinfo.min == 0: + return iinfo.min // 2 + (iinfo.max - 1) // 2 + 1 + return 0 + + @property + def min(self): + return self.min_val + + @property + def max(self): + return self.max_val + + def get_quantized_scale(self): + return [0], [0] + + def dtype_is_valid(self): + return self.dtype in VALID_DTYPES + + def _encapsulate(self): + return { + "min_val": self.min_val, + "max_val": self.max_val, + "scale": self.scale, + "zero_point": self.zero_point, + "dim": self.quantized_dimension, + "dtype": self.dtype.__name__ + } + + @classmethod + def _dencapsulate(cls, val): + return cls(init={ + "min_val": val['min_val'], + "max_val": val['max_val'], + "scale": val['scale'], + "zero_point": val['zero_point'], + "quantized_dimension": val['dim'] if 'dim' in val else None, + "dtype": getattr(np, val['dtype']) + }) + + @property + def pad_zero_point(self): + return self.zero_point[0] + + @property + def zero_point(self): + return self._info['zero_point'] + + @zero_point.setter + def zero_point(self, val): + self._info['zero_point'] = val + + def quantize(self, arr: np.array) -> np.array: + arr = np.floor(arr/self.scale + 0.5) + self.zero_point + return self.clip(arr) + + def dequantize(self, arr: 
np.array) -> np.array: + + shape = tuple(dim if idx == self.quantized_dimension + else 1 for idx, dim in enumerate(arr.shape)) + if len(self.zero_point) > 1: + zero_point = self.zero_point.reshape(shape) + else: + zero_point = self.zero_point + + if len(self.scale) > 1: + scale = self.scale.reshape(shape) + else: + scale = self.scale + + arr = (arr.astype(np.float32) - zero_point) * scale + return arr + + def get_dequantized(self, arr, container_is_quantized=True): + if container_is_quantized: + return self.dequantize(arr) + return arr + + def get_quantized(self, arr: np.array, container_is_quantized=True) -> np.array: + if not container_is_quantized: + return self.quantize(arr) + return arr diff --git a/tools/nntool/quantization/multiplicative/mult_mulbias_qtype.py b/tools/nntool/quantization/multiplicative/mult_mulbias_qtype.py new file mode 100644 index 000000000..b989dba09 --- /dev/null +++ b/tools/nntool/quantization/multiplicative/mult_mulbias_qtype.py @@ -0,0 +1,126 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import math + +import numpy as np + +from quantization.qtype_base import QTypeBase +from utils.json_serializable import JsonSerializable + +NUM_BITS = { + np.int8: 8, + np.int16: 16, + np.int32: 32, + np.uint8: 8, + np.uint16: 16, + np.uint32: 32, +} + +SIGNED = [ + np.int8, + np.int16, + np.int32 +] + +class MultMulBiasQType(QTypeBase, JsonSerializable): + def __init__(self, dtype=None, init=None): + if init: + self._info = init + else: + self._info = { + 'dtype': dtype if dtype is not None else np.int16, + 'shift': None + } + + @property + def q(self): + return self._info['shift'] + + @property + def bits(self): + return NUM_BITS[self.dtype] + + @property + def signed(self): + return self.dtype in SIGNED + + @property + def shift_is_set(self): + return self.q is not None + + @property + def dtype(self): + return self._info['dtype'] + + @property + def pad_zero_point(self): + return 0 + + def _encapsulate(self): + return { + 'dtype': self.dtype.__name__, + 'shift': self.q + } + + @classmethod + def _dencapsulate(cls, val): + return cls(init={ + 'dtype': getattr(np, val['dtype']), + 'shift': val['shift'] + }) + + @property + def pre_normalization(self): + return self._info.get('pre_normalization') or 0 + + @pre_normalization.setter + def pre_normalization(self, val): + self._info['pre_normalization'] = val + + def quantize_elem(self, val: np.float64): + return self.normalize(val) + + def quantize(self, arr: np.array) -> np.array: + return np.array([self.normalize(elem) for elem in arr]).astype(self.dtype) + + def dequantize_elem(self, val): + return val * 1.0/(1 << self.q) + + def dequantize(self, arr: np.array) -> np.array: + return arr.astype(np.float32) * 1/(1 << self.q) + + def get_shift(self, max_num): + (val, shift) = math.frexp(max_num) + if val > ((self.bits - 2)/(self.bits - 1)): + val /= 2 + shift += 1 + shift = shift - (self.bits - 
1) + return val, shift + + def set_shift(self, max_num): + _, shift = self.get_shift(max_num) + shift = shift - self.pre_normalization + assert shift <= 0, "number cannot be represented with a right shift" + self._info['shift'] = -shift + + def normalize(self, fnum): + val, shift = self.get_shift(fnum) + cor = shift + self.q - self.pre_normalization + assert cor <= 0, "correction should never be positive" + return math.floor((val * (1 << (self.bits - 1 + cor))) + 0.5) + + def __str__(self): + return "{}b>>{}".format(self.bits, self.q) diff --git a/tools/nntool/quantization/multiplicative/mult_qtype_base.py b/tools/nntool/quantization/multiplicative/mult_qtype_base.py new file mode 100644 index 000000000..0295b7cba --- /dev/null +++ b/tools/nntool/quantization/multiplicative/mult_qtype_base.py @@ -0,0 +1,249 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import math +import numbers +from abc import ABC, abstractmethod, abstractproperty + +import numpy as np + +from quantization.qtype import QType +from quantization.qtype_base import QTypeBase +from utils.json_serializable import JsonSerializable + +#pylint: disable=abstract-method + +SIGNED = [ + np.int8, + np.int16, + np.int32 +] + +DTYPE_STR = { + np.int8: 'i8', + np.int16: 'i16', + np.int32: 'i32', + np.uint8: 'u8', + np.uint16: 'u16', + np.uint32: 'u32', +} + +DTYPE_CTYPE = { + np.int8: 'int8', + np.int16: 'int16', + np.int32: 'int32', + np.uint8: 'uint8', + np.uint16: 'uint16', + np.uint32: 'uint32', +} + + +class WrapperMixin(ABC): + @property + def wrapped(self): + return self._wrapped + + def quantize_wrapped(self, arr: np.array) -> np.array: + return self._wrapped.quantize(arr) + + def dequantize_wrapped(self, arr: np.array) -> np.array: + return self._wrapped.dequantize(arr) + + def requantize(self, arr: np.array) -> np.array: + return self.quantize(self._wrapped.dequantize(arr)) + + +class MultQTypeBase(QTypeBase, JsonSerializable): + def __init__(self, *args, min_val=None, max_val=None, scale=None, + quantized_dimension=0, dtype=None, init=None, **kwargs): + super(MultQTypeBase, self).__init__(*args, **kwargs) + if init: + self._info = init + else: + self._info = {} + self.min_val = self.init_array(min_val) + self.max_val = self.init_array(max_val) + self.quantized_dimension = quantized_dimension + self.dtype = dtype if dtype is not None else np.int8 + self.scale = self.init_array(scale) + # if not self.dtype_is_valid(): + # raise ValueError("dtype %s is not valid for %s" % (self.dtype, self.__class__.__name__)) + + + @property + def ctype(self): + return DTYPE_CTYPE[self.dtype] + + @abstractmethod + def dtype_is_valid(self): + pass + + @staticmethod + def init_array(val, key=None): + if key is not None: + val = val.get(key) + if isinstance(val, np.ndarray): + return val + elif isinstance(val, (numbers.Real, np.float)): + return np.array([val]) + elif isinstance(val, list): + return np.array(val) 
+ elif val is None: + return None + else: + raise ValueError("invalid scale") + + @property + def signed(self): + return self.dtype in SIGNED + + @property + def q(self): # in mulbias scheme all Qs are zero except the mulbias + return 0 + + @property + def bits(self): + return self.dtype().itemsize * 8 + + @abstractproperty + def zero_point(self): + pass + + @abstractmethod + @zero_point.setter + def zero_point(self, val): + pass + + @abstractproperty + def min(self): + pass + + @property + def min_val(self): + return self._info['min_val'] + + @min_val.setter + def min_val(self, val): + self._info['min_val'] = val + + @abstractproperty + def max(self): + pass + + @property + def max_val(self): + return self._info['max_val'] + + @max_val.setter + def max_val(self, val): + self._info['max_val'] = val + + @property + def range(self): + if self.max is None and self.min is None: + return None + assert np.all(self.max >= self.min) + return self.max - self.min + + kNearZeroTolerance = 1.0e-6 + + @property + def scale(self): + return self._info['scale'] + + @scale.setter + def scale(self, val): + if isinstance(val, np.ndarray): + self._info['scale'] = np.where(val < self.kNearZeroTolerance, 1, val) + elif val is not None and val < self.kNearZeroTolerance: + self._info['scale'] = 1 + else: + self._info['scale'] = val + + @property + def quantized_dimension(self): + return self._info['quantized_dimension'] + + @quantized_dimension.setter + def quantized_dimension(self, val): + self._info['quantized_dimension'] = val + + @property + def dtype(self): + return self._info['dtype'] + + @dtype.setter + def dtype(self, val): + self._info['dtype'] = val + + @property + def pad_zero_point(self): + return 0 + + @abstractmethod + def get_quantized(self, arr, container_is_quantized=True): + pass + + @abstractmethod + def get_dequantized(self, arr, container_is_quantized=True): + pass + + def expand_from(self, arr: np.array, from_qtype: QTypeBase) -> np.array: + #if self.dtype != from_qtype.dtype: + return arr.astype(self.dtype) + + def reduce_from(self, arr: np.array, from_qtype: QTypeBase) -> np.array: + #if self.dtype != from_qtype.dtype: + return self.clip(arr) + + @staticmethod + def str_fmt(val, extend=False): + if val is None: + return "unkn" + if isinstance(val, int) or isinstance(val, float) or isinstance(val, np.floating): + return val + return "chan" if len(val) > 1 else ("{:0.2f}".format(val[0]) if not extend else "{:0.8f}".format(val[0])) + + def __eq__(self, other): + if isinstance(other, QType): + return self.bits == other.bits + if isinstance(other, MultQTypeBase): + return self.scale == other.scale and\ + self.dtype == other.dtype and\ + self.quantized_dimension == other.quantized_dimension and\ + self.zero_point == other.zero_point + return False + + @property + def dtype_str(self): + return DTYPE_STR[self.dtype] + + def str_by_chan(self, chan: int): + return "{}<({}-{})*{}<{}".format( + self.str_fmt(self.min[chan]), + self.dtype_str, + self.str_fmt(self.zero_point[chan]), + self.str_fmt(self.scale[chan]), + self.str_fmt(self.max[chan]), + ) + + def __str__(self): + return "{}<({}-{})*{}<{}".format( + self.str_fmt(self.min), + self.dtype_str, + self.str_fmt(self.zero_point), + self.str_fmt(self.scale, extend=True), + self.str_fmt(self.max), + ) diff --git a/tools/nntool/quantization/multiplicative/mult_quantization.py b/tools/nntool/quantization/multiplicative/mult_quantization.py new file mode 100644 index 000000000..1654a2109 --- /dev/null +++ 
b/tools/nntool/quantization/multiplicative/mult_quantization.py @@ -0,0 +1,388 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import math +from functools import reduce +from typing import Sequence + +import numpy as np + +from graph.types import (FilterParameters, MultiplicativeBiasParameters, + Parameters) +from quantization.multiplicative.mult_qtype_base import (MultQTypeBase, + WrapperMixin) +from quantization.multiplicative.symmetric.mult_mulbias_qtype_new import ( + MultMulBiasQType, MultMulBiasScaleQType) +from quantization.multiplicative.symmetric.symmetric_mult_biases_qtype import \ + SymmetricMultBiasesQType +from quantization.qtype import QType +from quantization.quantization_record_base import ( + ConstantQuantizationRecordBase, InputOutputQuantizationRecordBase, + QuantizationRecordBase, ScalableFilterQuantizationRecordBase) + + +class MultQuantizationRecordBase(QuantizationRecordBase): + def __init__(self, *args, quant_mode="symmetric", info=None, **kwargs): + super(MultQuantizationRecordBase, self).__init__(*args, info=info, **kwargs) + if info is None: + self._info['quant_mode'] = quant_mode + self._cache = {} + + def clear_cache(self): + self._cache = {} + + def check_cache(self, name): + return self._cache.get(name) + + def dequantize_as(self, tensor: np.ndarray, key_name: str, idx: int = None) -> np.ndarray: + qtype = getattr(self, key_name) + if idx is not None: + qtype = qtype[idx] + return qtype.dequantize(tensor) + + def quantize_as(self, tensor: np.ndarray, key_name: str, idx: int = None) -> np.ndarray: + qtype = getattr(self, key_name) + if idx is not None: + qtype = qtype[idx] + return qtype.quantize(tensor) + + def dequantize_wrapped(self, tensor: np.ndarray, key_name: str, idx: int = None) -> np.ndarray: + qtype = getattr(self, key_name) + if idx is not None: + qtype = qtype[idx] + if isinstance(qtype, WrapperMixin): + return qtype.wrapped.dequantize(tensor) + return qtype.dequantize(tensor) + + def quantize_wrapped(self, tensor: np.ndarray, key_name: str, idx: int = None) -> np.ndarray: + qtype = getattr(self, key_name) + if idx is not None: + qtype = qtype[idx] + if isinstance(qtype, WrapperMixin): + return qtype.wrapped.quantize(tensor) + return qtype.quantize(tensor) + + def requantize(self, tensor: np.ndarray, key_name: str, idx: int = None) -> np.ndarray: + qtype = getattr(self, key_name) + if idx is not None: + qtype = qtype[idx] + + if isinstance(qtype, WrapperMixin): + tensor_fp = self.dequantize_wrapped(tensor, key_name, idx=idx) + tensor_sym = qtype.quantize(tensor_fp) + return tensor_sym + if tensor.dtype == np.float32: + return qtype.quantize(tensor) + return tensor + + def confirm_dimension(self, out_c_idx: int, key_name: str): + qtype = getattr(self, key_name) + qtype.quantized_dimension = out_c_idx + + +class InputQuantizationMixin(MultQuantizationRecordBase): + def __init__(self, *args, auto_quantize_inputs=False, 
auto_dequantize_inputs=False, **kwargs): + super(InputQuantizationMixin, self).__init__(*args, **kwargs) + self._auto_quantize_inputs = auto_quantize_inputs + self._auto_dequantize_inputs = auto_dequantize_inputs + + @property + def auto_quantize_inputs(self): + return self._auto_quantize_inputs + + @auto_quantize_inputs.setter + def auto_quantize_inputs(self, val): + self._auto_quantize_inputs = val + + @property + def auto_dequantize_inputs(self): + return self._auto_dequantize_inputs + + @auto_quantize_inputs.setter + def auto_quantize_inputs(self, val): + self._auto_dequantize_inputs = val + + def prepare_inputs(self, params: Parameters, + input_tensors: Sequence[np.ndarray], ktype: str = None) -> Sequence[np.ndarray]: + del params + if ktype == 'float32' and self.auto_dequantize_inputs: + return [self.dequantize_wrapped(input_tensor, "in_qs", idx=idx) + for idx, input_tensor in enumerate(input_tensors)] + if ktype == 'symmetric' and self.auto_quantize_inputs: + return [self.quantize_as(input_tensor, "in_qs", idx=idx) for idx, input_tensor in enumerate(input_tensors)] + return input_tensors + + +class OutputQuantizationMixin(MultQuantizationRecordBase): + def __init__(self, *args, auto_dequantize_outputs=False, auto_quantize_outputs=False, **kwargs): + super(OutputQuantizationMixin, self).__init__(*args, **kwargs) + self._auto_quantize_outputs = auto_quantize_outputs + self._auto_dequantize_outputs = auto_dequantize_outputs + + @property + def auto_dequantize_outputs(self): + return self._auto_dequantize_outputs + + @auto_dequantize_outputs.setter + def auto_dequantize_outputs(self, val): + self._auto_dequantize_outputs = val + + @property + def auto_quantize_outputs(self): + return self._auto_dequantize_outputs + + @auto_quantize_outputs.setter + def auto_quantize_outputs(self, val): + self._auto_dequantize_outputs = val + + def get_outputs(self, params: Parameters, + output_tensors: Sequence[np.ndarray], ktype: str = None) -> Sequence[np.ndarray]: + del params + if ktype == 'symmetric': + if self._auto_dequantize_outputs: + return [self.dequantize_as(output_tensor, "out_qs", idx=idx) + for idx, output_tensor in enumerate(output_tensors)] + output_tensors = [self.out_qs[idx].clip(output_tensor) + for idx, output_tensor in enumerate(output_tensors)] + return output_tensors + + +class MultQuantizationRecord(InputQuantizationMixin, OutputQuantizationMixin, InputOutputQuantizationRecordBase): + def __init__(self, *args, scale_mul_biases_q=None, info=None, **kwargs): + super(MultQuantizationRecord, self).__init__(*args, info=info, **kwargs) + if info is None: + self.scale_mul_biases_q = scale_mul_biases_q + + @property + def scale_mul_biases_q(self): + mul_biases_q = self._info.get('scale_mul_biases_q') + if mul_biases_q is None: + mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8) + self.scale_mul_biases_q = mul_biases_q + return mul_biases_q + + @scale_mul_biases_q.setter + def scale_mul_biases_q(self, val): + self._info['scale_mul_biases_q'] = val + + def set_scale(self, in_idx=0, out_idx=0, extra_scale=1): + if isinstance(in_idx, int): + in_scale = self.in_qs[in_idx].scale + else: + in_scale = reduce(lambda x, y: x * y, [self.in_qs[idx].scale for idx in in_idx]) + if isinstance(out_idx, int): + out_scale = self.out_qs[out_idx].scale + else: + out_scale = reduce(lambda x, y: x * y, [self.out_qs[idx].scale for idx in out_idx]) + scale_mul_biases_q = self.scale_mul_biases_q + scale = in_scale * extra_scale / out_scale + scale_mul_biases_q.scale = scale + + +class 
MultAddQuantizationRecord(MultQuantizationRecord): + def __init__(self, *args, scale_in_mul_biases_q=None, info=None, **kwargs): + super(MultAddQuantizationRecord, self).__init__(*args, info=info, **kwargs) + if info is None: + self._info['scale_in_mul_biases_q'] = scale_in_mul_biases_q + + @property + def scale_in_mul_biases_q(self): + mul_biases_q = self._info.get('scale_in_mul_biases_q') + if mul_biases_q is None: + mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8) + self.scale_in_mul_biases_q = mul_biases_q + return mul_biases_q + + @scale_in_mul_biases_q.setter + def scale_in_mul_biases_q(self, val): + self._info['scale_in_mul_biases_q'] = val + + @property + def scaled_idx(self): + return 1 if self.in_qs[1].scale > self.in_qs[0].scale else 0 + + def set_add_scale(self): + self.set_scale(in_idx=0 if self.scaled_idx else 1) + scale_in_mul_biases_q = self.scale_in_mul_biases_q + scaled_idx = self.scaled_idx + not_scaled_idx = 0 if scaled_idx else 1 + scale = self.in_qs[scaled_idx].scale / self.in_qs[not_scaled_idx].scale + scale_in_mul_biases_q.scale = scale + + +class MultConstantQuantizationRecord(InputQuantizationMixin, InputOutputQuantizationRecordBase, + OutputQuantizationMixin, ConstantQuantizationRecordBase): + def gen_value(self, value): + return self.out_qs[0].get_quantized(value, container_is_quantized=self.constants_are_quantized) + + +class FilterQuantizationMixin(MultQuantizationRecord): + @property + def calc_q(self) -> MultQTypeBase: + return QType(bits=32, q=0, signed=True) + + @property + def acc_q(self) -> MultQTypeBase: + return QType(bits=32, q=0, signed=True) + + @property + def biases_q(self) -> SymmetricMultBiasesQType: + return self._info.get('biases_q') + + @property + def weights_q(self) -> MultQTypeBase: + return self._info.get('weights_q') + + @calc_q.setter + def calc_q(self, val: MultQTypeBase): + pass + + @acc_q.setter + def acc_q(self, val: MultQTypeBase): + pass + + @biases_q.setter + def biases_q(self, val: SymmetricMultBiasesQType): + self._info['biases_q'] = val + + @weights_q.setter + def weights_q(self, val: MultQTypeBase): + self._info['weights_q'] = val + + @staticmethod + def rescale(arr, from_scale, to_scale): + return np.floor((arr * from_scale/to_scale) + 0.5).astype(arr.dtype) + + +class MultScalableFilterQuantizationRecord(FilterQuantizationMixin, ScalableFilterQuantizationRecordBase): + def __init__(self, *args, + weights_q: MultQTypeBase = None, + biases_q: SymmetricMultBiasesQType = None, + mul_biases_q: Sequence[MultMulBiasQType] = None, + calc_q: QType = None, + acc_q: QType = None, + enable_prenorm=False, + info=None, + **kwargs): + super(MultScalableFilterQuantizationRecord, self).__init__(*args, info=info, **kwargs) + if info is None: + self._info['calc_q'] = calc_q + self._info['acc_q'] = acc_q + self._info['biases_q'] = biases_q + self._info['weights_q'] = weights_q + self._info['mul_biases_q'] = mul_biases_q + self._info['enable_prenorm'] = enable_prenorm + self.biases_q.link(self.weights_q, self.in_qs[0]) + + @property + def unwrap(self): + return self._unwrap + + @unwrap.setter + def unwrap(self, val): + self._unwrap = val + self.biases_q.link(self.weights_q, self.in_qs[0]) + + def compute_prenorm(self, params: FilterParameters): + if not self.enable_prenorm: + return 0 + max_bits = self.in_qs[0].bits - 1 + self.weights_q.bits - 1 + 1 + \ + math.ceil(math.log2(params.filter.in_c * params.filter.h * params.filter.w)) + spare_bits = 31 - max_bits + if self.mul_biases_q.dtype == np.int8: + bits = 7 + elif self.mul_biases_q.dtype 
== np.uint8: + bits = 8 + return max(0, bits - spare_bits) + + @property + def enable_prenorm(self) -> bool: + return self._info.get('enable_prenorm') + + @enable_prenorm.setter + def enable_prenorm(self, val: bool): + self._info['enable_prenorm'] = val + + @property + def mul_biases_q(self) -> MultMulBiasQType: + return self._info.get('mul_biases_q') + + @mul_biases_q.setter + def mul_biases_q(self, val: MultMulBiasQType): + self._info['mul_biases_q'] = val + + def get_quantized_bias_offset(self, params, weights): + # input zero correction is sum(W * Zin) by out_c if weights are channel scaled + axis = tuple([idx for idx in range(4) if idx != params.filter.get_order_idx('out_c')]) + return np.sum(np.multiply(self.in_qs[0].zero_point, + weights, + dtype=np.int32), + dtype=np.int32, + axis=axis) + + @property + def biases_zero_correction(self): + # output zero correction is So/(Si * Sw) * ZPo by out_c if weights are channel scaled + scale = self.out_qs[0].scale / (self.in_qs[0].scale * self.weights_q.scale) + return np.floor((self.out_qs[0].zero_point * scale) + 0.5).astype(np.int32) + + def prepare_biases(self, params: Parameters, biases: np.ndarray, + weights: np.ndarray, ktype: str = None) -> np.ndarray: + if ktype == 'float32': + return self.biases_q.get_dequantized(biases, + container_is_quantized=self.constants_are_quantized).astype(np.float32) + if ktype == 'symmetric': + return self.gen_biases(params, biases, weights) + raise ValueError() + + def prepare_weights(self, params: Parameters, + weights: np.ndarray, ktype: str = None) -> np.ndarray: + self.confirm_dimension(params.filter.get_order_idx('out_c'), 'weights_q') + if ktype == 'float32': + weights = self.weights_q.get_dequantized(weights, + container_is_quantized=self.constants_are_quantized) + return weights.astype(np.float32) + if ktype == 'symmetric': + return self.gen_weights(params, weights) + raise ValueError() + + def gen_weights(self, params: Parameters, weights: np.ndarray) -> np.ndarray: + return self.weights_q.get_quantized(weights, + container_is_quantized=self.constants_are_quantized) + + def gen_biases(self, params: Parameters, biases: np.ndarray, weights: np.ndarray) -> np.ndarray: + biases = self.biases_q.get_quantized( + biases, container_is_quantized=self.constants_are_quantized) + if self.in_qs[0].zero_point != 0: + biases -= self.get_quantized_bias_offset(params, + self.requantize(weights, 'weights_q')) + if self.out_qs[0].zero_point != 0: + biases += self.biases_zero_correction + return biases + + def gen_mul_biases(self, params: MultiplicativeBiasParameters) -> np.ndarray: + if isinstance(self.mul_biases_q, MultMulBiasQType): + self.mul_biases_q.pre_normalization = self.compute_prenorm(params) + return self.mul_biases_q.qbiases + + def apply_multiplicative_bias(self, params: FilterParameters, input_tensor: np.ndarray, + axis: int, ktype: str = None): + if ktype == 'float32': + return input_tensor + if ktype == 'symmetric': + if isinstance(self.mul_biases_q, MultMulBiasQType): + self.mul_biases_q.pre_normalization = self.compute_prenorm(params) + input_tensor = self.mul_biases_q.apply_scales(input_tensor, axis) + return input_tensor.astype(np.int32) diff --git a/tools/nntool/quantization/multiplicative/mult_quantizer.py b/tools/nntool/quantization/multiplicative/mult_quantizer.py new file mode 100644 index 000000000..3d13231f8 --- /dev/null +++ b/tools/nntool/quantization/multiplicative/mult_quantizer.py @@ -0,0 +1,212 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: 
you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging +from collections import OrderedDict + +import numpy as np + +from graph.nngraph import NNGraph +from graph.types import (ConstantInputParameters, Conv2DParameters, + ConvFusionParameters, FcParameters, InputParameters, + MatrixAddParameters, ActivationFusion, + MatrixBroadcastedLinearOpParameters, + MatrixSubParameters, MatScaleFusionParameters, + PoolingParameters, SoftMaxParameters, GlobalPoolParameters, + OutputParameters) +from quantization.multiplicative.mult_quantization import ( + MultAddQuantizationRecord, MultConstantQuantizationRecord, + MultQuantizationRecord, MultScalableFilterQuantizationRecord) +from quantization.multiplicative.symmetric.symmetric_mult_biases_qtype import \ + SymmetricMultBiasesQType +from quantization.multiplicative.symmetric.symmetric_mult_qtype import \ + SymmetricMultQType +from quantization.multiplicative.symmetric.mult_mulbias_qtype_new import MultMulBiasScaleQType +from quantization.quantization_set import QuantizationSet +from quantization.quantizer import Quantizer +from utils.json_serializable import JsonSerializable +from utils.node_id import NodeId, convert_keys_to_str, convert_str_to_keys + +LOG = logging.getLogger('nntool.' 
+ __name__) + +WIDTH_TO_DTYPE = { + 8: np.int8, + 16: np.int16 +} + + +class MultQuantizer(Quantizer, JsonSerializable): + def __init__(self, activation_stats, force_width=None, quantized_dimension=None, narrow_weights=False): + self._activation_stats = activation_stats + self._force_width = force_width + self._quantized_dimension = quantized_dimension + self._narrow_weights = narrow_weights + + # for tests + def __eq__(self, value): + return self._activation_stats == value._activation_stats and \ + self._force_width == value._force_width + + def _encapsulate(self): + return { + 'activation_stats': convert_keys_to_str(self._activation_stats), + 'force_width': self._force_width, + 'quantized_dimension': self._quantized_dimension + } + + @classmethod + def _dencapsulate(cls, val): + return MultQuantizer(convert_str_to_keys(val['activation_stats']), + val['force_width'], + val['quantized_dimension']) + + @staticmethod + def get_in_qs(G, edge_recs, node): + if isinstance(node, InputParameters): + in_qs = [] + else: + in_qs = [edge_recs[edge.params] + for edge in G.indexed_in_edges(node.name)] + return in_qs + + def get_quantized_dimension(self, node): + if self._quantized_dimension == 'tensor': + return None + elif self._quantized_dimension == 'channel': + return node.filter.get_order_idx('out_c') + return None + + def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None): + del G + if out_dtype is None: + out_dtype = dtype + if isinstance(node, (PoolingParameters, OutputParameters)): + o_q = in_qs[0] + elif isinstance(node, SoftMaxParameters): + o_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15)) + else: + o_q = SymmetricMultQType.from_min_max(min_val=astats['min'], + max_val=astats['max'], + dtype=out_dtype) + + if isinstance(node, (MatrixAddParameters, MatrixSubParameters)): + qrec = MultAddQuantizationRecord(in_qs=in_qs, out_qs=[o_q]) + + elif isinstance(node, (MatrixBroadcastedLinearOpParameters, MatScaleFusionParameters, GlobalPoolParameters)): + qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q]) + + elif isinstance(node, ConstantInputParameters): + qrec = MultConstantQuantizationRecord(out_qs=[o_q], + constants_are_quantized=False) + + elif isinstance(node, (FcParameters, Conv2DParameters)): + weights_q = SymmetricMultQType.from_array(arr=node.weights, + quantized_dimension=self.get_quantized_dimension(node), + dtype=dtype, narrow_range=self._narrow_weights) + if node.has_bias: + biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=weights_q.scale * in_qs[0].scale) + else: + biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=np.array([1], dtype=np.int32)) + mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, node) + qrec = MultScalableFilterQuantizationRecord(in_qs=[in_qs[0]], + out_qs=[o_q], + weights_q=weights_q, + biases_q=biases_q, + mul_biases_q=mul_biases_q, + constants_are_quantized=False) + LOG.debug("filter %s qrec %s", node.name, qrec) + else: + qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q]) + return qrec + + def quantize_fusion(self, G, node, in_qs, dtype): + result = OrderedDict() + fin_qs = in_qs + nodes = node.contained_nodes() + if node.fusion_type in ['conv_active_pool', 'conv_active']: + conv_node = nodes[0] + act_node = nodes[1] + act_astats = self._activation_stats.get(NodeId(node, act_node)) + conv_qrec = self.calculate_q(G, + conv_node, + act_astats, + fin_qs, + dtype, + out_dtype=np.int8) + result[NodeId(node, conv_node)] = conv_qrec + fin_qs = conv_qrec.out_qs + nodes = 
nodes[1:] + for fnode in nodes: + qrec = self.calculate_q(G, + fnode, + self._activation_stats.get(NodeId(node, fnode)), + fin_qs, + dtype) + result[NodeId(node, fnode)] = qrec + fin_qs = qrec.out_qs + return MultQuantizationRecord(in_qs=in_qs, out_qs=fin_qs), result + + def quantize_forward(self, G: NNGraph, edge_recs, dtype=np.int8, result=None): + if result is None: + result = QuantizationSet() + for node in [step['node'] for step in G.graph_state.steps]: + LOG.debug("quantize forward %s", node.name) + in_qs = self.get_in_qs(G, edge_recs, node) + if isinstance(node, (ConvFusionParameters, ActivationFusion)): + qrec, qrecs = self.quantize_fusion(G, node, in_qs, dtype) + for node_id, fqrec in qrecs.items(): + result[node_id] = fqrec + else: + qrec = self.calculate_q(G, + node, + self._activation_stats.get( + NodeId(node, None)), + in_qs, + dtype) + result[NodeId(node, None)] = qrec + if not qrec: + break + + for edges in G.indexed_out_edges(node.name): + for edge in edges: + edge_recs[edge.params] = qrec.out_qs[edge.from_idx] + return result + + def dequantize(self, G: NNGraph): + qrecs = G.quantization + LOG.info("dequantizing graph parameters") + for _, node, _, fnode in G.nodes_iterator(): + qrec = qrecs[NodeId(node, fnode)] + if isinstance(node, ConstantInputParameters): + node.value = qrec.out_q[0].dequantize(node.value) + else: + anode = node if fnode is None else fnode + if isinstance(anode, (FcParameters, Conv2DParameters)): + if anode.has_bias: + anode.biases = qrec.biases_q.dequantize(anode.biases) + anode.weights = qrec.weights_q.dequantize(anode.weights) + + def quantize(self, G: NNGraph) -> OrderedDict: + '''quantize the graph''' + if G.has_quantized_parameters: + self.dequantize(G) + G.has_quantized_parameters = False + G.quantization = None + edge_recs = {} + dtype = WIDTH_TO_DTYPE[self._force_width] + qrecs = self.quantize_forward(G, edge_recs, dtype) + qrecs['__quantizer'] = self + G.graph_identity.quantization_type = 'SQ8' + return qrecs diff --git a/tools/nntool/quantization/multiplicative/mult_utils.py b/tools/nntool/quantization/multiplicative/mult_utils.py new file mode 100644 index 000000000..3327ab74a --- /dev/null +++ b/tools/nntool/quantization/multiplicative/mult_utils.py @@ -0,0 +1,38 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
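
For orientation, a usage sketch of the quantizer defined above (hedged: this is not code from the patch; it assumes the nntool package is importable, that `G` is an already-imported NNGraph, and that `stats` holds per-node activation ranges keyed by NodeId, as collected elsewhere by nntool's statistics passes):

from quantization.multiplicative.mult_quantizer import MultQuantizer

# Hypothetical driver: only the constructor arguments and the quantize()
# call are taken from the class above, the surrounding setup is assumed.
quantizer = MultQuantizer(stats,                          # {'min': .., 'max': ..} per NodeId
                          force_width=8,                  # maps to np.int8 activations
                          quantized_dimension='channel',  # per-output-channel weight scales
                          narrow_weights=True)            # keep int8 weights in [-127, 127]
qrecs = quantizer.quantize(G)   # returns a QuantizationSet keyed by NodeId
G.quantization = qrecs          # attaching the result to the graph is assumed here
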
+ +import math + +import numpy as np + + +def spare_bits(params, dtype, bits): + iinfo = np.iinfo(dtype) + max_bits = math.ceil(math.log2(iinfo.max * params.filter.in_c * params.filter.h * params.filter.w)) + return bits - max_bits + + +def compute_mul_bias(scales): + factors = np.array([math.frexp(scale) for scale in scales], dtype=[("scale", "f4"), ("norm", "i1")]) + qmults = np.floor(factors['scale'] * math.pow(2, 7) + 0.5) + qnorms = -(factors["norm"] - 7) + qnorms[qmults >= 128] += 1 + qmults[qmults >= 128] = 64 + return qmults, qnorms + +def compute_32bit_mul_bias(scales): + qmults = np.floor(scales * math.pow(2, 32) + 0.5) + iinfo = np.iinfo(np.uint32) + return np.minimum(qmults, iinfo.max).astype(np.uint32) diff --git a/tools/nntool/quantization/multiplicative/symmetric/__init__.py b/tools/nntool/quantization/multiplicative/symmetric/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/multiplicative/symmetric/mult_mulbias_qtype_new.py b/tools/nntool/quantization/multiplicative/symmetric/mult_mulbias_qtype_new.py new file mode 100644 index 000000000..bb96cde06 --- /dev/null +++ b/tools/nntool/quantization/multiplicative/symmetric/mult_mulbias_qtype_new.py @@ -0,0 +1,265 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
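
The helpers in mult_utils.py above turn a real-valued rescale factor into an integer multiplier plus a right shift via math.frexp. A self-contained sketch of that idea (illustrative names only, an 8-bit unsigned multiplier as in compute_mul_bias, not the library function itself):

import math
import numpy as np

def scale_to_mult_shift(scales, mult_bits=8):
    # frexp splits each scale into m * 2**e with 0.5 <= m < 1
    mantissas, exps = zip(*(math.frexp(float(s)) for s in scales))
    qmults = np.floor(np.array(mantissas) * 2**mult_bits + 0.5)
    qnorms = -(np.array(exps) - mult_bits)        # right-shift amounts
    overflow = qmults >= 2**mult_bits             # rounding can reach 2**mult_bits
    qmults[overflow] //= 2                        # halve the multiplier ...
    qnorms[overflow] -= 1                         # ... and shift one bit less
    return qmults.astype(np.uint8), qnorms.astype(np.int8)

# (x_q * qmult) >> qnorm then approximates x_q * scale
qmults, qnorms = scale_to_mult_shift([0.00392, 0.37])   # -> [128, 189], [15, 9]
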
+ +import math +from abc import abstractmethod, abstractclassmethod + +import numpy as np + +from graph.types import FilterParameters +from quantization.multiplicative.symmetric.symmetric_mult_qtype import \ + SymmetricMultQType +from quantization.qtype import QType +from utils.at_norm import at_norm + +NUM_BITS = { + np.int8: 8, + np.int16: 16, + np.int32: 32, + np.uint8: 8, + np.uint16: 16, + np.uint32: 32, +} + +SIGNED = [ + np.int8, + np.int16, + np.int32 +] + + +class MultMulBiasQType(SymmetricMultQType): + @abstractmethod + def apply_scales(self, arr: np.ndarray, axis: int = None): + pass + + @property + def has_scale(self): + scale = self.scale + if isinstance(scale, np.ndarray): + return np.any(self.scale != 1) + return self.scale != 1 + + @abstractclassmethod + def from_filter(cls, in_q, weights_q, out_q, params, dtype=None): + pass + +class MultMulBiasScaleQType(MultMulBiasQType): + def __init__(self, *args, dtype=np.uint8, available_bits=8, init=None, **kwargs): + self._available_bits = available_bits + super(MultMulBiasScaleQType, self).__init__(*args, dtype=dtype, init=init, **kwargs) + if init: + self._info = init + + @classmethod + def from_filter(cls, in_q, weights_q, out_q, params, dtype=np.uint8): + available_bits = ( + 31 - (math.ceil(math.log2(params.filter.in_c * params.filter.h * params.filter.w)) + 7 + 7)) + qtype = cls(dtype=dtype, available_bits=available_bits) + qtype.scale = in_q.scale * weights_q.scale / out_q.scale + return qtype + + @property + def shift_ctype(self): + return "int8" + + @property + def shift_qtype(self): + return QType(q=0, bits=8, signed=True) + + @property + def qnorms(self): + return self._info['qnorms'] - self.pre_normalization + + @property + def qbiases(self): + return self._info['qbiases'] + + @property + def pre_normalization(self): + return self._info.get('pre_normalization') or 0 + + @pre_normalization.setter + def pre_normalization(self, val): + self._info['pre_normalization'] = val + + def _encapsulate(self): + return { + "qnorms": self.qnorms, + "qbiases": self.qbiases, + "scale": self.scale, + "pre_normalization": self.pre_normalization, + "dtype": self.dtype.__name__ + } + + @classmethod + def _dencapsulate(cls, val): + return cls(init={ + "qnorms": val['qnorms'], + "qbiases": val['qbiases'], + "scale": val['scale'], + "pre_normalization": val['pre_normalization'] if 'pre_normalization' in val else None, + "dtype": getattr(np, val['dtype']) + }) + + @property + def scale(self): + return self._info['scale'] + + @scale.setter + def scale(self, val): + if val is not None: + if not isinstance(val, np.ndarray): + val = np.array([val]) + assert np.all(val >= 0), "scale should be positive" + self._info['scale'] = val + self.compute_scales() + else: + self._info['scale'] = val + + def compute_scales(self): + if not self.has_scale: + return + if self.dtype == np.int8: + bits = min(7, self._available_bits) + elif self.dtype == np.uint8: + bits = min(8, self._available_bits) + max_val = math.pow(2, bits) + factors = np.array([math.frexp(scale) for scale in self.scale], + dtype=[("scale", "f4"), ("norm", "i1")]) + qbiases = np.floor(factors['scale'] * max_val + 0.5) + qnorms = -(factors["norm"] - bits) + overflow = qbiases >= max_val + qnorms[overflow] -= 1 + qbiases = np.where(overflow, qbiases // 2, qbiases) + self._info['qnorms'] = qnorms + self._info['qbiases'] = qbiases.astype(self.dtype) + + def apply_scales(self, arr: np.ndarray, axis: int = None): + if self.pre_normalization > 0: + arr = at_norm(arr, self.pre_normalization) + if not 
self.has_scale: + return arr + if axis is None: + mul_biases = self.qbiases + mul_biases_norm = self.qnorms + assert len(mul_biases) == 1 and len( + mul_biases_norm) == 1, "no axis set. should have single scale" + else: + shape = [len(self.qbiases) if idx == axis else 1 for idx in range(len(arr.shape))] + mul_biases = self.qbiases.reshape(shape) + mul_biases_norm = self.qnorms.reshape(shape) + return at_norm(np.multiply(arr, mul_biases, dtype=np.int32), mul_biases_norm) + + def str_by_chan(self, chan: int): + return "{}b>>{} {:0.3f}".format(self.bits, self.qnorms[chan], self.qbiases[chan]) + + def __str__(self): + qnorms = self.qnorms + if len(self.qnorms) == 1: + return "{}b>>{} {:0.3f}".format(self.bits, qnorms[0], self.qbiases[0]) + return "{}b>>chan".format(self.bits) + + +class MultFractionalMulBiasQType(MultMulBiasQType): + def __init__(self, *args, init=None, **kwargs): + kwargs['dtype'] = np.uint32 + # force to uint32 + super(MultFractionalMulBiasQType, self).__init__(*args, init=init, **kwargs) + if init: + self._info = init + + @classmethod + def from_filter(cls, in_q, weights_q, out_q, params, dtype=None): + return cls(scale=in_q.scale * weights_q.scale / out_q.scale) + + @property + def qnorms(self): + return self._info['qnorms'] + + @property + def qbiases(self): + return self._info['qbiases'] + + def _encapsulate(self): + return { + "qbiases": self.max_val.tolist() if self.qbiases else None, + "scale": self.scale.tolist() if self.scale else None, + "dtype": self.dtype.__name__ + } + + @classmethod + def _dencapsulate(cls, val): + return cls(None, None, init={ + "qbiases": cls.init_array(val, 'qbiases'), + "scale": cls.init_array(val, 'scale'), + "dtype": getattr(np, val['dtype']) + }) + + @property + def scale(self): + return self._info['scale'] + + @scale.setter + def scale(self, val): + if val is not None: + if not isinstance(val, np.ndarray): + val = np.array([val]) + assert np.all(val >= 0) and np.all(val <= 1), "scale should be positive and fractional" + self._info['scale'] = val + self.compute_scales() + else: + self._info['scale'] = val + + def compute_scales(self): + if not self.has_scale: + return + factors = np.array([math.frexp(scale) for scale in self.scale], + dtype=[("scale", "f4"), ("norm", "i1")]) + qbiases = np.floor(factors['scale'] * math.pow(2, 32) + 0.5) + qnorms = -(factors["norm"]) + overflow = qbiases >= math.pow(2, 32) + qnorms[overflow] -= 1 + qbiases[overflow] //= 2 + self._info['qnorms'] = qnorms + #qmults = np.floor(self.scale * math.pow(2, 32) + 0.5) + iinfo = np.iinfo(self.dtype) + self._info['qbiases'] = np.minimum(qbiases, iinfo.max).astype(self.dtype) + + def apply_scales(self, arr: np.ndarray, axis: int = None): + if not self.has_scale: + return arr.astype(np.int32) + if axis is None: + mul_biases = self.qbiases + mul_biases_norm = self.qnorms + assert len(mul_biases) == 1 and len( + mul_biases_norm) == 1, "no axis set. 
should have single scale" + else: + shape = [len(self.qbiases) if idx == axis else 1 for idx in range(len(arr.shape))] + mul_biases = self.qbiases.reshape(shape) + mul_biases_norm = self.qnorms.reshape(shape) + + #arr = np.multiply(arr, mul_biases, dtype=np.int64) >> 32 + arr = at_norm(np.multiply(arr, mul_biases, dtype=np.int64), 32 + mul_biases_norm) + return arr.astype(np.int32) + + def str_by_chan(self, chan: int): + return "{}b{:0.6f}".format(self.bits, self.qbiases[chan]) + + def __str__(self): + qbiases = self.qbiases + if len(self.qbiases) == 1: + return "{}b{:0.6f}".format(self.bits, qbiases[0]) + return "{}bchan".format(self.bits) diff --git a/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_biases_qtype.py b/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_biases_qtype.py new file mode 100644 index 000000000..8fa76f8de --- /dev/null +++ b/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_biases_qtype.py @@ -0,0 +1,58 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np +from quantization.multiplicative.symmetric.symmetric_mult_qtype import SymmetricMultQType + +# Has internal scale and external scale +# Internal scale is the set scale at creation. 
This is assumed to be the scale of any contained parameters +# The set in_q and weights_q is the scale that will be used when quantized bias is requested + +class SymmetricMultBiasesQType(SymmetricMultQType): + def __init__(self, *args, init=None, **kwargs): + self._set_scale = None + super(SymmetricMultBiasesQType, self).__init__(*args, init=init, **kwargs) + + def link(self, weights_q, in_q): + self._set_scale = weights_q.scale * in_q.scale + + @property + def scale(self): + return self._set_scale if self._set_scale is not None else super().scale + + @scale.setter + def scale(self, val): + self._info['scale'] = val + + def dequantize(self, arr): + return self.dequantize_at_scale(arr, super().scale) + + def get_dequantized(self, arr, container_is_quantized=True): + if container_is_quantized: + return self.dequantize_at_scale(arr, super().scale) + return arr + + def quantize(self, arr: np.array) -> np.array: + return self.quantize_at_scale(arr, super().scale) + + def get_quantized(self, arr: np.array, container_is_quantized=True) -> np.array: + if container_is_quantized: + if self._set_scale is not None and not np.array_equal(self._set_scale, super().scale): + return self.quantize_at_scale(self.dequantize_at_scale(arr, super().scale), self._set_scale) + return arr + else: + if self._set_scale is not None and not np.array_equal(self._set_scale, super().scale): + return self.quantize_at_scale(arr, self._set_scale) + return self.quantize_at_scale(arr, super().scale) diff --git a/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype.py b/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype.py new file mode 100644 index 000000000..bcbd33db7 --- /dev/null +++ b/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype.py @@ -0,0 +1,232 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
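
SymmetricMultBiasesQType above keeps biases at the scale they were stored with, but hands them out re-quantized at in_scale * weights_scale, the scale the int32 accumulator works in. A standalone numpy sketch of that re-quantization step (hypothetical names, not the class method itself):

import numpy as np

def requantize_biases(q_biases, stored_scale, in_scale, weights_scale):
    target_scale = in_scale * weights_scale            # accumulator scale
    real_biases = q_biases.astype(np.float32) * stored_scale
    return np.floor(real_biases / target_scale + 0.5).astype(np.int32)

q_b = np.array([130, -260], dtype=np.int32)            # stored at scale 0.001
print(requantize_biases(q_b, 0.001, in_scale=0.02, weights_scale=0.005))
# -> [ 1300 -2600]: the same real values 0.13 and -0.26 expressed at scale 1e-4
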
+ +import math + +import numpy as np + +from quantization.multiplicative.mult_qtype_base import MultQTypeBase + +VALID_DTYPES = [ + np.int8, + np.int16, + np.int32 +] + + +class SymmetricMultQType(MultQTypeBase): + def __init__(self, *args, narrow_range=False, init=None, **kwargs): + super(SymmetricMultQType, self).__init__(*args, init=init, **kwargs) + if init is None: + self._info['narrow_range'] = narrow_range + + SYMMETRIC_UINT = { + np.uint8: (128, np.int8) + } + + def dtype_is_valid(self): + return self.dtype in VALID_DTYPES + + def verify(self, arr, dimension): + """Verify that any 0 filters have a 1 scale""" + dq_arr = self.dequantize(arr) + axes = tuple(axis for axis in range(len(dq_arr.shape)) if axis != dimension) + rmin = np.min(dq_arr, axis=axes) + rmax = np.min(dq_arr, axis=axes) + self.scale = np.where((rmin == 0) & (rmax == 0), 1, self.scale) + + @classmethod + def from_tflite(cls, tf_qps, dtype): + res = cls() + res.min_val = tf_qps.MinAsNumpy() if tf_qps.MinLength() > 0 else None + res.max_val = tf_qps.MaxAsNumpy() if tf_qps.MaxLength() > 0 else None + if res.min_val is not None and res.max_val is not None \ + and np.all(np.abs(res.min_val) == res.max_val): + res.narrow_range = True + res.scale = tf_qps.ScaleAsNumpy() if tf_qps.ScaleLength() > 0 else None + res.quantized_dimension = tf_qps.QuantizedDimension() + if dtype in cls.SYMMETRIC_UINT: + zero_point, signed_dtype = cls.SYMMETRIC_UINT[dtype] + assert np.all(tf_qps.ZeroPointAsNumpy() == zero_point) + res.dtype = signed_dtype + else: + res.dtype = dtype + return res + + @classmethod + def from_array(cls, arr: np.ndarray, dtype=np.int8, + quantized_dimension=None, narrow_range=False): + + if quantized_dimension is not None: + axes = tuple(axis for axis in range(len(arr.shape)) if axis != quantized_dimension) + else: + axes = None + rmin = np.min(arr, axis=axes) + rmax = np.max(arr, axis=axes) + return cls.from_min_max(rmin, rmax, dtype=dtype, + quantized_dimension=quantized_dimension, + narrow_range=narrow_range) + + @classmethod + def from_min_max(cls, min_val, max_val, dtype=np.int8, quantized_dimension=None, narrow_range=False): + val = cls(min_val=min_val, max_val=max_val, + quantized_dimension=quantized_dimension, dtype=dtype, + narrow_range=narrow_range) + iinfo = np.iinfo(dtype) + + if narrow_range: + ranges = iinfo.max - (iinfo.min + 1) + else: + ranges = iinfo.max - iinfo.min + val.scale = val.range / ranges + return val + + def scale_to_pow2(self): + # closest above pow2 + self.scale = 2**np.ceil(np.log2(self.scale)) + + def _encapsulate(self): + return { + "min_val": self.min, + "max_val": self.max, + "scale": self.scale, + "dim": self.quantized_dimension, + "narrow_range": self.narrow_range, + "dtype": self.dtype.__name__ + } + + @classmethod + def _dencapsulate(cls, val): + return cls(init={ + "min_val": val['min_val'], + "max_val": val['max_val'], + "scale": val['scale'], + "quantized_dimension": val['dim'] if 'dim' in val else None, + "narrow_range": val['narrow_range'], + "dtype": getattr(np, val['dtype']) + }) + + @property + def narrow_range(self): + return self._info.get('narrow_range') + + @narrow_range.setter + def narrow_range(self, val): + self._info['narrow_range'] = val + + @property + def min(self): + if self.max is None: + return None + if (not self.narrow_range) and (self.max_val != self.min_val).all(): + dtype_info = np.iinfo(self.dtype) + max_calc = - np.abs(self.max_val) * np.abs(dtype_info.min) / dtype_info.max + min_val_is_min = np.less_equal(self.max_val * dtype_info.max / 
np.abs(dtype_info.min), np.abs(self.min_val)) + #min_val_is_min = np.less(self.max_val, np.abs(self.min_val)) + max_calc = np.where(min_val_is_min, self.min_val, max_calc) + return max_calc + return self.max * -1 + + @property + def max(self): + if self.min_val is None or self.max_val is None: + return None + if (not self.narrow_range) and (self.max_val != self.min_val).all(): + dtype_info = np.iinfo(self.dtype) + max_calc = np.abs(self.min_val) * dtype_info.max / np.abs(dtype_info.min) + max_val_is_max = np.greater(self.max_val * dtype_info.max / np.abs(dtype_info.min), np.abs(self.min_val)) + #max_val_is_max = np.greater_equal(self.max_val, np.abs(self.min_val)) + max_calc = np.where(max_val_is_max, self.max_val, max_calc) + return max_calc + + max_calc = np.maximum(np.abs(self.min_val), np.abs(self.max_val)) + max_calc[max_calc == 0] = 1 + return max_calc + + @property + def zero_point(self): + return np.array([0]) + + @zero_point.setter + def zero_point(self, val): + raise ValueError() + + def get_quantized_scale(self): + max_val = math.pow(2, 8) + factors = np.array([math.frexp(scale) for scale in self.scale], + dtype=[("scale", "f4"), ("norm", "i1")]) + qscales = np.floor(factors['scale'] * max_val + 0.5) + qnorms = -factors["norm"] + overflow = qscales >= max_val + qnorms[overflow] -= 1 + qscales = np.where(overflow, qscales // 2, qscales) + return qscales.astype(np.uint8), qnorms + + def dequantize_at_scale(self, arr: np.array, scale: np.array) -> np.array: + arr = arr.astype(np.float32) + if len(scale) > 1 and arr.shape != scale.shape: + return arr * scale.reshape(tuple(size if idx == self.quantized_dimension else 1 + for idx, size in enumerate(arr.shape))) + return arr * scale + + def quantize_at_scale(self, arr: np.array, scale: np.array) -> np.array: + if len(scale) > 1 and arr.shape != scale.shape: + arr = np.floor(0.5 + arr * 1 / scale.reshape(tuple(size if idx == self.quantized_dimension else 1 + for idx, size in enumerate(arr.shape)))) + else: + arr = np.floor(arr/scale + 0.5) + return self.clip(arr, narrow_range=self.narrow_range) + + def quantize(self, arr: np.array) -> np.array: + return self.quantize_at_scale(arr, self.scale) + + def dequantize(self, arr: np.array) -> np.array: + return self.dequantize_at_scale(arr, self.scale) + + def get_dequantized(self, arr, container_is_quantized=True): + if container_is_quantized: + return self.dequantize_at_scale(arr, self.scale) + return arr + + def get_quantized(self, arr: np.array, container_is_quantized=True) -> np.array: + if not container_is_quantized: + return self.quantize_at_scale(arr, self.scale) + return arr + + def str_by_chan(self, chan: int): + if self.min is None and self.max is None: + return "{}*{}".format( + self.dtype_str, + self.str_fmt(self.scale[chan], extend=True), + ) + return "{}<{}*{}<{}".format( + self.str_fmt(self.min[chan]), + self.dtype_str, + self.str_fmt(self.scale[chan], extend=True), + self.str_fmt(self.max[chan]), + ) + + def __str__(self): + if self.min is None and self.max is None: + return "{}*{}".format( + self.dtype_str, + self.str_fmt(self.scale, extend=True), + ) + return "{}<{}*{}<{}".format( + self.str_fmt(self.min), + self.dtype_str, + self.str_fmt(self.scale, extend=True), + self.str_fmt(self.max), + ) diff --git a/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype_wrapper.py b/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype_wrapper.py new file mode 100644 index 000000000..6476bca77 --- /dev/null +++ 
b/tools/nntool/quantization/multiplicative/symmetric/symmetric_mult_qtype_wrapper.py @@ -0,0 +1,123 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import math + +import numpy as np + +from quantization.multiplicative.symmetric.symmetric_mult_qtype import SymmetricMultQType +from quantization.multiplicative.asymmetric.asymmetric_mult_qtype import AsymmetricMultQType +from quantization.multiplicative.mult_qtype_base import WrapperMixin + +VALID_DTYPES = [ + np.int8, + np.int16, + np.int32 +] + +UNSIGNED_TO_SIGNED = { + np.uint8: np.int8, + np.uint16: np.int16, + np.uint32: np.int32, + np.int8: np.int8, + np.int16: np.int16, + np.int32: np.int32, +} + +class SymmetricMultQTypeWrapper(WrapperMixin, SymmetricMultQType): + def __init__(self, wrapped, *args, scale=None, dtype=None, **kwargs): + # need to set wrapped before constructor + self._wrapped = wrapped + super(SymmetricMultQTypeWrapper, self).__init__(*args, **kwargs) + self._info['scale'] = scale + self._info['dtype'] = dtype + + def dtype_is_valid(self): + return self.dtype in VALID_DTYPES + + def _encapsulate(self): + #pylint: disable=protected-access + return { + "wrapped": AsymmetricMultQType._encapsulate(self), + "scale": self.scale, + "dtype": self.dtype.__name__ if self.dtype is not None else None + } + + @classmethod + def _dencapsulate(cls, val): + #pylint: disable=protected-access + dtype = getattr(np, val['dtype']) if val['dtype'] is not None else None + return cls(AsymmetricMultQType._dencapsulate(val['wrapped']), scale=val['scale'], dtype=dtype) + + @property + def min_val(self): + return self._wrapped.min_val + + @min_val.setter + def min_val(self, val): + pass + + @property + def max_val(self): + return self._wrapped.max_val + + @max_val.setter + def max_val(self, val): + pass + + @property + def scale(self): + if self._info['scale'] is None: + # return the bits from the wrapped type. The dtype is only set + # by the importer to keep the container size between layers + # fused with activations and should not change the scale of the + # output. 
+ if self.range is None: + return None + return self.range / (math.pow(2, self._wrapped.bits) - 1) + return self._info['scale'] + + def get_dequantized(self, arr, container_is_quantized=True): + if container_is_quantized: + return self._wrapped.dequantize(arr) + return arr + + def get_quantized(self, arr: np.array, container_is_quantized=True) -> np.array: + if container_is_quantized: + return self.quantize(self._wrapped.dequantize(arr)) + return self.quantize(arr) + + @scale.setter + def scale(self, val): + # scale can be overriden (this is the case in biases for example) + self._info['scale'] = val + + @property + def quantized_dimension(self): + return self._wrapped.quantized_dimension + + @quantized_dimension.setter + def quantized_dimension(self, val): + self._wrapped.quantized_dimension = val + + @property + def dtype(self): + if self._info['dtype'] is not None: + return self._info['dtype'] + return UNSIGNED_TO_SIGNED[self.wrapped.dtype] + + @dtype.setter + def dtype(self, val): + self._info['dtype'] = val diff --git a/tools/nntool/quantization/qtype.py b/tools/nntool/quantization/qtype.py index 40f0bbbdb..55516ce98 100644 --- a/tools/nntool/quantization/qtype.py +++ b/tools/nntool/quantization/qtype.py @@ -16,6 +16,7 @@ import numpy as np from utils.json_serializable import JsonSerializable +from utils.at_norm import at_norm from .qtype_base import QTypeBase @@ -32,6 +33,16 @@ } } +DTYPES = { + np.uint8: (8, False), + np.uint16: (16, False), + np.uint32: (32, False), + np.int8: (8, True), + np.int16: (16, True), + np.int32: (32, True), +} + + def get_dtype(length, signed): if signed: return np.dtype("i"+str(int(length))) @@ -51,10 +62,22 @@ def normalize(obj, n_bits): return obj if n_bits < 0: return obj << -n_bits - return obj >> n_bits + return at_norm(obj, n_bits) + +def calc_int_bits(arr, signed=True): + abs_num = np.floor(np.abs(arr)) + # calculate number of bits to represent absolute number + if signed: + if abs_num == 0: + return 1 + return np.ceil(np.log(abs_num) / np.log(2)) + 1 + else: + if abs_num == 0: + return 0 + return np.ceil(np.log(abs_num) / np.log(2)) class QType(QTypeBase, JsonSerializable): - def __init__(self, *args, bits=None, q=None, signed=None): + def __init__(self, *args, bits=None, q=None, signed=None, dtype=None): if args: if isinstance(args[0], QType): proto = args[0] @@ -74,13 +97,19 @@ def __init__(self, *args, bits=None, q=None, signed=None): if signed is not None: self._quant[2] = signed + + if dtype is not None: + self._quant[0], self._quant[2] = DTYPES[dtype] def _encapsulate(self): return self._quant @classmethod def _dencapsulate(cls, val): - return QType(*val) + try: + return QType(*val) + except Exception as ex: + x = 0 def increase_precision(self): return QType(self.bits * 2, self.q, self.signed) @@ -136,15 +165,16 @@ def signed(self): def signed(self, val): self._quant[2] = val + @property + def pad_zero_point(self): + return 0 + def double_precision(self): return QType(self.bits * 2, self.q, self.signed) def quantize(self, arr): arr = np.floor((arr * 2.0 ** self.q) + 0.5) - max_value = 2**(self.bits - 1) - 1 - min_value = -max_value - 1 - arr = np.clip(arr, min_value, max_value) - return np.array(arr, copy=True, dtype=self.dtype) + return self.clip(arr) def dequantize(self, arr): return arr / (2.0**self.q) @@ -153,26 +183,19 @@ def expand_normalize(self, arr: np.ndarray, cur_qtype: 'QType'): assert cur_qtype.length <= self.length, "must expand into something bigger" return normalize(arr.astype(self.dtype), cur_qtype.q - self.q) - def 
clip(self, arr: np.array, change_type=True): - min_v, max_v = max_min(self.bits, self.signed) - ret = np.clip(arr, min_v, max_v) - if change_type: - ret = ret.astype(self.dtype) - return ret - def round_normalize(self, arr, cur_qtype: 'QType'): scale = cur_qtype.q - self.q # arr = arr + (1<<(scale - 1)) arr = normalize(arr, scale) return arr - def round_normalize_clip(self, arr, from_qtype, change_type=True): + def round_normalize_clip(self, arr, from_qtype): to_qtype = self scale = from_qtype.q - to_qtype.q # if scale > 0: # arr = arr + (1<<(scale - 1)) arr = normalize(arr, scale) - arr = self.clip(arr, change_type) + arr = self.clip(arr) return arr def expand_from(self, arr, from_qtype): @@ -185,8 +208,12 @@ def __add__(self, other): return QType(max(self.bits, other.bits), self.q + other.q, self.signed or other.signed) def __eq__(self, other): - return self.q == other.q and\ - self.bits == other.bits and self.signed == other.signed + if isinstance(other, QType): + return self.q == other.q and\ + self.bits == other.bits and self.signed == other.signed + return other.__eq__(self) def __str__(self): + if self.q > self.bits: + return "M{}>>{}".format(self.bits, self.q) return "Q{}.{}".format(self.bits - self.q, self.q) diff --git a/tools/nntool/quantization/qtype_base.py b/tools/nntool/quantization/qtype_base.py index 69d495c4b..3c474382a 100644 --- a/tools/nntool/quantization/qtype_base.py +++ b/tools/nntool/quantization/qtype_base.py @@ -13,25 +13,45 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod, abstractproperty -from numpy import array +import numpy as np class QTypeBase(ABC): @abstractmethod - def quantize(self, arr: array) -> array: + def quantize(self, arr: np.ndarray) -> np.ndarray: pass @abstractmethod - def dequantize(self, arr: array) -> array: + def dequantize(self, arr: np.ndarray) -> np.ndarray: pass - @abstractmethod - def expand_from(self, arr: array, from_qtype: 'QTypeBase') -> array: + @abstractproperty + def dtype(self): pass - @abstractmethod - def reduce_from(self, arr: array, from_qtype: 'QTypeBase') -> array: + @abstractproperty + def q(self) -> int: pass + + @abstractproperty + def bits(self) -> int: + pass + + @abstractproperty + def signed(self) -> bool: + pass + + @abstractproperty + def pad_zero_point(self) -> int: + pass + + def clip(self, arr: np.ndarray, dtype=None, narrow_range=False): + if dtype is None: + dtype = self.dtype + iinfo = np.iinfo(dtype) + qmax = iinfo.max + qmin = iinfo.min + (1 if narrow_range else 0) + return np.minimum(np.maximum(arr, qmin), qmax).astype(dtype) diff --git a/tools/nntool/quantization/quantization_record_base.py b/tools/nntool/quantization/quantization_record_base.py new file mode 100644 index 000000000..7e3cf6963 --- /dev/null +++ b/tools/nntool/quantization/quantization_record_base.py @@ -0,0 +1,202 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +#pylint: disable=abstract-method + +from abc import abstractmethod, abstractproperty +from typing import Sequence + +import numpy as np + +from graph.types import Parameters, MultiplicativeBiasParameters +from utils.json_serializable import JsonSerializable + +from quantization.qtype_base import QTypeBase +from quantization.multiplicative.mult_qtype_base import WrapperMixin + + +class QuantizationRecordBase(JsonSerializable): + def __init__(self, info=None): + self._unwrap = False + if info is None: + self._info = {} + else: + self._info = info + + @property + def unwrap(self): + return self._unwrap + + @unwrap.setter + def unwrap(self, val): + self._unwrap = val + + def unwrapped(self, val): + if self.unwrap: + if isinstance(val, list): + return [self.unwrapped(v) for v in val] + if isinstance(val, WrapperMixin): + return val.wrapped + return val + + def _encapsulate(self): + return self._info + + @classmethod + def _dencapsulate(cls, val): + return cls(info=val) + + def __eq__(self, value): + # pylint: disable=protected-access + return value._info == self._info + + @staticmethod + def ql_str(l): + return ",".join([str(qtype) for qtype in l]) + + def __hash__(self): + return object.__hash__(self) + +class InputQuantizationRecordBase(QuantizationRecordBase): + + def __init__(self, *args, in_qs: QTypeBase = None, info=None, **kwargs): + super(InputQuantizationRecordBase, self).__init__(*args, info=info, **kwargs) + if info is None: + self._info['in_qs'] = in_qs + + @abstractmethod + def prepare_inputs(self, + params: Parameters, + input_tensors: Sequence[np.ndarray], ktype: str = None) -> Sequence[np.ndarray]: + """Prepares the inputs before calculation""" + + @property + def in_qs(self) -> Sequence[QTypeBase]: + if 'in_qs' in self._info: + return self.unwrapped(self._info['in_qs']) + return None + + @in_qs.setter + def in_qs(self, value: Sequence[QTypeBase]): + self._info['in_qs'] = value + + @abstractproperty + def auto_quantize_inputs(self): + """Do whatever is necessary to quantize float inputs""" + + @abstractmethod + @auto_quantize_inputs.setter + def auto_quantize_inputs(self, val): + pass + + def __str__(self): + return "i:({})".format(self.ql_str(self.in_qs)) + + +class OutputQuantizationRecordBase(QuantizationRecordBase): + + def __init__(self, *args, out_qs: QTypeBase = None, info=None, **kwargs): + super(OutputQuantizationRecordBase, self).__init__(*args, info=info, **kwargs) + if info is None: + self._info['out_qs'] = out_qs + + @abstractmethod + def get_outputs(self, + params: Parameters, + output_tensors: Sequence[np.ndarray], + ktype: str = None) -> Sequence[np.ndarray]: + """Produces the output from the result""" + + @property + def out_qs(self) -> Sequence[QTypeBase]: + if 'out_qs' in self._info: + return self.unwrapped(self._info['out_qs']) + return None + + @out_qs.setter + def out_qs(self, value: Sequence[QTypeBase]): + self._info['out_qs'] = value + + @abstractproperty + def auto_dequantize_outputs(self): + """Do whatever is necessary to dequantize outputs to float""" + + @abstractmethod + @auto_dequantize_outputs.setter + def auto_dequantize_outputs(self, val): + pass + + def __str__(self): + return "o:({})".format(self.ql_str(self.out_qs)) + + +class HasConstantsBase(OutputQuantizationRecordBase): + def __init__(self, *args, constants_are_quantized: bool = True, info=None, **kwargs): + super(HasConstantsBase, self).__init__(*args, info=info, 
**kwargs) + if info is None: + self._info['constants_are_quantized'] = constants_are_quantized + + @property + def constants_are_quantized(self) -> bool: + return self._info['constants_are_quantized'] + + +class ConstantQuantizationRecordBase(HasConstantsBase): + + @abstractmethod + def gen_value(self, value) -> np.ndarray: + """does whatever is necessary to return real value that can be dumped""" + +class InputOutputQuantizationRecordBase(InputQuantizationRecordBase, OutputQuantizationRecordBase): + + def __init__(self, *args, info=None, **kwargs): + super(InputOutputQuantizationRecordBase, self).__init__(*args, info=info, **kwargs) + + def __str__(self): + return "i:({}) o:({})".format(self.ql_str(self.in_qs), self.ql_str(self.out_qs)) + + +class FilterQuantizationRecordBase(InputOutputQuantizationRecordBase, HasConstantsBase): + """The base FilterQuantization record does not have weights_q and biases_q since + they may not be needed. It does however have properties for code generation. + __str__ needs to be overidden if there is more info to print about these. So it + has no constructor by default.""" + + @abstractmethod + def gen_weights(self, params: Parameters, weights) -> np.ndarray: + """does whatever is necessary to return real weights that can be dumped""" + + @abstractmethod + def gen_biases(self, params: Parameters, biases: np.ndarray, weights: np.ndarray) -> np.ndarray: + """does whatever is necessary to return real biases that can be dumped""" + + @abstractmethod + def prepare_weights(self, params, weights: np.ndarray, ktype: str = None) -> np.ndarray: + """Prepares the weights before calculation""" + + @abstractmethod + def prepare_biases(self, params, biases: np.ndarray, weights: np.ndarray, ktype: str = None) -> np.ndarray: + """Prepares the biases before calculation""" + +class ScalableFilterQuantizationRecordBase(FilterQuantizationRecordBase): + + @abstractmethod + def gen_mul_biases(self, params: MultiplicativeBiasParameters) -> np.ndarray: + """Returns the actual multiplicative biases for the filter""" + + @abstractmethod + def apply_multiplicative_bias(self, params: Parameters, input_tensor: np.ndarray, axis: int, ktype: str = None): + """Applies the multiplicative bias during an internel kernel operation on axis of tensor. The mulbias + may be a function of the quantization or may be in the params.""" diff --git a/tools/nntool/quantization/quantization_set.py b/tools/nntool/quantization/quantization_set.py new file mode 100644 index 000000000..b250f46ad --- /dev/null +++ b/tools/nntool/quantization/quantization_set.py @@ -0,0 +1,130 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
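
ScalableFilterQuantizationRecordBase above only declares apply_multiplicative_bias; in the multiplicative (SQ8) records it comes down to a per-output-channel integer multiply followed by a right shift along one axis of the int32 accumulator, as in MultMulBiasScaleQType.apply_scales earlier. An illustrative numpy sketch (the real code uses the at_norm() helper, whose exact rounding is not reproduced here):

import numpy as np

def apply_channel_scales(acc, qbiases, qnorms, axis):
    # broadcast the per-channel multipliers and shifts along `axis`
    shape = [-1 if ax == axis else 1 for ax in range(acc.ndim)]
    scaled = np.multiply(acc, qbiases.reshape(shape), dtype=np.int32)
    return scaled >> qnorms.reshape(shape)       # plain arithmetic shift here

acc = np.arange(6, dtype=np.int32).reshape(2, 3) * 1000   # fake accumulator
out = apply_channel_scales(acc,
                           np.array([200, 73]),   # one multiplier per channel
                           np.array([8, 9]),      # one shift per channel
                           axis=0)
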
+ +from collections.abc import MutableMapping +from copy import deepcopy +from typing import Sequence + +from graph.types import Parameters +from utils.json_serializable import JsonSerializable +from utils.node_id import NodeId, convert_keys_to_str, convert_str_to_keys +from quantization.quantization_record_base import QuantizationRecordBase + +class QuantizationSet(MutableMapping, JsonSerializable): + def __init__(self, *args, unwrap=False, init: dict = None, **kwargs): + super(QuantizationSet, self).__init__(*args, **kwargs) + if init is None: + self._init = { + 'unwrap': unwrap, + 'qset': {} + } + else: + self._init = init + + @property + def qset(self): + return self._init['qset'] + + def __delitem__(self, key): + del self.qset[key] + + def __getitem__(self, key): + item = self.qset[key] + if self.unwrap: + item.unwrap = self.unwrap + return item + + def __iter__(self): + return self.qset.__iter__() + + def __len__(self): + return len(self.qset) + + def __setitem__(self, key, item): + self.qset[key] = item + + def _encapsulate(self): + return convert_keys_to_str(self._init) + + def sorted_iterator(self, G): + node_ids = [NodeId(pnode, fnode) for _, pnode, _, fnode in G.nodes_iterator()] + return [(nid, self.qset[nid]) if nid in self.qset else (nid, None) for nid in node_ids] + + @classmethod + def _dencapsulate(cls, val): + return cls(init=convert_str_to_keys(val)) + + @property + def unwrap(self): + return self._init['unwrap'] + + @unwrap.setter + def unwrap(self, val): + self._init['unwrap'] = val + + def verify_quantization(self, G): + """Verify that all nodes have a quantization record""" + return all(NodeId(pnode, fnode) in self.qset for _, pnode, _, fnode in G.nodes_iterator()) + + def get_all(self, nodes: Sequence[Parameters]) -> Sequence[QuantizationRecordBase]: + """Get all the quantization records for a sequence of nodes""" + if self.all_have_quantization(nodes): + return [self.qset[NodeId(node)] for node in nodes] + return None + + def all_have_quantization(self, nodes: Sequence[Parameters]) -> bool: + """Check that a sequence of nodes all have quantization records""" + return all(NodeId(node) in self.qset for node in nodes) + + def propagate(self, G, from_node, to_node, from_idx=None, qtype=None): + """propagate the output quantization of from_node's output + all the way to to_node's output""" + if qtype is None: + assert from_idx is not None + qtype = self.qset[NodeId(from_node)].out_qs[from_idx] + for edge in G.out_edges(from_node.name): + if from_idx is None or edge.from_idx == from_idx: + transit_node_qrec = self.qset[NodeId(edge.to_node)] + transit_node_qrec.in_qs[edge.to_idx] = deepcopy(qtype) + transit_node_qrec.out_qs = [deepcopy(qtype)] * len(transit_node_qrec.out_qs) + if edge.to_node != to_node: + self.propagate(G, edge.to_node, to_node, qtype=qtype) + + def move_to_fusion(self, node: Parameters, new_pnode: Parameters): + nid = NodeId(node) + if nid in self.qset: + self.qset[NodeId(new_pnode, node)] = self.qset[nid] + del self.qset[nid] + + def move_to_node(self, node: Parameters, new_pnode: Parameters): + nid = NodeId(node) + if nid in self.qset: + self.qset[NodeId(new_pnode)] = self.qset[nid] + del self.qset[nid] + + def copy_to_fusion(self, node: Parameters, new_pnode: Parameters, new_fnode: Parameters): + nid = NodeId(node) + if nid in self.qset: + self.qset[NodeId(new_pnode, new_fnode)] = deepcopy(self.qset[nid]) + + def copy_to_node(self, node: Parameters, new_pnode: Parameters): + nid = NodeId(node) + if nid in self.qset: + self.qset[NodeId(new_pnode)] = 
deepcopy(self.qset[nid]) + + def remove_node(self, node: Parameters): + nid = NodeId(node) + if nid in self.qset: + del self.qset[nid] diff --git a/tools/nntool/quantization/quantizer.py b/tools/nntool/quantization/quantizer.py index d168e16ef..53d7ea480 100644 --- a/tools/nntool/quantization/quantizer.py +++ b/tools/nntool/quantization/quantizer.py @@ -15,12 +15,12 @@ from abc import ABC, abstractmethod -from typing import Mapping from graph.nngraph import NNGraph -from graph.types import Parameters -from .quantization_record import QuantizationRecord +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.quantization_set import QuantizationSet + class Quantizer(ABC): @abstractmethod - def quantize(self, G: NNGraph) -> Mapping[Parameters, QuantizationRecord]: + def quantize(self, G: NNGraph) -> QuantizationSet: pass diff --git a/tools/nntool/quantization/symmetric/__init__.py b/tools/nntool/quantization/symmetric/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/symmetric/kernels/__init__.py b/tools/nntool/quantization/symmetric/kernels/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/quantization/symmetric/kernels/activations.py b/tools/nntool/quantization/symmetric/kernels/activations.py new file mode 100644 index 000000000..84699a327 --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/activations.py @@ -0,0 +1,176 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
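
The reworked Quantizer base class above now requires quantize(G) to return a QuantizationSet keyed by NodeId rather than a plain mapping. A minimal illustrative subclass showing just that contract (it assumes the nntool imports resolve; this is a stub for exposition, not a real quantizer):

from quantization.quantizer import Quantizer
from quantization.quantization_set import QuantizationSet
from utils.node_id import NodeId

class NullQuantizer(Quantizer):
    """Stub: registers an empty record per node so key lookups succeed."""
    def quantize(self, G) -> QuantizationSet:
        qrecs = QuantizationSet()
        for _, pnode, _, fnode in G.nodes_iterator():
            qrecs[NodeId(pnode, fnode)] = None   # a real quantizer builds qrecs here
        return qrecs
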
+ +import numpy as np + +from quantization.multiplicative.mult_quantization import \ + MultQuantizationRecord +from quantization.qtype import QType +from quantization.quantization_record_base import QuantizationRecordBase +from utils.at_norm import at_norm + +FORCE_RELU = False + +def leaky(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + raise NotImplementedError() + + +def sigmoid(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + if isinstance(qrec, MultQuantizationRecord): + raise NotImplementedError() + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + dqinput = qrec.in_qs[0].dequantize(in_tensor) + return qrec.get_outputs(params, [qrec.out_qs[0].quantize(1/(1 + np.exp(-dqinput)))], ktype="symmetric") + + +def relu_mult(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + qrec.set_scale() + relu_lb = qrec.in_qs[0].quantize(params.lower_bound) + in_tensor = np.maximum(in_tensor, relu_lb) + if params.upper_bound is not None and not FORCE_RELU: + relu_ub = qrec.in_qs[0].quantize(params.upper_bound) + in_tensor = np.minimum(in_tensor, relu_ub) + in_tensor = qrec.scale_mul_biases_q.apply_scales(in_tensor) + if qrec.out_qs[0] != qrec.in_qs[0]: + return qrec.get_outputs(params, [qrec.out_qs[0].reduce_from(in_tensor, qrec.in_qs[0])], ktype="symmetric") + return qrec.get_outputs(params, [in_tensor], ktype="symmetric") + + +def relu(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + + if isinstance(qrec, MultQuantizationRecord): + return relu_mult(params, in_tensors, qrec, details=details) + + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + + relu_lb = qrec.in_qs[0].quantize(params.lower_bound) + in_tensor = np.maximum(in_tensor, relu_lb) + if params.upper_bound is not None: + relu_ub = qrec.in_qs[0].quantize(params.upper_bound) + in_tensor = np.minimum(in_tensor, relu_ub) + + if qrec.out_qs[0] != qrec.in_qs[0]: + return qrec.get_outputs(params, [qrec.out_qs[0].reduce_from(in_tensor, qrec.in_qs[0])], ktype="symmetric") + return qrec.get_outputs(params, [in_tensor], ktype="symmetric") + + +def hsigmoid_mult_gen_factors(params, qrec): + in_q = qrec.in_qs[0] + fac_1 = in_q.quantize(np.array([params.offset])) + qrec.set_scale(extra_scale=1/6) + upper_bound = in_q.quantize([6.]) + lower_bound = in_q.quantize([0.]) + return fac_1, upper_bound, lower_bound + + +def hsigmoid_mult(params, + in_tensors, + qrec: MultQuantizationRecord, + details=None): + del details + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + fac_1, upper_bound, lower_bound = hsigmoid_mult_gen_factors(params, qrec) + in_tensor = in_tensor.astype(np.int32) + in_tensor_relued = np.minimum(np.maximum(in_tensor + fac_1, lower_bound), upper_bound) + in_tensor = qrec.scale_mul_biases_q.apply_scales(in_tensor_relued) + return qrec.get_outputs(params, + [in_tensor], + ktype="symmetric") + + +def hsigmoid(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + if isinstance(qrec, MultQuantizationRecord): + return hsigmoid_mult(params, in_tensors, qrec, details=details) + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + + calc_q = QType(bits=32, q=qrec.in_qs[0].q + 15, signed=True) + + fac_1 = qrec.in_qs[0].quantize(np.array([params.offset])) + fac_2 = (1 << 15) // 6 + upper_bound = qrec.in_qs[0].quantize(np.array([6.])) + 
lower_bound = qrec.in_qs[0].quantize(np.array([0.])) + in_tensor = in_tensor.astype(np.int32) + in_tensor = np.multiply(np.minimum(np.maximum(in_tensor + fac_1, lower_bound), + upper_bound), fac_2, dtype=np.int32) + return qrec.get_outputs(params, [qrec.out_qs[0].reduce_from(in_tensor, calc_q)], ktype="symmetric") + + +def hswish_mult_gen_factors(qrec): + in_q = qrec.in_qs[0] + fac_1 = in_q.quantize(np.array([3.])) + # The scale of the result is actually in in_scale * in_scale since it is multiplied by itself + qrec.set_scale(extra_scale=qrec.in_qs[0].scale * 1/6) + upper_bound = in_q.quantize([6.]) + lower_bound = in_q.quantize([0.]) + return fac_1, upper_bound, lower_bound + + +def hswish_mult(params, + in_tensors, + qrec: MultQuantizationRecord, + details=None): + del details + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + fac_1, upper_bound, lower_bound = hswish_mult_gen_factors(qrec) + in_tensor = in_tensor.astype(np.int32) + in_tensor_relued = np.minimum(np.maximum(in_tensor + fac_1, lower_bound), upper_bound) + in_tensor = qrec.scale_mul_biases_q.apply_scales(in_tensor * in_tensor_relued) + return qrec.get_outputs(params, + [in_tensor], + ktype="symmetric") + + +def hswish(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + + if isinstance(qrec, MultQuantizationRecord): + return hswish_mult(params, in_tensors, qrec, details=details) + + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + + calc_q = QType(bits=32, q=qrec.in_qs[0].q + 15, signed=True) + fac_1 = qrec.in_qs[0].quantize(np.array([3.])) + fac_2 = (1 << 15) // 6 + upper_bound = qrec.in_qs[0].quantize([6.]) + lower_bound = qrec.in_qs[0].quantize([0.]) + in_tensor = in_tensor.astype(np.int32) + in_tensor = at_norm(np.multiply(np.minimum(np.maximum(in_tensor + fac_1, lower_bound), upper_bound), + in_tensor, + dtype=np.int32), qrec.in_qs[0].q) + return qrec.get_outputs(params, + [qrec.out_qs[0].reduce_from(np.multiply( + in_tensor, fac_2, dtype=np.int32), calc_q)], + ktype="symmetric") diff --git a/tools/nntool/quantization/symmetric/kernels/fast_conv.py b/tools/nntool/quantization/symmetric/kernels/fast_conv.py new file mode 100644 index 000000000..9088b026f --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/fast_conv.py @@ -0,0 +1,139 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np + +from quantization.quantization_record_base import \ + ScalableFilterQuantizationRecordBase + +FORCE_INT64 = False + +# pylint: disable=invalid-name + +LOG = logging.getLogger("nntool." + __name__) + +def faster_conv(params, + in_tensors, + qrec: ScalableFilterQuantizationRecordBase, + details=None): + '''3D convolution by sub-matrix summing. 
+ ''' + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + weights = qrec.prepare_weights(params, params.weights, ktype="symmetric") + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + + if details is not None: + details['min_acc'] = float("Infinity") + details['max_acc'] = float("-Infinity") + + in_tensor = in_tensor.transpose(in_dims.transpose_to_order(['h', 'w', 'c'])) + if params.padding.h + params.padding.w > 0: + if hasattr(qrec.in_qs[0], 'zero_point'): + const_pad = qrec.in_qs[0].zero_point[0] + else: + const_pad = 0 + in_tensor = np.pad(in_tensor, + ([params.padding.t, + params.padding.b], + [params.padding.l, + params.padding.r]) + + ([0, 0], ) * (np.ndim(in_tensor)-2), + mode='constant', + constant_values=const_pad) + pad_w = params.padding.w + pad_h = params.padding.h + else: + pad_w = pad_h = 0 + + weights = weights.transpose(params.filter.transpose_to_order(['out_c', 'h', 'w', 'in_c'])) + + filt_w = params.filter.w + filt_h = params.filter.h + + in_w = in_dims.w + in_h = in_dims.h + out_c = params.filter.out_c + + in_c_per_group = in_dims.c // params.groups + out_c_per_group = out_c // params.groups + in_c_off = 0 + out_c_cnt = 0 + + + dilated_filter_w = filt_w if params.dilation.w == 1 else filt_w * params.dilation.w - 1 + dilated_filter_h = filt_h if params.dilation.h == 1 else filt_h * params.dilation.h - 1 + + out_w = ((in_w - dilated_filter_w + pad_w)) // params.stride.w + 1 + out_h = ((in_h - dilated_filter_h + pad_h)) // params.stride.h + 1 + + if params.has_bias: + biases = qrec.prepare_biases(params, params.biases, params.weights, ktype="symmetric") + if qrec.acc_q != qrec.biases_q: + biases = qrec.acc_q.expand_from(biases, qrec.biases_q) + result = np.ones((out_c, out_h, out_w), + dtype=qrec.acc_q.dtype) * biases.reshape(out_c, 1, 1) + else: + result = np.zeros((out_c, out_h, out_w), + dtype=qrec.acc_q.dtype) + + const_h = pad_h + in_h - dilated_filter_h + 1 + const_w = pad_w + in_w - dilated_filter_w + 1 + if FORCE_INT64: + result = result.astype(np.int64) + for out_c_i in range(out_dims.c): + for cur_h in range(filt_h): + for cur_w in range(filt_w): + + # selects all elements that the filter element needs to multiply + slabhw = np.multiply(in_tensor[cur_h * params.dilation.h: + const_h + cur_h * params.dilation.h: + params.stride.h, + cur_w * params.dilation.w: + const_w + cur_w * params.dilation.w: + params.stride.w, + in_c_off: + in_c_off + in_c_per_group: + 1], + weights[out_c_i, cur_h, cur_w], + dtype=np.int64 if FORCE_INT64 else qrec.calc_q.dtype) + + if qrec.calc_q != qrec.acc_q: + slabhw = qrec.acc_q.reduce_from(slabhw, qrec.calc_q) + + # add depthwise + slabhw = slabhw.sum(axis=-1, dtype=np.int64 if FORCE_INT64 else qrec.calc_q.dtype) + # add to the previous filter elements + result[out_c_i] += slabhw + + if details is not None: + details['min_acc'] = min(np.min(result[out_c_i]), details['min_acc']) + details['max_acc'] = max(np.max(result[out_c_i]), details['max_acc']) + + out_c_cnt += 1 + if out_c_cnt >= out_c_per_group: + out_c_cnt = 0 + in_c_off += in_c_per_group + + result = qrec.apply_multiplicative_bias(params, result, 0, ktype="symmetric") + + result = result.transpose(out_dims.transpose_from_order(['c', 'h', 'w'])) + + if qrec.out_qs[0] != qrec.acc_q: + result = qrec.out_qs[0].reduce_from(result, qrec.acc_q) + + return qrec.get_outputs(params, [result], ktype="symmetric") diff --git a/tools/nntool/quantization/symmetric/kernels/image_format.py b/tools/nntool/quantization/symmetric/kernels/image_format.py
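The faster_conv kernel above never loops over output pixels: for each of the filt_h x filt_w filter taps it takes one strided slice of the (padded) input, multiplies it by that tap's weights, reduces over the input channels of the group and accumulates, so the hot path is a handful of whole-array NumPy operations per tap. A stripped-down single-channel, stride-1, no-padding sketch of the same shift-and-sum idea (hypothetical helper, not part of nntool):

import numpy as np

def conv2d_shift_sum(x, w):
    # 'valid' 2D correlation computed by summing weighted, shifted sub-matrices of x
    kh, kw = w.shape
    out_h = x.shape[0] - kh + 1
    out_w = x.shape[1] - kw + 1
    acc = np.zeros((out_h, out_w), dtype=np.int32)
    for i in range(kh):
        for j in range(kw):
            # one out_h x out_w slice per filter tap
            acc += w[i, j] * x[i:i + out_h, j:j + out_w]
    return acc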
new file mode 100644 index 000000000..bd89bca57 --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/image_format.py @@ -0,0 +1,25 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from utils.formatters import FORMAT_CHANGES, NORMALIZATIONS + +def image_format(params, in_tensors, qrec, details): + del qrec, details + in_dim = params.in_dims[0] + out_dim = params.out_dims[0] + res = in_tensors[0] + res = FORMAT_CHANGES[params.format_change](res, in_dim, out_dim) + res = NORMALIZATIONS[params.norm_func](res) + return [res] diff --git a/tools/nntool/quantization/symmetric/kernels/linear.py b/tools/nntool/quantization/symmetric/kernels/linear.py new file mode 100644 index 000000000..8af128830 --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/linear.py @@ -0,0 +1,87 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np + +from quantization.quantization_record_base import ScalableFilterQuantizationRecordBase + +LOG = logging.getLogger("nntool." 
+ __name__) + + +def linear(params, + in_tensors, + qrec: ScalableFilterQuantizationRecordBase, + details=None): + + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + weights = qrec.prepare_weights(params, params.weights, ktype="symmetric") + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + + if details is not None: + details['min_acc'] = float("Infinity") + details['max_acc'] = float("-Infinity") + + if params.has_bias: + biases = qrec.prepare_biases(params, params.biases, params.weights, ktype="symmetric") + if qrec.acc_q != qrec.biases_q: + biases = qrec.acc_q.expand_from(biases, qrec.biases_q) + acc_tensor = np.ones((out_dims.c, out_dims.h, out_dims.w), + dtype=qrec.acc_q.dtype) * biases.reshape((out_dims.c, out_dims.h, out_dims.w)) + acc_tensor = acc_tensor.transpose(out_dims.transpose_from_order(('c', 'h', 'w'))) + else: + acc_tensor = np.zeros(out_dims.shape, + dtype=qrec.acc_q.dtype) + + # force the bit dimension of the input tensor to the bit width of the calc + # so that the dot product occurs in this precision + in_tensor = in_tensor.astype(qrec.calc_q.dtype) + + in_tensor = in_tensor.reshape((in_dims.size())) + filt = params.filter.get_filter_dims() + for out_c in range(out_dims.c): + # Expand and normalize the accumulator + if qrec.calc_q != qrec.acc_q: + acc_tensor = qrec.calc_q.expand_from(acc_tensor, qrec.acc_q) + + w_slice = weights[filt.srange(out_c=out_c)].reshape((in_dims.size())) + + res = np.dot(in_tensor, w_slice) + + if details is not None: + details['min_acc'] = min(np.sum(res[res < 0]), details['min_acc']) + details['max_acc'] = min(np.sum(res[res > 0]), details['max_acc']) + + acc_slice = acc_tensor[out_dims.srange(c=out_c, h=0, w=0)] + acc_slice += res + + if qrec.calc_q != qrec.acc_q: + acc_tensor = qrec.acc_q.reduce_from(acc_tensor, qrec.calc_q) + + if details is not None: + details['min_acc'] = min(np.min(acc_slice), details['min_acc']) + details['max_acc'] = max(np.max(acc_slice), details['max_acc']) + + # details['acc_before'] = acc_tensor.copy() + acc_tensor = qrec.apply_multiplicative_bias( + params, acc_tensor, out_dims.get_order_idx('c'), ktype="symmetric") + # details['acc_after'] = acc_tensor.copy() + + if qrec and qrec.out_qs[0] != qrec.acc_q: + acc_tensor = qrec.out_qs[0].reduce_from(acc_tensor, qrec.acc_q) + + return qrec.get_outputs(params, [acc_tensor], ktype="symmetric") diff --git a/tools/nntool/quantization/symmetric/kernels/matrix_operations.py b/tools/nntool/quantization/symmetric/kernels/matrix_operations.py new file mode 100644 index 000000000..0e5aefbf9 --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/matrix_operations.py @@ -0,0 +1,131 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
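In the linear kernel above the dot product is taken in calc_q, whose fractional bits are the sum of the input and weight fractional bits (see calculate_filter_q later in this patch), and the accumulator is then narrowed with reduce_from. Ignoring the rounding and saturation the real QType methods apply, the pow2 bookkeeping amounts to this (illustrative values only):

in_frac, w_frac = 5, 7                # fractional bits of input and weights
calc_frac = in_frac + w_frac          # a Q5 * Q7 product lands in Q12
out_frac = 7                          # target output format
acc = 1 << calc_frac                  # the value 1.0 as a Q12 integer code (4096)
out = acc >> (calc_frac - out_frac)   # the same 1.0 as a Q7 code (128)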
+ +import logging + +import numpy as np + +from graph.types import (MatrixAddParameters, MatrixDivParameters, + MatrixMulParameters, MatrixSubParameters) +from quantization.multiplicative.mult_quantization import ( + MultAddQuantizationRecord, MultQuantizationRecord) +from quantization.qtype import QType +from quantization.quantization_record_base import QuantizationRecordBase +from utils.at_norm import at_norm + +PIECEWISE_OPS = { + MatrixAddParameters: {'op': lambda x, y, dtype: x + y, 'is_mult': False}, + MatrixMulParameters: {'op': lambda x, y, dtype: np.multiply(x, y, dtype=dtype), 'is_mult': True}, + MatrixSubParameters: {'op': lambda x, y, dtype: x - y, 'is_mult': False}, + MatrixDivParameters: {'op': lambda x, y, dtype: x / y, 'is_mult': True}, +} + +LOG = logging.getLogger("nntool." + __name__) + + +def piecewise_mult(params, + in_tensors, + qrec: MultQuantizationRecord, + details=None): + del details + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") + func = PIECEWISE_OPS[params.__class__] + op = func['op'] + if func['is_mult']: + qrec.set_scale(in_idx=(0, 1), out_idx=0) + i1 = in_tensors[0].astype(np.int32) + i2 = in_tensors[1].astype(np.int32) + res = qrec.scale_mul_biases_q.apply_scales(op(i1, i2, np.int32)) + else: + # larger scale should be scaled + qrec.set_add_scale() + if qrec.scaled_idx: + i1 = in_tensors[0].astype(np.int32) + i2 = qrec.scale_in_mul_biases_q.apply_scales(in_tensors[1]) + else: + i1 = qrec.scale_in_mul_biases_q.apply_scales(in_tensors[0]) + i2 = in_tensors[1].astype(np.int32) + + res = qrec.scale_mul_biases_q.apply_scales(op(i1, i2, None)) + return qrec.get_outputs(params, [qrec.out_qs[0].clip(res)], ktype="symmetric") + + +def piecewise(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + + if isinstance(qrec, (MultQuantizationRecord, MultAddQuantizationRecord)): + return piecewise_mult(params, in_tensors, qrec, details=details) + + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") + func = PIECEWISE_OPS[params.__class__] + op = func['op'] + if func['is_mult']: + i1 = in_tensors[0].astype(np.int32) + i2 = in_tensors[1].astype(np.int32) + res = op(i1, i2, np.int32) + else: + off_in = abs(qrec.in_qs[0].q - qrec.in_qs[1].q) + if qrec.in_qs[0].q > qrec.in_qs[1].q: + i1 = at_norm(in_tensors[0].astype(np.int32), off_in) + i2 = in_tensors[1].astype(np.int32) + else: + i1 = in_tensors[0].astype(np.int32) + i2 = at_norm(in_tensors[1].astype(np.int32), off_in) + res = op(i1, i2, None) + return qrec.get_outputs(params, [res], ktype="symmetric") + + +def matscale3(in_tensors, qrec): + assert qrec.in_qs[0].bits == qrec.in_qs[1].bits + assert qrec.in_qs[1].bits == qrec.in_qs[2].bits + if qrec.in_qs[0].bits == 8: + q_calc = QType(bits=32, q=qrec.in_qs[0].q + qrec.in_qs[1].q + qrec.in_qs[2].q, signed=True) + res = np.multiply(np.multiply(in_tensors[0], in_tensors[1], + dtype=np.int32), + in_tensors[2], + dtype=np.int32) + res = qrec.out_qs[0].reduce_from(res, q_calc) + elif qrec.in_qs[0].bits == 16: + q_calc = QType(bits=32, q=qrec.in_qs[0].q + qrec.in_qs[1].q, signed=True) + res = np.multiply(in_tensors[0], in_tensors[1], dtype=np.int32) + res = qrec.out_qs[0].reduce_from(res, q_calc) + q_calc = QType(bits=32, q=qrec.in_qs[2].q + qrec.out_qs[0].q, signed=True) + res = np.multiply(res, in_tensors[2], dtype=np.int32) + res = qrec.out_qs[0].reduce_from(res, q_calc) + return res + + +def matscale2(in_tensors, qrec=None): + assert qrec.in_qs[0].bits == qrec.in_qs[1].bits + q_calc = QType(bits=32, 
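For the pow2 (non-multiplicative) add and subtract path above, the operand holding more fractional bits is shifted down by the difference so both inputs share one Q format before the add. A small worked example with made-up integer codes:

a_frac, b_frac = 7, 5                 # Q formats of the two inputs
a, b = 96, 40                         # 96 / 2**7 = 0.75 and 40 / 2**5 = 1.25
a_aligned = a >> (a_frac - b_frac)    # 24, i.e. 0.75 with 5 fractional bits
total = a_aligned + b                 # 64 -> 64 / 2**5 = 2.0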
q=qrec.in_qs[0].q + qrec.in_qs[1].q, signed=True) + res = np.multiply(in_tensors[0], in_tensors[1], dtype=np.int32) + res = qrec.out_qs[0].reduce_from(res, q_calc) + return res + + +def matscale(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") + LOG.debug("matscale input %s", ",".join([t.dtype.name for t in in_tensors])) + if len(params.in_dims) == 3: + output_tensor = matscale3(in_tensors, qrec) + else: + output_tensor = matscale2(in_tensors, qrec) + return qrec.get_outputs(params, [output_tensor], ktype="symmetric") diff --git a/tools/nntool/quantization/symmetric/kernels/pad.py b/tools/nntool/quantization/symmetric/kernels/pad.py new file mode 100644 index 000000000..1a339671d --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/pad.py @@ -0,0 +1,29 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np + +from quantization.quantization_record_base import QuantizationRecordBase + + +def pad(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del qrec, details + if params.pad_type == "zero": + return [np.pad(in_tensors[0], params.padding.numpy_pad_shape(params.in_dims[0]), + 'constant', constant_values=0)] + raise NotImplementedError() diff --git a/tools/nntool/quantization/symmetric/kernels/pool.py b/tools/nntool/quantization/symmetric/kernels/pool.py new file mode 100644 index 000000000..3a151df4e --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/pool.py @@ -0,0 +1,188 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np + +from quantization.multiplicative.mult_quantization import \ + MultQuantizationRecord +from quantization.quantization_record_base import QuantizationRecordBase +from utils.at_norm import at_norm + +LOG = logging.getLogger("nntool." 
+ __name__) + +# pylint: disable=too-many-arguments, too-many-locals + + +def av_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + # Prepare the quantization levels + + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + filter_sz = params.filter.h * params.filter.w + + pool_factor = (1 << 16)//filter_sz + + out_tensor = np.zeros(out_dims.shape, dtype=np.int32) + + if params.padding.h + params.padding.w > 0: + in_tensor = np.pad(in_tensor, + params.padding.numpy_pad_shape(in_dims), + mode='constant', + constant_values=qrec.in_qs[0].pad_zero_point) + pad_w = params.padding.w + pad_h = params.padding.h + else: + pad_w = pad_h = 0 + + for in_c in range(out_dims.c): + + out_h = 0 + for h_idx in range(0, in_dims.h - params.filter.h + pad_h + 1, + params.stride.h): + out_w = 0 + for w_idx in range(0, in_dims.w - params.filter.w + pad_w + 1, + params.stride.w): + # accumulate - potentially with different Q + in_slice_args = in_dims.srange(c=[in_c, in_c + 1, 1], + h=[h_idx, h_idx + params.filter.h, 1], + w=[w_idx, w_idx + params.filter.w, 1]) + + sum_filter = np.sum(in_tensor[in_slice_args], dtype=np.int32) + sum_filter = np.multiply(sum_filter, pool_factor, dtype=np.int32) + out_tensor[out_dims.srange(c=in_c, h=out_h, w=out_w)] = sum_filter + out_w += 1 + out_h += 1 + + return qrec.get_outputs(params, [qrec.out_qs[0].clip(at_norm(out_tensor, 16), qrec.out_qs[0].dtype)], ktype="symmetric") + + +def max_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + + del details + # Prepare the quantization levels + + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + + out_tensor = np.zeros(out_dims.shape, dtype=qrec.out_qs[0].dtype) + + if params.padding.h + params.padding.w > 0: + in_tensor = np.pad(in_tensor, + params.padding.numpy_pad_shape(in_dims), + mode='constant', + constant_values=qrec.in_qs[0].pad_zero_point) + pad_w = params.padding.w + pad_h = params.padding.h + else: + pad_w = pad_h = 0 + + for in_c in range(out_dims.c): + out_h = 0 + for h_idx in range(0, in_dims.h - params.filter.h + pad_h + 1, + params.stride.h): + out_w = 0 + for w_idx in range(0, in_dims.w - params.filter.w + pad_w + 1, + params.stride.w): + # accumulate - potentially with different Q + out_slice_args = out_dims.srange(c=in_c, h=out_h, w=out_w) + in_slice_args = in_dims.srange(c=[in_c, in_c + 1, 1], + h=[h_idx, h_idx + params.filter.h, 1], + w=[w_idx, w_idx + params.filter.w, 1]) + + out_tensor[out_slice_args] = np.max(in_tensor[in_slice_args].view(np.ndarray)) + out_w += 1 + out_h += 1 + + return qrec.get_outputs(params, [out_tensor], ktype="symmetric") + + +def gap_clb(sum_): + '''Count Leading 0s or 1s''' + sum_bin = [np.binary_repr(sum_elem, width=32) for sum_elem in sum_] + return [len(s) - len(s.lstrip(s[0])) - 1 for s in sum_bin] + + +def av_global_pool_mult(params, + in_tensors, + qrec: MultQuantizationRecord, + details=None): + + # Prepare the quantization levels + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + qrec.set_scale(in_idx=0, out_idx=0) + + sum_by_chan = np.sum(in_tensor, dtype=np.int32, axis=( + in_dims.get_order_idx('w'), in_dims.get_order_idx('h'))) + + res = at_norm((sum_by_chan << 7) // (in_dims.h * in_dims.w), 7) + res = out_tensor = 
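av_pool above avoids a per-window division: pool_factor = 2**16 // filter_size is computed once, each window sum is multiplied by it, and a single 16-bit right shift (via at_norm) plus a clip to the output type happens at the end. With a 2x2 window the arithmetic is exact (toy numbers):

filter_size = 4                          # 2x2 pooling window
pool_factor = (1 << 16) // filter_size   # 16384
window_sum = 20                          # four samples: 4 + 5 + 5 + 6
avg = (window_sum * pool_factor) >> 16   # 5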
qrec.scale_mul_biases_q.apply_scales(res) + return qrec.get_outputs(params, + [out_tensor.reshape(out_dims.shape)], + ktype="symmetric") + + +def av_global_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + + if isinstance(qrec, MultQuantizationRecord): + return av_global_pool_mult(params, in_tensors, qrec, details=details) + + # Prepare the quantization levels + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + in_dims = params.in_dims[0] + out_dims = params.out_dims[0] + + sum_by_chan = np.sum(in_tensor, dtype=np.int32, axis=( + in_dims.get_order_idx('w'), in_dims.get_order_idx('h'))) + + norm = (np.array([31], dtype=np.int32) - gap_clb(sum_by_chan)).astype(np.int32) + inv_wh = (1 << norm) // (in_dims.h * in_dims.w) + out_tensor = at_norm((inv_wh * sum_by_chan), norm) + return qrec.get_outputs(params, + [qrec.out_qs[0].clip(out_tensor).reshape(out_dims.shape)], + ktype="symmetric") + + +def max_global_pool(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + + del details + + # Prepare the quantization levels + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + if isinstance(qrec, MultQuantizationRecord): + qrec.set_scale(in_idx=0, out_idx=0) + in_dims = params.in_dims[0] + return qrec.get_outputs(params, [np.max(in_tensor, + axis=(in_dims.get_order_idx('w'), + in_dims.get_order_idx('h')), + keepdims=True)], ktype="symmetric") diff --git a/tools/nntool/quantization/symmetric/kernels/softmax.py b/tools/nntool/quantization/symmetric/kernels/softmax.py new file mode 100644 index 000000000..085540cc3 --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/softmax.py @@ -0,0 +1,58 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
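The pow2 av_global_pool above replaces the division by h * w with a fixed-point reciprocal whose precision adapts to each channel sum: gap_clb counts the leading sign bits of the 32-bit sum, and norm = 31 - gap_clb sizes the reciprocal (1 << norm) // (h * w) to the magnitude of that sum; for a positive sum, norm is simply its bit length. A worked example with invented numbers:

h, w = 8, 8
chan_sum = 640                       # channel sum, true mean is 10
norm = int(chan_sum).bit_length()    # 10, equal to 31 - gap_clb(...) for positive sums
inv_wh = (1 << norm) // (h * w)      # 1024 // 64 = 16
avg = (inv_wh * chan_sum) >> norm    # (16 * 640) >> 10 = 10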
+ +import numpy as np + +from quantization.quantization_record_base import QuantizationRecordBase +from quantization.multiplicative.mult_quantization import MultQuantizationRecordBase +from utils.exp_17_15 import exp_fp_17_15 + + +def softmax_func(v): + max_v = np.max(v) + v = v - max_v + return np.exp(v)/np.sum(np.exp(v)) + + +def softmax(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + if isinstance(qrec, MultQuantizationRecordBase): + return softmax_sq8(params, in_tensors, qrec, details=details) + + np.seterr(over='raise') + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + # TODO - Implement properly quantized version + in_tensor = qrec.in_qs[0].dequantize(in_tensor) + return qrec.get_outputs(params, [qrec.out_qs[0].quantize(softmax_func(in_tensor))], ktype="symmetric") + + +# void KerParSoftMax_SQ8(KerSoftMax_SQ8_T *Arg) +def softmax_sq8(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + in_tensor = in_tensors[0].flatten() + max_val = np.max(in_tensor) + norm = 15 + np.ceil(np.log2(qrec.in_qs[0].scale)).astype(np.int32) + exp = exp_fp_17_15((in_tensor.astype(np.int32) - max_val) << (norm)) + sum_exp = np.sum(exp) + inv_sum = (np.array([(1 << 15)-1], dtype=np.uint32) << 15)//sum_exp + res = np.abs((exp * inv_sum + (1 << 14)) >> 15) + iinfo = np.iinfo(np.int16) + res = np.clip(res, iinfo.min, iinfo.max).astype(np.int16).reshape(params.out_dims[0].shape) + return qrec.get_outputs(params, [res], ktype="symmetric") diff --git a/tools/nntool/quantization/symmetric/kernels/tensor_functions.py b/tools/nntool/quantization/symmetric/kernels/tensor_functions.py new file mode 100644 index 000000000..a77ea774a --- /dev/null +++ b/tools/nntool/quantization/symmetric/kernels/tensor_functions.py @@ -0,0 +1,98 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
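softmax_sq8 above delivers Q15 probabilities: the shifted logits go through exp_fp_17_15, the reciprocal of the summed exponentials is built as ((2**15 - 1) << 15) // sum_exp, and each exponential is rescaled with rounding. With three equal logits every exponential is 1.0, i.e. 1 << 15 in 17.15 fixed point, and the numbers come out as follows (toy example, not a captured run):

exp_term = 1 << 15                               # 1.0 in 17.15 fixed point
sum_exp = 3 * exp_term                           # 98304
inv_sum = (((1 << 15) - 1) << 15) // sum_exp     # 10922
prob = (exp_term * inv_sum + (1 << 14)) >> 15    # 10922, i.e. ~0.333 in Q15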
+ +import numpy as np +from skimage.transform import resize + +from quantization.quantization_record_base import QuantizationRecordBase + + +def graph_input(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + # all graph inputs are passed all of the inputs of the graph + # params.index indicates the index of the input that this node should output + in_tensor = in_tensors[params.index] + if in_tensor.size == params.dims.size(): + in_tensor = in_tensor.reshape(params.dims.shape) + else: + in_tensor = resize(in_tensor, params.dims.shape) + if params.transpose_out: + in_tensor = np.transpose(in_tensor, params.transpose_out) + # output_tensors = qrec.get_outputs(params, [in_tensor], ktype="symmetric") + return [qrec.out_qs[0].quantize(in_tensor)] + + +def graph_output(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details, qrec + in_tensor = in_tensors[0] + if params.transpose_in: + in_tensor = np.transpose(in_tensor, params.transpose_in) + return [in_tensor] + + +def constant_input(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del in_tensors, details + # output_tensors = qrec.get_outputs(params, [params.value], ktype="symmetric") + return [qrec.out_qs[0].quantize(params.value)] + + +def concat(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") + assert all(qrec.in_qs[0] == qrec.in_qs[idx] + for idx in range(1, len(qrec.in_qs))), "input is incorrectly quantized" + if params.transpose_in: + in_tensors = [np.transpose(qrec.in_tensor, params.transpose_in) for in_tensor in in_tensors] + out_tensor = np.concatenate(in_tensors, params.axis) + if params.transpose_out: + out_tensor = np.transpose(out_tensor, params.transpose_out) + return qrec.get_outputs(params, [out_tensor], ktype="symmetric") + + +def reshape(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + if params.transpose_in: + in_tensor = np.transpose(in_tensor, params.transpose_in) + in_tensor = np.reshape(in_tensor, params.shape) + if params.transpose_out: + in_tensor = np.transpose(in_tensor, params.transpose_out) + return qrec.get_outputs(params, [in_tensor], ktype="symmetric") + + +def transpose(params, + in_tensors, + qrec: QuantizationRecordBase, + details=None): + del details + in_tensor = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0] + if params.transpose_in: + in_tensor = np.transpose(in_tensor, params.transpose_in) + return qrec.get_outputs(params, [in_tensor], ktype="symmetric") diff --git a/tools/nntool/quantization/symmetric/symmetric_kernet_set.py b/tools/nntool/quantization/symmetric/symmetric_kernet_set.py new file mode 100644 index 000000000..099827288 --- /dev/null +++ b/tools/nntool/quantization/symmetric/symmetric_kernet_set.py @@ -0,0 +1,130 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from quantization.kernels.kernel_function import (KernelFunction, + KernelFunctionSetBase) +from quantization.symmetric.kernels.activations import relu, hswish, hsigmoid, leaky +from quantization.symmetric.kernels.fast_conv import faster_conv +from quantization.symmetric.kernels.linear import linear +from quantization.symmetric.kernels.matrix_operations import (matscale, + piecewise) +from quantization.symmetric.kernels.pad import pad +from quantization.symmetric.kernels.pool import (av_global_pool, av_pool, + max_global_pool, max_pool) +from quantization.symmetric.kernels.softmax import softmax +from quantization.symmetric.kernels.tensor_functions import (concat, + constant_input, + graph_input, graph_output, + reshape, + transpose) +from quantization.symmetric.kernels.image_format import image_format + + +class SymmetricKernelSet(KernelFunctionSetBase): + @property + def graph_input(self) -> KernelFunction: + return graph_input + + @property + def graph_output(self) -> KernelFunction: + return graph_output + + @property + def constant_input(self) -> KernelFunction: + return constant_input + + @property + def relu(self) -> KernelFunction: + return relu + + @property + def leaky(self) -> KernelFunction: + return leaky + + @property + def hswish(self) -> KernelFunction: + return hswish + + @property + def hsigmoid(self) -> KernelFunction: + return hsigmoid + + @property + def matscale(self) -> KernelFunction: + return matscale + + @property + def matadd(self) -> KernelFunction: + return piecewise + + @property + def matsub(self) -> KernelFunction: + return piecewise + + @property + def matdiv(self) -> KernelFunction: + return piecewise + + @property + def matmul(self) -> KernelFunction: + return piecewise + + @property + def conv2d(self) -> KernelFunction: + return faster_conv + + @property + def linear(self) -> KernelFunction: + return linear + + @property + def softmax(self) -> KernelFunction: + return softmax + + @property + def reshape(self) -> KernelFunction: + return reshape + + @property + def transpose(self) -> KernelFunction: + return transpose + + @property + def concat(self) -> KernelFunction: + return concat + + @property + def av_pool(self) -> KernelFunction: + return av_pool + + @property + def av_global_pool(self) -> KernelFunction: + return av_global_pool + + @property + def max_pool(self) -> KernelFunction: + return max_pool + + @property + def max_global_pool(self) -> KernelFunction: + return max_global_pool + + @property + def pad(self) -> KernelFunction: + return pad + + @property + def image_format(self) -> KernelFunction: + return image_format diff --git a/tools/nntool/quantization/symmetric/symmetric_quantization.py b/tools/nntool/quantization/symmetric/symmetric_quantization.py new file mode 100644 index 000000000..2fe86f028 --- /dev/null +++ b/tools/nntool/quantization/symmetric/symmetric_quantization.py @@ -0,0 +1,208 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from typing import Sequence + +import numpy as np + +from graph.types import (Conv2DParameters, MatrixAddParameters, + MatrixDivParameters, MatrixMulParameters, + MatrixSubParameters, MultiplicativeBiasParameters, + Parameters) +from quantization.qtype import QType +from quantization.quantization_record_base import ( + FilterQuantizationRecordBase, InputOutputQuantizationRecordBase, + ScalableFilterQuantizationRecordBase) +from utils.at_norm import at_norm + + +class SymmetricQuantizationBase(InputOutputQuantizationRecordBase): + def __init__(self, *args, auto_quantize_inputs=False, auto_dequantize_outputs=False, **kwargs): + super(SymmetricQuantizationBase, self).__init__(*args, **kwargs) + self._auto_quantize_inputs = auto_quantize_inputs + self._auto_dequantize_outputs = auto_dequantize_outputs + + def dequantize_as(self, tensor: np.ndarray, key_name: str, idx: int = None) -> np.ndarray: + qtype = self._info[key_name] + if idx: + qtype = qtype[idx] + return qtype.dequantize(tensor) + + def quantize_as(self, tensor: np.ndarray, key_name: str, idx: int = None) -> np.ndarray: + qtype = self._info[key_name] + if idx: + qtype = qtype[idx] + return qtype.quantize(tensor) + + @property + def auto_quantize_inputs(self): + return self._auto_quantize_inputs + + @auto_quantize_inputs.setter + def auto_quantize_inputs(self, val): + self._auto_quantize_inputs = val + + @property + def auto_dequantize_outputs(self): + return self._auto_dequantize_outputs + + @auto_dequantize_outputs.setter + def auto_dequantize_outputs(self, val): + self._auto_dequantize_outputs = val + + def prepare_inputs(self, params: Parameters, + input_tensors: Sequence[np.ndarray], ktype: str = None) -> Sequence[np.ndarray]: + del params + if ktype == "symmetric" and self._auto_quantize_inputs: + return [self.in_qs[idx].quantize(input_tensor) for idx, input_tensor in enumerate(input_tensors)] + return input_tensors + + def get_outputs(self, params: Parameters, + output_tensors: Sequence[np.ndarray], ktype: str = None) -> Sequence[np.ndarray]: + if ktype == "symmetric": + if isinstance(params, (MatrixAddParameters, MatrixSubParameters)): + q_calc = QType(bits=32, q=min(self.in_qs[0].q, self.in_qs[1].q), signed=True) + output_tensors = [self.out_qs[0].reduce_from(output_tensors[0], q_calc)] + elif isinstance(params, (MatrixMulParameters, MatrixDivParameters)): + q_calc = QType(bits=32, q=self.in_qs[0].q+self.in_qs[1].q, signed=True) + output_tensors = [self.out_qs[0].reduce_from(output_tensors[0], q_calc)] + if self._auto_dequantize_outputs: + return [self.out_qs[idx].dequantize(output_tensor) for idx, output_tensor in enumerate(output_tensors)] + return output_tensors + + +class SymmetricQuantizationRecord(SymmetricQuantizationBase): + pass + + +class FilterSymmetricQuantizationBase(SymmetricQuantizationBase): + @property + def calc_q(self) -> QType: + return self._info.get('calc_q') + + @property + def acc_q(self) -> QType: + return self._info.get('acc_q') + + @property + def biases_q(self) -> QType: + return self._info.get('biases_q') + + @property + def weights_q(self) -> QType: + return self._info.get('weights_q') + + @calc_q.setter + def calc_q(self, 
val: QType): + self._info['calc_q'] = val + + @acc_q.setter + def acc_q(self, val: QType): + self._info['acc_q'] = val + + @biases_q.setter + def biases_q(self, val: QType): + self._info['biases_q'] = val + + @weights_q.setter + def weights_q(self, val: QType): + self._info['weights_q'] = val + + def gen_weights(self, params, weights: np.ndarray) -> np.ndarray: + return self.quantize_as(weights, 'weights_q') + + def gen_biases(self, params: Parameters, biases: np.ndarray, weights: np.ndarray) -> np.ndarray: + del params, weights + return self.quantize_as(biases, 'biases_q') + + def prepare_weights(self, params: Parameters, weights: np.ndarray, ktype: str = None) -> np.ndarray: + if ktype == "symmetric": + return self.gen_weights(params, weights) + if ktype == "float32": + return weights + raise NotImplementedError() + + def prepare_biases(self, params: Parameters, biases: np.ndarray, + weights: np.ndarray, ktype: str = None) -> np.ndarray: + if ktype == "symmetric": + return self.gen_biases(params, biases, weights) + if ktype == "float32": + return biases + raise NotImplementedError() + + +class SymmetricFilterQuantizationRecord(FilterSymmetricQuantizationBase, FilterQuantizationRecordBase): + def __init__(self, *args, + weights_q: QType = None, + biases_q: QType = None, + calc_q: QType = None, + acc_q: QType = None, + info=None, + **kwargs): + super(SymmetricFilterQuantizationRecord, self).__init__(*args, info=info, **kwargs) + if info is None: + self._info['calc_q'] = calc_q + self._info['acc_q'] = acc_q + self._info['biases_q'] = biases_q + self._info['weights_q'] = weights_q + self._info['weights_q'] = weights_q + + +class SymmetricScalableFilterQuantizationRecord(FilterSymmetricQuantizationBase, ScalableFilterQuantizationRecordBase): + def __init__(self, *args, + weights_q: QType = None, + biases_q: QType = None, + mul_biases_q: QType = None, + calc_q: QType = None, + acc_q: QType = None, + info=None, + **kwargs): + super(SymmetricScalableFilterQuantizationRecord, self).__init__(*args, info=info, **kwargs) + if info is None: + self._info['calc_q'] = calc_q + self._info['acc_q'] = acc_q + self._info['biases_q'] = biases_q + self._info['weights_q'] = weights_q + self._info['mul_biases_q'] = mul_biases_q + self._info['weights_q'] = weights_q + + @property + def mul_biases_q(self) -> QType: + return self._info.get('mul_biases_q') + + @mul_biases_q.setter + def mul_biases_q(self, val: QType): + self._info['mul_biases_q'] = val + + def gen_mul_biases(self, params: MultiplicativeBiasParameters) -> np.ndarray: + if params.has_mul_bias: + return self.quantize_as(params.mul_biases, 'mul_biases_q') + return None + + def apply_multiplicative_bias(self, params: Conv2DParameters, input_tensor: np.ndarray, + axis: int, ktype: str = None): + if ktype == "symmetric": + if params.has_mul_bias: + mul_biases = self.quantize_as(params.mul_biases, 'mul_biases_q') + shape = [params.filter.out_c if idx == axis else 1 for idx in range(3)] + input_tensor *= mul_biases.reshape(shape) + input_tensor = at_norm(input_tensor, self.mul_biases_q.q) + return input_tensor + if ktype == "float32": + if params.has_mul_bias: + shape = [params.filter.out_c if idx == axis else 1 for idx in range(3)] + input_tensor *= params.mul_biases.reshape(shape) + return input_tensor + raise NotImplementedError() diff --git a/tools/nntool/quantization/symmetric/symmetric_quantizer.py b/tools/nntool/quantization/symmetric/symmetric_quantizer.py new file mode 100644 index 000000000..6aa86c4bd --- /dev/null +++ 
b/tools/nntool/quantization/symmetric/symmetric_quantizer.py @@ -0,0 +1,607 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging +from collections import OrderedDict + +from graph.nngraph import NNGraph +from graph.types import (ActivationParameters, ConcatParameters, + ConstantInputParameters, Conv2DParameters, + ConvFusionParameters, FcParameters, InputParameters, + MatrixBroadcastedLinearOpParameters, + MatScaleFusionParameters, + MultiplicativeBiasParameters, Parameters, + SoftMaxParameters) +from quantization.qtype import QType +from quantization.quantizer import Quantizer +from quantization.quantization_set import QuantizationSet +from quantization.symmetric.symmetric_quantization import ( + SymmetricFilterQuantizationRecord, SymmetricQuantizationRecord, + SymmetricScalableFilterQuantizationRecord) +from utils.json_serializable import JsonSerializable +from utils.node_id import NodeId, convert_keys_to_str, convert_str_to_keys +from utils.stats_funcs import STATS_BITS, bits, calc_bits + +LOG = logging.getLogger('nntool.' + __name__) + + +class SymmetricQuantizer(Quantizer, JsonSerializable): + def __init__(self, activation_stats, filter_stats, min_qsnr=None, force_width=None): + self._activation_stats = activation_stats + self._filter_stats = filter_stats + self._min_qsnr = min_qsnr + self._force_width = force_width + + # for tests + def __eq__(self, value): + return self._activation_stats == value._activation_stats and \ + self._filter_stats == value._filter_stats and self._min_qsnr == value._min_qsnr and \ + self._force_width == value._force_width + + def _encapsulate(self): + return { + 'activation_stats': convert_keys_to_str(self._activation_stats), + 'filter_stats': convert_keys_to_str(self._filter_stats), + 'min_qsnr': self._min_qsnr, + 'force_width': self._force_width + } + + @classmethod + def _dencapsulate(cls, val): + return SymmetricQuantizer(convert_str_to_keys(val['activation_stats']), + convert_str_to_keys(val['filter_stats']), + val['min_qsnr'], + val['force_width']) + + # pylint: disable=too-many-locals + def calculate_filter_q(self, + node: Parameters, + astats, + fstats, + in_q: QType, + min_qsnr=None, + force_width=None, + force_out=None, + out_as_acc=False, + biases_bits_as_acc=False): + + w_q = self.get_quantization(fstats['weights'], min_qsnr, force_width) + + calc_width = 32 + calc_q = in_q.q + w_q.q + + acc_bits = bits(astats['max_acc'], astats['min_acc']) + act_bits = bits(astats['max'], astats['min']) + act_acc_bits = max(acc_bits, act_bits) + + calc_int_bits = calc_width - calc_q + if calc_int_bits < act_acc_bits: + # we don't have enough space for the integer portion so reduce the precision of + # the weights + missing_bits = act_acc_bits - calc_int_bits + # TODO - This needs improving + assert w_q.q >= missing_bits, "no space in weights to reduce precision" + w_q.q = w_q.q - missing_bits + calc_q = in_q.q + w_q.q + 
calc_int_bits = calc_width - calc_q + + c_q = acc_q = QType(bits=calc_width, q=calc_q, signed=True) + + if out_as_acc: + o_q = c_q + if 'biases' in fstats: + b_q = self.get_quantization(fstats['biases'], min_qsnr, force_width) + else: + b_q = o_q + else: + # The output size is requested to be force_out_width size + if force_out and force_out.bits: + # The output fixed point position is also forced + if force_out.q: + if (force_out.bits - force_out.q) < act_acc_bits: + # clipping so cannot completely satisfy + o_q = QType(bits=force_out.bits, + q=force_out.bits - act_acc_bits, + signed=True) + else: + if force_out.q > calc_q: + # We cannot shift left in the kernel + # TODO - This should try to increase the input q + # Unlikely to happen + raise NotImplementedError() + # We can satisfy the force + o_q = QType(bits=force_out.bits, + q=force_out.q, + signed=True) + else: + # Only the width is forced + o_q = self.get_quantization(astats, None, force_out.bits) + else: + # The output width is not forced so calculate the output q normally + o_q = self.get_quantization(astats, min_qsnr, force_width) + if force_out and force_out.q: + # The output fixed point position is forced + if force_out.q > calc_q: + # We cannot shift left in the kernel + # TODO - This should try to increase the input q + # Unlikely to happen + raise NotImplementedError() + o_q.q = force_out.q + + if 'biases' in fstats: + if biases_bits_as_acc: + b_q = self.get_quantization(fstats['biases'], None, calc_width) + else: + # if we are forcing width then match the output size which might + # have been promoted if the activation didn't fit + b_q = self.get_quantization(fstats['biases'], None, o_q.bits) + else: + b_q = o_q + # make sure that the biases are not stored more precisily than the accumulator. 
It's pointless and will + # cause a negative shift + if b_q.q > acc_q.q: + b_q.q = acc_q.q + + if isinstance(node, MultiplicativeBiasParameters): + if node.has_mul_bias: + mb_q = self.get_quantization(fstats['mul_biases'], min_qsnr, force_width) + else: + mb_q = None + qrec = SymmetricScalableFilterQuantizationRecord(in_qs=[in_q], out_qs=[o_q], calc_q=c_q, + acc_q=acc_q, biases_q=b_q, weights_q=w_q, + mul_biases_q=mb_q, + constants_are_quantized=False) + else: + qrec = SymmetricFilterQuantizationRecord(in_qs=[in_q], out_qs=[o_q], calc_q=c_q, + acc_q=acc_q, biases_q=b_q, weights_q=w_q, + constants_are_quantized=False) + + LOG.debug("filter %s qrec %s", node.name, qrec) + return qrec + + # pylint: disable=too-many-locals + def calculate_output_q(self, + node: Parameters, + astats, + in_qs, + min_qsnr=None, + force_width=None, + force_out=None): + del node + if force_out: + if force_out.bits: + if force_out.q: + o_q = QType(bits=force_out.bits, + q=force_out.q, + signed=True) + else: + o_q = self.get_quantization(astats, None, force_out.bits) + elif force_out.q: + o_q = self.get_quantization(astats, min_qsnr, force_width) + o_q.q = force_out.q + else: + o_q = self.get_quantization(astats, min_qsnr, force_width) + return SymmetricQuantizationRecord(in_qs=in_qs, + out_qs=[o_q]) + + @staticmethod + def get_quantization(stats, min_qsnr, force_width): + qstats = stats['qstats'] + if force_width is not None: + return QType(bits=force_width, + q=qstats[force_width]['q'], + signed=True) + for width in STATS_BITS: + if qstats[width]['qsnr'] > min_qsnr: + return QType(bits=width, + q=qstats[width]['q'], + signed=True) + raise ValueError("no solution for this QSNR could be found") + + def calculate_q(self, + node, + astats, + fstats, + in_qs, + min_qsnr, + force_width, + force_out=None): + + if isinstance(node, (InputParameters, MatrixBroadcastedLinearOpParameters, + ConstantInputParameters, MatScaleFusionParameters)): + qrec = self.calculate_output_q(node, + astats, + in_qs, + min_qsnr=min_qsnr, + force_width=force_width, + force_out=force_out) + elif isinstance(node, Conv2DParameters): + qrec = self.calculate_filter_q(node, + astats, + fstats, + in_q=in_qs[0], + min_qsnr=min_qsnr, + force_width=force_width, + force_out=force_out, + biases_bits_as_acc=False) + elif isinstance(node, FcParameters): + qrec = self.calculate_filter_q(node, + astats, + fstats, + in_q=in_qs[0], + min_qsnr=min_qsnr, + force_width=force_width, + force_out=force_out, + biases_bits_as_acc=False) + elif isinstance(node, SoftMaxParameters): + # softmax always outputs Q15 + qrec = SymmetricQuantizationRecord(in_qs=in_qs, out_qs=[QType(16, 15, True)]) + elif isinstance(node, ActivationParameters): + qrec = SymmetricQuantizationRecord(in_qs=in_qs, + out_qs=[self.compute_activation_out_qtype(node, in_qs[0])]) + else: + qrec = SymmetricQuantizationRecord(in_qs=in_qs, out_qs=in_qs) + return qrec + + @staticmethod + def compute_activation_out_maxq(node, num_bits): + relun = None + if node.activation == "relu6": + relun = 6 + elif node.activation == "relun": + relun = node.activation_params + if isinstance(relun, list): + relun = max(relun) + if relun is None: + return None + relu_bits = calc_bits(relun) + return num_bits - relu_bits + + def compute_activation_out_qtype(self, node, in_q): + max_q = self.compute_activation_out_maxq(node, in_q.bits) + if max_q is None: + return in_q + + return QType(bits=in_q.bits, + q=min(in_q.q, max_q), + signed=True) + + def default_quantize_fusion(self, + G: NNGraph, + node: ConvFusionParameters, + 
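calculate_filter_q above first chooses a weight format from the filter statistics (by QSNR or the forced width), then verifies that the 32-bit accumulator keeps enough integer headroom for the largest accumulator value seen in the activation statistics; if not, fractional bits are taken back from the weights. A worked example with invented statistics:

calc_width = 32
in_q_frac, w_q_frac = 5, 7
calc_q = in_q_frac + w_q_frac         # 12 fractional bits in the accumulator
calc_int_bits = calc_width - calc_q   # 20 integer bits available
act_acc_bits = 14                     # bits needed by the largest observed accumulator
# 20 >= 14, so the chosen weight format stands; if act_acc_bits were 22 instead,
# the 2 missing bits would come out of w_q_frac (7 -> 5) and calc_q be recomputed.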
in_qs, + force_out=None) -> SymmetricQuantizationRecord: + del G + result = OrderedDict() + fin_qs = in_qs + for fnode in node.contained_nodes(): + qrec = self.calculate_q( + fnode, + self._activation_stats.get(NodeId(node, fnode)), + self._filter_stats.get(NodeId(node, fnode)), + fin_qs, + self._min_qsnr, + self._force_width, + force_out=force_out) + result[NodeId(node, fnode)] = qrec + fin_qs = qrec.out_qs + return SymmetricQuantizationRecord(in_qs=in_qs, out_qs=fin_qs), result + + def quantize_fusion(self, + G: NNGraph, + node: ConvFusionParameters, + in_qs, + force_out=None) -> SymmetricQuantizationRecord: + if node.fusion_type == 'conv_active': + result = OrderedDict() + nodes = node.contained_nodes() + conv_node = nodes[0] + conv_astats = self._activation_stats.get(NodeId(node, conv_node)) + conv_qrec = self.calculate_filter_q(conv_node, + conv_astats, + self._filter_stats.get(NodeId(node, conv_node)), + in_q=in_qs[0], + min_qsnr=self._min_qsnr, + force_width=self._force_width, + biases_bits_as_acc=False, + out_as_acc=True) + result[NodeId(node, conv_node)] = conv_qrec + act_node = nodes[1] + act_astats = self._activation_stats.get(NodeId(node, act_node)) + if force_out and force_out.bits: + act_max_q = self.compute_activation_out_maxq(act_node, force_out.bits) + if force_out.q is not None: + if (act_max_q is not None and force_out.q > act_max_q) or force_out.q > conv_qrec.out_qs[0].q: + # We cannot shift left in the kernel + # TODO - This should try to increase the input q and perhaps the width + # Unlikely to happen + raise NotImplementedError() + act_o_q = QType(bits=force_out.bits, + q=force_out.q, + signed=True) + else: + act_o_q = self.get_quantization(act_astats, + None, + force_out.bits) + if act_max_q is not None: + act_o_q.q = min(act_max_q, act_o_q.q) + else: + act_o_q = self.get_quantization(act_astats, + self._min_qsnr, + self._force_width) + act_max_q = self.compute_activation_out_maxq(act_node, act_o_q.bits) + # check that the output q is less than or equal to the filter output q + if act_max_q is not None: + act_o_q.q = min(act_o_q.q, conv_qrec.out_qs[0].q, act_max_q) + else: + act_o_q.q = min(act_o_q.q, conv_qrec.out_qs[0].q) + if force_out and force_out.q: + if force_out.q > act_max_q or force_out.q > conv_qrec.out_qs[0].q: + # We cannot shift left in the kernel + # TODO - This should try to increase the input q and perhaps the width + # Unlikely to happen + raise NotImplementedError() + act_o_q.q = force_out.q + act_qrec = SymmetricQuantizationRecord(in_qs=conv_qrec.out_qs, + out_qs=[act_o_q]) + result[NodeId(node, act_node)] = act_qrec + return SymmetricQuantizationRecord(in_qs=in_qs, out_qs=act_qrec.out_qs), result + else: + return self.default_quantize_fusion(G, node, in_qs, force_out=force_out) + + @staticmethod + def get_in_qs(G, edge_recs, node): + if isinstance(node, InputParameters): + in_qs = [] + else: + in_qs = [edge_recs[edge.params] + for edge in G.indexed_in_edges(node.name)] + return in_qs + + @staticmethod + def is_filter_node(node): + conv_fusion_types = set(['conv_active_pool', + 'conv_pool_active', + 'conv_active', + 'conv_pool']) + return (isinstance(node, ConvFusionParameters) and node.fusion_type in conv_fusion_types) or\ + isinstance(node, (Conv2DParameters, FcParameters)) + + @staticmethod + def satisfied(x, y): + return x is None or x == y + + def satisfied_force(self, force_out, o_q): + return not force_out or\ + (self.satisfied(force_out.q, o_q.q) and self.satisfied(force_out.bits, o_q.bits)) + + def quantize_backward(self, + G: NNGraph, 
+ result, + edge_recs, + node, + force_out=None): + + LOG.debug("quantize backwards %s", node.name) + recalculated = False + while True: + in_qs = self.get_in_qs(G, edge_recs, node) + if self.is_filter_node(node): + if isinstance(node, ConvFusionParameters): + qrec, qrecs = self.quantize_fusion(G, + node, + in_qs, + force_out=force_out) + for node_id, fqrec in qrecs.items(): + result[node_id] = fqrec + else: + qrec = self.calculate_q(node, + self._activation_stats.get(NodeId(node, None)), + self._filter_stats.get(NodeId(node, None)), + in_qs, + self._min_qsnr, + self._force_width, + force_out=force_out) + + if force_out and force_out.q is not None and qrec.out_qs[0].q < force_out.q: + if recalculated: + raise NotImplementedError("no quantization solution found") + bits_to_gain = force_out.q - qrec.q + if bits_to_gain > in_qs[0].q: + raise NotImplementedError() + # Try to adjust the inputs to satisfy and then + # recalculate + pnode = G.in_edges(node.name)[0].from_node + self.quantize_backward(G, + result, + edge_recs, + pnode, + force_out=QType(bits=force_out.bits, + q=in_qs[0].q - bits_to_gain, + signed=True)) + elif isinstance(node, ConcatParameters): + assert not recalculated + max_width = max(in_q.bits for in_q in in_qs) + min_q = min(in_q.q for in_q in in_qs) + if force_out: + if not self.satisfied(force_out.bits, max_width): + max_width = force_out.bits + if not self.satisfied(force_out.q, min_q): + min_q = force_out.q + LOG.debug("normalizing concat to %s", QType(bits=max_width, q=min_q, signed=True)) + for pidx, pnode in enumerate([edge.from_node for edge in G.in_edges(node.name)]): + pqrec = in_qs[pidx] + if pqrec.q != min_q or pqrec.bits != max_width: + self.quantize_backward(G, + result, + edge_recs, + pnode, + force_out=QType(bits=max_width, + q=min_q, + signed=True)) + o_q = QType(bits=max_width, + q=min_q, + signed=True) + qrec = SymmetricQuantizationRecord( + in_qs=self.get_in_qs(G, edge_recs, node), out_qs=[o_q]) + elif isinstance(node, SoftMaxParameters): + raise NotImplementedError("softmax kernel cannot change width or q") + else: + if isinstance(node, ConvFusionParameters): + qrec, qrecs = self.quantize_fusion(G, + node, + in_qs, + force_out=force_out) + for node_id, fqrec in qrecs.items(): + result[node_id] = fqrec + else: + qrec = self.calculate_q(node, + self._activation_stats.get(NodeId(node, None)), + self._filter_stats.get(NodeId(node, None)), + in_qs, + self._min_qsnr, + self._force_width, + force_out=force_out) + o_q = qrec.out_qs[0] + if not(self.satisfied(force_out.q, o_q.q) and + self.satisfied(force_out.bits, o_q.bits)): + if recalculated: + raise NotImplementedError("no quantization solution found") + if len(G.in_edges(node.name)) > 1: + raise NotImplementedError("Nodes with multiple input edges \ + need custom handling") + pnode = G.in_edges(node.name)[0].from_node + self.quantize_backward(G, + result, + edge_recs, + pnode, + force_out=force_out) + + for edges in G.indexed_out_edges(node.name): + for edge in edges: + edge_recs[edge.params] = qrec.out_qs[edge.from_idx] + + result[NodeId(node, None)] = qrec + + o_q = qrec.out_qs[0] + if self.satisfied_force(force_out, o_q): + break + if recalculated: + raise NotImplementedError("no quantization solution found") + LOG.debug("recalculate %s", node.name) + recalculated = True + LOG.debug("back complete %s %s", node.name, qrec) + return qrec + + def quantize_forward(self, G: NNGraph, edge_recs, result=None): + if result is None: + result = QuantizationSet() + for node in [step['node'] for step in 
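A concat node cannot mix Q formats, so the ConcatParameters branch of quantize_backward above picks the widest container and the smallest number of fractional bits among its inputs and forces every producer back to that format. For example (invented inputs), an 8-bit input with q=3 and a 16-bit input with q=6 are both re-forced, as sketched below:

in_bits = [8, 16]
in_frac = [3, 6]
concat_bits = max(in_bits)   # 16
concat_frac = min(in_frac)   # 3  -> both producers are requantized to 16 bits, q=3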
G.graph_state.steps]: + LOG.debug("quantize forward %s", node.name) + in_qs = self.get_in_qs(G, edge_recs, node) + if isinstance(node, ConvFusionParameters): + qrec, qrecs = self.quantize_fusion(G, node, in_qs) + for node_id, fqrec in qrecs.items(): + result[node_id] = fqrec + elif isinstance(node, ConcatParameters): + qrec = self.quantize_backward(G, + result, + edge_recs, + node) + else: + qrec = self.calculate_q( + node, + self._activation_stats.get(NodeId(node, None)), + self._filter_stats.get(NodeId(node, None)), + in_qs, + self._min_qsnr, + self._force_width) + result[NodeId(node, None)] = qrec + if not qrec: + break + + for edges in G.indexed_out_edges(node.name): + for edge in edges: + edge_recs[edge.params] = qrec.out_qs[edge.from_idx] + return result + + @staticmethod + def initialize_edge_recs(G: NNGraph, qrecs): + '''Initialize edge rec dictionary to current quantization settings''' + edge_recs = {} + for node in [step['node'] for step in G.graph_state.steps]: + nodeid = NodeId(node) + qrec = qrecs[nodeid] + for edges in G.indexed_out_edges(node.name): + for edge in edges: + edge_recs[edge.params] = qrec.out_qs[edge.from_idx] + return edge_recs + + def propagate_forward(self, G: NNGraph, edge_recs, start_node, new_out_qrec, result): + '''Propagate a new output qrec at node start_node in the graph''' + found_node = False + for node in [step['node'] for step in G.graph_state.steps]: + if found_node: + LOG.debug("propagate forwards %s", node.name) + in_qs = self.get_in_qs(G, edge_recs, node) + if isinstance(node, ConvFusionParameters): + qrec, qrecs = self.quantize_fusion(G, node, in_qs) + for node_id, fqrec in qrecs.items(): + result[node_id] = fqrec + elif isinstance(node, ConcatParameters): + qrec = self.quantize_backward(G, + result, + edge_recs, + node) + else: + qrec = self.calculate_q( + node, + self._activation_stats.get(NodeId(node, None)), + self._filter_stats.get(NodeId(node, None)), + in_qs, + self._min_qsnr, + self._force_width) + else: + if node == start_node: + found_node = True + qrec = self.quantize_backward(G, + result, + edge_recs, + node, + force_out=new_out_qrec) + else: + continue + + result[NodeId(node, None)] = qrec + if not qrec: + break + + for edges in G.indexed_out_edges(node.name): + for edge in edges: + edge_recs[edge.params] = qrec.out_qs[edge.from_idx] + + def quantize(self, G: NNGraph) -> OrderedDict: + '''quantize the graph''' + edge_recs = {} + qrecs = self.quantize_forward(G, edge_recs) + qrecs['__quantizer'] = self + G.graph_identity.quantization_type = 'POW2' + return qrecs + + @classmethod + def propagate(cls, G: NNGraph, current_qrecs, start_node, new_out_qrec) -> OrderedDict: + '''propagate new quantization record new_out_qrec at start node through the graph''' + edge_recs = cls.initialize_edge_recs(G, current_qrecs) + return current_qrecs['__quantizer'].propagate_forward(G, edge_recs, start_node, new_out_qrec, current_qrecs) diff --git a/tools/nntool/quantization/tuneq.py b/tools/nntool/quantization/tuneq.py index 146a06ba1..d1224bbdb 100644 --- a/tools/nntool/quantization/tuneq.py +++ b/tools/nntool/quantization/tuneq.py @@ -14,7 +14,7 @@ # along with this program. If not, see . 
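Taken together, quantize() above walks the graph forward, recursing backwards whenever a concat (or a forced output format) requires its producers to be requantized, and propagate() re-runs the forward pass from a node whose output QType has been changed by hand, which is what the tuneq changes just below rely on. A minimal usage sketch, assuming G is an NNGraph and that activation_stats and filter_stats are the NodeId-keyed statistics dictionaries collected elsewhere in nntool:

quantizer = SymmetricQuantizer(activation_stats, filter_stats, force_width=16)
qrecs = quantizer.quantize(G)        # one SymmetricQuantizationRecord per node
# later, hand-force a different output format on one node and ripple it forward:
SymmetricQuantizer.propagate(G, qrecs, some_node, QType(bits=8, q=5, signed=True))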
from quantization.qtype import QType -from quantization.simple_auto_quantify import SimpleQuantizer +from quantization.symmetric.symmetric_quantizer import SymmetricQuantizer from utils.node_id import NodeId from utils.stats_funcs import STATS_BITS from graph.types import ConvFusionParameters @@ -30,6 +30,7 @@ def get_qtype(qparam1, qparam2): return QType(STATS_BITS[bits_idx], qparam2, True) def tuneq(G, qrecs, step_num, param, qparam1, qparam2, index=0): + del index step = G.graph_state.steps[step_num] node = step['node'] if param == 'dp': @@ -37,7 +38,7 @@ def tuneq(G, qrecs, step_num, param, qparam1, qparam2, index=0): if param == "out": qtype = get_qtype(qparam1, qparam2) - SimpleQuantizer.propagate(G, qrecs, node, qtype) + SymmetricQuantizer.propagate(G, qrecs, node, qtype) else: if isinstance(node, ConvFusionParameters): for subnode in node.subgraph.nodes(): @@ -45,9 +46,9 @@ def tuneq(G, qrecs, step_num, param, qparam1, qparam2, index=0): if hasattr(qrec, param + '_q'): setattr(qrec, param + '_q', get_qtype(qparam1, qparam2)) return - raise TuneError("parameter " + param + " not found") - else: - qrec = qrecs[NodeId(node, None)] - if not hasattr(qrec, param + '_q'): - raise TuneError("parameter " + param + " not found") - setattr(qrec, param + '_q', get_qtype(qparam1, qparam2)) + raise TuneError("parameter " + param + " not found") + + qrec = qrecs[NodeId(node, None)] + if not hasattr(qrec, param + '_q'): + raise TuneError("parameter " + param + " not found") + setattr(qrec, param + '_q', get_qtype(qparam1, qparam2)) diff --git a/tools/nntool/reports/error_reporter.py b/tools/nntool/reports/error_reporter.py index 9eb3e3f0c..136ba9f1f 100644 --- a/tools/nntool/reports/error_reporter.py +++ b/tools/nntool/reports/error_reporter.py @@ -43,10 +43,6 @@ def do_dheader(table, one_input, with_chan): TabularColumn("min QSNR", fmt=">.0f"), ]) - header.extend([ - TabularColumn("OverF dot", fmt=">d"), - TabularColumn("OverF acc", fmt=">d"), - ]) table.add_row(header) def do_drow(table, stat, cols): @@ -54,14 +50,13 @@ def do_drow(table, stat, cols): class ErrorReporter(Reporter): ONE_INPUT_COLS = ['name', 'op_name', 'step', 'av_err', 'max_err',\ - 'min_err', 'qsnr', 'overflow_dot', 'overflow_acc'] + 'min_err', 'qsnr'] ONE_INPUT_WCHAN_COLS = ['name', 'op_name', 'step', 'av_err', 'max_err',\ - 'min_err', 'max_chan_err', 'qsnr', 'overflow_dot', 'overflow_acc'] + 'min_err', 'max_chan_err', 'qsnr'] COLS = ['name', 'op_name', 'step', 'av_err', 'max_err',\ - 'min_err', 'qsnr', 'max_qsnr', 'min_qsnr', 'overflow_dot', 'overflow_acc'] + 'min_err', 'qsnr', 'max_qsnr', 'min_qsnr'] WCHAN_COLS = ['name', 'op_name', 'step', 'av_err', 'max_err',\ - 'min_err', 'max_chan_err', 'qsnr', 'max_qsnr', 'min_qsnr',\ - 'overflow_dot', 'overflow_acc'] + 'min_err', 'max_chan_err', 'qsnr', 'max_qsnr', 'min_qsnr'] def __init__(self, do_totals=True, threshold=30.0, one_input=False, with_chan=False): print('with chan', with_chan, one_input) diff --git a/tools/nntool/reports/filter_reporter.py b/tools/nntool/reports/filter_reporter.py index 6cea6a991..79dbb5bf7 100644 --- a/tools/nntool/reports/filter_reporter.py +++ b/tools/nntool/reports/filter_reporter.py @@ -134,7 +134,7 @@ def do_row_item(self, table, step_idx, node_name, var, total, op_name, vartype): return total def do_row(self, table, step_idx, node_name, stat, total, op_name="", by_channel=False): - for vartype in ['weights', 'biases']: + for vartype in ['weights', 'biases', 'mul_biases']: if vartype not in stat: continue var = stat[vartype] diff --git 
a/tools/nntool/reports/graph_reporter.py b/tools/nntool/reports/graph_reporter.py index a9e5c52a5..abfa85843 100644 --- a/tools/nntool/reports/graph_reporter.py +++ b/tools/nntool/reports/graph_reporter.py @@ -95,7 +95,7 @@ def report(self, G: NNGraph, stats) -> Tabular: steps = G.graph_state.steps liveness = G.graph_state.liveness first_node = steps[0]['node'] - active_order = "x".join(first_node.in_dims[0].order) + active_order = "x".join(first_node.out_dims[0].order) tab = Tabular() self.do_headers(active_order, tab) diff --git a/tools/nntool/reports/quantization_reporter.py b/tools/nntool/reports/quantization_reporter.py index abfadfdd8..ca261a511 100644 --- a/tools/nntool/reports/quantization_reporter.py +++ b/tools/nntool/reports/quantization_reporter.py @@ -13,33 +13,41 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from utils.tabular import Tabular, TabularColumn - -from graph.types import FilterParameters +from quantization.symmetric.symmetric_quantization import \ + SymmetricScalableFilterQuantizationRecord +from quantization.multiplicative.mult_quantization import MultScalableFilterQuantizationRecord +from quantization.multiplicative.mult_qtype_base import WrapperMixin +from graph.types import ConstantInputParameters from utils.node_id import NodeId -from quantization.qtype import QType -from quantization.quantization_record import FilterQuantizationRecord -from importer.tflite.tflite_qtype import TfliteQType +from utils.tabular import Tabular, TabularColumn from .reporter import Reporter DEFAULT_ACC_BITS = 32 -def emit_q(qtype): - if qtype is None: - return "" - if isinstance(qtype, TfliteQType): - return ', '.join(map(str, ["{} = {}".format(x, y) for x,y \ - in zip(qtype.report_columns(), qtype.to_report())])) - return "Q{}.{}".format(qtype.bits - qtype.q, qtype.q) - -def emit_qs(qtypes): - return ",".join([emit_q(qtype) for qtype in qtypes]) class QuantizationReporter(Reporter): - def __init__(self, step=None): + def __init__(self, step=None, emit_wrapped=True): super(QuantizationReporter).__init__() self._step = step + self._emit_wrapped = emit_wrapped + + def emit_qs(self, qtypes, limit=True): + if limit and len(qtypes) > 10: + qtypes = qtypes[0:10] + extra = " ..." 
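+ # lists longer than 10 QTypes are truncated to the first 10; the trailing
+ # " ..." marks the elision so long in/out rows stay readable in the report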
+ else: + extra = "" + if self._emit_wrapped: + return (",".join([str(qtype.wrapped) if isinstance(qtype, WrapperMixin) + else str(qtype) for qtype in qtypes])) + extra + return (",".join([str(qtype) for qtype in qtypes])) + extra + + def emit_q_chan(self, qtype, chan): + if self._emit_wrapped: + return (qtype.wrapped.str_by_chan(chan) if isinstance(qtype, WrapperMixin) + else qtype.str_by_chan(chan)) + return qtype.str_by_chan(chan) def report(self, G, stats): table = Tabular() @@ -55,7 +63,7 @@ def report(self, G, stats): TabularColumn("Acc"), ]) - for key, qrec in stats.items(): + for key, qrec in stats.sorted_iterator(G): if not isinstance(key, NodeId): continue node = G.node(key.node_name) @@ -64,11 +72,51 @@ def report(self, G, stats): fnode = node.get_contained_node(key.fnode_name) if key.fnode_name else None step_idx = node.step_idx node = fnode or node - row = [step_idx, node.name, emit_qs(qrec.in_qs), emit_qs(qrec.out_qs)] - if isinstance(qrec, FilterQuantizationRecord): - for i in ["weights", "biases", "mul_biases", "calc", "acc"]: - row.append(emit_q(getattr(qrec, i+'_q'))) + if qrec: + if self._step is None or not isinstance(qrec, MultScalableFilterQuantizationRecord) or len(qrec.weights_q.scale) == 1: + if isinstance(node, ConstantInputParameters): + row = [step_idx, node.name, + "", + self.emit_qs(qrec.out_qs, + limit=self._step is None), + "", "", "", "", ""] + else: + row = [step_idx, node.name, + self.emit_qs(qrec.in_qs, + limit=self._step is None), + self.emit_qs(qrec.out_qs, + limit=self._step is None)] + if isinstance(qrec, (SymmetricScalableFilterQuantizationRecord, MultScalableFilterQuantizationRecord)): + for i in ["weights", "biases", "mul_biases", "calc", "acc"]: + row.append(self.emit_qs([getattr(qrec, i+'_q')])) + else: + row += ["", "", "", "", ""] + else: + first = True + for chan in range(len(qrec.weights_q.scale)): + if first: + row = [step_idx, node.name, + self.emit_qs(qrec.in_qs, + limit=self._step is None), + self.emit_qs(qrec.out_qs, + limit=self._step is None), + self.emit_q_chan(qrec.weights_q, chan), + self.emit_q_chan(qrec.biases_q, chan), + self.emit_q_chan(qrec.mul_biases_q, chan), + str(qrec.calc_q), + str(qrec.acc_q), + ] + first = False + else: + row = [chan, "", "", "", + self.emit_q_chan(qrec.weights_q, chan), + self.emit_q_chan(qrec.biases_q, chan), + self.emit_q_chan(qrec.mul_biases_q, chan), + "", "" + ] + table.add_row(row) + continue else: - row += ["", "", "", "", ""] + row = [step_idx, node.name, "None", "None", "", "", "", "", ""] table.add_row(row) return table diff --git a/tools/nntool/requirements.txt b/tools/nntool/requirements.txt index 00347a09b..7d6f176d9 100644 --- a/tools/nntool/requirements.txt +++ b/tools/nntool/requirements.txt @@ -3,10 +3,10 @@ typing==3.7.4.1 pytest==5.0.1 XlsxWriter==1.1.5 texttable==1.6.2 -cmd2==0.9.24 +cmd2==1.0.2 numpy==1.16.2 Pillow==6.2.0 -Keras==2.2.4 -tensorflow==1.14.0 +Keras==2.3.1 +tensorflow==1.15.0 numpy==1.16.2 argcomplete==1.10.0 diff --git a/tools/nntool/stats/activation_stats_collector.py b/tools/nntool/stats/activation_stats_collector.py index 3110011c2..aba2d760e 100644 --- a/tools/nntool/stats/activation_stats_collector.py +++ b/tools/nntool/stats/activation_stats_collector.py @@ -16,7 +16,7 @@ from collections import OrderedDict from typing import Mapping, Sequence -from execution.execute_graph import execute_iterator +from execution.graph_executer import GraphExecuter from graph.types import FilterParameters, InputParameters, MultiplicativeBiasParameters from utils.node_id import 
NodeId from utils.stats_funcs import astats, calculate_qsnrs @@ -32,22 +32,32 @@ def gather_stats(activation, force_ideal=False, channel_dim=None, channel_detail class ActivationStatsCollector(ReductionStatsCollector): def __init__(self, graph_execution=None): super(ActivationStatsCollector, self).__init__() - self._graph_execution = execute_iterator if graph_execution is None else graph_execution + self._graph_execution = graph_execution def _collect(self, G, input_tensors, step_idx): + if self._graph_execution is None: + if G.has_quantized_parameters: + quantization = G.quantization + else: + quantization = None + graph_executor = GraphExecuter(G, qrecs=quantization) + graph_execution = graph_executor.execute_iterator + else: + graph_execution = self._graph_execution + stats = OrderedDict() limit = step_idx[0] if isinstance(step_idx, tuple) else step_idx - for _, _, node, output, _, fusion_node, details in\ - self._graph_execution(G, input_tensors, disable_cache=True, limit=limit): - if not self.matches_step(step_idx, node, fusion_node): + for _, node, fnode, output_tensors, details in\ + graph_execution(input_tensors, step_idx_limit=limit, yield_fusions=True, yield_details=True): + if not self.matches_step(step_idx, node, fnode): continue - key = NodeId(node, fusion_node) - node = (node if fusion_node is None else fusion_node) + key = NodeId(node, fnode) + node = (node if fnode is None else fnode) if node.out_dims[0].is_named and node.out_dims[0].has_key('c'): channel_dim = node.out_dims[0].get_order_idx('c') else: channel_dim = 0 - stat = gather_stats(output[0], + stat = gather_stats(output_tensors[0], force_ideal=not isinstance(node, InputParameters), channel_dim=channel_dim, channel_details=step_idx is not None) diff --git a/tools/nntool/stats/error_stats_collector.py b/tools/nntool/stats/error_stats_collector.py index 404baf97f..fe2236dd2 100644 --- a/tools/nntool/stats/error_stats_collector.py +++ b/tools/nntool/stats/error_stats_collector.py @@ -14,7 +14,6 @@ # along with this program. If not, see . import logging -import math from collections import OrderedDict from typing import Mapping @@ -23,49 +22,43 @@ from utils.stats_funcs import qsnr from utils.node_id import NodeId -from execution.execute_graph import execute, execute_iterator +from execution.graph_executer import GraphExecuter from execution.quantization_mode import QuantizationMode -from graph.types import FilterParameters - -from .stats_collector import ReductionStatsCollector +from stats.stats_collector import ReductionStatsCollector LOG = logging.getLogger('nntool.' 
+ __name__) + class ErrorStatsCollector(ReductionStatsCollector): - def __init__(self, limit=None): + def __init__(self, limit=None, quant_compare=False): super().__init__() self._limit = limit + self._quant_compare = quant_compare def _prepare(self, G): pass - - def _collect_execution(self, G, tensors, qrecs=None, qmode=None): + def _collect_execution(self, executer, tensors, qrecs, qmode=None): + del qrecs outputs = [] fusion_outputs = [] - for step_idx, step, node, output, fusion_op_name, fusion_node, details in\ - execute_iterator(G, tensors, limit=self._limit, qrecs=qrecs, qmode=qmode): - if qrecs: - qrec = qrecs[NodeId(node, fusion_node)] - output = [qrec.out_qs[i].dequantize(out) for i, out in enumerate(output)] - else: - output = output.copy() + for step_idx, pnode, fnode, output, details in\ + executer.execute_iterator(tensors, step_idx_limit=self._limit, qmode=qmode): - del step, fusion_op_name - if fusion_node: + if fnode: fusion_outputs.append({ "name": "", "step_idx": "{}_{}".format(step_idx, len(fusion_outputs)), - "node": fusion_node, + "node": fnode, "output": output, "details": details }) else: stat = { - "name": node.name, + "name": pnode.name, "step_idx": str(step_idx), - "node": node, + "node": pnode, "output": output, "details": details, "fusion_outputs": [] @@ -77,17 +70,13 @@ def _collect_execution(self, G, tensors, qrecs=None, qmode=None): return outputs @staticmethod - def _collect_one(fstat, qstat): - fout = fstat['output'] - qout = qstat['output'] - error_ = np.abs(fout[0] - qout[0]) + def _collect_one(fstat, qstat, qrec, quant_compare=False): + fout = fstat['output'][0] + if quant_compare: + fout = qrec.out_qs[0].dequantize(qrec.out_qs[0].quantize(fout)) + qout = qstat['output'][0] + error_ = np.abs(fout - qout) node = fstat['node'] - details = qstat['details'] - if details: - overflow_dot = details['overflow_dot'] - overflow_acc = details['overflow_acc'] - else: - overflow_dot = overflow_acc = "" stat = { 'name': fstat['name'], @@ -96,28 +85,40 @@ def _collect_one(fstat, qstat): 'av_err': np.mean(error_), 'max_err': np.max(error_), 'min_err': np.min(error_), - 'qsnr': qsnr(fout[0], qout[0]), - 'overflow_dot' : overflow_dot, - 'overflow_acc' : overflow_acc, + 'qsnr': qsnr(fout, qout), } return stat def _collect(self, G, input_tensors, step_idx) -> Mapping[NodeId, Mapping]: LOG.debug("gather quantization statistics") - foutputs = self._collect_execution(G, input_tensors) - qoutputs = self._collect_execution(G, + if G.has_quantized_parameters: + quantization = G.quantization + else: + quantization = None + executer = GraphExecuter(G, qrecs=quantization) + foutputs = self._collect_execution(executer, input_tensors, quantization) + executer = GraphExecuter(G, qrecs=G.quantization) + qoutputs = self._collect_execution(executer, input_tensors, - qrecs=G.quantization, - qmode=QuantizationMode.all()) + G.quantization, + qmode=QuantizationMode.all_dequantize()) stats = OrderedDict() for idx, fstat in enumerate(foutputs): qstat = qoutputs[idx] if fstat['fusion_outputs']: for jdx, ffstat in enumerate(fstat['fusion_outputs']): - stats[NodeId(fstat['node'], ffstat['node'])] =\ - self._collect_one(ffstat, qstat['fusion_outputs'][jdx]) - stats[NodeId(fstat['node'], None)] = self._collect_one(fstat, qstat) + nid = NodeId(fstat['node'], ffstat['node']) + stats[nid] =\ + self._collect_one(ffstat, + qstat['fusion_outputs'][jdx], + G.quantization[nid], + quant_compare=self._quant_compare) + nid = NodeId(fstat['node'], None) + stats[nid] = self._collect_one(fstat, + qstat, + 
G.quantization[nid], + quant_compare=self._quant_compare) return stats @@ -134,8 +135,6 @@ def _reduce_prepare(self, all_stats): def _reduce(self, _, base: Mapping, stat: Mapping): for k in ['av_err', 'qsnr']: base[k].append(stat[k]) - for k in ['overflow_dot', 'overflow_acc']: - base[k] += stat[k] for k in [('max_err', 'max_err')]: base[k[0]] = max(base[k[0]], abs(stat[k[1]])) for k in [('min_err', 'min_err')]: diff --git a/tools/nntool/stats/filter_stats_collector.py b/tools/nntool/stats/filter_stats_collector.py index bd115a9cf..fb6c1ec7d 100644 --- a/tools/nntool/stats/filter_stats_collector.py +++ b/tools/nntool/stats/filter_stats_collector.py @@ -21,27 +21,45 @@ MultiplicativeBiasParameters) from utils.node_id import NodeId from utils.stats_funcs import astats, calculate_qsnrs +from quantization.multiplicative.mult_quantization import MultScalableFilterQuantizationRecord from .ranges import Ranges from .stats_collector import StatsCollector LOG = logging.getLogger("nntool." + __name__) -def filter_stats(pnode, fnode, anode, channel_details=None): + +def filter_stats(pnode, fnode, anode, channel_details=None, qrec=None): stats = {} - if isinstance(anode, MultiplicativeBiasParameters) and anode.has_mul_bias: - stats['mul_biases'] = mul_biases = astats(anode.mul_biases) - mul_biases['qstats'] = calculate_qsnrs(anode.mul_biases, - mul_biases['ibits'], - force_ideal=False) + if isinstance(anode, MultiplicativeBiasParameters): + if anode.has_mul_bias: + stats['mul_biases'] = mul_biases = astats(anode.mul_biases) + mul_biases['qstats'] = calculate_qsnrs(anode.mul_biases, + mul_biases['ibits'], + force_ideal=False) + elif isinstance(qrec, MultScalableFilterQuantizationRecord): + stats['mul_biases'] = mul_biases = astats(qrec.mul_biases_fps) + mul_biases['qstats'] = calculate_qsnrs(qrec.mul_biases_fps, + mul_biases['ibits'], + force_ideal=False) if anode.has_bias: - stats['biases'] = biases = astats(anode.biases) - biases['qstats'] = calculate_qsnrs(anode.biases, + if qrec: + qbiases = qrec.prepare_biases(anode, anode.biases, anode.weights, ktype="float32") + else: + qbiases = anode.biases + + stats['biases'] = biases = astats(qbiases) + biases['qstats'] = calculate_qsnrs(qbiases, biases['ibits'], force_ideal=False) + if qrec: + qweights = qrec.prepare_weights(anode, anode.weights, ktype="float32") + else: + qweights = anode.weights + stats['weights'] = weights = astats( - anode.weights, channel_dim=anode.filter.get_order_idx('out_c'), channel_details=channel_details) - weights['qstats'] = calculate_qsnrs(anode.weights, weights['ibits'], + qweights, channel_dim=anode.filter.get_order_idx('out_c'), channel_details=channel_details) + weights['qstats'] = calculate_qsnrs(qweights, weights['ibits'], force_ideal=False) # store the statistics into the graph for later use anode.stats = stats @@ -65,11 +83,17 @@ def _collect(self, G, step_idx): if not self.matches_step(step_idx, pnode, fnode): continue - key = NodeId(pnode, fnode) + nid = NodeId(pnode, fnode) + if G.quantization and G.has_quantized_parameters: + qrec = G.quantization[nid] + else: + qrec = None + anode = pnode if fnode is None else fnode LOG.debug("collecting stats for %s step %s", anode.name, pnode.step_idx) if anode.__class__ in STATS_FUNCTIONS: - stats[key] = STATS_FUNCTIONS[anode.__class__](pnode, fnode, anode, channel_details=step_idx is not None) + stats[nid] = STATS_FUNCTIONS[anode.__class__]( + pnode, fnode, anode, channel_details=step_idx is not None, qrec=qrec) return stats diff --git 
a/tools/nntool/stats/step_error_stats_collector.py b/tools/nntool/stats/step_error_stats_collector.py index 98876d85d..77b562756 100644 --- a/tools/nntool/stats/step_error_stats_collector.py +++ b/tools/nntool/stats/step_error_stats_collector.py @@ -23,29 +23,28 @@ from utils.stats_funcs import qsnr from utils.node_id import NodeId -from execution.execute_graph import execute_qnoq_iterator -from execution.quantization_mode import QuantizationMode - -from graph.types import FilterParameters +from execution.graph_executer import GraphExecuter from .stats_collector import ReductionStatsCollector LOG = logging.getLogger('nntool.' + __name__) + class StepErrorStatsCollector(ReductionStatsCollector): - def __init__(self, limit=None): + def __init__(self, limit=None, quant_compare=False): super().__init__() self._limit = limit + self._quant_compare = quant_compare def _prepare(self, G): pass - def _collect_execution(self, G, tensors, qrecs): outputs = [] fusion_outputs = [] + executer = GraphExecuter(G, qrecs) for step_idx, node, output, details, qoutput, qdetails, fusion_node in\ - execute_qnoq_iterator(G, tensors, qrecs): + executer.execute_qnoq_iterator(tensors): output = [np.copy(out) for out in output] qoutput = [np.copy(out) for out in qoutput] @@ -77,17 +76,14 @@ def _collect_execution(self, G, tensors, qrecs): return outputs @staticmethod - def _collect_one(out): - fout = out['output'] - qout = out['qoutput'] - error_ = np.abs(fout[0] - qout[0]) + def _collect_one(out, qrec, quant_compare=False): + fout = out['output'][0] + if quant_compare: + fout = qrec.out_qs[0].dequantize(qrec.out_qs[0].quantize(fout)) + qout = out['qoutput'][0] + + error_ = np.abs(fout - qout) node = out['node'] - qdetails = out['qdetails'] - if qdetails: - overflow_dot = qdetails['overflow_dot'] - overflow_acc = qdetails['overflow_acc'] - else: - overflow_dot = overflow_acc = "" stat = { 'name': out['name'], @@ -96,9 +92,7 @@ def _collect_one(out): 'av_err': np.mean(error_), 'max_err': np.max(error_), 'min_err': np.min(error_), - 'qsnr': qsnr(fout[0], qout[0]), - 'overflow_dot' : overflow_dot, - 'overflow_acc' : overflow_acc, + 'qsnr': qsnr(fout, qout), 'chan_err': [] } @@ -108,7 +102,7 @@ def _collect_one(out): dim = node.out_dims[0] for i in range(dim.c): srange = dim.srange(c=i) - channel_error.append(np.average(fout[0][srange] - qout[0][srange])) + channel_error.append(np.average(fout[srange] - qout[srange])) stat['chan_err'] = channel_error return stat @@ -122,9 +116,15 @@ def _collect(self, G, input_tensors, step_idx) -> Mapping[NodeId, Mapping]: for out in outputs: if out['fusion_outputs']: for fout in out['fusion_outputs']: - stats[NodeId(out['node'], fout['node'])] =\ - self._collect_one(fout) - stats[NodeId(out['node'], None)] = self._collect_one(out) + nid = NodeId(out['node'], fout['node']) + stats[nid] =\ + self._collect_one(fout, + G.quantization[nid], + quant_compare=self._quant_compare) + nid = NodeId(out['node'], None) + stats[nid] = self._collect_one(out, + G.quantization[nid], + quant_compare=self._quant_compare) return stats @@ -141,8 +141,6 @@ def _reduce_prepare(self, all_stats): def _reduce(self, _, base: Mapping, stat: Mapping): for k in ['av_err', 'qsnr', 'chan_err']: base[k].append(stat[k]) - for k in ['overflow_dot', 'overflow_acc']: - base[k] += stat[k] for k in [('max_err', 'max_err')]: base[k[0]] = max(base[k[0]], abs(stat[k[1]])) for k in [('min_err', 'min_err')]: diff --git a/tools/nntool/tests/conftest.py b/tools/nntool/tests/conftest.py index ed02aee5f..844657c11 100644 --- 
a/tools/nntool/tests/conftest.py +++ b/tools/nntool/tests/conftest.py @@ -6,18 +6,24 @@ import numpy as np import pytest -from graph.dim import Conv2DFilterDim, Dim, FcFilterDim, PadDim, StrideDim -from graph.matches.matches import get_std_match_group +from graph.dim import Conv2DFilterDim, Dim, PadDim, StrideDim +from graph.matches.matches import get_pow2_match_group from graph.nngraph import NNGraph -from graph.types import Conv2DParameters, FcParameters, NNEdge +from graph.types import (Conv2DParameters, NNEdge, MatrixAddParameters, + ReluActivationParameters) from importer.importer import create_graph -from quantization.simple_auto_quantify import SimpleQuantizer +from quantization.symmetric.symmetric_quantizer import SymmetricQuantizer from stats.activation_stats_collector import ActivationStatsCollector from stats.filter_stats_collector import FilterStatsCollector from utils.data_importer import import_data -from utils.intermediate_cache import IntermediateCache from utils.new_param_state import dump_state from utils.sparse_list import SparseList +from utils.node_id import NodeId +from quantization.quantization_set import QuantizationSet +from quantization.multiplicative.mult_quantization import (MultScalableFilterQuantizationRecord, + MultAddQuantizationRecord) +from quantization.multiplicative.symmetric.symmetric_mult_biases_qtype import SymmetricMultBiasesQType +from quantization.multiplicative.symmetric.symmetric_mult_qtype import SymmetricMultQType MNIST_GRAPH = 'tests/graph/mnist_model.tflite' IR_GRAPH = 'tests/graph/ir_model.tflite' @@ -28,10 +34,16 @@ CONCAT_TEST_GRAPH = 'tests/graph/concat_test.tflite' QVISUAL_GRAPH = 'tests/graph/model_quantized.tflite' MN3_GRAPH = 'tests/graph/v3-large_224_1.0_float.tflite' +MN2_VWW_SYM_Q_GRAPH = "tests/mobv2_valid/mobv2_vww_quant_sym.tflite" MN3Q_GRAPH = 'tests/graph/v3-large_224_1.0_uint8.tflite' MN3Q2_GRAPH = 'tests/graph/mn3_large_quant_tf2_no_train.tflite' +MN2_GRAPH = 'tests/graph/vergesense_mnv2.tflite' +MN1Q_GRAPH = 'tests/graph/mobv1_quant.tflite' +MN1F_GRAPH = 'tests/graph/mobv1_float.tflite' +LPRNET_Q_GRAPH = 'tests/graph/lprnet.tflite' +SQUEEZE_GRAPH = 'tests/graph/squeezenet.tflite' -MNIST_IMAGES = glob('tests/images/*.pgm') +MNIST_IMAGES = glob('tests/images/136.pgm') VWW_IMAGES = glob('tests/vwwimages/*.png') def save_state(temp_dir, width, fusions=False, adjust=False): @@ -41,7 +53,7 @@ def save_state(temp_dir, width, fusions=False, adjust=False): if adjust: G.adjust_order() if fusions: - get_std_match_group().match(G) + get_pow2_match_group().match(G) G.add_dimensions() stats_collector = ActivationStatsCollector() for input_file in MNIST_IMAGES: @@ -52,7 +64,7 @@ def save_state(temp_dir, width, fusions=False, adjust=False): astats = stats_collector.reduce_stats() stats_collector = FilterStatsCollector() fstats = stats_collector.collect_stats(G) - quantizer = SimpleQuantizer(astats, fstats, force_width=width) + quantizer = SymmetricQuantizer(astats, fstats, force_width=width) qrecs = quantizer.quantize(G) G.quantization = qrecs dump_state(G, include_parameters=True, state_path=file_name) @@ -95,6 +107,54 @@ def two_conv_graph(): G.add_dimensions() yield G +@pytest.fixture() +def actfusion_graph(): + G = NNGraph(name='actfusion_graph') + ti1 = G.add_input(Dim.unnamed([10, 10, 2])).name + ti2 = G.add_input(Dim.unnamed([10, 10, 2])).name + c1filt = Conv2DFilterDim(3, 3, 2, in_c=2) + c1filt.impose_order(['out_c', 'h', 'w', 'in_c']) + n1 = Conv2DParameters("node1", + filt=c1filt, + stride=StrideDim(1, 1), + padding=PadDim(0), + 
in_dims_hint=SparseList([['h', 'w', 'c']]), + out_dims_hint=SparseList([['h', 'w', 'c']])) + G.add_node(n1) + w1 = [[0.25, 0.25], [0.25, 0.25], [0.25, 0.25]] + w1 = [w1, w1, w1] + w2 = [[0.75, 0.75], [0.75, 0.75], [0.75, 0.75]] + w2 = [w2, w2, w2] + n1.weights = np.array([w1, w2]) + n1a = ReluActivationParameters("node1a") + G.add_node(n1a) + c2filt = Conv2DFilterDim(3, 3, 2, in_c=2) + c2filt.impose_order(['out_c', 'h', 'w', 'in_c']) + n2 = Conv2DParameters("node2", + filt=c2filt, + stride=StrideDim(1, 1), + padding=PadDim(0), + in_dims_hint=SparseList([['h', 'w', 'c']]), + out_dims_hint=SparseList([['h', 'w', 'c']])) + G.add_node(n2) + w3 = [[0.75, 0.25], [0.75, 0.25], [0.75, 0.25]] + w3 = [w3, w3, w3] + n2.weights = np.array([w3, w3]) + n3 = MatrixAddParameters("node3") + G.add_node(n3) + n4 = ReluActivationParameters("node4") + G.add_node(n4) + to = G.add_output() + G.add_edge(NNEdge(ti1, n1)) + G.add_edge(NNEdge(n1, n1a)) + G.add_edge(NNEdge(ti2, n2)) + G.add_edge(NNEdge(n1a, n3, to_idx=0)) + G.add_edge(NNEdge(n2, n3, to_idx=1)) + G.add_edge(NNEdge(n3, n4)) + G.add_edge(NNEdge(n4, to)) + G.add_dimensions() + yield G + @pytest.fixture(scope="session") def mnist_unfused_16bit_state(): temp_dir = mkdtemp() @@ -113,12 +173,6 @@ def mnist_fused_8bit_state(): yield save_state(temp_dir, 8, True, True) rmtree(temp_dir) -@pytest.fixture(scope="session") -def value_cache(): - temp_dir = mkdtemp() - yield IntermediateCache(temp_dir) - rmtree(temp_dir) - @pytest.fixture(scope="session") def vww_graph(): yield VISUAL_GRAPH @@ -127,6 +181,10 @@ def vww_graph(): def qvww_graph(): yield QVISUAL_GRAPH +@pytest.fixture(scope="session") +def mobv2_symq_graph(): + yield MN2_VWW_SYM_Q_GRAPH + @pytest.fixture(scope="session") def mnist_graph(): yield MNIST_GRAPH @@ -163,6 +221,26 @@ def mn3q_graph(): def mn3q2_graph(): yield MN3Q2_GRAPH +@pytest.fixture(scope="session") +def mn2_graph(): + yield MN2_GRAPH + +@pytest.fixture(scope="session") +def mn1q_graph(): + yield MN1Q_GRAPH + +@pytest.fixture(scope="session") +def mn1f_graph(): + yield MN1F_GRAPH + +@pytest.fixture(scope="session") +def lprnet_graph(): + yield LPRNET_Q_GRAPH + +@pytest.fixture(scope="session") +def squeezenet_graph(): + yield SQUEEZE_GRAPH + @pytest.fixture(scope="session") def ir_images(): yield [ diff --git a/tools/nntool/tests/graph/mobv1_quant.tflite b/tools/nntool/tests/graph/mobv1_quant.tflite index 5745b810a582b89e763d510ba0ec6cac9ebcaacc..945bd1e88a31d4a1c15e82591d14d1d68993722d 100644 GIT binary patch delta 1011 zcma*ke@xVM9LMqZmpkS74x@A!PbeD4?M*3)*qNDZSW5W5e%!z z7RUI~)dG`&oK91tKX7X0mhxpN_r(P|nx?|mECgLtvyDG&1sl|&*CqYg)@OU{vwgn% zJU^d(zN6<(M~s4B=Hv)jBWvX~@>)6aimtb_buN=A*~SsZXxLkBBK|x_2bP|#z4sDP z^PBiWEi z577rXMBks{sK-n6hnJ&+$8H4-T{?(@Xo!JWh=X;Yhj>^I31EN?kOoK{{*(Gh{#}Y=JDuhOLkTxv&lLARjDH0ENJV6>Lxh3q@`_+ZA5t9d9xHyZ68L zWdyqiJShR2?1deQO@1+_BV+Zv6}103u+2~QaDd3~TAkIj@WzeG*(hH**SgS@C`|qJ zBimki!y;2hzAJIPuw;D9>L1@_7f0$XqTMLYIEIDtz;s^vFU^eIRvG0#VwLsR`LRNw zV2jx(Ol}RZHOt3Y!pT%ifBM&!2_Yc#jf^p?U&D_6oF!~9aAH}iR`eE}6LadunOggu z5d7j5W+n>u40w{2K#XrUS246u!?(@!N>j#aY3^gUwY^2lXUoT=S+P>;j4zYs@6Pjo z_MfnyX&3Di7hf$De*dkwHr6HGD?2FtdRb!KM-*$shbN?gPuEJ()rrzOOK14jik;SA zZk#W-;*RM_5qW=dE=PAc2U1kl+06jwnKoaMAy|2T0#nj;FxVDLXGk%K&FR#`IZ+>Y|&zAoX zLR~?9&{U;YYg=YQ`}CiLP*(;UlSYH_Ln`W5_r^AbaKn&0;pNbShWEquq4|AD>T+G{ bh)3)Vp{}5=1g-0Qm(2g?uPR!-&(8e~uU&9G delta 953 zcmcK1T}V@57zgn8%&$2=Hl1r~md=mTG&eo-V|%wVH?0McHV|}#8stryNYX_lq~;lm za;796i_o$Qp)8~=7sWm*EC>q0n<6z4yeTh|qClehZ=<`q?7;7N-uL0W|HI)8-n=0S zLO;*&B0h?b=41F+KF%W^8|tekgmj5e2nmJ^pfpAi@*P6^ag_D&lGF&Xgrr3i@&xbS 
zjSCNhDOl?<_r*l^sKHf~EzCEN(TF~Zy#tznVs;7Lg?a19OQ+l=hIo)b0wh8bB!d)W zkOHY7hcr;Y4p2fmWI!hDge=$vyCEBLKn1yw2l-F{dtfgVLJ{nPVkm+APzq|$KpB(+ z3tFgvN~nTr(19K(7{CY|n7ndV&C0p_55FQ0ew;AyhM6^n`uUC~bEsEd_!$&Fi_O_i zR%`|f)OuNQ-9oSG&)I*SD-cF!ELqM9z77s}EBJ%O4q-9gEK?BDZwj3vq^~y83CRh) zo#~y5x7wJ-ft4}3l+jLW+n#E8;!qdyBXm{QMu%h#^p3Af^J99#Fgd6**qrZ6tG|BH zcO;tVVnZw4nxj<<<<(`P%M*0wb}W^&D`@JqY|V7;D~(W+WG!U!BcE|5SdTEx*VjI( z|JSEVn%_xIvZQW<+m^UEe%YiQ{1(ODJNA$(q)W!mp4Xgu{S(`D=DsOCuij)?w3lyo zJ~!HieVpd%GIvGySl{++UfZH%OvlF^oJZAdT(eKIEyIVj!d#LyobG<%mItP6>wctr zknUEUkOn$>SNwMAEkDxT^QV1d;AJ|)?br)*#EZsVkq6SX;%iePxbjM cx-mb}-AMNc+GOi&_?puGc1_-Bsnx*z1~bb -2 and np.max(diffs[7]) < 2 -def test_graph_calc_quantize_one_2(value_cache, mnist_unfused_16bit_state, mnist_images): - G = load_state(mnist_unfused_16bit_state, value_cache=value_cache) +def test_graph_calc_quantize_one_2(mnist_unfused_16bit_state, mnist_images): + G = load_state(mnist_unfused_16bit_state) input_tensor = import_data(mnist_images[0], height=28, width=28, offset=0, divisor=255) input_tensor = input_tensor.reshape((28, 28, 1)) - output1 = execute(G, [input_tensor]) + executer = GraphExecuter(G, qrecs=G.quantization) + output1 = executer.execute([input_tensor]) input_tensor = import_data(mnist_images[0], height=28, width=28, offset=0, divisor=255) input_tensor = input_tensor.reshape((28, 28, 1)) - output2 = execute(G, [input_tensor], qmode=QuantizationMode.step(4), qrecs=G.quantization) + output2 = executer.execute([input_tensor], qmode=QuantizationMode.step(4)) diffs = [] for i, out1 in enumerate(output1): diffs.append(out1[0] - output2[i][0]) assert np.min(diffs[7]) > -2 and np.max(diffs[7]) < 2 +def test_graph_calc_quantized8_qnoq(mnist_unfused_8bit_state, mnist_images): + G = load_state(mnist_unfused_8bit_state) + input_tensor = import_data(mnist_images[0], height=28, width=28, offset=0, divisor=255) + input_tensor = input_tensor.reshape((28, 28, 1)) + executer = GraphExecuter(G, qrecs=G.quantization) + diffs = [] + for step_idx, pnode, output, details, qoutput, qdetails, fnode in\ + executer.execute_qnoq_iterator([input_tensor]): + del step_idx, pnode, details, qdetails, fnode + diffs.append(output[0] - qoutput[0]) + assert np.max(np.abs(diffs[7])) < 9 + def test_graph_execute_complex(ir_graph, ir_images): G = create_graph(ir_graph, opts={"load_tensors":True}) G.add_dimensions() input_tensor = import_data(ir_images[0], offset=0, divisor=255) input_tensor = input_tensor.reshape((80, 80, 1)) - execute(G, [input_tensor]) + executer = GraphExecuter(G) + executer.execute([input_tensor]) def test_graph_kws(kws_graph, kws_sounds): G = create_graph(kws_graph, opts={"load_tensors":True}) @@ -131,9 +133,10 @@ def test_graph_kws(kws_graph, kws_sounds): normal_steps = 0 fusion_steps = 0 # pylint: disable=unused-variable - for step_idx, step, node, output, fusion_op_name, fusion_params, details in\ - execute_iterator(G, [input_tensor]): - if fusion_op_name is not None: + executer = GraphExecuter(G) + for step_idx, node, fnode, output_tensors, details in\ + executer.execute_iterator([input_tensor]): + if fnode is not None: fusion_steps += 1 else: normal_steps += 1 @@ -143,7 +146,7 @@ def test_graph_kws_auto_quant(kws_graph, kws_sounds): G = create_graph(kws_graph, opts={"load_tensors":True}) G.add_dimensions() G.adjust_order() - get_std_match_group().match(G) + get_pow2_match_group().match(G) G.add_dimensions() stats_collector = ActivationStatsCollector() for input_file in kws_sounds: @@ -152,15 +155,36 @@ def test_graph_kws_auto_quant(kws_graph, kws_sounds): astats = 
stats_collector.reduce_stats() stats_collector = FilterStatsCollector() fstats = stats_collector.collect_stats(G) - quantizer = SimpleQuantizer(astats, fstats, force_width=16) + quantizer = SymmetricQuantizer(astats, fstats, force_width=16) + qrecs = quantizer.quantize(G) + G.quantization = qrecs + +def test_graph_imu_auto_quant_and_execute_quant(): + G = create_graph("tests/graph/imu.tflite", opts={"load_tensors":True}) + G.add_dimensions() + G.adjust_order() + get_pow2_match_group().match(G) + G.add_dimensions() + stats_collector = ActivationStatsCollector() + for input_file in ['tests/images/imu0.pgm']: + input_tensor = import_data(input_file, offset=0, divisor=256, nptype='int16') + stats_collector.collect_stats(G, [input_tensor]) + astats = stats_collector.reduce_stats() + stats_collector = FilterStatsCollector() + fstats = stats_collector.collect_stats(G) + quantizer = SymmetricQuantizer(astats, fstats, force_width=16) qrecs = quantizer.quantize(G) G.quantization = qrecs + executer = GraphExecuter(G, qrecs=qrecs) + for input_file in ['tests/images/imu0.pgm']: + input_tensor = import_data(input_file, offset=0, divisor=256, nptype='int16') + output_ = executer.execute([input_tensor], qmode=QuantizationMode.all()) def test_fake_values_concat(concat_test_graph): G = create_graph(concat_test_graph, opts={"load_tensors":True}) G.add_dimensions() G.adjust_order() - matcher = get_std_match_group() + matcher = get_pow2_match_group() matcher.match(G) G.add_dimensions() G.constant_store.fake = True @@ -169,7 +193,7 @@ def test_fake_values_concat(concat_test_graph): astats = stats_collector.reduce_stats() stats_collector = FilterStatsCollector() fstats = stats_collector.collect_stats(G) - quantizer = SimpleQuantizer(astats, fstats, force_width=8) + quantizer = SymmetricQuantizer(astats, fstats, force_width=8) qrecs = quantizer.quantize(G) G.quantization = qrecs with tempfile.TemporaryDirectory() as tempdir: @@ -177,7 +201,7 @@ def test_fake_values_concat(concat_test_graph): 'default_input_location': 'ARG_LOC_L2', 'default_output_location': 'ARG_LOC_L2', 'default_global_location': 'ARG_LOC_L3_HFLASH', - 'default_local_location': '0', + 'default_local_location': 'AT_MEM_UNDEF', 'at_ver': 3, 'tensor_directory': tempdir } @@ -185,8 +209,8 @@ def test_fake_values_concat(concat_test_graph): print(default_template(G, code_generator=code_gen)) code_gen.write_constants() -# This test requires make test_files to be run in the sample project -# directory. With the 8 bit config +# # This test requires make test_files to be run in the sample project +# # directory. 
With the 8 bit config def test_equivalence(mnist_graph, mnist_images): G = create_graph(mnist_graph, opts={"load_tensors":True}) @@ -195,7 +219,8 @@ def test_equivalence(mnist_graph, mnist_images): G.add_dimensions() input_tensor = import_data(mnist_images[0], height=28, width=28, divisor=255, offset=0, transpose=False) - output_ = execute(G, [input_tensor]) + executer = GraphExecuter(G) + output_ = executer.execute([input_tensor]) with open("tests/h5_pickles/weights.pickle", 'rb') as fp: verif_weights = pickle.load(fp) assert np.array_equal(verif_weights[0]['weights'], G.graph_state.steps[1]['node'].weights) diff --git a/tools/nntool/tests/test_fusions.py b/tools/nntool/tests/test_fusions.py index 1c2736d8a..a20c2d063 100644 --- a/tools/nntool/tests/test_fusions.py +++ b/tools/nntool/tests/test_fusions.py @@ -1,6 +1,11 @@ +import numpy as np +from PIL import Image + from importer.tflite.new_tflite_graph_all import TfliteImporter from graph.matches.match_gap_conv import MatchAllGapConv -from graph.matches.matches import get_std_match_group +from graph.matches.matches import get_pow2_match_group, get_scale8_match_group +from execution.graph_executer import GraphExecuter +from execution.quantization_mode import QuantizationMode def test_fusions1(mnist_graph): tfi = TfliteImporter() @@ -31,6 +36,26 @@ def test_fusions4(ssd_graph): tfi = TfliteImporter() G = tfi.create_graph(ssd_graph, {}) G.add_dimensions() - matcher = get_std_match_group() + matcher = get_pow2_match_group() + matcher.match(G) + G.add_dimensions() + +def test_external_biases_sq8(qvww_graph): + # this model has at the end an external biases layer as constant add + tfi = TfliteImporter() + G = tfi.create_graph(qvww_graph, {"load_quantization": True, "load_tensors": True}) + G.add_dimensions() + matcher = get_scale8_match_group() matcher.match(G) G.add_dimensions() + image = 'tests/vwwimages/COCO_val2014_000000174838_1.png' + img_in = Image.open(image) + img_in = img_in.resize((238, 208)) + input_tensor = np.array(img_in, dtype=np.uint8) + input_tensor = (input_tensor.astype(np.float32) - 128) / 128 + executer = GraphExecuter(G, qrecs=G.quantization) + # check if nntool can execute + qoutput_tensors = executer.execute([input_tensor], qmode=QuantizationMode.all_dequantize()) + foutput_tensors = executer.execute([input_tensor], qmode=None) + diff = [q[0]-f[0] for q,f in zip(qoutput_tensors, foutput_tensors)] + assert max([np.max(d) for d in diff]) < 2.2 diff --git a/tools/nntool/tests/test_generator.py b/tools/nntool/tests/test_generator.py index 30c126682..47c60d88c 100644 --- a/tools/nntool/tests/test_generator.py +++ b/tools/nntool/tests/test_generator.py @@ -2,52 +2,56 @@ import logging import os import tempfile +import numpy as np from generation.code_generator import CodeGenerator -from generation.code_generators import gen_conv_pool_relu +# from generation.code_generators import gen_conv_pool_relu from generation.default_template import default_template from generation.naming_convension import DefaultNamingConvension from utils.new_param_state import load_state -from utils.node_id import NodeId +from importer.tflite.new_tflite_graph_all import TfliteImporter +from graph.matches.matches import get_fusion, get_scale8_match_group +from quantization.multiplicative.mult_quantizer import MultQuantizer +from stats.activation_stats_collector import ActivationStatsCollector -def test_conv_pool_relu_kernel_gen(mnist_unfused_8bit_state): - G = load_state(mnist_unfused_8bit_state) - conv_params = G.graph_state.steps[1]['node'] - 
relu_params = G.graph_state.steps[2]['node'] - pool_params = G.graph_state.steps[3]['node'] - conv_q = G.quantization[NodeId(conv_params)] - pool_q = G.quantization[NodeId(pool_params)] - relu_q = G.quantization[NodeId(relu_params)] - code_block = gen_conv_pool_relu("Test", conv_params, conv_q, None, None, None, None) - assert str(code_block) ==\ -'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_NONE, 0, 0, 0, 0, 0, 0, 0, KOP_NONE);' - code_block = gen_conv_pool_relu("Test", conv_params, conv_q, pool_params, pool_q, relu_params, relu_q) - assert str(code_block) ==\ -'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_MAXPOOL, 2, 2, 1, 1, 2, 2, 0, KOP_RELU);' - code_block = gen_conv_pool_relu("Test", conv_params, conv_q, None, None, relu_params, relu_q) - assert str(code_block) ==\ -'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_NONE, 0, 0, 0, 0, 0, 0, 0, KOP_RELU);' - code_block = gen_conv_pool_relu("Test", conv_params, conv_q, pool_params, pool_q, None, None) - assert str(code_block) ==\ -'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_MAXPOOL, 2, 2, 1, 1, 2, 2, 0, KOP_NONE);' - code_block = gen_conv_pool_relu("Test", None, None, pool_params, pool_q, relu_params, relu_q) - assert str(code_block) ==\ -'CNN_PoolReLU("Test", 0, 1, 1, 6, 6, 1, 1, 32, 32, 24, 24,\n KOP_MAXPOOL, 2, 2, 1, 1, 2, 2, 0, KOP_RELU);' - code_block = gen_conv_pool_relu("Test", None, None, None, None, relu_params, relu_q) - assert str(code_block) ==\ -'CNN_PoolReLU("Test", 0, 1, 1, 6, 6, 1, 1, 32, 32, 24, 24,\n KOP_NONE, 0, 0, 0, 0, 0, 0, 0, KOP_RELU);' +# def test_conv_pool_relu_kernel_gen(mnist_unfused_8bit_state): +# G = load_state(mnist_unfused_8bit_state) +# conv_params = G.graph_state.steps[1]['node'] +# relu_params = G.graph_state.steps[2]['node'] +# pool_params = G.graph_state.steps[3]['node'] +# conv_q = G.quantization[NodeId(conv_params)] +# pool_q = G.quantization[NodeId(pool_params)] +# relu_q = G.quantization[NodeId(relu_params)] +# code_block = gen_conv_pool_relu("Test", conv_params, conv_q, None, None, None, None) +# assert str(code_block) ==\ +# 'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_NONE, 0, 0, 0, 0, 0, 0, 0, KOP_NONE);' +# code_block = gen_conv_pool_relu("Test", conv_params, conv_q, pool_params, pool_q, relu_params, relu_q) +# assert str(code_block) ==\ +# 'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_MAXPOOL, 2, 2, 1, 1, 2, 2, 0, KOP_RELU);' +# code_block = gen_conv_pool_relu("Test", conv_params, conv_q, None, None, relu_params, relu_q) +# assert str(code_block) ==\ +# 'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_NONE, 0, 0, 0, 0, 0, 0, 0, KOP_RELU);' +# code_block = gen_conv_pool_relu("Test", conv_params, conv_q, pool_params, pool_q, None, None) +# assert str(code_block) ==\ +# 'CNN_ConvolutionPoolReLU("Test", 0, 1, 1, 1, 1, 7, 7, 7, 6, 1, 1, 1, 1, 1, 32, 28, 28,\n KOP_CONV_DP, 5, 5, 1, 1, 1, 1, 0,\n KOP_MAXPOOL, 2, 2, 1, 1, 2, 2, 0, KOP_NONE);' +# code_block = gen_conv_pool_relu("Test", None, None, pool_params, pool_q, relu_params, 
relu_q) +# assert str(code_block) ==\ +# 'CNN_PoolReLU("Test", 0, 1, 1, 6, 6, 1, 1, 32, 32, 24, 24,\n KOP_MAXPOOL, 2, 2, 1, 1, 2, 2, 0, KOP_RELU);' +# code_block = gen_conv_pool_relu("Test", None, None, None, None, relu_params, relu_q) +# assert str(code_block) ==\ +# 'CNN_PoolReLU("Test", 0, 1, 1, 6, 6, 1, 1, 32, 32, 24, 24,\n KOP_NONE, 0, 0, 0, 0, 0, 0, 0, KOP_RELU);' def test_unfused_operational(caplog, mnist_unfused_8bit_state): - caplog.set_level(logging.INFO) + caplog.set_level(logging.DEBUG) G = load_state(mnist_unfused_8bit_state) opts = { 'default_input_location': 'ARG_LOC_L2', 'default_output_location': 'ARG_LOC_L2', 'default_global_location': 'ARG_LOC_L3_HFLASH', - 'default_local_location': '0', + 'default_local_location': 'AT_MEM_UNDEF', } code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) - default_template(G, code_generator=code_gen) + print(default_template(G, code_generator=code_gen)) def test_fused_operational(caplog, mnist_fused_8bit_state): caplog.set_level(logging.INFO) @@ -56,7 +60,7 @@ def test_fused_operational(caplog, mnist_fused_8bit_state): 'default_input_location': 'ARG_LOC_L2', 'default_output_location': 'ARG_LOC_L2', 'default_global_location': 'ARG_LOC_L3_HFLASH', - 'default_local_location': '0', + 'default_local_location': 'AT_MEM_UNDEF', } code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) default_template(G, code_generator=code_gen) @@ -68,16 +72,123 @@ def test_tensor_dump(mnist_fused_8bit_state): 'default_input_location': 'ARG_LOC_L2', 'default_output_location': 'ARG_LOC_L2', 'default_global_location': 'ARG_LOC_L3_HFLASH', - 'default_local_location': '0', + 'default_local_location': 'AT_MEM_UNDEF', 'tensor_directory': tempdir } code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) default_template(G, code_generator=code_gen) code_gen.write_constants() files_list = [f for f in os.listdir(tempdir) if os.path.isfile(os.path.join(tempdir, f))] - assert set(files_list) == set(['Step2Weights.tensor', - 'Step1Weights.tensor', - 'Step1Biases.tensor', - 'Step3Weights.tensor', - 'Step2Biases.tensor', - 'Step3Biases.tensor']) + assert set(files_list) == set(['S2_Weights.tensor', + 'S1_Weights.tensor', + 'S1_Biases.tensor', + 'S3_Weights.tensor', + 'S2_Biases.tensor', + 'S3_Biases.tensor']) + + +def test_gen_vergesense(caplog): + caplog.set_level(logging.DEBUG) + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/marco_17_04.tflite", {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_scale8_match_group() + matcher.match(G) + G.add_dimensions() + with tempfile.TemporaryDirectory() as tempdir: + opts = { + 'default_input_location': 'ARG_LOC_L2', + 'default_output_location': 'ARG_LOC_L2', + 'default_global_location': 'ARG_LOC_L3_HFLASH', + 'default_local_location': 'AT_MEM_UNDEF', + 'tensor_directory': tempdir + } + code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) + default_template(G, code_generator=code_gen) + code_gen.write_constants() + +def test_gen_mobv2_quant_from_keras(caplog): + caplog.set_level(logging.DEBUG) + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/model_quantized.tflite", {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_scale8_match_group() + matcher.match(G) + G.add_dimensions() + with tempfile.TemporaryDirectory() as tempdir: + opts = { + 'default_input_location': 'ARG_LOC_L2', + 'default_output_location': 'ARG_LOC_L2', + 'default_global_location': 'ARG_LOC_L3_HFLASH', + 
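+ # the patch switches 'default_local_location' from '0' to 'AT_MEM_UNDEF'
+ # throughout these generator tests; presumably this leaves local buffer
+ # placement to the AutoTiler memory allocator rather than pinning it to a
+ # fixed memory level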
'default_local_location': 'AT_MEM_UNDEF', + 'tensor_directory': tempdir + } + code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) + default_template(G, code_generator=code_gen) + code_gen.write_constants() + +def test_gen_ssdlite(caplog): + caplog.set_level(logging.DEBUG) + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/ssdlite_v2_quant_ocr_nopostprocess.tflite", {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_scale8_match_group() + matcher.match(G) + G.add_dimensions() + with tempfile.TemporaryDirectory() as tempdir: + opts = { + 'default_input_location': 'ARG_LOC_L2', + 'default_output_location': 'ARG_LOC_L2', + 'default_global_location': 'ARG_LOC_L3_HFLASH', + 'default_local_location': 'AT_MEM_UNDEF', + 'tensor_directory': tempdir + } + code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) + default_template(G, code_generator=code_gen) + code_gen.write_constants() + +def test_gen_mobv2_pool_with_actfusions(caplog): + caplog.set_level(logging.DEBUG) + tfi = TfliteImporter() + G = tfi.create_graph("tests/mobv2_valid/mobv2_vwwvehicle_quant_asym.tflite", {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_scale8_match_group() + matcher.match(G) + G.add_dimensions() + with tempfile.TemporaryDirectory() as tempdir: + opts = { + 'default_input_location': 'ARG_LOC_L2', + 'default_output_location': 'ARG_LOC_L2', + 'default_global_location': 'ARG_LOC_L3_HFLASH', + 'default_local_location': 'AT_MEM_UNDEF', + 'tensor_directory': tempdir + } + code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) + default_template(G, code_generator=code_gen) + code_gen.write_constants() + +def test_activatiofusion(actfusion_graph): + G = actfusion_graph + matcher = get_fusion('scale8_match_group') + matcher.match(G) + G.add_dimensions() + astat_col = ActivationStatsCollector() + astats = astat_col.collect_stats(G, [np.full([10, 10, 2], 1), np.full([10, 10, 2], 1)]) + astats = astat_col.reduce_stats() + quantizer = MultQuantizer(astats, force_width=8, quantized_dimension="channel") + G.quantization = quantizer.quantize(G) + with tempfile.TemporaryDirectory() as tempdir: + opts = { + 'default_input_location': 'ARG_LOC_L2', + 'default_output_location': 'ARG_LOC_L2', + 'default_global_location': 'ARG_LOC_L3_HFLASH', + 'default_local_location': 'AT_MEM_UNDEF', + 'tensor_directory': tempdir + } + code_gen = CodeGenerator(G, DefaultNamingConvension(G), opts) + ATModel_code = default_template(G, code_generator=code_gen) + #code_gen.write_constants() diff --git a/tools/nntool/tests/test_matcher.py b/tools/nntool/tests/test_matcher.py index 3a3fc587e..472b9a160 100644 --- a/tools/nntool/tests/test_matcher.py +++ b/tools/nntool/tests/test_matcher.py @@ -19,9 +19,8 @@ from utils.graph import Edge, Graph, Node from utils.graph_matcher import (GraphMatcher, MatchEdgeByIdx, MatchEdgeInputsGroupFactory, MatchNodeByName, - MatchNodeByNameSet, NodeMatch, MatchNodeByClass) -from graph.matches.matscale import MatScalePairMatchFactory, FuseMatScalePair, FuseMatScale, MatScaleNodeMatch -from graph.types import MatrixMulParameters + NodeMatch) +from graph.matches.matscale import MatScalePairMatchFactory, FuseMatScalePair, MatScaleNodeMatch def test_match1(): diff --git a/tools/nntool/tests/test_new_paramstate.py b/tools/nntool/tests/test_new_paramstate.py index 820e57b89..9ba1a3bc4 100644 --- a/tools/nntool/tests/test_new_paramstate.py +++ b/tools/nntool/tests/test_new_paramstate.py @@ 
-5,7 +5,7 @@ from importer.importer import create_graph from utils.data_importer import import_data from utils.new_param_state import load_state, dump_state -from quantization.simple_auto_quantify import SimpleQuantizer +from quantization.symmetric.symmetric_quantizer import SymmetricQuantizer from stats.activation_stats_collector import ActivationStatsCollector from stats.filter_stats_collector import FilterStatsCollector @@ -31,7 +31,7 @@ def test_graph_calc(mnist_graph, mnist_images): stats_collector = FilterStatsCollector() fstats = stats_collector.collect_stats(G) - quantizer = SimpleQuantizer(astats, fstats, force_width=8) + quantizer = SymmetricQuantizer(astats, fstats, force_width=8) qrecs = quantizer.quantize(G) G.quantization = qrecs diff --git a/tools/nntool/tests/test_nngraph.py b/tools/nntool/tests/test_nngraph.py index 8556e9d82..12aa65f0a 100644 --- a/tools/nntool/tests/test_nngraph.py +++ b/tools/nntool/tests/test_nngraph.py @@ -13,13 +13,19 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import logging + +import numpy as np + +from execution.graph_executer import GraphExecuter +from execution.quantization_mode import QuantizationMode from graph.manipulations import add_dimensions, calculate_liveness -from graph.matches.matches import get_std_match_group, get_fusion +from graph.matches.matches import get_fusion, get_pow2_match_group, get_scale8_match_group from graph.types import Parameters, Transposable from importer.tflite.new_tflite_graph_all import TfliteImporter from reports.graph_reporter import GraphReporter -from utils.tabular import TextTableRenderer from utils.node_id import NodeId +from utils.tabular import TextTableRenderer def verify_steps(steps, cnt): @@ -71,17 +77,25 @@ def test_load7(qvww_graph): assert G -def test_load10(): +def test_load8(mn2_graph): tfi = TfliteImporter() - G = tfi.create_graph("tests/graph/xor.tflite", {'load_tensors': True}) - steps = add_dimensions(G) + G = tfi.create_graph(mn2_graph, {'load_tensors': True, 'load_quantization': True}) + for node in G.nodes(): + assert NodeId(node) in G.quantization, "node %s doesn't have a qrec" % (node.name) assert G -def test_load12(): + +def test_load9(mn1q_graph): tfi = TfliteImporter() - G = tfi.create_graph("tests/graph/imu.tflite", {'load_tensors': True}) + G = tfi.create_graph(mn1q_graph, {'load_tensors': True, 'load_quantization': True}) + assert G + + +def test_load10(): + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/xor.tflite", {'load_tensors': True}) steps = add_dimensions(G) - verify_steps(steps, 8) + verify_steps(steps, 6) assert G @@ -92,6 +106,15 @@ def test_load11(): verify_steps(steps, 11) assert G + +def test_load12(): + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/imu.tflite", {'load_tensors': True}) + steps = add_dimensions(G) + verify_steps(steps, 8) + assert G + + def test_add_dimension1(mnist_graph): tfi = TfliteImporter() G = tfi.create_graph(mnist_graph, {}) @@ -243,14 +266,37 @@ def test_adjust5(kws_graph): assert all([not (node.transpose_in or node.transpose_out) for node in G.nodes() if isinstance(node, Transposable)]), "shouldn't have transposes" -# TODO - fix when balance filter is rewritten -# def test_adjust6(vww_graph): -# tfi = TfliteImporter() -# G = tfi.create_graph(vww_graph, {'load_tensors': True}) -# G.add_dimensions() -# G.adjust_order() -# G.balance_filter(32) +def test_adjust6(): + tfi = TfliteImporter() + try: + G = 
tfi.create_graph("tests/graph/character_recogniction_cnn_ocr.tflite", + {'load_tensors': True}) + # This graph has an insance concat which multiplies the output of a linear + # layer. It will never be supported. + G.add_dimensions() + error = False + G.adjust_order() + except NotImplementedError: + error = True + assert error + + +def test_adjust_new(): + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/ocr_cnn_notile_fquant.tflite", + {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + +def test_adjust_new2(): + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/ssdlite_v2_quant_ocr_nopostprocess.tflite", + {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G['output_1'].fixed_order = True + G['output_2'].fixed_order = True + G.adjust_order() def test_adjust7(concat_test_graph): tfi = TfliteImporter() @@ -260,7 +306,7 @@ def test_adjust7(concat_test_graph): G.node('output_2').fixed_order = True G.add_dimensions() G.adjust_order() - matcher = get_std_match_group() + matcher = get_pow2_match_group() matcher.match(G) G.add_dimensions() report = GraphReporter().report(G, None) @@ -278,11 +324,151 @@ def test_adjust8(qvww_graph): matcher.match(G) G.add_dimensions() -# def test_adjust9(mn3q2_graph): -# tfi = TfliteImporter() -# G = tfi.create_graph(mn3q2_graph, {'load_tensors': True}) -# G.add_dimensions() -# G.adjust_order() -# matcher = get_fusion("fuse_external_bias") -# matcher.match(G) -# G.add_dimensions() \ No newline at end of file + +def test_adjust9(mn3q_graph, caplog): + caplog.set_level(logging.INFO) + tfi = TfliteImporter() + G = tfi.create_graph(mn3q_graph, {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_scale8_match_group() + matcher.match(G) + G.add_dimensions() + + +def test_adjust10(caplog): + caplog.set_level(logging.INFO) + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/ssdlite_v2_quant_ocr_nopostprocess.tflite", + {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_scale8_match_group() + matcher.match(G) + G.add_dimensions() + + +def test_adjust11(): + tfi = TfliteImporter() + G = tfi.create_graph("tests/graph/imu.tflite", {'load_tensors': True}) + G.add_dimensions() + G.adjust_order() + assert all([not (node.transpose_in or node.transpose_out) + for node in G.nodes() if isinstance(node, Transposable)]), "shouldn't have transposes" + + +def test_validate_mn1_float(mn1f_graph): + tfi = TfliteImporter() + G = tfi.create_graph(mn1f_graph, {'load_tensors': True}) + G.add_dimensions() + matcher = get_pow2_match_group() + matcher.match(G) + G.add_dimensions() + input_tensor = np.load('tests/mobv1_valid/COCO_val2014_000000362331_0.npy') + input_tensor = input_tensor.reshape((224, 224, 3)) + executer = GraphExecuter(G, qrecs=G.quantization) + routput_tensors = executer.execute([input_tensor]) + output_tensor = np.load('tests/mobv1_valid/output_COCO_val2014_000000362331_0_float.npy') + assert np.max(np.abs(routput_tensors[-1][0] - output_tensor[0])) < 0.0001 + + +def test_min(mn1q_graph): + tfi = TfliteImporter() + G = tfi.create_graph(mn1q_graph, {'load_tensors': True, 'load_quantization': True}) + + +def test_validate_mn1_quantized1(mn1q_graph, mn1f_graph): + tfi = TfliteImporter() + Gf = tfi.create_graph(mn1f_graph, {'load_tensors': True}) + Gf.add_dimensions() + Gf.adjust_order() + matcher = get_pow2_match_group() + matcher.match(Gf) + Gf.add_dimensions() + + tfi = 
TfliteImporter() + G = tfi.create_graph(mn1q_graph, {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_pow2_match_group() + matcher.match(G) + G.add_dimensions() + + fpnode = Gf.graph_state.steps[2]['node'] + fpcnode = fpnode.contained_filters()[0] + qpnode = G.graph_state.steps[2]['node'] + qpcnode = qpnode.contained_filters()[0] + nid = NodeId(qpnode, qpcnode) + qrec = G.quantization[nid] + dqbiases = qrec.biases_q.get_dequantized(qpcnode.biases) + assert np.max(np.abs(fpcnode.biases - dqbiases)) < 0.1 + input_tensor = np.load('tests/mobv1_valid/COCO_val2014_000000362331_0.npy') + input_tensor = input_tensor.reshape((224, 224, 3)).transpose((2, 0, 1)) + + executer = GraphExecuter(Gf) + foutput_tensors = executer.execute([input_tensor]) + foutput_tensor = np.load('tests/mobv1_valid/output_COCO_val2014_000000362331_0_float.npy') + assert np.max(np.abs(foutput_tensors[-1][0] - foutput_tensor[0])) < 0.0001 + + executer = GraphExecuter(G, qrecs=G.quantization) + qfroutput_tensors = executer.execute([input_tensor], qmode=QuantizationMode.none()) + assert np.max(np.abs(qfroutput_tensors[-1][0] - foutput_tensor[0])) < 0.2 + + executer = GraphExecuter(G, qrecs=G.quantization) + qroutput_tensors = executer.execute([input_tensor], qmode=QuantizationMode.all_dequantize()) + + output_tensor = np.load('tests/mobv1_valid/output_COCO_val2014_000000362331_0_quant.npy') + # assert np.max(np.abs(qroutput_tensors[-1][0] - output_tensor[0])) < 0.16 + assert np.max(np.abs(qroutput_tensors[-1][0] - output_tensor[0])) < 0.28 + + +def test_validate_mn1_quantized2(mn1q_graph): + tfi = TfliteImporter() + G = tfi.create_graph(mn1q_graph, {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_pow2_match_group() + matcher.match(G) + G.add_dimensions() + + +def test_validate_mn1_dequant_quantfloat(mn1q_graph): + # load dequantized graph same results as quant graph and float execution + tfi = TfliteImporter() + G = tfi.create_graph(mn1q_graph, {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_pow2_match_group() + matcher.match(G) + G.add_dimensions() + + Gdq = tfi.create_graph(mn1q_graph, {'load_tensors': True, 'load_dequantized': True}) + Gdq.add_dimensions() + Gdq.adjust_order() + matcher = get_pow2_match_group() + matcher.match(Gdq) + Gdq.add_dimensions() + + input_tensor = np.load('tests/mobv1_valid/COCO_val2014_000000362331_0.npy') + input_tensor = input_tensor.reshape((224, 224, 3)).transpose((2, 0, 1)) + + executer = GraphExecuter(G, qrecs=G.quantization) + qfoutput_tensors = executer.execute([input_tensor], qmode=QuantizationMode.none()) + + executer = GraphExecuter(Gdq) + dfoutput_tensors = executer.execute([input_tensor]) + + diff_list = [np.abs(df[0] - qf[0]) for df, qf in zip(dfoutput_tensors, qfoutput_tensors)] + max_diff = [np.max(elem) for elem in diff_list] + assert max(max_diff) < 0.003 + + +def test_mobv2_quant_asym_tf1_15_vwwvehicle(): + graph = 'tests/mobv2_valid/mobv2_vwwvehicle_quant_asym.tflite' + tfi = TfliteImporter() + G = tfi.create_graph(graph, {'load_tensors': True, 'load_quantization': True}) + G.add_dimensions() + G.adjust_order() + matcher = get_scale8_match_group() + matcher.match(G) + G.add_dimensions() diff --git a/tools/nntool/tests/test_quantize.py b/tools/nntool/tests/test_quantize.py index ddc932e81..039085e8d 100644 --- a/tools/nntool/tests/test_quantize.py +++ b/tools/nntool/tests/test_quantize.py @@ -1,19 +1,21 @@ 
-import numpy as np -from utils.data_importer import import_data -from utils.stats_funcs import bits +# Copyright (C) 2020 GreenWaves Technologies, SAS -IMAGE_FILE = "examples/0/136.pgm" +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. -# def test_quantization(): -# arr = np.array([0.5]) -# qarr = NumpyFloatToFixConverter(True, 16, 13)(arr) -# dqarr = NumpyFixToFloatConverter(13)(qarr) -# assert dqarr == arr +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . -# def test_image_scale(): -# input_tensor = import_data(IMAGE_FILE, height=28, width=28, divisor=128, offset=-1) -# qinput_tensor = quantize(input_tensor, QType(bits=8, q=7, signed=True)) -# print(qinput_tensor[0][5]) +from utils.stats_funcs import bits + +IMAGE_FILE = "examples/0/136.pgm" def test_bits(): assert bits(0.081599854, -0.07628916) == 1 diff --git a/tools/nntool/tests/test_reports.py b/tools/nntool/tests/test_reports.py index 59cce4913..debafdade 100644 --- a/tools/nntool/tests/test_reports.py +++ b/tools/nntool/tests/test_reports.py @@ -15,7 +15,7 @@ from importer.importer import create_graph -from quantization.simple_auto_quantify import SimpleQuantizer +from quantization.symmetric.symmetric_quantizer import SymmetricQuantizer from reports.activation_reporter import ActivationReporter from reports.error_reporter import ErrorReporter from reports.filter_reporter import (FilterDetailedStatsReporter, @@ -61,8 +61,8 @@ def test_filter_detailed_report(mnist_graph): renderer = TextTableRenderer(maxwidth=200) print(report.render(renderer)) -def test_error_report(value_cache, mnist_unfused_8bit_state, mnist_images): - G = load_state(mnist_unfused_8bit_state, value_cache=value_cache) +def test_error_report(mnist_unfused_8bit_state, mnist_images): + G = load_state(mnist_unfused_8bit_state) G.add_dimensions() input_tensor = import_data(mnist_images[0], height=28, width=28, offset=0, divisor=255) input_tensor = input_tensor.reshape((28, 28, 1)) @@ -82,8 +82,8 @@ def test_temps_report(mnist_graph): renderer = TextTableRenderer(maxwidth=200) print(report.render(renderer)) -def test_temps_report_quantized(value_cache, mnist_unfused_8bit_state): - G = load_state(mnist_unfused_8bit_state, value_cache=value_cache) +def test_temps_report_quantized(mnist_unfused_8bit_state): + G = load_state(mnist_unfused_8bit_state) G.add_dimensions() stats_collector = TempsStatsCollector(qrecs=G.quantization) stats = stats_collector.collect_stats(G) @@ -108,7 +108,7 @@ def test_simple_quantization(mnist_graph, mnist_images): astats = stats_collector.reduce_stats() stats_collector = FilterStatsCollector() fstats = stats_collector.collect_stats(G) - quantizer = SimpleQuantizer(astats, fstats, force_width=8) + quantizer = SymmetricQuantizer(astats, fstats, force_width=8) qrecs = quantizer.quantize(G) assert len(qrecs) == 11 # One more for saved quantizer report = QuantizationReporter().report(G, qrecs) diff --git a/tools/nntool/tests/test_sparse_list.py b/tools/nntool/tests/test_sparse_list.py index 6a2cf1622..4a96932c3 100644 --- 
a/tools/nntool/tests/test_sparse_list.py +++ b/tools/nntool/tests/test_sparse_list.py @@ -16,17 +16,17 @@ from utils.sparse_list import SparseList def test1(): - sl = SparseList() - sl[2] = True - assert sl[1] is None - assert len(sl) == 3 - assert sl[2] == True - sl[5] = False - assert len(sl) == 6 - assert sl[5] == False - del sl[2] - assert len(sl) == 5 - assert sl[2] is None - assert sl[4] == False - tl = [v for v in sl] - assert tl == [None, None, None, None, False] + sparse_list = SparseList() + sparse_list[2] = True + assert sparse_list[1] is None + assert len(sparse_list) == 3 + assert sparse_list[2] + sparse_list[5] = False + assert len(sparse_list) == 6 + assert not sparse_list[5] + del sparse_list[2] + assert len(sparse_list) == 5 + assert sparse_list[2] is None + assert not sparse_list[4] + iter_sparse_list = [v for v in sparse_list] + assert iter_sparse_list == [None, None, None, None, False] diff --git a/tools/nntool/utils/add_sys_path.py b/tools/nntool/utils/add_sys_path.py new file mode 100644 index 000000000..b12d1774f --- /dev/null +++ b/tools/nntool/utils/add_sys_path.py @@ -0,0 +1,16 @@ +import sys +import os + + +def add_sys_path(new_path): + + if not os.path.exists(new_path): + return False + + new_path = os.path.abspath(new_path) + for x in sys.path: + x = os.path.abspath(x) + if new_path in (x, x + os.sep): + return True + sys.path.append(new_path) + return True diff --git a/tools/nntool/utils/at_norm.py b/tools/nntool/utils/at_norm.py new file mode 100644 index 000000000..4860d83e6 --- /dev/null +++ b/tools/nntool/utils/at_norm.py @@ -0,0 +1,43 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
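+# at_norm(val, norm) below is an arithmetic right shift by `norm` with optional
+# round-to-nearest behaviour, toggled globally via set_do_rounding()/get_do_rounding().
+# Illustrative example (hypothetical values): with rounding enabled
+# at_norm(120, 4) == (120 + (1 << 3)) >> 4 == 8, whereas after set_do_rounding(False)
+# at_norm(120, 4) == 120 >> 4 == 7.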
+ +import numpy as np + +class Rounding: + DO_ROUNDING = True + +def set_do_rounding(do_rounding: bool): + Rounding.DO_ROUNDING = do_rounding + +def get_do_rounding(): + return Rounding.DO_ROUNDING + +def at_norm(val, norm): + if isinstance(norm, np.ndarray): + if np.any(norm < 0): + raise ValueError("negative normalization") + if Rounding.DO_ROUNDING: + return (val + np.left_shift(1, norm - 1, dtype=val.dtype)) >> norm + # broadcast = np.broadcast(val, norm) + # res = np.empty(broadcast.shape, dtype=val.dtype) + # res.flat = [(v + (1 << n - 1)) >> n if n > 0 else v for v, n in broadcast] + # return res + return val >> norm + else: + if norm < 0: + raise ValueError("negative normalization") + if Rounding.DO_ROUNDING and norm > 0: + return (val + (1 << (norm - 1))) >> norm + return val >> norm diff --git a/tools/nntool/utils/at_tensor_loader.py b/tools/nntool/utils/at_tensor_loader.py new file mode 100644 index 000000000..37a652d26 --- /dev/null +++ b/tools/nntool/utils/at_tensor_loader.py @@ -0,0 +1,190 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging +import re + +import numpy as np +from generation.generators.globals.global_names import BIASES, WEIGHTS +from graph.types import InputParameters, OutputParameters + +LOG = logging.getLogger("nntool." 
+ __name__) + + +def find_next_dim(shape, next_idx): + for i in range(len(shape) - 1, -1, -1): + if shape[i] != next_idx[i]: + return i + return None + + +def at_tensor_loader(filename): + re_head = re.compile( + r'^Node: (?P<node_name>[a-zA-Z_][a-zA-Z0-9_]*)' + r', Argument: (?P<arg_name>[a-zA-Z_][a-zA-Z0-9_]*)' + r', Dim: (?P<dims>\d+)' + r', \[(?P<d0>\d+)\]\[(?P<d1>\d+)\]\[(?P<d2>\d+)\]\[(?P<d3>\d+)\]\[(?P<d4>\d+)\]' + r' ItemSize: (?P<item_size>\d+)$') + re_cont = re.compile( + r'^D(?P<dim>\d+): (?P<dim_idx>\d+)$') + re_last2 = re.compile( + r'^D(?P<dim_pen>\d+): (?P<dim_pen_start>\d+)' + r' - D(?P<dim_last>\d+):(?P<dim_last_start>\d+)\.\.(?P<dim_last_end>\d+)$') + re_last1 = re.compile( + r'^D(?P<dim_last>\d+):(?P<dim_last_start>\d+)\.\.(?P<dim_last_end>\d+)$') + state = 'start' + tensors = {} + line_num = -1 + with open(filename, "r") as pfile: + for line in pfile: + line_num += 1 + if state == 'start': + match = re_head.search(line) + if not match: + continue + header = match.group('node_name', 'arg_name', 'dims', 'd0', + 'd1', 'd2', 'd3', 'd4', 'item_size') + shape = [int(i) for i in [header[3], header[4], header[5], + header[6], header[7]]] + dims = int(header[2]) + shape = shape[(5-dims):] + next_idx = [-1] * dims + node_name = header[0] + node_tensors = tensors.get(node_name) + if not node_tensors: + node_tensors = {} + tensors[node_name] = node_tensors + arg_name = header[1] + item_size = int(header[8]) + dims_read = 0 + value = [] + state = 'read_dims' + elif state == 'read_dims' and dims >= 2 and (dims - dims_read) == 2: + match = re_last2.search(line) + if not match: + state = 'start' + node_tensors[arg_name] = '[%s] bad_tensor - expecting last dim' % line_num + LOG.warning(node_tensors[arg_name]) + continue + header = [int(elem) for elem in match.group('dim_pen', 'dim_pen_start', 'dim_last', + 'dim_last_start', 'dim_last_end')] + if header[0] != dims_read or header[2] != dims_read + 1: + node_tensors[arg_name] = '[%s] bad_tensor - wrong dim' % line_num + LOG.warning(node_tensors[arg_name]) + state = 'start' + continue + next_idx[header[0]] = header[1] + 1 + next_idx[header[2]] = header[4] + cur_block_len = header[4] - header[3] + cur_block_read = 0 + state = 'read_data' + elif state == 'read_dims' and dims == 1 and dims_read == 0: + match = re_last1.search(line) + if not match: + state = 'start' + node_tensors[arg_name] = '[%s] bad_tensor - expecting last dim' % line_num + LOG.warning(node_tensors[arg_name]) + continue + header = [int(elem) for elem in match.group( + 'dim_last', 'dim_last_start', 'dim_last_end')] + if header[0] != dims_read: + node_tensors[arg_name] = '[%s] bad_tensor - wrong dim' % line_num + LOG.warning(node_tensors[arg_name]) + state = 'start' + continue + next_idx[header[0]] = header[2] + cur_block_len = header[2] - header[1] + cur_block_read = 0 + state = 'read_data' + elif state == 'read_dims': + match = re_cont.search(line) + if not match: + node_tensors[arg_name] = '[%s] bad_tensor - expecting dim' % line_num + LOG.warning(node_tensors[arg_name]) + state = 'start' + continue + header = [int(elem) for elem in match.group('dim', 'dim_idx')] + if header[0] != dims_read: + node_tensors[arg_name] = '[%s] bad_tensor - wrong dim' % line_num + LOG.warning(node_tensors[arg_name]) + state = 'start' + continue + next_idx[header[0]] = header[1] + 1 + dims_read += 1 + elif state == 'read_data': + read = 0 + try: + for i in line.split(): + value.append(i) + read += 1 + except ValueError: + node_tensors[arg_name] = '[%s] bad_tensor - read error' % line_num + LOG.warning(node_tensors[arg_name]) + state = 'start' + continue + + cur_block_read += read + + if cur_block_read == cur_block_len: + state = 'read_dims' + dims_read = 
find_next_dim(shape, next_idx) + if dims_read is None: + state = 'start' + if arg_name in node_tensors.keys(): + node_tensors[arg_name] = np.concatenate((node_tensors[arg_name], + np.array(value, dtype=np.dtype('i'+str(item_size))).reshape(shape)), axis=0) + else: + node_tensors[arg_name] = np.array( + value, dtype=np.dtype('i'+str(item_size))).reshape(shape) + else: + state = 'read_dims' + elif cur_block_read > cur_block_len: + node_tensors[arg_name] = '[%s] bad_tensor - too long' % line_num + LOG.warning(node_tensors[arg_name]) + state = 'start' + continue + return tensors + +def at_map_tensors(G, tensors): + re_snum = re.compile( + r'^S(?P<step>\d+)_') + steps = G.graph_state.steps + result = [[None, None, None] for _ in steps] + for cname, tset in tensors.items(): + match = re_snum.search(cname) + if not match: + raise ValueError("tensor name in unexpected format") + step_idx = int(match.group('step')) + node = steps[step_idx]['node'] + for tname, tensor in tset.items(): + tname = tname.lower() + if tname.startswith('input'): + for edge in G.in_edges(node.name): + if isinstance(edge.from_node, InputParameters): + result[edge.from_node.step_idx][0] = tensor.reshape(node.in_dims[0].shape) + break + elif tname.startswith('output'): + for edge in G.out_edges(node.name): + if isinstance(edge.to_node, OutputParameters): + result[edge.to_node.step_idx][0] = tensor.reshape(node.out_dims[0].shape) + break + result[step_idx][0] = tensor.reshape(node.out_dims[0].shape) + elif tname == "s%s_output"%step_idx: + result[step_idx][0] = tensor.reshape(node.out_dims[0].shape) + elif tname.endswith(WEIGHTS): + result[step_idx][1] = tensor + elif tname.endswith(BIASES): + result[step_idx][2] = tensor + return result diff --git a/tools/nntool/utils/data_importer.py b/tools/nntool/utils/data_importer.py index 1b91de880..f162f748a 100644 --- a/tools/nntool/utils/data_importer.py +++ b/tools/nntool/utils/data_importer.py @@ -18,6 +18,7 @@ import numpy as np from PIL import Image +from utils.at_norm import at_norm LOG = logging.getLogger('nntool.'+__name__) @@ -35,43 +36,18 @@ 'F': 1, # (32-bit floating point pixels) } -VALID_IMAGE_EXTENSIONS = ['.pgm', '.png', '.ppm'] +VALID_IMAGE_EXTENSIONS = ['.pgm', '.png', '.ppm', '.jpg', '.jpeg'] VALID_SOUND_EXTENSIONS = ['.raw', '.pcm'] +VALID_DATA_IMPORT_EXTENSIONS = ['.npy'] -def import_image_data(filename, **kwargs): - img_in = Image.open(filename) - if 'width' not in kwargs or kwargs['width'] == -1: - width = img_in.width - else: - width = kwargs['width'] - - if 'height' not in kwargs or kwargs['height'] == -1: - height = img_in.height - else: - height = kwargs['height'] - - if width != img_in.width or height != img_in.height: - img_in = img_in.resize((width, height)) - - if 'mode' in kwargs: - img_in.convert(mode=kwargs['mode']) - - if 'nptype' in kwargs: - nptype = getattr(np, kwargs['nptype']) - else: - nptype = np.uint8 - - channels = MODES[img_in.mode] - # TODO - this needs to be smarter for different image pixel types - img_in = np.array(img_in, dtype=nptype) - +def postprocess(img_in, h, w, c, **kwargs): if kwargs.get('transpose'): - if channels == 1: - img_in = img_in.transpose((1, 0)).reshape((channels, height, width)) + if c == 1: + img_in = img_in.transpose((1, 0)).reshape((c, h, w)) else: img_in = img_in.transpose((2, 0, 1)).copy() - elif channels == 1: - img_in = img_in.reshape((channels, width, height)) + elif c == 1: + img_in = img_in.reshape((c, w, h)) divisor = kwargs.get('divisor') or 1 offset = kwargs.get('offset') or 0 @@ -79,7 +55,7 @@ def 
import_image_data(filename, **kwargs): if shift: if shift < 0: - img_in = img_in >> int(-shift) + img_in = at_norm(img_in, int(-shift)) else: img_in = img_in << int(shift) @@ -98,14 +74,51 @@ def import_image_data(filename, **kwargs): return img_in +def import_image_data(filename, **kwargs): + img_in = Image.open(filename) + if 'width' not in kwargs or kwargs['width'] == -1: + width = img_in.width + else: + width = kwargs['width'] + + if 'height' not in kwargs or kwargs['height'] == -1: + height = img_in.height + else: + height = kwargs['height'] + + if width != img_in.width or height != img_in.height: + img_in = img_in.resize((width, height)) + + if 'mode' in kwargs: + img_in = img_in.convert(mode=kwargs['mode']) + + if 'nptype' in kwargs: + nptype = getattr(np, kwargs['nptype']) + else: + nptype = np.uint8 + + channels = MODES[img_in.mode] + # TODO - this needs to be smarter for different image pixel types + img_in = np.array(img_in, dtype=nptype) + return postprocess(img_in, height, width, channels, **kwargs) + +def import_tensor_data(filename, **kwargs): + img_in = np.load(filename) + if len(img_in.shape) == 4 and img_in.shape[0] == 1: + img_in = img_in.reshape(img_in.shape[1:]) + return postprocess(img_in, img_in.shape[0], img_in.shape[1], img_in.shape[2], **kwargs) + def import_sound_data(filename, **kwargs): raise NotImplementedError() def import_data(filename, **kwargs): _, ext = os.path.splitext(filename) + ext = ext.lower() if ext in VALID_IMAGE_EXTENSIONS: return import_image_data(filename, **kwargs) if ext in VALID_SOUND_EXTENSIONS: return import_sound_data(filename, **kwargs) + if ext in VALID_DATA_IMPORT_EXTENSIONS: + return import_tensor_data(filename, **kwargs) LOG.debug("no import tool for file %s with extension %s", filename, ext) raise NotImplementedError('unknown file extension for import data') diff --git a/tools/nntool/utils/exp_17_15.py b/tools/nntool/utils/exp_17_15.py new file mode 100644 index 000000000..a54d6a1a7 --- /dev/null +++ b/tools/nntool/utils/exp_17_15.py @@ -0,0 +1,95 @@ +# #define Abs(a) (((int)(a)<0)?(-(a)):(a)) +# #define Min(a, b) (((a)<(b))?(a):(b)) +# #define Max(a, b) (((a)>(b))?(a):(b)) + +import numpy as np + +# static unsigned short int IntegerExpLUT[] = +# { +INTEGER_EXP_LUT = np.array([0x0001, 0x0002, 0x0007, 0x0014, 0x0036, 0x0094, + 0x0193, 0x0448, 0x0BA4, 0x1FA7, 0x560A, 0xE9E2], dtype=np.uint16) +# }; + +# static unsigned short int FractionExpLUT[] = +# { +FRACTION_EXP_LUT = np.array([0x0000, 0x5BF1, 0x31CD, 0x0AF3, 0x4C90, 0x34E2, + 0x36E3, 0x510B, 0x7A9F, 0x0ABE, 0x3B9F, 0x1224], dtype=np.uint16) +# }; + +# /* 17.15 fixed point format */ +# static unsigned short int ExpCoeffLUT[] = +# { +EXP_COEFF_LUT = np.array([0x7FFF, 0x7FFF, 0x4000, 0x1555, 0x0555, 0x0111, 0x002E, 0x0007, 0x0001]) +# }; + + +def gap_bitextractu(x, size, off): + mask = (np.array([1], dtype=np.uint32) << size) - 1 + return (x >> off) & mask + + +def gap_mulsRN(x, y, n): + rounding = np.array([1], dtype=np.int32) << (n - 1) + return (np.multiply(x.astype(np.int16), + y.astype(np.int16), dtype=np.int32) + rounding) >> n + + +def gap_mulRN(x, y, n): + rounding = np.array([1], dtype=np.int32) << (n - 1) + return (np.multiply(x.astype(np.uint16), + y.astype(np.uint16), dtype=np.int32) + rounding) >> n + + +def gap_roundnorm(x, scale): + rounding = np.array([1], dtype=np.int32) << (scale - 1) + return (x.astype(np.int32) + rounding) >> scale + + +def exp_fp_17_15(X): + X = X.astype(np.uint32) + result = np.zeros(X.shape, dtype=np.int32) + zero_mask = X == 0 + 
result[zero_mask] = 0x8000 + non_zero_mask = np.logical_not(zero_mask) + Y = np.ndarray(X.shape, dtype=np.int32) + Y[non_zero_mask] = np.abs(X[non_zero_mask].astype(np.int32)) + + int_x = np.ndarray(X.shape, dtype=np.int32) + int_x[non_zero_mask] = Y[non_zero_mask] >> 15 + + overflow_mask = np.logical_and(int_x >= (len(INTEGER_EXP_LUT) - 1), non_zero_mask) + result[np.logical_and(X == Y, overflow_mask)] = 0x7FFFFFF + result[np.logical_and(X != Y, overflow_mask)] = 0 + + non_zero_mask[overflow_mask] = False + + fract_x = np.ndarray(X.shape, dtype=np.int32) + fract_x[non_zero_mask] = Y[non_zero_mask] & 0x7FFF + + bit_extract_mask = np.logical_and(non_zero_mask, gap_bitextractu(fract_x, 1, 14)) + fract_x[bit_extract_mask] -= 0x8000 + int_x[bit_extract_mask] += 1 + + scaled_int = np.ndarray(X.shape, dtype=np.int32) + scaled_int[non_zero_mask] = INTEGER_EXP_LUT[int_x[non_zero_mask]] + + scaled_fract = np.ndarray(X.shape, dtype=np.uint16) + scaled_fract[non_zero_mask] = FRACTION_EXP_LUT[int_x[non_zero_mask]] + + fract_x_s = fract_x.astype(np.int16) + z_s = fract_x.astype(np.int16) + for i in range(1, len(EXP_COEFF_LUT)): + result[non_zero_mask] += z_s[non_zero_mask].astype(np.int32) * EXP_COEFF_LUT[i] + z_s[non_zero_mask] = gap_mulsRN(z_s[non_zero_mask], fract_x_s[non_zero_mask], 15) + + result[non_zero_mask] = gap_roundnorm(result[non_zero_mask], 15) + EXP_COEFF_LUT[0] + + unsigned_res = result.astype(np.uint16) + + result[non_zero_mask] = (gap_mulRN(unsigned_res[non_zero_mask], + scaled_fract[non_zero_mask], 15)\ + + unsigned_res[non_zero_mask] * scaled_int[non_zero_mask]) + neg_mask = np.logical_and(np.logical_and(non_zero_mask, result != 0), X > 0x7FFFFFFF) + result[neg_mask] = ((0x7FFFFFFF)//result[neg_mask]) >> 1 + + return result.astype(np.uint32) diff --git a/tools/nntool/utils/formatters.py b/tools/nntool/utils/formatters.py new file mode 100644 index 000000000..b9a092e38 --- /dev/null +++ b/tools/nntool/utils/formatters.py @@ -0,0 +1,37 @@ +import numpy as np +from graph.dim import Dim + +def rgb565_rgb888(input_tensor: np.ndarray, in_dim: Dim, out_dim: Dim): + assert in_dim.is_named and in_dim.c == 1 and out_dim.is_named and out_dim.c == 3 + input_tensor = np.repeat(input_tensor.transpose(in_dim.transpose_to_order(("h", "w", "c"))), 3, axis=2) + input_tensor[:, :, 1] = (input_tensor[:, :, 0] & (63 << 5)) >> 3 + input_tensor[:, :, 2] = (input_tensor[:, :, 0] & 31) << 3 + input_tensor[:, :, 0] = (input_tensor[:, :, 0] & (31 << 11)) >> 8 + return input_tensor.astype(np.uint8).transpose(out_dim.transpose_from_order(("h", "w", "c"))) + +def from_hwc(input_tensor: np.ndarray, in_dim: Dim, out_dim: Dim): + del in_dim + return input_tensor.astype(np.uint8).transpose(out_dim.transpose_from_order(("h", "w", "c"))) + +def out_int16(input_tensor: np.ndarray): + return input_tensor.astype(np.int16) << 7 + +def shift_int8(input_tensor): + return (input_tensor >> 1).astype(np.int8) + +def offset_int8(input_tensor): + return (input_tensor.astype(np.int16) - 128).astype(np.int8) + +FORMAT_CHANGES = { + "RGB565_RGB888": rgb565_rgb888, + "RGB888": from_hwc, + "RGB16": from_hwc, + "BW8": from_hwc, + "BW16": from_hwc +} + +NORMALIZATIONS = { + "SHIFT_INT8": shift_int8, + "OFFSET_INT8": offset_int8, + "OUT_INT16": out_int16 +} diff --git a/tools/nntool/utils/fuzzy.py b/tools/nntool/utils/fuzzy.py new file mode 100644 index 000000000..e1753e967 --- /dev/null +++ b/tools/nntool/utils/fuzzy.py @@ -0,0 +1,61 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can 
redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +class Fuzzy(): + def __init__(self, val): + self._val = val + + @classmethod + def maybe_yes(cls, maybe=False): + if maybe: + return cls.maybe + return cls.yes + + @classmethod + def yes(cls): + return cls("yes") + + @classmethod + def no(cls): + return cls("no") + + @classmethod + def maybe(cls): + return cls("maybe") + + @property + def is_yes(self): + return self._val == "yes" + + @property + def is_maybe(self): + return self._val == "maybe" + + @property + def is_no(self): + return self._val == "no" + + @classmethod + def all(cls, gen): + is_maybe = False + for elem in gen: + if elem.is_no: + return cls.no() + if elem.is_maybe: + is_maybe = True + return cls.maybe_yes(maybe=is_maybe) + + def __bool__(self): + return self.is_maybe or self.is_yes diff --git a/tools/nntool/utils/graph.py b/tools/nntool/utils/graph.py index b282ce648..2d9f899af 100644 --- a/tools/nntool/utils/graph.py +++ b/tools/nntool/utils/graph.py @@ -199,13 +199,13 @@ def __add_in_edge(self, edge: Edge, update=False): if edge_list is None: edge_list = [] edges[edge.from_node.name] = edge_list - try: - edge_idx = edge_list.index(edge) + edge_idx = next((i for i, x in enumerate(edge_list) if x == edge), -1) + if edge_idx >= 0: if update: edge_list[edge_idx] = edge else: raise EdgeExistsError() - except ValueError: + else: edge_list.append(edge) def __add_out_edge(self, edge: Edge, update=False): @@ -217,13 +217,13 @@ def __add_out_edge(self, edge: Edge, update=False): if edge_list is None: edge_list = [] edges[edge.to_node.name] = edge_list - try: - edge_idx = edge_list.index(edge) + edge_idx = next((i for i, x in enumerate(edge_list) if x == edge), -1) + if edge_idx >= 0: if update: edge_list[edge_idx] = edge else: raise EdgeExistsError() - except ValueError: + else: edge_list.append(edge) def verify_edges(self): @@ -345,6 +345,15 @@ def in_edges(self, node_name: str) -> Sequence[Edge]: return list(edge for edge_list in self._in_edges[node_name].values() for edge in edge_list) + def in_edges_idx(self, node_name: str, to_idx: int) -> Edge: + '''Input edge at index to a node''' + if node_name not in self._in_edges: + return None + + edges = list(edge for edge_list in self._in_edges[node_name].values() + for edge in edge_list if edge.to_idx == to_idx) + return edges[0] if len(edges) == 1 else None + @staticmethod def index_edges_by_from(edges): indexed_edges = [] @@ -758,5 +767,6 @@ def __getitem__(self, key): def __iter__(self): return self._nodes.__iter__() + class Graph(GraphView): pass diff --git a/tools/nntool/utils/json_serializable.py b/tools/nntool/utils/json_serializable.py index ef13434c2..7bfa74589 100644 --- a/tools/nntool/utils/json_serializable.py +++ b/tools/nntool/utils/json_serializable.py @@ -51,6 +51,12 @@ def default(self, o): return int(o) if isinstance(o, np.floating): return float(o) + if isinstance(o, np.ndarray): + return { + '__type': 'numpy.ndarray', + '__contents': o.tolist(), + '__dtype': o.dtype.name + } # Let 
the base class default method raise the try: @@ -69,6 +75,8 @@ def __init__(self, *args, object_hook=None, **kwargs): # pylint: disable=no-self-use, method-hidden def object_hook(self, obj): if '__type' in obj: + if obj['__type'] == 'numpy.ndarray': + return np.array(obj['__contents'], dtype=np.dtype(obj['__dtype'])) if obj['__type'] == 'JsonSerializable': return JsonSerializable.from_dict(obj) return obj diff --git a/tools/nntool/utils/new_param_state.py b/tools/nntool/utils/new_param_state.py index 6e1386638..510791ebd 100644 --- a/tools/nntool/utils/new_param_state.py +++ b/tools/nntool/utils/new_param_state.py @@ -120,8 +120,7 @@ def set_options(G, node_options, graph_node_options=None): graph_node_options[nodeid] = G.node(nodeid.node_name).at_options -def load_state(graph_file: str, value_cache=None, return_extra=False): - #state_dir = os.path.dirname(os.path.abspath(graph_file)) +def load_state(graph_file: str, return_extra=False): graph_base, _ = os.path.splitext(graph_file) state_filename = graph_base + STATE_EXTENSION state_file = Path(state_filename) @@ -150,15 +149,20 @@ def load_state(graph_file: str, value_cache=None, return_extra=False): parameters = None # Here load the orignal graph and replay the transforms that were done to it - opts = { - 'load_tensors': False, - } + if info_state['info'].get('has_quantized_parameters'): + opts = { + 'load_tensors': True, + 'load_quantization': True + } + else: + opts = { + 'load_tensors': False, + } # Retrieve the identity of the saved state identity = GraphIdentity(None) identity.identity = info_state['identity'] LOG.info("loading graph from %s", identity.filename) - #G = create_graph(os.path.join(state_dir, os.path.split(identity.filename)[-1]), opts=opts) G = create_graph(identity.filename, opts=opts) if 'name' in info_state: G.name = info_state['name'] @@ -184,7 +188,6 @@ def load_state(graph_file: str, value_cache=None, return_extra=False): G.info = info_state['info'] G.changes.replay(G) G.graph_identity = identity - G.value_cache = value_cache G.node_options = info_state['node_options'] set_options(G, info_state['node_options'], info_state['node_options']) diff --git a/tools/nntool/utils/node_id.py b/tools/nntool/utils/node_id.py index 5148c0bbb..66b56be93 100644 --- a/tools/nntool/utils/node_id.py +++ b/tools/nntool/utils/node_id.py @@ -15,6 +15,7 @@ from utils.json_serializable import JsonSerializable + class NodeId(JsonSerializable): def __init__(self, node, fnode=None): if isinstance(node, list): @@ -58,7 +59,6 @@ def __str__(self): return "_".join(self._id) return self._id[0] -from utils.node_id import NodeId def convert_node_id_to_str(nodeid): if isinstance(nodeid, NodeId): @@ -86,4 +86,4 @@ def convert_str_to_keys(info): if isinstance(info, dict): return {convert_str_to_node_id(k): convert_str_to_keys(v) for k, v in info.items()} - return info \ No newline at end of file + return info diff --git a/tools/nntool/utils/option_list.py b/tools/nntool/utils/option_list.py index d16ba7257..c6b5a1221 100644 --- a/tools/nntool/utils/option_list.py +++ b/tools/nntool/utils/option_list.py @@ -38,7 +38,8 @@ def __setattr__(self, name, value): upper_name = name.upper() if upper_name in self._valid_options: if value is None: - del self._options[upper_name] + if upper_name in self._options: + del self._options[upper_name] return elif not isinstance(value, self._valid_options[upper_name]): value = self._valid_options[upper_name](value) diff --git a/tools/nntool/utils/validation_utils.py b/tools/nntool/utils/validation_utils.py index 
568708477..b008f145e 100644 --- a/tools/nntool/utils/validation_utils.py +++ b/tools/nntool/utils/validation_utils.py @@ -1,48 +1,82 @@ -import numpy as np -import logging import json import os from abc import ABC, abstractmethod +import numpy as np -SUPPORTED_PREDICTION = {'classification'} #add 'object-detection', 'segmentation' +SUPPORTED_PREDICTION = {'classification'} #add 'object-detection' class ValidateBase(ABC): - def __init__(self, type_of_prediction='classification'): - if type_of_prediction not in SUPPORTED_PREDICTION: - raise NotImplementedError("type_of_prediction must be in %r, %s not supported" %SUPPORTED_PREDICTION, type_of_prediction) - self.labels = [] - self.predictions = [] + def __init__(self, type_of_prediction='classification'): + if type_of_prediction not in SUPPORTED_PREDICTION: + raise NotImplementedError("type_of_prediction must be in %r, %s not supported" % (SUPPORTED_PREDICTION, type_of_prediction)) + self.labels = [] + self.predictions = [] + + @abstractmethod + def validate(self, input_name, predicted): + pass - @abstractmethod - def validate(self, input_name, predicted): - pass +class ValidateFromClass(ValidateBase): + def __init__(self, class_number, type_of_prediction='classification'): + super().__init__(type_of_prediction=type_of_prediction) + self._class_number = class_number + +# the labels are all the same + def validate(self, input_name, predicted): + predicted = predicted.flatten() + class_predicted = int(np.argmax(predicted)) + margin = predicted[class_predicted] - np.average(np.delete(predicted, [class_predicted])) + self.predictions.append(class_predicted) + self.labels.append(self._class_number) + return class_predicted == self._class_number, class_predicted, self._class_number, margin class ValidateFromName(ValidateBase): - #the label are the last digits in the filename - def validate(self, input_name, predicted): - num_classes = predicted.size - filename, _ = os.path.splitext(input_name) - num_classes_digits = len(str(num_classes)) - label = int(filename[-(num_classes_digits):]) - class_predicted = int(np.argmax(predicted)) - self.predictions.append(class_predicted) - self.labels.append(label) - return class_predicted == label, label +# the labels are the last digits in the filename + def validate(self, input_name, predicted): + num_classes = predicted.size + filename, _ = os.path.splitext(input_name) + num_classes_digits = len(str(num_classes-1)) + label = int(filename[-(num_classes_digits):]) + predicted = predicted.flatten() + class_predicted = int(np.argmax(predicted)) + margin = predicted[class_predicted] - np.average(np.delete(predicted, [class_predicted])) + self.predictions.append(class_predicted) + self.labels.append(label) + return class_predicted == label, class_predicted, label, margin class ValidateFromJSON(ValidateBase): - def __init__(self, json_file): - super().__init__() - with open(json_file) as file: - self.annotations = json.load(file) - - def validate(self, input_name, predicted): - num_classes = predicted.size - path, file = os.path.split(input_name) - label = self.annotations[file] - class_predicted = int(np.argmax(predicted)) - self.predictions.append(class_predicted) - self.labels.append(label) - return class_predicted == label, label + def __init__(self, json_file): + super().__init__() + with open(json_file) as file: + self.annotations = json.load(file) + def validate(self, input_name, predicted): + #num_classes = predicted.size + _, file = os.path.split(input_name) + label = self.annotations[file] + predicted = 
predicted.flatten() + class_predicted = int(np.argmax(predicted)) + self.predictions.append(class_predicted) + margin = predicted[class_predicted] - np.average(np.delete(predicted, [class_predicted])) + self.labels.append(label) + return class_predicted == label, class_predicted, label, margin +class ValidateFromVWWInstances(ValidateBase): + def __init__(self, instances_file): + super().__init__() + with open(instances_file) as file: + self.instances = json.load(file) + def validate(self, input_name, predicted): + _, file_name = os.path.split(input_name) + for image in self.instances['images']: + if image['file_name'] == file_name: + idx = image['id'] + label = self.instances['annotations'][str(idx)]['label'] + break + predicted = predicted.flatten() + class_predicted = int(np.argmax(predicted)) + self.predictions.append(class_predicted) + margin = predicted[class_predicted] - np.average(np.delete(predicted, [class_predicted])) + self.labels.append(label) + return class_predicted == label, class_predicted, label, margin diff --git a/tools/rules/pulp_rules.mk b/tools/rules/pulp_rules.mk index f96edd6ce..e7f761451 100644 --- a/tools/rules/pulp_rules.mk +++ b/tools/rules/pulp_rules.mk @@ -108,8 +108,12 @@ BOOTFLAGS = -Os -g -DUSE_AES -fno-jump-tables -Wextra -Wall -Wno-unused-parame CFLAGS = $(COMMON) -MMD -MP -c ifeq '$(platform)' 'board' +ifeq '$(TARGET_CHIP)' 'GAP9' +io ?= bridge +else io ?= host endif +endif ifeq '$(io)' 'host' PULP_CFLAGS += -D__RT_IODEV__=2