diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..10defa9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,39 @@ +*~ +__pycache__ +build +dist +**/*.egg* +*.vscode +.DS_Store +*.html +.ipynb_checkpoints/ +*# +install/ +*.pkl +*.data +*# +toolchain/**/*/ +package.json +package-lock.json +.mypy_cache +node_modules + +compile_commands.json + +docs/_autosummary +docs/_build + + +DeeployTest/TestFiles/ +DeeployTest/Tests/**/*.txt +DeeployTest/**/BUILD/* +DeeployTest/TEST_*/* +DeeployTest/deeployStates*/* +DeeployTest/DeeployState* +DeeployTest/testUtils/graphDebug.py +DeeployTest/Tests/**/pactIntegerizationProto +DeeployTest/Tests/**/quantlib +DeeployTest/Tests/**/*.py +DeeployTest/Tests/**/*.json +DeeployTest/Tests/**/generateTest.py +DeeployTest/out.txt diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..f60b9dd --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,717 @@ +variables: + GIT_SUBMODULE_STRATEGY: recursive + FF_USE_FASTZIP: "true" + # These can be specified per job or per pipeline + ARTIFACT_COMPRESSION_LEVEL: "fastest" + CACHE_COMPRESSION_LEVEL: "fastest" + TOOLCHAIN: "LLVM" + CMAKE_GENERATOR: "Ninja" + +stages: # List of stages for jobs, and their order of execution + - test + +.setup_test: + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - cd DeeployTest + - git lfs pull + +build_deeploy: # This job runs in the build stage, which runs first. + stage: test + resource_group: install + artifacts: + untracked: true + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - pip install -e . + - rm -f DeeployTest/out.txt + +gen_docs: + stage: test + resource_group: install + artifacts: + untracked: true + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - make docs + +run_cmsis_test_models: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. 
+ tags: + - qemu-arm + parallel: + matrix: + - TEST: [simpleRegression, WaveFormer] + script: + - !reference [.setup_test, script] + - python testRunner_cortexm.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.c + - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_cmsis_test_kernels: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + tags: + - qemu-arm + parallel: + matrix: + - TEST: [Adder, MultIO, test1DPad, test2DPad, testMatMul, testMatMulAdd, testMaxPool, testRQConv, testReduceSum, testReduceMean, testSlice] + script: + - !reference [.setup_test, script] + - python testRunner_cortexm.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.c + - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_test_models: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. 
+ tags: + - PULP + parallel: + matrix: + - TEST: [simpleRegression, miniMobileNet, miniMobileNetv2, Attention, MLPerf/KeywordSpotting, MLPerf/ImageClassification, MLPerf/AnomalyDetection] + script: + - !reference [.setup_test, script] + - python testRunner_siracusa.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_test_kernels: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + tags: + - PULP + parallel: + matrix: + - TEST: [Adder, MultIO, test1DPad, test2DPad, testMatMul, testMatMulAdd, testRequantizedDWConv, test2DRequantizedConv, iSoftmax, testConcat, testRMSNorm, trueIntegerDivSandwich, Hardswish, RQHardswish, testBacktracking] + script: + - !reference [.setup_test, script] + - python testRunner_siracusa.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_DMA_slice_L2: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. 
+ tags: + - PULP + script: + - !reference [.setup_test, script] + - python testSlice_PULP.py --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/testSlice/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/testSlice/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_tiled_kernels_singlebuffer_L2: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + tags: + - PULP + parallel: + matrix: + - TEST: "testMatMul" + L1: [64000, 32000, 16000] + - TEST: "test2DRequantizedConv" + L1: [8000, 6000, 4000] + - TEST: "testRequantizedDWConv" + L1: [2561] # SCHEREMO: The implicit transpose after the conv is untiled; need at least 2560 + - TEST: "iSoftmax" + L1: [800, 500, 300] + - TEST: "testConcat" + L1: [32000, 16000, 8000] + - TEST: "testRMSNorm" + L1: [2048, 1024, 512] + - TEST: "Hardswish" + L1: [750] + - TEST: "RQHardswish" + L1: [750] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_tiled_kernels_doublebuffer_L2: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. 
+ tags: + - PULP + parallel: + matrix: + - TEST: "testMatMul" + L1: [64000, 32000, 16000] + - TEST: "test2DRequantizedConv" + L1: [8000, 6000, 5000] + - TEST: "testRequantizedDWConv" + L1: [5121] # SCHEREMO: The implicit transpose after the conv is untiled; need at least 2560 * 2 for DB + - TEST: "iSoftmax" + L1: [1600, 1000, 600] + - TEST: "testConcat" + L1: [64000, 32000, 16000] + - TEST: "testRMSNorm" + L1: [4096, 2048, 1024] + - TEST: "Hardswish" + L1: [750] + - TEST: "RQHardswish" + L1: [750] + - TEST: "microLlama/microLlama1" + L1: [60000, 20000, 10000] + - TEST: "microLlama/microLlama8" + L1: [60000, 20000, 10000] + - TEST: "microLlama/microLlama8_parallel" + L1: [60000, 20000, 10000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --doublebuffer + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_tiled_models_singlebuffer_L2: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. 
+ tags: + - PULP + parallel: + matrix: + - TEST: "simpleRegression" + L1: [45000, 30000, 15000] + - TEST: "miniMobileNet" + L1: [60000, 12000, 6000, 3000] + - TEST: "miniMobileNetv2" + L1: [60000, 16000, 12000, 8000] + - TEST: "Attention" + L1: [60000, 10000, 5000] + - TEST: "microLlama/microLlama1" + L1: [60000, 10000, 5000] + - TEST: "microLlama/microLlama8" + L1: [60000, 10000, 5000] + - TEST: "microLlama/microLlama8_parallel" + L1: [60000, 10000, 5000] + - TEST: "MLPerf/KeywordSpotting" + L1: [64000] + - TEST: "MLPerf/ImageClassification" + L1: [64000] + - TEST: "MLPerf/AnomalyDetection" + L1: [64000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_tiled_models_singlebuffer_L3: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + tags: + - PULP + parallel: + matrix: + - TEST: "simpleRegression" + L1: [45000, 30000, 16000] # SCHEREMO: 15000 leads to non-2d transfers in L3! + - TEST: "miniMobileNet" + L1: [60000, 12000, 6000] # SCHEREMO: 3000 leads to non-2d transfers in L3! 
+ - TEST: "miniMobileNetv2" + L1: [60000, 16000, 12000, 8000] + - TEST: "Attention" + L1: [60000, 10000, 5000, 2500] + - TEST: "Transformer" + L1: [60000, 30000, 15000] + - TEST: "microLlama/microLlama1" + L1: [60000, 10000, 5000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + + +run_siracusa_tiled_models_doublebuffer_L3: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + tags: + - PULP + parallel: + matrix: + - TEST: "simpleRegression" + L1: [60000, 45000, 30000] + - TEST: "miniMobileNet" + L1: [60000, 24000, 12000, 6000] + - TEST: "miniMobileNetv2" + L1: [60000, 32000, 24000, 16000] + - TEST: "Attention" + L1: [60000, 20000, 10000, 5000] + - TEST: "Transformer" + L1: [60000, 30000, 15000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --doublebuffer --defaultMemLevel=L3 + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + + +run_siracusa_w_neureka_tiled_kernels_singlebuffer_L2: + stage: test + tags: + - PULP + parallel: + matrix: + - TEST: "testRequantizedLinear" + L1: [16000] + - TEST: "testPointwise" + L1: 
[32000] + - TEST: "testPointwiseConvBNReLU" + L1: [32000] + - TEST: "testPointwiseUnsignedWeights" + L1: [32000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --defaultMemLevel=L2 + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + + +run_siracusa_w_neureka_tiled_kernels_doublebuffer_L2: + stage: test + tags: + - PULP + parallel: + matrix: + - TEST: "testRequantizedLinear" + L1: [16000] + - TEST: "testPointwise" + L1: [32000] + - TEST: "testPointwiseConvBNReLU" + L1: [32000] + - TEST: "testPointwiseUnsignedWeights" + L1: [32000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --defaultMemLevel=L2 --doublebuffer + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_w_neureka_tiled_models_singlebuffer_L3: + stage: test + tags: + - PULP + parallel: + matrix: + - TEST: "miniMobileNet" + L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! 
+ - TEST: "Attention" + L1: [2500] + - TEST: "Transformer" + L1: [15000] + - TEST: "microLlama/microLlama1" + L1: [10000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_w_neureka_tiled_models_doublebuffer_L3: + stage: test + tags: + - PULP + parallel: + matrix: + - TEST: "miniMobileNet" + L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! + - TEST: "Attention" + L1: [5000] + - TEST: "Transformer" + L1: [30000] + script: + - !reference [.setup_test, script] + - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 --doublebuffer + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_w_neureka_tiled_kernels_singlebuffer_L2_wmem: + stage: test + tags: + - PULP + parallel: + matrix: + - TEST: "testRequantizedLinear" + L1: [16000] + - TEST: "testPointwise" + L1: [32000] + - TEST: "testPointwiseConvBNReLU" + L1: [32000] + - TEST: "testPointwiseUnsignedWeights" + L1: [32000] + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - cd DeeployTest + - python 
testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --defaultMemLevel=L2 --neureka-wmem + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_siracusa_w_neureka_tiled_models_doublebuffer_L3_wmem: + stage: test + tags: + - PULP + parallel: + matrix: + - TEST: "miniMobileNet" + L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! + - TEST: "Attention" + L1: [2500] + - TEST: "Transformer" + L1: [30000] + - TEST: "microLlama/microLlama1" + L1: [10000] + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - cd DeeployTest + - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 --doublebuffer --neureka-wmem + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c + - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_mempool_test_kernels: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. 
+ tags: + - banshee + retry: 2 + parallel: + matrix: + - TEST: [Adder, MultIO, test1DConvolution, test2DConvolution, test1DDWConvolution, test2DDWConvolution, test1DPad, test2DPad, testGEMM, testMatMul, testMatMulAdd, testMaxPool, testRQConv, testRQGEMM, testRQMatMul, testReduceSum, testReduceMean, testSlice, testRequantizedDWConv, test2DRequantizedConv] + script: + - !reference [.setup_test, script] + - python testRunner_mempool.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.c + - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_mempool_test_models: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + tags: + - banshee + retry: 2 + parallel: + matrix: + - TEST: [simpleRegression, simpleCNN, ICCT, ICCT_ITA, ICCT_8, ICCT_ITA_8, miniMobileNet, miniMobileNetv2] + script: + - !reference [.setup_test, script] + - python testRunner_mempool.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + # - python testRunner_mempool.py -t ./Tests/WaveFormer -DGCC_INSTALL_DIR=$MEMPOOL_GCC_INSTALL_DIR # Broken with ITA (heap is too small) + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.c + - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_generic_test_kernels: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. 
+ parallel: + matrix: + - TEST: [Adder, MultIO, test1DConvolution, test2DConvolution, test1DDWConvolution, test2DDWConvolution, test1DPad, test2DPad, testGEMM, testMatMul, testMatMulAdd, testMaxPool, testRQConv, testRQMatMul, testReduceSum, testReduceMean, testSlice, testRequantizedDWConv, test2DRequantizedConv] + script: + - !reference [.setup_test, script] + - python testRunner_generic.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.c + - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +run_generic_test_models: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + parallel: + matrix: + - TEST: [simpleRegression, WaveFormer, simpleCNN, ICCT, ICCT_ITA, ICCT_8, ICCT_ITA_8, miniMobileNet, miniMobileNetv2] + script: + - !reference [.setup_test, script] + - python testRunner_generic.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" + paths: + - ./DeeployTest/out.txt + - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.c + - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.h + expire_in: 4 weeks + cache: + key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG + paths: + - ./DeeployTest/TEST_*/build + +test_deeploy_state_serialization: + stage: test + parallel: + matrix: + - TEST: [simpleRegression] + PLATFORM: ['QEMU-ARM', 'Siracusa', 'MemPool', 'Generic'] + script: + - !reference [.setup_test, script] + - python deeployStateEqualityTest.py -t ./Tests/$TEST -p $PLATFORM + +test_memory_level_extension: + stage: test + parallel: + matrix: + - TEST: [simpleRegression] + PLATFORM: ['QEMU-ARM', 'Siracusa', 'MemPool', 
'Generic'] + script: + - !reference [.setup_test, script] + - python testMemoryLevelExtension.py -t ./Tests/$TEST -p $PLATFORM + +test_tiler_extension: + stage: test + parallel: + matrix: + - TEST: [simpleRegression, simpleCNN, testMatMul, testMaxPool] + PLATFORM: ['Siracusa'] + script: + - !reference [.setup_test, script] + - python testTilerExtension.py -t ./Tests/$TEST -p $PLATFORM + +test_tiler_extension_fails: + stage: test + parallel: + matrix: + - TEST: [simpleRegression, simpleCNN, testMatMul] + PLATFORM: ['Siracusa'] + script: + - !reference [.setup_test, script] + - python testTilerExtension.py -t ./Tests/$TEST -p $PLATFORM --l1 2000 --shouldFail + +test_memory_allocation_extension: + stage: test + parallel: + matrix: + - TEST: [simpleRegression, simpleCNN, miniMobileNet, miniMobileNetv2, testMatMul, testMaxPool] + PLATFORM: ['Siracusa'] + script: + - !reference [.setup_test, script] + - python testTilerExtension.py -t ./Tests/$TEST -p $PLATFORM + +test_deeploy_typing: + stage: test + script: + - !reference [.setup_test, script] + - python testTypes.py + +test_regex_matching: + stage: test + script: + - !reference [.setup_test, script] + - python testRegexMatching.py + +format_python: + stage: test + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - yapf -rpd -e "third_party/" -e "install/" -e "toolchain/" . 
+ +format_python_imports: + stage: test + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - isort --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./ -c -v + - autoflake -c -r --remove-all-unused-imports --ignore-init-module-imports --exclude "*/third_party/**" ./ + +format_c: + stage: test + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - python scripts/run_clang_format.py -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -ir --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format ./ scripts + +lint_python_licenses: + stage: test + variables: + LICENSE_STRING: "SPDX-License-Identifier: Apache-2.0" + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - grep -Lr "$LICENSE_STRING" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude "run_clang_format.py" | grep ".*\.py$" || [[ $? == 1 ]] + +lint_c_licenses: + stage: test + variables: + LICENSE_STRING: "SPDX-License-Identifier: Apache-2.0" + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - grep -Lr "$LICENSE_STRING" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude-dir="runtime" | grep ".*\.c$" || [[ $? == 1 ]] + +lint_c_header_licenses: + stage: test + variables: + LICENSE_STRING: "SPDX-License-Identifier: Apache-2.0" + script: + - bash && source ~/.bashrc + - $CONDA activate dumpoci + - export PYTHONPATH=`pwd`:$PYTHONPATH + - grep -Lr "$LICENSE_STRING" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude-dir="runtime" | grep ".*\.h$" || [[ $? 
== 1 ]] diff --git a/.gitlab/issue_templates/issue_template.md b/.gitlab/issue_templates/issue_template.md new file mode 100644 index 0000000..92cae8b --- /dev/null +++ b/.gitlab/issue_templates/issue_template.md @@ -0,0 +1,24 @@ +## Summary + +Give a *short* description of the problem, at most two paragraphs. + +## Steps to reproduce + +If possible create an example project that exhibits the problematic behaviour and reference it here. Please be as specific as possible. + +## Bug Behaviour + +Describe what is happening in your minimal example. + +## Expected Behaviour + +Describe what you expect to happen. + +## Relevant logs and/or screenshots + +If available, paste any relevant logs - use code blocks (```) to format console output, logs, and code, as +it's very hard to read otherwise. + +## Possible fixes + +If you can, link to the line of code that might be responsible for the problem. diff --git a/.gitlab/merge_request_templates/MRTemplate.md b/.gitlab/merge_request_templates/MRTemplate.md new file mode 100644 index 0000000..f532117 --- /dev/null +++ b/.gitlab/merge_request_templates/MRTemplate.md @@ -0,0 +1,19 @@ +# Changelog + +Describe the intent of your merge request here. + +## Added + +## Changed + +## Fixed + + +## PR Merge Checklist + +1. [ ] Is your PR rebased on the latest `devel` commit and pointing to `devel`? +2. [ ] Was your PR reviewed and accepted? +3. [ ] Does your latest pipeline pass? +4. [ ] Are all dependencies merged onto their respective `main` branches? +5. [ ] Did you reset all .gitmodules URLs to point to the `deeploy` group? +6. [ ] Did you check in the latest commits for all dependencies available on their `main` branches? 
diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..1c1506d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "pulp-nn-mixed"] + path = TargetLibraries/PULPOpen/third_party/pulp-nn-mixed + url = https://github.com/pulp-platform/pulp-nn-mixed.git +[submodule "pulp-nnx"] + path = TargetLibraries/PULPOpen/third_party/pulp-nnx + url = https://github.com/pulp-platform/pulp-nnx.git +[submodule "CMSIS-NN"] + path = TargetLibraries/CMSIS/third_party/CMSIS-NN + url = https://github.com/ARM-software/CMSIS-NN.git \ No newline at end of file diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..c0b30d7 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,4 @@ +[settings] +line_length=120 +multi_line_output=2 +include_trailing_comma=false \ No newline at end of file diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000..3389b2a --- /dev/null +++ b/.style.yapf @@ -0,0 +1,5 @@ +[style] +based_on_style = google +column_limit = 120 +split_before_logical_operator = true +spaces_around_default_or_named_assign = true \ No newline at end of file diff --git a/.yapfignore b/.yapfignore new file mode 100644 index 0000000..200637b --- /dev/null +++ b/.yapfignore @@ -0,0 +1,3 @@ +*third_party/ +*install/ +*toolchain/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..825c32f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1 @@ +# Changelog diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..0636b51 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,184 @@ +cmake_minimum_required(VERSION 3.12) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +set(CMAKE_C_STANDARD 99) + +set(CMAKE_C_COMPILER_LAUNCHER "ccache") +set(CMAKE_CXX_COMPILER_LAUNCHER "ccache") + +set(CMAKE_EXPORT_COMPILE_COMMANDS TRUE) + +if(TOOLCHAIN STREQUAL GCC) + 
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) +endif() + +set(platform MemPool CACHE STRING "Platform (MemPool, QEMU-ARM, Siracusa, Siracusa_w_neureka, PULPOpen, Generic)") +set_property(CACHE platform PROPERTY STRINGS MemPool QEMU-ARM Siracusa Siracusa_w_neureka PULPOpen Generic) + +if(platform STREQUAL MemPool) + message(STATUS "Building for platform 'MemPool'") +elseif(platform STREQUAL QEMU-ARM) + message(STATUS "Building for platform 'QEMU-ARM'") +elseif(platform STREQUAL Siracusa) + message(STATUS "Building for platform 'Siracusa'") +elseif(platform STREQUAL Siracusa_w_neureka) + message(STATUS "Building for platform 'Siracusa_w_neureka'") +elseif(platform STREQUAL PULPOpen) + message(STATUS "Building for platform 'PULP-Open'") +elseif(platform STREQUAL Generic) + message(STATUS "Building for platform 'Generic'") +else() + message(FATAL_ERROR "Invalid platform '${platform}' specified!") +endif() + +# Import useful functions / macros +include(${CMAKE_CURRENT_LIST_DIR}/cmake/Util.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/simulation.cmake) + +message(STATUS "============================= Project Configuration ============================") +message(STATUS "[Deeploy] platform = " ${platform}) +message(STATUS "[Deeploy] use_dma = " ${use_dma}) +message(STATUS "================================================================================") +message(STATUS "") + +if(platform STREQUAL MemPool) + set(mempool_flavour mempool_ita CACHE STRING "Platform (mempool, mempool_ita or minpool)") + set_property(CACHE mempool_flavour PROPERTY STRINGS mempool minpool mempool_ita) + + if(TOOLCHAIN STREQUAL LLVM) + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/mempool/toolchain_llvm.cmake) + else() + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/mempool/toolchain_gcc.cmake) + endif() + + include(${CMAKE_CURRENT_LIST_DIR}/cmake/mempool/${mempool_flavour}.cmake) + + project(deeploy LANGUAGES C ASM) + + 
add_subdirectory(TargetLibraries/MemPool) + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(DeeployTest) + target_include_directories(deeploymempool PUBLIC TargetLibraries/Generic/inc) + + target_link_libraries(deeploylib INTERFACE deeploymempool deeploybasic) + + message(STATUS "============================= MemPool Configuration ============================") + message(STATUS "[cMake ] mempool_flavour = " ${mempool_flavour}) + message(STATUS "[cMake ] boot_addr = " ${boot_addr}) + message(STATUS "[cMake ] l2_base = " ${l2_base}) + message(STATUS "[cMake ] l2_size = " ${l2_size}) + message(STATUS "[cMake ] l2_banks = " ${l2_banks}) + message(STATUS "[cMake ] seq_mem_size = " ${seq_mem_size}) + message(STATUS "[cMake ] stack_size = " ${stack_size}) + message(STATUS "[cMake ] axi_data_width = " ${axi_data_width}) + message(STATUS "[cMake ] ro_line_width = " ${ro_line_width}) + message(STATUS "[cMake ] dmas_per_group = " ${dmas_per_group}) + message(STATUS "[cMake ] xqueue_size = " ${xqueue_size}) + message(STATUS "[cMake ] xpulpimg = " ${xpulpimg}) + message(STATUS "[cMake ] num_cores = " ${num_cores}) + message(STATUS "[cMake ] num_eff_cores = " ${num_eff_cores}) + message(STATUS "[cMake ] num_groups = " ${num_groups}) + message(STATUS "[cMake ] num_cores_per_tile = " ${num_cores_per_tile}) + message(STATUS "[cMake ] banking_factor = " ${banking_factor}) + message(STATUS "[cMake ] axi_hier_radix = " ${axi_hier_radix}) + message(STATUS "[cMake ] axi_masters_per_group = " ${axi_masters_per_group}) + if(mempool_flavour STREQUAL mempool_ita) + message(STATUS "=============================== ITA Configuration ==============================") + message(STATUS "[cMake ] ita_pe = " ${ita_pe}) + endif() + message(STATUS "================================================================================") + message(STATUS "") + +endif() + +if(platform STREQUAL Generic) + + if(TOOLCHAIN STREQUAL LLVM) + set(CMAKE_TOOLCHAIN_FILE 
${CMAKE_CURRENT_LIST_DIR}/cmake/generic/toolchain_llvm.cmake) + endif() + + include(${CMAKE_CURRENT_LIST_DIR}/cmake/generic/generic.cmake) + + project(deeploy LANGUAGES C ASM) + + message(STATUS "============================= Generic Configuration ============================") + message(STATUS "[cMake ] CPU = " ${CMAKE_SYSTEM_PROCESSOR}) + message(STATUS "================================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(DeeployTest) + + target_link_libraries(deeploylib INTERFACE deeploybasic) + +endif() + +if(platform STREQUAL QEMU-ARM) + + if(TOOLCHAIN STREQUAL LLVM) + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/cmsis/toolchain_llvm.cmake) + else() + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/cmsis/toolchain_gcc.cmake) + endif() + + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cmsis/cmsis.cmake) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cmsis/qemu.cmake) + + project(deeploy LANGUAGES C ASM) + + message(STATUS "============================= QEMU Configuration ============================") + message(STATUS "[cMake ] CPU = " ${CPU}) + message(STATUS "[cMake ] FABI = " ${FABI}) + message(STATUS "[cMake ] FPU = " ${FPU}) + message(STATUS "================================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(TargetLibraries/CMSIS) + add_subdirectory(DeeployTest) + + target_include_directories(deeploycmsis PUBLIC TargetLibraries/Generic/inc) + target_link_libraries(deeploylib INTERFACE deeploybasic deeploycmsis) + +endif() + +if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen) + + if(TOOLCHAIN STREQUAL LLVM) + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake) + else() + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_gcc.cmake) + endif() + + 
include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake) + + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake) + elseif(platform STREQUAL PULPOpen) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake) + endif() + + project(deeploy LANGUAGES C ASM) + + message(STATUS "============================= ${platform} Configuration ============================") + message(STATUS "[cMake ] ISA = " ${ISA}) + message(STATUS "[cMake ] Cluster Cores = " ${PE}) + message(STATUS "[cMake ] Fabric Controller = " ${FC}) + message(STATUS "[cMake ] Number of used cores = " ${NUM_CORES}) + message(STATUS "================================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(TargetLibraries/PULPOpen) + target_include_directories(deeploypulp PUBLIC TargetLibraries/Generic/inc) + + add_subdirectory(DeeployTest) + target_link_libraries(deeploylib INTERFACE deeploybasic deeploypulp) + +endif() + +print_simulation_config() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..2aa17b0 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# Contribution Guide + +We encourage submitting your issues and work in merge requests against the devel branch. Please understand that we are trying to maintain a consistent minimal quality standard. +Any and all merge requests you submit can only be accepted under the Apache 2.0 License. + +## Overview + +* The only way new features are accepted are by merge requests against the devel branch. Understand that we expect you to rebase your work against the devel branch if you submit merge requests. +* We encourage early draft merge requests to keep development transparent and avoid diverging efforts. Please submit your draft merge requests clearly labelled with "DRAFT:". +* We encourage refactoring. 
As code evolves, semantic concepts may change, and this is best addressed with refactoring. Please submit merge requests that implement refactoring with a label "REFACTOR:"
* We strongly encourage discussion on merge requests. Please comment on open merge requests, and keep it productive. The goal is to include feature ideas that are compatible with the Deeploy framework. Feedback for collaborators should include clear, actionable items to improve the contribution.
* If a merge request addresses a specific feature request / bug, please reference it in the merge request.
* Deeploy is a research project. We do not expect a production-level workflow, but we ask you to add at the very least a proof of concept for any feature implementation. Similarly, if your merge request fixes a bug, please add a regression test for the error condition that was addressed.


## Style guide

Deeploy mainly consists of code implemented in C, Makefile, and Python. To facilitate efficient collaboration among users and contributors, it is important to maintain a consistent coding style. To achieve this, it is strongly recommended to use autoformatting tools with the provided configuration files. Additionally, the Continuous Integration (CI) system checks adherence to the style guide for each pushed commit. Currently, configurations are provided for C using `clang-format` and for Python using `yapf` and `isort`.

To recursively format all Python files run
```bash
$> autoflake -i -r --remove-all-unused-imports --ignore-init-module-imports --exclude "*/third_party/**" ./
$> yapf -ipr -e "third_party/" -e "install/" -e "toolchain/" ./
$> isort --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./
```

And for C files
```bash
$> python scripts/run_clang_format.py -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -ir --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format ./
```

Note that third-party applications should not be formatted.
You can alternatively also run +``` +make format +``` +to format all C and Python files. diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000..d9ce7da --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,6 @@ +All contributors have agreed to an open-source release of their work in the Deeploy project. + +* Moritz Scherer +* Victor Jung +* Philip Wiese +* Luka Macan diff --git a/Deeploy/AbstractDataTypes.py b/Deeploy/AbstractDataTypes.py new file mode 100644 index 0000000..2fec670 --- /dev/null +++ b/Deeploy/AbstractDataTypes.py @@ -0,0 +1,489 @@ +# ---------------------------------------------------------------------- +# +# File: AbstractDataTypes.py +# +# Last edited: 25.04.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import copy
from abc import abstractmethod
from dataclasses import dataclass
from typing import Dict, Generic, Iterable, List, Optional, Type, TypeVar, Union

import numpy as np

_NetworkContext = TypeVar("_NetworkContext")

# NOTE: a TypeVar's runtime name must match the variable it is assigned to
# (see the `typing` documentation). The originals were named "Pointer",
# "Immediate" and "Struct", which shadows the class names and confuses
# introspection and pickling tooling.
_PointerType = TypeVar("_PointerType", bound = "Pointer")
_ImmediateType = TypeVar("_ImmediateType", bound = "Immediate")
_StructType = TypeVar("_StructType", bound = "Struct")

_DeeployType = TypeVar("_DeeployType", _PointerType, _ImmediateType, _StructType)
_PythonType = TypeVar("_PythonType", str, int, float, Dict[str, "_PythonType"], Iterable["_PythonType"])


class _ClassPropertyDescriptor(object):
    """Descriptor implementing a property evaluated on the *class* rather than
    on instances; used so e.g. ``SomeIntType.typeMax`` can be computed lazily
    from ``typeWidth``/``signed`` without instantiating the type."""

    def __init__(self, fget, fset = None):
        self.fget = fget
        self.fset = fset

    def __get__(self, obj, other = None):
        # When accessed through an instance, fall back to the instance's class.
        if other is None:
            other = type(obj)
        return self.fget.__get__(obj, other)()

    def __set__(self, obj, value):
        if not self.fset:
            raise AttributeError("can't set attribute")
        type_ = type(obj)
        return self.fset.__get__(obj, type_)(value)

    def setter(self, func):
        # Allow plain functions; wrap them as classmethods for binding.
        if not isinstance(func, (classmethod, staticmethod)):
            func = classmethod(func)
        self.fset = func
        return self


def _classproperty(func):
    """Decorator turning ``func`` into a read-only class-level property."""
    if not isinstance(func, (classmethod, staticmethod)):
        func = classmethod(func)

    return _ClassPropertyDescriptor(func)


class _SlotPickleMixin(object):
    """Pickle support for classes that use ``__slots__`` (and therefore have
    no ``__dict__`` for the default pickle machinery to serialize)."""

    def __getstate__(self):
        return dict((slot, getattr(self, slot)) for slot in self.__slots__ if hasattr(self, slot))

    def __setstate__(self, state):
        for slot, value in state.items():
            setattr(self, slot, value)


@dataclass
class BaseType(Generic[_PythonType, _DeeployType], _SlotPickleMixin):
    """Deeploy abstraction to represent data types that can be expressed in
    the C language."""

    # value: the underlying represented Python-typed value
    __slots__ = ["value"]
    typeName: str  # the C typename of this type
    typeWidth: int  # the number of BITS assigned to the type

    @classmethod
    @abstractmethod
    def checkValue(cls, value: _PythonType, ctxt: Optional[_NetworkContext] = None) -> bool:
        """Checks whether a given Python-typed value (usually FP64) can be
        represented with this Deeploy type.

        Parameters
        ----------
        value : _PythonType
            Python-typed value to check
        ctxt : Optional[_NetworkContext]
            Current NetworkContext

        Returns
        -------
        bool
            True iff value can be represented by cls
        """
        return False

    @classmethod
    @abstractmethod
    def checkPromotion(cls, value: Union[_PythonType, _DeeployType], ctxt: Optional[_NetworkContext] = None) -> bool:
        """Checks whether a Python-typed or Deeploy-typed value can be
        promoted to this Deeploy type.

        Parameters
        ----------
        value : Union[_PythonType, _DeeployType]
            Value to be checked for promotion to cls
        ctxt : Optional[_NetworkContext]
            Current NetworkContext

        Returns
        -------
        bool
            True iff the value can be promoted to cls
        """
        return False


class VoidType(BaseType):
    """Helper type to represent the C void type for pointers."""
    __slots__ = []
    typeName = "void"
    typeWidth = 32


class Immediate(BaseType[_PythonType, _ImmediateType]):
    """Represents any immediate value, e.g. 6, 7.48, ... Cannot be used to
    represent values that are dereferenced at runtime."""

    def __init__(self, value: Union[int, float, "Immediate"], ctxt: Optional[_NetworkContext] = None):
        assert self.checkPromotion(value), f"Cannot assign {value} to a {self.typeName}"
        self.value = value

    @classmethod
    def partialOrderUpcast(cls, otherCls: Type["Immediate"]) -> bool:
        """Checks whether this type (cls) can represent every value that
        otherCls can represent. For more information on partially ordered sets
        and type conversion, see:
        https://en.wikipedia.org/wiki/Partially_ordered_set
        https://en.wikipedia.org/wiki/Type_conversion

        Parameters
        ----------
        otherCls : Type[Immediate]
            The class whose immediates should be upcast to this cls

        Returns
        -------
        bool
            True iff this cls can be statically promoted to otherCls
        """
        return False

    @classmethod
    def checkPromotion(cls, value: Union[_PythonType, _ImmediateType], ctxt: Optional[_NetworkContext] = None):
        # SCHEREMO: np.ndarray is Iterable
        if isinstance(value, Immediate):
            # Unwrap nested immediates and check the raw payload.
            return cls.checkPromotion(value.value, ctxt)

        return cls.checkValue(value, ctxt)

    def __eq__(self, other) -> bool:
        # Deliberately asymmetric: self must be an instance of other's type.
        if not (isinstance(self, type(other)) and hasattr(other, "value")):
            return False
        return self.value == other.value

    def __repr__(self) -> str:
        return f"{str(self.value)}"


class IntegerImmediate(Immediate[Union[int, Iterable[int]], _ImmediateType]):
    """Immediate specialization for fixed-width signed/unsigned integers."""

    signed: bool  # whether the underlying integer is signed or unsigned
    # typeMax / typeMin are computed class properties, see below:
    #   typeMax = 2^typeWidth - 1 (unsigned) or 2^(typeWidth-1) - 1 (signed)
    #   typeMin = 0 (unsigned) or -2^(typeWidth-1) (signed)

    @_classproperty
    def typeMax(cls) -> int:
        if cls.signed:
            return 2**(cls.typeWidth - 1) - 1
        else:
            return 2**(cls.typeWidth) - 1

    @_classproperty
    def typeMin(cls) -> int:
        if cls.signed:
            return -2**(cls.typeWidth - 1)
        else:
            return 0

    @classmethod
    def partialOrderUpcast(cls, otherCls: Type[Immediate]) -> bool:
        # An integer type can absorb another iff its value range is a superset.
        if issubclass(otherCls, IntegerImmediate):
            return cls.typeMax >= otherCls.typeMax and cls.typeMin <= otherCls.typeMin
        else:
            return False

    @classmethod
    def checkValue(cls, value: Union[int, Iterable[int]], ctxt: Optional[_NetworkContext] = None):
        """Return True iff every element of value fits in [typeMin, typeMax].

        BUGFIX: the original left _max/_min unbound for unsupported value
        types (e.g. a plain float), raising UnboundLocalError; such values are
        now rejected by returning False. Empty containers are treated as
        vacuously representable instead of crashing in min()/max().
        """
        if isinstance(value, (int, np.integer)):
            _max, _min = (value, value)
        elif isinstance(value, np.ndarray):
            if value.size == 0:
                return True
            _max = value.max()
            _min = value.min()
        elif isinstance(value, Iterable):
            values = list(value)
            if not values:
                return True
            _max = max(values)
            _min = min(values)
        else:
            return False

        if _max > cls.typeMax:
            return False
        if _min < cls.typeMin:
            return False
        return True


class Pointer(BaseType[Optional[str], _PointerType]):
    """Represents a C pointer type to an underlying BaseType data type."""

    __slots__: List[str] = ["referenceName", "_mangledReferenceName"]
    referencedType: Type[_DeeployType]  # type definition of the pointee

    @_classproperty
    def typeName(cls):
        return cls.referencedType.typeName + "*"

    @classmethod
    def checkValue(cls, value: Optional[str], ctxt: Optional[_NetworkContext] = None) -> bool:
        """Checks whether the named buffer in ctxt can be pointed to by cls."""
        if ctxt is None:
            return False

        if value is None or value == "NULL":
            print("WARNING: Setting pointer value to NULL - Referenced data is invalid!")
            return True

        reference = ctxt.lookup(value)

        if hasattr(reference, "_type") and reference._type is not None:
            # Void pointer & DeeployType check
            _type = reference._type
            if not issubclass(cls.referencedType, VoidType) and _type.referencedType != cls.referencedType:
                return False
            return True

        # BUGFIX: the original tested `hasattr(reference, value)`, i.e. looked
        # for an attribute *named after the buffer*, which never matches the
        # intent; the code below reads `reference.value`, so the guard must
        # check for a concrete "value" payload on the buffer.
        if not hasattr(reference, "value"):
            return True
        return cls.referencedType.checkPromotion(reference.value, ctxt)

    @classmethod
    def checkPromotion(cls, _value: Union[Optional[str], "Pointer"], ctxt: Optional[_NetworkContext] = None) -> bool:
        if isinstance(_value, Pointer):
            value = _value.referenceName
        else:
            value = _value
        return cls.checkValue(value, ctxt)

    def __init__(self, _value: Union[Optional[str], "Pointer"], ctxt: Optional[_NetworkContext] = None):
        """Initializes a pointer to a registered object in the NetworkContext.

        Parameters
        ----------
        _value : Union[Optional[str], Pointer]
            Name of the memory buffer in the NetworkContext to be represented,
            or a Pointer object to copy
        ctxt : Optional[_NetworkContext]
            Current NetworkContext

        Raises
        ------
        ValueError
            If the memory buffer does not exist or cannot be pointed to with
            this Pointer class
        """
        if _value is not None and not self.checkPromotion(_value, ctxt):
            raise ValueError(f"value {_value} is not of type {self.referencedType}!")

        # referenceName: either "NULL" (C NULL pointer) or the name of the
        # memory buffer this pointer points to.
        if _value is None:
            self.referenceName = "NULL"
            self._mangledReferenceName = "NULL"
        elif isinstance(_value, Pointer):
            self.referenceName = _value.referenceName
            self._mangledReferenceName = _value._mangledReferenceName
        else:
            self.referenceName = _value
            self._mangledReferenceName = ctxt._mangle(_value)

    def __eq__(self, other):
        if not isinstance(other, Pointer):
            return False

        return self.referenceName == other.referenceName

    def __repr__(self):
        return f"{self._mangledReferenceName}"


class Struct(BaseType[Union[str, Dict[str, _DeeployType]], _StructType]):
    """Deeploy data type abstraction for C-like packed structs."""

    # Definition of the struct: maps field names to their Deeploy types.
    structTypeDict: Dict[str, Type[BaseType]] = {}

    @_classproperty
    def typeWidth(cls) -> int:
        # Packed struct: total width is the sum of the field widths.
        return sum(q.typeWidth for q in cls.structTypeDict.values())

    @classmethod
    def _castDict(cls,
                  inputValue: Union[str, "Struct", Dict[str, BaseType]],
                  ctxt: Optional[_NetworkContext] = None) -> Dict[str, BaseType]:
        """Normalize inputValue (name, Struct, or raw dict) into a dict of
        fields cast to their declared Deeploy types."""
        if isinstance(inputValue, str):
            inputDict = ctxt.lookup(inputValue).structDict.value
        elif isinstance(inputValue, Struct):
            inputDict = inputValue.value
        else:
            inputDict = inputValue

        # NOTE: the original deep-copied inputDict only to iterate it while
        # still reading the *uncopied* values; iterating directly is
        # equivalent and avoids the copy.
        castedDict: Dict[str, BaseType] = {}
        for key, fieldValue in inputDict.items():
            castedDict[key] = cls.structTypeDict[key](fieldValue, ctxt)

        return castedDict

    @classmethod
    def checkValue(cls, value: Union[str, Dict[str, BaseType]], ctxt: Optional[_NetworkContext] = None):
        if isinstance(value, str):
            value = ctxt.lookup(value).structDict.value

        if not hasattr(value, "keys"):
            return False

        # Field names must match the declared layout exactly.
        if set(value.keys()) != set(cls.structTypeDict.keys()):
            return False

        # Every field value must be promotable to its declared type.
        for key, _value in value.items():
            if not cls.structTypeDict[key].checkPromotion(_value, ctxt):
                return False

        return True

    @classmethod
    def checkPromotion(cls,
                       _other: Union[str, Dict[str, BaseType], "Struct"],
                       ctxt: Optional[_NetworkContext] = None):
        if isinstance(_other, Struct):
            other = _other.value
        else:
            other = _other

        return cls.checkValue(other, ctxt)

    def __init__(self, structDict: Union[str, "Struct", Dict[str, BaseType]], ctxt: Optional[_NetworkContext] = None):
        """Initialize a new struct object.

        Parameters
        ----------
        structDict : Union[str, Struct, Dict[str, BaseType]]
            Either an initialized Deeploy-type struct, a string name referring
            to an initialized struct registered in the NetworkContext, or a
            full definition of the struct to be initialized
        ctxt : Optional[_NetworkContext]
            Current NetworkContext

        Raises
        ------
        Exception
            If structDict cannot be assigned to a struct of layout
            structTypeDict
        """
        if not self.checkPromotion(structDict, ctxt):
            raise Exception(f"Can't assign {structDict} to {type(self)}!")

        # value: element with the type layout defined in cls.structTypeDict
        self.value = self._castDict(structDict, ctxt)

    def __eq__(self, other):
        # Duck-typed comparison: other must look like a struct and agree on
        # every field of this struct.
        if not (hasattr(other, 'typeWidth') and hasattr(other, 'typeName') and hasattr(other, "value")):
            return False

        if any(key not in other.value.keys() for key in self.value.keys()):
            return False

        return all(self.value[key] == other.value[key] for key in self.value.keys())

    def __repr__(self):
        # C designated-initializer style, e.g. "{.a = 1, .b = 2}"
        pairs = [f".{key} = {str(value)}" for key, value in self.value.items()]
        return "{" + (", ").join(pairs) + "}"

    def _typeDefRepr(self):
        # C struct-body style, e.g. "{int8_t a; uint8_t b;}"
        pairs = [f"{value.typeName} {key}" for key, value in self.value.items()]
        return "{" + ("; ").join(pairs) + ";}"


def StructClass(typeName: str, _structTypeDict: Dict[str, Type[BaseType]]) -> Type[Struct]:  # type: ignore
    """Helper function to dynamically generate a Struct class from a
    structTypeDict definition. Used in closure generation to capture a
    closure's arguments.

    Parameters
    ----------
    typeName : str
        Name of the Struct class that is being created
    _structTypeDict : Dict[str, Type[BaseType]]
        Layout of the Struct class that is being created

    Returns
    -------
    Type[Struct]:
        The (module-global, memoized) Struct class for these arguments
    """
    # Memoize in module globals so repeated requests return the same class.
    if typeName not in globals().keys():
        retCls = type(typeName, (Struct,), {
            "typeName": typeName,
            "structTypeDict": _structTypeDict,
        })
        globals()[typeName] = retCls
    else:
        retCls = globals()[typeName]

    return retCls


def PointerClass(DeeployType: _DeeployType) -> Type[Pointer[BaseType]]:  # type: ignore
    """Generates a Pointer class definition at runtime that wraps around the
    given referenced type.

    Parameters
    ----------
    DeeployType : _DeeployType
        Type of the underlying referencedType

    Returns
    -------
    Type[Pointer[BaseType]]:
        The (module-global, memoized) Pointer class for DeeployType
    """
    typeName = DeeployType.typeName + "Ptr"
    if typeName not in globals().keys():
        # Pointers are 32 bits wide on all supported targets.
        retCls = type(typeName, (Pointer,), {"typeWidth": 32, "referencedType": DeeployType})
        globals()[typeName] = retCls
    else:
        retCls = globals()[typeName]

    return retCls
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Optional, Tuple, Type, Union + +from Deeploy.AbstractDataTypes import Immediate, Pointer, PointerClass, Struct, StructClass, VoidType +from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \ + IntrospectiveCodeTransformationMixIn +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, StructBuffer, TransientBuffer, _NoVerbosity + +# SCHEREMO: example template for a function closure call +_closureCallTemplate = NodeTemplate(""" +// ${closureName} CLOSURE CALL +${closureName}(&${closureStructArgName}); +""") + +_closureTemplate = NodeTemplate(""" +static void ${closureName}(void* ${closureName}_args){ +// CLOSURE ARG CAST +${closureStructArgs.typeName}* args = (${closureStructArgs.typeName}*) ${closureStructArgName}; +% for argName, argType in closureStructArgs.value.items(): +${argType.typeName} ${argName} = args->${argName}; +% endfor + +// CLOSURE FUNCTION CALL +${functionCall} + +// CLOSURE ARG WRITEBACK +% if writeback: +% for argName, argType in closureStructArgs.value.items(): +args->${argName} = ${argName}; +% endfor +% endif +} +""") 

# Restores the (possibly mutated) closure-struct fields back into the
# caller-scope variables after the closure returns.
_closureWriteBackTemplate = NodeTemplate("""
// CLOSURE ARG WRITEBACK
% for argName, argType in closureStructArgs.value.items():
${argName} = ${closureStructArgName}.${argName};
% endfor
""")

# Emits the typedef for the closure's argument struct.
_closureStructDefTemplate = NodeTemplate("""
typedef struct ${closureStructArgs._typeDefRepr()} ${closureStructArgName}_t;
""")


class ClosureExecutionBlock(ExecutionBlock):
    """ExecutionBlock wrapper that keeps a handle to the block it encloses,
    so that nested closures can be unwound to the innermost wrapped block."""

    def __init__(self, nodeTemplate = None, closureBlock: Optional[ExecutionBlock] = None):
        super().__init__(nodeTemplate)
        self.closureBlock = closureBlock

    @property
    def baseBlock(self):
        # Recurse through nested ClosureExecutionBlocks down to the first
        # non-closure block.
        if isinstance(self.closureBlock, ClosureExecutionBlock):
            return self.closureBlock.baseBlock
        return self.closureBlock


class ClosureGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
    """Code transformation that wraps an ExecutionBlock into a C closure:
    the block's dynamic references are captured into an argument struct, the
    block body is hoisted into a global function taking that struct, and the
    original call site is replaced by a call to the closure."""

    closureStructArgs: Struct  # struct instance holding the captured arguments

    def __init__(self,
                 closureCallTemplate: NodeTemplate = _closureCallTemplate,
                 closureSuffix = "_closure",
                 writeback: bool = True,
                 generateStruct: bool = True):
        super().__init__()
        self.closureSuffix = closureSuffix
        self.closureTemplate = _closureTemplate
        self.closureCallTemplate = closureCallTemplate
        self.closureStructDefTemplate = _closureStructDefTemplate
        self.closureWriteBackTemplate = _closureWriteBackTemplate
        # writeback: copy struct fields back to locals after the call
        self.writeback = writeback
        # generateStruct: run ArgumentStructGeneration after wrapping
        self.generateStruct = generateStruct

    # Don't override this
    def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: ExecutionBlock):
        """Collect the block's dynamic references and build the closure's
        argument-struct type and instance (stored on self)."""

        # Add closure struct info to operatorRepresentation
        closureStructArgsType: Dict[str, Type[Union[Pointer, Immediate, Struct]]] = {}
        closureStruct: Dict[str, Union[Pointer, Immediate, Struct]] = {}
        makoDynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, True)

        # dict.fromkeys deduplicates while preserving order.
        for arg in list(dict.fromkeys(makoDynamicReferences)):
            ref = ctxt.lookup(arg)
            # Transient buffers have no concrete element type; pass as void*.
            if isinstance(ref, TransientBuffer):
                closureStructArgsType[ctxt._mangle(arg)] = PointerClass(VoidType)
            elif not isinstance(ref, StructBuffer):
                closureStructArgsType[ctxt._mangle(arg)] = ref._type

            # StructBuffers are not captured by value in the closure struct.
            if not isinstance(ref, StructBuffer):
                closureStruct[ctxt._mangle(arg)] = arg

        structClass = StructClass(self.closureName + "_args_t", closureStructArgsType)
        self.closureStructArgType = structClass
        self.closureStructArgs = structClass(closureStruct, ctxt)

    # Don't override this
    def _generateClosureCtxt(self, ctxt: NetworkContext, nodeName: str) -> NetworkContext:
        """Hoist the argument struct and the closure function into the
        NetworkContext's global definitions."""

        ret = ctxt.hoistStruct(self.closureStructArgs, self.closureName + "_args", self.closureStructArgType)
        ctxt.lookup(ret)._users.append(nodeName)

        allArgs = {
            "closureName": self.closureName,
            "functionCall": self.functionCall,
            "closureStructArgs": ctxt.lookup(self.closureName + "_args").structDict,
            "closureStructArgName": self.closureName + "_args",
            "writeback": self.writeback
        }

        # SCHEREMO: These are global definitions
        closure = self.closureTemplate.generate(allArgs)
        closureStructDef = self.closureStructDefTemplate.generate(allArgs)
        closureStructName = self.closureName + '_args_t'

        ctxt.hoistGlobalDefinition(closureStructName, closureStructDef)
        ctxt.hoistGlobalDefinition(self.closureName, closure)

        return ctxt

    # Don't override this
    def _generateClosureCall(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
                             nodeName: str) -> Tuple[NetworkContext, ExecutionBlock]:
        """Replace the original block with a closure call (plus optional
        writeback and argument-struct generation)."""

        allArgs = {
            "closureName": self.closureName,
            "functionCall": self.functionCall,
            "closureStructArgs": ctxt.lookup(self.closureName + "_args").structDict,
            "closureStructArgName": self.closureName + "_args",
            "writeback": self.writeback
        }

        executionBlock = ClosureExecutionBlock(None, executionBlock)

        # SCHEREMO: These replace the function call
        executionBlock.addLeft(self.closureCallTemplate, allArgs)
        if self.writeback:
            executionBlock.addRight(self.closureWriteBackTemplate, allArgs)
        if self.generateStruct:
            ctxt, executionBlock = ArgumentStructGeneration().apply(ctxt, executionBlock, nodeName, _NoVerbosity)

        return ctxt, executionBlock

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
        # Order matters: the body must be generated BEFORE the struct/closure
        # are built, since _generateClosureStruct introspects the rendered
        # references of the original block.
        self.closureName = name + self.closureSuffix
        self.functionCall = executionBlock.generate(ctxt)
        self._generateClosureStruct(ctxt, executionBlock)
        ctxt = self._generateClosureCtxt(ctxt, name)
        ctxt, executionBlock = self._generateClosureCall(ctxt, executionBlock, name)
        return ctxt, executionBlock


class MemoryAwareClosureGeneration(ClosureGeneration):
    """ClosureGeneration variant that only captures buffers whose memory
    level is NOT the end region (e.g. keeps L1-resident buffers out of the
    closure struct when closing over an L2->L1 boundary)."""

    def __init__(self,
                 closureCallTemplate: NodeTemplate = _closureCallTemplate,
                 closureSuffix = "_closure",
                 writeback: bool = True,
                 generateStruct: bool = True,
                 startRegion: str = "L2",
                 endRegion: str = "L1"):
        super().__init__(closureCallTemplate, closureSuffix, writeback, generateStruct)
        self.startRegion = startRegion
        self.endRegion = endRegion

    # Don't override this
    def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: ExecutionBlock):
        """As in ClosureGeneration, but first filter out references that
        already live in the end region."""

        # Add closure struct info to operatorRepresentation
        closureStructArgsType = {}
        closureStruct = {}
        makoDynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, True)

        filteredMakoDynamicReferences = []

        for ref in makoDynamicReferences:
            buf = ctxt.lookup(ref)
            # Buffers without an assigned memory level are always captured.
            if not hasattr(buf, "_memoryLevel") or buf._memoryLevel is None:
                filteredMakoDynamicReferences.append(ref)
                continue

            # Capture anything in the start region or outside the end region.
            if buf._memoryLevel == self.startRegion or buf._memoryLevel != self.endRegion:
                filteredMakoDynamicReferences.append(ref)

        for arg in list(dict.fromkeys(filteredMakoDynamicReferences)):
            ref = ctxt.lookup(arg)
            if isinstance(ref, TransientBuffer):
                closureStructArgsType[ctxt._mangle(arg)] = PointerClass(VoidType)
            elif not isinstance(ref, StructBuffer):
                closureStructArgsType[ctxt._mangle(arg)] = ref._type

            if not isinstance(ref, StructBuffer):
                closureStruct[ctxt._mangle(arg)] = arg

        # (struct-class construction continues past this view)
structClass = StructClass(self.closureName + "_args_t", closureStructArgsType) + self.closureStructArgType = structClass + self.closureStructArgs = self.closureStructArgType(closureStruct, ctxt) diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py b/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py new file mode 100644 index 0000000..b73eb64 --- /dev/null +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py @@ -0,0 +1,43 @@ +# ---------------------------------------------------------------------- +# +# File: CycleMeasurement.py +# +# Last edited: 13.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ProfilingCodeGeneration(CodeTransformationPass):
    """Code transformation that brackets an ExecutionBlock with cycle-count
    instrumentation: it reads a cycle counter before and after the block and
    prints the elapsed cycles.

    The emitted C code relies on a ``getCycles()`` function and ``printf``
    being available in the generated translation unit.
    """

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose = None) -> Tuple[NetworkContext, ExecutionBlock]:
        """Prepend/append the cycle-measurement snippets around the block.

        CONSISTENCY FIX: other CodeTransformationPass implementations in this
        code base (e.g. ClosureGeneration.apply) accept a fourth `verbose`
        parameter and passes invoke apply() with it positionally (see
        `ArgumentStructGeneration().apply(ctxt, executionBlock, nodeName,
        _NoVerbosity)`); without this parameter the pass raises TypeError in
        the same pipeline. `None` is used as the default to avoid importing
        `_NoVerbosity`; the value is ignored here.
        """
        # Snapshot the cycle counter before the block runs.
        executionBlock.addLeft(NodeTemplate("""
        uint32_t ${op}_cycles = getCycles();
        """), {"op": name})
        # Read it again afterwards and report the delta.
        executionBlock.addRight(
            NodeTemplate("""
        uint32_t ${op}_endCycles = getCycles();
        printf("${op} took %u cycles \\n", ${op}_endCycles - ${op}_cycles);
        """), {"op": name})
        return ctxt, executionBlock
import copy
import types
from typing import Dict, List

import mako.codegen as codegen
from mako.lexer import Lexer
from mako.parsetree import Expression, TemplateNode

from Deeploy.AbstractDataTypes import Pointer, Struct
from Deeploy.DeeployTypes import ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation

# Sentinel used in struct representations for null pointers; such fields are
# never treated as dynamic references.
_NULL: str = "NULL"


class IntrospectiveCodeTransformationMixIn():
    """Mixin that inspects mako templates of code snippets to discover which
    context buffers a template actually references at render time.

    Used by code transformation passes (memory allocation, struct generation,
    debug printing) to operate only on buffers that appear in the generated
    code.
    """

    # Class-level cache, keyed by hash of the template source, shared across
    # all instances: parsing the same template repeatedly would be wasteful.
    parseTreeDict: Dict[int, TemplateNode] = {}

    @staticmethod
    def _generateParseTree(template: NodeTemplate) -> TemplateNode:
        # Re-lex the raw mako source of the wrapped template into a parse tree.
        return Lexer(template.template._source).parse()

    @staticmethod
    def _reconstructCode(template: NodeTemplate, node: TemplateNode):
        """Regenerate the compiled mako module of `template` from a (possibly
        modified) parse tree `node`, replacing the template's render callable
        in place.

        NOTE(review): relies on mako internals (`_source`, `_code`, `module`,
        `callable_`); sensitive to the installed mako version.
        """

        def fixupParseTree(parseTree: TemplateNode) -> TemplateNode:
            # mako's codegen expects consistent (lineno, pos) source positions.
            # After tree edits these may be stale, so recompute each node's
            # column position from the accumulated source lengths on its line.
            nodes = []
            prevLine = 0
            prevPos = 0
            for node in parseTree.nodes:

                newNode = copy.copy(node)
                offset = len(node.source)

                # Expression contain the actual expression + the symbols "${}", i.e. 3 offset symbols
                if isinstance(newNode, Expression):
                    offset += 3

                prevPos = prevPos + offset

                # First node on a new line: restart from its recorded position.
                if prevLine != node.lineno:
                    prevPos = node.pos

                newNode.pos = prevPos
                prevLine = node.lineno

                nodes.append(newNode)

            parseTree.nodes = nodes

            return parseTree

        node = fixupParseTree(node)

        temp = template.template
        lexer = Lexer(temp._source)
        # Generate python module source from the parse tree, preserving the
        # original template's compilation settings.
        source = codegen.compile(
            node,
            temp.uri,
            None,
            default_filters = temp.default_filters,
            buffer_filters = temp.buffer_filters,
            imports = temp.imports,
            future_imports = temp.future_imports,
            source_encoding = lexer.encoding,
            generate_magic_comment = True,
            strict_undefined = temp.strict_undefined,
            enable_loop = temp.enable_loop,
            reserved_names = temp.reserved_names,
        )
        # Execute the generated source in a fresh module and splice the new
        # render callable back into the existing template object.
        module = types.ModuleType(temp.module_id)
        code = compile(source, temp.module_id, "exec")
        exec(code, module.__dict__, module.__dict__)

        temp._code = code
        temp.module = module
        temp.callable_ = temp.module.render_body
        template.template = temp

    def extractDynamicReferences(self,
                                 ctxt: NetworkContext,
                                 executionBlock: ExecutionBlock = None,
                                 unrollStructs = False):
        """Collect the names of all dynamically allocated buffers referenced by
        any code snippet of `executionBlock`, deduplicated and ordered to match
        the context's declaration order.

        NOTE(review): `executionBlock` defaults to None but is dereferenced
        unconditionally — callers must always pass it.
        """

        makoDynamicReferences = []
        for codeSnippet in executionBlock.codeSnippets:
            template, operatorRepresentation = codeSnippet.template, codeSnippet.operatorRepresentation

            newRefs = self._extractDynamicExpressions(ctxt, operatorRepresentation, template, unrollStructs)

            makoDynamicReferences += newRefs

        # dict.fromkeys deduplicates while preserving first-seen order.
        ret = IntrospectiveCodeTransformationMixIn._fixCtxtOrdering(ctxt, list(dict.fromkeys(makoDynamicReferences)))

        return ret

    @staticmethod
    def _fixCtxtOrdering(ctxt: NetworkContext, nameList: List[str]) -> List[str]:
        # Sort the names by their position in the context (globals first, then
        # locals) so downstream passes see a deterministic order.
        orderList = [*ctxt.globalObjects.keys(), *ctxt.localObjects.keys()]
        _nameList = sorted(nameList.copy(), key = lambda key: orderList.index(key))

        return _nameList

    def _extractDynamicExpressions(self,
                                   ctxt: NetworkContext,
                                   operatorRepresentation: OperatorRepresentation,
                                   template: NodeTemplate,
                                   unrollStructs = False):
        """Return the names of dynamically deployed local buffers referenced by
        `template` through `operatorRepresentation`.

        If `unrollStructs` is set, pointer fields inside struct-typed buffers
        are followed (recursively) and counted as references as well.
        """

        codeHash = hash(template.template._source)

        # Cache hit avoids re-lexing identical template sources.
        if codeHash in self.parseTreeDict.keys():
            makoParseTree = self.parseTreeDict[codeHash]
        else:
            # Parse the user-provided template
            makoParseTree = IntrospectiveCodeTransformationMixIn._generateParseTree(template)
            self.parseTreeDict[codeHash] = makoParseTree

        # Filter parsing tree for expressions
        makoExpressions = [node.text for node in makoParseTree.nodes if type(node) == Expression]

        # Filter expressions for variables contained in operatorRepresentation
        # that resolve (as strings) to local context objects.
        makoReferences = [
            node for node in makoExpressions
            if ((node in operatorRepresentation) and type(operatorRepresentation[node]) == str and (
                operatorRepresentation[node] in ctxt.localObjects.keys()))
        ]

        def _unrollStructReferences(val) -> List[str]:
            # Unroll struct references
            structReferences = []
            if isinstance(val, Struct):
                for key, _type in val.value.items():
                    if isinstance(_type, Struct):
                        # Nested struct: recurse.
                        structReferences += _unrollStructReferences(val.value[key])
                    elif isinstance(_type, Pointer) and val.value[key].referenceName != _NULL:
                        structReferences.append(val.value[key].referenceName)
            return structReferences

        references = []
        structReferences = []
        for ref in makoReferences:
            references.append(operatorRepresentation[ref])
            if unrollStructs:
                if (ctxt.is_local(operatorRepresentation[ref])
                        or ctxt.is_global(operatorRepresentation[ref])) and hasattr(
                            ctxt.lookup(operatorRepresentation[ref]), "structDict"):
                    structReferences += _unrollStructReferences(ctxt.lookup(operatorRepresentation[ref]).structDict)

        # Filter for dynamically allocated tensors

        dynamicReferences = [ref for ref in references + structReferences if (ctxt.lookup(ref)._deploy)]
        return dynamicReferences
import re
from functools import partial
from typing import List, Optional, Tuple

from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \
    IntrospectiveCodeTransformationMixIn
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \
    NodeTemplate, StructBuffer, TransientBuffer, _NoVerbosity


class _ArgStructAllocateTemplate(NodeTemplate):
    # NodeTemplate that remembers which buffer it allocates; used below to
    # avoid emitting a second allocation for the same struct buffer.

    def __init__(self, templateStr: str, bufferName: str):
        super().__init__(templateStr)
        self.bufferName = bufferName


# Stack-allocates a struct variable and initializes it from its struct
# representation in one C statement.
_stackAllocateTemplate = partial(
    _ArgStructAllocateTemplate,
    templateStr = "${structDict.typeName} ${name} = (${structDict.typeName}) ${str(structDict)};")


class ArgumentStructGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
    """Emit stack allocations for argument structs used by an execution block."""

    def __init__(self):
        super().__init__()

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
        """Prepend a stack allocation for every StructBuffer used by node
        `name` that the block references and has not already allocated."""

        # All buffers dynamically referenced by the block (structs unrolled).
        references = self.extractDynamicReferences(ctxt, executionBlock, True)
        buffers = [ctxt.lookup(key) for key in references]

        # Struct buffers that already have an allocation snippet in the block.
        closureStructBufferNames = [
            codeSnippet.template.bufferName
            for codeSnippet in executionBlock.codeSnippets
            if isinstance(codeSnippet.template, _ArgStructAllocateTemplate)
        ]

        buffers = [buf for buf in buffers if buf.name not in closureStructBufferNames]

        for _buffer in buffers:
            if isinstance(_buffer, StructBuffer) and name in _buffer._users:
                executionBlock.addLeft(_stackAllocateTemplate(bufferName = _buffer.name),
                                       _buffer._bufferRepresentation())

        return ctxt, executionBlock


class MemoryManagementGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
    """Emit allocation/deallocation code for the buffers an execution block
    uses, optionally restricted to buffers whose `_memoryLevel` matches a
    regex (e.g. to manage only one level of a memory hierarchy)."""

    def __init__(self, memoryHierarchyRegex: Optional[str] = None):
        super().__init__()
        if memoryHierarchyRegex is not None:
            self.regex = re.compile(memoryHierarchyRegex)
        else:
            self.regex = None

    def _matchesRegex(self, ctxt: NetworkContext, key: str) -> bool:
        # Without a regex, this pass is responsible exactly for buffers that
        # have NO memory-level annotation; with a regex, only for annotated
        # buffers whose level matches.
        _buffer = ctxt.lookup(key)

        if self.regex is None:
            return not hasattr(_buffer, "_memoryLevel")

        if not hasattr(_buffer, "_memoryLevel"):
            return False

        ret = self.regex.findall(ctxt.lookup(key)._memoryLevel)
        return ret != []

    def _extractTransientBuffers(self, ctxt: NetworkContext, name: str) -> List[str]:
        """Return matching TransientBuffers (scratch space) used by node `name`."""
        names = []

        for key, _buffer in ctxt.localObjects.items():
            if isinstance(_buffer, TransientBuffer) and name in _buffer._users:
                names.append(key)

        filteredNames = [key for key in names if self._matchesRegex(ctxt, key)]

        return filteredNames

    def _getOutputNames(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str) -> List[str]:
        """Local buffers referenced by the block in which node `name` is NOT a
        user — i.e. buffers this node produces — deduplicated, order-preserving."""
        outputs = []
        references = self.extractDynamicReferences(ctxt, executionBlock, True)
        localKeys = [key for key in references if ctxt.is_local(key)]

        filteredKeys = [key for key in localKeys if self._matchesRegex(ctxt, key)]

        for key in filteredKeys:
            _buffer = ctxt.lookup(key)
            # Structs and transients are handled by other code paths.
            if isinstance(_buffer, (StructBuffer, TransientBuffer)):
                continue
            if name not in _buffer._users:
                outputs.append(_buffer.name)

        return list(dict.fromkeys(outputs))

    def _getFinalInputNames(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str) -> List[str]:
        """Local buffers for which node `name` is the LAST user — safe to free
        after this block — deduplicated, order-preserving."""
        inputs = []
        references = self.extractDynamicReferences(ctxt, executionBlock, True)
        localKeys = [key for key in references if ctxt.is_local(key)]

        filteredKeys = [key for key in localKeys if self._matchesRegex(ctxt, key)]

        for key in filteredKeys:
            _buffer = ctxt.lookup(key)
            if isinstance(_buffer, (StructBuffer, TransientBuffer)) or _buffer._users == []:
                continue
            if name == _buffer._users[-1]:
                inputs.append(_buffer.name)

        return list(dict.fromkeys(inputs))

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
        """Allocate outputs/transients before the block and free final inputs/
        transients after it, maintaining the `_live` flag as the invariant."""

        outputNames = self._getOutputNames(ctxt, executionBlock, name)
        inputNames = self._getFinalInputNames(ctxt, executionBlock, name)
        transientBuffers = self._extractTransientBuffers(ctxt, name)

        # We have to allocate the output buffers, unless they are global

        # Outputs are allocated in reverse order — presumably to mirror the
        # deallocation order elsewhere; confirm before changing.
        for buffer in list(reversed(outputNames)) + transientBuffers:
            nb = ctxt.lookup(buffer)
            assert ctxt.localObjects[nb.name]._live == False, f"Tried to allocate already live buffer {nb.name}"
            ctxt.localObjects[nb.name]._live = True
            executionBlock.addLeft(nb.allocTemplate, nb._bufferRepresentation())

        for buffer in inputNames + transientBuffers:
            nb = ctxt.lookup(buffer)
            assert ctxt.localObjects[nb.name]._live == True, f"Tried to deallocate already dead buffer {nb.name}"
            ctxt.localObjects[nb.name]._live = False
            executionBlock.addRight(nb.deallocTemplate, nb._bufferRepresentation())

        return ctxt, executionBlock


class MemoryPassthroughGeneration(MemoryManagementGeneration, IntrospectiveCodeTransformationMixIn):
    """Like MemoryManagementGeneration, but only updates the `_live`
    bookkeeping WITHOUT emitting any alloc/dealloc code — for buffers whose
    memory is managed by some other mechanism (e.g. a tiling runtime);
    TODO confirm intended use against callers."""

    def __init__(self, memoryHierarchyRegex: Optional[str] = None):
        super().__init__(memoryHierarchyRegex)

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        outputNames = self._getOutputNames(ctxt, executionBlock, name)
        inputNames = self._getFinalInputNames(ctxt, executionBlock, name)
        transientBuffers = self._extractTransientBuffers(ctxt, name)

        # We have to allocate the output buffers, unless they are global
        for buffer in outputNames + transientBuffers:
            nb = ctxt.lookup(buffer)

            assert ctxt.localObjects[nb.name]._live == False, f"Tried to allocate already live buffer {nb.name}"
            ctxt.localObjects[nb.name]._live = True

        for buffer in inputNames + transientBuffers:
            nb = ctxt.lookup(buffer)

            assert ctxt.localObjects[nb.name]._live == True, f"Tried to deallocate already dead buffer {nb.name}"
            ctxt.localObjects[nb.name]._live = False

        return ctxt, executionBlock
import re
from typing import Optional, Tuple

from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \
    IntrospectiveCodeTransformationMixIn
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ConstantBuffer, ExecutionBlock, \
    NetworkContext, NodeTemplate, StructBuffer, TransientBuffer, _NoVerbosity

# Emits nested C for-loops that pretty-print a buffer's contents (plus its
# name, element type, shape and address) at runtime, for debugging.
_DebugPrintTemplate = NodeTemplate("""
<%
import numpy as np
accessStr = ""
dimStr = ""
for idx, dim in enumerate(bufferShape):
    accessStr += "[" + f"print_iter_{idx}" + "]"
    if idx > 0:
        dimStr += "[" + f"{dim}" + "]"
%>
printf("${nodeName} ${bufferName}: ${bufferType.referencedType.typeName}, ${bufferShape}, %p\\n", ${bufferName});
% for idx, dim in enumerate(bufferShape):
printf("[");
for (int print_iter_${idx}=0; print_iter_${idx} < ${dim}; print_iter_${idx}++){
% endfor
printf("%*i,", 4, ((${bufferType.referencedType.typeName} (*)${dimStr})${bufferName})${accessStr});
% for dim in bufferShape:
}
printf("], \\n");
%endfor
""")


class PrintInputGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
    """Insert debug printfs for a node's input buffers before the block runs."""

    def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str):
        """Build the template representation for `ref` if it is an input of
        node `name`; return None otherwise."""
        _buf = ctxt.lookup(ref)
        refbuf = _buf

        # Follow reference chains to the underlying buffer.
        while hasattr(_buf, "_referenceName"):
            _buf = ctxt.lookup(_buf._referenceName)

        # Transients, constants and structs are not printable inputs.
        if isinstance(_buf, (TransientBuffer, ConstantBuffer, StructBuffer)):
            return None

        # Only buffers consumed by this node are inputs.
        if name not in _buf._users:
            return None

        return {"bufferName": refbuf.name, "bufferType": _buf._type, "bufferShape": _buf.shape, "nodeName": name}

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        references = self.extractDynamicReferences(ctxt, executionBlock, True)

        for ref in references:
            refDict = self._getRepDict(ctxt, ref, name)
            if refDict is not None:
                executionBlock.addLeft(_DebugPrintTemplate, refDict)

        return ctxt, executionBlock


class MemoryAwareGeneration():
    """Mixin restricting a generation pass to buffers whose `_memoryLevel`
    matches a regex (or to un-annotated buffers when no regex is given)."""

    def __init__(self, memoryHierarchyRegex: Optional[str] = None):
        super().__init__()
        if memoryHierarchyRegex is not None:
            self.regex = re.compile(memoryHierarchyRegex)
        else:
            self.regex = None

    def _matchesRegex(self, ctxt: NetworkContext, key: str) -> bool:
        _buffer = ctxt.lookup(key)

        # No regex: this pass owns exactly the buffers without a memory level.
        if self.regex is None:
            return not hasattr(_buffer, "_memoryLevel")

        if not hasattr(_buffer, "_memoryLevel"):
            return False

        ret = self.regex.findall(ctxt.lookup(key)._memoryLevel)
        return ret != []


class MemoryAwarePrintInputGeneration(MemoryAwareGeneration, PrintInputGeneration):
    """PrintInputGeneration restricted to a memory level via regex."""

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        references = self.extractDynamicReferences(ctxt, executionBlock, True)

        filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)]

        for ref in filteredReferences:
            refDict = self._getRepDict(ctxt, ref, name)
            if refDict is not None:
                executionBlock.addLeft(_DebugPrintTemplate, refDict)

        return ctxt, executionBlock


class PrintOutputGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
    """Insert debug printfs for a node's output buffers after the block runs."""

    def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str):
        """Build the template representation for `ref` if it is an output of
        node `name`; return None otherwise."""
        _buf = ctxt.lookup(ref)
        refbuf = _buf

        while hasattr(_buf, "_referenceName"):
            _buf = ctxt.lookup(_buf._referenceName)

        if isinstance(_buf, (TransientBuffer, ConstantBuffer, StructBuffer)):
            return None

        # Buffers this node consumes are inputs, not outputs.
        if name in _buf._users:
            return None

        # A local buffer nobody consumes is dead — don't print it.
        if _buf._users == [] and not ctxt.is_global(_buf.name):
            return None

        return {"bufferName": refbuf.name, "bufferType": _buf._type, "bufferShape": _buf.shape, "nodeName": name}

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        references = self.extractDynamicReferences(ctxt, executionBlock, True)

        for ref in references:
            rep = self._getRepDict(ctxt, ref, name)
            if rep is not None:
                executionBlock.addRight(_DebugPrintTemplate, rep)

        return ctxt, executionBlock


class MemoryAwarePrintOutputGeneration(MemoryAwareGeneration, PrintOutputGeneration):
    """PrintOutputGeneration restricted to a memory level via regex."""

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        references = self.extractDynamicReferences(ctxt, executionBlock, True)

        filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)]

        for ref in filteredReferences:
            refDict = self._getRepDict(ctxt, ref, name)
            if refDict is not None:
                executionBlock.addRight(_DebugPrintTemplate, refDict)

        return ctxt, executionBlock


class PrintConstantGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
    """Insert debug printfs for constant buffers referenced by the block."""

    def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str):
        _buf = ctxt.lookup(ref)
        refbuf = _buf

        while hasattr(_buf, "_referenceName"):
            _buf = ctxt.lookup(_buf._referenceName)

        # Only live constants are of interest here.
        if not isinstance(_buf, ConstantBuffer) or _buf._users == []:
            return None

        return {"bufferName": refbuf.name, "bufferType": _buf._type, "bufferShape": _buf.shape, "nodeName": name}

    # CONSISTENCY FIX: accept `verbose` with the same default as every other
    # apply() in this module (backward compatible for existing callers).
    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        references = self.extractDynamicReferences(ctxt, executionBlock, True)

        for ref in references:
            rep = self._getRepDict(ctxt, ref, name)
            if rep is not None:
                executionBlock.addLeft(_DebugPrintTemplate, rep)

        return ctxt, executionBlock


class MemoryAwarePrintConstantGeneration(MemoryAwareGeneration, PrintConstantGeneration):
    """PrintConstantGeneration restricted to a memory level via regex."""

    # CONSISTENCY FIX: accept `verbose` like the sibling passes above.
    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        references = self.extractDynamicReferences(ctxt, executionBlock, True)

        filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)]

        for ref in filteredReferences:
            refDict = self._getRepDict(ctxt, ref, name)
            if refDict is not None:
                executionBlock.addLeft(_DebugPrintTemplate, refDict)

        return ctxt, executionBlock
from typing import Tuple, Type

from Deeploy.AbstractDataTypes import IntegerImmediate

# Concrete C-style integer immediate types. Each class fixes the C type name,
# bit width, and signedness used during code generation.


class int8_t(IntegerImmediate):
    typeName = "int8_t"
    typeWidth = 8
    signed = True


class int16_t(IntegerImmediate):
    typeName = "int16_t"
    typeWidth = 16
    signed = True


class int32_t(IntegerImmediate):
    typeName = "int32_t"
    typeWidth = 32
    signed = True


class int64_t(IntegerImmediate):
    typeName = "int64_t"
    typeWidth = 64
    signed = True


class uint8_t(IntegerImmediate):
    typeName = "uint8_t"
    typeWidth = 8
    signed = False


class uint16_t(IntegerImmediate):
    typeName = "uint16_t"
    typeWidth = 16
    signed = False


class uint32_t(IntegerImmediate):
    typeName = "uint32_t"
    typeWidth = 32
    signed = False


class uint64_t(IntegerImmediate):
    typeName = "uint64_t"
    typeWidth = 64
    signed = False


SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (int8_t, int16_t, int32_t, int64_t)
UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t)
# BUGFIX: `sorted()` returns a list; the original bound a list to a name
# annotated as Tuple. Wrap in tuple() so the value matches the annotation.
# Sorted ascending by bit width, interleaving signed and unsigned types.
IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = tuple(
    sorted((
        *SignedIntegerDataTypes,
        *UnsignedIntegerDataTypes,
    ),
           key = lambda _type: _type.typeWidth))
from typing import Any, Tuple, Union

import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkContext, NetworkDeployer, ONNXLayer, _NoVerbosity


class NetworkDeployerWrapper(NetworkDeployer):
    """Transparent delegating wrapper around a NetworkDeployer.

    Forwards all attribute reads and (existing-attribute) writes to the
    wrapped deployer, so extension wrappers can be stacked without losing the
    inner object's augmented behavior. Note that NetworkDeployer.__init__ is
    intentionally NOT called — the wrapper is a pure proxy.
    """

    def __init__(self, deployer: NetworkDeployer):
        # Write through __dict__ to bypass our own __setattr__, which would
        # otherwise dereference _innerObject before it exists (and recurse).
        self.__dict__["_innerObject"] = deployer

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails on the wrapper,
        # i.e. for everything not defined here -> delegate to inner object.
        return getattr(self._innerObject, name)

    def __setattr__(self, name, value):
        # Attributes the inner deployer already owns are written through to
        # it; anything else is stored on the wrapper itself.
        if hasattr(self._innerObject, name):
            setattr(self._innerObject, name, value)
        else:
            super().__setattr__(name, value)

    """ Class attributes
    Class attributes don't get caught by __getattr__ method so we have to explicitly override them
    """

    @property
    def parsed(self):
        return self._innerObject.parsed

    @property
    def bound(self):
        return self._innerObject.bound

    @property
    def transformed(self):
        return self._innerObject.transformed

    @property
    def prepared(self):
        return self._innerObject.prepared

    """ Extension augmented methods
    Extensions augment methods and to preserve these augmentations, we have to call the innerObjects method instead of just using the inherited one.
    """

    # SignPropDeployer augment
    def _createIOBindings(self, ctxt: NetworkContext, graph: gs.Graph):
        return self._innerObject._createIOBindings(ctxt, graph)

    # MemoryAwareDeployer, TilerAwareDeployer, and PULPDeployer augments
    def bind(self) -> bool:
        return self._innerObject.bind()

    # MemoryAwareDeployer augment
    def lower(self, graph: gs.Graph) -> gs.Graph:
        return self._innerObject.lower(graph)

    # MemoryAwareDeployer augment
    def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity):
        return self._innerObject.codeTransform(verbose)

    # MemoryAwareDeployer augment
    def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext,
                   default_channels_first: bool) -> Tuple[NetworkContext, bool]:
        return self._innerObject._parseNode(node, ctxt, default_channels_first)

    # PULPDeployer augment
    def generateBufferAllocationCode(self) -> str:
        return self._innerObject.generateBufferAllocationCode()

    # MultiEngineDeployer augment
    def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]:
        return self._innerObject._mapNode(node)
from typing import Callable, Dict, Optional, Type

import onnx_graphsurgeon as gs

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer


class SignPropDeployer(NetworkDeployer):
    """NetworkDeployer that propagates signedness information for the network
    inputs: an input with offset 0 is considered signed, any other offset
    marks it unsigned."""

    def __init__(self,
                 graph: gs.Graph,
                 deploymentPlatform: DeploymentPlatform,
                 inputTypes: Dict[str, Type[Pointer]],
                 loweringOptimizer: TopologyOptimizer,
                 scheduler: Callable = lambda x: x,
                 name: str = 'DeeployNetwork',
                 default_channels_first: bool = True,
                 deeployStateDir: str = "DeeployState",
                 inputOffsets: Optional[Dict[str, int]] = None):
        """See NetworkDeployer for the common parameters.

        Parameters
        ----------
        inputOffsets : Optional[Dict[str, int]]
            Per-input offsets; defaults to 0 for every input when omitted
            or empty.
        """
        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
                         default_channels_first, deeployStateDir)

        # BUGFIX: the original signature used a mutable default (`= {}`) and
        # then mutated it, leaking offsets across instances and mutating
        # caller-supplied dicts. Copy the argument instead.
        inputOffsets = dict(inputOffsets) if inputOffsets is not None else {}

        if inputOffsets == {}:
            for key in inputTypes.keys():
                inputOffsets[key] = 0

        self.inputOffsets = inputOffsets

    def _createIOBindings(self, ctxt, graph):
        """Extend the base IO bindings with signedness and level count for
        each network input."""
        ctxt = super()._createIOBindings(ctxt, graph)
        for node in graph.inputs:
            data_name = node.name
            nb = ctxt.lookup(data_name)
            data_type = self.inputTypes[data_name]
            # Offset 0 means the input is interpreted as a signed type.
            nb._signed = (self.inputOffsets[data_name] == 0)
            nb.nLevels = (2**data_type.referencedType.typeWidth)

        return ctxt
10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/AutoTranspose.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/AutoTranspose.py new file mode 100644 index 0000000..5ff95db --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/AutoTranspose.py @@ -0,0 +1,218 @@ +# ---------------------------------------------------------------------- +# +# File: AutoTranspose.py +# +# Last edited: 20.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.BindingsOptimizationPasses.bindingUtils import bypassNode, \ + editAttribute +from Deeploy.CommonExtensions.OptimizationPasses.BindingsOptimizationPasses.PassClasses import bindingaware +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import BranchingMatcher, Match, NonBranchingMatcher +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, SequentialPass +from Deeploy.DeeployTypes import NetworkContext + + +def _merge_transposeRequantInputs_fun(ctxt: NetworkContext, layerBinding, match: Match, name: str): + + nodes_map = match.nodes_map + in1Node = nodes_map['any1'] + rqNode = nodes_map['rqsOut'] + + mergeCondition = lambda node: all([ + node.op == "Transpose", not "bypass" in node.attrs.keys(), + len(node.outputs[0].outputs) == 1, rqNode.attrs["targetMemoryLevelMap"][rqNode.outputs[0].name] == "L1", + ctxt.lookup(node.outputs[0].name)._memoryLevel != "L1" + ]) + + # if rqNode.op == "RequantizedAdd": + # return ctxt, layerBinding + + if not hasattr(ctxt.lookup(in1Node.outputs[0].name), "_memoryLevel"): + return ctxt, layerBinding + + orderedInputNames = [inp.name for inp in rqNode.inputs] + + if mergeCondition(in1Node): + + idx = orderedInputNames.index(in1Node.outputs[0].name) + + editAttribute(layerBinding, rqNode, f"in{idx}_perm", in1Node.attrs["perm"]) + editAttribute(layerBinding, in1Node, "bypass", 1) + ctxt, layerBinding = bypassNode(ctxt, layerBinding, in1Node) + + if 'any2' not in nodes_map.keys(): + return ctxt, layerBinding + + in2Node = nodes_map['any2'] + + if not hasattr(ctxt.lookup(in2Node.outputs[0].name), "_memoryLevel"): + return ctxt, layerBinding + + if mergeCondition(in2Node): + idx = orderedInputNames.index(in2Node.outputs[0].name) + + editAttribute(layerBinding, rqNode, f"in{idx}_perm", 
in2Node.attrs["perm"]) + editAttribute(layerBinding, in2Node, "bypass", 1) + ctxt, layerBinding = bypassNode(ctxt, layerBinding, in2Node) + + return ctxt, layerBinding + + +def _merge_transposeRequantOutputs_fun(ctxt: NetworkContext, layerBinding, match: Match, name: str): + + nodes_map = match.nodes_map + + outNode = nodes_map['anyOut'] + rqNode = nodes_map['rqs'] + + if not hasattr(ctxt.lookup(rqNode.outputs[0].name), "_memoryLevel"): + return ctxt, layerBinding + + mergeCondition = lambda node: all([ + node.op == "Transpose", not "bypass" in node.attrs.keys(), + len(rqNode.outputs[0].outputs) == 1, rqNode.attrs["targetMemoryLevelMap"][rqNode.outputs[0].name] == "L1", + ctxt.lookup(rqNode.outputs[0].name)._memoryLevel != "L1" + ]) + + if mergeCondition(outNode): + editAttribute(layerBinding, rqNode, "out_perm", outNode.attrs["perm"]) + editAttribute(layerBinding, outNode, "bypass", 1) + + return ctxt, layerBinding + + +@bindingaware +class _CAPass(ReplaceSequentialPatternPass): + pass + + +@bindingaware +class AutoTransposeMergeOutputsPass(SequentialPass): + + def _buildSingleVarGraph(self) -> gs.Graph: + + _input1 = gs.Variable(name = 'input_1') + + _rqs = gs.Variable(name = 'rqsVar') + _rqsOut = gs.Variable(name = 'rqsOutput') + + output = gs.Node(inputs = [_input1], outputs = [_rqs], op = r'Requantized.*', name = 'rqs') + rqsOut = gs.Node(inputs = [_rqs], outputs = [_rqsOut], op = r'.*', name = 'anyOut') + + graph = gs.Graph(nodes = [output, rqsOut], inputs = [_input1], outputs = [_rqsOut]).cleanup() + + return graph + + def _buildDualVarGraph(self) -> gs.Graph: + + _input1 = gs.Variable(name = 'input_1') + _input2 = gs.Variable(name = 'input_2') + + _rqs = gs.Variable(name = 'rqsVar') + _rqsOut = gs.Variable(name = 'rqsOutput') + + output = gs.Node(inputs = [_input1, _input2], outputs = [_rqs], op = r'Requantized.*', name = 'rqs') + rqsOut = gs.Node(inputs = [_rqs], outputs = [_rqsOut], op = r'.*', name = 'anyOut') + + graph = gs.Graph(nodes = [output, 
rqsOut], inputs = [_input1, _input2], outputs = [_rqsOut]).cleanup() + + return graph + + def __init__(self): + + pass1 = _CAPass(self._buildSingleVarGraph(), + replacement_fn = _merge_transposeRequantOutputs_fun, + name = "_MERGE_TransposeRQ_PASS", + matcher = NonBranchingMatcher(regex_op = True)) + + pass2 = _CAPass(self._buildDualVarGraph(), + replacement_fn = _merge_transposeRequantOutputs_fun, + name = "_MERGE_TransposeRQ_PASS", + matcher = BranchingMatcher(regex_op = True)) + + super().__init__(pass1, pass2) + + +@bindingaware +class AutoTransposeMergeInputsPass(SequentialPass): + + def _buildSingleVarGraph(self) -> gs.Graph: + + _input1 = gs.Variable(name = 'input_1') + _rqIn1 = gs.Variable(name = 'rqIn1') + + _rqs = gs.Variable(name = 'rqs') + + anyIn1 = gs.Node(inputs = [_input1], outputs = [_rqIn1], op = r'.*', name = 'any1') + output = gs.Node(inputs = [_rqIn1], outputs = [_rqs], op = r'Requantized.*', name = 'rqsOut') + + graph = gs.Graph(nodes = [anyIn1, output], inputs = [_input1], outputs = [_rqs]) + + return graph + + def _buildDualVarGraph(self) -> gs.Graph: + + _input1 = gs.Variable(name = 'input_1') + _input2 = gs.Variable(name = 'input_2') + + _rqIn1 = gs.Variable(name = 'rqIn1') + _rqIn2 = gs.Variable(name = 'rqIn2') + + _rqs = gs.Variable(name = 'rqs') + _rqsOut = gs.Variable(name = 'rqs') + + anyIn1 = gs.Node(inputs = [_input1], outputs = [_rqIn1], op = r'.*', name = 'any1') + anyIn2 = gs.Node(inputs = [_input2], outputs = [_rqIn2], op = r'.*', name = 'any2') + + output = gs.Node(inputs = [_rqIn1, _rqIn2], outputs = [_rqs], op = r'Requantized.*', name = 'rqsOut') + + graph = gs.Graph(nodes = [anyIn1, anyIn2, output], inputs = [_input1, _input2], outputs = [_rqs]) + + return graph + + def __init__(self): + + pass1 = _CAPass(self._buildSingleVarGraph(), + replacement_fn = _merge_transposeRequantInputs_fun, + name = "_MERGE_TransposeRQ_PASS", + matcher = NonBranchingMatcher(regex_op = True)) + + pass2 = _CAPass(self._buildDualVarGraph(), + 
replacement_fn = _merge_transposeRequantInputs_fun, + name = "_MERGE_TransposeRQ_PASS", + matcher = BranchingMatcher(regex_op = True)) + + super().__init__(pass1, pass2) + + +@bindingaware +class AutoTransposeMergePass(SequentialPass): + + def __init__(self): + + pass1 = AutoTransposeMergeInputsPass() + # SCHEREMO: Not sure if PULP supports DMA'ing outputs transposed + #pass2 = AutoTransposeMergeOutputsPass() + + super().__init__(pass1) diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/BindingsOptimization.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/BindingsOptimization.py new file mode 100644 index 0000000..df97a96 --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/BindingsOptimization.py @@ -0,0 +1,48 @@ +# ---------------------------------------------------------------------- +# +# File: BindingsOptimization.py +# +# Last edited: 21.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ 
+from typing import Dict, Tuple + +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import NetworkContext, ONNXLayer + + +class BindingOptimizationPass(): + + def apply(self, ctxt: NetworkContext, graph: gs.Graph, + layerBinding: Dict[str, ONNXLayer]) -> Tuple[NetworkContext, Dict[str, ONNXLayer]]: + return ctxt, layerBinding + + +class BindingOptimizer(): + + def optimize(self, ctxt: NetworkContext, graph: gs.Graph, + layerBinding: Dict[str, ONNXLayer]) -> Tuple[NetworkContext, Dict[str, ONNXLayer]]: + newLayerBinding = layerBinding.copy() + for _pass in self.passes: + ctxt, newLayerBinding = _pass.apply(ctxt, graph, newLayerBinding) + assert newLayerBinding.keys() == layerBinding.keys(), "BindingOptimizationPass removed bindings!" + return ctxt, newLayerBinding diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/PassClasses.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/PassClasses.py new file mode 100644 index 0000000..475809b --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/PassClasses.py @@ -0,0 +1,84 @@ +# ---------------------------------------------------------------------- +# +# File: PassClasses.py +# +# Last edited: 21.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import Pass, ReplaceSequentialPatternPass, SequentialPass +from Deeploy.DeeployTypes import NetworkContext + + +class BindingAwarePassMixIn(): + # DO NOT OVERWRITE this function in custom pass subclasses unless you have + # a very good reason! + def apply(self, ctxt, graph, layerBinding): + ctxt, layerBinding = self.retarget(ctxt, graph, layerBinding) + ctxt, layerBinding = self.run_pass(ctxt, graph, layerBinding) + return ctxt, layerBinding + + def __call__(self, ctxt: NetworkContext, graph: gs.Graph, layerBinding): + return self.apply(ctxt, graph, layerBinding) + + # overwrite this if your pass is specific to a graph instance (e.g., most + # "dynamic" SequentialPass derivatives will be, as the list of passes to + # execute probably depends on the graph. See e.g. + # ReplaceSequentialPatternPass for an example) + def retarget(self, ctxt: NetworkContext, graph: gs.Graph, layerBinding): + return ctxt, layerBinding + + +class BindingAwareSequentialPassMixIn(BindingAwarePassMixIn): + + def run_pass(self, ctxt: NetworkContext, graph: gs.Graph, layerBinding): + for p in self.named_subpasses().values(): + ctxt, layerBinding = p.apply(ctxt, graph, layerBinding) + return ctxt, layerBinding + + +class BindingAwareReplaceSequentialPatternPassMixIn(BindingAwareSequentialPassMixIn): + + def retarget(self, ctxt: NetworkContext, graph: gs.Graph, layerBinding): + # to retarget to a new graph, clear all registered subpasses. 
+ for k in self.named_subpasses().keys(): + self.remove_subpass(k) + self.matches = self.matcher.match(graph, self.pattern) + for i, m in enumerate(self.matches): + ctxt, layerBinding = self.replacement_fn(ctxt, layerBinding, m, f"{self.name}_{i}", **self.kwargs) + return ctxt, layerBinding + + +def bindingaware(cls): + mixinClass = None + # These need to be sorted from most specific parent class to least specific parent class! + # if issubclass(cls, ReplaceMatchWithModulePass): + # mixinClass = BindingAwareReplaceMatchWithModulePassMixIn + if issubclass(cls, ReplaceSequentialPatternPass): + mixinClass = BindingAwareReplaceSequentialPatternPassMixIn + elif issubclass(cls, SequentialPass): + mixinClass = BindingAwareSequentialPassMixIn + elif issubclass(cls, Pass): + mixinClass = BindingAwarePassMixIn + else: + raise Exception(f"Tried to decorate class {cls} as bindingaware, but failed!") + return type(cls.__name__, (cls, mixinClass), {}) diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/__init__.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/bindingUtils.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/bindingUtils.py new file mode 100644 index 0000000..01958cc --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/bindingUtils.py @@ -0,0 +1,73 @@ +# ---------------------------------------------------------------------- +# +# File: bindingUtils.py +# +# Last edited: 21.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +from typing import Any, Dict, List, Tuple, Union + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import MemoryManagementGeneration, \ + MemoryPassthroughGeneration +from Deeploy.DeeployTypes import CodeTransformation, NetworkContext, NodeTemplate, ONNXLayer +from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint + +_bypassNodeTemplate = NodeTemplate(""" +// BYPASSED (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE ${data_out} = ${data_in}; +""") + + +def bypassNode(ctxt: NetworkContext, layerBinding: Dict[str, ONNXLayer], + node: gs.Node) -> Tuple[NetworkContext, Dict[str, ONNXLayer]]: + + assert len(node.inputs) == 1 and len(node.outputs) == 1, "Can only bypass nodes with single input and output!" + + # bypassedOutput = ctxt.lookup(node.outputs[0].name) + # bypassedOutput._deploy = False + + for binding in layerBinding[node.name].mapper.bindings: + binding.template = copy.deepcopy(_bypassNodeTemplate) + binding.template.tileConstraint = UntiledTileConstraint() + + passes = [] + for transformationPass in binding.codeTransformer.passes: + if isinstance(transformationPass, MemoryManagementGeneration): + passes.append(MemoryPassthroughGeneration(transformationPass.regex)) + + binding.codeTransformer = CodeTransformation(passes) + + return ctxt, layerBinding + + +def editAttribute(layerBinding: Dict[str, ONNXLayer], node: gs.Node, attrName: str, attrValue: Union[List[Any], Any]): + nodeName = node.name + operatorRepresentation = layerBinding[nodeName].mapper.parser.operatorRepresentation + operatorRepresentation[attrName] = attrValue + + if isinstance(attrValue, list): + node.attrs[attrName] = np.array(attrValue) + else: + node.attrs[attrName] = np.array([attrValue]) diff --git a/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py b/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py new file mode 100644 index 0000000..54ec01f 
--- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py @@ -0,0 +1,257 @@ +# ---------------------------------------------------------------------- +# +# File: Matchers.py +# +# Last edited: 28.04.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Dict, Literal, NamedTuple, Optional + +import onnx_graphsurgeon as gs + + +class Match(NamedTuple): + anchor: gs.Node + nodes_map: Dict[str, gs.Node] + + +class SubgraphMatcher: + + def __init__(self, regex_op: bool = False): + # operation matching policy + self.regex_op = regex_op + + def is_op_match(self, patternNode: gs.Node, graphNode: gs.Node): + if self.regex_op: + return re.fullmatch(patternNode.op, graphNode.op) is not None + else: + return patternNode.op == graphNode.op + + # Override this + def _valid_pattern(self, pattern: gs.Graph) -> None: + _ = pattern + + # Override this + def _nodes_map_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Dict[str, gs.Node]]: + _, _ = anchor, pattern + + def _match_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Match]: + nodes_map = self._nodes_map_from_anchor(anchor, pattern) + + if nodes_map is not None and len(nodes_map.keys()) == len(pattern.nodes): + return Match(anchor, 
nodes_map) + else: + return None + + def match(self, graph: gs.Graph, pattern: gs.Graph): + self._valid_pattern(pattern) + + # Return a list of non-overlapping matches of pattern + # self.pattern in the graph. + matches = [] + # Nodes are not hashable so we are using their names + matched_node_names = set() + + def node_names(match: Match): + return [node.name for node in match.nodes_map.values()] + + def is_overlap(match: Match): + return not matched_node_names.isdisjoint(node_names(match)) + + for node in graph.nodes: + match = self._match_from_anchor(node, pattern) + if match is not None and not is_overlap(match): + matches.append(match) + matched_node_names.update(node_names(match)) + return matches + + +class NonBranchingMatcher(SubgraphMatcher): + # simplified matcher which matches call_module ops more reasonably + def __init__(self, regex_op: bool = False): + # This checking is sufficient - iff the graph is acyclic and connected (checked by parser) + # and every node has one output, the graph is sequential + super().__init__(regex_op) + + def _valid_pattern(self, pattern: gs.Graph): + assert len(pattern.outputs) == 1, "Found more than one output" + for node in pattern.nodes: + assert len(node.outputs) == 1, "Graph needs to be purely sequential!" + + def _match_nodes_recursive(self, pn: gs.Node, gn: gs.Node, pattern_length: int, + nodes_map: dict) -> Optional[Dict[str, gs.Node]]: + # as we do sequential traversal, the first step (checking if nodes + # already traversed) of the original _match_nodes function can be + # discarded + + # the following submethod is a modified version of the one from the + # original SubgraphMatcher + def attributes_are_equal(pn: gs.Node, gn: gs.Node) -> bool: + return self.is_op_match(pn, gn) + + # from here on, proceed as in the original implementation. 
+ if not attributes_are_equal(pn, gn): + return None + + # Graph has a branch + if len(gn.outputs) > 1: + return None + + nodes_map[pn.name] = gn + + # End of pattern + if pattern_length == 1: + return nodes_map + + # if we are in the "active" pattern, the graph node has to be + # single-output and single-use + # if (pn.op not in ("output", "placeholder") and + # (len(gn.all_input_nodes) != 1) or (len(gn.users) > 1 and not + # first_active_node)): + if len(gn.outputs[0].outputs) > 1: + # if the gn has >1 users, the pattern is "leaking" and we don't + # want to match it + return None + + # otherwise we are on a "matching track", so move one node down in + # pattern and graph. We know that gn has only 1 input! + if len(pn.outputs[0].outputs) < 1 or len(gn.outputs[0].outputs) < 1: + return None + + return self._match_nodes_recursive(pn.o(), gn.o(), pattern_length - 1, nodes_map) + + def _nodes_map_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Dict[str, gs.Node]]: + pattern_anchor = next(iter(pattern.nodes)) + return self._match_nodes_recursive(pattern_anchor, anchor, len(pattern.nodes), {}) + + +class BranchingMatcher(SubgraphMatcher): + # simplified matcher which matches call_module ops more reasonably + def __init__(self, regex_op: bool = False): + super().__init__(regex_op) + + def _valid_pattern(self, pattern: gs.Graph): + assert len(pattern.outputs) == 1, "Found more than one output" + + def _match_nodes_recursive(self, pn: gs.Node, gn: gs.Node, nodes_map: dict, + direction: Literal["Forward", "Reverse"]) -> Optional[Dict]: + assert direction in ["Forward", "Reverse"], f"'{direction}' is not a valid matching direction!" 
+ + # Check if nodes are identical + def attributes_are_equal(pn: gs.Node, gn: gs.Node): + ret = True + pn_inputs = [pn_input for pn_input in pn.inputs if len(pn_input.inputs) > 0] + gn_inputs = [gn_input for gn_input in gn.inputs if len(gn_input.inputs) > 0] + if len(pn.inputs) != 1 or len(pn.inputs[0].inputs) != 0: + ret &= (len(pn_inputs) == len(gn_inputs)) + + pn_outputs = [pn_output for pn_output in pn.outputs if len(pn_output.outputs) > 0] + gn_outputs = [gn_output for gn_output in gn.outputs if len(gn_output.outputs) > 0] + if len(pn.outputs) != 1 or len(pn.outputs[0].outputs) != 0: + ret &= (len(pn_outputs) == len(gn_outputs)) + + ret &= self.is_op_match(pn, gn) + return ret, (pn_inputs, gn_inputs, pn_outputs, gn_outputs) + + ret, (pn_inputs, gn_inputs, pn_outputs, gn_outputs) = attributes_are_equal(pn, gn) + if not ret: + return None + + # Add candidate match to list + nodes_map[pn.name] = gn + + # Check if we are done + if direction == 'Reverse': + # If our pattern is fully matched until here + if len(pn.inputs) == 0: + return nodes_map + # If our pattern is not fully matched completely but we reached leaf node + if len(pn.inputs) > 0 and len(gn.inputs) == 0: + return None + + if direction == 'Forward': + # If our pattern is fully matched until here + if len(pn.outputs) == 0: + return nodes_map + # If our pattern is not fully matched completely but we reached leaf node + if len(pn.outputs) > 0 and len(gn.outputs) == 0: + return None + + # If pn and gn have multiple parent nodes, we have want to traverse upwards + if len(pn.inputs) > 0 and len(gn.inputs) > 0: + for pn_input in pn.inputs: + # Check if parent node of pn is constant or input node (in this case it has no additional inputs) + # and if node was already matched + if len(pn_input.inputs) > 0 and pn_input.inputs[0].name not in nodes_map.keys(): + tmp = None + for gn_input in gn.inputs: + # Check if parent node of gn is constant or input node (in this case it has no additional inputs) + # and if 
node was already matched + if len(gn_input.inputs) > 0 and gn_input.inputs[0] not in nodes_map.values(): + # Search for valid subgraphs + tmp = self._match_nodes_recursive(pn_input.inputs[0], + gn_input.inputs[0], + nodes_map, + direction = 'Reverse') + if tmp is not None: + nodes_map = tmp + + # If it was not possible to map parent node of pn to a parent node of gn + if tmp == None: + return None + + # If it was possible to map all parent nodes of pn to a parent node of gn + if direction == 'Reverse': + return nodes_map + + # If pn and gn have multiple child nodes, we have want to traverse downwards + if len(pn.outputs) > 0 and len(gn.outputs) > 0: + for pn_input in pn.outputs: + # Check if parent node of pn is is output node (in this case it has no additional outputs) + # and if node was already matched + if len(pn_input.outputs) > 0 and pn_input.outputs[0].name not in nodes_map.keys(): + tmp = None + for gn_input in gn.outputs: + # Check if parent node of gn is is output node (in this case it has no additional outputs) + # and if node was already matched + if len(gn_input.outputs) > 0 and gn_input.outputs[0] not in nodes_map.values(): + # Search for valid subgraphs + tmp = self._match_nodes_recursive(pn_input.outputs[0], + gn_input.outputs[0], + nodes_map, + direction = 'Forward') + if tmp is not None: + nodes_map = tmp + + # If it was not possible to map parent node of pn to a parent node of gn + if tmp == None: + return None + + # If it was possible to map all child nodea of pn to a child node of gn + if direction == 'Forward': + return nodes_map + + assert False, "This statement should never be reached!" 
+ + def _nodes_map_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Dict[str, gs.Node]]: + pattern_anchor = next(iter(pattern.nodes)) + return self._match_nodes_recursive(pattern_anchor, anchor, {}, 'Forward') diff --git a/Deeploy/CommonExtensions/OptimizationPasses/PassClasses.py b/Deeploy/CommonExtensions/OptimizationPasses/PassClasses.py new file mode 100644 index 0000000..e027e1d --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/PassClasses.py @@ -0,0 +1,336 @@ +# ---------------------------------------------------------------------- +# +# File: PassClasses.py +# +# Last edited: 28.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: +# Moritz Scherer, ETH Zurich +# Georg Rutishauser, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Optional + +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import NetworkContext + +from .Matchers import Match, NonBranchingMatcher, SubgraphMatcher + + +class _MemoReach(): + + def __init__(self, graph, inputTensors, outputTensors): + self.memo = {} + self.graph = graph + self.inputTensors = inputTensors + self.outputTensors = outputTensors + + def reachingSet(self): + reachingSet = [] + for inTensor in self.inputTensors: + for user in inTensor.outputs: + reachingSet += self._reachingSet(user) + + nodeNames = {node.name for node in reachingSet} + retList = [node for node in self.graph.nodes if node.name in nodeNames] + return retList + + def _reachingSet(self, node: gs.Node) -> List[gs.Node]: + + if node.name in self.memo.keys(): + return self.memo[node.name] + + reachingSet = [] + + if any([output in self.outputTensors for output in node.outputs]): + self.memo[node.name] = [node] + return [node] + + oSet = [] + for outp in node.outputs: + for out in outp.outputs: + if outp not in self.inputTensors: + oSet.append(out) + + for potentialNode in oSet: + if potentialNode.name in self.memo.keys(): + nodeRet = self.memo[potentialNode.name] + else: + nodeRet = self._reachingSet(potentialNode) + + reachingSet += nodeRet + + if reachingSet != []: + reachingSet.append(node) + + self.memo[node.name] = reachingSet + + return reachingSet + + +@gs.Graph.register() +def deleteNode(self, node: gs.Node): + # LMACAN: Assume only one input and only one output tensor + + inputTensor = node.inputs[0] + outputTensor = node.outputs[0] + + isGlobalOutputTensor = len(outputTensor.outputs) == 0 + + if isGlobalOutputTensor: + inputTensor.name = outputTensor.name # Preserve the output tensor name + outputTensor.name = outputTensor.name + "_throwaway" # Avoid same named tensors in graph; gets immediately removed with cleanup + self.outputs[self.outputs.index(outputTensor)] = inputTensor + else: + for outputNode in list(outputTensor.outputs): + # Swap 
the outputTensor with inputTensor in the downstream nodes + outputNode.inputs[outputNode.inputs.index(outputTensor)] = inputTensor + node.inputs.clear() + node.outputs.clear() + + self.cleanup() + + +def _reachableNodes(graph: gs.Graph, inputTensors: List[gs.Tensor], outputTensors: List[gs.Tensor]) -> List[gs.Node]: + + _inputTensors = [tensor for tensor in inputTensors.copy() if tensor.name in graph.tensors().keys()] + _outputTensors = [tensor for tensor in outputTensors.copy() if tensor.name in graph.tensors().keys()] + + retList = _MemoReach(graph, _inputTensors, _outputTensors).reachingSet() + + return retList + + +@gs.Graph.register() +def replaceInsertNode(self, inputs, outputs, newNode): + reachableSet = _reachableNodes(self, inputs, outputs) + + ret = self.layer(op = newNode.op, name = newNode.name, attrs = newNode.attrs, inputs = inputs, outputs = outputs) + + for node in reachableSet: + node.outputs = [] + + self.toposort().cleanup() + + +class Pass(): + + def __init__(self): + self.parent = None + self._subpasses = {} + + def __setattr__(self, attribute, value): + if isinstance(value, Pass) and attribute != 'parent': + self.register_subpass(attribute, value) + super(Pass, self).__setattr__(attribute, value) + + def register_subpass(self, name, value): + if name in self._subpasses.keys(): + del self._subpasses[name] + + value.parent = self + self._subpasses[name] = value + + def remove_subpass(self, name): + try: + del self._subpasses[name] + except KeyError: + print(f"No subpass with name {name}, cannot remove!") + except AttributeError: + raise AttributeError("Cannot remove sub-pass before calling Pass.__init__!") + + def __getattr__(self, attribute): + if self._subpasses is not None and attribute in self._subpasses.keys(): + return self._subpasses[attribute] + + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attribute}'") + + def named_subpasses(self): + return self._subpasses.copy() + + +class ContextAwarePassMixIn(): + # DO
NOT OVERWRITE this function in custom pass subclasses unless you have + # a very good reason! + def apply(self, ctxt, graph): + ctxt, graph = self.retarget(ctxt, graph) + ctxt, graph = self.run_pass(ctxt, graph) + return ctxt, graph + + def __call__(self, ctxt: NetworkContext, graph: gs.Graph): + return self.apply(ctxt, graph) + + # overwrite this if your pass is specific to a graph instance (e.g., most + # "dynamic" SequentialPass derivatives will be, as the list of passes to + # execute probably depends on the graph. See e.g. + # ReplaceSequentialPatternPass for an example) + def retarget(self, ctxt: NetworkContext, graph: gs.Graph): + return ctxt, graph + + +class ContextAgnosticPassMixIn(): + # DO NOT OVERWRITE this function in custom pass subclasses unless you have + # a very good reason! + def apply(self, graph: gs.Graph) -> gs.Graph: + graph = self.retarget(graph) + graph = self.run_pass(graph) + return graph + + def __call__(self, graph: gs.Graph): + return self.apply(graph) + + # overwrite this if your pass is specific to a graph instance (e.g., most + # "dynamic" SequentialPass derivatives will be, as the list of passes to + # execute probably depends on the graph. See e.g. 
+ # ReplaceSequentialPatternPass for an example) + def retarget(self, graph: gs.Graph) -> gs.Graph: + return graph + + +class ContextAwareSequentialPassMixIn(ContextAwarePassMixIn): + + def run_pass(self, ctxt: NetworkContext, graph: gs.Graph): + for p in self.named_subpasses().values(): + ctxt, graph = p.apply(ctxt, graph) + return ctxt, graph + + +class ContextAgnosticSequentialPassMixIn(ContextAgnosticPassMixIn): + + def run_pass(self, graph: gs.Graph): + for p in self.named_subpasses().values(): + graph = p.apply(graph) + return graph + + +class SequentialPass(Pass): + + def __init__(self, *passes, name_prefix = ''): + super(SequentialPass, self).__init__() + self.name_prefix = name_prefix + self.setup_passes(passes) + + def setup_passes(self, passes): + for i, p in enumerate(passes): + self.register_subpass(self.name_prefix + '_' + str(i), p) + + +class ContextAwareReplaceMatchWithModulePassMixIn(ContextAwarePassMixIn): + + def run_pass(self, ctxt: NetworkContext, graph: gs.Graph): + if self.replacementNode is not None: + graph.replaceInsertNode(self.replacementNode) + return ctxt, graph + + +class ContextAgnosticReplaceMatchWithModulePassMixIn(ContextAgnosticPassMixIn): + + def run_pass(self, graph: gs.Graph) -> gs.Graph: + if self.replacementNode is not None: + graph.replaceInsertNode(self.replacementNode) + return graph + + +class ReplaceMatchWithModulePass(Pass): + #Matches are specific to graph instances, so don't use this type of pass on its + #own if you want to reuse it! + def __init__(self, match: Match, module: gs.Node): + # this class needs a name field because the inserted submodules will be named + super(ReplaceMatchWithModulePass, self).__init__() + self.match = match + self.replacementNode = module + + +class ContextAwareReplaceSequentialPatternPassMixIn(ContextAwareSequentialPassMixIn): + + def retarget(self, ctxt: NetworkContext, graph: gs.Graph): + # to retarget to a new graph, clear all registered subpasses. 
+ for k in self.named_subpasses().keys(): + self.remove_subpass(k) + self.matches = self.matcher.match(graph, self.pattern) + for i, m in enumerate(self.matches): + ctxt, graph = self.replacement_fn(ctxt, graph, m, f"{self.name}_{i}", **self.kwargs) + graph.cleanup().toposort() + return ctxt, graph + + +class ContextAgnosticReplaceSequentialPatternPassMixIn(ContextAgnosticSequentialPassMixIn): + + def retarget(self, graph: gs.Graph): + # to retarget to a new graph, clear all registered subpasses. + for k in self.named_subpasses().keys(): + self.remove_subpass(k) + self.matches = self.matcher.match(graph, self.pattern) + for i, m in enumerate(self.matches): + graph = self.replacement_fn(graph, m, f"{self.name}_{i}", **self.kwargs) + graph.cleanup().toposort() + return graph + + +class ReplaceSequentialPatternPass(SequentialPass): + # finds all instances of pattern in the graph, calls the replacement_fn on + # the matches and replaces the matched nodes with the module returned by + # replacement_fn. + def __init__(self, + pattern: gs.Graph, + replacement_fn: callable, + name: str, + matcher: Optional[SubgraphMatcher] = None, + **kwargs): + super().__init__(name_prefix = name) + self.pattern = pattern + self.matcher = matcher + if matcher is None: + self.matcher = NonBranchingMatcher() + self.replacement_fn = replacement_fn + self.name = name + self.kwargs = kwargs + + +def contextagnostic(cls): + mixinClass = None + # These need to be sorted from most specific parent class to least specific parent class! 
+ if issubclass(cls, ReplaceMatchWithModulePass): + mixinClass = ContextAgnosticReplaceMatchWithModulePassMixIn + elif issubclass(cls, ReplaceSequentialPatternPass): + mixinClass = ContextAgnosticReplaceSequentialPatternPassMixIn + elif issubclass(cls, SequentialPass): + mixinClass = ContextAgnosticSequentialPassMixIn + elif issubclass(cls, Pass): + mixinClass = ContextAgnosticPassMixIn + else: + raise Exception(f"Tried to decorate class {cls} as contextagnostic, but failed!") + return type(cls.__name__, (cls, mixinClass), {}) + + +def contextaware(cls): + mixinClass = None + # These need to be sorted from most specific parent class to least specific parent class! + if issubclass(cls, ReplaceMatchWithModulePass): + mixinClass = ContextAwareReplaceMatchWithModulePassMixIn + elif issubclass(cls, ReplaceSequentialPatternPass): + mixinClass = ContextAwareReplaceSequentialPatternPassMixIn + elif issubclass(cls, SequentialPass): + mixinClass = ContextAwareSequentialPassMixIn + elif issubclass(cls, Pass): + mixinClass = ContextAwarePassMixIn + else: + raise Exception(f"Tried to decorate class {cls} as contextaware, but failed!") + return type(cls.__name__, (cls, mixinClass), {}) diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/DebugPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/DebugPasses.py new file mode 100644 index 0000000..7525bbb --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/DebugPasses.py @@ -0,0 +1,150 @@ +# ---------------------------------------------------------------------- +# +# File: DebugPasses.py +# +# Last edited: 28.04.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from functools import partial +from typing import Literal + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match, NonBranchingMatcher +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic + + +@contextagnostic +class EmulateCMSISRequantPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['output0'], op = 'RequantShift', name = 'rqs1') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_EMULATE_CMSIS_REQUANT_PASS" + super().__init__(graph, _convert_requant_to_cmsis_fun, name) + + +def _convert_requant_to_cmsis_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + rqs = matched_nodes[0] + + # Make sure pass is only applied once + if 'Emulate_CMSIS_RequantShift' in rqs.attrs: + return graph + + # WIESEP: Because CMSIS performs add-multiply-divide and we normally do multiply-add-divide + # we can emulate the same behavior by rounding the MUL value + rqs.inputs[-1].values = np.round(copy.deepcopy(rqs.inputs[-1].values) / + 
(rqs.inputs[-2].values + 1e-3)) * rqs.inputs[-2].values + rqs.attrs['emulate_CMSIS_requantShift'] = True + + return graph + + +def _print_fun(graph: gs.Graph, match: Match, name: str, position: Literal["before", "after"] = "before"): + assert position in ["before", "after"], f"'{position}' is not a valid position for the print node!" + + matched_nodes = [m for k, m in match.nodes_map.items()] + + node = matched_nodes[0] + name += '_' + node.name + + if position == 'before' and "PRINT" not in node.inputs[0].name: + newNodeInput = gs.Variable(name + '_input', dtype = np.float32, shape = node.inputs[0].shape) + newPrintNode = gs.Node(op = 'DebugPrint', + name = 'DebugPrint_' + node.name + '_input', + inputs = [node.inputs[0]], + outputs = [newNodeInput]) + + node.inputs[0] = newNodeInput + + graph.nodes.append(newPrintNode) + graph.cleanup().toposort() + + if position == 'after' and "PRINT" not in node.outputs[0].name: + newNodeOutput = gs.Variable(name + '_output', dtype = np.float32, shape = node.outputs[0].shape) + newPrintNode = gs.Node(op = 'DebugPrint', + name = 'DebugPrint_' + node.name + '_output', + inputs = [newNodeOutput], + outputs = [node.outputs[0]]) + + node.outputs[0] = newNodeOutput + + graph.nodes.append(newPrintNode) + graph.cleanup().toposort() + + return graph + + +@contextagnostic +class DebugPrintPass(ReplaceSequentialPatternPass): + + def __init__(self, op_regex: str, position = 'before'): + + if op_regex == "": + raise ValueError('Operator not set!') + if position not in ['before', 'after']: + ValueError(f'Invalid position "{position}"!') + + pattern = gs.Graph() + _input = gs.Variable(name = 'input_0') + output = pattern.layer(inputs = [_input], outputs = ['output0'], op = op_regex) + pattern.outputs.append(output) + pattern.inputs = [_input] + + name = "_DEBUG_PRINT_PASS" + super().__init__(pattern, partial(_print_fun, position = position), name, NonBranchingMatcher(regex_op = True)) + + +def _merge_print_fun(graph: gs.Graph, match: Match, 
name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + d1 = matched_nodes[0] + d2 = matched_nodes[1] + + _inputs = list(d1.inputs) + _outputs = list(d2.outputs) + + newPrintNode = gs.Node(op = 'DebugPrint', name = name) + graph.replaceInsertNode(_inputs, _outputs, newPrintNode) + + graph.cleanup().toposort() + return graph + + +@contextagnostic +class DebugPrintMergePass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['d1_out'], op = 'DebugPrint', name = 'd1') + output = graph.layer(inputs = output, outputs = ['d2_out'], op = 'DebugPrint', name = 'd2') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_MERGE_DEBUG_PRINT_PASS" + super().__init__(graph, _merge_print_fun, name) diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py new file mode 100644 index 0000000..80abf1e --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py @@ -0,0 +1,637 @@ +# ---------------------------------------------------------------------- +# +# File: LoweringOptimizationPasses.py +# +# Last edited: 07.03.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from typing import Iterable, List, Optional, Sequence, Tuple, Union + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, SequentialPass, \ + contextagnostic + + +def _createReshape(tensorIn: gs.Tensor, + name: str, + newShape: Sequence[Union[int, str]], + tensorOut: Optional[gs.Tensor] = None) -> Tuple[gs.Node, gs.Tensor]: + newShapeConst = gs.Constant(name = name + tensorIn.name + "_NewShape", values = np.array(newShape)) + + if tensorOut is None: + tensorOut = gs.Variable(name = name + tensorIn.name + "_Reshaped", dtype = np.float32, shape = newShape) + else: + assert newShape == tensorOut.shape + + reshapeNode = gs.Node(name = name + tensorIn.name + "_Reshape", + op = "Reshape", + inputs = [tensorIn, newShapeConst], + outputs = [tensorOut]) + + return reshapeNode, tensorOut + + +def _appendExpandDims(tensor: gs.Tensor, name: str, axis: Union[int, Sequence[int]]) -> Tuple[gs.Node, gs.Tensor]: + if isinstance(axis, int): + axes = [axis] + elif isinstance(axis, tuple): + axes = list(axis) + elif isinstance(axis, list): + axes = axis + else: + assert False, f"axis should be of type int or tuple. 
Got {type(axis)}" + + axes = [(len(tensor.shape) + len(axes) + axis if axis < 0 else axis) for axis in axes] + assert all(axis >= 0 for axis in axes) + + assert isinstance(tensor.shape, Sequence) and len(tensor.shape) > 0 and isinstance(tensor.shape[0], int) + assert all(axis < len(tensor.shape) + len(axes) for axis in axes), f"axis out of bounds. axis: {axes}" + + newShape = np.zeros(shape = (len(tensor.shape) + len(axes),), dtype = np.int_) + for axis in axes: + newShape[axis] = 1 + newShape[newShape == 0] = tensor.shape + + return _createReshape(tensor, name, newShape.tolist()) + + +def _prependSqueezeDims(tensor: gs.Tensor, name: str, axis: Union[int, Sequence[int]]) -> Tuple[gs.Node, gs.Tensor]: + if isinstance(axis, int): + axes = [axis] + elif isinstance(axis, tuple): + axes = list(axis) + elif isinstance(axis, list): + axes = axis + else: + assert False, f"axis should be of type int or tuple. Got {type(axis)}" + + axes = [(len(tensor.shape) + axis if axis < 0 else axis) for axis in axes] + assert all(axis >= 0 for axis in axes) + + assert isinstance(tensor.shape, Sequence) and len(tensor.shape) > 0 and isinstance(tensor.shape[0], int) + assert all(axis < len(tensor.shape) + len(axes) for axis in axes), f"axis out of bounds. 
axis: {axes}" + + oldShape = np.zeros(shape = (len(tensor.shape) + len(axes),), dtype = np.int_) + for axis in axes: + oldShape[axis] = 1 + oldShape[oldShape == 0] = tensor.shape + + inputTensor = gs.Variable(name = name + tensor.name + "_Expanded", dtype = np.float32, shape = oldShape.tolist()) + + reshapeNode, _ = _createReshape(inputTensor, name, tensor.shape, tensor) + + return reshapeNode, inputTensor + + +# Permute (0,1,2,3,...,N-2,N-1) -> (0,1,2,3,...,N-1,N-2) +def _permuteLastTwoDims(length: int) -> List[int]: + outList = list(range(length)) + tmp = outList[-1] + outList[-1] = outList[-2] + outList[-2] = tmp + return outList + + +# Permute (0,1,2,3,...,N-1) -> (0,2,3,...,N-1,1) +def _permuteNCHWtoNHWC(length: int) -> List[int]: + outList = list(range(length)) + outList.remove(1) + outList.append(1) + return outList + + +# Permute (0,1,2,3,...,N-1) -> (0,N-1,1,2,3,...,N-2) +def _permuteNHWCtoNCHW(length: int) -> List[int]: + outList = list(range(length)) + outList.remove(length - 1) + outList.insert(1, length - 1) + return outList + + +# Calculate permutation q = p^(-1) s.t. q(p(i)) = i +def _invertPermutation(permutation: List[int]) -> List[int]: + tuples = [] + for idx, i in enumerate(permutation): + tuples.append((i, idx)) + sortedTuples = sorted(tuples, key = lambda x: x[0]) + outPermutation = [] + for i in sortedTuples: + outPermutation.append(i[1]) + return outPermutation + + +def _permuteList(inputList: List, permutation: List[int]): + assert len(inputList) == len(permutation), "Permuted list and permutation must have equal length!" 
+ outList = [] + for i in permutation: + outList.append(inputList[i]) + return outList + + +def _prependTransposeNode(anchor: gs.Variable, + nodeName: str, + permutation: Iterable[int], + invert: bool = False) -> (gs.Node, gs.Variable): + + if invert: + outShape = _permuteList(anchor.shape, _invertPermutation(permutation)) + else: + outShape = _permuteList(anchor.shape, permutation) + + anchorTransposeInput = gs.Variable(nodeName + "_Out", dtype = np.float32, shape = outShape) + anchorTransposeNode = gs.Node(name = nodeName, + op = "Transpose", + inputs = [anchorTransposeInput], + outputs = [anchor], + attrs = {'perm': permutation}) + + return anchorTransposeNode, anchorTransposeInput + + +def _appendTransposeNode(anchor: gs.Variable, + nodeName: str, + permutation: Iterable[int], + invert: bool = False) -> (gs.Node, gs.Variable): + + if invert: + outShape = _permuteList(anchor.shape, _invertPermutation(permutation)) + else: + outShape = _permuteList(anchor.shape, permutation) + + anchorTransposeOutput = gs.Variable(nodeName + "_In", dtype = np.float32, shape = outShape) + anchorTransposeNode = gs.Node(name = nodeName, + op = "Transpose", + inputs = [anchor], + outputs = [anchorTransposeOutput], + attrs = {'perm': permutation}) + + return anchorTransposeNode, anchorTransposeOutput + + +def _transposeMatMulInputs_fun(graph: gs.Graph, match: Match, name: str): + + matched_nodes = [m for k, m in match.nodes_map.items()] + gemmNode = matched_nodes[0] + + inputA = gemmNode.inputs[0] + inputB = gemmNode.inputs[1] + + if 'transA' not in gemmNode.attrs: + gemmNode.attrs['transA'] = 0 + if 'transB' not in gemmNode.attrs: + gemmNode.attrs['transB'] = 0 + if 'alpha' not in gemmNode.attrs: + gemmNode.attrs['alpha'] = 1.0 + if 'beta' not in gemmNode.attrs: + gemmNode.attrs['beta'] = 1.0 + + # Prepend transpose on A if it's transposed + if gemmNode.attrs['transA'] != 0: + anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode(inputA, name + "_A", + 
_permuteLastTwoDims(len(inputA.shape))) + gemmNode.inputs[0] = anchorTransposeOutput + gemmNode.attrs['transA'] = 0 + graph.nodes.append(anchorTransposeNode) + + # Prepend transpose on B if it's not transposed + if gemmNode.attrs['transB'] != 1: + anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode(inputB, name + "_B", + _permuteLastTwoDims(len(inputB.shape))) + gemmNode.inputs[1] = anchorTransposeOutput + gemmNode.attrs['transB'] = 1 + graph.nodes.append(anchorTransposeNode) + + return graph + + +# SCHEREMO: +# Implements generation of tranpose nodes such that each GEMM/MatMul node implements attributes transA = 0 transB = 1 +@contextagnostic +class TransposeMatmulInputsPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['gemmOut'], op = 'RequantizedGemm', name = 'requantizedGemm') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_TRANSPOSE_MATMUL_INPUTS_PASS" + super().__init__(graph, _transposeMatMulInputs_fun, name) + + +def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): + + matched_nodes = [m for k, m in match.nodes_map.items()] + opNode = matched_nodes[0] + node_op = opNode.op + + # Default for non-existent channels_first: True + channels_first = opNode.attrs["channels_first"] if "channels_first" in opNode.attrs else True + + if (channels_first != default_channels_first): + + inputNode = opNode.inputs[0] + outputNode = opNode.outputs[0] + + inPermute = _permuteNCHWtoNHWC(len(inputNode.shape)) + outPermute = _permuteNHWCtoNCHW(len(outputNode.shape)) + + inputTransposeNode, inputTransposeOutput = _appendTransposeNode(inputNode, name + "_TransposeIn", inPermute) + outputTransposeNode, outputTransposeInput = _prependTransposeNode(outputNode, + name + "_TransposeOut", + outPermute, + invert = True) + + opNode.inputs[0] = inputTransposeOutput + 
opNode.outputs[0] = outputTransposeInput + graph.nodes.append(inputTransposeNode) + graph.nodes.append(outputTransposeNode) + + if node_op in ["RequantizedConv", "Conv"]: + + # Non DW-Type: + if opNode.attrs['group'] == 1: + weightNode = opNode.inputs[1] + weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight", + inPermute) + + else: + DWPermute = [inPermute[-1]] + inPermute[1:-1] + [inPermute[0]] + weightNode = opNode.inputs[1] + weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight", + DWPermute) + + opNode.inputs[1] = weightTransposeOutput + graph.nodes.append(weightTransposeNode) + + opNode.attrs["channels_first"] = default_channels_first + + return graph + + +@contextagnostic +class NCHWtoNHWCMaxPoolPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['maxPool'], op = 'MaxPool', name = 'MaxPool') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_MAXPOOL_PASS" + super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + + +@contextagnostic +class NCHWtoNHWCConvPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_CONV_PASS" + super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + + +@contextagnostic +class NCHWtoNHWCRequantizedConvPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = 
graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_CONV_PASS" + super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + + +@contextagnostic +class NCHWtoNHWCPadPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['padOut'], op = 'Pad', name = 'pad') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_PAD_PASS" + super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + + +@contextagnostic +class NCHWtoNHWCPass(SequentialPass): + + def __init__(self, default_channels_first: bool = True): + passes = [ + NCHWtoNHWCPadPass(default_channels_first), + NCHWtoNHWCMaxPoolPass(default_channels_first), + NCHWtoNHWCConvPass(default_channels_first), + NCHWtoNHWCRequantizedConvPass(default_channels_first), + ] + super().__init__(*passes) + + +def _PULPDWNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): + + matched_nodes = [m for k, m in match.nodes_map.items()] + opNode = matched_nodes[0] + node_op = opNode.op + + if opNode.attrs['group'] == 1: + return graph + + if (("channels_first" in opNode.attrs and opNode.attrs["channels_first"] != default_channels_first) + or ("channels_first" not in opNode.attrs and default_channels_first == 0)): + + inputNode = opNode.inputs[0] + outputNode = opNode.outputs[0] + + inPermute = _permuteNCHWtoNHWC(len(inputNode.shape)) + outPermute = _permuteNHWCtoNCHW(len(outputNode.shape)) + + outputTransposeNode, outputTransposeInput = _prependTransposeNode(outputNode, + name + "_TransposeOut", + outPermute, + invert = True) + + opNode.outputs[0] = outputTransposeInput + 
graph.nodes.append(outputTransposeNode) + + if node_op == "RequantizedConv": + + weightNode = opNode.inputs[1] + weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight", + inPermute) + opNode.inputs[1] = weightTransposeOutput + graph.nodes.append(weightTransposeNode) + + opNode.attrs["channels_first"] = default_channels_first + + return graph + + +@contextagnostic +class PULPDWConvPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_CONV_PASS" + super().__init__(graph, partial(_PULPDWNCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + + +def _PULPDenseNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): + + matched_nodes = [m for k, m in match.nodes_map.items()] + opNode = matched_nodes[0] + + node_group = opNode.attrs['group'] if 'group' in opNode.attrs else 1 + if node_group != 1: + return graph + + return _NCHWtoNHWC_fun(graph, match, name, default_channels_first) + + +@contextagnostic +class PULPNCHWtoNHWCDenseRequantizedConvPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_CONV_PASS" + super().__init__(graph, partial(_PULPDenseNCHWtoNHWC_fun, default_channels_first = default_channels_first), + name) + + +def _NeurekaDWNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): + + matched_nodes = [m for k, m 
in match.nodes_map.items()] + opNode = matched_nodes[0] + + node_group = opNode.attrs['group'] if 'group' in opNode.attrs else 1 + if node_group == 1: + return graph + + return _NCHWtoNHWC_fun(graph, match, name, default_channels_first) + + +@contextagnostic +class NeurekaNCHWtoNHWCDWRequantizedConvPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_CONV_PASS" + super().__init__(graph, partial(_NeurekaDWNCHWtoNHWC_fun, default_channels_first = default_channels_first), + name) + + +@contextagnostic +class PULPNCHWtoNHWCDenseConvPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_NCHW_TO_NHWC_CONV_PASS" + super().__init__(graph, partial(_PULPDenseNCHWtoNHWC_fun, default_channels_first = default_channels_first), + name) + + +@contextagnostic +class PULPNCHWtoNHWCPass(SequentialPass): + + def __init__(self, default_channels_first: bool = True): + passes = [ + NCHWtoNHWCPadPass(default_channels_first), + NCHWtoNHWCMaxPoolPass(default_channels_first), + PULPDWConvPass(default_channels_first), + PULPNCHWtoNHWCDenseConvPass(default_channels_first), + PULPNCHWtoNHWCDenseRequantizedConvPass(default_channels_first), + ] + super().__init__(*passes) + + +@contextagnostic +class NeurekaNCHWtoNHWCPass(SequentialPass): + + def __init__(self, default_channels_first: bool = True): + passes = [ + NCHWtoNHWCPadPass(default_channels_first), + NCHWtoNHWCMaxPoolPass(default_channels_first), + 
NeurekaNCHWtoNHWCDWRequantizedConvPass(default_channels_first), + PULPNCHWtoNHWCDenseConvPass(default_channels_first), + PULPNCHWtoNHWCDenseRequantizedConvPass(default_channels_first), + ] + super().__init__(*passes) + + +def _requantized_gemm_to_pw_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = list(match.nodes_map.values()) + requantizedGemm = matched_nodes[0] + + matrixA: gs.Variable = requantizedGemm.inputs[0] + matrixB: gs.Constant = requantizedGemm.inputs[1] + matrixY: gs.Variable = requantizedGemm.outputs[0] + + # Check matrixB is a constant, otherwise don't transform + if not isinstance(matrixB, gs.Constant): + return graph + + assert len(matrixA.shape) in [ + 2, 3 + ], f"Unsupported number of dimensions for input matrix A of GEMM operation: {len(matrixA.shape)}; shape: {matrixA.shape}" + assert len(matrixY.shape) in [ + 2, 3 + ], f"Unsupported number of dimensions for output matrix of GEMM operation: {len(matrixY.shape)}; shape: {matrixY.shape}" + + # Pointwise with HWC layout (channels_first == False) + + # If transA is set then the matrix is of shape [B x K x M] and it needs to be transposed, otherwise its shape is [B x M x K] + if 'transA' in requantizedGemm.attrs and requantizedGemm.attrs['transA'] == 1: + matrixATransposeNode, matrixA = _appendTransposeNode(matrixA, name, _permuteLastTwoDims(len(matrixA.shape))) + graph.nodes.append(matrixATransposeNode) + + # Align dimensions for convolution + expandAxis = [] + # Align the batch dimension + if len(matrixA.shape) == 2: + expandAxis.append(0) + # Expand the height dimension + expandAxis.append(1) + # pwIn, shape [B x 1 x M x K] + matrixAExpandDimsNode, pwIn = _appendExpandDims(matrixA, name, axis = expandAxis) + graph.nodes.append(matrixAExpandDimsNode) + + # If transB is set then the matrix is of shape [N x K] and it doesn't need to be transposed, otherwise its shape is [K x N] and it has to be transposed + if not 'transB' in requantizedGemm.attrs or requantizedGemm.attrs['transB'] 
== 0: + # matrixBTransposed, shape [N x K] + matrixBTransposeNode, matrixB = _appendTransposeNode(matrixB, name, _permuteLastTwoDims(len(matrixB.shape))) + graph.nodes.append(matrixBTransposeNode) + # pwWeight, shape [N x 1 x 1 x K] + matrixBExpandDimsNode, pwWeight = _appendExpandDims(matrixB, name, axis = (1, 2)) + graph.nodes.append(matrixBExpandDimsNode) + + if len(matrixY.shape) == 2: + # matrixY, shape [M x N] + squeezeDims = (0, 1) + else: + # matrixY, shape [B x M x N] + squeezeDims = (1,) + # pwOut, shape [B x 1 x M x N] + matrixYSqueezeDimsNode, pwOut = _prependSqueezeDims(matrixY, name, squeezeDims) + graph.nodes.append(matrixYSqueezeDimsNode) + + pwAttrs = { + 'channels_first': False, + 'dilations': [1, 1], + 'group': 1, + 'kernel_shape': [1, 1], + 'pads': [0, 0, 0, 0], + 'strides': [1, 1], + 'div': requantizedGemm.attrs['div'], + 'n_levels_out': requantizedGemm.attrs['n_levels_out'], + 'shift': requantizedGemm.attrs['shift'], + 'signed': requantizedGemm.attrs['signed'], + } + + add = requantizedGemm.inputs[2] + mul = requantizedGemm.inputs[3] + + _inputs = [pwIn, pwWeight, mul, add] + + pw = gs.Node(op = 'RequantizedConv', + name = name + "_RequantizedPwConv", + inputs = _inputs, + outputs = [pwOut], + attrs = pwAttrs) + graph.nodes.append(pw) + + requantizedGemm.inputs.clear() + requantizedGemm.outputs.clear() + graph.nodes.remove(requantizedGemm) + + return graph + + +@contextagnostic +class RequantizedGemmToPwPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['out'], op = 'RequantizedGemm', name = 'requantizedGemm') + graph.outputs.append(output) + graph.inputs.append(_input) + + super().__init__(graph, _requantized_gemm_to_pw_fun, "_REQUANTIZED_GEMM_TO_PW_PASS") + + +def _remove_global_output_reshape_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = list(match.nodes_map.values()) + reshape = matched_nodes[0] + + 
isGlobalOutput = len(reshape.outputs[0].outputs) == 0 + + if isGlobalOutput: + graph.deleteNode(reshape) + + return graph + + +@contextagnostic +class RemoveGlobalOutputReshapePass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['out'], op = 'Reshape', name = 'reshape') + graph.outputs.append(output) + graph.inputs.append(_input) + + super().__init__(graph, _remove_global_output_reshape_fun, "_REMOVE_GLOBAL_OUTPUT_RESHAPE_PASS") diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/__init__.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/__init__.py new file mode 100644 index 0000000..898895b --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 28.04.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/Deeploy/CommonExtensions/OptimizationPasses/__init__.py b/Deeploy/CommonExtensions/OptimizationPasses/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/CommonExtensions/OptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py new file mode 100644 index 0000000..bdcb6ea --- /dev/null +++ b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py @@ -0,0 +1,74 @@ +# ---------------------------------------------------------------------- +# +# File: SignPropChecker.py +# +# Last edited: 19.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
class SignPropTypeChecker(NodeTypeChecker):
    """Type checker that additionally propagates value-range (nLevels) and
    signedness annotations from input buffers to output buffers.

    Subclasses override `_inferNumLevels` and `_inferSignedness`; the base
    implementations return None, i.e. "no propagation information available".
    """

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]:
        """Return the number of quantization levels per output, or None if unknown."""
        return None

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]:
        """Return the signedness per output, or None if unknown."""
        return None

    def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext:
        """Annotate global ConstantBuffers feeding `node` with nLevels and
        signedness derived from their values and the expected input types.

        Raises an Exception when a constant cannot be promoted to the
        corresponding expected input type.
        """
        ctxt = super().typeInferGlobalCtxt(ctxt, node)

        for inputNode, _type in zip(node.inputs, self.input_types):
            if isinstance(ctxt.lookup(inputNode.name), ConstantBuffer):
                reference = ctxt.lookup(inputNode.name)
                if not _type.referencedType.checkPromotion(reference.values):
                    raise Exception(f"Can't cast {reference} to {_type}!")

                # NOTE(review): the value *range* is used as the level count
                # here (max - min, not max - min + 1) - confirm intended.
                reference.nLevels = reference.values.max() - reference.values.min()
                reference._signed = _type.referencedType.typeMin < 0

        return ctxt

    def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node,
                        operatorRepresentation: OperatorRepresentation) -> NetworkContext:
        """Propagate nLevels/signedness to outputs when all inputs carry them."""
        ctxt = super().typeInferOutput(ctxt, node, operatorRepresentation)

        inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs]
        outputs = [ctxt.lookup(outputNode.name) for outputNode in node.outputs]

        # Sign propagation is only possible if every input has been annotated.
        signProp = all(hasattr(_input, "_signed") and hasattr(_input, "nLevels") for _input in inputs)

        if signProp:
            nLevels = self._inferNumLevels(inputs, operatorRepresentation)
            signedness = self._inferSignedness(inputs, operatorRepresentation)

            # Fix: guard against the base-class hooks returning None (zip(None)
            # raised TypeError) and avoid shadowing `nLevels` with the loop
            # variable as the original code did.
            if nLevels is not None and signedness is not None:
                for obj, levels, sign in zip(outputs, nLevels, signedness):
                    obj.nLevels = levels
                    obj._signed = sign

        return ctxt
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py new file mode 100644 index 0000000..bc2a7e6 --- /dev/null +++ b/Deeploy/DeeployTypes.py @@ -0,0 +1,3201 @@ +# ---------------------------------------------------------------------- +# +# File: DeeployTypes.py +# +# Last edited: 17.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: +# - Moritz Scherer, ETH Zurich +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class CodeSnippet:
    """A dataclass to hold a NodeTemplate and its associated OperatorRepresentation; used to generate code"""
    template: NodeTemplate  # Template rendered at code-generation time
    operatorRepresentation: OperatorRepresentation  # Expression mapping consumed by the template


@dataclass
class CodeGenVerbosity:
    """
    Encapsulates verbosity options for downstream configuration
    """

    tilingProfiling: Optional[str]  #: str: Specifies the name of the memory level on which to profile tiling


# Default verbosity: no tiling profiling requested.
_NoVerbosity = CodeGenVerbosity(None)

# File-name stems for the intermediate dumps emitted around lowering/parsing/binding.
_middlewarePreLoweringFilename = 'middleware_pre_lowering'
_middlewarePostLoweringFilename = 'middleware_post_lowering'
_backendPostParsingFilename = 'backend_post_parsing'
_backendPostBindingFilename = 'backend_post_binding'

# File extensions used by the export/import helpers below.
_ctxtExtension = '.pkl'
_graphExtension = '.onnx'
_dataExtension = '.data'
# SCHEREMO: mako.Templates are not copiable, since they can use shared context.
# In Deeploy we only use them by direct call (no shared context), so we can
# override deepcopy and work around the issue
class _Template(Template):
    """
    This class wraps the Mako.Template class in a way that enables deep-copying
    """

    def __deepcopy__(self, memo):
        # Create an empty template and transplant the compiled state; source,
        # code and module objects are shared, which is safe because Deeploy
        # never renders with a shared template context.
        _copy = type(self)("", strict_undefined = self.strict_undefined)
        _copy._source = self._source
        _copy._code = self._code
        _copy.module = self.module
        _copy.callable_ = self.callable_
        memo[id(self)] = _copy
        return _copy


class NodeTemplate():
    """Wraps a `Mako.Template` with additional functionality for hoisting
    transient buffers and adding expressions to the parser's node
    representation."""

    def __init__(self, templateStr: str):
        """Initialize a NodeTemplate object

        Parameters
        ----------
        templateStr : str
            Mako template string. If tiling is supposed to be supported, this
            template string may only contain direct expressions that get added
            by either the operator's parser or the `alignToContext` method.
        """
        self.template = _Template(templateStr, strict_undefined = True)
        self.subTemplates = {}
        self.subTemplateGenerators = {}

    def internalSize(self) -> int:
        """Return the byte size of internal memory buffers used by this template.

        Returns
        -------
        int
            byte size of all transient internal buffers
        """
        return 0

    # Override this. Used to hoist optional structs, constants and so on to the
    # GLOBAL context for specialized kernels
    def alignToContext(
        self, ctxt: NetworkContext,
        operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]:
        """Extract Mako template expressions used in the backend's code
        generation step and hoist transient buffers.

        Returns
        -------
        Tuple[NetworkContext, OperatorRepresentation, List[str]]
            Updated context, updated node representation, and the names of
            hoisted transient buffers.
        """
        return ctxt, operatorRepresentation, []

    # Override this
    def computeTransientBuffersSize(
        self, ctxt: NetworkContext,
        operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
        """Compute the size of transient buffers hoisted by this template.

        Returns
        -------
        List[Tuple[str, Union[int, IntVar]]]
            (buffer name, integer or symbolic size) tuples.
        """
        return []

    # Override this
    def hoistTransientBuffers(
        self, ctxt: NetworkContext,
        operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]:
        """Register the transient buffers required by this template. If tiling
        is applied, this method is called AFTER tiling.

        Returns
        -------
        Tuple[NetworkContext, OperatorRepresentation, List[str]]
            Updated context, updated node representation, and the names of all
            hoisted TransientBuffers.
        """
        return ctxt, operatorRepresentation, []

    # Don't override this
    def _alignToContext(
        self, ctxt: NetworkContext,
        operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]:
        # Align this template first, then every registered sub-template;
        # collect all hoisted buffer names along the way.
        ctxt, operatorRepresentation, nameList = self.alignToContext(ctxt, operatorRepresentation)
        for key, (template, repGenerator) in self.subTemplates.items():
            ctxt, subNodeRep, _nameList = template.alignToContext(*(repGenerator(ctxt, operatorRepresentation.copy())))
            self.subTemplateGenerators[key] = (template, copy.copy(subNodeRep))
            nameList += _nameList
        return ctxt, operatorRepresentation, nameList

    # Don't override this
    def generate(self, operatorRepresentation: Optional[OperatorRepresentation] = None, **kwargs) -> str:
        """Generate the operator's C implementation.

        Parameters
        ----------
        operatorRepresentation : the parser's node representation

        Returns
        -------
        str
            The operator's C implementation.

        Raises
        ------
        KeyError
            Whenever an expression in the template string is not matched
            against the available expressions in `operatorRepresentation`.
        """
        # Fix: the previous signature used a mutable default (`{}`) which this
        # method mutates below (RENDER_* keys), leaking state across calls that
        # share the default dict.
        if operatorRepresentation is None:
            operatorRepresentation = {}

        callStack = ""

        try:
            for key, (template, subNodeRep) in self.subTemplateGenerators.items():
                operatorRepresentation[f'RENDER_{key}'] = template.generate(**subNodeRep, **kwargs)
            callStack += self.template.render(**operatorRepresentation, **kwargs)
        except Exception as e:
            # Fix: was a bare `except:` which also swallowed KeyboardInterrupt
            # and SystemExit; keep the diagnostic dump and chain the cause.
            print(operatorRepresentation)
            print(mako.exceptions.text_error_template().render())
            raise KeyError(f"Template {self} failed!") from e
        return callStack
class VariableBuffer():
    """This class represents memory locations containing variable tensor data
    that is not transient, i.e. intermediate results or input- and output
    buffers.
    """

    initTemplate: NodeTemplate  # Holds the buffer's initialization code
    allocTemplate: NodeTemplate  # Holds the buffer's allocation code
    deallocTemplate: NodeTemplate  # Holds the buffer's deallocation code

    # NOTE(review): the shared mutable default `[1]` is kept for interface
    # compatibility; callers must not mutate a default-constructed shape.
    def __init__(self, name: str = '', shape = [1]):
        self.name: str = name  # Canonical name registered in the NetworkContext
        self.shape: Sequence[int] = shape  # Tensor dimensions as a sequence of sizes

        # DO NOT OVERRIDE - all downstream users of this buffer
        self._users: List[gs.Node] = []
        # DO NOT OVERRIDE - type assigned by the type checking pass
        self._type: Type[Pointer]
        # DO NOT OVERRIDE - instantiated Pointer assigned by the type checking pass
        self._instance: Pointer
        # DO NOT OVERRIDE - True while a memory allocation pass considers this buffer allocated
        self._live: bool = False
        # MAY OVERRIDE - global switch to deactivate the buffer without deleting it
        self._deploy: bool = True

        # Sign/range annotations filled in by sign-propagating type checkers
        self._signed = None
        self.nLevels = None

    def _bufferRepresentation(self) -> Dict:
        # NOTE(review): renders self._instance (the typed pointer instance),
        # unlike TransientBuffer which renders self._type - confirm intended.
        return {"type": self._instance, "name": self.name, "size": int(np.prod(self.shape))}

    def init(self) -> str:
        """Return the C code declaring this memory buffer."""
        return self.initTemplate.generate(self._bufferRepresentation())

    def alloc(self) -> str:
        """Return the C code allocating this memory buffer."""
        return self.allocTemplate.generate(self._bufferRepresentation())

    def dealloc(self) -> str:
        """Return the C code deallocating/freeing this buffer at runtime."""
        return self.deallocTemplate.generate(self._bufferRepresentation())

    def __str__(self) -> str:
        if hasattr(self, "_type"):
            return f'VariableBuffer: name: {self.name}, type: {self._type}'
        return f'VariableBuffer: name: {self.name}'

    def __repr__(self) -> str:
        return self.__str__()

    def __eq__(self, other):
        return self.name == other.name and self.shape == other.shape

    def __getstate__(self):
        # Templates hold compiled mako state and are not picklable; strip them.
        d = dict(self.__dict__)
        for key in ('allocTemplate', 'deallocTemplate', 'initTemplate'):
            d.pop(key, None)
        return d

    @classmethod
    def fromNode(cls, node: gs.Node):
        """Construct a buffer from a graphsurgeon node; constants use the shape
        of their stored values."""
        return cls(name = node.name, shape = node.shape if not isinstance(node, gs.Constant) else node.values.shape)


class TransientBuffer(VariableBuffer):
    """Class to represent memory space required by kernels that is not covered
    by input and output tensors, e.g. im2col buffers in convolutions.
    """

    def __init__(self, name: str = '', size = 0):
        self.name = name
        self.size = size  # Total BYTE size of this TransientBuffer

        # Do not override - Should be written in the parsing passes
        self._users = []

        # Do not override - Should be written in the parsing passes
        self._type: Type[Pointer] = PointerClass(VoidType)

        # Do not override - Should be written in the deployment passes
        self._live = False

        # Do not override - Set in Templates depending on platform
        self._deploy = True

    def __eq__(self, other):
        return self.name == other.name and self.size == other.size

    def _bufferRepresentation(self) -> Dict:
        return {"type": self._type, "name": self.name, "size": int(self.size)}

    def __str__(self) -> str:
        return f'TransientBuffer: name: {self.name}, size: {self.size}'

    def __repr__(self) -> str:
        return f'TransientBuffer: name: {self.name}, size: {self.size}'

    @classmethod
    def fromVariableBuffer(cls, buffer: VariableBuffer):
        """Derive a transient buffer sized to hold `buffer`'s payload in bytes.

        Fix: the constructed object was previously assigned to a local and
        never returned (the method implicitly returned None).
        """
        return cls(name = buffer.name, size = np.prod(buffer.shape) * buffer._type.typeWidth // 8)


class ConstantBuffer(VariableBuffer):
    """Class to represent compile-time constant tensors (weights, biases,
    other parameters) within Deeploy.
    """

    def __init__(self, name: str = '', shape = [1], values = [0]):
        super().__init__(name, shape)
        values = np.asarray(values)
        intArray = values.astype(int)
        # Constants are expected to be (quantized) integers; reject values that
        # are not integral within a small tolerance.
        # Fix: the message was missing its f-prefix and printed "{name}" literally.
        assert (np.abs(values - intArray)).max() < 0.001, f"Constant value {name} is NOT an integer!"
        self.values = intArray  # Underlying weights in Python-type representation

        # Do not override - ConstantBuffers are assumed to be always live!
        self._live = True

    def __eq__(self, other):
        return super().__eq__(other) and np.array_equal(self.values, other.values)

    def _valueString(self) -> str:
        """Render the flattened values as a comma-separated C initializer list."""
        return ', '.join(str(value) for value in self.values.reshape(-1))

    def __str__(self) -> str:
        return f'ConstantBuffer: name: {self.name}, type: {self._type}'

    def __repr__(self) -> str:
        return f'ConstantBuffer: name: {self.name}, type: {self._type}'

    def _bufferRepresentation(self) -> Dict:
        return {"type": self._type, "name": self.name, "size": int(np.prod(self.shape)), "values": self._valueString()}

    @classmethod
    def fromVariableBuffer(cls, buffer: VariableBuffer, values):
        """Promote a VariableBuffer to a ConstantBuffer carrying `values`."""
        return cls(name = buffer.name, shape = buffer.shape, values = values)


class StructBuffer(VariableBuffer):
    """Class to represent Struct objects needed by the generated C code."""

    def __init__(self, name: str, structDict: Dict):
        # shape is None: struct buffers carry no tensor payload.
        super().__init__(name, None)
        self.structDict = structDict

    def __eq__(self, other):
        return super().__eq__(other) and hasattr(other, "structDict") and self.structDict == other.structDict

    def __str__(self) -> str:
        return f'StructBuffer: name: {self.name}, type: {self._type}'

    def __repr__(self) -> str:
        return f'StructBuffer: name: {self.name}, type: {self._type}'

    def _bufferRepresentation(self) -> Dict:
        return {"type": self._type, "name": self.name, "size": int(self._type.typeWidth), "structDict": self.structDict}
class _ReferenceBuffer(VariableBuffer):
    """Hoists a reference to a pre-established pointer; used mostly by tiling
    to express an offset relative to an input or output tensor.
    """

    allocTemplate = NodeTemplate("${type.typeName} ${name} = (${type.typeName}) ${objectName};")
    deallocTemplate = NodeTemplate("")
    initTemplate = NodeTemplate("")

    def __init__(self, name: str = '', shape = [1], reference: Optional[VariableBuffer] = None):

        assert reference is not None, "Can't have a reference to None!"

        super().__init__(name, shape)
        # Remember both the rendered pointer expression and the referenced
        # buffer's canonical name.
        self._referenceName = reference.name
        self._referencedBuffer = str(reference._instance)

    def _bufferRepresentation(self) -> Dict:
        representation = super()._bufferRepresentation()
        representation.update({'objectName': self._referencedBuffer})
        return representation
+ """ + + def __init__(self, + variableBuffer: Type[VariableBuffer], + constantBuffer: Type[ConstantBuffer], + structBuffer: Type[StructBuffer], + transientBuffer: Type[TransientBuffer], + globalObjects = {}, + localObjects = {}, + name: str = 'DeeployNetwork'): + self.globalObjects = OrderedDict() + self.localObjects = OrderedDict() + self.VariableBuffer = variableBuffer + self.ConstantBuffer = constantBuffer + self.StructBuffer = structBuffer + self.TransientBuffer = transientBuffer + self.name = name + + def dealiasBuffer(self, referenceName: str) -> str: + """Function to unravel reference instantiated in _ReferenceBuffer objects until the underlying VariableBuffer's name is returned + + Parameters + ---------- + referenceName : str + Name of the _ReferenceBuffer to unravel + + Returns + ------- + str + Name of the original VariableBuffer that was referenced + + Raises + ------ + Exception + Raises an Exception if references are circular, i.e. there + is no underlying VariableBuffer + + """ + _buffer = self.lookup(referenceName) + if not hasattr(_buffer, "_alias"): + return referenceName + + seenAliases: Set[str] = set() + + alias = _buffer._alias + while hasattr(self.lookup(alias), "_alias"): + seenAliases.add(alias) + alias = self.lookup(alias)._alias + + if alias in seenAliases: + raise Exception("Circular aliasing detected!") + + return alias + + def exportNetworkContext(self, folderPath: str, fileName: str): + """Exports the NetworkContext as a pickled dictionary + + Parameters + ---------- + folderPath : str + Path to the location where this pickled context should be + saved + fileName : str + Name of the pickled context file + + Raises + ------ + OSError + Raises an OSError if the path is not valid + + """ + relativePath = os.path.join(folderPath, fileName + _ctxtExtension) + absolutePath = os.path.abspath(relativePath) + + if not os.path.isabs(absolutePath): + raise OSError(f"Error exporting the context to: {absolutePath}") + + with open(absolutePath, 
    @staticmethod
    def importNetworkContext(folderPath, fileName):
        """Imports a pickled NetworkContext that was saved using exportNetworkContext

        Parameters
        ----------
        folderPath : str
            Path to the location where the pickled context is stored
        fileName : str
            Name of the pickled context file (without the .pkl extension)

        Raises
        ------
        OSError
            Raises an OSError if the pickled context file does not exist
        """
        relativePath = os.path.join(folderPath, fileName + _ctxtExtension)
        absolutePath = os.path.abspath(relativePath)

        # NOTE(review): os.path.abspath always returns an absolute path, so the
        # isabs() half of this check can never fail; only the existence check bites.
        if not os.path.isabs(absolutePath) or not os.path.exists(absolutePath):
            raise OSError(f"File or path does not exist: {absolutePath}")

        # NOTE(review): pickle.load can execute arbitrary code from the file -
        # only import context files from trusted sources.
        with open(absolutePath, 'rb') as f:
            return pickle.load(f)

    def __repr__(self):
        # Render both registries, one buffer per line.
        globalObjects = []
        localObjects = []
        for item in self.globalObjects.values():
            globalObjects.append(str(item))
        for item in self.localObjects.values():
            localObjects.append(str(item))
        _repr = "globalObjects: {\n"
        _repr += ",\n ".join(globalObjects)
        _repr += "} \n\n"
        _repr += "localObjects: {\n"
        _repr += ",\n ".join(localObjects)
        _repr += "}"
        return _repr

    def __eq__(self, other):
        # Contexts are equal iff both registries hold the same names mapping to
        # equal buffers. Comparison with non-contexts is a programming error.
        if not isinstance(other, NetworkContext):
            raise TypeError(f'Cannot compare NetworkContext with {type(other)}!')

        if not other.globalObjects.keys() == self.globalObjects.keys():
            return False

        if not other.localObjects.keys() == self.localObjects.keys():
            return False

        for buffer_name in self.globalObjects.keys():
            if not self.globalObjects[buffer_name] == other.globalObjects[buffer_name]:
                return False

        for buffer_name in self.localObjects.keys():
            if not self.localObjects[buffer_name] == other.localObjects[buffer_name]:
                return False

        return True
return repStr + + def add(self, obj: VariableBuffer, ctxt: str = 'local', _id: str = ""): + """Adds a VariableBuffer object to the NetworkContext + + Parameters + ---------- + obj : VariableBuffer + The VariableBuffer object to be registered + ctxt : str + Level of the NetworkContext to register the VariableBuffer in, either local or global + _id : str + Override for the registration name of the + VariableBuffer. Do not use unless you have a good reason! + + Raises + ------ + ValueError + Raises a ValueError if ctxt is not local or global + KeyError + Raises a KeyError if the VariableBuffer's name is already + registered within the NetworkContext + + + """ + if _id != "": + obj.name = self._mangle(_id + "_" + obj.name, False) + + if ctxt == 'local': + if obj.name not in self.localObjects.keys(): + self.localObjects[obj.name] = obj + else: + raise KeyError(f'Buffername {obj.name} was already in the local context!') + elif ctxt == 'global': + if obj.name not in self.globalObjects.keys(): + self.globalObjects[obj.name] = obj + else: + raise KeyError(f'Buffername {obj.name} was already in the global context!') + else: + raise ValueError("Expected either local or global context") + + def lookup(self, name: str, _id: str = "") -> Union[VariableBuffer, GlobalDefinition]: + """Returns the VariableBuffer or GlobalDefinition registered under a given name + + Parameters + ---------- + name : str + Name of the VariableBuffer to look up + _id : str + Override for the registration name of the + VariableBuffer. Do not use unless you have a good reason! 
+ + Returns + ------- + Union[VariableBuffer, GlobalDefinition] + Registered buffer object + + Raises + ------ + KeyError + Raises a KeyError if the name does not match with any + registered object + + """ + + if _id != "": + name = self._mangle(_id + "_" + name, False) + + if name in self.localObjects.keys(): + return self.localObjects[name] + elif name in self.globalObjects.keys(): + return self.globalObjects[name] + else: + raise KeyError(f'Expected key {name} to be in either local or global context!') + + def is_global(self, name: str) -> bool: + """Checks whether a name is associated with a global buffer + + Parameters + ---------- + name : str + Name of the VariableBuffer to check for + + Returns + ------- + bool + Returns true if the name matches with any global buffer + + """ + if name in self.globalObjects.keys(): + return True + else: + return False + + def is_local(self, name: str) -> bool: + """Checks whether a name is associated with a local buffer + + Parameters + ---------- + name : str + Name of the VariableBuffer to check for + + Returns + ------- + bool + Returns ture if the name matches with any local buffer + + """ + + if name in self.localObjects.keys(): + return True + else: + return False + + def hoistTransientBuffer(self, name: str, size: int) -> str: + """Registers a new TransientBuffer in the local context + + Parameters + ---------- + name : str + Name of the TransientBuffer to register + size : int + BYTE size of the TransientBuffer to register + + Returns + ------- + str + On success, return the name of the registered buffer + + """ + transientBuffer = self.TransientBuffer(name, size) + self.add(transientBuffer, 'local') + + return name + + def hoistGlobalDefinition(self, name: str, definition: str) -> None: + """Registers a new GlobalDefinition in the global context + + Parameters + ---------- + name : str + Name of the GlobalDefinition to register + definition : str + Program code of the GlobalDefinition + + """ + + _definition = 
GlobalDefinition(name, definition) + self.add(_definition, 'global') + + def hoistStruct(self, _struct: Union[Dict[str, BaseType], Struct], name: str, _type: Type[Struct]) -> str: + """Register a Struct with the local context + + Parameters + ---------- + _struct : Union[Dict[str, BaseType], Struct] + Struct object or Struct object's definition + name : str + Name to register the struct under + _type : Type[Struct] + Type definition of the Struct class to register + + Returns + ------- + str + On success, return the name of the registered buffer + + """ + + if isinstance(_struct, _type): + struct = _struct + else: + struct = _type(_struct, self) + + structBuffer = self.StructBuffer(name, struct) + structBuffer._type = _type + structBuffer._instance = struct + self.add(structBuffer, 'local') + + return name + + def hoistConstantAndReference(self, constBuf: ConstantBuffer, pointerType: Type[Pointer]) -> str: + """Helper function to hoist a new ConstantBuffer and a _ReferenceBuffer to it. Mostly used in tiling to create boilerplate for tiled variables. 
+ + Parameters + ---------- + constBuf : ConstantBuffer + ConstantBuffer to hoist + pointerType : Type[Pointer] + Pointer class to assign to the constant buffer + + Returns + ------- + str + name of the registered _ReferenceBuffer + + """ + + name = constBuf.name + constBuf._type = pointerType + + self.add(constBuf, "global") + + constBuf._instance = constBuf._type(name, self) + + refName = name + "_ref" + reference = self.hoistReference(name, refName) + + return refName + + def hoistReference(self, _reference: str, name: str) -> str: + """Helper function to register a _ReferenceBuffer to preexisting VariableBuffer + + Parameters + ---------- + _reference : str + Name of the VariableBuffer that should be referenced + name : str + Name of the _ReferenceBuffer that should be registered + + Returns + ------- + str + Returns the name of the newly registered _ReferenceBuffer + + """ + + assert _reference != name, f"Reference name {_reference} cannot be the same as {name}" + assert not self.is_local(name), f"{name} is already in context!" 
+ + _object = self.lookup(_reference) + + referenceBuffer = _ReferenceBuffer(name, reference = _object) + referenceBuffer._type = _object._type + + self.add(referenceBuffer, 'local') + referenceBuffer._instance = _object._type(name, ctxt = self) + + return name + + def hoistConstant(self, node: gs.Node, name: str = '', _type: Optional[Type[Pointer]] = None) -> str: + """Register a ConstantBuffer extracted directly from a graphsurgeon Node + + Parameters + ---------- + node : gs.Node + graphsurgeon.Node containing a single constant output + name : str + Name of the ConstantBuffer to be registered + _type : Optional[Type[Pointer]] + Optional type assignment of the registered ConstantBuffer + + Returns + ------- + str + Returns the name of the newly registed ConstantBuffer + + """ + + assert len(node.outputs) <= 1, "Constant has more than one output" + + if name == "": + name = node.name + + # SCHEREMO: This is currently heuristic, but should be annotated in ONNX + localBuffer = self.VariableBuffer.fromNode(node = node) + globalBuffer = self.ConstantBuffer.fromVariableBuffer(localBuffer, values = node.values) + globalBuffer.name = name + globalBuffer._type = type + + self.add(globalBuffer, 'global') + + return globalBuffer.name + + def addUser(self, name: str, node: gs.Node): + """Adds an operator's name to the _user list of a VariableBuffer in the context + + Parameters + ---------- + name : str + Name of the VariableBuffer that gets used by the node + node : gs.Node + graphsurgeon Node of the operator + + """ + + _buffer = self.lookup(name) + if node.name not in _buffer._users: + _buffer._users.append(node.name) + if self.is_local(_buffer.name): + self.localObjects[_buffer.name] = _buffer + else: + self.globalObjects[_buffer.name] = _buffer + + def annotateType(self, name: str, _type: Type[Pointer]): + """Annotates a Deeploy-type pointer on the _type field of a VariableBuffer + + Parameters + ---------- + name : str + Name of the VariableBuffer to annotate + _type : 
Type[Pointer] + Type of the Deeploy-type pointer to annotate the + VariableBuffer with + + """ + obj = self.lookup(name) + obj._type = _type + obj._instance = _type(name, ctxt = self) + + def copy(self) -> NetworkContext: + """Return a shallow copy of this NetworkContext + + """ + return copy.copy(self) + + +class NodeParser(): + """Deeploy's core Parser class. Analyzes network nodes and evaluates whether they can be mapped by it. + + """ + + def __init__(self): + self.operatorRepresentation: OperatorRepresentation = { + } #: Dict[str, Any]: The internal representation of the operator this parser has analyzed that describes all relevant attributes of the node to be used by code generation + + @abstractmethod + def parseNode(self, node: gs.Node) -> bool: + """Parser-specific method to-be-implemented. Given a graphsurgeon node, this method returns whether its attributes are mappable by this parser. + + Parameters + ---------- + node : gs.Node + Graphsurgeon node to be evaluated + + Returns + ------- + bool + False if any attribute in the node cannot be mapped + correctly. + + """ + return True + + @abstractmethod + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """Parses the node's input and output tensors, and adds them to its operatorRepresentation. May also be used to assert certain input- and output-level characteristics like correct dimensions. + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Node to be analyzed + channels_first : bool + Flag to indicate whether tensor dimensions are expected to + be in CxHxW layout (true) or HxWxC layout (false) + + Returns + ------- + Tuple[NetworkContext, bool] + Tuple of the updated NetworkContext and return boolean to + indicate whether the node, including it's IO tensors can + be mapped. 
+ + """ + + return ctxt, True + + @classmethod + def parseInputs(cls, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: + """DONT OVERRIDE - Takes care of hoisting IO tensors into the NetworkContext. Also verifies + that all inputs have been registered and the output has not been registered. + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Node whose IO tensors should be hoisted + + Returns + ------- + NetworkContext + Updated NetworkContext with hoisted IO tensors + + """ + data_in_buffers = [] + for inputNode in node.inputs: + data_in = inputNode.name + + # Hoist constant inputs + if type(inputNode) == gs.ir.tensor.Constant and not ctxt.is_global(data_in): + ctxt.hoistConstant(inputNode) + else: + localBuffer = ctxt.lookup(data_in) + data_in_buffers.append(localBuffer.name) + + ctxt.addUser(data_in, node) + + return ctxt + + @classmethod + def parseOutputs(cls, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: + """DONT OVERRIDE - registers the output tensor of the operator + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Operator whose outputs should be parsed + + Returns + ------- + NetworkContext + Updated NetworkContext + + """ + outputNodes = node.outputs + outputNames = [node.name for node in outputNodes] + + for node, name in zip(outputNodes, outputNames): + if not ctxt.is_global(name): + nb = ctxt.VariableBuffer(name = name, shape = node.shape) + ctxt.add(nb, 'local') + else: + nb = ctxt.lookup(name) + + return ctxt + + # Don't touch this + def parse(self, + ctxt: NetworkContext, + node: gs.Node, + default_channels_first: bool = True, + ioParse: bool = True) -> Tuple[NetworkContext, bool]: + """DONT OVERRIDE - Uses other NodeParser functions to implement a full parsing passing of the node + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Node to be parsed + default_channels_first : bool + The default 
`channels_first` value if none is provided by the node's attributes + ioParse : bool + Flag to indicate whether to go through IO parsing or not + + Returns + ------- + Tuple[NetworkContext, bool] + Returns updated NetworkContext and flag to indicate + success + + """ + self.operatorRepresentation = {} + + if "channels_first" in node.attrs: + self.operatorRepresentation['channels_first'] = node.attrs['channels_first'] + else: + self.operatorRepresentation['channels_first'] = default_channels_first + + ret = self.parseNode(node) + + if not ret: + return ctxt, False + + if ioParse: + ctxt = ctxt.copy() + ctxt = self.parseInputs(ctxt, node) + ctxt = self.parseOutputs(ctxt, node) + + return ctxt, True + + +class NodeTypeChecker(): + """Implements type checking according to user-defined rules to assign Deeploy-types to the Python-typed input graph + + """ + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + """Generate a type checking rule + + Parameters + ---------- + input_types : Sequence[Type[Pointer]] + Ordered sequence of Deeploy-types that should be assigned + to the operator's Python-typed input tensor + output_types : Sequence[Type[Pointer]] + Ordered sequence of Deeploy-types that should be assigned + to the operator's Python-typed input tensor + + """ + + self.input_types = input_types + self.output_types = output_types + + self.typeDict: Dict[str, Type[Pointer]] = { + } #: Dict[str, Type[Pointer]]: Stores the type assignment of the input and output tensors, mapping them to the names defined by the NodeParser + + def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool: + """TypeCheck method to-be-implemented. 
Returns whether the type checking rule is met or not + + Parameters + ---------- + inputs : List[VariableBuffer] + Ordered list of operator inputs to be used for inferring + the output type + operatorRepresentation : OperatorRepresentation + NodeParser's operatorRepresentation + + Returns + ------- + bool + True if output type can be assigned as defined in + output_types + + """ + return True + + def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, + operatorRepresentation: OperatorRepresentation) -> NetworkContext: + """DONT OVERRIDE - Annotates each VariableBuffer in the NetworkContext corresponding to an output of the operator with this rule's output types. + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Node whose outputs should be annotated + operatorRepresentation : OperatorRepresentation + NodeParser's operatorRepresentation + + Returns + ------- + NetworkContext + Updated NetworkContext + + """ + newCtxt = ctxt.copy() + + inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs] + outputNames = [node.name for node in node.outputs] + + outputTypes = self.output_types + + for name, output_type in zip(outputNames, outputTypes): + newCtxt.annotateType(name, output_type) + + return newCtxt + + def typeCheckNodeInputs(self, ctxt: NetworkContext, node: gs.Node) -> bool: + """DONT OVERRIDE - Type checks all input nodes to confirm they either already are assigned the correct type or their type can be statically upcast to the rule's input types + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Node whose inputs should be analyzes + + Returns + ------- + bool + Whether the input's types match the rule's requirements + + """ + retCheck = True + + for inputNode, _type in zip(node.inputs, self.input_types): + reference = ctxt.lookup(inputNode.name) + + if not isinstance(reference, VariableBuffer): + return False + + if hasattr(reference, "values"): + retCheck 
&= _type.referencedType.checkPromotion(reference.values) + else: + if ctxt.is_global(inputNode.name): + retCheck &= _type.referencedType.partialOrderUpcast(reference._type.referencedType) + + if retCheck: + reference._type = _type + reference._instance = _type(inputNode.name, ctxt) + else: + retCheck &= reference._type.referencedType == _type.referencedType + return retCheck + + def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: + for inputNode, _type in zip(node.inputs, self.input_types): + if isinstance(ctxt.lookup(inputNode.name), ConstantBuffer): + reference = ctxt.lookup(inputNode.name) + if not _type.referencedType.checkPromotion(reference.values): + raise Exception(f"Can't cast {reference} to {_type}!") + + ctxt.annotateType(inputNode.name, _type) + + return ctxt + + def annotateDict(self, ctxt: NetworkContext, node: gs.Node, operatorRepresentation: OperatorRepresentation): + """Store the inferred typing information into the rule's type dict + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Operator whose inputs and outputs should be considered + operatorRepresentation : OperatorRepresentation + The NodeParser's operatorRepresentation + + """ + env = [node.name for node in node.inputs + node.outputs] + for key, value in operatorRepresentation.items(): + # check if the referenced buffer is in the environment + if isinstance(value, str) and value in env: + self.typeDict[key + '_type'] = ctxt.lookup(value)._type + + def typeCheck(self, ctxt: NetworkContext, node: gs.Node, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, bool]: + """DONT OVERRIDE - Uses other NodeTypeChecker methods to implement full type inference on a single node + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Node that should be used for type inference + operatorRepresentation : OperatorRepresentation + The NodeParser's 
operatorRepresentation + + Returns + ------- + Tuple[NetworkContext, bool] + Updated NetworkContext and whether type inference was + successful with this rule. + + """ + newCtxt = ctxt.copy() + + if not self.typeCheckNodeInputs(newCtxt, node): + return ctxt, False + + if not self.checkOutputType(node.inputs, operatorRepresentation): + return ctxt, False + + newCtxt = self.typeInferGlobalCtxt(newCtxt, node) + newCtxt = self.typeInferOutput(newCtxt, node, operatorRepresentation) + self.annotateDict(newCtxt, node, operatorRepresentation) + return (newCtxt, True) + + +class ExecutionBlock(): + """Deeploy abstraction to represent a operator whose kernel has been determined. Mostly used to apply various code transformations, and, finally, generate C Code + + """ + + def __init__(self, operatorCodeSnippet: Optional[CodeSnippet] = None): + """Initialize a new ExecutionBlock object from a CodeSnippet + + Parameters + ---------- + codeSnippet : Optional[CodeSnippet] + NodeTemplate + operatorRepresentation combination that makes up this + ExecutionBlock + """ + if operatorCodeSnippet is not None: + self.codeSnippets = deque([operatorCodeSnippet]) + else: + self.codeSnippets = deque( + [] + ) #: Sequence[CodeSnippet]: ordered list of code snippets that need to be generated to implemented the associated operator + + self.patternMemoryConstraint: Optional = None #: Optional[PatternMemoryConstraint]: Tiling information of the operator which is annotated in the midend + + def addLeft(self, template: NodeTemplate, operatorRepresentation: OperatorRepresentation): + """Adds a code snippet that is generated BEFORE any of the other code snippets in this ExecutionBlock + + Parameters + ---------- + template : NodeTemplate + NodeTemplate that represents the code snippet to be added + operatorRepresentation : OperatorRepresentation + Dictionary that holds all expressions to generate code + from the template + + + """ + self.codeSnippets.appendleft(CodeSnippet(template, 
operatorRepresentation)) + + def addRight(self, template: NodeTemplate, operatorRepresentation: OperatorRepresentation): + """Adds a code snippet that is generated AFTER any of the other code snippets in this ExecutionBlock + + Parameters + ---------- + template : NodeTemplate + NodeTemplate that represents the code snippet to be added + operatorRepresentation : OperatorRepresentation + Dictionary that holds all expressions to generate code + from the template + + """ + + self.codeSnippets.append(CodeSnippet(template, operatorRepresentation)) + + def hoisting(self, ctxt: NetworkContext, **kwargs) -> Tuple[NetworkContext, List[str]]: + """Helper function to run the underlying NodeTemplate's hooks to add TransientBuffers into the NetworkContext and call their alignToContext methods + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + + Returns + ------- + Tuple[NetworkContext, List[str]] + Updated NetworkContext and a list of newly registered + buffer names + + """ + + transientBuffers = [] + contextBuffers = [] + + for idx, codeSnippet in enumerate(self.codeSnippets.copy()): + + template, operatorRepresentation = codeSnippet.template, codeSnippet.operatorRepresentation + + newCtxt, operatorRepresentation, _transientBuffers = template.hoistTransientBuffers( + ctxt, { + **operatorRepresentation, + **kwargs + }) + newCtxt, operatorRepresentation, _contextBuffers = template._alignToContext( + newCtxt, { + **operatorRepresentation, + **kwargs + }) + + self.codeSnippets[idx].operatorRepresentation.update(operatorRepresentation) + transientBuffers += _transientBuffers + contextBuffers += _contextBuffers + + return newCtxt, transientBuffers + contextBuffers + + @staticmethod + def _mangleNodeRep(ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> OperatorRepresentation: + parseDict = {} + + for key, value in operatorRepresentation.items(): + if type(value) == str and (ctxt.is_local(value) or + ctxt.is_global(value)) and not 
isinstance(ctxt.lookup(value), GlobalDefinition): + parseDict[key] = ctxt._mangle(value) + else: + parseDict[key] = value + + return parseDict + + def generate(self, ctxt: NetworkContext, **kwargs) -> str: + """Generates the code for all registered NodeTemplates and joins it to construct a single snippet + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + + Returns + ------- + str + Code snippet that represent the entire ExecutionBlock + + """ + + return ("\n").join([ + codeSnippet.template.generate( + ExecutionBlock._mangleNodeRep(ctxt, { + **codeSnippet.operatorRepresentation, + **kwargs + })) for codeSnippet in self.codeSnippets + ]) + + +class NodeBinding(): + """Deeploy's class to bind individual NodeTypeChecker objects to NodeTemplate and associate a CodeTransformation. + + """ + + def __init__(self, typeChecker: NodeTypeChecker, template: NodeTemplate, codeTransformer: CodeTransformation): + self._typeChecker = typeChecker #: NodeTypeChecker: The NodeTypeChecker that verifies the kernel template's signature can be matched to the node + self.template = template #: NodeTemplate: The kernel template you want to bind + self._executionBlock: ExecutionBlock = ExecutionBlock( + ) #: ExecutionBlock: The executionBlock that will be built from the NodeTemplate + self._nodeName: str + self.buffers: List[VariableBuffer] = [] + self.codeTransformer: CodeTransformation = codeTransformer + + @property + def typeChecker(self): + """Read-only wrapper around the encapsulated type checker + """ + return self._typeChecker + + @property + def executionBlock(self): + """Read-only wrapper around the encapsulated execution block + """ + return self._executionBlock + + @property + def nodeName(self): + """Read-only wrapper around the encapsulated node's name + """ + return self._nodeName + + def earlyBinding(self, ctxt: NetworkContext, node: gs.Node, + operatorRepresentation: OperatorRepresentation) -> NetworkContext: + """Initializes the executionBlock 
with the NodeTemplate + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + The operator this NodeBinding is associated with + operatorRepresentation : OperatorRepresentation + The NodeParser's operatorRepresentation + + Returns + ------- + NetworkContext + Updated NetworkContext + + """ + self.executionBlock.addLeft(self.template, operatorRepresentation) + self._nodeName = operatorRepresentation['nodeName'] + return ctxt + + def typeCheck(self, ctxt: NetworkContext, node: gs.Node, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, bool]: + """Runs the binding-level typechecker on a node + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + The node to be typechecked + operatorRepresentation : OperatorRepresentation + The NodeParser's operatorRepresentation + + Returns + ------- + Tuple[NetworkContext, bool] + Updated and NetworkContext and true if the typing rule + matches the node + + """ + newCtxt, ret = self.typeChecker.typeCheck(ctxt.copy(), node, operatorRepresentation) + if ret: + return newCtxt, True + + return ctxt, False + + def bind(self, ctxt: NetworkContext, node: gs.Node, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[str], bool]: + """Initializes the executionBlock and hoist all necessary transient buffers of the underlying NodeTemplate + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + The node that should be bound + operatorRepresentation : OperatorRepresentation + The NodeParser's operatorRepresentation + Returns + ------- + Tuple[NetworkContext, List[str], bool] + Updated NetworkContext, a list of names of transient + buffers that were hoisted and true if binding succeeded + + """ + newCtxt = self.earlyBinding(ctxt, node, operatorRepresentation) + newCtxt, buffers = self.executionBlock.hoisting(newCtxt, **self.typeChecker.typeDict) + + for _buffer in buffers: + 
newCtxt.lookup(_buffer)._users.append(self._nodeName) + + return newCtxt, [], True + + def codeTransform(self, ctxt: NetworkContext, verbose: CodeGenVerbosity = _NoVerbosity): + """Applies the CodeTransformer's passes on the executionBlock + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + verbose : CodeGenVerbosity + Verbosity options to control code generation + + """ + ctxt, self._executionBlock = self.codeTransformer.transform(ctxt, self.executionBlock, self.nodeName, verbose) + return ctxt + + def generate(self, ctxt: NetworkContext) -> List[str]: + """Generates C Code from the encapsulated executionBlock + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + + Returns + ------- + List[str] + A list of C Code snippets to be pasted into the final + program + + """ + nodeCall = self.executionBlock.generate(ctxt, **self.typeChecker.typeDict) + return [nodeCall] + + +class NodeMapper(): + """Deeploy class to link a NodeParser and several NodeBindings + """ + + def __init__(self, parser: NodeParser, bindings: List[NodeBinding]): + self.parser = parser #: NodeParser: The NodeParser object which is used to determine whether an operator may be bound to one of the associated bindings + self.bindings = bindings #: List[NodeBinding]: All possible bindings that correspond to the linked parser + + self.binder: NodeBinding #: NodeBinding: The currently chosen NodeBinding + self.bound = False #: bool: Indicates whether a binder has been chosen or not + + self.discardedBindings = set() #: Set[NodeBinding]: Set of all bindings which have been tried unsuccessfully. + + # Don't override this. 
Parses the networks with the correct data type
    def _parse(self,
               ctxt: NetworkContext,
               node: gs.Node,
               default_channels_first: bool = True,
               ioParse: bool = True) -> Tuple[NetworkContext, bool]:

        # Parse on a copy so a failed attempt leaves the caller's context untouched.
        newCtxt, ret = self.parser.parse(ctxt.copy(), node, default_channels_first, ioParse)
        if ret:
            return newCtxt, True

        return ctxt, False

    def _parseCtxt(self,
                   ctxt: NetworkContext,
                   node: gs.Node,
                   default_channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        # Second parsing stage: bind the node's IO tensors into the context.
        newCtxt, ret = self.parser.parseNodeCtxt(ctxt.copy(), node, default_channels_first)
        return (newCtxt, ret)

    def bindingsExhausted(self) -> bool:
        """Returns whether all bindings have been tried

        Returns
        -------
        bool
            True is no more bindings are possible

        """
        return len(self.discardedBindings) == len(self.bindings)

    def discardCurrentBinder(self):
        """Discards the binder object

        """
        self.discardedBindings.add(self.binder)
        self.binder = None
        self.bound = False

    def resetDiscardedBindings(self):
        """Reset the discardedBindings set

        """
        self.discardedBindings = set()

    def typeCheck(self, ctxt: NetworkContext, node: gs.Graph) -> Tuple[NetworkContext, bool]:
        """Tries to elect a binder object whose typeChecker allows the node configuration

        Parameters
        ----------
        ctxt : NetworkContext
            Current NetworkContext
        node : gs.Graph
            The node that is being evaluated

        Returns
        -------
        Tuple[NetworkContext, bool]
            Updated NetworkContext and bool to indicate success or
            failure

        """
        # Try bindings in declaration order; the first whose type checker
        # accepts the node wins. Rejected bindings are remembered so a later
        # retry does not re-evaluate them.
        for binder in self.bindings:

            if binder in self.discardedBindings:
                continue

            newCtxt, ret = binder.typeCheck(ctxt.copy(), node, self.parser.operatorRepresentation)

            if not ret:
                self.discardedBindings.add(binder)
                continue

            self.binder = binder
            return newCtxt, True

        return ctxt, False

    # Don't override this. 
This should annotate the output node with the correct data type + # SCHEREMO: Currently simply binds the first viable binding + def bind(self, ctxt: NetworkContext, node: gs.Node) -> Tuple[NetworkContext, List[str], bool]: + """Invokes the binder's bind method to setup the executionBlock and buffer hoisting + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + node : gs.Node + Node that should be bound + + Returns + ------- + Tuple[NetworkContext, List[str], bool] + Updated NetworkContext, list of hoisted TransientBuffers' + names, boolean to indicate success or failure + + """ + + newCtxt, transientBuffers, ret = self.binder.bind(ctxt.copy(), node, self.parser.operatorRepresentation) + if not ret: + return ctxt, [], False + + self.bound = True + + return newCtxt, transientBuffers, True + + def generate(self, ctxt: NetworkContext) -> List[str]: + """Generates the C Code of the binder elected by this mapper + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + + Returns + ------- + List[str] + Returns a list of code snippets that correspond to the + operator's invocation + + Raises + ------ + RuntimeError + Raises a RuntimeError if no binder has been elected or the + binder has not been bound yet. 
+ + """ + if not self.bound: + raise RuntimeError("Bind layer before generating code!") + return self.binder.generate(ctxt) + + +class ONNXLayer(): + """Deeploy abstraction to represent one operator in an ONNX graph + """ + + def __init__(self, maps: List[NodeMapper]): + self.maps = maps #: List[NodeMapper]: All potential mappings of an ONNX Layer + + self.mapper: NodeMapper #: NodeMapper: The currently elected NodeMapper to represent this layer + self.discardedMappers: Set[NodeMapper] = set( + ) #: Set[NodeMapper]: Set of all NodeMappers which cannot be used to represent this layer + self.node: gs.Node = None #: gs.Node: The represented operator + + def computeOps(self): + """Returns the number of operations (1 MAC = 2 Ops) of this operator + """ + assert self.mapper is not None, "To compute Ops, network must first be parsed!" + + return 0 + + # Override this for broadcasting support + # Returns a tuple of new, broadcasted inputShapes and outputShapes + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation: OperatorRepresentation, + channels_first: bool) -> Tuple[Shape, Shape]: + """Takes input and output shapes from the graph-representation and broadcasts them to a predefined layout + + Parameters + ---------- + inputShapes : Shape + Graph-level input shape + outputShapes : Shape + Graph-level output shapes + operatorRepresentation : OperatorRepresentation + The node's operatorRepresentation + channels_first : bool + Whether this operator's data layout is in CxHxW (true) or + HxWxC (false) layout + + Returns + ------- + Tuple[Shape, Shape] + Returns broadcasted shapes + + """ + return (inputShapes, outputShapes) + + def broadcast(self, ctxt: NetworkContext, default_channels_first: bool = True) -> (NetworkContext): + """Broadcasts the operator's shapes and updates the NetworkContext + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + default_channels_first : bool + Whether the default layout if 
channels-first or not + + Returns + ------- + (NetworkContext) + Updated NetworkContext + + Raises + ------ + KeyError + Raises a KeyError if any tensor required is not found in + the NetworkContext + RuntimeError + Raises a RuntimeError if any tensor's shape could not be + broadcast to the target shape + + """ + inputShapes = [ctxt.lookup(node.name).shape for node in self.node.inputs] + outputShapes = [ctxt.lookup(node.name).shape for node in self.node.outputs] + + if not "channels_first" in self.mapper.parser.operatorRepresentation: + channels_first = default_channels_first + else: + channels_first = self.mapper.parser.operatorRepresentation['channels_first'] + + newInputShapes, newOutputShapes = self.computeShapes(inputShapes, outputShapes, + self.mapper.parser.operatorRepresentation, channels_first) + + for node, newShape in zip(self.node.inputs + self.node.outputs, newInputShapes + newOutputShapes): + if ctxt.is_local(node.name): + ctxt.localObjects[node.name].shape = newShape + # Update shape of tensors in onnx graph + node.shape = newShape + + # WIESEP: It is possible that the type was not yet set, so we assume some default type + # At this state, we assume that all local buffers are float32 type inference is not yet done. + if node.dtype is None: + node.dtype = np.float32 + + elif ctxt.is_global(node.name): + ctxt.globalObjects[node.name].shape = newShape + if isinstance(ctxt.globalObjects[node.name], ConstantBuffer): + + # If the number of elements is equal, reshape + if np.prod(ctxt.globalObjects[node.name].values.shape) == np.prod(newShape): + ctxt.globalObjects[node.name].values.reshape(newShape) + # The number of elements SHOULD be lower, and we broadcast + else: + try: + ctxt.globalObjects[node.name].values = np.broadcast_to(ctxt.globalObjects[node.name].values, + newShape) + except: + raise RuntimeError( + f"Could not broadcast {node.name} from {ctxt.globalObjects[node.name].values.shape} to {newShape}!" 
+ ) + + else: + raise KeyError(f'Expected node {node.name} to be in context!') + + return ctxt + + # Don't override - binds the layer to a node + def __call__(self, node: gs.Node): + _copy = copy.deepcopy(self) + _copy.node = node + return _copy + + def discardCurrentMapper(self): + """Discard the current Mapper + + """ + self.dicardedMappers.add(self.mapper) + self.mapper = None + + def resetDiscardedMappers(self): + """Reset all discarded mappers + + """ + for mapper in self.maps: + mapper.resetDiscardedBindings() + self.discardedMappers = set() + + def parse(self, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[NetworkContext, bool]: + """Iterate through all possible mappers and elect the first one that work + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + default_channels_first : bool + Whether the default layout if channels-first or not + + Returns + ------- + Tuple[NetworkContext, bool] + Updated NetworkContext and boolean to indicate success or + failure + + + """ + ioParse = True + + # iterate through all possible mappings and return the first that works + for idx, mapper in enumerate(self.maps): + + if mapper in self.discardedMappers: + continue + + newCtxt = ctxt.copy() + + newCtxt, ret = mapper._parse(newCtxt, self.node, default_channels_first, ioParse) + + ioParse = not ret + + if not ret: + self.discardedMappers.add(mapper) + continue + + self.mapper = mapper + + self.broadcast(newCtxt, default_channels_first) + + newCtxt, ret = mapper._parseCtxt(newCtxt, self.node, default_channels_first) + + if not ret: + self.discardedMappers.add(mapper) + continue + + self.mapper.parser.operatorRepresentation['nodeOp'] = self.node.op + self.mapper.parser.operatorRepresentation['nodeName'] = self.node.name + + return newCtxt, True + + return ctxt, False + + def _broadcastToNpType(self, ty: Type[BaseType]): + + def _broadcastInteger(ty: Type[IntegerImmediate]): + if ty.signed: + return np.dtype(getattr(np, "int" + 
str(ty.typeWidth))) + else: + return np.dtype(getattr(np, "uint" + str(ty.typeWidth))) + + if issubclass(ty, Pointer) and hasattr(ty, "referencedType"): + if issubclass(ty.referencedType, IntegerImmediate): + return _broadcastInteger(ty.referencedType) + elif issubclass(ty, IntegerImmediate): + return _broadcastInteger(ty) + + return None + + def typeCheck(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: + """Invokes the mapper's typeCheck method + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + + Returns + ------- + Tuple[NetworkContext, bool] + Updated NetworkContext and boolean to indicate success or + failure + + """ + + newCtxt = ctxt.copy() + newCtxt, ret = self.mapper.typeCheck(newCtxt, self.node) + + if ret: + return newCtxt, True + + return ctxt, ret + + def bind(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: + """Attempt to bind the mapper; discard mapper if binding does not work + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + + Returns + ------- + Tuple[NetworkContext, bool] + Updated NetworkContext and boolean to indicate success or + failure + """ + + newCtxt = ctxt.copy() + newCtxt, _, ret = self.mapper.bind(newCtxt, self.node) + + if ret: + # Update onnx graph with name of the template class + self.node.attrs['mapping'] = str(self.mapper.binder.template.__class__).split("'")[1] + + # Update shapes and types of tensors in onnx graph based on type inference after binding + for node in (self.node.inputs + self.node.outputs): + if ctxt.is_local(node.name): + node.shape = ctxt.localObjects[node.name].shape + npType = self._broadcastToNpType(ctxt.localObjects[node.name]._type) + if npType is not None: + node.dtype = npType + elif ctxt.is_global(node.name): + npType = self._broadcastToNpType(ctxt.globalObjects[node.name]._type) + if isinstance(ctxt.globalObjects[node.name], ConstantBuffer): + if isinstance(node, gs.Constant): + node.values = node.values.astype(npType) + 
else: + node.shape = ctxt.globalObjects[node.name].shape + if npType is not None: + node.dtype = npType + + # WIESEP: Compute number of ops only after binding. + self.mapper.parser.operatorRepresentation['nodeOps'] = int(self.computeOps()) + return newCtxt, True + + self.discardedMappers.append(self.mapper) + return ctxt, False + + def codeTransform(self, ctxt: NetworkContext, verbose: CodeGenVerbosity = _NoVerbosity) -> NetworkContext: + """Apply CodeTransformations to associated mapper's binder + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + verbose : CodeGenVerbosity + CodeGenVerbosity object to control verbosity of generated + code + Returns + ------- + Tuple[NetworkContext, bool] + Updated NetworkContext + + """ + newCtxt = self.mapper.binder.codeTransform(ctxt, verbose) + return newCtxt + + def generate(self, ctxt: NetworkContext) -> Tuple[NetworkContext, List[str]]: + """Invoke mapper's generate method + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + + Returns + ------- + Tuple[NetworkContext, List[str]] + Updated NetworkContext and flag to indicate success + + """ + + call = self.mapper.generate(ctxt) + + generated_code = [call] + return (ctxt, generated_code) + + +class TopologyOptimizationPass(): + """Abstract pass object which modifies an ONNX graph + + """ + + def __init__(self): + pass + + def apply(self, graph: gs.Graph) -> Tuple[gs.Graph]: + """Applies a transformation to a graph + + Parameters + ---------- + graph : gs.Graph + The neural network being deployed + + Returns + ------- + Tuple[gs.Graph] + A modified version of the neural network graph + + """ + return graph + + +class TopologyOptimizer(): + """Wrapper object to apply multiple TopologyOptimizationPasses sequentially + + """ + + def __init__(self, passes: List[TopologyOptimizationPass]): + self.passes = passes + + def optimize(self, graph: gs.Graph) -> Tuple[gs.Graph]: + """Applies passes sequentially + + Parameters + ---------- 
+ graph : gs.Graph + Current neural network graph + + Returns + ------- + Tuple[gs.Graph] + Modified neural network graph + + """ + for _pass in self.passes: + graph = _pass.apply(graph) + graph.cleanup().toposort() + return graph + + +class NetworkOptimizationPass(TopologyOptimizationPass): + """Pass to update the NetworkContext and Neural Network Graph in one go + """ + + def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]: + """The method to update context and graph + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + graph : gs.Graph + Current Neural Network graph + + Returns + ------- + Tuple[NetworkContext, gs.Graph]: + Updated context and graph + + """ + return ctxt, graph + + +class NetworkOptimizer(TopologyOptimizer): + """Wrapper class to run multiple NetworkOptimizationPasses sequentially + """ + + def optimize(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]: # type: ignore + """Apply passes sequentially + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + graph : gs.Graph + Current Neural Network graph + + Returns + ------- + Tuple[NetworkContext, gs.Graph]: # type: ignor + Update context and graph + + """ + for _pass in self.passes: + ctxt, graph = _pass.apply(ctxt, graph) # type: ignore + graph.cleanup().toposort() + return ctxt, graph + + +class CodeTransformationPass(): + """Pass Object to update code generation; may either modify an executionBlock's existing code snippets or add new code snippets to an executionBlock + """ + + def __init__(self): + pass + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """Apply the CodeTransformation to an ExecutionBlock + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + executionBlock : ExecutionBlock + ExecutionBlock whose code you'd like 
to transform + name : str + Graph node name of the operator being targetted + verbose : CodeGenVerbosity + Control the verbosity of code generation + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + Updated NetworkContext and ExecutionBlock + + """ + return ctxt, executionBlock + + +class CodeTransformation(): + """Wrapper object to run multiple CodeTransformations sequentially + + """ + + def __init__(self, passes: List[CodeTransformationPass]): + self.passes = passes + + def transform(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """Apply passes sequentially to a single ExecutionBlock + + Parameters + ---------- + ctxt : NetworkContext + Current NetworkContext + executionBlock : ExecutionBlock + ExecutionBlock whose code you'd like to transform + name : str + Graph node name of the operator being targetted + verbose : CodeGenVerbosity + Control the verbosity of code generation + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + Updated NetworkContext and ExecutionBlock + + """ + for _pass in self.passes: + ctxt, executionBlock = _pass.apply(ctxt, executionBlock, name, verbose) + return ctxt, executionBlock + + +class DeploymentEngine(): + """Deeploy abstraction to represent a compute engine without a complete host system, like an accelerator + + """ + + def __init__(self, + name: str, + Mapping: Dict[str, Union[ONNXLayer, Callable[[gs.Node], Any]]], + initCode: str = "", + includeList: List[str] = [""]) -> None: + """Instantiate a new engine + + Parameters + ---------- + name : str + Name of this compute engine; must be unique per deployemnt + Mapping : Dict[str, Union[ONNXLayer, Callable[[gs.Node], Any]]] + Mapping between operator names and ONNXLayer implementations + initCode : str + Static initialization code for this engine + includeList : List[str] + List of header files to be included with `#include` directives + + 
""" + self.name = name #: str: Name of this compute engine; must be unique per deployemnt + self.Mapping = Mapping #: Mapping between operator names and ONNXLayer implementations + self.initCode = initCode # str: Static initialization code for this engine + self.includeList = includeList #: List[str]: List of header files to be included with `#include` directives + + def canExecute(self, node: gs.Node) -> bool: + """Return whether this accelerator can execute an operator + + Parameters + ---------- + node : gs.Node + Operator to be checked + + Returns + ------- + bool + True if operator can be run on this Engine, False + otherwise + + """ + return node.op in self.Mapping + + +class DeploymentPlatform(): + """Deeploy abstraction for a complete system, including at least a host core capable of memory allocation + + """ + + def __init__(self, engines: List[DeploymentEngine], variableBuffer: Type[VariableBuffer], + constantBuffer: Type[ConstantBuffer], structBuffer: Type[StructBuffer], + transientBuffer: Type[TransientBuffer]) -> None: + """Initializes a new deployment platform + + Parameters + ---------- + engines : List[DeploymentEngine] + List of all available non-host engines + variableBuffer : Type[VariableBuffer] + VariableBuffer subclass with correctly set allocation and + deallocation templates + constantBuffer : Type[ConstantBuffer] + ConstantBuffer subclass with correctly set allocation and + deallocation templates + structBuffer : Type[StructBuffer] + StructBuffer subclass with correctly set allocation and + deallocation templates + transientBuffer : Type[TransientBuffer] + TransientBuffer subclass with correctly set allocation and + deallocation templates + + """ + assert len(engines) == len(set(engines)), "Duplicate engines are not allowed." 
+ self.engines = engines #: List[DeploymentEngine]: A list of all available non-host engines + self.VariableBuffer = variableBuffer + self.ConstantBuffer = constantBuffer + self.StructBuffer = structBuffer + self.TransientBuffer = transientBuffer + + +class NetworkContainer(): + """Deeploy abstraction for containing the information needed to describe a complete neural network to be deployed + + """ + + def __init__(self, + graph: gs.Graph, + platform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), + name: str = 'DeeployNetwork', + deeployStateDir: str = "DeeployState"): + """Initializes a new NetworkContainer and its NetworkContext + + Parameters + ---------- + graph : gs.Graph + Neural network graph to be deployed + platform : DeploymentPlatform + DeploymentPlatform being targetted + inputTypes : Dict[str, Type[Pointer]] + DataType for each global network input + scheduler : Callable[[gs.Graph], Schedule] + Callable that ingests the graph and returns a list of + operators to execute + name : str + Prefix to use in deployment to uniquify tensor names + deeployStateDir : str + Path to a directory to dump intermediate outputs + + + """ + self.graph = graph + self.scheduler = scheduler + self.layerBinding: 'OrderedDict[str, ONNXLayer]' = OrderedDict() + self.parsed = False + self.Platform = platform + for engine in self.Platform.engines: + engine.Mapping['Constant'] = lambda x: \ + self.ctxt.hoistConstant(x.attrs['value'], x.outputs[0].name, None) + + self.inputTypes = inputTypes + + self.ctxt = NetworkContext(variableBuffer = self.Platform.VariableBuffer, + constantBuffer = self.Platform.ConstantBuffer, + structBuffer = self.Platform.StructBuffer, + transientBuffer = self.Platform.TransientBuffer) + + self.deeployStateDir = deeployStateDir + + self.bound = False + self.transformed = False + + # Don't override this + def _createIOBindings(self, ctxt: NetworkContext, graph: 
gs.Graph): + + for node in graph.inputs: + data_name = node.name + data_size = node.shape + data_type = self.inputTypes[node.name] + nb = ctxt.VariableBuffer(data_name, data_size) + + ctxt.add(nb, 'global') + ctxt.annotateType(data_name, data_type) + + for node in graph.outputs: + data_name = node.name + data_size = node.shape + # WIESEP: The shape and type will be parsed from the graph + nb = ctxt.VariableBuffer(data_name, data_size) + ctxt.add(nb, 'global') + + return ctxt + + def inputs(self) -> List[VariableBuffer]: + """Return a list of all VariableBuffers that are also global inputs of the network + + Returns + ------- + List[VariableBuffer] + Global inputs + + """ + inputs = [] + + graphInputs = [tensor.name for tensor in self.graph.inputs] + + for key, value in self.ctxt.globalObjects.items(): + if not isinstance(value, self.ctxt.VariableBuffer) or value._users == []: + continue + if key not in graphInputs: + continue + + inputs += [value] + return inputs + + def outputs(self) -> List[VariableBuffer]: + """Return a list of all VariableBuffers that are also global outputs of the network + + Returns + ------- + List[VariableBuffer] + Global outputs + + """ + outputs = [] + + graphOutputs = [tensor.name for tensor in self.graph.outputs] + + for key, value in self.ctxt.globalObjects.items(): + + if not isinstance(value, self.ctxt.VariableBuffer): + continue + if key not in graphOutputs: + continue + + outputs += [value] + return outputs + + def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): + """Apply code transformations on every layer's execution block + + Parameters + ---------- + verbose : CodeGenVerbosity + Control code generation verbosity + + Raises + ------ + RuntimeError + Raises a RuntimeError if the entire network is not bound + + """ + if not self.bound: + raise RuntimeError('You need to bind the network before transforming code!') + + if self.transformed: + return + + for name, layer in self.layerBinding.items(): + self.ctxt = 
layer.codeTransform(self.ctxt, verbose) + self.transformed = True + + def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + for engine in self.Platform.engines: + if node.op in engine.Mapping: + return engine.Mapping[node.op](node) + raise RuntimeError(f"No mapping found for node {node.name} with op type {node.op}") + + def _bindLayers(self): + # Create schedule, binding, then parse resulting program for correctness + self.layerBinding: 'OrderedDict[str, ONNXLayer]' = OrderedDict() + + schedule = self.scheduler(self.graph) + flatSchedule = [] + + for subGraph in schedule: + if isinstance(subGraph, gs.Node): + flatSchedule.append(subGraph) + else: + flatSchedule += subGraph + + for node in flatSchedule: + layer = self._mapNode(node) + if isinstance(layer, ONNXLayer): + self.layerBinding[layer.node.name] = layer + + def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, + default_channels_first: bool) -> Tuple[NetworkContext, bool]: + + newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) + + if not parsePass: + return ctxt, False + + newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) + + if not LayerBindSuccess: + return ctxt, False + + return newCtxt, True + + # Don't override this + def parse(self, default_channels_first: bool = True) -> bool: + """Parses the full network by iteratively exploring mapping and binding options with backtracking + + Parameters + ---------- + default_channels_first : bool + Whether the default data layout is CxHxW or HxWxC + + Returns + ------- + bool + Returns a boolean to indicate whether parsing was + successful + + Raises + ------ + RuntimeError + Raises a RuntimeError if backtracking was exhausted + without finding a mapping solution + + """ + + self.ctxt = NetworkContext(variableBuffer = self.Platform.VariableBuffer, + constantBuffer = self.Platform.ConstantBuffer, + structBuffer = self.Platform.StructBuffer, + transientBuffer = self.Platform.TransientBuffer) + + self.ctxt = 
self._createIOBindings(self.ctxt, self.graph) + + self._bindLayers() + + ctxt = self.ctxt.copy() + + ctxtStack = deque() + scheduledLayerList = list(self.layerBinding.values()) + idx: int = 0 + + deepestIdx = 0 + + while (idx < len(scheduledLayerList)): + currentLayer = scheduledLayerList[idx] + + stCtxt = copy.deepcopy(ctxt) + + newCtxt, parseSuccess = self._parseNode(currentLayer, ctxt, default_channels_first) + + if parseSuccess: + + # SCHEREMO: Continue depth-first exploration + ctxtStack.append(stCtxt) + ctxt = newCtxt + idx = idx + 1 + if idx > deepestIdx: + deepestIdx = max(idx, deepestIdx) + deepestCtxt = stCtxt + + else: + # SCHEREMO: Rollback one step + + # SCHEREMO: If we can't find a mapping for the root, we must exit + if idx == 0: + raise RuntimeError( + f'Did not find adequate mapping for graph! Explored until {scheduledLayerList[deepestIdx]} Candidates: {[type(x.parser).__name__ for x in scheduledLayerList[deepestIdx].maps]}. Exhausted backtracking.' + ) + + previousLayer = scheduledLayerList[idx - 1] + ctxt = ctxtStack.pop() + + # Keep options of current layer open - the upstream mapping will change, so we don't know which options are feasible here + currentLayer.resetDiscardedMappers() + + # Update the previous layer, by discarding the current mapper or binder + if previousLayer.mapper.bindingsExhausted(): + previousLayer.discardCurrentMapper() + else: + previousLayer.mapper.discardCurrentBinder() + + idx = idx - 1 + + self.ctxt = ctxt + self.parsed = True + return True + + def bind(self) -> bool: + """Bind the entire network layer-by-layer + + Returns + ------- + bool + Return true if binding was successful + + Raises + ------ + RuntimeError + Raises a RuntimeError if the network has not been parsed + of there exists no valid binding + + """ + if not self.parsed: + raise RuntimeError('You need to parse the network before binding!') + + # SCHEREMO: Implement backtracking here! Currently tries the cheapest branch only! 
+ newCtxt = self.ctxt.copy() + + NetworkBindSuccess = True + for name, layer in self.layerBinding.items(): + + newCtxt, LayerBindSuccess = layer.bind(newCtxt) + NetworkBindSuccess = NetworkBindSuccess and LayerBindSuccess + + if not NetworkBindSuccess: + raise RuntimeError(f'Could not find a valid binding for the graph') + + self.bound = True + self.ctxt = newCtxt + + return True + + # Don't override this + def generateInferenceCode(self) -> str: + """Generate the actual inference function for the entire network + + Returns + ------- + str + The full inference method + + Raises + ------ + ValueError + Raises a RuntimeError if network is not parsed and bound + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before generating code!') + + callStack = '' + + for key, node in self.layerBinding.items(): + self.ctxt, code = node.generate(self.ctxt) + + sections = reduce(lambda a, b: a + b, code, []) + callStack += reduce(lambda a, b: a + b, sections, "") + + return callStack + + # Don't override this + def generateGlobalDefinitionCode(self) -> str: + """Generate all global definition code for inference + + Returns + ------- + str + Global Definition code + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before generating code!') + + callStack = reduce( + lambda a, b: a + b, + [obj.definition for obj in self.ctxt.globalObjects.values() if isinstance(obj, GlobalDefinition)], "") + + return callStack + + # Don't override this + def generateInferenceInitializationCode(self) -> str: + """Generate initialization code, including static memory allocation and other setup tasks + + Returns + ------- + str + Initialization code + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + """ + if not self.parsed or not self.bound: + raise 
RuntimeError('You need to parse and bind the network before generating code!') + + callStack = '' + for node in self.ctxt.localObjects.values(): + # WIESEP: We don't want to initialize the struct buffers as this should be handled by the ArgumentStructGeneration + if isinstance(node, StructBuffer): + continue + + name = node.name + node.name = self.ctxt._mangle(node.name) + callStack += node.init() + node.name = name + + return callStack + + # Don't override this + def generateIOBufferInitializationCode(self) -> str: + """Generate initialization code for global network inputs and outputs + + Returns + ------- + str + Initialization code + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before generating code!') + + callStack = '' + inputNum = 0 + outputNum = 0 + inputs = self.inputs() + outputs = self.outputs() + + for node in self.ctxt.globalObjects.values(): + if isinstance(node, VariableBuffer) and not isinstance(node, (StructBuffer, ConstantBuffer)): + assert issubclass(node._type, Pointer), f"IO Buffer {node.name} is not a Pointer!" 
+ if node._deploy: + name = node.name + node.name = self.ctxt._mangle(node.name) + callStack += "extern " + node.init() + # SCHEREMO: Borderline hacky, but on the okay side of things, I think + callStack += "static const uint32_t " + node.name + "_len" + " = " + str(np.prod(node.shape)) + ";" + node.name = name + + callStack += "static const uint32_t " + self.ctxt._mangle("num_inputs") + f" = {len(inputs)};" + callStack += "static const uint32_t " + self.ctxt._mangle("num_outputs") + f" = {len(outputs)};" + + callStack += "extern void* " + self.ctxt._mangle("inputs") + f"[{len(inputs)}];" + callStack += "extern void* " + self.ctxt._mangle("outputs") + f"[{len(outputs)}];" + + callStack += "static const uint32_t " + self.ctxt._mangle("inputs_bytes") + f"[{len(inputs)}] = " + "{" + + numBytes = [] + for node in inputs: + numBytes.append(str(np.prod(node.shape) * node._type.referencedType.typeWidth // 8)) + callStack += ", ".join(numBytes) + + callStack += "};" + + callStack += "static const uint32_t " + self.ctxt._mangle("outputs_bytes") + f"[{len(outputs)}] = " + "{" + + numBytes = [] + for node in outputs: + numBytes.append(str(np.prod(node.shape) * node._type.referencedType.typeWidth // 8)) + callStack += ", ".join(numBytes) + + callStack += "};" + + return callStack + + @property + def worstCaseBufferSize(self): + """Return the worst-case buffer size occupied by the network implementaiton + """ + # WIESEP: There is no reasonable value for a worst case buffer size without tiling + raise NotImplementedError("Worst case buffer size is not known or not implemented!") + + # Don't override this + def generateBufferInitializationCode(self) -> str: + """Generates code for all forward-declaration of buffers used during inference + + Returns + ------- + str + Returns forward-declaration code + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse 
and bind the network before generating code!') + + ctxt = self.ctxt.copy() + + inputs = self.inputs() + outputs = self.outputs() + + callStack = '' + for node in ctxt.globalObjects.values(): + if isinstance(node, VariableBuffer) and not isinstance(node, StructBuffer): + assert issubclass(node._type, Pointer), f"Global VariableBuffer {node.name} is not a Pointer!" + if node._deploy: + name = node.name + node.name = ctxt._mangle(node.name) + callStack += node.init() + node.name = name + + for node in ctxt.globalObjects.values(): + if isinstance(node, StructBuffer): + name = node.name + node.name = ctxt._mangle(node.name) + callStack += node.init() + node.name = name + + callStack += "void* " + ctxt._mangle("inputs") + f"[{len(inputs)}];" + callStack += "void* " + ctxt._mangle("outputs") + f"[{len(outputs)}];" + + return callStack + + def generateBufferAllocationCode(self) -> str: + """Generates code to allocate space for the global input and output buffer of the network + + Returns + ------- + str + Allocation code for global IO buffers + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before generating code!') + + ctxt = self.ctxt.copy() + + inputs = self.inputs() + outputs = self.outputs() + callStack = '' + + for node in ctxt.globalObjects.values(): + if isinstance(node, VariableBuffer) and not isinstance(node, StructBuffer): + assert issubclass(node._type, Pointer), f"Global VariableBuffer {node.name} is not a Pointer!" 
+ if node._deploy: + name = node.name + node.name = ctxt._mangle(node.name) + callStack += node.alloc() + node.name = name + + for node in ctxt.globalObjects.values(): + if isinstance(node, StructBuffer): + + if node._deploy: + name = node.name + node.name = ctxt._mangle(node.name) + callStack += node.alloc() + node.name = name + + for idx, i in enumerate(inputs): + callStack += ctxt._mangle("inputs") + f"[{idx}] = (void*) {ctxt._mangle(i.name)};" + for idx, i in enumerate(outputs): + callStack += ctxt._mangle("outputs") + f"[{idx}] = (void*) {ctxt._mangle(i.name)};" + + return callStack + + # Don't override this + def generateBufferDeAllocationCode(self) -> str: + """Generates code to deallocate all global buffers + + Returns + ------- + str + Code to deallocate buffers + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before generating code!') + + callStack = '' + for node in self.ctxt.globalObjects.values(): + if node._deploy: + node.name = self.ctxt._mangle(node.name) + callStack += node.dealloc() + + return callStack + + def generateIncludeString(self) -> str: + """Generate code to include platform-dependent includes + + Returns + ------- + str + Include code + + """ + includeStr = [] + for engine in self.Platform.engines: + for include in engine.includeList: + includeStr += ["#include \"" + include + "\""] + return ("\n").join(includeStr) + + def generateEngineInitializationCode(self) -> str: + """Generate initialization code for all compute engines + + Returns + ------- + str + Initialization code for all engines + + """ + return ("\n").join([engine.initCode for engine in self.Platform.engines]) + + # Don't override this - Returns parameter size in bytes + def getParameterSize(self) -> int: + """Return the BYTE size of all static network parameters (weights, biases, parameters,...) 
+ + Returns + ------- + int + Size of all network parameters + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before getting RAM Size!') + + size = 0 + for _buffer in self.ctxt.globalObjects.values(): + # We do not count structs for now, since they are not properly modeled + if isinstance(_buffer, ConstantBuffer) and _buffer._deploy: + size += int((np.prod(_buffer.shape) * _buffer._type.typeWidth // 8)) + + return size + + # Don't override this - Returns worst case layer and buffering size in bytes + def getTotalSize(self) -> int: + """Returns total size of the network, consisting of all parameters and intermediate buffer size + + Returns + ------- + int + Total network size + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before getting RAM Size!') + + return self.getParameterSize() + self.worstCaseBufferSize + + def numberOfOps(self, verbose: bool) -> int: + """Returns the total number of operations per network inference + + Parameters + ---------- + verbose : bool + Control whether the number of operations are printed to + STDOUT for each operator + + Returns + ------- + int + Number of operations (1 MAC = 2 Ops) per network inference + + Raises + ------ + RuntimeError + Raises a RuntimeError if network is not parsed and bound + + + """ + if not self.parsed or not self.bound: + raise RuntimeError('You need to parse and bind the network before getting number of operations!') + totalSum = 0 + for i in self.layerBinding.values(): + nodeOps = i.mapper.parser.operatorRepresentation['nodeOps'] + totalSum += nodeOps + if verbose: + print("Layer " + str(i.node.name) + str("\nNumber of operations: \t\t") + str("%12s\n" % nodeOps)) + return totalSum + + # Don't 
override this + def _exportGraph(self, folderPath, fileName): + relativeDataPath = os.path.join(folderPath, fileName + _dataExtension) + absoluteDataPath = os.path.abspath(relativeDataPath) + relativeOnnxPath = os.path.join(folderPath, fileName + _graphExtension) + absoluteOnnxPath = os.path.abspath(relativeOnnxPath) + + if not os.path.isabs(absoluteOnnxPath) or not os.path.isabs(absoluteDataPath): + raise OSError(f"Error exporting the context to: {absoluteOnnxPath}") + + model = gs.export_onnx(self.graph) + + # Annotate additional information in doc_string of tensors + for tensor in (list(model.graph.value_info) + list(model.graph.output) + list(model.graph.input) + + list(model.graph.initializer)): + if tensor.name in self.ctxt.localObjects: + lObject = self.ctxt.localObjects[tensor.name] + tensor.doc_string += f"Biased: {lObject._signed}, " + tensor.doc_string += f"nLevels: {lObject.nLevels}, " + tensor.doc_string += f"Deeploy: {lObject._deploy}, " + if not isinstance(lObject, ConstantBuffer) and hasattr(lObject, "_type"): + tensor.doc_string += f"Type: {lObject._type.typeName}, " + if hasattr(lObject._type, "referencedType"): + tensor.doc_string += f"Reference Type: {lObject._type.referencedType.typeName}" + elif tensor.name in self.ctxt.globalObjects: + gObject = self.ctxt.globalObjects[tensor.name] + tensor.doc_string += f"Biased: {gObject._signed}, " + tensor.doc_string += f"nLevels: {gObject.nLevels}, " + tensor.doc_string += f"Deeploy: {gObject._deploy}, " + if not isinstance(gObject, ConstantBuffer) and hasattr(gObject, "_type"): + tensor.doc_string += f"Type: {gObject._type.typeName}, " + if hasattr(gObject._type, "referencedType"): + tensor.doc_string += f"Reference Type: {gObject._type.referencedType.typeName}" + + convert_model_to_external_data(model, location = fileName + _dataExtension) + onnx.save(model, absoluteOnnxPath) + + def exportDeeployState(self, folderPath: str, fileName: str): + """Export compressed network context and neural network 
graph + + Parameters + ---------- + folderPath : str + path to directory where to save context and graph + fileName : str + prefix to use when saving artifacts + + """ + + os.makedirs(os.path.abspath(folderPath), exist_ok = True) + self._exportGraph(folderPath, fileName) + self.ctxt.exportNetworkContext(folderPath, fileName) + + @staticmethod + def _importONNXGraph(folderPath: str, fileName: str) -> gs.Graph: + relativePath = os.path.join(folderPath, fileName + _graphExtension) + absolutePath = os.path.abspath(relativePath) + + if not os.path.isabs(absolutePath) or not os.path.exists(absolutePath): + raise OSError(f"File or path does not exist: {absolutePath}") + + onnx_graph = onnx.load_model(absolutePath) + return gs.import_onnx(onnx_graph) + + def importDeeployState(self, folderPath: str, fileName: str): + """Override this container's graph and context with loaded compressed artifacts + + Parameters + ---------- + folderPath : str + Path to the artifact directory + fileName : str + prefix of the saved artifacts + + """ + self.graph = NetworkDeployer._importONNXGraph(folderPath, f"{fileName}") + self.ctxt = NetworkContext.importNetworkCtxt(folderPath, f"{fileName}") + + +class NetworkDeployer(NetworkContainer): + """Deeploy abstraction to contain an entire network and all necessary information to deploy it + """ + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), + name: str = 'DeeployNetwork', + default_channels_first: bool = True, + deeployStateDir: str = "DeeployState"): + """Initialize a new NetworkDeployer + + Parameters + ---------- + graph : gs.Graph + The raw neural network graph to be deployed, e.g. 
an output + from Quantlib + deploymentPlatform : DeploymentPlatform + The target deployment platform + inputTypes : Dict[str, Type[Pointer]] + A mapping of global network inputs to Deeploy datatypes + loweringOptimizer : TopologyOptimizer + A topology optimizer used to transform the network into a + representation that can be mapped to NodeMappers + scheduler : Callable[[gs.Graph], Schedule] + Method to topologically sort the graph into the order of + execution + name : str + Prefix to avoid name conflicts between Deeploy code and other + code + default_channels_first : bool + Whether data layout is CxHxW, i.e. channels are first, or + HxWxC, i.e. channels are last + deeployStateDir : str + Directory where intermediate states are saved + + + """ + super().__init__(graph, deploymentPlatform, inputTypes, scheduler, name, deeployStateDir = deeployStateDir) + + self.loweringOptimizer = loweringOptimizer + self.default_channels_first = default_channels_first + + self.prepared = False + + # Don't override this + def lower(self, graph: gs.Graph) -> gs.Graph: + """Apply the lowering optimize + + Parameters + ---------- + graph : gs.Graph + Unmodified input neural network graph + + Returns + ------- + gs.Graph + Neural network graph that is deployable with the + DeploymentPlatform's Mapping + + """ + return self.loweringOptimizer.optimize(graph) + + # Don't override this + # Duplicate constants with multiple users + def _duplicateConstants(self, graph: gs.Graph): + idx = 0 + for node in self.graph.nodes: + for i, inputNode in enumerate(node.inputs): + if type(inputNode) == gs.ir.tensor.Constant and len(inputNode.outputs) > 1: + newConst = gs.Constant(name = f"{inputNode.name}_EXTRACT_CONST_{idx}", values = inputNode.values) + node.inputs[i] = newConst + # graph.nodes.append(newConst) + idx += 1 + + # Don't override this + # Duplicate constants with multiple users + def _removeEmptyInputs(self, graph: gs.Graph): + _inps = self.graph.inputs.copy() + for inp in _inps: + if 
np.prod(inp.shape) == 0: + self.graph.inputs.remove(inp) + + def frontEnd(self): + """API hook to prepare the graph to be deployed and build the initial NetworkContext + + """ + # Rename graph inputs and outputs: + for idx, inputNode in enumerate(self.graph.inputs): + inputNode.name = "input_" + str(idx) + for idx, outputNode in enumerate(self.graph.outputs): + outputNode.name = "output_" + str(idx) + + self._removeEmptyInputs(self.graph) + + self._duplicateConstants(self.graph) + + self.exportDeeployState(self.deeployStateDir, _middlewarePreLoweringFilename) + + self.graph = self.lower(self.graph) # This lowers the graph to a deployable format + + self.exportDeeployState(self.deeployStateDir, _middlewarePostLoweringFilename) + + try: + self.parse(self.default_channels_first) # This reparses the lowered graph + except Exception as e: + print("Error during parsing! Exporting deeploy state!") + self.exportDeeployState(self.deeployStateDir, _backendPostBindingFilename) + raise e + + # Don't Override this + def midEnd(self): + """API hook to be used after finalizing kernel selection; hoist transient buffers, and perform low-level code optimizations (e.g. tiling and static memory allocation) + """ + try: + self.bind() + except Exception as e: + print("Error during binding! Exporting deeploy state!") + self.exportDeeployState(self.deeployStateDir, _backendPostBindingFilename) + raise e + + # Don't override this unless you know what you are doin + def backEnd(self, verbose: CodeGenVerbosity = _NoVerbosity): + """API hook to generate code once kernel implementations are picked and tiling, memory allocation, and other low-level optimizations have been done. 
+ + Parameters + ---------- + verbose : CodeGenVerbosity + Control verbosity of generated code + + """ + + self.exportDeeployState(self.deeployStateDir, _backendPostParsingFilename) + + self.codeTransform(verbose) + + self.exportDeeployState(self.deeployStateDir, _backendPostBindingFilename) + + # Don't override this + def prepare(self, verbose: CodeGenVerbosity = _NoVerbosity): + """API hook to perform the entire deployment process to the point where generated code may be extracted + + Parameters + ---------- + verbose : CodeGenVerbosity + Control verbosity of generated code + + """ + self.frontEnd() + self.midEnd() + self.backEnd(verbose = verbose) + self.prepared = True + + def generateFunction(self, verbose: CodeGenVerbosity = _NoVerbosity) -> str: + """Helper function to prepare deployment and return generated function code + + """ + if not self.prepared: + self.prepare(verbose = verbose) + + return self.generateInferenceCode() diff --git a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py new file mode 100644 index 0000000..d08978f --- /dev/null +++ b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py @@ -0,0 +1,83 @@ +# ---------------------------------------------------------------------- +# +# File: EngineColoringDeployer.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, Type, Union

import onnx_graphsurgeon as gs

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper
from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, Schedule, TopologyOptimizer
from Deeploy.EngineExtension.OptimizationPasses.TopologyOptimizationPasses.EngineColoringPasses import \
    EngineColoringPass, EngineMapper


class EngineColoringDeployer(NetworkDeployer):
    """NetworkDeployer that assigns ("colors") every graph node to a deployment engine.

    Coloring happens during lowering: an EngineColoringPass runs before and
    after the regular lowering passes, and node-to-layer mapping dispatches to
    the per-engine mapping chosen by each node's "engine" attribute.
    """

    def __init__(self,
                 graph: gs.Graph,
                 deploymentPlatform: DeploymentPlatform,
                 inputTypes: Dict[str, Type[Pointer]],
                 loweringOptimizer: TopologyOptimizer,
                 scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes),
                 name: str = 'DeeployNetwork',
                 default_channels_first: bool = True,
                 deeployStateDir: str = "DeeployState",
                 engineMapperCls: Type[EngineMapper] = EngineMapper):
        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
                         default_channels_first, deeployStateDir)
        self._initEngineColoringDeployer(engineMapperCls)

    def _initEngineColoringDeployer(self, engineMapperCls: Type[EngineMapper]):
        # Index the platform's engines by name, then sandwich the existing
        # lowering passes between two coloring passes: nodes are colored before
        # lowering, and any nodes introduced by lowering are colored afterwards.
        self.engineDict = {engine.name: engine for engine in self.Platform.engines}
        coloringPass = EngineColoringPass(engineMapperCls(self.engineDict))
        self.loweringOptimizer.passes = [coloringPass, *self.loweringOptimizer.passes, coloringPass]

    def lower(self, graph: gs.Graph) -> gs.Graph:
        """Lower the graph and verify that every node received an engine color."""
        loweredGraph = super().lower(graph)
        uncoloredNodes = [node.name for node in loweredGraph.nodes if "engine" not in node.attrs]
        assert len(uncoloredNodes) == 0, f"Missing engine color for nodes {uncoloredNodes}"
        return loweredGraph

    def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]:
        """Map a node to a layer via the mapping of its assigned engine."""
        assert "engine" in node.attrs, f"Node {node.name} doesn't have an engine color."
        engineName = node.attrs["engine"]
        assert isinstance(engineName, str) and engineName in self.engineDict, \
            f"Node {node.name} has an invalid engine {engineName} assigned."
        engine = self.engineDict[engineName]
        assert node.op in engine.Mapping, f"No mapping found for {node.op} in engine {engine.name}"
        return engine.Mapping[node.op](node)


class EngineColoringDeployerWrapper(EngineColoringDeployer, NetworkDeployerWrapper):
    """Wrapper variant that retrofits engine coloring onto an existing deployer instance."""

    def __init__(self, deployer: NetworkDeployer, engineMapperCls: Type[EngineMapper] = EngineMapper) -> None:
        NetworkDeployerWrapper.__init__(self, deployer)
        self._initEngineColoringDeployer(engineMapperCls)
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py new file mode 100644 index 0000000..4c3bc1f --- /dev/null +++ b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py @@ -0,0 +1,75 @@ +# ---------------------------------------------------------------------- +# +# File: EngineColoringPasses.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, Optional

import onnx_graphsurgeon as gs

from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match, SubgraphMatcher
from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic
from Deeploy.DeeployTypes import DeploymentEngine, TopologyOptimizationPass


class EngineMapper:
    """Strategy object that decides which deployment engine executes a node."""

    def __init__(self, engineDict: Dict[str, DeploymentEngine]) -> None:
        self.engineDict = engineDict

    # Override for a different allocation strategy
    def mapNodeToEngine(self, node: gs.Node, graph: gs.Graph) -> Optional[DeploymentEngine]:
        """Return the first engine that can execute `node`, or None if no engine matches.

        The `graph` argument is unused by this first-fit strategy but kept so
        subclasses may implement graph-aware allocation.
        """
        _ = graph
        for engine in self.engineDict.values():
            if engine.canExecute(node):
                return engine
        return None


class EngineColoringPass(TopologyOptimizationPass):
    """Topology pass that annotates every not-yet-colored node with an engine name."""

    def __init__(self, engineMapper: EngineMapper):
        super().__init__()
        self.engineMapper = engineMapper

    # Fix: the return annotation was `Tuple[gs.Graph]`, but the method returns
    # the graph itself, matching the TopologyOptimizationPass.apply contract.
    def apply(self, graph: gs.Graph) -> gs.Graph:
        # Only uncolored nodes are considered, so re-running the pass after
        # lowering colors newly-introduced nodes without touching existing colors.
        for node in filter(lambda node: "engine" not in node.attrs, graph.nodes):
            engine = self.engineMapper.mapNodeToEngine(node, graph)
            if engine is not None:
                node.attrs["engine"] = engine.name
        return graph


def _engine_discoloration_fun(graph: gs.Graph, match: Match, name: str):
    """Remove the "engine" color attribute from every colored node in `match`."""
    _ = name
    colored_matched_nodes = filter(lambda node: "engine" in node.attrs, match.nodes_map.values())
    for node in colored_matched_nodes:
        del node.attrs["engine"]
    return graph


@contextagnostic
class EngineDiscolorationPass(ReplaceSequentialPatternPass):
    """Pass that strips engine colors from all nodes matching `pattern`."""

    def __init__(self, pattern: gs.Graph, name: str, matcher: Optional[SubgraphMatcher] = None, **kwargs):
        super().__init__(pattern, _engine_discoloration_fun, name, matcher, **kwargs)
b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/EngineExtension/OptimizationPasses/__init__.py b/Deeploy/EngineExtension/OptimizationPasses/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/EngineExtension/OptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/EngineExtension/__init__.py b/Deeploy/EngineExtension/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/EngineExtension/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/Deeploy/FutureExtension/Bindings/AutoFutureBinding.py b/Deeploy/FutureExtension/Bindings/AutoFutureBinding.py new file mode 100644 index 0000000..6e9d295 --- /dev/null +++ b/Deeploy/FutureExtension/Bindings/AutoFutureBinding.py @@ -0,0 +1,82 @@ +# ---------------------------------------------------------------------- +# +# File: PULPDMAFutureBinding.py +# +# Last edited: 08.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Optional

from Deeploy.DeeployTypes import CodeTransformation, NetworkContext, NodeTemplate, NodeTypeChecker
from Deeploy.FutureExtension.Bindings.FutureBinding import FutureBinding
from Deeploy.FutureExtension.Future import Future


class AutoFutureBinding(FutureBinding):
    """FutureBinding that deduces its state reference automatically.

    At construction time the checker's output types are scanned for Future
    subclasses; at code-transform time the unique context buffer whose instance
    matches that Future's stateReferenceType is assigned as the state reference.
    """

    def __init__(self,
                 typeChecker: NodeTypeChecker,
                 template: NodeTemplate,
                 codeTransformer: CodeTransformation,
                 stateReferenceType: Optional = None):
        # NOTE(review): stateReferenceType is accepted but never stored or
        # forwarded; the effective type is derived from the single Future
        # output below — confirm the parameter is intentional.
        super().__init__(typeChecker, template, codeTransformer)

        # Indices of output types that are Future subclasses.
        futureOutputs = [idx for idx, output in enumerate(self.typeChecker.output_types) if issubclass(output, Future)]

        if len(futureOutputs) > 1:
            raise Exception(f"{self} assigns more than one future output!")

        if len(futureOutputs) == 1:
            # Adopt the state-reference type declared by the (single) Future output.
            self.stateReferenceType = self.typeChecker.output_types[futureOutputs[0]].stateReferenceType

        self.futureOutputs = futureOutputs

    def assignStateReferenceElement(self, ctxt) -> NetworkContext:
        """Find the unique state element for this binding's Future output and assign it.

        Scans every code snippet's operator representation for context buffers
        whose instance matches stateReferenceType; raises unless exactly one
        candidate is found per snippet.
        """

        if len(self.futureOutputs) > 1:
            raise Exception(f"{self} assigns more than one future output!")

        # No Future output: nothing to assign.
        if len(self.futureOutputs) == 0:
            return ctxt

        for codeSnippet in self.executionBlock.codeSnippets:
            operatorRepresentation = codeSnippet.operatorRepresentation

            # Collect all referenced buffers whose instance matches the expected
            # state-reference type (deduplicated).
            stateElementCandidates = []
            for key, value in operatorRepresentation.items():
                if type(value) == str and (ctxt.is_local(value) or ctxt.is_global(value)):
                    reference = ctxt.lookup(value)
                    if isinstance(reference._instance,
                                  self.stateReferenceType) and reference not in stateElementCandidates:
                        stateElementCandidates.append(reference)

            if len(stateElementCandidates) == 1:
                print(f"WARNING: Automagically assigning state Element of {self}")
                # Assign the candidate to every Future-typed buffer referenced by
                # this snippet that does not yet carry a state reference.
                for key, value in operatorRepresentation.items():
                    if type(value) == str and (ctxt.is_local(value) or ctxt.is_global(value)):
                        reference = ctxt.lookup(value)
                        if issubclass(reference._type, Future) and not hasattr(reference._instance, "stateReference"):
                            reference._instance.assignStateReference(stateElementCandidates[0], ctxt)

            else:
                # NOTE(review): this also raises when a snippet has zero
                # candidates — confirm zero-candidate snippets are truly an
                # error rather than a skip.
                raise Exception(f"Can't assign a unique state element to {self} automagically!")

        return ctxt
from typing import Optional

from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformation, NetworkContext, NodeBinding, NodeTemplate, \
    NodeTypeChecker, _NoVerbosity


class FutureBinding(NodeBinding):
    """Node binding that carries an optional state reference for Future-typed outputs.

    Subclasses override assignStateReferenceElement to decide which state
    element backs the binding; the base implementation is a no-op, and the
    assignment runs just before the regular code transformations.
    """

    def __init__(self,
                 typeChecker: NodeTypeChecker,
                 template: NodeTemplate,
                 codeTransformer: CodeTransformation,
                 stateReference: Optional = None):
        super().__init__(typeChecker, template, codeTransformer)
        self.stateReference = stateReference

    def assignStateReferenceElement(self, ctxt: NetworkContext) -> NetworkContext:
        # Default: nothing to assign; subclasses hook in here.
        return ctxt

    def codeTransform(self, ctxt: NetworkContext, verbose: CodeGenVerbosity = _NoVerbosity) -> NetworkContext:
        # Resolve the state reference first, then run the inherited transforms
        # on the updated context.
        return super().codeTransform(self.assignStateReferenceElement(ctxt), verbose)
import * diff --git a/Deeploy/FutureExtension/CodeTransformationPasses/FutureCodeTransformation.py b/Deeploy/FutureExtension/CodeTransformationPasses/FutureCodeTransformation.py new file mode 100644 index 0000000..0bce6bc --- /dev/null +++ b/Deeploy/FutureExtension/CodeTransformationPasses/FutureCodeTransformation.py @@ -0,0 +1,97 @@ +# ---------------------------------------------------------------------- +# +# File: Future.py +# +# Last edited: 12.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import List, Tuple

from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \
    IntrospectiveCodeTransformationMixIn
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
from Deeploy.FutureExtension.Future import Future


class FutureGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
    """Code transformation that inserts dispatch and resolve snippets for Future-typed buffers."""

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
        # Producers dispatch their futures first; resolve checks are then added
        # for consumers (and for network outputs).
        ctxt, executionBlock = self._dispatchFutures(ctxt, executionBlock, name)
        ctxt, executionBlock = self._resolveFutures(ctxt, executionBlock, name)
        return ctxt, executionBlock

    def _extractFutureArgs(self, ctxt: NetworkContext, executionBlock: ExecutionBlock) -> List[str]:
        """Collect the deployed Future-typed buffer references used by this block.

        NOTE(review): despite the List[str] annotation this returns the buffer
        references themselves, not their names — the callers below rely on that.
        """
        futures = []
        dynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True)
        references = [ctxt.lookup(key) for key in dynamicReferences]
        futureReferences = [ref for ref in references if issubclass(ref._type, Future)]

        for reference in futureReferences:
            # NOTE(review): this raises even when reference._deploy is False —
            # confirm non-deployed futures must also carry a state element.
            if not hasattr(reference._instance, "stateReference"):
                raise Exception(f"Buffer {reference} is a Future type but has no state element!")
            if reference._deploy:
                futures.append(reference)

        return futures

    def _resolveFutures(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
                        name: str) -> Tuple[NetworkContext, ExecutionBlock]:
        """Append/prepend resolve-check snippets and register `name` as a user of each state element."""

        futures = self._extractFutureArgs(ctxt, executionBlock)
        for reference in futures:
            stateReference = reference._instance.stateReference.name
            # SCHEREMO: Late resolve if we are in the output of the network
            if reference._users == []:
                executionBlock.addRight(reference._type.resolveCheckTemplate, {
                    **reference._bufferRepresentation(),
                    **reference._instance._bufferRepresentation()
                })
                if name not in ctxt.lookup(stateReference)._users:
                    ctxt.lookup(stateReference)._users.append(name)

            # SCHEREMO: Early resolve if we are the first user - otherwise it has been resolved already (in a static scheduler)!
            elif name == reference._users[0]:
                executionBlock.addLeft(reference._type.resolveCheckTemplate, {
                    **reference._bufferRepresentation(),
                    **reference._instance._bufferRepresentation()
                })
                if name not in ctxt.lookup(stateReference)._users:
                    ctxt.lookup(stateReference)._users.append(name)
        return ctxt, executionBlock

    def _dispatchFutures(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
                         name: str) -> Tuple[NetworkContext, ExecutionBlock]:
        """Prepend dispatch-check snippets for futures this block produces."""
        futures = self._extractFutureArgs(ctxt, executionBlock)
        for reference in futures:
            stateReference = reference._instance.stateReference.name
            # Dispatch iff we are not a user, i.e. we are the producer
            if name not in reference._users:
                executionBlock.addLeft(reference._type.dispatchCheckTemplate, {
                    **reference._bufferRepresentation(),
                    **reference._instance._bufferRepresentation()
                })
                if name not in ctxt.lookup(stateReference)._users:
                    ctxt.lookup(stateReference)._users.append(name)
        return ctxt, executionBlock
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/FutureExtension/Future.py b/Deeploy/FutureExtension/Future.py new file mode 100644 index 0000000..7e4969d --- /dev/null +++ b/Deeploy/FutureExtension/Future.py @@ -0,0 +1,67 @@ +# ---------------------------------------------------------------------- +# +# File: Future.py +# +# Last edited: 07.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Optional, Type

from Deeploy.AbstractDataTypes import BaseType, Pointer
from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, StructBuffer


class Future(Pointer):
    """Pointer whose pointee only becomes valid once an associated state element resolves.

    Concrete subclasses are produced by FutureClass and carry the code
    templates used to dispatch and resolve the future at runtime.
    """

    __slots__ = ['stateReference']
    stateReferenceType: Type[Pointer]  # pointer type of the backing state element
    resolveCheckTemplate: NodeTemplate  # snippet emitted to wait on / resolve the future
    dispatchCheckTemplate: NodeTemplate  # snippet emitted to dispatch the future

    def assignStateReference(self, stateReference: StructBuffer, ctxt: Optional[NetworkContext] = None):
        """Bind `stateReference` as this future's state element.

        Raises if the buffer's struct does not promote to stateReferenceType.
        """
        if self.stateReferenceType.checkPromotion(stateReference.structDict, ctxt):  # type: ignore
            self.stateReference = stateReference
        else:
            raise Exception(f"Can't assign {stateReference} to {self}!")

    def _bufferRepresentation(self):
        # Template mapping: expose the state element's name to code templates.
        return {"stateReference": self.stateReference.name}


def FutureClass(underlyingType: BaseType, stateReferenceType: Type[Pointer], resolveCheckTemplate: NodeTemplate,
                dispatchCheckTemplate: NodeTemplate) -> Type[Future]:
    """Create (or fetch the cached) Future subclass bound to `stateReferenceType`.

    Classes are memoized in this module's globals so repeated calls return the
    same type object.
    """

    typeName = stateReferenceType.typeName + "Future"

    # NOTE(review): the cache key ignores underlyingType, so two calls with the
    # same stateReferenceType but different underlying types silently share the
    # first-created class — confirm this is intended.
    if typeName in globals():
        return globals()[typeName]

    retCls = type(
        typeName, (Future,), {
            "typeName": underlyingType.typeName + "*",
            "typeWidth": 32,  # pointer width in bits; assumes a 32-bit target — TODO confirm
            "referencedType": underlyingType,
            "stateReferenceType": stateReferenceType,
            "resolveCheckTemplate": resolveCheckTemplate,
            "dispatchCheckTemplate": dispatchCheckTemplate
        })
    globals()[typeName] = retCls

    return retCls
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/MemoryLevelExtension/MemoryLevels.py b/Deeploy/MemoryLevelExtension/MemoryLevels.py new file mode 100644 index 0000000..06b9d47 --- /dev/null +++ b/Deeploy/MemoryLevelExtension/MemoryLevels.py @@ -0,0 +1,187 @@ +# ---------------------------------------------------------------------- +# +# File: MemoryLevel.py +# +# Last edited: 04.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Optional, Sequence, Tuple

import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import CodeTransformation, NetworkContext, NodeBinding, NodeTemplate, NodeTypeChecker, \
    OperatorRepresentation


class MemoryLevel():
    """One level of a memory hierarchy: a named node with neighbours and a byte capacity."""

    def __init__(self, name: str, neighbourNames: List[str], size: int = 0):
        self.name = name
        self.neighbourNames = neighbourNames
        self.size = size  # By convention the size is in Bytes

        if self.size < 0:
            raise ValueError(
                f'Error while assigning a Memory Size to {self.name} Memory Level: Memory Size cannot be negative')

        if self.name in self.neighbourNames:
            raise ValueError(f'Node {self.name} cannot be a neighbour of itself')

    def __eq__(self, other):
        # Robustness fix: comparing against a non-MemoryLevel used to raise
        # AttributeError; defer to the other operand instead (consistent with
        # MemoryHierarchy.__eq__ below).
        if not isinstance(other, MemoryLevel):
            return NotImplemented

        # Equal iff names and sizes match and the neighbour sets are mutually contained.
        ret = [neighbour_name in other.neighbourNames for neighbour_name in self.neighbourNames]
        ret += [neighbour_name in self.neighbourNames for neighbour_name in other.neighbourNames]
        ret += [self.name == other.name, self.size == other.size]
        return all(ret)


class MemoryHierarchy():

    def __init__(self, node_list: List[MemoryLevel]):
        '''Effectively build the MemoryHierarchy from a list of MemoryLevels and check the validity of the hierarchy'''
        self.memoryLevels: Dict[str, MemoryLevel] = {}
        self._defaultMemoryLevel: Optional[MemoryLevel] = None

        for node in node_list:
            self._add(node)

        self._check()

    def __eq__(self, other):
        if not isinstance(other, MemoryHierarchy):
            return False

        if not other.memoryLevels.keys() == self.memoryLevels.keys():
            return False

        for memory_level_name in self.memoryLevels.keys():
            if not self.memoryLevels[memory_level_name] == other.memoryLevels[memory_level_name]:
                return False

        return True

    def _add(self, new_node: MemoryLevel):
        '''Register a new level under its name; duplicate names are rejected.'''
        if new_node.name in self.memoryLevels.keys():
            raise ValueError(f'Node {new_node.name} already exists in MemoryHierarchy')

        self.memoryLevels[new_node.name] = new_node

    def _check(self):
        '''Check if the memory hierarchy is a valid undirected graph (i.e. every node point at a valid neighbour)'''
        for node_name, node in self.memoryLevels.items():
            violatingNodes = [
                neighbourName for neighbourName in node.neighbourNames if neighbourName not in self.memoryLevels.keys()
            ]
            assert len(violatingNodes) == 0, \
                f'Invalid Memory Hierarchy graph, node {node.name} point to non-existing neighbour(s) {violatingNodes}'

    def bfs(self, start: str, target: str) -> List[str]:
        """Return a shortest path of level names from `start` to `target`, or [] if unreachable."""

        # Perf fix: use a set for visited membership tests (was a list, O(n) per test).
        visited = {start}

        queue = [[start]]
        queueIdx = 0

        if start == target:
            return queue[0]

        while queueIdx < len(queue):
            currentPath = queue[queueIdx]
            neighbours = self.memoryLevels[currentPath[-1]].neighbourNames

            if target in neighbours:
                currentPath.append(target)
                return currentPath

            for nextNode in neighbours:
                if nextNode not in visited:
                    queue.append(currentPath + [nextNode])
                    visited.add(nextNode)
            queueIdx += 1

        return []

    def setDefaultMemoryLevel(self, name: str):
        """Mark the level called `name` as the hierarchy's default."""
        assert (name in self.memoryLevels), f"Node {name} not in MemoryHierarchy"
        self._defaultMemoryLevel = self.memoryLevels[name]

    def getDefaultMemoryLevel(self):
        """Return the default level; raises if setDefaultMemoryLevel was never called."""
        if self._defaultMemoryLevel is None:
            raise ValueError('defaultMemoryLevel level not set!')
        return self._defaultMemoryLevel


class NodeMemoryLevelChecker():
    """Checks that a node's tensors are annotated with the expected memory levels (None = wildcard)."""

    def __init__(self, inputMemoryLevels: Sequence[Optional[str]], outputMemoryLevels: Sequence[Optional[str]]):
        self.inputMemoryLevels = inputMemoryLevels
        self.outputMemoryLevels = outputMemoryLevels

    # Annotation fix: a None expectation is explicitly handled as a wildcard.
    def _memEq(self, memoryLevel: Optional[str], annotatedMemoryLevel: str) -> bool:
        if memoryLevel is None:
            return True
        return memoryLevel == annotatedMemoryLevel

    def _checkMemoryLevels(self, ctxt: NetworkContext, memoryLevels: Sequence[Optional[str]],
                           tensors: Sequence[gs.Tensor]) -> bool:
        buffers = [ctxt.lookup(tensor.name) for tensor in tensors]
        # Every buffer must carry a memory-level annotation to be checkable.
        if not all(hasattr(buffer, "_memoryLevel") for buffer in buffers):
            return False

        annotatedMemoryLevels = [buffer._memoryLevel for buffer in buffers]
        return all(
            self._memEq(memoryLevel, annotatedMemoryLevel)
            for memoryLevel, annotatedMemoryLevel in zip(memoryLevels, annotatedMemoryLevels))

    def check(self, ctxt: NetworkContext, node: gs.Node, operatorRepresentation) -> Tuple[NetworkContext, bool]:
        """Return (ctxt, True) iff all inputs and outputs match the expected levels."""
        if self._checkMemoryLevels(ctxt, self.inputMemoryLevels, node.inputs) and self._checkMemoryLevels(
                ctxt, self.outputMemoryLevels, node.outputs):
            return ctxt, True
        return ctxt, False


class MemoryAwareNodeBinding(NodeBinding):
    """NodeBinding that additionally gates type checking on memory-level placement."""

    def __init__(self, typeChecker: NodeTypeChecker, memoryLevelChecker: NodeMemoryLevelChecker, template: NodeTemplate,
                 codeTransformer: CodeTransformation):
        super().__init__(typeChecker, template, codeTransformer)
        self.memoryLevelChecker = memoryLevelChecker

    def typeCheck(self, ctxt: NetworkContext, node: gs.Node,
                  operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, bool]:
        # Memory-level placement must match before the regular type check runs.
        newCtxt, ret = self.memoryLevelChecker.check(ctxt, node, operatorRepresentation)
        if ret:
            return super().typeCheck(newCtxt, node, operatorRepresentation)

        return ctxt, False


def memoryAwareNodeBindingExtension(binding: NodeBinding,
                                    memoryLevelChecker: NodeMemoryLevelChecker) -> MemoryAwareNodeBinding:
    """Wrap an existing binding with a memory-level checker, reusing its checker/template/transformer."""
    return MemoryAwareNodeBinding(binding.typeChecker, memoryLevelChecker, binding.template, binding.codeTransformer)
+# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# Moritz Scherer, ETH Zurich +# Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from types import MappingProxyType +from typing import Any, Callable, Dict, List, Tuple, Type, Union + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, DeploymentEngine, DeploymentPlatform, \ + NetworkContext, NetworkDeployer, NetworkOptimizationPass, NetworkOptimizer, ONNXLayer, Schedule, StructBuffer, \ + TopologyOptimizer, TransientBuffer, VariableBuffer, _NoVerbosity +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel + + +class MemoryPlatform(DeploymentPlatform): + + def __init__(self, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel, + engines: List[DeploymentEngine], variableBuffer: Type[VariableBuffer], + constantBuffer: Type[ConstantBuffer], structBuffer: Type[StructBuffer], + transientBuffer: Type[TransientBuffer]) -> None: 
+ super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + self.memoryHierarchy = memoryHierarchy + self.defaultTargetMemoryLevel = defaultTargetMemoryLevel + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + _, _, _ = node, tensorName, ctxt + return self.defaultTargetMemoryLevel.name + + +class DeploymentPlatformWrapper(DeploymentPlatform): + + def __init__(self, platform: DeploymentPlatform): + super().__setattr__("_innerObject", platform) + + def __getattr__(self, name: str) -> Any: + return getattr(self._innerObject, name) + + def __setattr__(self, name: str, value: Any) -> None: + if hasattr(self._innerObject, name): + setattr(self._innerObject, name, value) + else: + super().__setattr__(name, value) + + +class MemoryPlatformWrapper(DeploymentPlatformWrapper): + + def __init__(self, platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel): + super().__init__(platform) + self.memoryHierarchy = memoryHierarchy + self.defaultTargetMemoryLevel = defaultTargetMemoryLevel + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + _, _, _ = node, tensorName, ctxt + return self.defaultTargetMemoryLevel.name + + +class TargetMemoryLevelMapping: + + def __init__(self, graph: gs.Graph, platform: Union[MemoryPlatform, MemoryPlatformWrapper], + ctxt: NetworkContext) -> None: + mapping: Dict[Tuple[str, str], str] = {} + for node in graph.nodes: + for tensor in node.inputs + node.outputs: + mapping[node.name, tensor.name] = platform.getTargetMemoryLevel(node, tensor.name, ctxt) + self._mapping = MappingProxyType(mapping) + + def lookup(self, nodeName: str, tensorName: str) -> str: + return self._mapping[nodeName, tensorName] + + +class MemoryLevelAwareDeployer(NetworkDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: Union[MemoryPlatform, MemoryPlatformWrapper], + inputTypes: Dict[str, 
Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), + name: str = 'DeeployNetwork', + default_channels_first: bool = True, + deeployStateDir: str = "DeeployState", + memoryLevelAnnotationPasses: List[NetworkOptimizationPass] = []): + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + default_channels_first, deeployStateDir) + if len(memoryLevelAnnotationPasses) == 0: + memoryLevelAnnotationPasses.append(AnnotateDefaultMemoryLevel(self.Platform.memoryHierarchy)) + self.memoryLevelAnnotationOptimizer = NetworkOptimizer(memoryLevelAnnotationPasses) + + def getTargetMemoryLevelMapping(self) -> TargetMemoryLevelMapping: + assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ + f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" + return TargetMemoryLevelMapping(self.graph, self.Platform, self.ctxt) + + def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, + default_channels_first: bool) -> Tuple[NetworkContext, bool]: + + newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) + + if not parsePass: + return ctxt, False + + newCtxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(newCtxt, self.graph) + newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) + + if not LayerBindSuccess: + return ctxt, False + + return newCtxt, True + + def bind(self): + + ret = super().bind() + if not ret: + return False + + # SCHEREMO: There might be hoisting; reassign memoryLevel preferences + self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) + + return ret + + def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): + self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) + super().codeTransform(verbose) + + +class MemoryLevelAwareSignPropDeployer(SignPropDeployer): + + def 
__init__(self, + graph: gs.Graph, + deploymentPlatform: Union[MemoryPlatform, MemoryPlatformWrapper], + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first: bool = True, + deeployStateDir: str = "DeeployState", + inputOffsets: Dict[str, int] = {}, + memoryLevelAnnotationPasses: List[NetworkOptimizationPass] = []): + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + default_channels_first, deeployStateDir, inputOffsets) + if len(memoryLevelAnnotationPasses) == 0: + memoryLevelAnnotationPasses.append(AnnotateDefaultMemoryLevel(self.Platform.memoryHierarchy)) + self.memoryLevelAnnotationOptimizer = NetworkOptimizer(memoryLevelAnnotationPasses) + + def getTargetMemoryLevelMapping(self) -> TargetMemoryLevelMapping: + assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ + f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! 
Got {type(self.Platform).__name__}" + return TargetMemoryLevelMapping(self.graph, self.Platform, self.ctxt) + + def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, + default_channels_first: bool) -> Tuple[NetworkContext, bool]: + + newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) + + if not parsePass: + return ctxt, False + + newCtxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(newCtxt, self.graph) + newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) + + if not LayerBindSuccess: + return ctxt, False + + return newCtxt, True + + def bind(self): + + ret = super().bind() + if not ret: + return False + + # SCHEREMO: There might be hoisting; reassign memoryLevel preferences + self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) + + return ret + + def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): + self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) + super().codeTransform(verbose) + + +class MemoryDeployerWrapper(NetworkDeployerWrapper): + + def __init__(self, deployer: NetworkDeployer, memoryLevelAnnotationPasses: List[NetworkOptimizationPass] = []): + super().__init__(deployer) + assert isinstance(deployer.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ + f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(deployer.Platform).__name__}" + if len(memoryLevelAnnotationPasses) == 0: + memoryLevelAnnotationPasses.append(AnnotateDefaultMemoryLevel(self.Platform.memoryHierarchy)) + self.memoryLevelAnnotationOptimizer = NetworkOptimizer(memoryLevelAnnotationPasses) + + def getTargetMemoryLevelMapping(self) -> TargetMemoryLevelMapping: + assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ + f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! 
Got {type(self.Platform).__name__}" + return TargetMemoryLevelMapping(self.graph, self.Platform, self.ctxt) + + def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, + default_channels_first: bool) -> Tuple[NetworkContext, bool]: + + newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) + + if not parsePass: + return ctxt, False + + newCtxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(newCtxt, self.graph) + newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) + + if not LayerBindSuccess: + return ctxt, False + + return newCtxt, True + + def bind(self): + + ret = super().bind() + if not ret: + return False + + # SCHEREMO: There might be hoisting; reassign memoryLevel preferences + self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) + + return ret + + def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): + self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) + super().codeTransform(verbose) diff --git a/Deeploy/MemoryLevelExtension/NetworkDeployers/__init__.py b/Deeploy/MemoryLevelExtension/NetworkDeployers/__init__.py new file mode 100644 index 0000000..65ec809 --- /dev/null +++ b/Deeploy/MemoryLevelExtension/NetworkDeployers/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py new file mode 100644 index 0000000..3069262 --- /dev/null +++ b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py @@ -0,0 +1,104 @@ +# ---------------------------------------------------------------------- +# +# File: MemoryLevelAnnotationPasses.py +# +# Last edited: 10.03.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Tuple + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import SequentialPass +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel + + +class AnnotateDefaultMemoryLevel(SequentialPass): + + def __init__(self, memoryHierarchy: MemoryHierarchy): + super().__init__() + self.memoryHierarchy = memoryHierarchy + + def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]: + for _buffer in {**ctxt.localObjects, **ctxt.globalObjects}.values(): + if not hasattr(_buffer, "_memoryLevel"): + _buffer._memoryLevel = self.memoryHierarchy.getDefaultMemoryLevel().name + return ctxt, graph + + +class AnnotateIOMemoryLevel(SequentialPass): + + def __init__(self, ioLevel: str): + super().__init__() + self.ioLevel = ioLevel + + def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]: + buffers = [] + + def globalBuffers(tensors: List[gs.Tensor]) -> List[VariableBuffer]: + return [ctxt.globalObjects[tensor.name] for tensor in tensors if tensor.name in ctxt.globalObjects.keys()] + + inputBuffers = globalBuffers(graph.inputs) + buffers += filter(lambda _buffer: isinstance(_buffer, ctxt.VariableBuffer) and len(_buffer._users) > 0, + inputBuffers) + + outputBuffers = globalBuffers(graph.outputs) + buffers += filter(lambda _buffer: isinstance(_buffer, ctxt.VariableBuffer), outputBuffers) + + for _buffer in buffers: + _buffer._memoryLevel = self.ioLevel + + return ctxt, graph + + +class AnnotateNeurekaWeightMemoryLevel(SequentialPass): + + def __init__(self, neurekaEngineName: str, weightMemoryLevel: MemoryLevel): + self._weightMemoryLevel = weightMemoryLevel + self.neurekaEngineName = neurekaEngineName + super().__init__() + + def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]: + + 
def _neurekaWeightBufferSize(buffer: ConstantBuffer) -> int: + return int(np.prod(buffer.shape)) # Weights are encoded as bytes so no need to check for typeWidth + + weightMemoryOccupation = 0 + + # Current weight memory occupation + for buffer in {**ctxt.globalObjects, **ctxt.localObjects}.values(): + if hasattr(buffer, "_memoryLevel") and buffer._memoryLevel == self._weightMemoryLevel.name: + weightMemoryOccupation += _neurekaWeightBufferSize(buffer) + + neurekaNodes = [node for node in graph.nodes if node.attrs["engine"] == self.neurekaEngineName] + for node in neurekaNodes: + if node.op in ["Conv", "RequantizedConv"]: + + if not (ctxt.is_local(node.inputs[1].name) or ctxt.is_global(node.inputs[1].name)): + continue + + buffer = ctxt.lookup(node.inputs[1].name) + if weightMemoryOccupation + _neurekaWeightBufferSize(buffer) < self._weightMemoryLevel.size: + buffer._memoryLevel = self._weightMemoryLevel.name + weightMemoryOccupation += _neurekaWeightBufferSize(buffer) + return ctxt, graph diff --git a/Deeploy/MemoryLevelExtension/OptimizationPasses/__init__.py b/Deeploy/MemoryLevelExtension/OptimizationPasses/__init__.py new file mode 100644 index 0000000..65ec809 --- /dev/null +++ b/Deeploy/MemoryLevelExtension/OptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/MemoryLevelExtension/__init__.py b/Deeploy/MemoryLevelExtension/__init__.py new file mode 100644 index 0000000..65ec809 --- /dev/null +++ b/Deeploy/MemoryLevelExtension/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/CortexM/Bindings.py b/Deeploy/Targets/CortexM/Bindings.py new file mode 100644 index 0000000..3b7b7ff --- /dev/null +++ b/Deeploy/Targets/CortexM/Bindings.py @@ -0,0 +1,118 @@ +# ---------------------------------------------------------------------- +# +# File: CMSISBindings.py +# +# Last edited: 17.12.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ + MemoryManagementGeneration +from Deeploy.CommonExtensions.DataTypes import int8_t, int16_t, int32_t, int64_t +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.Targets.CortexM.Templates import CLCATemplate, ConvTemplate, DWConvTemplate, GEMMTemplate, \ + LinearAttentionTemplate, MaxPool2DTemplate +from Deeploy.Targets.CortexM.TypeCheckers import CMSISConvChecker, CMSISLinearChecker, CMSISMaxPoolChecker +from Deeploy.Targets.Generic.TypeCheckers import CLCAChecker, LinearAttentionChecker + +BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) + +CMSISCLCABinding = NodeBinding( + CLCAChecker([PointerClass(int8_t), PointerClass(int8_t)] + + [PointerClass(int8_t), PointerClass(int32_t)] * 3 + + [PointerClass(int32_t), PointerClass(int32_t), + PointerClass(int32_t)] * 7, [PointerClass(int8_t)]), CLCATemplate.referenceTemplate, BasicTransformer) + +CMSISConv1DBinding_16 = NodeBinding( + CMSISConvChecker([ + PointerClass(int16_t), 
+ PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int64_t), + PointerClass(int32_t) + ], [PointerClass(int16_t)]), ConvTemplate.cmsis1D_16_Template, BasicTransformer) +CMSISConv1DBinding_8 = NodeBinding( + CMSISConvChecker([ + PointerClass(int8_t), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(int8_t)]), ConvTemplate.cmsis1D_8_Template, BasicTransformer) +CMSISConv1DBindings = [CMSISConv1DBinding_8, CMSISConv1DBinding_16] + +CMSISConv2DBinding = NodeBinding( + CMSISConvChecker([ + PointerClass(int8_t), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(int8_t)]), ConvTemplate.cmsis2D_8_Template, BasicTransformer) + +CMSISDWConv1DBinding_16 = NodeBinding( + CMSISConvChecker([ + PointerClass(int16_t), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int64_t), + PointerClass(int32_t) + ], [PointerClass(int16_t)]), DWConvTemplate.conv1D_16_Template, BasicTransformer) +CMSISDWConv1DBinding_8 = NodeBinding( + CMSISConvChecker([ + PointerClass(int8_t), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(int8_t)]), DWConvTemplate.conv1D_8_Template, BasicTransformer) +CMSISDWConv1DBindings = [CMSISDWConv1DBinding_8, CMSISDWConv1DBinding_16] + +CMSISDWConv2DBinding = NodeBinding( + CMSISConvChecker([ + PointerClass(int8_t), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(int8_t)]), DWConvTemplate.conv2D_8_Template, BasicTransformer) + +CMSISGEMM_16_Binding = NodeBinding( + CMSISLinearChecker([PointerClass(int16_t), + PointerClass(int16_t), + PointerClass(int64_t), + PointerClass(int64_t)], [PointerClass(int16_t)]), GEMMTemplate.Linear_16_Template, + BasicTransformer) +CMSISGEMM_8_Binding = NodeBinding( + CMSISLinearChecker( + [PointerClass(int8_t), PointerClass(int8_t), + 
PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(int8_t)]), GEMMTemplate.Linear_8_Template, BasicTransformer) +CMSISGEMMBindings = [CMSISGEMM_8_Binding, CMSISGEMM_16_Binding] + +CMSISLinearAttentionBinding = NodeBinding( + LinearAttentionChecker([PointerClass(int16_t), PointerClass(int16_t), + PointerClass(int16_t)] + [PointerClass(int8_t), PointerClass(int64_t)] * 4, + [PointerClass(int16_t)]), LinearAttentionTemplate.referenceTemplate, BasicTransformer) + +CMSISMaxPool2DBinding = NodeBinding(CMSISMaxPoolChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), + MaxPool2DTemplate.cmsisTemplate, BasicTransformer) diff --git a/Deeploy/Targets/CortexM/DataTypes.py b/Deeploy/Targets/CortexM/DataTypes.py new file mode 100644 index 0000000..9951cb3 --- /dev/null +++ b/Deeploy/Targets/CortexM/DataTypes.py @@ -0,0 +1,97 @@ +# ---------------------------------------------------------------------- +# +# File: CMSISDataTypes.py +# +# Last edited: 01.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.AbstractDataTypes import PointerClass, Struct, VoidType +from Deeploy.CommonExtensions.DataTypes import int32_t + + +class cmsis_nn_context(Struct): + typeName = "cmsis_nn_context" + structTypeDict = {"buf": PointerClass(VoidType), "size": int32_t} + + +class cmsis_nn_tile(Struct): + typeName = "cmsis_nn_tile" + structTypeDict = {"w": int32_t, "h": int32_t} + + +class cmsis_nn_activation(Struct): + typeName = "cmsis_nn_activation" + structTypeDict = {"min": int32_t, "max": int32_t} + + +class cmsis_nn_dims(Struct): + typeName = "cmsis_nn_dims" + structTypeDict = {"n": int32_t, "h": int32_t, "w": int32_t, "c": int32_t} + + +class cmsis_nn_per_channel_quant_params(Struct): + typeName = "cmsis_nn_per_channel_quant_params" + structTypeDict = {"multiplier": PointerClass(int32_t), "shift": PointerClass(int32_t)} + + +class cmsis_nn_per_tensor_quant_params(Struct): + typeName = "cmsis_nn_per_tensor_quant_params" + structTypeDict = {"multiplier": int32_t, "shift": int32_t} + + +class cmsis_nn_conv_params(Struct): + typeName = "cmsis_nn_conv_params" + structTypeDict = { + "input_offset": int32_t, + "output_offset": int32_t, + "stride": cmsis_nn_tile, + "padding": cmsis_nn_tile, + "dilation": cmsis_nn_tile, + "activation": cmsis_nn_activation + } + + +class cmsis_nn_fc_params(Struct): + typeName = "cmsis_nn_fc_params" + structTypeDict = { + "input_offset": int32_t, + "filter_offset": int32_t, + "output_offset": int32_t, + "activation": cmsis_nn_activation + } + + +class cmsis_nn_pool_params(Struct): + typeName = "cmsis_nn_pool_params" + structTypeDict = {"stride": cmsis_nn_tile, "padding": cmsis_nn_tile, "activation": cmsis_nn_activation} + + +class cmsis_nn_dw_conv_params(Struct): + typeName = "cmsis_nn_dw_conv_params" + structTypeDict = { + "input_offset": int32_t, + "output_offset": int32_t, + "ch_mult": int32_t, + "stride": cmsis_nn_tile, + "padding": cmsis_nn_tile, + "dilation": cmsis_nn_tile, + "activation": cmsis_nn_activation + } diff --git 
a/Deeploy/Targets/CortexM/Deployer.py b/Deeploy/Targets/CortexM/Deployer.py new file mode 100644 index 0000000..55f8987 --- /dev/null +++ b/Deeploy/Targets/CortexM/Deployer.py @@ -0,0 +1,69 @@ +# ---------------------------------------------------------------------- +# +# File: NetworkDeployer.py +# +# Last edited: 26.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Callable, Dict, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + NCHWtoNHWCPass, TransposeMatmulInputsPass +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass + + +class CMSISDeployer(SignPropDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets: Dict[str, int] = {}): + + super().__init__(graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) + + self.inputOffsets = inputOffsets + + self.loweringOptimizer.passes += [ + TransposeMatmulInputsPass(), + NCHWtoNHWCPass(self.default_channels_first), + TransposeMergePass(), + TransposeConstOptPass(), + DebugPrintMergePass() + ] diff --git a/Deeploy/Targets/CortexM/Layers.py b/Deeploy/Targets/CortexM/Layers.py new file mode 100644 index 0000000..ba60a2e --- /dev/null +++ b/Deeploy/Targets/CortexM/Layers.py @@ -0,0 +1,58 @@ +# ---------------------------------------------------------------------- +# +# File: CMSISLayers.py +# +# Last edited: 22.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. 
from typing import List, Tuple

from Deeploy.DeeployTypes import NodeMapper, Shape
from Deeploy.Targets.Generic.Layers import RQGEMMLayer, RQSConvLayer


class CMSISRQSConvLayer(RQSConvLayer):
    """Requantized convolution layer for CMSIS-NN.

    Resizes the per-channel requantization vectors (mul, add, shift — input
    slots 2, 3 and 4) to the kernel's output-channel count.
    """

    def __init__(self, maps: List[NodeMapper]):
        super().__init__(maps)

    def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
                      channels_first) -> Tuple[Shape, Shape]:
        # The output-channel axis of the result depends on the data layout:
        # axis 1 for NCHW, last axis for NHWC.
        channelAxis = 1 if channels_first else -1
        numOutChannels = outputShapes[0][channelAxis]
        # mul / add / shift all carry one entry per output channel.
        for requantIdx in (2, 3, 4):
            inputShapes[requantIdx] = numOutChannels
        return (inputShapes, outputShapes)


class CMSISRQSGEMMLayer(RQGEMMLayer):
    """Requantized GEMM layer for CMSIS-NN.

    Sizes the bias input (slot 2) from the weight matrix's output dimension.
    """

    def __init__(self, maps: List[NodeMapper]):
        super().__init__(maps)

    def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
                      channels_first) -> Tuple[Shape, Shape]:
        # Bias length follows the second-to-last dimension of B (transposed weights).
        inputShapes[2] = inputShapes[1][-2]
        return (inputShapes, outputShapes)
a/Deeploy/Targets/CortexM/Parsers.py b/Deeploy/Targets/CortexM/Parsers.py new file mode 100644 index 0000000..e81caf0 --- /dev/null +++ b/Deeploy/Targets/CortexM/Parsers.py @@ -0,0 +1,517 @@ +# ---------------------------------------------------------------------- +# +# File: CMSISParsers.py +# +# Last edited: 17.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import math
from typing import Tuple

import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext
from Deeploy.Targets.Generic.Parsers import CLCAParser, GEMMParser, LinearAttentionParser, MaxPool2DParser, \
    MHSAParser, RQSConv1DParser, RQSConv2DParser, RQSParserInterface


class CMSISMaxPool2DParser(MaxPool2DParser):
    """MaxPool2D parser restricted to what the CMSIS-NN pooling kernel
    supports: no input padding."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            # Previously fell through and returned None; reject explicitly.
            return False

        return all([
            self.operatorRepresentation['pads'][0] == 0,
            self.operatorRepresentation['pads'][1] == 0,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        return super().parseNodeCtxt(ctxt, node, channels_first)


class CMSISDWConv2DParser(RQSConv2DParser):
    """Depthwise requantized 2D convolution parser for CMSIS-NN."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            return False

        return all([
            # Make sure padding is square
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],
            self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            self.operatorRepresentation['pads'][0] == 0,
            # Don't support dilations
            #all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]),
            len(node.inputs) == 5,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        # Depthwise: one filter per group, so group must match the kernel's
        # leading (filter) dimension.
        if not self.operatorRepresentation['group'] == newCtxt.lookup(
                self.operatorRepresentation['weight']).shape[0]:
            return ctxt, False

        inputs = ['data_in', 'weight', 'mul', 'add', 'shift']
        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

        weight = newCtxt.lookup(self.operatorRepresentation['weight'])

        # Weights can only be transposed offline if they are constant/global.
        if not newCtxt.is_global(self.operatorRepresentation['weight']):
            return ctxt, False

        # SCHEREMO: Transpose weights to be num filters last
        newCtxt.globalObjects[self.operatorRepresentation['weight']].values = np.transpose(
            weight.values,
            list(range(len(weight.shape)))[1:] + [0])

        return newCtxt, True


class CMSISConv2DParser(RQSConv2DParser):
    """Regular (group == 1) requantized 2D convolution parser for CMSIS-NN."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            return False

        return all([
            # Make sure padding is square
            self.operatorRepresentation['group'] == 1,
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],
            self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            #self.operatorRepresentation['pads'][0] == 0,
            # Don't support dilations
            #all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]),
            len(node.inputs) == 5,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        inputs = ['data_in', 'weight', 'mul', 'add', 'shift']
        for idx, inputNode in enumerate(node.inputs):
            # Consistency fix: look names up in newCtxt like the depthwise
            # parsers do (buffer names are identical in both contexts, so
            # behavior is unchanged).
            self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

        return newCtxt, True


class CMSISDWConv1DParser(RQSConv1DParser):
    """Depthwise requantized 1D convolution parser for CMSIS-NN."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            return False

        return all([
            # Make sure padding is square
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            #self.operatorRepresentation['pads'][0] == 0,
            # Don't support dilations
            #all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]),
            len(node.inputs) == 5,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        inputs = ['data_in', 'weight', 'mul', 'add', 'shift']
        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

        # Depthwise in the (1D, channels-last) layout: group must match the
        # kernel's trailing dimension.
        if not self.operatorRepresentation['group'] == newCtxt.lookup(
                self.operatorRepresentation['weight']).shape[-1]:
            return ctxt, False

        return newCtxt, True


class CMSISConv1DParser(RQSConv1DParser):
    """Regular (group == 1) requantized 1D convolution parser for CMSIS-NN."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            return False

        return all([
            # Make sure padding is square
            self.operatorRepresentation['group'] == 1,
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            #self.operatorRepresentation['pads'][0] == 0,
            # Don't support dilations
            #all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]),
            len(node.inputs) == 5,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        inputs = ['data_in', 'weight', 'mul', 'add', 'shift']
        for idx, inputNode in enumerate(node.inputs):
            # Consistency fix: use newCtxt like the depthwise parsers (same
            # names in both contexts, behavior unchanged).
            self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

        return newCtxt, True


class CMSISLinearParser(GEMMParser):
    """GEMM parser that folds transpositions and scalar factors of constant
    operands offline, so the runtime kernel only computes A * B^T + C."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:
        return super().parseNode(node)

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)
        if not ret:
            return ctxt, False

        # Try to transpose A offline if possible, else fail
        if self.operatorRepresentation['transA'] == 1:
            nameA = self.operatorRepresentation['A']
            if newCtxt.is_global(nameA) and isinstance(newCtxt.lookup(nameA), ConstantBuffer):
                A = newCtxt.lookup(nameA)
                npA = np.asarray(A.values).reshape(A.shape)
                # Swap the last two axes; leading (batch) axes stay put.
                newA = np.transpose(npA, list(range(len(A.shape) - 2)) + [len(A.shape) - 1, len(A.shape) - 2])
                newCtxt.globalObjects[nameA].shape = newA.shape
                newCtxt.globalObjects[nameA].values = newA
                self.operatorRepresentation['transA'] = 0
            else:
                return newCtxt, False

        # Try to transpose B offline if possible, else fail
        # SCHEREMO: Magic trick - CMSIS works a bit weirdly with matmuls...
        if self.operatorRepresentation['transB'] == 0:
            nameB = self.operatorRepresentation['B']
            if newCtxt.is_global(nameB) and isinstance(newCtxt.lookup(nameB), ConstantBuffer):
                B = newCtxt.lookup(nameB)
                npB = np.asarray(B.values).reshape(B.shape)
                newB = np.transpose(npB, list(range(len(B.shape) - 2)) + [len(B.shape) - 1, len(B.shape) - 2])
                newCtxt.globalObjects[nameB].values = newB
                newCtxt.globalObjects[nameB].shape = newB.shape
                self.operatorRepresentation['transB'] = 1
            else:
                return newCtxt, False

        # Try to scale A offline if possible, else fail
        if self.operatorRepresentation['alpha'] != 1.0:
            nameA = self.operatorRepresentation['A']
            if newCtxt.is_global(nameA) and isinstance(newCtxt.lookup(nameA), ConstantBuffer):
                A = newCtxt.lookup(nameA)
                npA = np.asarray(A.values).reshape(A.shape)
                # BUGFIX: this branch previously multiplied by 'beta' (a
                # copy-paste from the B-scaling branch below); alpha scales A.
                newA = npA * self.operatorRepresentation['alpha']
                newCtxt.globalObjects[nameA].values = newA
                self.operatorRepresentation['alpha'] = 1.0
            else:
                return newCtxt, False

        # Try to scale B offline if possible, else fail
        if self.operatorRepresentation['beta'] != 1.0:
            nameB = self.operatorRepresentation['B']
            if newCtxt.is_global(nameB) and isinstance(newCtxt.lookup(nameB), ConstantBuffer):
                B = newCtxt.lookup(nameB)
                npB = np.asarray(B.values).reshape(B.shape)
                newB = npB * self.operatorRepresentation['beta']
                newCtxt.globalObjects[nameB].values = newB
                self.operatorRepresentation['beta'] = 1.0
            else:
                return newCtxt, False

        return newCtxt, True


class CMSISGEMMParser(CMSISLinearParser, RQSParserInterface):
    """Requantized GEMM parser: combines the linear parser with the RQS
    interface and extracts the requantization shift attribute."""

    def __init__(self):
        super().__init__(noBiasHoisting = True)

    def parseNode(self, node: gs.Node) -> bool:

        ret_linear = CMSISLinearParser.parseNode(self, node)
        ret_rqs = RQSParserInterface.parseNode(self, node)

        ret = all([
            ret_linear == True,
            ret_rqs == True,
            'shift' in node.attrs,
            len(node.inputs) == 4,
        ])

        if ret:
            self.operatorRepresentation['shift'] = int(node.attrs['shift'].values)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        inputs = ['A', 'B', 'C', 'mul', 'add']
        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

        return newCtxt, True


class CMSISMHSAParser(MHSAParser):
    """Multi-head self-attention parser; extracts integer-softmax parameters
    and converts the requant divisors to shift amounts (log2)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            return False

        ret = all([
            'isoftmaxA' in node.attrs,
            'isoftmaxB' in node.attrs,
            'isoftmaxC' in node.attrs,
            'isoftmaxlog2' in node.attrs,
        ])

        if ret:
            self.operatorRepresentation['signed'] = 1
            # Each projection carries a requant shift and a power-of-two
            # divisor; store the divisor as its log2 (a right-shift at runtime).
            for prefix in ('preattn', 'postattn', 'wo', 'wq', 'wk', 'wv'):
                self.operatorRepresentation[prefix + '_requant_shift'] = int(
                    node.attrs[prefix + '_requant_shift'].values)
                self.operatorRepresentation[prefix + '_requant_div'] = int(
                    math.log2(int(node.attrs[prefix + '_requant_div'].values)))
            for key in ('isoftmaxA', 'isoftmaxB', 'isoftmaxC', 'isoftmaxlog2'):
                self.operatorRepresentation[key] = int(node.attrs[key].values)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            return newCtxt, ret
        return ctxt, False


class CMSISLinearAttentionParser(LinearAttentionParser):
    """Linear attention parser; marks the operator as signed."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        self.operatorRepresentation['signed'] = 1
        return wellFormed

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            return newCtxt, ret
        return ctxt, False


class CMSISCLCAParser(CLCAParser):
    """CLCA parser; rewrites requantization constants offline (divisors to
    shifts, additions folded into biases, adds rescaled by their muls)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return super().parseNode(node)

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ctxt = ctxt.copy()
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)
        if not ret:
            return ctxt, False

        # Div to shift: divisors are powers of two, stored as log2 so the
        # kernel can right-shift.
        for name in ('wq', 'wk', 'wv', 'wo', 'kdiv', 'preattn', 'postattn'):
            divBuf = newCtxt.globalObjects[self.operatorRepresentation[name + '_requant_div']]
            divBuf.values = np.log2(divBuf.values).astype('int')

        # Fold additions: merge the requant offsets (pre-divided by the
        # requant multiplier) into the corresponding biases; retire the
        # folded buffers from deployment.
        newCtxt.globalObjects[self.operatorRepresentation['wo_bias']].values = newCtxt.globalObjects[
            self.operatorRepresentation['wo_bias']].values + (
                newCtxt.globalObjects[self.operatorRepresentation['wo_requant_add']].values /
                newCtxt.globalObjects[self.operatorRepresentation['wo_requant_mul']].values).astype('int')
        newCtxt.globalObjects[self.operatorRepresentation['wo_requant_add']]._deploy = False
        newCtxt.globalObjects[self.operatorRepresentation['wq_bias']].values = newCtxt.globalObjects[
            self.operatorRepresentation['wq_bias']].values + (
                newCtxt.globalObjects[self.operatorRepresentation['wq_requant_add']].values /
                newCtxt.globalObjects[self.operatorRepresentation['wq_requant_mul']].values).astype('int')
        newCtxt.globalObjects[self.operatorRepresentation['wq_requant_add']]._deploy = False
        # NOTE(review): wk_bias is folded with the wv_* requant parameters and
        # wk_requant_add is never folded/retired here — this looks like a
        # copy-paste slip (wk vs. wv), but is kept as-is pending confirmation
        # against the CLCA kernel's expectations.
        newCtxt.globalObjects[self.operatorRepresentation['wk_bias']].values = newCtxt.globalObjects[
            self.operatorRepresentation['wk_bias']].values + (
                newCtxt.globalObjects[self.operatorRepresentation['wv_requant_add']].values /
                newCtxt.globalObjects[self.operatorRepresentation['wv_requant_mul']].values).astype('int')
        newCtxt.globalObjects[self.operatorRepresentation['wv_requant_add']]._deploy = False

        # Rescale requant adds: divide each offset by its multiplier so the
        # kernel can apply mul last.
        for name in ('postattn', 'preattn', 'kdiv', 'wk', 'wo', 'wq', 'wv'):
            addBuf = newCtxt.globalObjects[self.operatorRepresentation[name + '_requant_add']]
            mulBuf = newCtxt.globalObjects[self.operatorRepresentation[name + '_requant_mul']]
            addBuf.values = (addBuf.values / mulBuf.values).astype('int')

        # Delta into mul
        newCtxt.globalObjects[self.operatorRepresentation['kdiv_requant_mul']].values = newCtxt.globalObjects[
            self.operatorRepresentation['kdiv_requant_mul']].values * self.operatorRepresentation['Delta']

        return newCtxt, ret
from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
    StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
from Deeploy.Targets.CortexM.Bindings import CMSISCLCABinding, CMSISConv1DBindings, CMSISConv2DBinding, \
    CMSISDWConv1DBindings, CMSISDWConv2DBinding, CMSISGEMMBindings, CMSISLinearAttentionBinding, \
    CMSISMaxPool2DBinding
from Deeploy.Targets.CortexM.Layers import CMSISRQSConvLayer, CMSISRQSGEMMLayer
from Deeploy.Targets.CortexM.Parsers import CMSISCLCAParser, CMSISConv1DParser, CMSISConv2DParser, \
    CMSISDWConv1DParser, CMSISDWConv2DParser, CMSISGEMMParser, CMSISLinearAttentionParser, CMSISMaxPool2DParser
from Deeploy.Targets.CortexM.TopologyOptimizationPasses.Passes import ConvRequantMergePass, GEMMRequantMergePass, \
    LinearAttentionAlignmentPass, MatMulRequantMergePass, MHSAAlignmentPass
from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicDebugPrintBindings, BasicGatherBindings, \
    BasicGELUBinding, BasicIntegerDivBinding, BasicLayerNormBinding, BasicMatMulBinding, BasicMulBindings, \
    BasicPad1DBindings, BasicPad2DBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, \
    BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBinding, \
    BasicTransposeBindings, DummyBinding
from Deeploy.Targets.Generic.Layers import AddLayer, CLCALayer, DebugPrintLayer, GatherLayer, IntegerDivLayer, \
    LinearAttentionLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, ReduceMeanLayer, ReduceSumLayer, \
    RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, TransposeLayer, iGELULayer, \
    iLayerNormLayer, iSoftmaxLayer
from Deeploy.Targets.Generic.Parsers import AddParser, DebugParser, DummyParser, FlattenParser, GatherParser, \
    IntegerDivParser, MatMulParser, MulParser, Pad1DParser, Pad2DParser, ReduceMeanParser, ReduceSumParser, \
    RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, TransposeParser, \
    UnsqueezeParser, iGELUParser, iLayerNormParser, iSoftmaxParser
from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
    MergeConstAddAndRequantPass, iGELURequantMergePass

# Node mappers: pair each parser with the bindings that can implement it.
AddMapper = NodeMapper(AddParser(), BasicAddBindings)
CLCA_int8_Mapper = NodeMapper(CMSISCLCAParser(), [CMSISCLCABinding])
Conv1D_Mapper = NodeMapper(CMSISConv1DParser(), CMSISConv1DBindings)
Conv2D_int8_Mapper = NodeMapper(CMSISConv2DParser(), [CMSISConv2DBinding])
DebugPrint_Mapper = NodeMapper(DebugParser(), BasicDebugPrintBindings)
DWConv1D_Mapper = NodeMapper(CMSISDWConv1DParser(), CMSISDWConv1DBindings)
DWConv2D_int8_Mapper = NodeMapper(CMSISDWConv2DParser(), [CMSISDWConv2DBinding])
FlattenMapper = NodeMapper(FlattenParser(), BasicReshapeBindings)
GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings)
GELU_int8_Mapper = NodeMapper(iGELUParser(), [BasicGELUBinding])
GEMMMapper = NodeMapper(CMSISGEMMParser(), CMSISGEMMBindings)
iLayerNorm_int8_Mapper = NodeMapper(iLayerNormParser(), [BasicLayerNormBinding])
IntegerDivMapper = NodeMapper(IntegerDivParser(), [BasicIntegerDivBinding])
LinearAttention_int16_Mapper = NodeMapper(CMSISLinearAttentionParser(), [CMSISLinearAttentionBinding])
MatMulMapper = NodeMapper(MatMulParser(), [BasicMatMulBinding])
MaxPool2DMapper = NodeMapper(CMSISMaxPool2DParser(), [CMSISMaxPool2DBinding])
MulMapper = NodeMapper(MulParser(), BasicMulBindings)
Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings)
Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings)
ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings)
ReduceSumMapper = NodeMapper(ReduceSumParser(), BasicReduceSumBindings)
RequantShiftMapper = NodeMapper(RequantShiftParser(), BasicRQSBindings)
ReshapeMapper = NodeMapper(ReshapeParser(), BasicReshapeBindings)
RQGELU_int8_Mapper = NodeMapper(RQSiGELUParser(), [BasicRQSGELUBinding])
RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding])
Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), [BasicSoftmaxBinding])
TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings)
UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings)

SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings)

# Dummy nodes are intended for development purposes only!
# They should always generate compiler errors to not accidentally end up in production code
DummyMapper = NodeMapper(DummyParser(), [DummyBinding])

# Operator-name -> layer dispatch table for the CMSIS engine.
CMSISMapping = {
    'Add': AddLayer([AddMapper]),
    'CLCA': CLCALayer([CLCA_int8_Mapper]),
    'DebugPrint': DebugPrintLayer([DebugPrint_Mapper]),
    'Flatten': ReshapeLayer([FlattenMapper]),
    'Gather': GatherLayer([GatherMapper]),
    'iGELU': iGELULayer([GELU_int8_Mapper]),
    'iLayerNorm': iLayerNormLayer([iLayerNorm_int8_Mapper]),
    'IntegerDiv': IntegerDivLayer([IntegerDivMapper]),
    'IntegerMean': ReduceMeanLayer([ReduceMeanMapper]),
    'iSoftmax': iSoftmaxLayer([Softmax_int8_Mapper]),
    'LinearAttention': LinearAttentionLayer([LinearAttention_int16_Mapper]),
    'MatMul': MatMulLayer([MatMulMapper]),
    'MaxPool': MaxPoolLayer([MaxPool2DMapper]),
    'Mul': MulLayer([MulMapper]),
    'Pad': PadLayer([Pad1DMapper, Pad2DMapper]),
    'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]),
    'ReduceSum': ReduceSumLayer([ReduceSumMapper]),
    'RequantizedConv': CMSISRQSConvLayer([Conv2D_int8_Mapper, DWConv2D_int8_Mapper, Conv1D_Mapper, DWConv1D_Mapper]),
    'RequantizedGemm': CMSISRQSGEMMLayer([GEMMMapper]),
    'RequantizediGELU': RQSiGELULayer([RQGELU_int8_Mapper]),
    'RequantShift': RequantShiftLayer([RequantShiftMapper]),
    'Reshape': ReshapeLayer([ReshapeMapper]),
    'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]),
    'Transpose': TransposeLayer([TransposeMapper]),
    'Unsqueeze': ReshapeLayer([UnsqueezeMapper]),
    'Slice': SliceLayer([SliceMapper])
}


class CMSISVariableBuffer(VariableBuffer):
    # Reference (heap-based) allocation templates; nothing CMSIS-specific.

    initTemplate = AllocateTemplate.referenceInitTemplate
    allocTemplate = AllocateTemplate.referenceAllocateTemplate
    deallocTemplate = FreeTemplate.referenceLocalTemplate


class CMSISTransientBuffer(TransientBuffer):

    initTemplate = AllocateTemplate.referenceInitTemplate
    allocTemplate = AllocateTemplate.referenceAllocateTemplate
    deallocTemplate = FreeTemplate.referenceLocalTemplate


class CMSISConstantBuffer(ConstantBuffer):

    initTemplate = AllocateTemplate.referenceGlobalInitTemplate
    allocTemplate = AllocateTemplate.referenceGlobalAllocateTemplate
    deallocTemplate = FreeTemplate.referenceGlobalTemplate


class CMSISStructBuffer(StructBuffer):

    initTemplate = AllocateTemplate.referenceStructInitTemplate
    allocTemplate = AllocateTemplate.referenceStructAllocateTemplate
    deallocTemplate = NodeTemplate("")


# ExtractPaddingFromConvPass(),ExtractPaddingFromPoolPass(),
CMSISOptimizer = TopologyOptimizer([
    IntegerDivRequantMergePass(),
    iGELURequantMergePass(),
    LinearAttentionAlignmentPass(),
    MHSAAlignmentPass(),
    MergeConstAddAndRequantPass(),
    ConvRequantMergePass(),
    GEMMRequantMergePass(),
    MatMulRequantMergePass(),
    # DebugPass("Conv", position='before'),
    # DebugPass("Pad", position='after'),
])

includeList = ["arm_nnfunctions.h", "DeeployMath.h"]


class CMSISEngine(DeploymentEngine):
    """Deployment engine wrapping the CMSIS-NN operator mapping."""

    def __init__(self, name: str, Mapping = CMSISMapping, initCode: str = "", includeList = includeList) -> None:
        super().__init__(name, Mapping, initCode, includeList)


class CMSISPlatform(DeploymentPlatform):
    """Deployment platform bundling a CMSIS engine with reference buffers."""

    def __init__(self,
                 engines = None,
                 variableBuffer = CMSISVariableBuffer,
                 constantBuffer = CMSISConstantBuffer,
                 structBuffer = CMSISStructBuffer,
                 transientBuffer = CMSISTransientBuffer):
        # BUGFIX: the previous default, `engines = [CMSISEngine("cmsis")]`,
        # instantiated a single engine at import time that was shared (and
        # mutable) across every CMSISPlatform instance. Build a fresh engine
        # list per platform instead; callers passing engines are unaffected.
        if engines is None:
            engines = [CMSISEngine("cmsis")]
        super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)
from Deeploy.DeeployTypes import NodeTemplate

# Elementwise addition via the CMSIS-DSP fixed-point vector-add kernels.
# arm_add_qN adds two N-bit fixed-point vectors with saturation.
AddInt8Template = NodeTemplate("arm_add_q7(${data_in_1}, ${data_in_2}, ${data_out}, ${size});")

AddInt16Template = NodeTemplate("arm_add_q15(${data_in_1}, ${data_in_2}, ${data_out}, ${size});")

AddInt32Template = NodeTemplate("arm_add_q31(${data_in_1}, ${data_in_2}, ${data_out}, ${size});")
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes as DataTypes +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.Targets.Generic.Templates import ReduceMeanTemplate, RequantShiftTemplate, TransposeTemplate + +from . 
from . import ConvTemplate, GEMMTemplate


class _CLCATemplate(NodeTemplate):
    """Composite NodeTemplate for the CLCA operator.

    The kernel is assembled from registered sub-templates (1x1 convolutions,
    requant-shifts, transposes, a reduce-mean and a requantized GEMM); each
    sub-template gets its operator representation populated by the matching
    *Generator method below, and the top-level Mako template at the bottom of
    this file stitches the rendered pieces together with explicit
    deeploy_malloc/free buffer management.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

        # Sub-templates borrowed from the generic / CMSIS backends.
        reduceMeanTemplate = ReduceMeanTemplate.referenceTemplate
        convTemplate = ConvTemplate.cmsis2D_8_Template
        RQSMMTemplate = GEMMTemplate.Linear_8_Template
        rqsTemplate = RequantShiftTemplate.referenceTemplate
        transposeTemplate = TransposeTemplate.referenceTemplate

        self.subTemplates["reduceMean"] = (reduceMeanTemplate, self.reduceMeanGenerator)

        self.subTemplates["convQ"] = (convTemplate, self.convQGenerator)
        self.subTemplates["convV"] = (convTemplate, self.convVGenerator)
        self.subTemplates["convO"] = (convTemplate, self.convOGenerator)

        self.subTemplates["RQK"] = (rqsTemplate, self.rqsKGenerator)
        self.subTemplates["RQDelta"] = (rqsTemplate, self.rqsDeltaGenerator)

        # NOTE(review): Pre/PostTransposeV are registered with the Q transpose
        # generator and are never rendered by the reference template below
        # (no ${RENDER_PreTransposeV}/${RENDER_PostTransposeV}) — confirm
        # whether these registrations are dead code.
        self.subTemplates["PreTransposeV"] = (transposeTemplate, self.transQGenerator)
        self.subTemplates["PostTransposeV"] = (transposeTemplate, self.transQGenerator)

        self.subTemplates["TransposeQ"] = (transposeTemplate, self.transQGenerator)
        self.subTemplates["TransposeO"] = (transposeTemplate, self.transOGenerator)

        self.subTemplates["MMA"] = (RQSMMTemplate, self.MMAGenerator)

    def MMAGenerator(self, ctxt: NetworkContext,
                     operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Populate the rep for the requantized GEMM sub-template: A <- RQS(K^T x V).

        Works on deep copies so the shared context/rep are not mutated; maps
        the internal K/V/A buffers onto the GEMM's A/B/data_out slots and
        installs the preattn_* requantization parameters.
        """

        ctxt, operatorRepresentation = copy.deepcopy(ctxt), copy.deepcopy(operatorRepresentation)

        K = ctxt.lookup("K", _id = operatorRepresentation['id'])
        V = ctxt.lookup("V", _id = operatorRepresentation['id'])
        A = ctxt.lookup("A", _id = operatorRepresentation['id'])

        operatorRepresentation['A'] = K.name
        operatorRepresentation['B'] = V.name
        operatorRepresentation['data_out'] = A.name
        operatorRepresentation['C'] = operatorRepresentation['preattn_requant_add']

        # Per-head problem size; the template loops the GEMM over heads.
        operatorRepresentation['size'] = np.prod(K.shape) // operatorRepresentation['heads']
        operatorRepresentation['alpha'] = 1.0
        operatorRepresentation['beta'] = 1.0
        operatorRepresentation['transA'] = 0
        operatorRepresentation['transB'] = 1

        operatorRepresentation['mul'] = operatorRepresentation['preattn_requant_mul']
        operatorRepresentation['shift'] = operatorRepresentation['preattn_requant_div']
        operatorRepresentation['channels'] = 1

        return ctxt, operatorRepresentation

    def transQGenerator(self, ctxt: NetworkContext,
                        operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Populate the rep for the Q -> QT transpose (swap the last two axes)."""
        Q = ctxt.lookup("Q", _id = operatorRepresentation['id'])
        QT = ctxt.lookup("QT", _id = operatorRepresentation['id'])

        operatorRepresentation['data_in'] = Q.name
        operatorRepresentation['data_in_type'] = Q._type
        operatorRepresentation['data_in_shape'] = [
            1, operatorRepresentation['heads'], operatorRepresentation['dim_head'],
            operatorRepresentation['q_shape'][-1]
        ]
        operatorRepresentation['data_out'] = QT.name
        operatorRepresentation['data_out_type'] = QT._type
        operatorRepresentation['data_out_shape'] = [
            1, operatorRepresentation['heads'], operatorRepresentation['q_shape'][-1],
            operatorRepresentation['dim_head']
        ]
        operatorRepresentation['perm'] = [0, 1, 3, 2]

        return ctxt, operatorRepresentation

    def transOGenerator(self, ctxt: NetworkContext,
                        operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Populate the rep for the O -> OT transpose (inverse layout of transQ)."""
        O = ctxt.lookup("O", _id = operatorRepresentation['id'])
        OT = ctxt.lookup("OT", _id = operatorRepresentation['id'])

        operatorRepresentation['data_in'] = O.name
        operatorRepresentation['data_in_type'] = O._type
        operatorRepresentation['data_in_shape'] = [
            1, operatorRepresentation['heads'], operatorRepresentation['q_shape'][-1],
            operatorRepresentation['dim_head']
        ]
        operatorRepresentation['data_out'] = OT.name
        operatorRepresentation['data_out_type'] = OT._type
        operatorRepresentation['data_out_shape'] = [
            1, operatorRepresentation['heads'], operatorRepresentation['dim_head'],
            operatorRepresentation['q_shape'][-1]
        ]
        operatorRepresentation['perm'] = [0, 1, 3, 2]

        return ctxt, operatorRepresentation

    def rqsDeltaGenerator(self, ctxt: NetworkContext,
                          operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Populate the rep for the K -> RK requant-shift using the kdiv_* params."""

        K = ctxt.lookup("K", _id = operatorRepresentation['id'])
        RK = ctxt.lookup("RK", _id = operatorRepresentation['id'])

        operatorRepresentation['data_in'] = K.name
        operatorRepresentation['data_in_type'] = K._type
        operatorRepresentation['data_out'] = RK.name
        operatorRepresentation['size'] = operatorRepresentation['input_size_KV']
        operatorRepresentation['mul'] = operatorRepresentation['kdiv_requant_mul']
        operatorRepresentation['add'] = operatorRepresentation['kdiv_requant_add']
        operatorRepresentation['log2D'] = operatorRepresentation['kdiv_requant_div']
        operatorRepresentation['channels'] = 1

        return ctxt, operatorRepresentation

    def rqsKGenerator(self, ctxt: NetworkContext,
                      operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Populate the rep for the V -> K requant-shift using the wk_* params."""

        V = ctxt.lookup("V", _id = operatorRepresentation['id'])
        K = ctxt.lookup("K", _id = operatorRepresentation['id'])

        operatorRepresentation['data_in'] = V.name
        operatorRepresentation['data_in_type'] = V._type
        operatorRepresentation['data_out'] = K.name
        operatorRepresentation['size'] = operatorRepresentation['input_size_KV']
        operatorRepresentation['mul'] = operatorRepresentation['wk_requant_mul']
        operatorRepresentation['add'] = operatorRepresentation['wk_requant_add']
        operatorRepresentation['log2D'] = operatorRepresentation['wk_requant_div']
        operatorRepresentation['channels'] = 1

        return ctxt, operatorRepresentation

    def convOGenerator(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Specialize the shared 1x1-conv rep for the output projection (OT -> data_out)."""
        convCtxt, convRep = self.convGenerator(ctxt, operatorRepresentation)

        OT = ctxt.lookup("OT", _id = operatorRepresentation['id'])

        convRep['data_in'] = OT.name
        convRep['weight'] = convRep['wo_weight']
        convRep['add'] = convRep['wo_bias']
        convRep['dim_im_in_x'] = convRep['q_shape'][2]
        convRep['dim_im_out_x'] = convRep['q_shape'][2]
        convRep['mul'] = convRep['wo_requant_mul']
        convRep['shift'] = convRep['wo_requant_div']
        convRep['ch_im_in'] = convRep['dim_head'] * convRep['heads']
        convRep['ch_im_out'] = convRep['out_dim']

        return convCtxt, convRep

    def convVGenerator(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Specialize the shared 1x1-conv rep for the V projection (k input -> V)."""
        convCtxt, convRep = self.convGenerator(ctxt, operatorRepresentation)

        V = ctxt.lookup("V", _id = operatorRepresentation['id'])

        # NOTE(review): weight/bias come from wk_* while mul/shift come from
        # wv_* — possibly a deliberate shared projection, possibly a
        # copy-paste slip; confirm against the network exporter.
        convRep['data_in'] = convRep['k']
        convRep['weight'] = convRep['wk_weight']
        convRep['add'] = convRep['wk_bias']
        convRep['data_out'] = V.name
        convRep['dim_im_in_x'] = convRep['kv_shape'][2]
        convRep['dim_im_out_x'] = convRep['kv_shape'][2]
        convRep['mul'] = convRep['wv_requant_mul']
        convRep['shift'] = convRep['wv_requant_div']
        convRep['ch_im_in'] = convRep['kv_shape'][1]
        convRep['ch_im_out'] = convRep['dim_head'] * convRep['heads']

        return convCtxt, convRep

    def convQGenerator(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Specialize the shared 1x1-conv rep for the Q projection (q input -> Q)."""
        convCtxt, convRep = self.convGenerator(ctxt, operatorRepresentation)

        Q = ctxt.lookup("Q", _id = operatorRepresentation['id'])

        convRep['data_in'] = convRep['q']
        convRep['weight'] = convRep['wq_weight']
        convRep['add'] = convRep['wq_bias']
        convRep['data_out'] = Q.name
        convRep['dim_im_in_x'] = convRep['q_shape'][2]
        convRep['dim_im_out_x'] = convRep['q_shape'][2]
        convRep['mul'] = convRep['wq_requant_mul']
        convRep['shift'] = convRep['wq_requant_div']
        convRep['ch_im_in'] = convRep['q_shape'][1]
        convRep['ch_im_out'] = convRep['dim_head'] * convRep['heads']

        return convCtxt, convRep

    def convGenerator(self, ctxt: NetworkContext,
                      operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Return deep-copied (ctxt, rep) preset for a 1x1, stride-1, unpadded conv.

        The per-projection generators above fill in data/weight/channel fields
        on top of these shared defaults.
        """

        convRep = copy.deepcopy(operatorRepresentation)
        convCtxt = copy.deepcopy(ctxt)

        # Same for all convs
        convRep['dilation_x'] = 1
        convRep['dilation_y'] = 1
        convRep['padding_x'] = 0
        convRep['padding_y'] = 0
        convRep['stride_x'] = 1
        convRep['stride_y'] = 1
        convRep['dim_kernel_x'] = 1
        convRep['dim_kernel_y'] = 1
        convRep['dim_im_in_y'] = 1
        convRep['dim_im_out_y'] = 1

        return convCtxt, convRep

    def reduceMeanGenerator(self, ctxt: NetworkContext,
                            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Populate the rep for the K -> E reduce-mean over the last axis (axes=[3])."""
        K = ctxt.lookup("K", _id = operatorRepresentation['id'])
        E = ctxt.lookup("E", _id = operatorRepresentation['id'])

        operatorRepresentation['data_in'] = K.name
        operatorRepresentation['data_in_type'] = K._type
        operatorRepresentation['data_out'] = E.name
        operatorRepresentation['data_out_type'] = E._type

        operatorRepresentation['data_in_shape'] = [
            1, operatorRepresentation['heads'], operatorRepresentation['dim_head'],
            operatorRepresentation['kv_shape'][-1]
        ]
        operatorRepresentation['data_out_shape'] = [
            1, operatorRepresentation['heads'], operatorRepresentation['dim_head']
        ]
        operatorRepresentation['size'] = operatorRepresentation['heads'] * operatorRepresentation[
            'dim_head'] * operatorRepresentation['kv_shape'][2]
        operatorRepresentation['axisLength'] = operatorRepresentation['kv_shape'][-1]
        operatorRepresentation['axes'] = [3]

        return ctxt, operatorRepresentation

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Register all internal scratch buffers (Q, QT, K, RK, V, VT, E, A, AA, B, O, OT).

        Buffers are added as 'internal' with _deploy = False: the reference
        template below allocates/frees them manually with deeploy_malloc/free.
        The node's output name is reused as the namespace id so lookups in the
        generators above resolve to this node's buffers.
        """

        Q = VariableBuffer("Q", [
            operatorRepresentation['heads'], operatorRepresentation['dim_head'], operatorRepresentation['q_shape'][-1]
        ], 256)
        Q._type = DataTypes.int8_t
        Q._signed = False
        Q._deploy = False

        QT = VariableBuffer("QT", [
            operatorRepresentation['heads'], operatorRepresentation['q_shape'][-1], operatorRepresentation['dim_head']
        ], 256)
        QT._type = DataTypes.int8_t
        QT._signed = False
        QT._deploy = False

        K = VariableBuffer("K", [
            operatorRepresentation['heads'], operatorRepresentation['dim_head'], operatorRepresentation['kv_shape'][-1]
        ], 256)
        K._type = DataTypes.int8_t
        K._signed = False
        K._deploy = False

        RK = VariableBuffer("RK", [
            operatorRepresentation['heads'], operatorRepresentation['dim_head'], operatorRepresentation['kv_shape'][-1]
        ], 256)
        RK._type = DataTypes.int8_t
        RK._signed = True
        RK._deploy = False

        V = VariableBuffer("V", [
            operatorRepresentation['heads'], operatorRepresentation['dim_head'], operatorRepresentation['kv_shape'][-1]
        ], 256)
        V._type = DataTypes.int8_t
        V._signed = True
        V._deploy = False

        # NOTE(review): VT is registered but never referenced by the reference
        # template (no ${VT}) — likely left over from an earlier layout.
        VT = VariableBuffer("VT", [
            operatorRepresentation['heads'], operatorRepresentation['kv_shape'][-1], operatorRepresentation['dim_head']
        ], 256)
        VT._type = DataTypes.int8_t
        VT._signed = True
        VT._deploy = False

        E = VariableBuffer("E", [operatorRepresentation['heads'], operatorRepresentation['dim_head'], 1], 256)
        E._type = DataTypes.int8_t
        E._signed = False
        E._deploy = False

        A = VariableBuffer(
            "A",
            [operatorRepresentation['heads'], operatorRepresentation['dim_head'], operatorRepresentation['dim_head']],
            256)
        A._type = DataTypes.int8_t
        A._signed = False
        A._deploy = False

        # 32-bit accumulators use 2**32 levels instead of 256.
        AA = VariableBuffer("AA", [
            operatorRepresentation['heads'], operatorRepresentation['dim_head'], operatorRepresentation['q_shape'][-1]
        ], 2**32)
        AA._type = DataTypes.int32_t
        AA._signed = True
        AA._deploy = False

        B = VariableBuffer("B", [operatorRepresentation['heads'], operatorRepresentation['q_shape'][-1], 1], 2**32)
        B._type = DataTypes.int32_t
        B._signed = True
        B._deploy = False

        O = VariableBuffer("O", [
            1, operatorRepresentation['heads'], operatorRepresentation['q_shape'][-1],
            operatorRepresentation['dim_head']
        ], 256)
        O._type = DataTypes.int8_t
        O._signed = True
        O._deploy = False

        OT = VariableBuffer("OT", [
            1,
            operatorRepresentation['heads'] * operatorRepresentation['dim_head'],
            operatorRepresentation['q_shape'][-1],
        ], 256)
        OT._type = DataTypes.int8_t
        OT._signed = True
        OT._deploy = False

        # Namespace all internal buffers by this node's output name.
        operatorRepresentation['id'] = operatorRepresentation['data_out']

        ctxt.add(Q, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['Q'] = Q.name
        ctxt.add(QT, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['QT'] = QT.name
        ctxt.add(K, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['K'] = K.name
        ctxt.add(RK, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['RK'] = RK.name
        ctxt.add(V, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['V'] = V.name
        ctxt.add(VT, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['VT'] = VT.name
        ctxt.add(E, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['E'] = E.name
        ctxt.add(A, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['A'] = A.name
        ctxt.add(AA, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['AA'] = AA.name
        ctxt.add(B, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['B'] = B.name
        ctxt.add(O, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['O'] = O.name
        ctxt.add(OT, 'internal', _id = operatorRepresentation['id'])
        operatorRepresentation['OT'] = OT.name

        return ctxt, operatorRepresentation, []


# Top-level Mako template: emits C code that allocates each scratch buffer,
# renders the sub-templates in dataflow order, and frees buffers as soon as
# their last consumer has run. The commented-out am_util_stdio_printf calls
# are debug tracing for the Ambiq target. This is a runtime string — do not
# edit its contents cosmetically.
referenceTemplate = _CLCATemplate("""
<%
sizeV = heads*dim_head*kv_shape[2]
%>
// alloc V
int8_t* ${V} = deeploy_malloc(${sizeV});
am_util_stdio_printf("Alloc V at 0x\%x\\n", ${V});
// V <- k * WKV
${RENDER_convV}
//am_util_stdio_printf("Comp V\\n");
// alloc K
<%
sizeK = heads*dim_head*kv_shape[2]
%>
int8_t* ${K} = deeploy_malloc(${sizeK});
//am_util_stdio_printf("Alloc K at 0x\%x\\n", ${K});
// K <- RQ(V)
${RENDER_RQK}
// alloc A
<%
sizeA = heads*dim_head*dim_head
%>
// RK <- RQ(K)
int8_t* ${RK} = deeploy_malloc(${sizeK});
//am_util_stdio_printf("Alloc RK at 0x\%x\\n", ${RK});
${RENDER_RQDelta}
int8_t* ${A} = (int8_t*)deeploy_malloc(sizeof(int8_t) * ${sizeA});
//am_util_stdio_printf("Alloc A at 0x\%x\\n", ${A});
// A <- RQS(KT x V)
// Headwise MMA
int8_t* OG_${A} = ${A};
int8_t* OG_${RK} = ${RK};
int8_t* OG_${V} = ${V};
for (int head=0; head<${heads}; head++){
${RENDER_MMA}
${A} += ${dim_head}*${dim_head};
${RK} += ${kv_shape[-1]}*${dim_head};
${V} += ${kv_shape[-1]}*${dim_head};
}
${A} = OG_${A};
${RK} = OG_${RK};
${V} = OG_${V};
//am_util_stdio_printf("Comp A\\n");
free(${RK});
//am_util_stdio_printf("Free RK at 0x\%x\\n", ${RK});
// alloc E
<%
sizeE = heads*dim_head
%>
int8_t* ${E} = deeploy_malloc(${sizeE});
//am_util_stdio_printf("Alloc E at 0x\%x\\n", ${E});
// E <- mean(K)
${RENDER_reduceMean}
//am_util_stdio_printf("Comp E\\n");
// free K
free(${K});
//am_util_stdio_printf("Free K at 0x\%x\\n", ${K});
// free V
free(${V});
//am_util_stdio_printf("Free V at 0x\%x\\n", ${V});
// alloc Q
<%
sizeQ = heads*dim_head*q_shape[2]
%>
int8_t* ${Q} = deeploy_malloc(${sizeQ});
//am_util_stdio_printf("Alloc Q at 0x\%x\\n", ${Q});
// Q <- q * WQ
${RENDER_convQ}
// alloc QT
int8_t* ${QT} = deeploy_malloc(${sizeQ});
//am_util_stdio_printf("Alloc QT at 0x\%x\\n", ${QT});
// transpose Q -> QT
${RENDER_TransposeQ}
// free Q
free(${Q});
//am_util_stdio_printf("Free Q at 0x\%x\\n", ${Q});
// alloc AA
<%
sizeAA = heads*dim_head*dim_head
%>
int32_t* ${AA} = (int32_t*)deeploy_malloc((sizeof(int32_t)) * ${sizeAA});
//am_util_stdio_printf("Alloc AA at 0x\%x\\n", ${AA});
// AA <- Q x A
int32_t* OG_${AA} = ${AA};
int8_t* OG_${QT} = ${QT};
for (int head=0; head<${heads}; head++){
MatMul_s8_s8_s32(${QT}, ${A}, ${AA}, ${q_shape[-1]}, ${dim_head}, ${dim_head});
${QT} += ${q_shape[-1]} * ${dim_head};
${A} += ${dim_head} * ${dim_head};
${AA} += ${q_shape[-1]} * ${dim_head};
}
${AA} = OG_${AA};
${A} = OG_${A};
${QT} = OG_${QT};
//am_util_stdio_printf("Comp AA\\n");
// free A
free(${A});
//am_util_stdio_printf("Free A at 0x\%x\\n", ${A});
 //am_util_delay_ms(5);
// alloc B
<%
sizeB = heads*dim_head
%>
int32_t* ${B} = (int32_t*)deeploy_malloc((sizeof(int32_t)) * ${sizeB});
//am_util_stdio_printf("Alloc B at 0x\%x\\n", ${B});
 //am_util_delay_ms(5);
// B <- Q x E
int8_t* OG_${E} = ${E};
int32_t* OG_${B} = ${B};
for (int head=0; head<${heads}; head++){
MatMul_s8_s8_s32(${QT}, ${E}, ${B}, ${q_shape[-1]}, ${dim_head}, 1);
${QT} += ${q_shape[-1]} * ${dim_head};
${E} += ${dim_head};
${B} += ${q_shape[-1]};
}
${E} = OG_${E};
${B} = OG_${B};
${QT} = OG_${QT};
//am_util_stdio_printf("QT: 0x%x \\n", ${QT});
//am_util_stdio_printf("Comp B \\n");
 //am_util_delay_ms(5);
// free o
free(${QT});
//am_util_stdio_printf("Free QT at 0x\%x\\n", ${QT});
 //am_util_delay_ms(5);
// free E
free(${E});
//am_util_stdio_printf("Free E at 0x\%x\\n", ${E});
 //am_util_delay_ms(5);
// alloc _o
<%
sizeO = sizeQ
%>
int8_t* ${O} = deeploy_malloc(${sizeO});
//am_util_stdio_printf("Alloc O at 0x\%x\\n", ${O});
RQDivKernel_s32_s8(${AA}, ${B}, ${sizeAA}, ${sizeB}, ${O}, ${Delta}, ${eps}, ${eta}, *${postattn_requant_mul}, *${postattn_requant_add}, *${postattn_requant_div});
//am_util_stdio_printf("Comp O\\n");
 //am_util_delay_ms(5);
// free AA
free(${AA});
//am_util_stdio_printf("Free AA at 0x\%x\\n", ${AA});
 //am_util_delay_ms(5);
// free B
free(${B});
//am_util_stdio_printf("Free B at 0x\%x\\n", ${B});
 //am_util_delay_ms(5);

// alloc OT
int8_t* ${OT} = deeploy_malloc(${sizeQ});
//am_util_stdio_printf("Alloc OT at 0x\%x\\n", ${OT});
 //am_util_delay_ms(5);
// transpose O -> OT
${RENDER_TransposeO}
// free O
free(${O});
//am_util_stdio_printf("Free O at 0x\%x\\n", ${O});
 //am_util_delay_ms(5);
// data_out <- o * WO
${RENDER_convO}
//am_util_stdio_printf("Comp Output \\n");
 //am_util_delay_ms(5);
// free o
free(${OT});
am_util_stdio_printf("Free OT at 0x\%x\\n", ${OT});
am_util_delay_ms(15);

""")
import numpy as np

from Deeploy.Targets.CortexM.DataTypes import cmsis_nn_activation, cmsis_nn_context, cmsis_nn_conv_params, \
    cmsis_nn_dims, cmsis_nn_fc_params, cmsis_nn_per_channel_quant_params, cmsis_nn_per_tensor_quant_params


def bindConvParams(ctxt, name, repName, batch, operatorRepresentation):
    """Hoist the CMSIS-NN convolution parameter structs into the global context.

    Builds cmsis_nn_context / conv_params / per-channel quant params and the
    input/filter/output/bias dimension structs for one convolution, registers
    them under ``{name}_*`` and records their names in the operator
    representation under ``{repName}_*`` keys.

    NOTE(review): ``nameList`` is collected here but NOT returned (unlike
    bindFCParams, which returns a 3-tuple ending in its nameList) — confirm
    whether callers need the hoisted-struct names.
    """

    nameList = []

    # Hoist the structs to the global ctxt

    # First the context
    # https://review.trustedfirmware.org/plugins/gitiles/mirror/ARM-software/CMSIS_5/+/refs/heads/bias_for_conv/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
    bufferSize = 2 * operatorRepresentation['ch_im_in'] * operatorRepresentation[
        'dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] * 2

    ctxtDict = {
        'buf': operatorRepresentation['ctxtBuffer'],  #f'{name}_ctxt_buffer',
        'size': bufferSize
    }

    nameList += [ctxt.hoistStruct(ctxtDict, f'{name}_ctxt', cmsis_nn_context)]
    operatorRepresentation[f'{repName}_ctxt'] = f'{name}_ctxt'

    # Next the conv params
    # stride
    strideDict = {
        'h': operatorRepresentation['stride_x'],
        'w': operatorRepresentation['stride_y'],
    }
    # padding
    paddingDict = {'h': operatorRepresentation['padding_x'], 'w': operatorRepresentation['padding_y']}
    # dilation
    dilationDict = {'h': operatorRepresentation['dilation_x'], 'w': operatorRepresentation['dilation_y']}
    # Symmetric clip range derived from the quantization level count.
    activationDict = {
        'min': -(operatorRepresentation['n_levels'] // 2),
        'max': (operatorRepresentation['n_levels'] // 2) - 1
    }

    # Signedness may come from the context buffers or be passed explicitly
    # in the representation (when no buffer is registered for the name).
    if 'data_in' in operatorRepresentation:
        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_in_signed = data_in._signed
    else:
        data_in_signed = operatorRepresentation['data_in_signed']

    if 'data_out' in operatorRepresentation:
        data_out = ctxt.lookup(operatorRepresentation['data_out'])
        data_out_signed = data_out._signed
    else:
        data_out_signed = operatorRepresentation['data_out_signed']

    assert data_in_signed is not None
    assert data_out_signed is not None

    # Unsigned tensors get a +/- n_levels/2 offset to recenter around zero.
    convParamsDict = {
        'input_offset': (data_in_signed == 0) * operatorRepresentation['n_levels'] // 2,
        'output_offset': -(data_out_signed == 0) * operatorRepresentation['n_levels'] // 2,
        'stride': strideDict,
        'padding': paddingDict,
        'dilation': dilationDict,
        'activation': activationDict,
    }
    nameList += [ctxt.hoistStruct(convParamsDict, f'{name}_conv_params', cmsis_nn_conv_params)]
    operatorRepresentation[f'{repName}_conv_params'] = ctxt.lookup(f'{name}_conv_params').name

    convQuantDict = {
        'multiplier': ctxt._mangle(operatorRepresentation['mul']),
        'shift': ctxt._mangle(operatorRepresentation['shift']),
    }
    nameList += [ctxt.hoistStruct(convQuantDict, f'{name}_quant_params', cmsis_nn_per_channel_quant_params)]
    operatorRepresentation[f'{repName}_quant_params'] = ctxt.lookup(f'{name}_quant_params').name

    inputDimsDict = {
        'n': batch,
        'h': operatorRepresentation['dim_im_in_x'],
        'w': operatorRepresentation['dim_im_in_y'],
        'c': operatorRepresentation['ch_im_in']
    }
    nameList += [ctxt.hoistStruct(inputDimsDict, f'{name}_input_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{repName}_input_dims'] = ctxt.lookup(f'{name}_input_dims').name

    filterDimsDict = {
        'n': operatorRepresentation['ch_im_out'],
        'h': operatorRepresentation['dim_kernel_x'],
        'w': operatorRepresentation['dim_kernel_y'],
        'c': operatorRepresentation['ch_im_in']
    }
    nameList += [ctxt.hoistStruct(filterDimsDict, f'{name}_filter_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{repName}_filter_dims'] = ctxt.lookup(f'{name}_filter_dims').name

    outputDimsDict = {
        'n': batch,
        'h': operatorRepresentation['dim_im_out_x'],
        'w': operatorRepresentation['dim_im_out_y'],
        'c': operatorRepresentation['ch_im_out']
    }
    nameList += [ctxt.hoistStruct(outputDimsDict, f'{name}_output_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{repName}_output_dims'] = ctxt.lookup(f'{name}_output_dims').name

    biasDimsDict = {
        'n': 1,
        'h': 1,
        'w': 1,
        'c': operatorRepresentation['ch_im_out'],
    }
    nameList += [ctxt.hoistStruct(biasDimsDict, f'{name}_bias_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{repName}_bias_dims'] = ctxt.lookup(f'{name}_bias_dims').name

    return ctxt, operatorRepresentation


def bindFCParams(ctxt,
                 name,
                 mul,
                 shift,
                 data_in,
                 weight,
                 operatorRepresentation,
                 operatorRepresentationPrefix = '',
                 bias = True):
    """Hoist the CMSIS-NN fully-connected parameter structs into the global context.

    Mirrors bindConvParams for the FC case: builds context, fc_params,
    per-tensor quant params and the dimension structs, registering them under
    ``{name}_*`` and recording their names under ``{operatorRepresentationPrefix}*``
    keys. Scalar ``mul``/``shift`` may be passed either as values or as names
    of scalar buffers in the context (which are then marked _deploy = False).

    Returns (ctxt, operatorRepresentation, nameList).
    """

    nameList = []

    operatorRepresentation['in_N'] = operatorRepresentation['M']
    operatorRepresentation['in_C'] = operatorRepresentation['N']
    operatorRepresentation['weight_N'] = operatorRepresentation['N']
    operatorRepresentation['weight_C'] = operatorRepresentation['O']

    # FC needs no scratch buffer, so the context is empty.
    ctxtDict = {'buf': None, 'size': 0}

    nameList += [ctxt.hoistStruct(ctxtDict, f'{name}_ctxt', cmsis_nn_context)]
    operatorRepresentation[f'{operatorRepresentationPrefix}ctxt'] = f'{name}_ctxt'

    # activation
    activationDict = {
        'min': -(operatorRepresentation['n_levels'] // 2),
        'max': (operatorRepresentation['n_levels'] // 2) - 1
    }
    nameList += [ctxt.hoistStruct(activationDict, f'{name}_activation', cmsis_nn_activation)]

    data_out = ctxt.lookup(operatorRepresentation['data_out'])

    # SCHEREMO: Workaround for MHSA:
    if not hasattr(data_in, '_signed') or not hasattr(data_out, '_signed'):

        fcParamsDict = {
            'input_offset': 0,
            'output_offset': 0,
            'filter_offset': 0,
            'activation': activationDict,
        }

    else:

        fcParamsDict = {
            'input_offset': (data_in._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'output_offset': -(data_out._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'filter_offset': 0,
            'activation': activationDict,
        }

    nameList += [ctxt.hoistStruct(fcParamsDict, f'{name}_fc_params', cmsis_nn_fc_params)]
    operatorRepresentation[f'{operatorRepresentationPrefix}fc_params'] = ctxt.lookup(f'{name}_fc_params').name

    # Resolve mul/shift: either a scalar buffer name in the context or a value.
    if isinstance(mul, str):
        __mul = ctxt.lookup(mul).values
        assert np.ndim(__mul) == 0, "Mul is not scalar!"
        _mul = __mul.item()
        ctxt.lookup(mul)._deploy = False
    else:
        _mul = mul

    if isinstance(shift, str):
        __shift = ctxt.lookup(shift).values
        assert np.ndim(__shift) == 0, "Shift is not scalar!"
        _shift = __shift.item()
        ctxt.lookup(shift)._deploy = False
    else:
        _shift = shift

    gemmQuantDict = {'multiplier': _mul, 'shift': _shift}

    nameList += [ctxt.hoistStruct(gemmQuantDict, f'{name}_quant_params', cmsis_nn_per_tensor_quant_params)]
    operatorRepresentation[f'{operatorRepresentationPrefix}quant_params'] = ctxt.lookup(f'{name}_quant_params').name

    inputDimsDict = {
        'n': operatorRepresentation['in_N'],
        'h': 1,
        'w': 1,
        'c': operatorRepresentation['in_C'],
    }
    nameList += [ctxt.hoistStruct(inputDimsDict, f'{name}_input_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{operatorRepresentationPrefix}input_dims'] = ctxt.lookup(f'{name}_input_dims').name

    filterDimsDict = {'n': operatorRepresentation['weight_N'], 'h': 1, 'w': 1, 'c': operatorRepresentation['weight_C']}
    nameList += [ctxt.hoistStruct(filterDimsDict, f'{name}_filter_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{operatorRepresentationPrefix}filter_dims'] = ctxt.lookup(f'{name}_filter_dims').name

    outputDimsDict = {'n': operatorRepresentation['in_N'], 'h': 1, 'w': 1, 'c': operatorRepresentation['weight_C']}
    nameList += [ctxt.hoistStruct(outputDimsDict, f'{name}_output_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{operatorRepresentationPrefix}output_dims'] = ctxt.lookup(f'{name}_output_dims').name

    # Multiplying by the bool ``bias`` zeroes the channel count when no bias
    # is present.
    biasDimsDict = {
        'n': 1,
        'h': 1,
        'w': 1,
        'c': operatorRepresentation['weight_C'] * bias,
    }
    nameList += [ctxt.hoistStruct(biasDimsDict, f'{name}_bias_dims', cmsis_nn_dims)]
    operatorRepresentation[f'{operatorRepresentationPrefix}bias_dims'] = ctxt.lookup(f'{name}_bias_dims').name

    return ctxt, operatorRepresentation, nameList
mode 100644 index 0000000..5743fc9 --- /dev/null +++ b/Deeploy/Targets/CortexM/Templates/ConvTemplate.py @@ -0,0 +1,271 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTemplate.py +# +# Last edited: 17.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple, Union

from ortools.constraint_solver.pywrapcp import IntVar

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
from Deeploy.Targets.CortexM.DataTypes import cmsis_nn_context, cmsis_nn_conv_params, cmsis_nn_dims, \
    cmsis_nn_per_channel_quant_params


class _Conv2D_8_Template(NodeTemplate):
    """NodeTemplate for an 8-bit CMSIS-NN 2D convolution (arm_convolve_wrapper_s8).

    alignToContext hoists all CMSIS-NN parameter structs into the global
    context; hoistTransientBuffers allocates the scratch buffer the CMSIS
    convolution kernel requires.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist context, conv params, quant params and dimension structs.

        Struct names are derived from the output tensor name and recorded in
        the operator representation for the template to reference. Returns
        (ctxt, operatorRepresentation, list-of-hoisted-names).
        """

        nameList = []

        data_out_name = operatorRepresentation['data_out']
        ctxtDict = {'buf': operatorRepresentation['ctxtBuffer'], 'size': operatorRepresentation['ctxtBufferSize']}
        nameList += [ctxt.hoistStruct(ctxtDict, f'{data_out_name}_ctxt', cmsis_nn_context)]
        operatorRepresentation['ctxt'] = f'{data_out_name}_ctxt'

        strideDict = {
            'h': operatorRepresentation['stride_x'],
            'w': operatorRepresentation['stride_y'],
        }
        paddingDict = {'h': operatorRepresentation['padding_x'], 'w': operatorRepresentation['padding_y']}
        dilationDict = {'h': operatorRepresentation['dilation_x'], 'w': operatorRepresentation['dilation_y']}
        # Symmetric clip range from the quantization level count.
        activationDict = {
            'min': -(operatorRepresentation['n_levels'] // 2),
            'max': (operatorRepresentation['n_levels'] // 2) - 1
        }

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        assert data_in._signed is not None
        assert data_out._signed is not None

        # Unsigned tensors get a +/- n_levels/2 offset to recenter around zero.
        convParamsDict = cmsis_nn_conv_params({
            'input_offset': (data_in._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'output_offset': -(data_out._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'stride': strideDict,
            'padding': paddingDict,
            'dilation': dilationDict,
            'activation': activationDict,
        })
        nameList += [ctxt.hoistStruct(convParamsDict, f'{data_out_name}_conv_params', cmsis_nn_conv_params)]
        operatorRepresentation['conv_params'] = ctxt.lookup(f'{data_out_name}_conv_params').name

        convQuantDict = cmsis_nn_per_channel_quant_params(
            {
                'multiplier': operatorRepresentation['mul'],
                'shift': operatorRepresentation['shift'],
            }, ctxt)

        nameList += [
            ctxt.hoistStruct(convQuantDict, f'{data_out_name}_quant_params', cmsis_nn_per_channel_quant_params)
        ]
        operatorRepresentation['quant_params'] = ctxt.lookup(f'{data_out_name}_quant_params').name

        inputDimsDict = cmsis_nn_dims({
            'n': data_in.shape[0],
            'h': operatorRepresentation['dim_im_in_x'],
            'w': operatorRepresentation['dim_im_in_y'],
            'c': operatorRepresentation['ch_im_in']
        })
        nameList += [ctxt.hoistStruct(inputDimsDict, f'{data_out_name}_input_dims', cmsis_nn_dims)]
        operatorRepresentation['input_dims'] = ctxt.lookup(f'{data_out_name}_input_dims').name

        filterDimsDict = cmsis_nn_dims({
            'n': operatorRepresentation['ch_im_out'],
            'h': operatorRepresentation['dim_kernel_x'],
            'w': operatorRepresentation['dim_kernel_y'],
            'c': operatorRepresentation['ch_im_in']
        })
        nameList += [ctxt.hoistStruct(filterDimsDict, f'{data_out_name}_filter_dims', cmsis_nn_dims)]
        operatorRepresentation['filter_dims'] = ctxt.lookup(f'{data_out_name}_filter_dims').name

        outputDimsDict = cmsis_nn_dims({
            'n': data_in.shape[0],
            'h': operatorRepresentation['dim_im_out_x'],
            'w': operatorRepresentation['dim_im_out_y'],
            'c': operatorRepresentation['ch_im_out']
        })
        nameList += [ctxt.hoistStruct(outputDimsDict, f'{data_out_name}_output_dims', cmsis_nn_dims)]
        operatorRepresentation['output_dims'] = ctxt.lookup(f'{data_out_name}_output_dims').name

        biasDimsDict = cmsis_nn_dims({
            'n': 1,
            'h': 1,
            'w': 1,
            'c': operatorRepresentation['ch_im_out'],
        })
        nameList += [ctxt.hoistStruct(biasDimsDict, f'{data_out_name}_bias_dims', cmsis_nn_dims)]
        operatorRepresentation['bias_dims'] = ctxt.lookup(f'{data_out_name}_bias_dims').name

        return ctxt, operatorRepresentation, nameList

    def computeTransientBuffersSize(
            self, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
        """Return [(buffer name, byte size)] for the CMSIS conv scratch buffer.

        Size formula follows the arm_convolve_s8 implementation (see link in
        hoistTransientBuffers): 2 * ch_im_in * kernel_x * kernel_y * 2 bytes.
        """
        size = 2 * operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation[
            'dim_kernel_y'] * 2
        name = operatorRepresentation['nodeName'] + f"_buffer"
        return [(name, size)]

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Allocate the scratch buffer and record its name/size in the rep."""
        # SCHEREMO: Hoist transient buffer
        # https://review.trustedfirmware.org/plugins/gitiles/mirror/ARM-software/CMSIS_5/+/refs/heads/bias_for_conv/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c

        name, size = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
        ctxt.hoistTransientBuffer(name, size)
        operatorRepresentation['ctxtBuffer'] = name
        operatorRepresentation['ctxtBufferSize'] = size
        return ctxt, operatorRepresentation, [name]


cmsis2D_8_Template = _Conv2D_8_Template("\
arm_convolve_wrapper_s8(&${ctxt}, &${conv_params}, &${quant_params}, &${input_dims}, ${data_in}, &${filter_dims}, ${weight}, &${bias_dims}, ${add}, &${output_dims}, ${data_out}); \n\
")
operatorRepresentation['padding_y']} + dilationDict = {'h': 1, 'w': operatorRepresentation['dilation_y']} + activationDict = { + 'min': -(operatorRepresentation['n_levels'] // 2), + 'max': (operatorRepresentation['n_levels'] // 2) - 1 + } + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + assert data_in._signed is not None + assert data_out._signed is not None + + convParamsDict = { + 'input_offset': (data_in._signed == 0) * operatorRepresentation['n_levels'] // 2, + 'output_offset': -(data_out._signed == 0) * operatorRepresentation['n_levels'] // 2, + 'stride': strideDict, + 'padding': paddingDict, + 'dilation': dilationDict, + 'activation': activationDict, + } + nameList += [ctxt.hoistStruct(convParamsDict, f'{data_out_name}_conv_params', cmsis_nn_conv_params)] + operatorRepresentation['conv_params'] = ctxt.lookup(f'{data_out_name}_conv_params').name + + convQuantDict = { + 'multiplier': operatorRepresentation['mul'], + 'shift': operatorRepresentation['shift'], + } + nameList += [ + ctxt.hoistStruct(convQuantDict, f'{data_out_name}_quant_params', cmsis_nn_per_channel_quant_params) + ] + operatorRepresentation['quant_params'] = ctxt.lookup(f'{data_out_name}_quant_params').name + + inputDimsDict = { + 'n': data_in.shape[0], + 'h': 1, + 'w': operatorRepresentation['dim_im_in_y'], + 'c': operatorRepresentation['ch_im_in'] + } + nameList += [ctxt.hoistStruct(inputDimsDict, f'{data_out_name}_input_dims', cmsis_nn_dims)] + operatorRepresentation['input_dims'] = ctxt.lookup(f'{data_out_name}_input_dims').name + + filterDimsDict = { + 'n': operatorRepresentation['ch_im_out'], + 'h': 1, + 'w': operatorRepresentation['dim_kernel_y'], + 'c': operatorRepresentation['ch_im_in'] + } + nameList += [ctxt.hoistStruct(filterDimsDict, f'{data_out_name}_filter_dims', cmsis_nn_dims)] + operatorRepresentation['filter_dims'] = ctxt.lookup(f'{data_out_name}_filter_dims').name + + outputDimsDict = { + 'n': 
data_in.shape[0], + 'h': 1, + 'w': operatorRepresentation['dim_im_out_y'], + 'c': operatorRepresentation['ch_im_out'] + } + nameList += [ctxt.hoistStruct(outputDimsDict, f'{data_out_name}_output_dims', cmsis_nn_dims)] + operatorRepresentation['output_dims'] = ctxt.lookup(f'{data_out_name}_output_dims').name + + biasDimsDict = { + 'n': 1, + 'h': 1, + 'w': 1, + 'c': operatorRepresentation['ch_im_out'], + } + nameList += [ctxt.hoistStruct(biasDimsDict, f'{data_out_name}_bias_dims', cmsis_nn_dims)] + operatorRepresentation['bias_dims'] = ctxt.lookup(f'{data_out_name}_bias_dims').name + + return ctxt, operatorRepresentation, nameList + + def computeTransientBuffersSize( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + size = 2 * operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_y'] * 2 + name = operatorRepresentation['nodeName'] + f"_buffer" + return [(name, size)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # SCHEREMO: Hoist transient buffer + # https://review.trustedfirmware.org/plugins/gitiles/mirror/ARM-software/CMSIS_5/+/refs/heads/bias_for_conv/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c + + nameList = [] + name, size = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] + nameList += [ctxt.hoistTransientBuffer(name, size)] + operatorRepresentation['ctxtBuffer'] = name + operatorRepresentation['ctxtBufferSize'] = size + return ctxt, operatorRepresentation, nameList + + +cmsis1D_16_Template = _Conv1D_16_Template(""" +arm_convolve_wrapper_s16(&${ctxt}, &${conv_params}, &${quant_params}, &${input_dims}, ${data_in}, &${filter_dims}, ${weight}, &${bias_dims}, ${add}, &${output_dims}, ${data_out}); +""") + +cmsis1D_8_Template = _Conv1D_16_Template(""" +arm_convolve_wrapper_s8(&${ctxt}, &${conv_params}, &${quant_params}, &${input_dims}, 
${data_in}, &${filter_dims}, ${weight}, &${bias_dims}, ${add}, &${output_dims}, ${data_out}); +""") diff --git a/Deeploy/Targets/CortexM/Templates/DWConvTemplate.py b/Deeploy/Targets/CortexM/Templates/DWConvTemplate.py new file mode 100644 index 0000000..056ef4d --- /dev/null +++ b/Deeploy/Targets/CortexM/Templates/DWConvTemplate.py @@ -0,0 +1,301 @@ +# ---------------------------------------------------------------------- +# +# File: DWConvTemplate.py +# +# Last edited: 04.01.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
from Deeploy.Targets.CortexM.DataTypes import cmsis_nn_activation, cmsis_nn_context, cmsis_nn_dims, \
    cmsis_nn_dw_conv_params, cmsis_nn_per_channel_quant_params, cmsis_nn_tile


class _Conv2DDW_8_Template(NodeTemplate):
    """Template for an 8-bit 2D depthwise convolution lowered to CMSIS-NN's
    arm_depthwise_conv_wrapper_s8; batches are handled by the C loop in the
    template string.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the CMSIS-NN argument structs for this depthwise convolution.

        Returns the updated context, the updated operator representation and
        the list of names hoisted into the context.
        """

        nameList = []

        data_out_name = operatorRepresentation['data_out']

        # Scratch-space context; the buffer itself was hoisted in
        # hoistTransientBuffers.
        ctxtDict = {'buf': operatorRepresentation['ctxtBuffer'], 'size': operatorRepresentation['ctxtBufferSize']}
        nameList += [ctxt.hoistStruct(ctxtDict, f'{data_out_name}_ctxt', cmsis_nn_context)]
        operatorRepresentation['ctxt'] = f'{data_out_name}_ctxt'

        # Deeploy's x dimension maps to CMSIS-NN's h, y to w. The tile
        # structs are hoisted individually in addition to being embedded in
        # the dw_conv_params struct below.
        strideDict = {'h': operatorRepresentation['stride_x'], 'w': operatorRepresentation['stride_y']}
        nameList += [ctxt.hoistStruct(strideDict, f'{data_out_name}_stride', cmsis_nn_tile)]
        paddingDict = {'h': operatorRepresentation['padding_x'], 'w': operatorRepresentation['padding_y']}
        nameList += [ctxt.hoistStruct(paddingDict, f'{data_out_name}_padding', cmsis_nn_tile)]
        dilationDict = {'h': operatorRepresentation['dilation_x'], 'w': operatorRepresentation['dilation_y']}
        nameList += [ctxt.hoistStruct(dilationDict, f'{data_out_name}_dilation', cmsis_nn_tile)]
        # Clip outputs to the signed n_levels range, e.g. [-128, 127].
        activationDict = {
            'min': -(operatorRepresentation['n_levels'] // 2),
            'max': (operatorRepresentation['n_levels'] // 2) - 1
        }
        nameList += [ctxt.hoistStruct(activationDict, f'{data_out_name}_activation', cmsis_nn_activation)]

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        # Consistency with ConvTemplate: the offsets below read _signed, so
        # fail fast if an earlier pass did not annotate it.
        assert data_in._signed is not None
        assert data_out._signed is not None

        convParamsDict = {
            'input_offset': (data_in._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'output_offset': -(data_out._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'ch_mult': 1,  # depthwise: one filter per input channel
            'stride': strideDict,
            'padding': paddingDict,
            'dilation': dilationDict,
            'activation': activationDict,
        }
        nameList += [ctxt.hoistStruct(convParamsDict, f'{data_out_name}_dw_conv_params', cmsis_nn_dw_conv_params)]
        operatorRepresentation['dw_conv_params'] = ctxt.lookup(f'{data_out_name}_dw_conv_params').name

        # Per-output-channel requantization multipliers and shifts.
        convQuantDict = {
            'multiplier': operatorRepresentation['mul'],
            'shift': operatorRepresentation['shift'],
        }
        nameList += [
            ctxt.hoistStruct(convQuantDict, f'{data_out_name}_quant_params', cmsis_nn_per_channel_quant_params)
        ]
        operatorRepresentation['quant_params'] = ctxt.lookup(f'{data_out_name}_quant_params').name

        # n is fixed to 1: the template's C loop iterates over the batch.
        inputDimsDict = {
            'n': 1,
            'h': operatorRepresentation['dim_im_in_x'],
            'w': operatorRepresentation['dim_im_in_y'],
            'c': operatorRepresentation['ch_im_in']
        }
        nameList += [ctxt.hoistStruct(inputDimsDict, f'{data_out_name}_input_dims', cmsis_nn_dims)]
        operatorRepresentation['input_dims'] = ctxt.lookup(f'{data_out_name}_input_dims').name

        # Depthwise filter layout: channels carry the output channels.
        filterDimsDict = {
            'n': 1,
            'h': operatorRepresentation['dim_kernel_x'],
            'w': operatorRepresentation['dim_kernel_y'],
            'c': operatorRepresentation['ch_im_out']
        }
        nameList += [ctxt.hoistStruct(filterDimsDict, f'{data_out_name}_filter_dims', cmsis_nn_dims)]
        operatorRepresentation['filter_dims'] = ctxt.lookup(f'{data_out_name}_filter_dims').name

        outputDimsDict = {
            'n': 1,
            'h': operatorRepresentation['dim_im_out_x'],
            'w': operatorRepresentation['dim_im_out_y'],
            'c': operatorRepresentation['ch_im_out']
        }
        nameList += [ctxt.hoistStruct(outputDimsDict, f'{data_out_name}_output_dims', cmsis_nn_dims)]
        operatorRepresentation['output_dims'] = ctxt.lookup(f'{data_out_name}_output_dims').name

        # Bias is a per-output-channel vector.
        biasDimsDict = {
            'n': 1,
            'h': 1,
            'w': 1,
            'c': operatorRepresentation['ch_im_out'],
        }
        nameList += [ctxt.hoistStruct(biasDimsDict, f'{data_out_name}_bias_dims', cmsis_nn_dims)]
        operatorRepresentation['bias_dims'] = ctxt.lookup(f'{data_out_name}_bias_dims').name

        return ctxt, operatorRepresentation, nameList

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the CMSIS-NN scratch buffer into the context.

        SCHEREMO: Hoist transient buffer
        https://review.trustedfirmware.org/plugins/gitiles/mirror/ARM-software/CMSIS_5/+/refs/heads/bias_for_conv/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c

        NOTE(review): unlike the ConvTemplate classes, the size is computed
        inline here instead of in a computeTransientBuffersSize override —
        confirm whether tiling machinery relies on that method.
        """
        # 2 * ch * kx * ky int16 entries (trailing * 2 converts to bytes),
        # plus 4 bytes of slack.
        bufferSize = 2 * operatorRepresentation['ch_im_in'] * operatorRepresentation[
            'dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] * 2 + 4

        name = operatorRepresentation['nodeName'] + "_buffer"
        ctxt.hoistTransientBuffer(name, bufferSize)
        operatorRepresentation['ctxtBuffer'] = name
        operatorRepresentation['ctxtBufferSize'] = bufferSize
        return ctxt, operatorRepresentation, [name]


conv2D_8_Template = _Conv2DDW_8_Template("""
<%
batchSizeIn = ch_im_in * dim_im_in_x * dim_im_in_y
batchSizeOut = ch_im_out * dim_im_out_x * dim_im_out_y
%>
for(int b=0; b<${batch}; b++){
arm_depthwise_conv_wrapper_s8(&${ctxt}, &${dw_conv_params}, &${quant_params}, &${input_dims}, (${data_in} + b*${batchSizeIn}), &${filter_dims}, ${weight}, &${bias_dims}, ${add}, &${output_dims}, (${data_out} + b*${batchSizeOut}));
}
""")


class _Conv1DDW_16_Template(NodeTemplate):
    """Template for a 1D depthwise convolution lowered to CMSIS-NN by fixing
    the h dimension to 1 (stride 1, padding 0, dilation 1).

    NOTE(review): despite its name, this class is also used for the s8
    variant below; the hoisted structs are bit-width independent.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the CMSIS-NN argument structs for this 1D depthwise conv.

        SCHEREMO: layout reference
        https://review.trustedfirmware.org/plugins/gitiles/mirror/ARM-software/CMSIS_5/+/refs/heads/bias_for_conv/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
        """

        nameList = []

        data_out_name = operatorRepresentation['data_out']

        # Scratch-space context; the buffer itself was hoisted in
        # hoistTransientBuffers.
        ctxtDict = {
            'buf': operatorRepresentation['ctxtBuffer'],
            'size': operatorRepresentation['ctxtBufferSize']
        }
        nameList += [ctxt.hoistStruct(ctxtDict, f'{data_out_name}_ctxt', cmsis_nn_context)]
        operatorRepresentation['ctxt'] = f'{data_out_name}_ctxt'

        # 1D conv: h is a dummy dimension (stride 1, padding 0, dilation 1).
        strideDict = {'h': 1, 'w': operatorRepresentation['stride_y']}
        paddingDict = {'h': 0, 'w': operatorRepresentation['padding_y']}
        dilationDict = {'h': 1, 'w': operatorRepresentation['dilation_y']}
        activationDict = {
            'min': -(operatorRepresentation['n_levels'] // 2),
            'max': (operatorRepresentation['n_levels'] // 2) - 1
        }

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        # Consistency with ConvTemplate: fail fast on missing _signed
        # annotation instead of silently computing wrong offsets.
        assert data_in._signed is not None
        assert data_out._signed is not None

        convParamsDict = {
            'input_offset': (data_in._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'output_offset': -(data_out._signed == 0) * operatorRepresentation['n_levels'] // 2,
            'ch_mult': 1,  # depthwise: one filter per input channel
            'stride': strideDict,
            'padding': paddingDict,
            'dilation': dilationDict,
            'activation': activationDict,
        }
        nameList += [ctxt.hoistStruct(convParamsDict, f'{data_out_name}_dw_conv_params', cmsis_nn_dw_conv_params)]
        operatorRepresentation['dw_conv_params'] = ctxt.lookup(f'{data_out_name}_dw_conv_params').name

        convQuantDict = {
            'multiplier': operatorRepresentation['mul'],
            'shift': operatorRepresentation['shift'],
        }
        nameList += [
            ctxt.hoistStruct(convQuantDict, f'{data_out_name}_quant_params', cmsis_nn_per_channel_quant_params)
        ]
        operatorRepresentation['quant_params'] = ctxt.lookup(f'{data_out_name}_quant_params').name

        inputDimsDict = {
            'n': data_in.shape[0],
            'h': 1,
            'w': operatorRepresentation['dim_im_in_y'],
            'c': operatorRepresentation['ch_im_in']
        }
        nameList += [ctxt.hoistStruct(inputDimsDict, f'{data_out_name}_input_dims', cmsis_nn_dims)]
        operatorRepresentation['input_dims'] = ctxt.lookup(f'{data_out_name}_input_dims').name

        filterDimsDict = {
            'n': 1,
            'h': 1,
            'w': operatorRepresentation['dim_kernel_y'],
            'c': operatorRepresentation['ch_im_out']
        }
        nameList += [ctxt.hoistStruct(filterDimsDict, f'{data_out_name}_filter_dims', cmsis_nn_dims)]
        operatorRepresentation['filter_dims'] = ctxt.lookup(f'{data_out_name}_filter_dims').name

        outputDimsDict = {
            'n': data_in.shape[0],
            'h': 1,
            'w': operatorRepresentation['dim_im_out_y'],
            'c': operatorRepresentation['ch_im_out']
        }
        nameList += [ctxt.hoistStruct(outputDimsDict, f'{data_out_name}_output_dims', cmsis_nn_dims)]
        operatorRepresentation['output_dims'] = ctxt.lookup(f'{data_out_name}_output_dims').name

        biasDimsDict = {
            'n': 1,
            'h': 1,
            'w': 1,
            'c': operatorRepresentation['ch_im_out'],
        }
        nameList += [ctxt.hoistStruct(biasDimsDict, f'{data_out_name}_bias_dims', cmsis_nn_dims)]
        operatorRepresentation['bias_dims'] = ctxt.lookup(f'{data_out_name}_bias_dims').name

        return ctxt, operatorRepresentation, nameList

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the CMSIS-NN scratch buffer into the context.

        SCHEREMO: Hoist transient buffer
        https://review.trustedfirmware.org/plugins/gitiles/mirror/ARM-software/CMSIS_5/+/refs/heads/bias_for_conv/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
        """
        # 2 * ch * ky int16 entries; trailing * 2 converts to bytes.
        size = 2 * operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_y'] * 2
        name = operatorRepresentation['nodeName'] + "_buffer"
        ctxt.hoistTransientBuffer(name, size)
        operatorRepresentation['ctxtBuffer'] = name
        operatorRepresentation['ctxtBufferSize'] = size
        return ctxt, operatorRepresentation, [name]


conv1D_16_Template = _Conv1DDW_16_Template("""
<%
batchSizeIn = ch_im_in * dim_im_in_y
batchSizeOut = ch_im_out * dim_im_out_y
%>
for(int b=0; b<${batch}; b++){
arm_depthwise_conv_s16(&${ctxt}, &${dw_conv_params}, &${quant_params}, &${input_dims}, (${data_in} + b*${batchSizeIn}), &${filter_dims}, ${weight}, &${bias_dims}, ${add}, &${output_dims}, (${data_out} + b*${batchSizeOut}));
}
""")

# NOTE(review): this 1D template computes batch strides from the x AND y
# image dimensions (dim_im_in_x / dim_im_out_x), unlike conv1D_16_Template
# above which uses only y — confirm dim_im_*_x is populated (and equals 1)
# for 1D operators, otherwise this renders a wrong stride or fails.
conv1D_8_Template = _Conv1DDW_16_Template("""
<%
batchSizeIn = ch_im_in * dim_im_in_x * dim_im_in_y
batchSizeOut = ch_im_out * dim_im_out_x * dim_im_out_y
%>
for(int b=0; b<${batch}; b++){
arm_depthwise_conv_wrapper_s8(&${ctxt}, &${dw_conv_params}, &${quant_params}, &${input_dims}, (${data_in} + b*${batchSizeIn}), &${filter_dims}, ${weight}, &${bias_dims}, ${add}, &${output_dims}, (${data_out} + b*${batchSizeOut}));
}
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation

from .CMSISUtils import bindFCParams


class _GEMM_8_Template(NodeTemplate):
    """Template for a batched 8-bit GEMM lowered to arm_fully_connected_s8.

    All struct hoisting is delegated to bindFCParams; the generated C code
    advances the A, B and output pointers once per batch iteration.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the CMSIS-NN FC parameter structs for this GEMM."""
        activations = ctxt.lookup(operatorRepresentation['A'])
        weights = ctxt.lookup(operatorRepresentation['B'])

        ctxt, operatorRepresentation, hoistedNames = bindFCParams(ctxt, operatorRepresentation['data_out'],
                                                                  operatorRepresentation['mul'],
                                                                  operatorRepresentation['shift'], activations,
                                                                  weights, operatorRepresentation)

        return ctxt, operatorRepresentation, hoistedNames


Linear_8_Template = _GEMM_8_Template("""
// GEMM
int8_t* ref_${data_out}_${A} = ${A};
int8_t* ref_${data_out}_${B} = ${B};
int8_t* ref_${data_out}_${data_out} = ${data_out};
for(int i=0;i<${batch};i++){
    arm_fully_connected_s8(&${ctxt}, &${fc_params}, &${quant_params}, &${input_dims}, ref_${data_out}_${A}, &${filter_dims}, ref_${data_out}_${B}, &${bias_dims}, ${C}, &${output_dims}, ref_${data_out}_${data_out});
    ref_${data_out}_${A} += ${M} * ${N};
    ref_${data_out}_${B} += ${N} * ${O};
    ref_${data_out}_${data_out} += ${M} * ${O};
}
""")


class _GEMM_16_Template(NodeTemplate):
    """Template for a 16-bit fully-connected layer (arm_fully_connected_s16).

    Struct hoisting is identical to _GEMM_8_Template and delegated to
    bindFCParams.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the CMSIS-NN FC parameter structs for this FC layer."""
        activations = ctxt.lookup(operatorRepresentation['A'])
        weights = ctxt.lookup(operatorRepresentation['B'])

        ctxt, operatorRepresentation, hoistedNames = bindFCParams(ctxt, operatorRepresentation['data_out'],
                                                                  operatorRepresentation['mul'],
                                                                  operatorRepresentation['shift'], activations,
                                                                  weights, operatorRepresentation)

        return ctxt, operatorRepresentation, hoistedNames


Linear_16_Template = _GEMM_16_Template("""
// FC
arm_fully_connected_s16(&${ctxt}, &${fc_params}, &${quant_params}, &${input_dims}, ${A}, &${filter_dims}, ${B}, &${bias_dims}, ${C}, &${output_dims}, ${data_out});
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _LinearAttentionTemplate(NodeTemplate):
    """Placeholder template for the linear attention operator; the generated
    code is a stub comment only."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """No structs to hoist for the placeholder.

        Fix: every other template's alignToContext in this target returns a
        three-element tuple (ctxt, operatorRepresentation, nameList);
        returning only two elements here would raise ValueError in callers
        that unpack three. Nothing is hoisted, so the name list is empty.
        """
        return ctxt, operatorRepresentation, []


referenceTemplate = _LinearAttentionTemplate("""
// PLACEHOLDER LINEAR ATTENTION
""")
from typing import Dict, List, Tuple

import numpy as np

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation

from .CMSISUtils import bindFCParams


class _MHSATemplate(NodeTemplate):
    # Template for a full multi-head self-attention block lowered to a
    # sequence of CMSIS-NN arm_fully_connected_s8 calls (see the generated C
    # below): Q/K/V projections, Q*K^T, integer softmax, attention*V and the
    # output projection.

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Bind the FC parameter structs for all six matmuls of the block.

        NOTE(review): bindFCParams is unpacked into two values throughout
        this method, but GEMMTemplate.py unpacks three
        (ctxt, operatorRepresentation, nameList) from the same helper —
        confirm the helper's return arity.
        """
        # s: sequence length taken from the query tensor's second dimension.
        s = ctxt.lookup(operatorRepresentation['q']).shape[1]

        data_in = ctxt.lookup(operatorRepresentation['q'])
        bias = ctxt.lookup(operatorRepresentation['wq_bias'])
        weight = ctxt.lookup(operatorRepresentation['wq_weight'])

        # Q FC layer:
        # bias is only bound when the bias tensor has more than one element.
        ctxt, operatorRepresentation = bindFCParams(ctxt,
                                                    operatorRepresentation['data_out'] + "_wq",
                                                    operatorRepresentation['wq_requant_mul'],
                                                    operatorRepresentation['wq_requant_shift'],
                                                    data_in,
                                                    weight,
                                                    operatorRepresentation,
                                                    "wq_",
                                                    bias = (np.prod(bias.shape) > 1))

        # K FC layer:
        data_in = ctxt.lookup(operatorRepresentation['k'])
        bias = ctxt.lookup(operatorRepresentation['wk_bias'])
        weight = ctxt.lookup(operatorRepresentation['wk_weight'])

        ctxt, operatorRepresentation = bindFCParams(ctxt,
                                                    operatorRepresentation['data_out'] + "_wk",
                                                    operatorRepresentation['wk_requant_mul'],
                                                    operatorRepresentation['wk_requant_shift'],
                                                    data_in,
                                                    weight,
                                                    operatorRepresentation,
                                                    "wk_",
                                                    bias = (np.prod(bias.shape) > 1))

        # V FC layer:
        data_in = ctxt.lookup(operatorRepresentation['v'])
        bias = ctxt.lookup(operatorRepresentation['wv_bias'])
        weight = ctxt.lookup(operatorRepresentation['wv_weight'])

        ctxt, operatorRepresentation = bindFCParams(ctxt,
                                                    operatorRepresentation['data_out'] + "_wv",
                                                    operatorRepresentation['wv_requant_mul'],
                                                    operatorRepresentation['wv_requant_shift'],
                                                    data_in,
                                                    weight,
                                                    operatorRepresentation,
                                                    "wv_",
                                                    bias = (np.prod(bias.shape) > 1))

        # Output projection: the input of wo is the concatenated head output,
        # described by a dummy VariableBuffer (presumably never materialized
        # in the context — TODO confirm).
        new_shape = (1, data_in.shape[1], operatorRepresentation['heads'] * operatorRepresentation['dim_head'])
        data_in = ctxt.VariableBuffer(name = 'data_in', shape = new_shape, nLevels = operatorRepresentation['n_levels'])
        data_in._signed = True
        bias = ctxt.lookup(operatorRepresentation['wo_bias'])
        weight = ctxt.lookup(operatorRepresentation['wo_weight'])

        ctxt, operatorRepresentation = bindFCParams(ctxt,
                                                    operatorRepresentation['data_out'] + "_wo",
                                                    operatorRepresentation['wo_requant_mul'],
                                                    operatorRepresentation['wo_requant_shift'],
                                                    data_in,
                                                    weight,
                                                    operatorRepresentation,
                                                    "wo_",
                                                    bias = (np.prod(bias.shape) > 1))

        # Q*K^T (per head): modeled as an FC with a dummy (s, dim_head) input
        # and an all-ones weight standing in for K; no bias.
        new_shape = (s, operatorRepresentation['dim_head'])
        data_in = ctxt.VariableBuffer(name = 'data_in', shape = new_shape, nLevels = operatorRepresentation['n_levels'])
        data_in._signed = True
        # K
        weight = np.ones((s, operatorRepresentation['dim_head']))

        ctxt, operatorRepresentation = bindFCParams(ctxt,
                                                    operatorRepresentation['data_out'] + "_preattn",
                                                    operatorRepresentation['preattn_requant_mul'],
                                                    operatorRepresentation['preattn_requant_shift'],
                                                    data_in,
                                                    weight,
                                                    operatorRepresentation,
                                                    "preattn_",
                                                    bias = False)

        # attention * V (per head): FC with a dummy (s, s) unsigned input
        # (softmax output) and an all-ones (dim_head, s) weight for V.
        new_shape = (s, s)
        data_in = ctxt.VariableBuffer(name = 'data_in', shape = new_shape, nLevels = operatorRepresentation['n_levels'])
        data_in._signed = False
        # K
        weight = np.ones((operatorRepresentation['dim_head'], s))

        ctxt, operatorRepresentation = bindFCParams(ctxt,
                                                    operatorRepresentation['data_out'] + "_postattn",
                                                    operatorRepresentation['postattn_requant_mul'],
                                                    operatorRepresentation['postattn_requant_shift'],
                                                    data_in,
                                                    weight,
                                                    operatorRepresentation,
                                                    "postattn_",
                                                    bias = False)

        # NOTE(review): an empty name list is returned although bindFCParams
        # hoists structs into the context — confirm hoisted names need not be
        # reported to the caller here.
        return ctxt, operatorRepresentation, []


# Generated C: do{...}while(0) block that runs the six CMSIS-NN matmuls with
# explicit NSHD<->NHSD/NHDS transposes and an integer softmax in between.
# All intermediate buffers are heap-allocated via deeploy_malloc and freed as
# soon as they are consumed; _null serves as an all-zero bias vector.
referenceTemplate = _MHSATemplate("""
do{
<%
    sequenceLength = q_shape[1]
%>
// W_Q * q -> Q
int32_t* _null = deeploy_malloc(${max(dim, sequenceLength)}*${heads});
memset(_null, 0, ${max(dim, sequenceLength)}*${heads});

int8_t* wq_buffer = deeploy_malloc(${wq_output_dims}.n * ${wq_output_dims}.c);
arm_fully_connected_s8(&${wq_ctxt}, &${wq_fc_params}, &${wq_quant_params}, &${wq_input_dims}, ${q}, &${wq_filter_dims}, ${wq_weight}, &${wq_bias_dims}, ${wq_bias}, &${wq_output_dims}, wq_buffer);
<%
dim1 = sequenceLength
dim2 = heads
dim3 = dim_head
%>
// Q: NSHD-> NHSD
int8_t* wq_buffer_transposed = deeploy_malloc(${wq_output_dims}.n * ${wq_output_dims}.c);
for(int k=0;k<${dim2};k++){
for (int i=0;i<${dim1};i++){
for(int j=0;j<${dim3};j++){
wq_buffer_transposed[k*${dim3}*${dim1} + i*${dim3} + j] = wq_buffer[i*${dim2}*${dim3} + k*${dim3} + j];
}
}
}
free(wq_buffer);

// W_K * k -> K
int8_t* wk_buffer = deeploy_malloc(${wk_output_dims}.n * ${wk_output_dims}.c);
arm_fully_connected_s8(&${wk_ctxt}, &${wk_fc_params}, &${wk_quant_params}, &${wk_input_dims}, ${k}, &${wk_filter_dims}, ${wk_weight}
, &${wk_bias_dims}, ${wk_bias}, &${wk_output_dims}, wk_buffer);


// K: NSHD-> NHSD
int8_t* wk_buffer_transposed = deeploy_malloc(${wk_output_dims}.n * ${wk_output_dims}.c);
for(int k=0;k<${dim2};k++){
for (int i=0;i<${dim1};i++){
for(int j=0;j<${dim3};j++){
wk_buffer_transposed[k*${dim3}*${dim1} + i*${dim3} + j] = wk_buffer[i*${dim2}*${dim3} + k*${dim3} + j];
}
}
}
free(wk_buffer);


// ATTN Matrix -> Q*KT

// QKT -> NHSS
int8_t* preattn_buffer = deeploy_malloc(${heads} * ${sequenceLength} * ${sequenceLength});
for(int i=0; i<${heads}; i++){
arm_fully_connected_s8(&${preattn_ctxt}, &${preattn_fc_params}, &${preattn_quant_params}, &${preattn_input_dims}, &wq_buffer_transposed[i * ${preattn_input_dims}.n * ${preattn_input_dims}.c], &${preattn_filter_dims}, &wk_buffer_transposed[i * ${preattn_filter_dims}.n * ${preattn_filter_dims}.c], &${preattn_bias_dims}, _null, &${preattn_output_dims}, &preattn_buffer[i*${preattn_output_dims}.c*${preattn_output_dims}.n]);
}
free(wq_buffer_transposed);
free(wk_buffer_transposed);
int8_t* postattn_buffer = deeploy_malloc(${heads} * ${sequenceLength} * ${sequenceLength});
SoftmaxKernel_s8(preattn_buffer, postattn_buffer, ${heads} * ${sequenceLength} * ${sequenceLength}, ${sequenceLength}, ${isoftmaxA}, ${isoftmaxB}, ${isoftmaxC}, ${isoftmaxlog2}, ${n_levels});
free(preattn_buffer);

int8_t* wv_buffer = deeploy_malloc(${wv_output_dims}.n * ${wv_output_dims}.c);
arm_fully_connected_s8(&${wv_ctxt}, &${wv_fc_params}, &${wv_quant_params}, &${wv_input_dims}, ${v}, &${wv_filter_dims}, ${wv_weight}, &${wv_bias_dims}, ${wv_bias}, &${wv_output_dims}, wv_buffer);

<%
dim1 = sequenceLength
dim2 = heads
dim3 = dim_head
%>
// NSHD-> NHDS
//
int8_t* wv_buffer_transposed = deeploy_malloc(${wv_output_dims}.n * ${wv_output_dims}.c);
for(int k=0;k<${dim2};k++){
for(int j=0;j<${dim3};j++){
for (int i=0;i<${dim1};i++){
wv_buffer_transposed[k*${dim3}*${dim1} + j*${dim1} + i] = wv_buffer[i*${dim2}*${dim3} + k*${dim3} + j];
}
}
}
free(wv_buffer);

int8_t* out_buffer = deeploy_malloc(${heads} * ${sequenceLength} * ${dim_head});

for(int i=0; i<${heads}; i++){
arm_fully_connected_s8(&${postattn_ctxt}, &${postattn_fc_params}, &${postattn_quant_params},
&${postattn_input_dims}, &postattn_buffer[i*${postattn_input_dims}.n*${postattn_input_dims}.c],
&${postattn_filter_dims}, &wv_buffer_transposed[i*${postattn_filter_dims}.n * ${postattn_filter_dims}.c],
&${postattn_bias_dims}, _null,
&${postattn_output_dims}, &out_buffer[i*${postattn_output_dims}.n*${postattn_output_dims}.c]);
}
free(postattn_buffer);
free(wv_buffer_transposed);
<%
dim1 = heads
dim2 = sequenceLength
dim3 = dim_head
%>

// NHSD-> NSHD
int8_t* out_buffer_transposed = deeploy_malloc(${heads} * ${sequenceLength} * ${dim_head});
for(int k=0;k<${dim2};k++){
for (int i=0;i<${dim1};i++){
for(int j=0;j<${dim3};j++){
out_buffer_transposed[k*${dim3}*${dim1} + i*${dim3} + j] = out_buffer[i*${dim2}*${dim3} + k*${dim3} + j];
}
}
}
free(out_buffer);
free(_null);

arm_fully_connected_s8(&${wo_ctxt}, &${wo_fc_params}, &${wo_quant_params}, &${wo_input_dims}, out_buffer_transposed, &${wo_filter_dims}, ${wo_weight}, &${wo_bias_dims}, ${wo_bias}, &${wo_output_dims}, ${data_out});

free(out_buffer_transposed);

}while(0);
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
from Deeploy.Targets.CortexM.DataTypes import cmsis_nn_context, cmsis_nn_dims, cmsis_nn_pool_params


class _MaxPool2DTemplate(NodeTemplate):
    """Template for the CMSIS-NN ``arm_max_pool_s8`` kernel.

    Before code generation, hoists the CMSIS-NN helper structs (scratch
    context, pooling parameters, input/filter/output dimensions) into the
    network context and records their names in the operator representation.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

        hoistedNames = []
        outName = operatorRepresentation['data_out']

        # Scratch context struct; the kernel takes one even when no buffer is needed.
        hoistedNames.append(ctxt.hoistStruct({'buf': None, 'size': 0}, f'{outName}_ctxt', cmsis_nn_context))
        operatorRepresentation['ctxt'] = f'{outName}_ctxt'

        # Pooling parameters: stride, padding and the int8 activation clipping range.
        poolParams = {
            'stride': {
                'h': operatorRepresentation['stride_x'],
                'w': operatorRepresentation['stride_y']
            },
            'padding': {
                'h': operatorRepresentation['padding_x'],
                'w': operatorRepresentation['padding_y']
            },
            'activation': {
                'min': -2**7,
                'max': 2**7 - 1
            },
        }
        hoistedNames.append(ctxt.hoistStruct(poolParams, f'{outName}_pool_params', cmsis_nn_pool_params))
        operatorRepresentation['pool_params'] = ctxt.lookup(f'{outName}_pool_params').name

        # Input / filter / output dimension structs (batch handled in the template loop).
        dimStructs = (
            ('input_dims', {
                'n': 1,
                'h': operatorRepresentation['dim_im_in_x'],
                'w': operatorRepresentation['dim_im_in_y'],
                'c': operatorRepresentation['ch_im_in']
            }),
            ('filter_dims', {
                'n': 1,
                'h': operatorRepresentation['dim_kernel_x'],
                'w': operatorRepresentation['dim_kernel_y'],
                'c': 1
            }),
            ('output_dims', {
                'n': 1,
                'h': operatorRepresentation['dim_im_out_x'],
                'w': operatorRepresentation['dim_im_out_y'],
                'c': operatorRepresentation['ch_im_out']
            }),
        )
        for key, fields in dimStructs:
            structName = f'{outName}_{key}'
            hoistedNames.append(ctxt.hoistStruct(fields, structName, cmsis_nn_dims))
            operatorRepresentation[key] = ctxt.lookup(structName).name

        return ctxt, operatorRepresentation, hoistedNames


cmsisTemplate = _MaxPool2DTemplate("""
<%
batchSizeIn = dim_im_in_x * dim_im_in_y * ch_im_in
batchSizeOut = dim_im_out_x * dim_im_out_y * ch_im_out
%>
// MaxPool2D
for(int b=0;b<${batch};b++){
arm_max_pool_s8(&${ctxt}, &${pool_params}, &${input_dims}, (${data_in}+b*${batchSizeIn}), &${filter_dims}, &${output_dims}, (${data_out} + b*${batchSizeOut}));
}
""")
import * diff --git a/Deeploy/Targets/CortexM/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/CortexM/TopologyOptimizationPasses/Passes.py new file mode 100644 index 0000000..6e1d899 --- /dev/null +++ b/Deeploy/Targets/CortexM/TopologyOptimizationPasses/Passes.py @@ -0,0 +1,230 @@ +# ---------------------------------------------------------------------- +# +# File: CMSISPasses.py +# +# Last edited: 17.12.2022 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: +# - Moritz Scherer, ETH Zurich +# - Georg Rutishauser, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match
from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic


def _merge_conv_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Fuse a Conv -> RequantShift pair into a single RequantizedConv node.

    Folds as much of the requantization right-shift as possible into the
    multiplier values and passes the remainder as an explicit shift constant.
    """
    matched_nodes = [m for k, m in match.nodes_map.items()]
    conv = matched_nodes[0]
    rqs = matched_nodes[1]

    # Total right-shift encoded by the requant divisor (div == 2**(31 - shift)).
    totalShift = 31 - np.log2(rqs.attrs['div'].values)

    # Normalize the additive requant term by the multiplier (epsilon avoids /0).
    rqs.inputs[-1].values = np.round(rqs.inputs[-1].values / (rqs.inputs[-2].values + 1e-3))

    maxMult = rqs.inputs[1].values.max()
    # Largest shift foldable into the multiplier without overflowing it.
    # NOTE(review): `32 - log2(maxMult)` permits scaled multipliers up to 2**32,
    # while the MHSA/LinearAttention passes below bound by 2**31 — confirm the
    # intended signed headroom before changing numerics.
    MultShift = min(totalShift, np.floor(32 - np.log2(maxMult)))
    remainingShift = totalShift - MultShift

    # Scale the multiplier and emit the leftover shift as a constant input.
    # (Fix: the original created this identical Constant twice.)
    rqs.inputs[1].values = rqs.inputs[1].values * 2**MultShift
    shiftNode = gs.Constant(f'{conv.name}_shift', np.array(remainingShift))

    _inputs = list(conv.inputs) + list(rqs.inputs[1:]) + [shiftNode]
    _outputs = rqs.outputs

    rqsConv = gs.Node(op = 'RequantizedConv', name = name, attrs = {**conv.attrs, **rqs.attrs})
    graph.replaceInsertNode(_inputs, _outputs, rqsConv)

    return graph


@contextagnostic
class ConvRequantMergePass(ReplaceSequentialPatternPass):
    """Replaces every Conv -> RequantShift sequence with a RequantizedConv node."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['conv_out'], op = 'Conv', name = 'conv1')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_CONVRQ_PASS"
        super().__init__(graph, _merge_conv_rq_fun, name)


def _merge_gemm_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Fuse a Gemm/MatMul -> RequantShift pair into a RequantizedGemm node."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    gemm = matched_nodes[0]
    rqs = matched_nodes[1]

    # Total right-shift encoded by the requant divisor (div == 2**(31 - shift)).
    totalShift = 31 - np.log2(rqs.attrs['div'].values)

    # Normalize the additive requant term by the multiplier (epsilon avoids /0).
    rqs.inputs[-1].values = np.round(rqs.inputs[-1].values / (rqs.inputs[-2].values + 1e-3))

    # Largest shift foldable into the multiplier.
    # NOTE(review): `2**31 - max` looks like it was meant to be `2**31 / max`
    # (cf. _align_mhsa_fun below) — confirm before changing numerics.
    MultShift = min(totalShift, np.floor(np.log2(2**31 - rqs.inputs[1].values.max())))
    remainingShift = totalShift - MultShift

    rqs.inputs[1].values = rqs.inputs[1].values * 2**MultShift

    if len(list(gemm.inputs)) == 3:
        # GEMM already carries a bias: fold the (scaled) requant add into it and
        # keep only the multiplier input from the RequantShift node.
        gemm.inputs[2].values = gemm.inputs[2].values + np.round(rqs.inputs[2].values / (rqs.inputs[1].values + 1e-3))
        _inputs = list(gemm.inputs) + list(rqs.inputs[1:2])
    else:
        # No bias: carry the requant add and multiplier over explicitly.
        _inputs = list(gemm.inputs) + list(rqs.inputs[2:]) + list(rqs.inputs[1:2])

    _outputs = rqs.outputs
    attrs = {**gemm.attrs, **rqs.attrs}
    attrs['shift'] = gs.Constant(name = 'shift', values = np.array(remainingShift))
    rqsGemm = gs.Node(op = 'RequantizedGemm', name = name, attrs = attrs)
    graph.replaceInsertNode(_inputs, _outputs, rqsGemm)

    return graph


@contextagnostic
class GEMMRequantMergePass(ReplaceSequentialPatternPass):
    """Replaces every Gemm -> RequantShift sequence with a RequantizedGemm node."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['gemm_out'], op = 'Gemm', name = 'gemm')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_GEMM_RQ_PASS"
        super().__init__(graph, _merge_gemm_rq_fun, name)


@contextagnostic
class MatMulRequantMergePass(ReplaceSequentialPatternPass):
    """Replaces every MatMul -> RequantShift sequence with a RequantizedGemm node."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['gemm_out'], op = 'MatMul', name = 'gemm')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_MATMUL_RQ_PASS"
        super().__init__(graph, _merge_gemm_rq_fun, name)


def _align_mhsa_fun(graph: gs.Graph, match: Match, name: str):
    """Rebalance the per-projection requant multipliers/shifts of an MHSA node.

    For each projection, folds as much of the divisor's shift as possible into
    the multiplier (bounded by 2**31) and stores the remainder as a
    ``*_requant_shift`` attribute.
    """
    matched_nodes = [m for k, m in match.nodes_map.items()]
    mhsa = matched_nodes[0]

    # Fix: the loop variable used to shadow the `name` parameter; `idx` was unused.
    for prefix in ("wq", "wk", "wv", "wo", "postattn", "preattn"):
        totalShift = 31 - np.log2(mhsa.attrs[f'{prefix}_requant_div'].values)
        maxMult = mhsa.attrs[f'{prefix}_requant_mul'].values.max()
        # Largest shift foldable into the multiplier while staying below 2**31.
        MultShift = min(totalShift, np.floor(np.log2(2**31 / maxMult)))
        remainingShift = totalShift - MultShift

        mhsa.attrs[f'{prefix}_requant_mul'].values = mhsa.attrs[f'{prefix}_requant_mul'].values * 2**MultShift
        mhsa.attrs[f'{prefix}_requant_shift'] = gs.Constant(name = f'{prefix}_requant_shift',
                                                            values = np.array(remainingShift))

    return graph


@contextagnostic
class MHSAAlignmentPass(ReplaceSequentialPatternPass):
    """Aligns requant multipliers and shifts of MultiHeadSelfAttention nodes."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['gemm_out'], op = 'MultiHeadSelfAttention', name = 'mhsa')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_ALIGN_MHSA_PASS"
        super().__init__(graph, _align_mhsa_fun, name)


def _align_linear_attention_fun(graph: gs.Graph, match: Match, name: str):
    """Rebalance the per-projection requant multipliers/shifts of a LinearAttention node."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    linearattn = matched_nodes[0]

    # Fix: the loop variable used to shadow the `name` parameter; `idx` was unused.
    for prefix in ("wq", "wk", "wv", "wo", "postattn", "preattn", "normalizer"):
        totalShift = 31 - np.log2(linearattn.attrs[f'{prefix}_requant_div'].values)
        maxMult = linearattn.attrs[f'{prefix}_requant_mul'].values.max()
        # Fix: the message was not an f-string, so the node name never rendered.
        assert maxMult < 2**31, f"{linearattn.name} requant mul is too large!"
        # Largest shift foldable into the multiplier while staying below 2**31.
        MultShift = min(totalShift, np.floor(np.log2(2**31 / maxMult)))
        remainingShift = totalShift - MultShift

        linearattn.attrs[f'{prefix}_requant_mul'].values = linearattn.attrs[f'{prefix}_requant_mul'].values * 2**MultShift
        linearattn.attrs[f'{prefix}_requant_shift'] = gs.Constant(name = f'{prefix}_requant_shift',
                                                                  values = np.array(remainingShift))

    return graph


@contextagnostic
class LinearAttentionAlignmentPass(ReplaceSequentialPatternPass):
    """Aligns requant multipliers and shifts of LinearAttention nodes."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['gemm_out'], op = 'LinearAttention', name = 'LA')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_ALIGN_LinearAttention_PASS"
        super().__init__(graph, _align_linear_attention_fun, name)
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/CortexM/TypeCheckers.py b/Deeploy/Targets/CortexM/TypeCheckers.py new file mode 100644 index 0000000..c5f58a9 --- /dev/null +++ b/Deeploy/Targets/CortexM/TypeCheckers.py @@ -0,0 +1,92 @@ +# ---------------------------------------------------------------------- +# +# File: CMSISCheckers.py +# +# Last edited: 18.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import List, Sequence, Type

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker
from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer


class CMSISSaturatingAddChecker(SignPropTypeChecker):
    """Sign-propagating checker for the CMSIS saturating add kernel."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        # The sum of both operand ranges, saturated at the full range of the
        # input container type.
        saturation = 2**(self.input_types[0].referencedType.typeWidth)
        return [min(inputs[0].nLevels + inputs[1].nLevels, saturation)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        # The result is signed as soon as either operand is signed.
        return [bool(inputs[0]._signed or inputs[1]._signed)]


class CMSISLinearChecker(SignPropTypeChecker):
    """Sign-propagating checker for CMSIS linear (fully-connected) kernels.

    Output range and signedness come straight from the operator's
    requantization parameters.
    """

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [operatorRepresentation['n_levels']]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(operatorRepresentation["signed"])]


class CMSISConvChecker(SignPropTypeChecker):
    """Sign-propagating checker for CMSIS convolution kernels.

    Same contract as CMSISLinearChecker: the requantized output range and
    signedness are taken from the operator representation.
    """

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [operatorRepresentation['n_levels']]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(operatorRepresentation["signed"])]


class CMSISMaxPoolChecker(SignPropTypeChecker):
    """Sign-propagating checker for the CMSIS max-pool kernel.

    Max-pooling selects existing values, so range and signedness pass through
    from the input unchanged.
    """

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]
import * diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py new file mode 100644 index 0000000..04d87fd --- /dev/null +++ b/Deeploy/Targets/Generic/Bindings.py @@ -0,0 +1,191 @@ +# ---------------------------------------------------------------------- +# +# File: BasicBindings.py +# +# Last edited: 17.12.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: +# - Moritz Scherer, ETH Zurich +# - Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import itertools

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
    MemoryManagementGeneration, MemoryPassthroughGeneration
from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, int8_t, int32_t, uint8_t
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, ConvTemplate, DebugPrintTemplate, \
    DummyTemplate, DWConvTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, \
    ITAPartialMaxTemplate, MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, ReduceMeanTemplate, \
    ReduceSumTemplate, RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, \
    TransposeTemplate, iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DebugPrintChecker, \
    DummyChecker, GatherChecker, GELUChecker, GEMMChecker, IntegerDivChecker, MatMulChecker, MaxPoolChecker, \
    MulChecker, PadChecker, ReduceMeanChecker, ReduceSumChecker, RequantShiftChecker, ReshapeChecker, \
    RQIntegerDivChecker, SliceChecker, SoftmaxChecker, TransposeChecker, iLayerNormChecker

# Default code transformation: hoist argument structs, manage memory, resolve futures.
BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()])

# Variant for aliasing ops (Reshape, DebugPrint): passes buffers through
# instead of allocating new memory.
ReshapeSkipTransformer = CodeTransformation(
    [ArgumentStructGeneration(), MemoryPassthroughGeneration(),
     FutureGeneration()])

# NOTE: comprehension variables below are named `ty`/`ty1`/`ty2` to avoid
# shadowing the builtin `type`.

BasicSliceBindings = [
    NodeBinding(
        SliceChecker([
            PointerClass(ty),
            PointerClass(uint8_t),
            PointerClass(uint8_t),
            PointerClass(uint8_t),
            PointerClass(uint8_t)
        ], [PointerClass(ty)]), SliceTemplate.referenceTemplate, BasicTransformer) for ty in IntegerDataTypes
]

# All integer input-type combinations; itertools.product keeps this consistent
# with BasicMulBindings below (iteration order is identical to the nested loop).
BasicAddBindings = [
    NodeBinding(AddChecker([PointerClass(ty1), PointerClass(ty2)], [PointerClass(int32_t)]),
                AddTemplate.referenceTemplate, BasicTransformer)
    for ty1, ty2 in itertools.product(IntegerDataTypes, IntegerDataTypes)
]

BasicConv1DBinding = NodeBinding(ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
                                 ConvTemplate.reference1DTemplate, BasicTransformer)

BasicDWConv1DBinding = NodeBinding(ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
                                   DWConvTemplate.reference1DTemplate, BasicTransformer)

BasicConv2DBinding = NodeBinding(ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
                                 ConvTemplate.reference2DTemplate, BasicTransformer)

BasicDWConv2DBinding = NodeBinding(ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
                                   DWConvTemplate.reference2DTemplate, BasicTransformer)

BasicDebugPrintBindings = [
    NodeBinding(DebugPrintChecker([PointerClass(ty)], [PointerClass(ty)]), DebugPrintTemplate.referenceTemplate,
                ReshapeSkipTransformer) for ty in SignedIntegerDataTypes
]

BasicGatherBindings = [
    NodeBinding(GatherChecker([PointerClass(ty), PointerClass(int32_t)], [PointerClass(ty)]),
                GatherTemplate.referenceTemplate, BasicTransformer) for ty in SignedIntegerDataTypes
]

BasicGELUBinding = NodeBinding(GELUChecker([PointerClass(int8_t)], [PointerClass(int32_t)]),
                               iGELUTemplate.referenceTemplate, BasicTransformer)

BasicGEMMBinding = NodeBinding(
    GEMMChecker(
        [PointerClass(int8_t), PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int32_t)]),
    GemmTemplate.referenceTemplate, BasicTransformer)

BasicIntegerDivBinding = NodeBinding(
    IntegerDivChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]),
    IntegerDivTemplate.referenceTemplate, BasicTransformer)

BasicITASoftmaxBinding = NodeBinding(SoftmaxChecker([PointerClass(int8_t)], [PointerClass(int8_t)]),
                                     ITAMaxTemplate.referenceTemplate, BasicTransformer)

BasicITAPartialSoftmaxBinding = NodeBinding(SoftmaxChecker([PointerClass(int8_t)], [PointerClass(int8_t)]),
                                            ITAPartialMaxTemplate.referenceTemplate, BasicTransformer)

BasicLayerNormBinding = NodeBinding(
    iLayerNormChecker([PointerClass(int8_t), PointerClass(int32_t),
                       PointerClass(int32_t)], [PointerClass(int8_t)]), iLayernormTemplate.referenceTemplate,
    BasicTransformer)

BasicMatMulBinding = NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
                                 MatMulTemplate.referenceTemplate, BasicTransformer)

BasicMaxPool2DBinding = NodeBinding(MaxPoolChecker([PointerClass(int8_t)], [PointerClass(int8_t)]),
                                    MaxPoolTemplate.referenceTemplate, BasicTransformer)

BasicMulBindings = [
    NodeBinding(MulChecker([PointerClass(tyA), PointerClass(tyB)], [PointerClass(int32_t)]),
                MulTemplate.referenceTemplate, BasicTransformer)
    for tyA, tyB in itertools.product(SignedIntegerDataTypes, SignedIntegerDataTypes)
]

BasicPad1DBindings = [
    NodeBinding(PadChecker([PointerClass(ty)], [PointerClass(ty)]), PadTemplate.reference1DTemplate,
                BasicTransformer) for ty in SignedIntegerDataTypes
]
BasicPad2DBindings = [
    NodeBinding(PadChecker([PointerClass(ty)], [PointerClass(ty)]), PadTemplate.reference2DTemplate,
                BasicTransformer) for ty in SignedIntegerDataTypes
]

BasicReduceMeanBindings = [
    NodeBinding(ReduceMeanChecker([PointerClass(ty)], [PointerClass(ty)]), ReduceMeanTemplate.referenceTemplate,
                BasicTransformer) for ty in SignedIntegerDataTypes
]

BasicReduceSumBindings = [
    NodeBinding(ReduceSumChecker([PointerClass(ty)], [PointerClass(int32_t)]), ReduceSumTemplate.referenceTemplate,
                BasicTransformer) for ty in SignedIntegerDataTypes
]

BasicReshapeBindings = [
    NodeBinding(ReshapeChecker([PointerClass(ty), PointerClass(int32_t)], [PointerClass(ty)]),
                ReshapeTemplate.referenceTemplate, ReshapeSkipTransformer) for ty in IntegerDataTypes
]

BasicRQSBindings = [
    NodeBinding(
        RequantShiftChecker([PointerClass(ty), PointerClass(int32_t),
                             PointerClass(int32_t)], [PointerClass(int8_t)]), RequantShiftTemplate.referenceTemplate,
        BasicTransformer) for ty in SignedIntegerDataTypes
]

BasicRQSGELUBinding = NodeBinding(
    GELUChecker([PointerClass(int8_t),
                 PointerClass(int32_t),
                 PointerClass(int32_t),
                 PointerClass(int32_t)], [PointerClass(int8_t)]), RQSiGELUTemplate.referenceTemplate, BasicTransformer)

BasicRQIntegerDivBinding = NodeBinding(
    RQIntegerDivChecker([
        PointerClass(int32_t),
        PointerClass(int32_t),
        PointerClass(int32_t),
        PointerClass(int32_t),
        PointerClass(int32_t)
    ], [PointerClass(int8_t)]), RQIntegerDivTemplate.referenceTemplate, BasicTransformer)

BasicSoftmaxBinding = NodeBinding(SoftmaxChecker([PointerClass(int8_t)], [PointerClass(int8_t)]),
                                  iSoftmaxTemplate.referenceTemplate, BasicTransformer)

BasicTransposeBindings = [
    NodeBinding(TransposeChecker([PointerClass(ty)], [PointerClass(ty)]), TransposeTemplate.referenceTemplate,
                BasicTransformer) for ty in IntegerDataTypes
]

BasiciRMSNormBinding = NodeBinding(
    iLayerNormChecker([PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int8_t)]),
    iRMSNormTemplate.referenceTemplate, BasicTransformer)

DummyBinding = NodeBinding(DummyChecker([PointerClass(int8_t)], [PointerClass(int8_t)]),
                           DummyTemplate.referenceTemplate, BasicTransformer)

BasicConcatBindings = [
    NodeBinding(ConcatChecker([PointerClass(ty), PointerClass(ty)], [PointerClass(ty)]),
                ConcatTemplate.referenceTemplate, BasicTransformer) for ty in IntegerDataTypes
]
# ----------------------------------------------------------------------
# File: GenericDeployer.py
# Copyright (C) 2023, ETH Zurich and University of Bologna.
# Author: Philip Wiese, ETH Zurich
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------

from typing import Callable, Dict, Optional, Type

import onnx_graphsurgeon as gs

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
    NCHWtoNHWCPass, TransposeMatmulInputsPass
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass


class GenericDeployer(SignPropDeployer):
    """Sign-propagating deployer for the generic platform.

    Extends SignPropDeployer with the standard generic lowering passes
    (matmul input transposition, NCHW->NHWC layout conversion, transpose
    merging/const-folding, debug-print merging) and stores per-input offsets.
    """

    def __init__(self,
                 graph: gs.Graph,
                 deploymentPlatform: DeploymentPlatform,
                 inputTypes: Dict[str, Type[Pointer]],
                 loweringOptimizer: TopologyOptimizer,
                 scheduler: Callable = lambda x: x,
                 name: str = 'DeeployNetwork',
                 default_channels_first = False,
                 deeployStateDir: str = "DeeployStateDir",
                 inputOffsets: Optional[Dict[str, int]] = None):

        super().__init__(graph,
                         deploymentPlatform,
                         inputTypes,
                         loweringOptimizer,
                         scheduler,
                         name,
                         default_channels_first = default_channels_first,
                         deeployStateDir = deeployStateDir)

        # Fix: the default used to be a mutable `{}` shared across all
        # instances; use None as sentinel and allocate a fresh dict per call.
        self.inputOffsets = {} if inputOffsets is None else inputOffsets

        # Generic lowering pipeline, appended after the caller-supplied passes.
        self.loweringOptimizer.passes += [
            TransposeMatmulInputsPass(),
            NCHWtoNHWCPass(self.default_channels_first),
            TransposeMergePass(),
            TransposeConstOptPass(),
            DebugPrintMergePass()
        ]
+ +from typing import List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NodeMapper, ONNXLayer, Shape + + +class ConcatLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class iRMSNormLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class SliceLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class ReshapeLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class GatherLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class iGELULayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + compAbs = self.mapper.parser.operatorRepresentation['size'] + compAdd = self.mapper.parser.operatorRepresentation['size'] + compSqr = self.mapper.parser.operatorRepresentation['size'] + compMul = self.mapper.parser.operatorRepresentation['size'] + compAdd = self.mapper.parser.operatorRepresentation['size'] + compMul2 = self.mapper.parser.operatorRepresentation['size'] + compAdd2 = self.mapper.parser.operatorRepresentation['size'] + compDiv = self.mapper.parser.operatorRepresentation['size'] + return compAbs + compAdd + compSqr + compMul + compAdd + compMul2 + compAdd2 + compDiv + + +class iHardswishLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class RQSiGELULayer(iGELULayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class RQSiHardswishLayer(iHardswishLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class iSoftmaxLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class ITAMaxLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class RequantShiftLayer(ONNXLayer): + + def 
__init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: List[Shape], outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + + channel_dim = inputShapes[0][1] + inputShapes[2] = [inputShapes[0][0], channel_dim] + list(inputShapes[2][1:]) + inputShapes[1] = [inputShapes[0][0], channel_dim] + list(inputShapes[1][1:]) + + return (inputShapes, outputShapes) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] * 3 # One add, one mul, one div + + +class AddLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + outputShapes = inputShapes.copy() + if len(inputShapes[0]) > len(inputShapes[1]): + inputShapes[1] = inputShapes[0] + else: + inputShapes[0] = inputShapes[1] + + return (inputShapes, outputShapes) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + + +class MatMulLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return 2 * self.mapper.parser.operatorRepresentation['M'] * self.mapper.parser.operatorRepresentation[ + 'N'] * self.mapper.parser.operatorRepresentation['O'] * self.mapper.parser.operatorRepresentation['batch'] + + +class RQMatMulLayer(MatMulLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: List[Shape], outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + + channel_dim = inputShapes[0][1] + inputShapes[3] = [inputShapes[0][0]] + list(inputShapes[3][1:]) + inputShapes[2] = [inputShapes[0][0]] + list(inputShapes[2][1:]) + + return (inputShapes, outputShapes) + + def computeOps(self): + matmul = super().computeOps() + rqs = 
self.mapper.parser.operatorRepresentation['size'] * 3 + return matmul + rqs + + +class IntegerDivLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class RQIntegerDivLayer(IntegerDivLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class GEMMLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + if operatorRepresentation['transA']: + M = inputShapes[0][-1] + else: + M = inputShapes[0][-2] + + if operatorRepresentation['transB']: + N = inputShapes[1][-2] + else: + N = inputShapes[1][-1] + + if len(inputShapes) == 3: + inputShapes[2] = [M, N] + + return (inputShapes, outputShapes) + + def computeOps(self): + matmul = 2 * self.mapper.parser.operatorRepresentation['M'] * self.mapper.parser.operatorRepresentation[ + 'N'] * self.mapper.parser.operatorRepresentation['O'] * self.mapper.parser.operatorRepresentation['batch'] + gemm = matmul + 3 * self.mapper.parser.operatorRepresentation['M'] * self.mapper.parser.operatorRepresentation[ + 'O'] * self.mapper.parser.operatorRepresentation['batch'] + + return gemm + + +class RQGEMMLayer(GEMMLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: List[Shape], outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + if operatorRepresentation['transA']: + M = inputShapes[0][-1] + else: + M = inputShapes[0][-2] + + if operatorRepresentation['transB']: + N = inputShapes[1][-2] + else: + N = inputShapes[1][-1] + + if len(inputShapes) == 5: + inputShapes[2] = [M, N] + inputShapes[4] = [inputShapes[0][0]] + list(inputShapes[4][1:]) + inputShapes[3] = [inputShapes[0][0]] + list(inputShapes[3][1:]) + else: + inputShapes[3] = [inputShapes[0][0]] + list(inputShapes[3][1:]) + inputShapes[2] = [ 
+ inputShapes[0][0], + ] + list(inputShapes[2][1:]) + + return (inputShapes, outputShapes) + + def computeOps(self): + gemm = super().computeOps() + rqs = self.mapper.parser.operatorRepresentation['size'] * 3 + return gemm + rqs + + +class MulLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + if len(inputShapes[0]) > len(inputShapes[1]): + inputShapes[1] = inputShapes[0] + else: + inputShapes[0] = inputShapes[1] + return (inputShapes, outputShapes) + + +class ConvLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + if len(inputShapes) == 3: + inputShapes[2] = inputShapes[1][0] + return (inputShapes, outputShapes) + + def computeOps(self): + if "group" in self.mapper.parser.operatorRepresentation: + groups = self.mapper.parser.operatorRepresentation['group'] + else: + groups = 1 + opsPerPx = int( + np.prod(self.mapper.parser.operatorRepresentation['kernel_shape']) * + self.mapper.parser.operatorRepresentation['ch_im_in'] * + self.mapper.parser.operatorRepresentation['ch_im_out'] / groups) * 2 + if 'dim_im_out_y' in self.mapper.parser.operatorRepresentation: + numPx = self.mapper.parser.operatorRepresentation[ + 'dim_im_out_x'] * self.mapper.parser.operatorRepresentation['dim_im_out_y'] + else: + numPx = self.mapper.parser.operatorRepresentation['dim_im_out_x'] + return numPx * opsPerPx + + +class RQSConvLayer(ConvLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + conv = super().computeOps() + + if 'dim_im_out_y' in self.mapper.parser.operatorRepresentation: + rqs = self.mapper.parser.operatorRepresentation['dim_im_out_x'] * 
self.mapper.parser.operatorRepresentation[ + 'dim_im_out_y'] * 3 + else: + rqs = self.mapper.parser.operatorRepresentation['dim_im_out_x'] * 3 + + return conv + rqs + + +class PadLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class MaxPoolLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class ReduceMeanLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class ReduceSumLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + outputShapes = inputShapes.copy() + axis = operatorRepresentation['axes'][0] + + if operatorRepresentation['keepdims']: + outputShapes[0][axis] = 1 + else: + outputShapes[0] = outputShapes[0][:axis] + outputShapes[0][axis + 1:] + return (inputShapes, outputShapes) + + +class iLayerNormLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + compAverage = self.mapper.parser.operatorRepresentation['size'] + compNormalize = self.mapper.parser.operatorRepresentation['size'] + compSqr = self.mapper.parser.operatorRepresentation['size'] + compSum = self.mapper.parser.operatorRepresentation['size'] + compSqrt = self.mapper.parser.operatorRepresentation['size'] + compDiv = self.mapper.parser.operatorRepresentation['size'] + return compAverage + compNormalize + compSqr + compSum + compSqrt + compDiv + + +class TransposeLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class LinearAttentionLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + inputShapes[4] = 
inputShapes[3][0] + inputShapes[6] = inputShapes[5][0] + inputShapes[8] = inputShapes[7][0] + inputShapes[10] = inputShapes[9][0] + + return (inputShapes, outputShapes) + + def computeOps(self): + # seqLen = self.mapper.parser.operatorRepresentation['in_C'] + # dim = self.mapper.parser.operatorRepresentation['dim'] + # dim_head = self.mapper.parser.operatorRepresentation['dim_head'] + # heads = self.mapper.parser.operatorRepresentation['heads'] + # QOps = seqLen * dim * dim_head * heads * 2 + # # WQ * Q (H ) + # KOps = seqLen * dim * dim_head * heads * 2 + # # WK * K + # VOps = seqLen * dim * dim_head * heads * 2 + # # WV * V + # KVOps = seqLen * dim_head * dim_head * heads * 2 + # # Q * KT + # QKVOps = seqLen * dim_head * dim_head * heads * 2 + # # N H S S * N H S D -> N H S D + # OutOps = seqLen * dim_head * heads * dim * 2 + # # WO * O + # totOps = QOps + KOps + VOps + KVOps + QKVOps + OutOps + # return totOps + + return 0 + + +class CLCALayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + inputShapes[3] = inputShapes[2][0] + inputShapes[5] = inputShapes[4][0] + inputShapes[7] = inputShapes[6][0] + # WQ Requant + inputShapes[8] = [operatorRepresentation['dim_head'] * operatorRepresentation['heads'], 1] + inputShapes[9] = [operatorRepresentation['dim_head'] * operatorRepresentation['heads'], 1] + inputShapes[10] = [operatorRepresentation['dim_head'] * operatorRepresentation['heads'], 1] + # WK Requant + inputShapes[11] = [1, 1] + inputShapes[12] = [1, 1] + inputShapes[13] = [1, 1] + # WV Requant + inputShapes[14] = [operatorRepresentation['dim_head'] * operatorRepresentation['heads'], 1] + inputShapes[15] = [operatorRepresentation['dim_head'] * operatorRepresentation['heads'], 1] + inputShapes[16] = [operatorRepresentation['dim_head'] * operatorRepresentation['heads'], 1] + # Kdiv Requanat + 
inputShapes[17] = [1, 1] + inputShapes[18] = [1, 1] + inputShapes[19] = [1, 1] + # Preattn Requant + inputShapes[20] = [1, 1] + inputShapes[21] = [1, 1] + inputShapes[22] = [1, 1] + # Postattn Requant + inputShapes[23] = [1, 1] + inputShapes[24] = [1, 1] + inputShapes[25] = [1, 1] + # WO Requant + inputShapes[26] = [operatorRepresentation['out_dim'], 1] + inputShapes[27] = [operatorRepresentation['out_dim'], 1] + inputShapes[28] = [operatorRepresentation['out_dim'], 1] + return (inputShapes, outputShapes) + + def computeOps(self): + + qLen = self.mapper.parser.operatorRepresentation['q_shape'][-1] + kLen = self.mapper.parser.operatorRepresentation['kv_shape'][-1] + inDim = self.mapper.parser.operatorRepresentation['q_shape'][-2] + heads = self.mapper.parser.operatorRepresentation['heads'] + dim_head = self.mapper.parser.operatorRepresentation['dim_head'] + out_dim = self.mapper.parser.operatorRepresentation['out_dim'] + + # q -> Q + QOps = qLen * 1 * inDim * heads * dim_head * 2 + # v -> V + VOps = kLen * 1 * inDim * heads * dim_head * 2 + # V -> K + KOps = kLen * heads * dim_head * 2 + # KOps = 0 + + EOps = heads * kLen * heads * dim_head + + MMKTV = heads * dim_head * kLen * dim_head * 2 + MMQA = heads * qLen * dim_head * dim_head * 2 + MMQE = heads * qLen * dim_head * 1 * 2 + + # Divs, Adds(eps), muls(delta, eps) + DivOps = heads * qLen * dim_head + heads * qLen + 2 * heads * qLen * dim_head + + OOps = (heads * dim_head) * qLen * out_dim * 1 * 2 + + return QOps + VOps + KOps + EOps + MMKTV + MMQA + MMQE + DivOps + OOps + + +class MHSALayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + outputShapes = [[inputShapes[0][0], operatorRepresentation['heads']] + inputShapes[0][1:]] + + return (inputShapes, outputShapes) + + def computeOps(self): + seqLen = 
self.mapper.parser.operatorRepresentation['S'] + dim = self.mapper.parser.operatorRepresentation['dim'] + dim_head = self.mapper.parser.operatorRepresentation['dim_head'] + heads = self.mapper.parser.operatorRepresentation['heads'] + QOps = seqLen * dim * dim_head * heads * 2 + # WQ * Q (H ) + KOps = seqLen * dim * dim_head * heads * 2 + # WK * K + VOps = seqLen * dim * dim_head * heads * 2 + # WV * V + QKOps = seqLen * seqLen * dim_head * heads * 2 + # Q * KT + AVOps = seqLen * seqLen * dim_head * heads * 2 + # N H S S * N H S D -> N H S D + OutOps = seqLen * dim_head * heads * dim * 2 + # WO * O + totOps = QOps + KOps + VOps + QKOps + AVOps + OutOps + return totOps + + +class DebugPrintLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py new file mode 100644 index 0000000..7c9e7e7 --- /dev/null +++ b/Deeploy/Targets/Generic/Parsers.py @@ -0,0 +1,2082 @@ +# ---------------------------------------------------------------------- +# +# File: BasicParsers.py +# +# Last edited: 15.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Authors: +# - Moritz Scherer, ETH Zurich +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from typing import Tuple + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import NetworkContext, NodeParser + + +class ConcatParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all(['axis' in node.attrs, len(node.inputs) >= 2, len(node.outputs) == 1]) + + if ret: + self.operatorRepresentation['axis'] = node.attrs['axis'] + return True + + return False + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_out'] = data_out.name + + for idx, _inp in enumerate(node.inputs): + data_in = ctxt.lookup(_inp.name) + self.operatorRepresentation[f'data_in_{idx+1}'] = _inp.name + + return ctxt, True + + +class iRMSNormParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> (bool): + + ret = all(['D' in node.attrs, 'n_levels' in node.attrs, len(node.inputs) == 2, len(node.outputs) == 1]) + + if ret: + + self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels']) + self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D'])) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + inputs = ['data_in', 'weight'] + outputs = ['data_out'] + + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name + + self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) + self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1] + + return ctxt, True + + +class RQSParserInterface(): + + def 
parseNode(self, node: gs.Node) -> (bool): + ret = all([ + 'div' in node.attrs, + any(['n_levels' in node.attrs, 'n_levels_out' in node.attrs]), + 'signed' in node.attrs, + ]) + + if ret: + if 'n_levels' in node.attrs: + self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) + else: + self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels_out'].values) + self.operatorRepresentation['signed'] = int(node.attrs['signed'].values) + self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['div'].values)) + + return ret + + +class SliceParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # Scheremo ONNX >= 10 + retNew = all([len(node.inputs) >= 3, len(node.inputs) <= 5, len(node.outputs) == 1]) + + # Scheremo ONNX < 10 + retOld = all([len(node.inputs) == 1, 'ends' in node.attrs, 'starts' in node.attrs, len(node.outputs) == 1]) + + if not (retNew or retOld): + return False + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['data_in_shape'] = data_in.shape + self.operatorRepresentation['data_out_shape'] = data_out.shape + self.operatorRepresentation['dims'] = len(data_in.shape) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + + if len(node.inputs) <= 1: + values = node.attrs['starts'] + startsTensor = gs.Constant(f'{node.name}_Starts_Tensor', values = values) + ctxt.hoistConstant(startsTensor) + node.inputs.append(startsTensor) + if len(node.inputs) <= 2: + values = node.attrs['ends'] + endsTensor = gs.Constant(f'{node.name}_Ends_Tensor', values = values) + ctxt.hoistConstant(endsTensor) + node.inputs.append(endsTensor) + if len(node.inputs) <= 3: + values = 
np.array(list(range(self.operatorRepresentation['dims']))) + axesTensor = gs.Constant(f'{node.name}_Axes_Tensor', values = values) + ctxt.hoistConstant(axesTensor) + node.inputs.append(axesTensor) + if len(node.inputs) <= 4: + values = np.ones((self.operatorRepresentation['dims'])) + stepsTensor = gs.Constant(f'{node.name}_Steps_Tensor', values = values) + ctxt.hoistConstant(stepsTensor) + node.inputs.append(stepsTensor) + + self.operatorRepresentation['starts'] = node.inputs[1].name + self.operatorRepresentation['ends'] = node.inputs[2].name + + self.operatorRepresentation['axes'] = node.inputs[3].name + self.operatorRepresentation['steps'] = node.inputs[4].name + + return ctxt, True + + +class TransposeParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all(['perm' in node.attrs, len(node.inputs) == 1, len(node.outputs) == 1]) + + if ret: + self.operatorRepresentation['perm'] = node.attrs['perm'] + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in_shape'] = data_in.shape + self.operatorRepresentation['data_out_shape'] = data_out.shape + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['data_in_size'] = np.prod(data_in.shape) + self.operatorRepresentation['data_out_size'] = np.prod(data_out.shape) + + return ctxt, True + + +class MaxPoolParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([ + 'ceil_mode' in node.attrs, 'kernel_shape' in node.attrs, 'pads' in node.attrs, 'strides' in node.attrs, + len(node.inputs) == 1, + len(node.outputs) >= 1 + ]) + + if ret: + self.operatorRepresentation['ceil_mode'] = 
node.attrs['ceil_mode'] + self.operatorRepresentation['pads'] = node.attrs['pads'] + self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape'] + self.operatorRepresentation['strides'] = node.attrs['strides'] + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['data_in_size'] = np.prod(data_in.shape) + self.operatorRepresentation['data_out_size'] = np.prod(data_out.shape) + + return ctxt, True + + +class MaxPool2DParser(MaxPoolParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = super().parseNode(node) + wellFormed = False + if ret: + pads = self.operatorRepresentation['pads'] + kernel_shape = self.operatorRepresentation['kernel_shape'] + strides = self.operatorRepresentation['strides'] + if len(pads) == 4 and len(kernel_shape) == 2 and len(strides) == 2: + wellFormed = True + + self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][3]) + self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) + self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) + self.operatorRepresentation['dim_kernel_x'] = 
int(self.operatorRepresentation['kernel_shape'][0]) + self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) + + return wellFormed + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + wellFormed = False + if ret: + data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) + data_out = newCtxt.lookup(self.operatorRepresentation['data_out']) + + self.operatorRepresentation['batch'] = data_in.shape[0] + if channels_first: + self.operatorRepresentation['ch_im_in'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['ch_im_out'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3] + else: + self.operatorRepresentation['ch_im_in'] = data_in.shape[3] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['ch_im_out'] = data_out.shape[3] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + + if len(data_in.shape) == 4 and len(data_out.shape) == 4: + wellFormed = True + + return newCtxt, wellFormed + + +class PadParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([ + 'mode' in node.attrs, 'pads' in node.attrs, 'value' in node.attrs, + len(node.inputs) == 1, + len(node.outputs) == 1 + ]) + + if ret: + self.operatorRepresentation['mode'] = node.attrs['mode'] + self.operatorRepresentation['pads'] = node.attrs['pads'] + self.operatorRepresentation['value'] = node.attrs['value'] + + return ret + + def 
parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['data_in_size'] = np.prod(data_in.shape) + self.operatorRepresentation['data_out_size'] = np.prod(data_out.shape) + + return ctxt, True + + +class Pad2DParser(PadParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = super().parseNode(node) + wellFormed = False + if ret: + pads = self.operatorRepresentation['pads'] + if len(pads) == 8 and pads[0] == 0 and pads[4] == 0 \ + and pads[1] == 0 and pads[5] == 0: + wellFormed = True + self.operatorRepresentation['pad_x'] = int(pads[3]) + self.operatorRepresentation['pad_y'] = int(pads[2]) + + return wellFormed + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + wellFormed = False + if ret: + data_in = newCtxt.lookup(node.inputs[0].name) + data_out = newCtxt.lookup(node.outputs[0].name) + if len(data_in.shape) == 4: + wellFormed = True + self.operatorRepresentation['batch'] = data_in.shape[0] + if channels_first: + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['dim_im_in_ch'] = data_in.shape[1] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3] + self.operatorRepresentation['dim_im_out_ch'] = data_out.shape[1] + else: + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + 
self.operatorRepresentation['dim_im_in_ch'] = data_in.shape[3] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_ch'] = data_out.shape[3] + return newCtxt, wellFormed + + +class Pad1DParser(PadParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = super().parseNode(node) + wellFormed = False + if ret: + pads = self.operatorRepresentation['pads'] + if len(pads) == 6 and pads[0] == 0 and pads[3] == 0 \ + and pads[1] == 0 and pads[4] == 0: + wellFormed = True + self.operatorRepresentation['pad_y'] = int(pads[2]) + self.operatorRepresentation['pad_x'] = 0 + + return wellFormed + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + wellFormed = False + if ret: + data_in = newCtxt.lookup(node.inputs[0].name) + data_out = newCtxt.lookup(node.outputs[0].name) + if len(data_in.shape) == 3: + wellFormed = True + self.operatorRepresentation['batch'] = data_in.shape[0] + self.operatorRepresentation['dim_im_in_x'] = 1 + self.operatorRepresentation['dim_im_out_x'] = 1 + if channels_first: + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_ch'] = data_in.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_ch'] = data_out.shape[1] + else: + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_ch'] = data_in.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_ch'] = data_out.shape[2] + return newCtxt, wellFormed + + +class AddParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) 
# NOTE(review): this chunk is the interior of a larger parser module. The
# parser definition that ends just before ReduceParser and the RQGEMMParser
# definition that continues past the end of this chunk are outside this view
# and are intentionally not reproduced here.


class ReduceParser(NodeParser):
    """Base parser for ONNX Reduce* operators (single input, single output)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            'axes' in node.attrs,
            'keepdims' in node.attrs,
            len(node.inputs) == 1,
            len(node.outputs) == 1,
        ])

        if ret:
            # Normalize a scalar axis to a one-element list so downstream code
            # can always index axes[0].
            if isinstance(node.attrs['axes'], int):
                self.operatorRepresentation['axes'] = [node.attrs['axes']]
            else:
                self.operatorRepresentation['axes'] = node.attrs['axes']
            self.operatorRepresentation['keepdims'] = int(node.attrs['keepdims'])

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        data_in = ctxt.lookup(node.inputs[0].name)
        data_out = ctxt.lookup(node.outputs[0].name)
        self.operatorRepresentation['data_in'] = data_in.name
        self.operatorRepresentation['data_out'] = data_out.name
        self.operatorRepresentation['data_in_shape'] = data_in.shape
        self.operatorRepresentation['data_out_shape'] = data_out.shape
        self.operatorRepresentation['size'] = np.prod(data_in.shape)
        # Length of the first reduced axis; kernels only use axes[0] here.
        self.operatorRepresentation['axisLength'] = data_in.shape[self.operatorRepresentation['axes'][0]]

        return ctxt, True


class ReduceMeanParser(ReduceParser):
    """Parser for ReduceMean; all checks are inherited from ReduceParser."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return super().parseNode(node)

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        return super().parseNodeCtxt(ctxt, node, channels_first)


class ReduceSumParser(ReduceParser):
    """Parser for ReduceSum; all checks are inherited from ReduceParser."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return super().parseNode(node)

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        return super().parseNodeCtxt(ctxt, node, channels_first)


class SoftmaxParser(NodeParser):
    """Parser for quantized Softmax nodes carrying an 'n_levels' attribute."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        # FIX: 'n_levels' used to be read unconditionally below; a node without
        # the attribute raised KeyError instead of being rejected with False.
        ret = all([
            'n_levels' in node.attrs,
            len(node.inputs) == 1,
            len(node.outputs) == 1,
        ])

        if ret:
            self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        data_in = ctxt.lookup(node.inputs[0].name)
        data_out = ctxt.lookup(node.outputs[0].name)
        self.operatorRepresentation['data_in'] = data_in.name
        self.operatorRepresentation['data_out'] = data_out.name
        self.operatorRepresentation['size'] = np.prod(data_in.shape)
        # Softmax is applied along the last axis.
        self.operatorRepresentation['lastDimLength'] = data_in.shape[-1]

        return ctxt, True


class iSoftmaxParser(SoftmaxParser):
    """Parser for the integer softmax kernel (polynomial approximation coefficients)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = super().parseNode(node)

        if wellFormed:
            wellFormed = all([
                'coeffA' in node.attrs,
                'coeffB' in node.attrs,
                'coeffC' in node.attrs,
                'log2' in node.attrs,
            ])

        if wellFormed:
            self.operatorRepresentation['coeffA'] = int(node.attrs['coeffA'].values)
            self.operatorRepresentation['coeffB'] = int(node.attrs['coeffB'].values)
            self.operatorRepresentation['coeffC'] = int(node.attrs['coeffC'].values)
            self.operatorRepresentation['log2'] = int(node.attrs['log2'].values)

        return wellFormed

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        return super().parseNodeCtxt(ctxt, node, channels_first)


class ITAMaxParser(SoftmaxParser):
    """Parser for the ITA accelerator's softmax variant."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = super().parseNode(node)

        if wellFormed and 'n_levels' in node.attrs:
            self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values)
            return True

        return False

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        # Work on a copy so failed parses do not pollute the caller's context.
        ctxt = ctxt.copy()
        return super().parseNodeCtxt(ctxt, node, channels_first)


class ITAPartialMaxParser(SoftmaxParser):
    """Parser for ITA's partial softmax, computed over groups of 'group_width' entries."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = super().parseNode(node)

        if wellFormed and 'group_width' in node.attrs and 'n_levels' in node.attrs:
            self.operatorRepresentation['group_width'] = int(node.attrs['group_width'])
            self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values)
            return True

        return False

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        return super().parseNodeCtxt(ctxt, node, channels_first)


class iGELUParser(NodeParser):
    """Parser for the integer GELU approximation (attributes 'b' and 'one')."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            'b' in node.attrs,
            'one' in node.attrs,
            len(node.inputs) >= 1,
            len(node.outputs) == 1,
        ])

        if ret:
            self.operatorRepresentation['b'] = node.attrs['b']
            self.operatorRepresentation['one'] = node.attrs['one']

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        data_in = ctxt.lookup(node.inputs[0].name)
        data_out = ctxt.lookup(node.outputs[0].name)
        self.operatorRepresentation['data_in'] = data_in.name
        self.operatorRepresentation['data_out'] = data_out.name
        self.operatorRepresentation['size'] = np.prod(data_in.shape)

        return ctxt, True


class RQSiGELUParser(iGELUParser):
    """Parser for integer GELU fused with a requantization step (4 inputs)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = len(node.inputs) == 4
        ret = super().parseNode(node)
        return ret and wellFormed

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        inputs = ['data_in', 'mul', 'add', 'shift']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name

        return newCtxt, True


class iHardswishParser(NodeParser):
    """Parser for the integer Hardswish approximation."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            'one_over_six' in node.attrs,
            'six' in node.attrs,
            'three' in node.attrs,
        ])

        if ret:
            self.operatorRepresentation['one_over_six'] = node.attrs['one_over_six']
            self.operatorRepresentation['six'] = node.attrs['six']
            self.operatorRepresentation['three'] = node.attrs['three']

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        data_in = ctxt.lookup(node.inputs[0].name)
        data_out = ctxt.lookup(node.outputs[0].name)
        self.operatorRepresentation['data_in'] = data_in.name
        self.operatorRepresentation['data_out'] = data_out.name
        self.operatorRepresentation['size'] = np.prod(data_in.shape)

        return ctxt, True


class RQSiHardswishParser(iHardswishParser):
    """Parser for integer Hardswish fused with requantization (attributes mul/add/shift)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = all([
            len(node.inputs) == 1,
            'mul' in node.attrs,
            'add' in node.attrs,
            'shift' in node.attrs,
        ])
        ret = super().parseNode(node)

        if ret and wellFormed:
            self.operatorRepresentation['mul'] = node.attrs['mul']
            self.operatorRepresentation['add'] = node.attrs['add']
            self.operatorRepresentation['shift'] = node.attrs['shift']
            return True

        return False

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        inputs = ['data_in']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name

        return newCtxt, True


class GatherParser(NodeParser):
    """Parser for ONNX Gather (data + constant indices along 'axis')."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            'axis' in node.attrs,
            len(node.inputs) == 2,
            len(node.outputs) == 1,
        ])

        if ret:
            self.operatorRepresentation['axis'] = node.attrs['axis']

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['data_in', 'indices']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        axis = self.operatorRepresentation['axis']
        # NOTE(review): indices are assumed to be a hoisted constant with a
        # '.values' array — a dynamic indices tensor would fail here.
        self.operatorRepresentation['numIndices'] = int(
            np.prod(ctxt.lookup(self.operatorRepresentation['indices']).values.shape))
        # Stride (in elements) between consecutive entries along 'axis'.
        self.operatorRepresentation['offset'] = np.prod(ctxt.lookup(node.inputs[0].name).shape[axis + 1:])
        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)

        return ctxt, True


class FlattenParser(NodeParser):
    """Parser for ONNX Flatten (no data movement, shape bookkeeping only)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            'axis' in node.attrs,
            len(node.inputs) == 1,
            len(node.outputs) == 1,
        ])

        if ret:
            self.operatorRepresentation['axis'] = node.attrs['axis']

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['data_in']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        return ctxt, True


class UnsqueezeParser(NodeParser):
    """Parser for ONNX Unsqueeze (axes given as a node attribute)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            'axes' in node.attrs,
            len(node.inputs) == 1,
            len(node.outputs) == 1,
        ])

        if ret:
            self.operatorRepresentation['axes'] = node.attrs['axes']

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['data_in']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        return ctxt, True


class ReshapeParser(NodeParser):
    """Parser for ONNX Reshape (data + shape tensor)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return all([len(node.inputs) == 2, len(node.outputs) == 1])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['data_in', 'indices']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)

        return ctxt, True


class RequantShiftParser(NodeParser, RQSParserInterface):
    """Parser for RequantShift nodes (per-channel mul/add requantization)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret_rqs = RQSParserInterface.parseNode(self, node)

        return all([
            ret_rqs == True,
            len(node.inputs) == 3,
            len(node.outputs) == 1,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['data_in', 'mul', 'add']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        data_in = ctxt.lookup(node.inputs[0].name)

        self.operatorRepresentation['size'] = np.prod(data_in.shape)
        # NOTE(review): channels are assumed at index 1 (channels-first layout)
        # regardless of the channels_first argument — TODO confirm upstream.
        self.operatorRepresentation['channels'] = data_in.shape[1]
        self.operatorRepresentation['channel_width'] = int(self.operatorRepresentation['size'] /
                                                           self.operatorRepresentation['channels'])

        # For batched 4D tensors the per-channel width excludes the batch dim.
        if len(data_in.shape) == 4:
            self.operatorRepresentation['batch'] = data_in.shape[0]
            self.operatorRepresentation['channel_width'] = int(self.operatorRepresentation['channel_width'] /
                                                               self.operatorRepresentation['batch'])

        return ctxt, True


class UniformRequantShiftParser(RequantShiftParser):
    """RequantShift restricted to scalar (uniform) mul and add constants."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret1 = super().parseNode(node)

        ret2 = all([
            np.prod(node.inputs[1].values.shape) == 1,
            np.prod(node.inputs[2].values.shape) == 1,
        ])

        return ret1 and ret2


class MulParser(NodeParser):
    """Parser for elementwise Mul (two inputs, one output)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return all([
            len(node.inputs) == 2,
            len(node.outputs) == 1,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['A', 'B']
        outputs = ['C']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
        self.operatorRepresentation['sizeB'] = np.prod(ctxt.lookup(node.inputs[1].name).shape)

        return ctxt, True


class ConvParser(NodeParser):
    """Base parser for ONNX Conv nodes; optionally hoists a zero bias tensor."""

    def __init__(self, noBiasHoisting):
        super().__init__()
        self.noBiasHoisting = noBiasHoisting

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = all([
            'dilations' in node.attrs,
            'group' in node.attrs,
            'kernel_shape' in node.attrs,
            'pads' in node.attrs,
            'strides' in node.attrs,
            len(node.outputs) == 1,
        ])

        if wellFormed:
            self.operatorRepresentation['group'] = node.attrs['group']
            self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape']
            self.operatorRepresentation['pads'] = node.attrs['pads']
            self.operatorRepresentation['strides'] = node.attrs['strides']
            self.operatorRepresentation['dilations'] = node.attrs['dilations']

        return wellFormed

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['data_in', 'weight']
        outputs = ['data_out']

        # Conv may carry a third (bias) input; only the first two are named here.
        for idx, inputNode in enumerate(node.inputs):
            if idx < len(inputs):
                self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        if len(node.inputs) == 3:
            self.operatorRepresentation['bias'] = ctxt.lookup(node.inputs[2].name).name
        elif not self.noBiasHoisting:
            # Hoist an all-zero bias so kernels can assume a bias is present.
            values = np.zeros((1))
            zeroTensor = gs.Constant(f'{node.name}_Bias_Tensor', values = values)
            ctxt.hoistConstant(zeroTensor)
            node.inputs.append(zeroTensor)
            self.operatorRepresentation['bias'] = f'{node.name}_Bias_Tensor'

        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)

        return ctxt, True


class Conv2DParser(ConvParser):
    """Parser for 2D convolutions (2D kernel/strides/dilations, 4 pad values)."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = super().parseNode(node)
        ret = False

        if wellFormed:
            ret = all([
                len(node.attrs['kernel_shape']) == 2,  # 2D kernel
                len(node.attrs['strides']) == 2,  # 2D strides
                len(node.attrs['pads']) == 4,
                len(node.attrs['dilations']) == 2,
            ])

        if ret:
            # NOTE(review): padding_x/y take pads[0]/pads[1], i.e. only the
            # begin-paddings; symmetric padding is presumably assumed — confirm.
            self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0])
            self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1])
            self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0])
            self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1])
            self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0])
            self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1])
            self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0])
            self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1])
            self.operatorRepresentation['bias_shift'] = int(0)
            self.operatorRepresentation['out_shift'] = int(0)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            data_in = newCtxt.lookup(self.operatorRepresentation['data_in'])
            data_out = newCtxt.lookup(self.operatorRepresentation['data_out'])
            weight = newCtxt.lookup(self.operatorRepresentation['weight'])

            self.operatorRepresentation['batch'] = data_in.shape[0]
            if channels_first:
                # NCHW layout
                self.operatorRepresentation['ch_im_in'] = data_in.shape[1]
                self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2]
                self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3]
                self.operatorRepresentation['ch_im_out'] = data_out.shape[1]
                self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2]
                self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3]
            else:
                # NHWC layout
                self.operatorRepresentation['ch_im_in'] = data_in.shape[3]
                self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1]
                self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2]
                self.operatorRepresentation['ch_im_out'] = data_out.shape[3]
                self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1]
                self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2]

            if len(data_in.shape) == 4 and len(weight.shape) == 4:
                return newCtxt, True

        return ctxt, False


class RQSConv2DParser(Conv2DParser, RQSParserInterface):
    """2D convolution fused with requantization."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:
        ret_rqs = RQSParserInterface.parseNode(self, node)
        ret_conv = Conv2DParser.parseNode(self, node)

        return all([
            ret_rqs == True,
            ret_conv == True,
        ])


class Conv1DParser(ConvParser):
    """Parser for 1D convolutions (1D kernel/strides/dilations, 2 pad values)."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:
        wellFormed = super().parseNode(node)
        ret = False

        if wellFormed:
            ret = all([
                len(node.attrs['kernel_shape']) == 1,  # 1D kernel
                len(node.attrs['strides']) == 1,  # 1D strides
                len(node.attrs['pads']) == 2,
                len(node.attrs['dilations']) == 1,
            ])

        if ret:
            self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][0])
            self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][0])
            self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][0])
            self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][0])
            self.operatorRepresentation['bias_shift'] = int(0)
            self.operatorRepresentation['out_shift'] = int(0)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            data_in = newCtxt.lookup(self.operatorRepresentation['data_in'])
            data_out = newCtxt.lookup(self.operatorRepresentation['data_out'])
            weight = newCtxt.lookup(self.operatorRepresentation['weight'])

            self.operatorRepresentation['batch'] = data_in.shape[0]
            # 1D convs reuse the 2D kernels with a unit x-dimension.
            self.operatorRepresentation['dim_im_in_x'] = 1
            self.operatorRepresentation['dim_im_out_x'] = 1

            if channels_first:
                self.operatorRepresentation['ch_im_in'] = data_in.shape[1]
                self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2]
                self.operatorRepresentation['ch_im_out'] = data_out.shape[1]
                self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2]
            else:
                self.operatorRepresentation['ch_im_in'] = data_in.shape[2]
                self.operatorRepresentation['dim_im_in_y'] = data_in.shape[1]
                self.operatorRepresentation['ch_im_out'] = data_out.shape[2]
                self.operatorRepresentation['dim_im_out_y'] = data_out.shape[1]

            if len(data_in.shape) == 3 and len(weight.shape) == 3:
                return newCtxt, True

        return ctxt, False


class RQSConv1DParser(Conv1DParser, RQSParserInterface):
    """1D convolution fused with requantization."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:
        ret_rqs = RQSParserInterface.parseNode(self, node)
        ret_conv = Conv1DParser.parseNode(self, node)

        return all([
            ret_rqs == True,
            ret_conv == True,
        ])


class MHSAParser(NodeParser):
    """Parser for fused multi-head self-attention nodes."""

    # Requantization attributes copied verbatim; scalar attributes cast to int.
    _REQUANT_ATTRS = [
        'preattn_requant_mul', 'preattn_requant_div', 'postattn_requant_mul', 'postattn_requant_div',
        'wo_requant_mul', 'wo_requant_div', 'wq_requant_mul', 'wq_requant_div', 'wk_requant_mul', 'wk_requant_div',
        'wv_requant_mul', 'wv_requant_div'
    ]
    _SCALAR_ATTRS = ['n_levels', 'dim', 'dim_head', 'heads', 'signed']

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            *[attr in node.attrs for attr in self._REQUANT_ATTRS],
            *[attr in node.attrs for attr in self._SCALAR_ATTRS],
            len(node.inputs) == 11,
            len(node.outputs) == 1,
        ])

        if ret:
            for attr in self._REQUANT_ATTRS:
                self.operatorRepresentation[attr] = node.attrs[attr]
            # dim = sequence length, dim_head = projection size
            for attr in self._SCALAR_ATTRS:
                self.operatorRepresentation[attr] = int(node.attrs[attr])

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = [
            'q', 'k', 'v', 'wq_weight', 'wq_bias', 'wk_weight', 'wk_bias', 'wv_weight', 'wv_bias', 'wo_weight',
            'wo_bias'
        ]
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
            self.operatorRepresentation[inputs[idx] + '_shape'] = ctxt.lookup(inputNode.name).shape
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name
            self.operatorRepresentation[outputs[idx] + '_shape'] = ctxt.lookup(outputNode.name).shape

        # Total footprint of all inputs (activations + weights + biases).
        self.operatorRepresentation['size'] = np.sum([np.prod(ctxt.lookup(x.name).shape) for x in node.inputs])

        return ctxt, True


class LinearAttentionParser(NodeParser):
    """Parser for fused linear-attention nodes."""

    _REQUANT_PREFIXES = ['preattn', 'normalizer', 'postattn', 'wo', 'wq', 'wk', 'wv']
    _SCALAR_ATTRS = ['Delta', 'eps', 'act_type']
    _VALUE_ATTRS = ['n_levels', 'dim', 'dim_head', 'heads']

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        # FIX: the original check covered only *_requant_mul / *_requant_div,
        # but *_requant_shift was read unconditionally below, raising KeyError
        # on nodes missing it instead of returning False.
        requant_attrs = [
            f'{prefix}_requant_{suffix}' for prefix in self._REQUANT_PREFIXES for suffix in ('mul', 'shift', 'div')
        ]

        ret = all([
            *[attr in node.attrs for attr in requant_attrs],
            *[attr in node.attrs for attr in self._SCALAR_ATTRS],
            *[attr in node.attrs for attr in self._VALUE_ATTRS],
            len(node.inputs) == 11,
            len(node.outputs) == 1,
        ])

        if ret:
            for prefix in self._REQUANT_PREFIXES:
                self.operatorRepresentation[f'{prefix}_requant_mul'] = int(
                    node.attrs[f'{prefix}_requant_mul'].values)
                self.operatorRepresentation[f'{prefix}_requant_shift'] = int(
                    node.attrs[f'{prefix}_requant_shift'].values)
                # Divisions are powers of two; store the shift amount.
                self.operatorRepresentation[f'{prefix}_requant_div'] = int(
                    math.log2(int(node.attrs[f'{prefix}_requant_div'].values)))
            for attr in self._SCALAR_ATTRS:
                self.operatorRepresentation[attr] = int(node.attrs[attr])
            for attr in self._VALUE_ATTRS:
                self.operatorRepresentation[attr] = int(node.attrs[attr].values)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = [
            'q', 'k', 'v', 'wq_weight', 'wq_bias', 'wk_weight', 'wk_bias', 'wv_weight', 'wv_bias', 'wo_weight',
            'wo_bias'
        ]
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
        self.operatorRepresentation['q_shape'] = ctxt.lookup(node.inputs[0].name).shape

        return ctxt, True


class CLCAParser(NodeParser):
    """Parser for fused cross-layer cross-attention (CLCA) nodes."""

    _SCALAR_ATTRS = ['Delta', 'eps', 'eta', 'act_type']
    _VALUE_ATTRS = ['n_levels', 'dim', 'dim_head', 'out_dim', 'heads']

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            *[attr in node.attrs for attr in self._SCALAR_ATTRS],
            *[attr in node.attrs for attr in self._VALUE_ATTRS],
            len(node.inputs) == 29,
            len(node.outputs) == 1,
        ])

        if ret:
            for attr in self._SCALAR_ATTRS:
                self.operatorRepresentation[attr] = int(node.attrs[attr])
            for attr in self._VALUE_ATTRS:
                self.operatorRepresentation[attr] = int(node.attrs[attr].values)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = [
            'q', 'k', 'wq_weight', 'wq_bias', 'wk_weight', 'wk_bias', 'wo_weight', 'wo_bias', 'wq_requant_mul',
            'wq_requant_add', 'wq_requant_div', 'wk_requant_mul', 'wk_requant_add', 'wk_requant_div', 'wv_requant_mul',
            'wv_requant_add', 'wv_requant_div', 'kdiv_requant_mul', 'kdiv_requant_add', 'kdiv_requant_div',
            'preattn_requant_mul', 'preattn_requant_add', 'preattn_requant_div', 'postattn_requant_mul',
            'postattn_requant_add', 'postattn_requant_div', 'wo_requant_mul', 'wo_requant_add', 'wo_requant_div'
        ]
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        self.operatorRepresentation['input_size_Q'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
        self.operatorRepresentation['input_size_KV'] = np.prod(ctxt.lookup(node.inputs[1].name).shape)
        self.operatorRepresentation['q_shape'] = ctxt.lookup(node.inputs[0].name).shape
        self.operatorRepresentation['kv_shape'] = ctxt.lookup(node.inputs[1].name).shape

        return ctxt, True


class iLayerNormParser(NodeParser):
    """Parser for integer LayerNorm nodes ('D' is the power-of-two divisor)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            'D' in node.attrs,
            'n_levels' in node.attrs,
            len(node.inputs) == 3,
            len(node.outputs) == 1,
        ])

        if ret:
            self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values)
            self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D'].values))

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        inputs = ['data_in', 'weight', 'bias']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
        # Normalization runs over the last axis.
        self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1]

        return ctxt, True


class MatMulParser(NodeParser):
    """Parser for MatMul; fills in GEMM-compatible attributes so MatMul and GEMM share kernels."""

    def __init__(self, noBiasHoisting = True):
        super().__init__()
        self.noBiasHoisting = noBiasHoisting

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([len(node.inputs) >= 2, len(node.outputs) == 1])

        # Assign GEMM-like attributes to be able to reuse the same kernel binding.
        if ret:
            self.operatorRepresentation['alpha'] = 1
            self.operatorRepresentation['beta'] = 1
            self.operatorRepresentation['transB'] = 0
            self.operatorRepresentation['transA'] = 0

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        ret = True

        inputs = ['A', 'B']
        outputs = ['data_out']

        for idx, inputNode in enumerate(node.inputs):
            if idx < len(inputs):
                self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
        for idx, outputNode in enumerate(node.outputs):
            self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

        # Create a fake C node for GEMM-compatibility and hoist it.
        if not self.noBiasHoisting:
            values = np.zeros((1))
            zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values)
            ctxt.hoistConstant(zeroTensor)
            node.inputs.append(zeroTensor)
            self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'

        shapeA = ctxt.lookup(node.inputs[0].name).shape
        shapeB = ctxt.lookup(node.inputs[1].name).shape
        transA = self.operatorRepresentation['transA']
        transB = self.operatorRepresentation['transB']

        self.operatorRepresentation['size'] = np.prod(shapeA)
        self.operatorRepresentation['A_shape'] = shapeA
        self.operatorRepresentation['B_shape'] = shapeB
        self.operatorRepresentation['M'] = shapeA[-2 + transA]
        self.operatorRepresentation['N'] = shapeA[-1 - transA]
        self.operatorRepresentation['O'] = shapeB[-1 - transB]

        # SCHEREMO: Assert that the reduction dimension is the same on both matrices
        ret = ret and (self.operatorRepresentation['N'] == shapeB[-2 + transB])

        self.operatorRepresentation['batch'] = np.prod(shapeA[:-2])

        # SCHEREMO: Assert that batch is the same on both matrices
        W_batched = (self.operatorRepresentation['batch'] == np.prod(shapeB[:-2]))
        self.operatorRepresentation['W_batched'] = W_batched

        return ctxt, ret


class RQMatMulParser(MatMulParser, RQSParserInterface):
    """MatMul fused with requantization (extra add/mul inputs)."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)
        self.noBiasHoisting = noBiasHoisting

    def parseNode(self, node: gs.Node) -> bool:
        ret_rqs = RQSParserInterface.parseNode(self, node)
        ret_matmul = MatMulParser.parseNode(self, node)

        return all([
            ret_rqs == True,
            ret_matmul == True,
            len(node.inputs) == 4,
            len(node.outputs) == 1,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            inputs = ['A', 'B', 'add', 'mul']
            outputs = ['data_out']

            for idx, inputNode in enumerate(node.inputs):
                self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
            for idx, outputNode in enumerate(node.outputs):
                self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name

        return newCtxt, ret


# This parser combines MatMul nodes and GEMM nodes to the more general GEMM nodes
class GEMMParser(MatMulParser):
    """Parser for GEMM; falls back to MatMul semantics for missing attributes."""

    def __init__(self, noBiasHoisting = True):
        # FIX: previously the flag was stored and then super().__init__() was
        # called with no argument, so MatMulParser re-assigned
        # self.noBiasHoisting = True and clobbered a caller's False.
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:
        ret = all([
            len(node.inputs) >= 2,
            len(node.outputs) == 1,
        ])

        # This is a GEMM node: take attributes or their ONNX defaults.
        if ret:
            self.operatorRepresentation['alpha'] = node.attrs.get('alpha', 1)
            self.operatorRepresentation['beta'] = node.attrs.get('beta', 1)
            self.operatorRepresentation['transA'] = node.attrs.get('transA', 0)
            self.operatorRepresentation['transB'] = node.attrs.get('transB', 0)
            return True

        # This might be a MatMul node -> cast up
        return False

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        # We are a true GEMM
        if ret:
            inputs = ['A', 'B']
            outputs = ['data_out']

            for idx, inputNode in enumerate(node.inputs):
                if idx < len(inputs):
                    self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
            for idx, outputNode in enumerate(node.outputs):
                self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name

            if len(node.inputs) == 3:
                self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name
            elif not self.noBiasHoisting:
                values = np.zeros((1))
                zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values)
                newCtxt.hoistConstant(zeroTensor)
                self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'

            self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape)

        return newCtxt, ret
+ inputs = ['A', 'B', 'C', 'add', 'mul'] + outputs = ['data_out'] + + for idx, inputNode in enumerate(node.inputs): + if idx < len(inputs): + self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name + + if len(node.inputs) == 5: + self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name + elif not self.noBiasHoisting: + values = np.zeros((1)) + zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values) + newCtxt.hoistConstant(zeroTensor) + self.operatorRepresentation['C'] = f'{node.name}_C_Tensor' + + return newCtxt, ret + + +class DummyParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + inputs = [] + outputs = [] + for i in node.inputs: + inputs.append(ctxt.lookup(i.name)) + for i in node.outputs: + outputs.append(ctxt.lookup(i.name)) + + self.operatorRepresentation['data_in'] = inputs[0].name + self.operatorRepresentation['data_out'] = outputs[0].name + self.operatorRepresentation['size'] = np.prod(inputs[0].shape) + + return ctxt, True + + +class IntegerDivParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([ + len(node.inputs) >= 2, + len(node.outputs) == 1, + 'Delta' in node.attrs, + 'eps' in node.attrs, + 'eta' in node.attrs, + ]) + + if ret: + self.operatorRepresentation['Delta'] = node.attrs['Delta'] + self.operatorRepresentation['eps'] = node.attrs['eps'] + self.operatorRepresentation['eta'] = node.attrs['eta'] + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + inputs = ["A", "B"] + outputs = ["C"] 
+ for idx, inputNode in enumerate(node.inputs): + if idx < len(inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name + + self.operatorRepresentation['sizeA'] = np.prod(ctxt.lookup(self.operatorRepresentation['A']).shape) + self.operatorRepresentation['sizeB'] = np.prod(ctxt.lookup(self.operatorRepresentation['B']).shape) + + for idx, (a, b) in enumerate( + zip( + ctxt.lookup(self.operatorRepresentation['A']).shape, + ctxt.lookup(self.operatorRepresentation['B']).shape)): + if a != b: + self.operatorRepresentation['nomStep'] = np.prod( + ctxt.lookup(self.operatorRepresentation['A']).shape[idx:]) + self.operatorRepresentation['denomStep'] = np.prod( + ctxt.lookup(self.operatorRepresentation['B']).shape[idx:]) + break + + return ctxt, True + + +class RQIntegerDivParser(IntegerDivParser, RQSParserInterface): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + ret = RQSParserInterface.parseNode(self, node) + + if ret: + ret = IntegerDivParser.parseNode(self, node) + + wellFormed = all([ + len(node.inputs) == 5, + ]) + + if ret: + return wellFormed + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + inputs = ["A", "B", "requant_mul", "requant_add", "requant_div"] + outputs = ["C"] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name + + return newCtxt, ret + + +class DebugParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = 
all([len(node.inputs) == 1, len(node.outputs) == 1],) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['size'] = np.prod(data_in.shape) + + wellFormed = False + if len(data_in.shape) == 4: + wellFormed = True + self.operatorRepresentation['batch'] = data_in.shape[0] + if channels_first: + self.operatorRepresentation['dim_im_in_ch'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['dim_im_out_ch'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3] + else: + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_ch'] = data_in.shape[3] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_ch'] = data_out.shape[3] + + if len(data_in.shape) == 3: + wellFormed = True + self.operatorRepresentation['batch'] = data_in.shape[0] + self.operatorRepresentation['dim_im_in_ch'] = 1 + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['dim_im_out_ch'] = 1 + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + + if len(data_in.shape) == 2: + wellFormed = True + self.operatorRepresentation['batch'] = data_in.shape[0] + 
self.operatorRepresentation['dim_im_in_x'] = 1 + self.operatorRepresentation['dim_im_out_x'] = 1 + self.operatorRepresentation['dim_im_in_ch'] = 1 + self.operatorRepresentation['dim_im_out_ch'] = 1 + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[1] + + return ctxt, wellFormed + + +class GenericMaxPool2DParser(MaxPool2DParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + wellFormed = super().parseNode(node) + if wellFormed: + ret = all([ + all([pad == 0 for pad in self.operatorRepresentation['pads']]), self.operatorRepresentation['ceil_mode'] + == 0 + ],) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + return newCtxt, ret + + +class GenericConv1DParser(Conv1DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + wellFormed = super().parseNode(node) + + if wellFormed: + ret = all([ + # Make sure padding is square + self.operatorRepresentation['group'] == 1, + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + self.operatorRepresentation['pads'][0] == 0, + all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]), + ]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + inputs = ['data_in', 'weight'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + return newCtxt, True + + return ctxt, False + + +class GenericDWConv1DParser(Conv1DParser): + + def __init__(self, noBiasHoisting = 
True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + wellFormed = super().parseNode(node) + + if wellFormed: + ret = all([ + # Make sure padding is square + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + self.operatorRepresentation['pads'][0] == 0, + all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]), + ]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + inputs = ['data_in', 'weight'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + if self.operatorRepresentation['group'] == self.operatorRepresentation['ch_im_in']: + return newCtxt, True + + return ctxt, False + + +class GenericConv2DParser(Conv2DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + wellFormed = super().parseNode(node) + + if wellFormed: + ret = all([ + # Make sure padding is square + self.operatorRepresentation['group'] == 1, + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + self.operatorRepresentation['pads'][0] == 0, + all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]), + ]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + inputs = ['data_in', 'weight'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = 
ctxt.lookup(inputNode.name).name + return newCtxt, True + + return ctxt, False + + +class GenericDWConv2DParser(Conv2DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + wellFormed = super().parseNode(node) + + if wellFormed: + ret = all([ + # Make sure padding is square + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + self.operatorRepresentation['pads'][0] == 0, + all([coeff == 1 for coeff in self.operatorRepresentation['dilations']]), + ]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + inputs = ['data_in', 'weight'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + if self.operatorRepresentation['group'] == self.operatorRepresentation['ch_im_in']: + return newCtxt, True + + return ctxt, False + + +class GenericGEMMParser(GEMMParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + + wellFormed = super().parseNode(node) + return wellFormed + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if ret: + # Try to scale A offline if possible, else fail + if not self.operatorRepresentation['alpha'].is_integer(): + nameA = self.operatorRepresentation['A'] + if newCtxt.is_global(nameA) and isinstance(newCtxt.lookup(nameA), ConstantBuffer): + A = newCtxt.lookup(nameA) + npA = 
np.asarray(A.values).reshape(A.shape) + newA = npA * self.operatorRepresentation['beta'] + newCtxt.globalObjects[nameA].values = newA + self.operatorRepresentation['alpha'] = 1.0 + else: + return newCtxt, False + # Try to scale B offline if possible, else fail + if not self.operatorRepresentation['beta'].is_integer(): + nameB = self.operatorRepresentation['B'] + if newCtxt.is_global(nameB) and isinstance(newCtxt.lookup(nameB), ConstantBuffer): + B = newCtxt.lookup(nameB) + npB = np.asarray(B.values).reshape(B.shape) + newB = npB * self.operatorRepresentation['beta'] + newCtxt.globalObjects[nameB].values = newB + self.operatorRepresentation['beta'] = 1.0 + else: + return newCtxt, False + + self.operatorRepresentation['alpha'] = int(self.operatorRepresentation['alpha']) + self.operatorRepresentation['beta'] = int(self.operatorRepresentation['beta']) + return newCtxt, True + + return ctxt, False diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py new file mode 100644 index 0000000..9142b26 --- /dev/null +++ b/Deeploy/Targets/Generic/Platform.py @@ -0,0 +1,175 @@ +# ---------------------------------------------------------------------- +# +# File: GenericPlatform.py +# +# Last edited: 17.12.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: +# - Moritz Scherer, ETH Zurich +# - Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ + StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConv1DBinding, BasicConv2DBinding, \ + BasicDebugPrintBindings, BasicDWConv1DBinding, BasicDWConv2DBinding, BasicGatherBindings, BasicGELUBinding, \ + BasicGEMMBinding, BasicIntegerDivBinding, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ + BasicLayerNormBinding, BasicMatMulBinding, BasicMaxPool2DBinding, BasicMulBindings, BasicPad1DBindings, \ + BasicPad2DBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, \ + BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBinding, \ + BasicTransposeBindings, DummyBinding +from Deeploy.Targets.Generic.Layers import AddLayer, ConvLayer, DebugPrintLayer, GatherLayer, GEMMLayer, \ + IntegerDivLayer, ITAMaxLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, ReduceMeanLayer, ReduceSumLayer, \ + RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, TransposeLayer, iGELULayer, \ + iLayerNormLayer, iSoftmaxLayer +from Deeploy.Targets.Generic.Parsers import AddParser, DebugParser, DummyParser, FlattenParser, GatherParser, \ + GenericConv1DParser, GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, \ + GenericMaxPool2DParser, IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, MatMulParser, MulParser, Pad1DParser, \ + Pad2DParser, ReduceMeanParser, ReduceSumParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, \ + RQSiGELUParser, SliceParser, TransposeParser, UnsqueezeParser, iGELUParser, iLayerNormParser, iSoftmaxParser +from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate +from 
Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ExtractPaddingFromConvPass, \ + ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, iGELURequantMergePass + +AddMapper = NodeMapper(AddParser(), BasicAddBindings) +Conv1DMapper = NodeMapper(GenericConv1DParser(), [BasicConv1DBinding]) +Conv2DMapper = NodeMapper(GenericConv2DParser(), [BasicConv2DBinding]) +DebugMapper = NodeMapper(DebugParser(), BasicDebugPrintBindings) +DWConv1DMapper = NodeMapper(GenericDWConv1DParser(), [BasicDWConv1DBinding]) +DWConv2DMapper = NodeMapper(GenericDWConv2DParser(), [BasicDWConv2DBinding]) +FlattenMapper = NodeMapper(FlattenParser(), BasicReshapeBindings) +GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings) +GELUMapper = NodeMapper(iGELUParser(), [BasicGELUBinding]) +GEMMMapper = NodeMapper(GenericGEMMParser(), [BasicGEMMBinding]) +iLayerNormMapper = NodeMapper(iLayerNormParser(), [BasicLayerNormBinding]) +IntegerDivMapper = NodeMapper(IntegerDivParser(), [BasicIntegerDivBinding]) +ITAMaxMapper = NodeMapper(ITAMaxParser(), [BasicITASoftmaxBinding]) +ITAPartialMaxMapper = NodeMapper(ITAPartialMaxParser(), [BasicITAPartialSoftmaxBinding]) +MatMulMapper = NodeMapper(MatMulParser(), [BasicMatMulBinding]) +MaxPoolMapper = NodeMapper(GenericMaxPool2DParser(), [BasicMaxPool2DBinding]) +MulMapper = NodeMapper(MulParser(), BasicMulBindings) +Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) +Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) +ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings) +ReduceSumMapper = NodeMapper(ReduceSumParser(), BasicReduceSumBindings) +RequantShiftMapper = NodeMapper(RequantShiftParser(), BasicRQSBindings) +ReshapeMapper = NodeMapper(ReshapeParser(), BasicReshapeBindings) +RQGELUMapper = NodeMapper(RQSiGELUParser(), [BasicRQSGELUBinding]) +RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) +SoftmaxMapper = NodeMapper(iSoftmaxParser(), 
[BasicSoftmaxBinding]) +TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings) +UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) + +SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings) + +# Dummy nodes are intended for development purposes only! +# They should always generate compiler errors to not accidentally end up in production code +DummyMapper = NodeMapper(DummyParser(), [DummyBinding]) + +GenericMapping = { + 'Add': AddLayer([AddMapper]), + 'Conv': ConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), + 'DebugPrint': DebugPrintLayer([DebugMapper]), + 'Div': IntegerDivLayer([IntegerDivMapper]), + 'Flatten': ReshapeLayer([FlattenMapper]), + 'Gather': GatherLayer([GatherMapper]), + 'Gemm': GEMMLayer([GEMMMapper]), + 'iGELU': iGELULayer([GELUMapper]), + 'iLayerNorm': iLayerNormLayer([iLayerNormMapper]), + 'IntegerDiv': IntegerDivLayer([IntegerDivMapper]), + 'IntegerMean': ReduceMeanLayer([ReduceMeanMapper]), + 'iSoftmax': iSoftmaxLayer([SoftmaxMapper]), + 'ITAMax': ITAMaxLayer([ITAMaxMapper]), + 'ITAPartialMax': ITAMaxLayer([ITAPartialMaxMapper]), + 'MatMul': GEMMLayer([MatMulMapper]), + 'MatMulInteger': MatMulLayer([MatMulMapper]), + 'MaxPool': MaxPoolLayer([MaxPoolMapper]), + 'Mul': MulLayer([MulMapper]), + 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), + 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), + 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), + 'RequantizediGELU': RQSiGELULayer([RQGELUMapper]), + 'RequantShift': RequantShiftLayer([RequantShiftMapper]), + 'Reshape': ReshapeLayer([ReshapeMapper]), + 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), + 'Transpose': TransposeLayer([TransposeMapper]), + 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), + 'Slice': SliceLayer([SliceMapper]) + # # For example, you can use the DummpyMapper, in case you want to test + # # deployment or optimizations with GlobalAveragePool nodes but did not yet + # # implement the corresponding kernel + # 
'GlobalAveragePool': ConvLayer([DummyMapper]), +} + + +class GenericVariableBuffer(VariableBuffer): + + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class GenericTransientBuffer(TransientBuffer): + + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class GenericConstantBuffer(ConstantBuffer): + + initTemplate = AllocateTemplate.referenceGlobalInitTemplate + allocTemplate = AllocateTemplate.referenceGlobalAllocateTemplate + deallocTemplate = FreeTemplate.referenceGlobalTemplate + + +class GenericStructBuffer(StructBuffer): + + initTemplate = AllocateTemplate.referenceStructInitTemplate + allocTemplate = AllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") + + +GenericOptimizer = TopologyOptimizer([ + iGELURequantMergePass(), + MatMulAddMergePass(), + MergeConstAddAndRequantPass(), + ExtractPaddingFromConvPass(), + ExtractPaddingFromPoolPass(), + # DebugPrintPass(r'.*[Mm]at[Mm]ul.*', position = 'after'), +]) + +includeList = ["DeeployBasicMath.h"] + + +class GenericEngine(DeploymentEngine): + + def __init__(self, name: str, Mapping = GenericMapping, initCode: str = "", includeList = includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) + + +class GenericPlatform(DeploymentPlatform): + + def __init__(self, + engines = [GenericEngine("Generic")], + variableBuffer = GenericVariableBuffer, + constantBuffer = GenericConstantBuffer, + structBuffer = GenericStructBuffer, + transientBuffer = GenericTransientBuffer): + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/Generic/Templates/AddTemplate.py b/Deeploy/Targets/Generic/Templates/AddTemplate.py new file mode 100644 index 0000000..4eec289 --- 
/dev/null +++ b/Deeploy/Targets/Generic/Templates/AddTemplate.py @@ -0,0 +1,62 @@ +# ---------------------------------------------------------------------- +# +# File: AddTemplate.py +# +# Last edited: 15.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _AddTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in_1 = ctxt.lookup(operatorRepresentation['data_in_1']) + data_in_2 = ctxt.lookup(operatorRepresentation['data_in_2']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + input_1_offset = 0 + if hasattr(data_in_1, "_signed") and hasattr(data_in_1, "nLevels"): + input_1_offset = (data_in_1._signed == 0) * int(data_in_1.nLevels / 2) + input_2_offset = 0 + if hasattr(data_in_2, "_signed") and hasattr(data_in_2, "nLevels"): + input_2_offset = (data_in_2._signed == 0) * int(data_in_2.nLevels / 2) + output_offset = 0 + if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + output_offset = -(data_out._signed == 0) * int(data_out.nLevels // 2) + + 
operatorRepresentation['offset'] = input_1_offset + input_2_offset + output_offset + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _AddTemplate(""" +// Add (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + for (uint32_t i=0;i<${size};i++){ + ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i] + ${offset}; + } +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/Generic/Templates/AllocateTemplate.py b/Deeploy/Targets/Generic/Templates/AllocateTemplate.py new file mode 100644 index 0000000..634ad6c --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/AllocateTemplate.py @@ -0,0 +1,43 @@ +# ---------------------------------------------------------------------- +# +# File: AllocateTemplate.py +# +# Last edited: 15.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.DeeployTypes import NodeTemplate + +referenceInitTemplate = NodeTemplate("${type.typeName} ${name};\n") +referenceAllocateTemplate = NodeTemplate( + "${name} = (${type.typeName}) deeploy_malloc(${type.referencedType.typeWidth//8} * ${size});\n") + +referenceGlobalInitTemplate = NodeTemplate("static ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n") +#referenceGlobalInitTemplate = NodeTemplate("static const ${type} ${name}[${size}];\n") +referenceGlobalAllocateTemplate = NodeTemplate("") + +referenceStructInitTemplate = NodeTemplate(""" +static ${type.typeName} ${name}; +""") +#static const ${type}* ${name} = &${name}_UL; + +referenceStructAllocateTemplate = NodeTemplate(""" + ${name} = (${structDict.typeName}) ${str(structDict)}; +""") diff --git a/Deeploy/Targets/Generic/Templates/ClosureTemplate.py b/Deeploy/Targets/Generic/Templates/ClosureTemplate.py new file mode 100644 index 0000000..4398f63 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ClosureTemplate.py @@ -0,0 +1,43 @@ +# ---------------------------------------------------------------------- +# +# File: ClosureTemplate.py +# +# Last edited: 15.03.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.DeeployTypes import NodeTemplate, OperatorRepresentation + + +class ClosureTemplate(NodeTemplate, OperatorRepresentation): + + def __init__(self, templateStr): + super().__init__(templateStr) + + +referenceTemplate = ClosureTemplate(""" +void ${nodeName}_closure(void* {nodeName}_args){ +${nodeName}_args_t* args = (${nodeName}_args_t*) {nodeName}_args; +% for argName, argType in closureStructArgs.items(): +${argType.typeName} ${argName} = args->${argName}; +% endfor +${functionCall} +} +""") diff --git a/Deeploy/Targets/Generic/Templates/ConcatTemplate.py b/Deeploy/Targets/Generic/Templates/ConcatTemplate.py new file mode 100644 index 0000000..e233e93 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ConcatTemplate.py @@ -0,0 +1,79 @@ +# ---------------------------------------------------------------------- +# +# File: ConcatTemplate.py +# +# Last edited: 19.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

from typing import Dict, List, Tuple

import numpy as np

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _ConcatTemplate(NodeTemplate):
    """Template for a two-input Concat realized as interleaved memcpy blocks."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # BUGFIX (annotation only): the method returns a 3-tuple
        # (ctxt, operatorRepresentation, []) but was annotated as a 2-tuple;
        # aligned with the other templates in this file.

        dataIn1 = ctxt.lookup(operatorRepresentation['data_in_1'])
        dataIn2 = ctxt.lookup(operatorRepresentation['data_in_2'])

        assert "data_in_3" not in operatorRepresentation.keys(), "Concat with more than two inputs not implemented!"

        dataIn1Shape = dataIn1.shape
        dataIn2Shape = dataIn2.shape

        # Bytes to copy per iteration: product of the trailing dims (from the
        # concat axis onward) times the element width in bytes.
        axis = operatorRepresentation['axis']
        in1TransferLength = np.prod(dataIn1Shape[axis:]) * (dataIn1._type.referencedType.typeWidth // 8)
        in2TransferLength = np.prod(dataIn2Shape[axis:]) * (dataIn2._type.referencedType.typeWidth // 8)

        # Number of interleaved copy iterations: product of the leading dims,
        # which must agree between the two inputs for Concat to be valid.
        iterations1 = np.prod(dataIn1Shape[:axis])
        iterations2 = np.prod(dataIn2Shape[:axis])

        assert iterations1 == iterations2, f"iterations1 {iterations1} is not iterations2 {iterations2}; concat can't be applied!"
+ + operatorRepresentation['iterations'] = iterations1 + operatorRepresentation['in1TransferLength'] = in1TransferLength + operatorRepresentation['in2TransferLength'] = in2TransferLength + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _ConcatTemplate(""" + +char* ${data_in_1}_tf = (char*) ${data_in_1}; +char* ${data_in_2}_tf = (char*) ${data_in_2}; +char* ${data_out}_tf = (char*) ${data_out}; + +for (int i=0; i<${iterations}; i++){ +memcpy(${data_out}_tf, ${data_in_1}_tf, ${in1TransferLength}); +${data_out}_tf += ${in1TransferLength}; +${data_in_1}_tf += ${in1TransferLength}; +memcpy(${data_out}_tf, ${data_in_2}_tf, ${in2TransferLength}); +${data_out}_tf += ${in2TransferLength}; +${data_in_2}_tf += ${in2TransferLength}; +} +""") diff --git a/Deeploy/Targets/Generic/Templates/ConvTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTemplate.py new file mode 100644 index 0000000..c65f7ee --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ConvTemplate.py @@ -0,0 +1,98 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTemplate.py +# +# Last edited: 04.01.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _Conv2D_Template(NodeTemplate):
    """Binding for the generic 1D/2D convolution kernels (NCHW layout)."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Inject ``input_offset`` / ``output_offset`` into the representation.

        Offsets are non-zero only for buffers that carry ``_signed`` and
        ``nLevels`` quantization metadata: an unsigned input is shifted up by
        nLevels/2, an unsigned output shifted down by nLevels/2.
        Returns the (unchanged) context, the updated representation, and an
        empty list of hoisted buffer names.
        """

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels // 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2)

        return ctxt, operatorRepresentation, []


# 1D convolution: reuses the 2D kernel with the x-dimension pinned to 1.
# The batch loop advances raw pointers by one image worth of elements.
reference1DTemplate = _Conv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_y
%>

// 1D Conv (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for (uint32_t n=0; n<${batch}; ++n) {
        Conv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW(
            ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y},
            ${weight}, ${ch_im_out}, 1, ${dim_kernel_y},
            1, ${stride_y},
            ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}
        );
        ref_${data_out}_${data_in} += ${batchOffsetIn};
        ref_${data_out}_${data_out} += ${batchOffsetOut};
    }
END_SINGLE_CORE
""")

# 2D convolution over a batch of NCHW images.
reference2DTemplate = _Conv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y
%>

// 2D Conv (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for (uint32_t n=0; n<${batch}; ++n) {
        Conv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW(
            ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},
            ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y},
            ${stride_x}, ${stride_y},
            ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}
        );
        ref_${data_out}_${data_in} += ${batchOffsetIn};
        ref_${data_out}_${data_out} += ${batchOffsetOut};
    }
END_SINGLE_CORE
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _DWConv2D_Template(NodeTemplate):
    """Binding for the generic 1D/2D depth-wise convolution kernels (NCHW)."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Inject ``input_offset`` / ``output_offset`` into the representation.

        Same convention as the regular convolution template: non-zero offsets
        only for buffers with ``_signed``/``nLevels`` quantization metadata;
        unsigned inputs shifted up by nLevels/2, unsigned outputs shifted down.
        """

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels // 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2)

        return ctxt, operatorRepresentation, []


# 1D depth-wise convolution: 2D kernel with the x-dimension pinned to 1.
reference1DTemplate = _DWConv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_y
%>

// 1D Depth-Wise Conv (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for (uint32_t n=0; n<${batch}; ++n) {
        DWConv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW(
            ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y},
            ${weight}, 1, ${dim_kernel_y},
            1, ${stride_y},
            ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}
        );
        ref_${data_out}_${data_in} += ${batchOffsetIn};
        ref_${data_out}_${data_out} += ${batchOffsetOut};
    }
END_SINGLE_CORE
""")

# 2D depth-wise convolution over a batch of NCHW images.
reference2DTemplate = _DWConv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y
%>

// 2D Depth-Wise Conv (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for (uint32_t n=0; n<${batch}; ++n) {
        DWConv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW(
            ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},
            ${weight}, ${dim_kernel_x}, ${dim_kernel_y},
            ${stride_x}, ${stride_y},
            ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}
        );
        ref_${data_out}_${data_in} += ${batchOffsetIn};
        ref_${data_out}_${data_out} += ${batchOffsetOut};
    }
END_SINGLE_CORE
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _DebugPrintTemplate(NodeTemplate):
    """Pass-through node that logs a tensor's contents at runtime."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        # Propagate the input's type onto the output buffer in place — this
        # mutates the context's buffer object, not just the representation.
        data_out._type = data_in._type

        # NOTE(review): unlike the sibling templates, `_signed`/`nLevels` are
        # accessed without hasattr guards — presumably DebugPrint is only ever
        # bound to quantized buffers carrying this metadata; confirm.
        operatorRepresentation['data_in_signed'] = data_in._signed
        operatorRepresentation['offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2)

        # If this node writes a network output, reference the mangled global
        # outputs array slot; otherwise reference the mangled buffer name.
        operatorRepresentation['output_name'] = ctxt._mangle("outputs") + "[0]" if ctxt.outputs(
        )[0].name == data_out.name else ctxt._mangle(data_out.name)

        return ctxt, operatorRepresentation, []


referenceTemplate = _DebugPrintTemplate("""
<%
tensor_type = "Input" if "input" in nodeName else "Output"
tensor_name = nodeName.replace("_input", "").replace("_output", "")
%>

// Debug Print (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${data_out} = ${data_in};
% if output_name != data_out:
    ${output_name} = ${data_out};
%endif
    deeploy_log("[DEBUG] ${tensor_type} ${tensor_name} (Buffer ${data_in}, Signed: ${data_in_signed}):\\r\\n");

    %if channels_first:
    %if data_in_signed:
    PrintMatrix_s${data_in_type.referencedType.typeWidth}_NCHW(${data_in}, ${batch}, ${dim_im_in_ch}, ${dim_im_in_x}, ${dim_im_in_y}, ${offset});
    %else:
    PrintMatrix_u${data_in_type.referencedType.typeWidth}_NCHW((uint${data_in_type.referencedType.typeWidth}_t *) ${data_in}, ${batch}, ${dim_im_in_ch}, ${dim_im_in_x}, ${dim_im_in_y}, ${offset});
    %endif
    %else:
    %if data_in_signed:
    PrintMatrix_s${data_in_type.referencedType.typeWidth}_NHWC(${data_in}, ${batch}, ${dim_im_in_ch}, ${dim_im_in_x}, ${dim_im_in_y}, ${offset});
    %else:
    PrintMatrix_u${data_in_type.referencedType.typeWidth}_NHWC((uint${data_in_type.referencedType.typeWidth}_t *) ${data_in}, ${batch}, ${dim_im_in_ch}, ${dim_im_in_x}, ${dim_im_in_y}, ${offset});
    %endif
    %endif
END_SINGLE_CORE
""")
from Deeploy.DeeployTypes import NodeTemplate

# Placeholder template: emits a call to the DummyOP kernel.
# NOTE(review): the `void` before the call site reads like a declaration, not
# an invocation — confirm this renders the intended C.
referenceTemplate = NodeTemplate("""
// Dummy (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE void DummyOP(${data_in}, ${data_out}, ${size});
""")
from Deeploy.DeeployTypes import NodeTemplate

# Releases a buffer via the deployment runtime's deeploy_free.
referenceLocalTemplate = NodeTemplate("""
SINGLE_CORE deeploy_free(${name});
""")

# NOTE(review): currently identical to the local variant; kept as a separate
# symbol so local and global deallocation can diverge per target.
referenceGlobalTemplate = NodeTemplate("""
SINGLE_CORE deeploy_free(${name});
""")
from Deeploy.DeeployTypes import NodeTemplate

# Gather: for each index, copy one contiguous slice of `offset` bytes from the
# input to consecutive slots of the output.
#
# Fix: the destination pointer never advanced, so every iteration overwrote
# the start of ${data_out} and only the last index survived; the destination
# now steps forward by one slice per iteration.
# NOTE(review): `${offset}` is used both for pointer arithmetic on the typed
# input pointer and as the memcpy byte count — these coincide only for 8-bit
# element types; confirm for wider types.
referenceTemplate = NodeTemplate("""
// Gather (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
for (uint32_t i=0; i<${numIndices}; ++i) {
    memcpy(${data_out} + i * ${offset}, ${data_in} + ${indices}[i] * ${offset}, ${offset});
}
END_SINGLE_CORE
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _GemmTemplate(NodeTemplate):
    """Binding for the batched GEMM kernel (Y = alpha*A*B + beta*C)."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Compute zero-point offsets for all four operands.

        Unsigned quantized buffers (carrying ``_signed``/``nLevels``) get a
        half-range offset: inputs A/B shifted up, C and the output Y shifted
        down. Buffers without the metadata keep an offset of zero.
        """

        A = ctxt.lookup(operatorRepresentation['A'])
        B = ctxt.lookup(operatorRepresentation['B'])
        C = ctxt.lookup(operatorRepresentation['C'])
        Y = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['A_offset'] = 0
        operatorRepresentation['B_offset'] = 0
        operatorRepresentation['C_offset'] = 0
        operatorRepresentation['Y_offset'] = 0

        if hasattr(A, "_signed") and hasattr(A, "nLevels"):
            operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2)
        if hasattr(B, "_signed") and hasattr(B, "nLevels"):
            operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2)
        if hasattr(C, "_signed") and hasattr(C, "nLevels"):
            operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2)
        if hasattr(Y, "_signed") and hasattr(Y, "nLevels"):
            operatorRepresentation['Y_offset'] = -(Y._signed == 0) * int(Y.nLevels / 2)

        return ctxt, operatorRepresentation, []


# Batched GEMM: per-batch pointers advance by one matrix worth of elements.
referenceTemplate = _GemmTemplate("""
// GEMM (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${A_type.typeName} ref_${data_out}_${A} = ${A};
    ${B_type.typeName} ref_${data_out}_${B} = ${B};
    ${C_type.typeName} ref_${data_out}_${C} = ${C};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for(uint32_t i=0;i<${batch};i++){
        Gemm_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${C_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(
            ref_${data_out}_${A},
            ref_${data_out}_${B},
            ref_${data_out}_${C},
            ref_${data_out}_${data_out},
            ${M},
            ${N},
            ${O},
            ${alpha},
            ${beta},
            ${transA},
            ${transB},
            ${A_offset},
            ${B_offset},
            ${C_offset},
            ${Y_offset}
        );

        ref_${data_out}_${A} += ${M} * ${N};
        ref_${data_out}_${B} += ${N} * ${O};
        ref_${data_out}_${C} += ${M} * ${O};
        ref_${data_out}_${data_out} += ${M} * ${O};
    }
END_SINGLE_CORE
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _ITAMaxTemplate(NodeTemplate):
    """Binding for the ITA softmax (ITAMax) kernel with a hoisted scratch row."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # No alignment needed. Fixed the return annotation, which declared a
        # 2-tuple while the method (like all sibling templates) returns a
        # 3-tuple including the hoisted-buffer name list.
        return ctxt, operatorRepresentation, []

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist one scratch buffer sized to the last dimension of the input.

        NOTE(review): `size` is an element count of the last dimension; if
        `hoistTransientBuffer` expects bytes this assumes an 8-bit element
        type — confirm.
        """
        size = operatorRepresentation['lastDimLength']
        # Plain concatenation; the original used a pointless f-string with no
        # placeholders.
        name = operatorRepresentation['nodeName'] + "_buffer"
        ctxt.hoistTransientBuffer(name, size)
        operatorRepresentation['ctxtBuffer'] = ctxt._mangle(name)
        operatorRepresentation['ctxtBufferSize'] = size

        return ctxt, operatorRepresentation, [name]


referenceTemplate = _ITAMaxTemplate("""
// ITAMax (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE ITAMax_s${data_in_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${ctxtBuffer}, ${size}, ${lastDimLength}, ${n_levels});
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _ITAPartialMaxTemplate(NodeTemplate):
    """Binding for the ITA partial (grouped) softmax kernel."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # Fixed the return annotation: it declared a 2-tuple while the method
        # returns a 3-tuple, matching every sibling template.
        # NOTE(review): this is the only template that copies the context
        # before returning it unchanged — presumably defensive; confirm it is
        # intentional, siblings return `ctxt` directly.
        ctxt = ctxt.copy()
        return ctxt, operatorRepresentation, []


referenceTemplate = _ITAPartialMaxTemplate("""
// ITAPartialMax (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE ITAPartialMax_s${data_in_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength}, ${group_width}, ${n_levels});
""")
from Deeploy.DeeployTypes import NodeTemplate

# Renders a call to the generic integer-division kernel; strides (nomStep /
# denomStep) allow broadcasting the numerator/denominator buffers.
referenceTemplate = NodeTemplate("""
// Integer Division (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE Div_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}(${A}, ${B}, ${sizeA}, ${sizeB}, ${nomStep}, ${denomStep}, ${C}, ${Delta}, ${eps}, ${eta});
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _MatMulTemplate(NodeTemplate):
    """Binding for the batched matrix-multiplication kernel."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Compute zero-point offsets for A, B and the output.

        Unsigned quantized buffers (with ``_signed``/``nLevels`` metadata) get
        a half-range offset: inputs shifted up, output shifted down; buffers
        without metadata keep zero.
        """

        A = ctxt.lookup(operatorRepresentation['A'])
        B = ctxt.lookup(operatorRepresentation['B'])
        C = ctxt.lookup(operatorRepresentation['data_out'])
        operatorRepresentation['A_offset'] = 0
        operatorRepresentation['B_offset'] = 0
        operatorRepresentation['C_offset'] = 0
        if hasattr(A, "_signed") and hasattr(A, "nLevels"):
            operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2)
        if hasattr(B, "_signed") and hasattr(B, "nLevels"):
            operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2)
        if hasattr(C, "_signed") and hasattr(C, "nLevels"):
            operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2)

        return ctxt, operatorRepresentation, []


# Batched MatMul: per-batch pointers advance by one matrix worth of elements.
referenceTemplate = _MatMulTemplate("""
// MatMul (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${A_type.typeName} ref_${data_out}_${A} = ${A};
    ${B_type.typeName} ref_${data_out}_${B} = ${B};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for(uint32_t i=0;i<${batch};i++){
        MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(
            ref_${data_out}_${A},
            ref_${data_out}_${B},
            ref_${data_out}_${data_out},
            ${M},
            ${N},
            ${O},
            ${A_offset}, ${B_offset}, ${C_offset}
        );

        ref_${data_out}_${A} += ${M} * ${N};
        ref_${data_out}_${B} += ${N} * ${O};
        ref_${data_out}_${data_out} += ${M} * ${O};
    }
END_SINGLE_CORE
""")
0000000..9da0029 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py @@ -0,0 +1,74 @@ +# ---------------------------------------------------------------------- +# +# File: MaxPoolTemplate.py +# +# Last edited: 04.01.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _MaxPool2DTemplate(NodeTemplate):
    """Binding for the 2D max-pooling kernel (NCHW layout)."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Inject ``input_offset`` / ``output_offset`` into the representation.

        Same convention as the convolution templates: non-zero offsets only
        for buffers with ``_signed``/``nLevels`` quantization metadata.
        (Removed a leftover commented-out IPython debugging hook.)
        """

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels // 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2)

        return ctxt, operatorRepresentation, []


referenceTemplate = _MaxPool2DTemplate("""
<%
batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y
%>

// 2D MaxPool (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for (uint32_t n=0; n<${batch}; ++n) {
        MaxPool2d_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW(
            ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},
            ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y},
            ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}
        );
        ref_${data_out}_${data_in} += ${batchOffsetIn};
        ref_${data_out}_${data_out} += ${batchOffsetOut};
    }
END_SINGLE_CORE
""")
100644 index 0000000..f96b549 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/MulTemplate.py @@ -0,0 +1,62 @@ +# ---------------------------------------------------------------------- +# +# File: MulTemplate.py +# +# Last edited: 02.09.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _MulTemplate(NodeTemplate):
    """Node template for an elementwise Mul kernel.

    ``alignToContext`` derives zero-point offsets for both inputs (A, B) and
    the output (C) from each buffer's signedness metadata, when present.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Populate A/B/C offsets; unsigned buffers get a half-range shift."""

        def halfRange(buf) -> int:
            # Buffers without signedness/levels metadata contribute no offset.
            if hasattr(buf, "_signed") and hasattr(buf, "nLevels"):
                return (buf._signed == 0) * int(buf.nLevels / 2)
            return 0

        bufA = ctxt.lookup(operatorRepresentation['A'])
        bufB = ctxt.lookup(operatorRepresentation['B'])
        bufC = ctxt.lookup(operatorRepresentation['C'])

        operatorRepresentation['A_offset'] = halfRange(bufA)
        operatorRepresentation['B_offset'] = halfRange(bufB)
        # The output offset is applied with the opposite sign.
        operatorRepresentation['C_offset'] = -halfRange(bufC)

        return ctxt, operatorRepresentation, []


referenceTemplate = _MulTemplate("""
// Mul (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    for (uint32_t i=0;i<${size};i++){
        ${C}[i] = ((${A}[i] + ${A_offset}) * (${B}[i] + ${B_offset}) + ${C_offset});
    }
END_SINGLE_CORE
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _PadTemplate(NodeTemplate):
    """Shared base for the 1D and 2D Pad templates.

    The 1D and 2D variants previously duplicated an identical
    ``alignToContext``; it is factored out here so the padding-value
    adjustment exists in exactly one place.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # Align padding value to input signedness: unsigned inputs are handled
        # in the signed range, so the constant pad value is shifted down by
        # half the input's level count.
        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels") and not data_in._signed:
            operatorRepresentation['value'] = operatorRepresentation['value'] - int(data_in.nLevels / 2)

        return ctxt, operatorRepresentation, []


class _Pad2DTemplate(_PadTemplate):
    # Behavior fully inherited; kept as a distinct class for template identity.
    pass


class _Pad1DTemplate(_PadTemplate):
    # Behavior fully inherited; kept as a distinct class for template identity.
    pass


reference2DTemplate = _Pad2DTemplate("""
<%
    y_offset_out = dim_im_out_ch*(pad_y*dim_im_out_y)
    x_offset_out = dim_im_out_ch*(pad_x)
    width = dim_im_in_ch*dim_im_in_y

    addoffsetOut = dim_im_out_ch * dim_im_out_y
    addoffsetIn = dim_im_in_ch * dim_im_in_y

    startPosX = y_offset_out + x_offset_out

    batchOffsetOut = dim_im_out_ch * dim_im_out_x * dim_im_out_y
%>

// 2D Pad (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    memset(${data_out}, ${value}, ${data_out_size}*sizeof(${data_out_type.referencedType.typeName}));
    uint32_t xoffset_${data_out}_${data_in};
    uint32_t offset_in_${data_out}_${data_in} = 0;

    % if channels_first:
    // NCHW Layout
    for(uint32_t n=0; n<${batch}; n++){
        xoffset_${data_out}_${data_in} = n*${batchOffsetOut} + ${pad_y}*${dim_im_out_y}+${pad_x};
        for (uint32_t c=0; c<${dim_im_in_ch}; ++c) {
            for(uint32_t h=0; h<${dim_im_in_x}; h++){
                memcpy(${data_out} + xoffset_${data_out}_${data_in}, ${data_in}+offset_in_${data_out}_${data_in}, ${dim_im_in_y}*sizeof(${data_out_type.referencedType.typeName}));
                xoffset_${data_out}_${data_in} += ${dim_im_out_y};
                offset_in_${data_out}_${data_in} += ${dim_im_in_y};
            }
            xoffset_${data_out}_${data_in} += 2*${pad_y}*${dim_im_out_y};
        }
    }
    % else:
    // NHWC Layout
    for(uint32_t n=0; n<${batch}; n++){
        xoffset_${data_out}_${data_in} = n*${batchOffsetOut} + ${startPosX};
        for(uint32_t h=0; h<${dim_im_in_x}; h++){
            memcpy(${data_out}+xoffset_${data_out}_${data_in}, ${data_in}+offset_in_${data_out}_${data_in}, ${width}*sizeof(${data_out_type.referencedType.typeName}));
            xoffset_${data_out}_${data_in} += ${addoffsetOut};
            offset_in_${data_out}_${data_in} += ${addoffsetIn};
        }
    }
    %endif
END_SINGLE_CORE
""")


reference1DTemplate = _Pad1DTemplate("""
<%
    x_offset_out = dim_im_out_ch*(pad_y)
    width = dim_im_in_ch*dim_im_in_y

    startPosX = x_offset_out

    batchOffsetOut = dim_im_out_ch * dim_im_out_y
%>

// 1D Pad (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    memset(${data_out}, ${value}, ${data_out_size}*sizeof(${data_out_type.referencedType.typeName}));
    uint32_t xoffset_${data_out}_${data_in};
    uint32_t offset_in_${data_out}_${data_in} = 0;

    % if channels_first:
    // NCHW Layout
    for(uint32_t n=0; n<${batch}; n++){
        xoffset_${data_out}_${data_in} = n*${batchOffsetOut} +${pad_y};
        for (uint32_t c=0; c<${dim_im_in_ch}; ++c) {
            memcpy(${data_out} + xoffset_${data_out}_${data_in}, ${data_in}+offset_in_${data_out}_${data_in}, ${dim_im_in_y}*sizeof(${data_out_type.referencedType.typeName}));
            xoffset_${data_out}_${data_in} += ${dim_im_out_y};
            offset_in_${data_out}_${data_in} += ${dim_im_in_y};
        }
    }
    % else:
    // NHWC Layout
    for(uint32_t n=0; n<${batch}; n++){
        xoffset_${data_out}_${data_in} = n*${batchOffsetOut} + ${startPosX};
        memcpy(${data_out}+xoffset_${data_out}_${data_in}, ${data_in}+offset_in_${data_out}_${data_in}, ${width}*sizeof(${data_out_type.referencedType.typeName}));
        offset_in_${data_out}_${data_in} += ${width};
    }
    %endif
END_SINGLE_CORE
""")
from Deeploy.DeeployTypes import NodeTemplate

# Requantized integer division: dispatches to the RQDiv kernel specialized on
# the input (A) and output (C) element widths.
# NOTE(review): requant_mul/requant_add/requant_div are dereferenced
# (*${...}), which suggests scalar parameters stored in single-element
# buffers — confirm against the RQDiv kernel signature.
referenceTemplate = NodeTemplate("""
// RQIntegerDiv (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE RQDiv_s${A_type.referencedType.typeWidth}_s${C_type.referencedType.typeWidth}(${A}, ${B}, ${sizeA}, ${sizeB}, ${nomStep}, ${denomStep}, ${C}, ${Delta}, ${eps}, ${eta}, *${requant_mul}, *${requant_add}, *${requant_div});
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _RQSiGELUTemplate(NodeTemplate):
    """Node template for a requantized integer GELU kernel."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Derive input/output zero-point offsets from buffer signedness."""

        def halfRange(buf) -> int:
            # Zero when no signedness/levels metadata is attached to the buffer.
            if hasattr(buf, "_signed") and hasattr(buf, "nLevels"):
                return (buf._signed == 0) * int(buf.nLevels / 2)
            return 0

        inBuf = ctxt.lookup(operatorRepresentation['data_in'])
        outBuf = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = halfRange(inBuf)
        operatorRepresentation['output_offset'] = -halfRange(outBuf)

        return ctxt, operatorRepresentation, []


# NOTE(review): the kernel suffix uses the *input* width for both type
# parameters (s${data_in...}_s${data_in...}); the sibling iGELU template uses
# the output width for the second — confirm this is intentional.
referenceTemplate = _RQSiGELUTemplate("""
// RequantizediGELU (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE RQGELU_s${data_in_type.referencedType.typeWidth}_s${data_in_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${b}, ${one}, ${input_offset}, ${output_offset}, ${mul}, ${add}, ${shift});
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _RQSiHardswishTemplate(NodeTemplate):
    """Node template for a requantized integer Hardswish kernel."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Derive input/output zero-point offsets from buffer signedness.

        Fixed: accesses to ``_signed``/``nLevels`` are now guarded with
        ``hasattr`` (matching every sibling template in this directory), so
        buffers without quantization metadata default to a zero offset instead
        of raising ``AttributeError``.
        """
        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2)

        # NOTE(review): unlike the sibling templates, the output offset here is
        # NOT negated; preserved as-is — confirm against the RQiHardswish kernel.
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = (data_out._signed == 0) * int(data_out.nLevels / 2)

        return ctxt, operatorRepresentation, []


referenceTemplate = _RQSiHardswishTemplate("""
// RequantizediHardswish (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE RQiHardswish_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${one_over_six}, ${three}, ${six}, ${input_offset}, ${output_offset}, ${mul}, ${add}, ${shift});
""")
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _ReduceMeanTemplate(NodeTemplate):
    """Node template for ReduceMean over an arbitrary set of axes."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Derive input/output zero-point offsets from buffer signedness.

        Fixed: the output offset is now derived from the *output* buffer's
        ``nLevels`` (was copy-pasted from the input buffer).
        """
        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            # Bug fix: was data_in.nLevels.
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels / 2)

        return ctxt, operatorRepresentation, []


# Template-level fixes relative to the original:
# * `shift` is initialized to None so the `% if shift is not None:` guard does
#   not raise a NameError when reduceLength is not a power of two.
# * The keepdims branch now declares and computes ${data_out}_sgn before using
#   it (previously it referenced an undeclared C variable).
referenceTemplate = _ReduceMeanTemplate("""
// ReduceMean (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
int32_t ${data_out}_accumulator = 0;
<%
reduceLength = 1
for i, axis in enumerate(axes):
    if axis < 0:
        axes[i] += len(data_in_shape)
    reduceLength = reduceLength * data_in_shape[axis]
%>
<%
shapeStr = ''
accessStr = ''
%>
% for idx, i in enumerate(data_in_shape[1:]):
<%
shapeStr += '['+str(i)+']'
%>
% endfor
% for j in range(len(data_in_shape)):
<%
accessStr += '[i_'+str(j)+']'
%>
% endfor
${data_out_type.typeName} dummy_${data_out} = ${data_out};

<%
restDims = set(list(range(len(data_in_shape)))).difference(set(axes))
%>
% for i in list(restDims):
for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){
% endfor
${data_out}_accumulator = ${input_offset}*${reduceLength};
% for i in list(axes):
for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){
% endfor
${data_out}_accumulator += ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr};

% for i in range(len(axes)):
}
% endfor
% if keepdims:
int8_t ${data_out}_sgn = 0;
${data_out}_sgn = -(${data_out}_accumulator<0) + (${data_out}_accumulator >= 0);
*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) ((${data_out}_accumulator + ${data_out}_sgn*(${reduceLength}>>1)) / ${reduceLength} + ${output_offset});
% else:
<%
import numpy as np

shift = None
if (np.log2(reduceLength) - int(np.log2(reduceLength))) == 0:
    shift = int(np.log2(reduceLength))
%>
% if shift is not None:
*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) (((${data_out}_accumulator + (1<<(${shift}-1))) >> ${shift}) + ${output_offset});
% else:
int8_t ${data_out}_sgn = 0;
${data_out}_sgn = -(${data_out}_accumulator<0) + (${data_out}_accumulator >= 0);
*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) ((${data_out}_accumulator + ${data_out}_sgn*(${reduceLength}>>1)) / ${reduceLength} + ${output_offset});
% endif
% endif
% for i in range(len(restDims)):
}
% endfor
END_SINGLE_CORE
""")
b/Deeploy/Targets/Generic/Templates/ReduceSumTemplate.py new file mode 100644 index 0000000..952efb4 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ReduceSumTemplate.py @@ -0,0 +1,94 @@ +# ---------------------------------------------------------------------- +# +# File: ReduceSumTemplate.py +# +# Last edited: 27.03.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _ReduceSumTemplate(NodeTemplate):
    """Node template for ReduceSum over an arbitrary set of axes."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Derive input/output zero-point offsets from buffer signedness.

        Fixes relative to the original:
        * return annotation matches the 3-tuple actually returned;
        * offset computation is guarded with ``hasattr`` (consistent with the
          sibling templates) instead of raising on buffers without metadata;
        * the output offset is derived from the *output* buffer's ``nLevels``
          (was copy-pasted from the input buffer).
        """
        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            # Bug fix: was data_in.nLevels.
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels / 2)

        return ctxt, operatorRepresentation, []


referenceTemplate = _ReduceSumTemplate("""
// ReduceSum (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
int32_t ${data_out}_accumulator = 0;
<%
reduceLength = 1
for i, axis in enumerate(axes):
    if axis < 0:
        axes[i] += len(data_in_shape)
    # Bug fix: index with the reduced axis, not the enumeration index
    # (matches the ReduceMean template).
    reduceLength = reduceLength * data_in_shape[axis]
%>
<%
shapeStr = ''
accessStr = ''
%>
% for idx, i in enumerate(data_in_shape[1:]):
<%
shapeStr += '['+str(i)+']'
%>
% endfor
% for j in range(len(data_in_shape)):
<%
accessStr += '[i_'+str(j)+']'
%>
% endfor
${data_out_type.typeName} dummy_${data_out} = ${data_out};

<%
restDims = set(list(range(len(data_in_shape)))).difference(set(axes))
%>
% for i in list(restDims):
for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){
% endfor
${data_out}_accumulator = ${input_offset}*${reduceLength};
% for i in list(axes):
for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){
% endfor
${data_out}_accumulator += ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr};

% for i in range(len(axes)):
}
% endfor
*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) (${data_out}_accumulator + ${output_offset});
% for i in range(len(restDims)):
}
% endfor
END_SINGLE_CORE
""")
diff --git a/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py b/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py new file mode 100644 index 0000000..9041296 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py @@ -0,0 +1,71 @@ +# ---------------------------------------------------------------------- +# +# File: RequantShiftTemplate.py +# +# Last edited: 14.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _RequantShiftTemplate(NodeTemplate):
    """Node template for RequantShift: multiply, add, shift, and clip to n_levels."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Compute zero-point offsets and the symmetric clipping range."""
        inBuf = ctxt.lookup(operatorRepresentation['data_in'])
        outBuf = ctxt.lookup(operatorRepresentation['data_out'])
        nLevels = operatorRepresentation['n_levels']

        inOffset = 0
        if hasattr(inBuf, "_signed") and hasattr(inBuf, "nLevels"):
            inOffset = (inBuf._signed == 0) * int(inBuf.nLevels / 2)

        outOffset = 0
        if hasattr(outBuf, "_signed") and hasattr(outBuf, "nLevels"):
            # Unsigned outputs are shifted down by half the requantization range.
            outOffset = -(outBuf._signed == 0) * nLevels // 2

        operatorRepresentation['input_offset'] = inOffset
        operatorRepresentation['output_offset'] = outOffset
        # Clipping bounds of the requantized value, symmetric around zero.
        operatorRepresentation['output_min'] = -(nLevels // 2)
        operatorRepresentation['output_max'] = (nLevels // 2) - 1

        return ctxt, operatorRepresentation, []


referenceTemplate = _RequantShiftTemplate("""
<%
if isinstance(log2D, int):
    log2Dstring = log2D
else:
    log2Dstring = "*"+log2D
%>

// RequantShift (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    % if channels_first:
    RequantShift_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW(${data_in}, ${size}, ${mul}, ${add}, ${data_out}, ${log2Dstring}, ${channel_width}, ${input_offset}, ${output_offset}, ${output_min}, ${output_max}, 1);
    % else:
    RequantShift_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NHWC(${data_in}, ${size}, ${mul}, ${add}, ${data_out}, ${log2Dstring}, ${channels}, ${input_offset}, ${output_offset}, ${output_min}, ${output_max}, 1);
    %endif
END_SINGLE_CORE
""")
a/Deeploy/Targets/Generic/Templates/ReshapeTemplate.py b/Deeploy/Targets/Generic/Templates/ReshapeTemplate.py new file mode 100644 index 0000000..f577e4e --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ReshapeTemplate.py @@ -0,0 +1,54 @@ +# ---------------------------------------------------------------------- +# +# File: ReshapeTemplate.py +# +# Last edited: 16.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _ReshapeTemplate(NodeTemplate):
    """Node template for Reshape-style ops: a pure pointer alias, no data copy."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Retire the shape input and alias the output buffer to the input."""
        # SCHEREMO: Selectively mark 'indices' dead, since we don't need them
        if 'indices' in operatorRepresentation:
            indicesBuffer = ctxt.globalObjects[operatorRepresentation['indices']]
            indicesBuffer._deploy = False
            indicesBuffer._live = False

        # The output points at the same memory as the input; record the alias
        # on the output buffer.
        inBuffer = ctxt.lookup(operatorRepresentation['data_in'])
        ctxt.lookup(operatorRepresentation['data_out'])._alias = inBuffer.name

        return ctxt, operatorRepresentation, []


referenceTemplate = _ReshapeTemplate("""
// Reshape (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE ${data_out} = ${data_in};
""")
from Deeploy.DeeployTypes import NodeTemplate

# Skip/no-op node: the output is aliased to the input pointer, so no data is
# moved at runtime.
referenceTemplate = NodeTemplate("""
SINGLE_CORE ${data_out} = ${data_in};
""")
from typing import Dict, List, Tuple

import numpy as np

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _SliceTemplate(NodeTemplate):
    """Node template for Slice; resolves slice parameters at generation time."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Immediate-ify starts/ends/axes/steps and record the input size.

        The slice parameters are compile-time constants, so their buffers are
        excluded from deployment and their values inlined into the template.
        """
        for paramName in ('starts', 'axes', 'ends', 'steps'):
            paramBuffer = ctxt.lookup(operatorRepresentation[paramName])
            paramBuffer._deploy = False
            operatorRepresentation[paramName] = paramBuffer.values

        operatorRepresentation['data_in_size'] = np.prod(operatorRepresentation['data_in_shape'])

        return ctxt, operatorRepresentation, []


referenceTemplate = _SliceTemplate("""
// Slice (Name: ${nodeName}, Op: ${nodeOp})
<%
dimSteps = []
dimSteps.append(data_in_size//data_in_shape[0])
for dim in data_in_shape[1:]:
    dimSteps.append(dimSteps[-1]//dim)
%>
<%
transferSize = dimSteps[axes[-1]]
%>
<%
if axes[0] > 0:
    preAxes = list(range(axes[0]))
else:
    preAxes = []
%>

${data_out_type.referencedType.typeName}* ref_${data_out} = ${data_out};
% for axis in (list(preAxes) + list(axes)):
uint32_t ${data_out}_offset_${axis} = 0;
% endfor

% for axis, axisLen in zip(preAxes, list(data_in_shape)):
for(uint32_t i_${axis} = 0; i_${axis} < ${axisLen}; i_${axis}++){
% if axis == 0:
${data_out}_offset_0 = ${dimSteps[axis]} * i_${axis};
% else:
${data_out}_offset_${axis} = ${data_out}_offset_${axis-1} + ${dimSteps[axis]} * i_${axis};
% endif
% endfor
% for axis, start, end, step in zip(axes, starts, ends, steps):
for(uint32_t i_${axis} = ${start}; i_${axis} < ${end}; i_${axis} += ${step}){
% if axis == 0:
${data_out}_offset_0 = ${dimSteps[axis]} * i_${axis};
% else:
${data_out}_offset_${axis} = ${data_out}_offset_${axis-1} + ${dimSteps[axis]} * i_${axis};
% endif
% endfor
memcpy(ref_${data_out}, ${data_in} + ${data_out}_offset_${axis}, ${transferSize* data_out_type.referencedType.typeWidth//8});
ref_${data_out} += ${transferSize};
% for axis in range(axes[-1]+1):
}
% endfor
""")
from Deeploy.DeeployTypes import NodeTemplate

# Generic N-dimensional transpose: emits one nested C loop per output
# dimension (loop order given by `perm`) and copies elements one at a time
# from the input, which is reinterpreted as a multi-dimensional array via a
# pointer cast.
referenceTemplate = NodeTemplate("""
// Transpose ${data_in_shape} -> ${data_out_shape} (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
${data_out_type.typeName} dummy_${data_out} = ${data_out};
<%
    dimStr = ''
    accessStr = ''
    shapeStr = ''
    for dim in data_in_shape:
        dimStr += '['+str(dim)+']'
%>
% for idx, i in enumerate(perm[:-1]):
<%
    shapeStr += '['+str(data_in_shape[idx+1])+']'
%>
% endfor
% for idx, i in enumerate(perm):
<%
    shape = data_out_shape[idx]
    accessStr += '[i_'+str(idx)+']'
%>
for(uint32_t i_${i} = 0; i_${i}<${shape}; i_${i}++){
% endfor
*dummy_${data_out}++ = ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr};
% for idx, i in enumerate(perm):
}
% endfor
END_SINGLE_CORE
""")
import * diff --git a/Deeploy/Targets/Generic/Templates/iGELUTemplate.py b/Deeploy/Targets/Generic/Templates/iGELUTemplate.py new file mode 100644 index 0000000..0b3e1b8 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/iGELUTemplate.py @@ -0,0 +1,54 @@ +# ---------------------------------------------------------------------- +# +# File: iGELUTemplate.py +# +# Last edited: 13.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _iGELUTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['input_offset'] = 0 + if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + operatorRepresentation['output_offset'] = 0 + if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _iGELUTemplate(""" +// iGELU (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE GELU_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${b}, ${one}, ${input_offset}); +""") diff --git a/Deeploy/Targets/Generic/Templates/iHardswishTemplate.py b/Deeploy/Targets/Generic/Templates/iHardswishTemplate.py new file mode 100644 index 0000000..0dd7f65 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/iHardswishTemplate.py @@ -0,0 +1,48 @@ +# ---------------------------------------------------------------------- +# +# File: iHardswishTemplate.py +# +# Last edited: 22.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _iHardswishTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _iHardswishTemplate(""" +// iHardswish (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE iHardswish_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${one_over_six}, ${three}, ${six}, ${input_offset}); +""") diff --git a/Deeploy/Targets/Generic/Templates/iLayernormTemplate.py b/Deeploy/Targets/Generic/Templates/iLayernormTemplate.py new file mode 100644 index 0000000..75a1a9b --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/iLayernormTemplate.py @@ -0,0 +1,48 @@ +# ---------------------------------------------------------------------- +# +# File: ILayernormTemplate.py +# +# Last edited: 31.12.2021 +# +# Copyright (C) 
2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _iLayerNormTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _iLayerNormTemplate(""" +// iLayernorm (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE Layernorm_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${weight}, ${bias}, ${input_offset}, ${size}, ${lastDimLength}, ${log2D}); +""") diff --git a/Deeploy/Targets/Generic/Templates/iRMSNormTemplate.py b/Deeploy/Targets/Generic/Templates/iRMSNormTemplate.py new file mode 100644 index 0000000..2f8859e --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/iRMSNormTemplate.py @@ -0,0 +1,48 @@ +# ---------------------------------------------------------------------- +# +# File: iRMSNormTemplate.py +# +# 
Last edited: 20.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _iRMSNormTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _iRMSNormTemplate(""" +// iRMSnorm (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE iRMSnorm_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${weight}, ${input_offset}, ${size}, ${lastDimLength}, ${log2D}); +""") diff --git a/Deeploy/Targets/Generic/Templates/iSoftmaxTemplate.py b/Deeploy/Targets/Generic/Templates/iSoftmaxTemplate.py new file mode 100644 index 0000000..be5c7f1 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/iSoftmaxTemplate.py @@ -0,0 +1,54 @@ +# 
---------------------------------------------------------------------- +# +# File: iSoftmaxTemplate.py +# +# Last edited: 30.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _iSoftmaxTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['input_offset'] = 0 + if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + operatorRepresentation['output_offset'] = 0 + if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _iSoftmaxTemplate(""" +// iSoftmax (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE 
Softmax_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength}, ${coeffA}, ${coeffB}, ${coeffC}, ${log2}, ${n_levels}); +""") diff --git a/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py new file mode 100644 index 0000000..0e932a8 --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py @@ -0,0 +1,30 @@ +# ---------------------------------------------------------------------- +# +# File: AddTileConstraint.py +# +# Last edited: 05.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .BOPTileConstraint import BOPTileConstraint + + +class AddTileConstraint(BOPTileConstraint): + pass diff --git a/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py new file mode 100644 index 0000000..d5b77f9 --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py @@ -0,0 +1,100 @@ +# ---------------------------------------------------------------------- +# +# File: BOPTileConstraint.py +# +# Last edited: 05.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme + + +class BOPTileConstraint(TileConstraint): + """Tile constraint class for binary operators, i.e. 
operators that use two input tensors of equal dimensions + """ + + dataIn1Name = 'data_in_1' #: str: Name of the first input tensor as defined by the operator's parser + dataIn2Name = 'data_in_2' #: str: Name of the second input tensor as defined by the operator's parser + dataOutName = 'data_out' #: str: Name of the output tensor as defined by the operator's parser + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + for bufferName in [inputBuffer1Name, inputBuffer2Name, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == inputDim2Var) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + + replacementTypes = {"size": PointerClass(uint16_t)} + + for cube in outputCubes: + newSize = np.prod(cube.dims) + 
replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Generic/TileConstraints/ConcatTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/ConcatTileConstraint.py new file mode 100644 index 0000000..6e546ab --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/ConcatTileConstraint.py @@ -0,0 +1,140 @@ +# ---------------------------------------------------------------------- +# +# File: ConcatTileConstraint.py +# +# Last edited: 19.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme + + +class ConcatTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict['data_in_1'] + inputBuffer2Name = parseDict['data_in_2'] + outputBufferName = parseDict['data_out'] + + for bufferName in [inputBuffer1Name, inputBuffer2Name, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + outputShape = ctxt.lookup(outputBufferName).shape + + axis = parseDict['axis'] + posAxis = axis if axis >= 0 else len(input1Shape) + axis + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + if dim == posAxis: + tilerModel.addConstraint(inputDim1Var + inputDim2Var == outputDimVar) + else: + tilerModel.addConstraint(inputDim1Var == outputDimVar) + tilerModel.addConstraint(inputDim2Var == inputDim1Var) + + for dim in range(posAxis, len(input1Shape), 1): + + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outputDimVar = 
tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == input1Shape[dim]) + tilerModel.addConstraint(outputDimVar == outputShape[dim]) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in_1', 'data_in_2', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"iterations": [], "in1TransferLength": [], "in2TransferLength": []} + replacementTypes = { + "iterations": PointerClass(uint16_t), + "in1TransferLength": PointerClass(uint16_t), + "in2TransferLength": PointerClass(uint16_t) + } + + in1Shape = ctxt.lookup(operatorRepresentation['data_in_1']).shape + in2Shape = ctxt.lookup(operatorRepresentation['data_in_2']).shape + + dataIn1 = ctxt.lookup(operatorRepresentation['data_in_1']) + dataIn2 = ctxt.lookup(operatorRepresentation['data_in_2']) + + axis = operatorRepresentation['axis'] + posAxis = axis if axis >= 0 else len(in1Shape) + axis + + for cube in outputCubes: + newIterations = np.prod(cube.dims[:axis]) + replacements["iterations"].append(newIterations) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + + in1Cube = copy.deepcopy(cube) + in2Cube = copy.deepcopy(cube) + + if posAxis < (len(in1Shape) - 1): + in1Cube.dims = (*in1Cube.dims[:posAxis], in1Shape[posAxis], *in1Cube.dims[posAxis + 1:]) + in2Cube.dims = (*in2Cube.dims[:posAxis], in2Shape[posAxis], *in2Cube.dims[posAxis + 1:]) + + else: + in1Cube.dims = (*in1Cube.dims[:posAxis], in1Shape[posAxis]) + in2Cube.dims = (*in2Cube.dims[:posAxis], in2Shape[posAxis]) + + 
replacements["in1TransferLength"].append( + np.prod(in1Cube.dims[posAxis:]) * (dataIn1._type.referencedType.typeWidth // 8)) + replacements["in2TransferLength"].append( + np.prod(in2Cube.dims[posAxis:]) * (dataIn2._type.referencedType.typeWidth // 8)) + + inputLoadSchedule.append({"data_in_1": in1Cube, "data_in_2": in2Cube}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py new file mode 100644 index 0000000..f2a794f --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py @@ -0,0 +1,32 @@ +# ---------------------------------------------------------------------- +# +# File: MulTileConstraint.py +# +# Last edited: 22.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .BOPTileConstraint import BOPTileConstraint + + +class MulTileConstraint(BOPTileConstraint): + dataIn1Name = "A" + dataIn2Name = "B" + dataOutName = "C" diff --git a/Deeploy/Targets/Generic/TileConstraints/NOPTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/NOPTileConstraint.py new file mode 100644 index 0000000..73293fb --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/NOPTileConstraint.py @@ -0,0 +1,65 @@ +# ---------------------------------------------------------------------- +# +# File: FlattenTileConstraint.py +# +# Last edited: 02.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List + +from Deeploy.DeeployTypes import NetworkContext +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel + + +class NOPTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + pointer: List[str] = [] + + for key, value in parseDict.items(): + if not isinstance(value, str): + continue + + if ctxt.is_global(value) or ctxt.is_local(value): + pointer.append(value) + + #Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + + _buffer = ctxt.lookup(bufferName) + + tilerModel.addTensorDimToModel(ctxt, bufferName) + + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = bufferName, dimIdx = idx) <= shapeDim) + + # Remove unused tensors from deployment + for bufferName in pointer: + if bufferName not in [inputBufferName, outputBufferName]: + ctxt.lookup(bufferName)._deploy = False + + return tilerModel diff --git a/Deeploy/Targets/Generic/TileConstraints/RQSiGELUTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/RQSiGELUTileConstraint.py new file mode 100644 index 0000000..5bea47e --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/RQSiGELUTileConstraint.py @@ -0,0 +1,30 @@ +# ---------------------------------------------------------------------- +# +# File: RQSiGELUTileConstraint.py +# +# Last edited: 22.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .UnaryTileConstraint import UnaryTileConstraint + + +class RQSiGELUTileConstraint(UnaryTileConstraint): + pass diff --git a/Deeploy/Targets/Generic/TileConstraints/RQSiHardswishTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/RQSiHardswishTileConstraint.py new file mode 100644 index 0000000..98e3fd7 --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/RQSiHardswishTileConstraint.py @@ -0,0 +1,30 @@ +# ---------------------------------------------------------------------- +# +# File: RQiHardswishTileConstraint.py +# +# Last edited: 23.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .UnaryTileConstraint import UnaryTileConstraint + + +class RQSiHardswishTileConstraint(UnaryTileConstraint): + pass diff --git a/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py new file mode 100644 index 0000000..f9d53f8 --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py @@ -0,0 +1,107 @@ +# ---------------------------------------------------------------------- +# +# File: TransposeTileConstraint.py +# +# Last edited: 01.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + _invertPermutation, _permuteList +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class TransposeTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # Map output dims to inputs dims + for idx, perm_idx in enumerate(parseDict["perm"]): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = inputBufferName, dimIdx = perm_idx)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + inputInCubes = [] + + 
replacementTypes = {} + replacements: Dict[str, List[int]] = {} + + numDims = len(ctxt.lookup(operatorRepresentation['data_in']).shape) + + for dim in range(numDims): + replacementTypes[f"dimLen_{dim}"] = PointerClass(uint16_t) + replacements[f"dimLen_{dim}"] = [] + + perm = operatorRepresentation['perm'] + invPerm = _invertPermutation(perm) + + for cube in outputCubes: + + inCubeDims = _permuteList(cube.dims, invPerm) + + InCube = HyperRectangle(_permuteList(cube.offset, invPerm), inCubeDims) + inputInCubes.append(InCube) + + for dim in range(numDims): + replacements[f"dimLen_{dim}"].append(inCubeDims[dim]) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a in inputInCubes: + inputLoadSchedule.append({"data_in": a}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Generic/TileConstraints/UnaryTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UnaryTileConstraint.py new file mode 100644 index 0000000..590ff87 --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/UnaryTileConstraint.py @@ -0,0 +1,90 @@ +# ---------------------------------------------------------------------- +# +# File: UnaryTileConstraint.py +# +# Last edited: 05.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
class UnaryTileConstraint(TileConstraint):
    """Tiling constraints for elementwise unary operators.

    Input and output tiles are congruent, and the kernel only needs the flat
    element count of each tile.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        inBuf = parseDict['data_in']
        outBuf = parseDict['data_out']

        for tensorName in (inBuf, outBuf):
            tilerModel.addTensorDimToModel(ctxt, tensorName)

        # Tie every output dimension to the matching input dimension.
        rank = len(ctxt.lookup(inBuf).shape)
        for dimIdx in range(rank):
            inVar = tilerModel.getTensorDimVar(tensorName = inBuf, dimIdx = dimIdx)
            outVar = tilerModel.getTensorDimVar(tensorName = outBuf, dimIdx = dimIdx)
            tilerModel.addConstraint(inVar == outVar)

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        outputCubes = [absCube.rectangle for absCube in absoluteOutputCubes]

        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, ['data_in', 'data_out'])

        # Per-tile flat element count replaces the static "size" parameter.
        replacements = {"size": [np.prod(cube.dims) for cube in outputCubes]}
        replacementTypes = {"size": PointerClass(uint16_t)}

        # Elementwise op: each input tile equals the corresponding output tile.
        inputLoadSchedule = [{"data_in": cube} for cube in outputCubes]
        outputLoadSchedule = [{"data_out": cube} for cube in outputCubes]

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        return VariableReplacementScheme(replacements, replacementTypes), tilingSchedule
class UntiledTileConstraint(TileConstraint):
    """Constraints for operators that must not be tiled.

    Every non-transient tensor's tile variables are pinned to the tensor's
    full shape, so the solver cannot split the operator.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        # Every string entry of the parse dict that names a known tensor
        # participates in the constraint set.
        tensorNames = [
            value for value in parseDict.values()
            if isinstance(value, str) and (ctxt.is_global(value) or ctxt.is_local(value))
        ]

        for tensorName in tensorNames:
            _buffer = ctxt.lookup(tensorName)
            # Transient scratch buffers carry no tiling geometry.
            if isinstance(_buffer, TransientBuffer):
                continue

            tilerModel.addTensorDimToModel(ctxt, tensorName)
            # Pin each dimension variable to the tensor's full extent.
            for idx, shapeDim in enumerate(_buffer.shape):
                tilerModel.addConstraint(
                    tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim)

        return tilerModel

    @staticmethod
    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
        # Nothing becomes symbolic for an untiled node; a shallow copy suffices.
        return parseDict.copy()

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        outputCubes = [absCube.rectangle for absCube in absoluteOutputCubes]

        schedule = TilingSchedule({}, {}, [], [])
        repScheme = VariableReplacementScheme({}, {})

        # Verify that no tensor was actually tiled: exactly one memory level
        # per tensor, holding the tensor's full shape.
        for key, value in tilingSolution.tensorMemoryConstraints.items():

            assert len(value.memoryConstraints.keys()) == 1, f"{cls} should be untiled, but {value} is tiled!"

            memKey = list(value.memoryConstraints.keys())[0]
            memValue = value.memoryConstraints[memKey]

            _buffer = ctxt.lookup(key)
            if isinstance(_buffer, TransientBuffer):
                continue

            assert memValue.shape == tuple(_buffer.shape)

        return repScheme, schedule
import * diff --git a/Deeploy/Targets/Generic/TileConstraints/iHardswishTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/iHardswishTileConstraint.py new file mode 100644 index 0000000..96abf97 --- /dev/null +++ b/Deeploy/Targets/Generic/TileConstraints/iHardswishTileConstraint.py @@ -0,0 +1,91 @@ +# ---------------------------------------------------------------------- +# +# File: iHardswishTileConstraint.py +# +# Last edited: 22.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class iHardswishTileConstraint(TileConstraint):
    """Tiling constraints for the integer Hardswish operator.

    iHardswish is elementwise, so input and output tiles coincide and the
    kernel only needs the per-tile element count.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        inBuf = parseDict['data_in']
        outBuf = parseDict['data_out']

        for tensorName in (inBuf, outBuf):
            tilerModel.addTensorDimToModel(ctxt, tensorName)

        # Elementwise: every output dimension equals the matching input one.
        for dimIdx in range(len(ctxt.lookup(inBuf).shape)):
            inVar = tilerModel.getTensorDimVar(tensorName = inBuf, dimIdx = dimIdx)
            outVar = tilerModel.getTensorDimVar(tensorName = outBuf, dimIdx = dimIdx)
            tilerModel.addConstraint(inVar == outVar)

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        outputCubes = [absCube.rectangle for absCube in absoluteOutputCubes]

        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, ['data_in', 'data_out'])

        # Per-tile flat element count replaces the static "size" parameter.
        replacements = {"size": [np.prod(cube.dims) for cube in outputCubes]}
        replacementTypes = {"size": PointerClass(uint16_t)}

        inputLoadSchedule = [{"data_in": cube} for cube in outputCubes]
        outputLoadSchedule = [{"data_out": cube} for cube in outputCubes]

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        return VariableReplacementScheme(replacements, replacementTypes), tilingSchedule
class iRMSNormTileConstraint(TileConstraint):
    """Tiling constraints for integer RMSNorm.

    The normalization runs over the innermost axis, so that axis must stay
    untiled; the weight vector spans that axis and rides along with every
    input tile.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        inName = parseDict['data_in']
        weightName = parseDict['weight']
        outName = parseDict['data_out']

        for tensorName in (inName, weightName, outName):
            tilerModel.addTensorDimToModel(ctxt, tensorName)

        inShape = ctxt.lookup(inName).shape
        lastIdx = len(inShape) - 1

        # The reduction axis may not be tiled: pin it to its full extent.
        lastDimVar = tilerModel.getTensorDimVar(tensorName = inName, dimIdx = lastIdx)
        tilerModel.addConstraint(lastDimVar == inShape[-1])
        # The weight vector covers the reduction axis.
        tilerModel.addConstraint(lastDimVar == tilerModel.getTensorDimVar(tensorName = weightName, dimIdx = 0))

        # Output tiles mirror input tiles dimension by dimension.
        for dimIdx in range(len(inShape)):
            tilerModel.addConstraint(
                tilerModel.getTensorDimVar(tensorName = inName, dimIdx = dimIdx) == tilerModel.getTensorDimVar(
                    tensorName = outName, dimIdx = dimIdx))

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        outputCubes = [absCube.rectangle for absCube in absoluteOutputCubes]

        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation,
                                                                  ['data_in', 'weight', 'data_out'])

        # Per-tile flat element count replaces the static "size" parameter.
        replacements = {"size": [np.prod(cube.dims) for cube in outputCubes]}
        replacementTypes = {"size": PointerClass(uint16_t)}

        inputLoadSchedule = []
        for outCube in outputCubes:
            # The weight tile is one vector spanning the (untiled) last axis.
            weightCube = copy.deepcopy(outCube)
            weightCube.dims = (outCube.dims[-1],)
            inputLoadSchedule.append({"data_in": outCube, "weight": weightCube})

        outputLoadSchedule = [{"data_out": cube} for cube in outputCubes]

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        return VariableReplacementScheme(replacements, replacementTypes), tilingSchedule
+ +from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConcatBindings, BasicReshapeBindings, \ + BasicTransposeBindings +from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint +from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint +from Deeploy.Targets.Generic.TileConstraints.NOPTileConstraint import NOPTileConstraint +from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + +BasicTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = BasicTransposeBindings, + tileConstraint = TransposeTileConstraint()) + +BasicFlattenTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = BasicReshapeBindings, + tileConstraint = NOPTileConstraint()) + +BasicAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = BasicAddBindings, + tileConstraint = AddTileConstraint()) + +BasicConcatTilingReadyBinding = TilingReadyNodeBindings(nodeBindings = BasicConcatBindings, + tileConstraint = ConcatTileConstraint()) diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py new file mode 100644 index 0000000..96eb222 --- /dev/null +++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py @@ -0,0 +1,867 @@ +# ---------------------------------------------------------------------- +# +# File: BasicPasses.py +# +# Last edited: 28.04.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from functools import partial
from typing import List

import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match, NonBranchingMatcher
from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic


def _merge_trueintegerdiv_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Fold a TrueIntegerDiv node into the following RequantShift node.

    Only applies when the requant's add is a scalar zero and its mul is a
    scalar constant; otherwise the graph is returned unchanged.
    """

    matched_nodes = [m for k, m in match.nodes_map.items()]

    integerDiv = matched_nodes[0]
    rqs2 = matched_nodes[1]

    rqs2Add = rqs2.inputs[2]
    rqs2Mul = rqs2.inputs[1]
    # NOTE(review): rqs2Div is read but never used below.
    rqs2Div = rqs2.attrs["div"]

    # Guard: add must be a scalar constant equal to zero.
    if not isinstance(rqs2Add, gs.Constant) or np.prod(rqs2Add.shape) > 1:
        return graph

    if not rqs2Add.values.item() == 0:
        return graph

    # Guard: mul must be a scalar constant.
    if not isinstance(rqs2Mul, gs.Constant) or np.prod(rqs2Mul.shape) > 1:
        return graph

    Delta = integerDiv.attrs['Delta']
    eta = integerDiv.attrs['eta']
    eps = integerDiv.attrs['eps']
    y = integerDiv.attrs['y']

    # Extra fixed-point headroom so the folded multiplier keeps precision;
    # the requant divisor is scaled up by the same factor below.
    stretch = 2**8

    coeff = np.floor(((Delta * eta) / (y * eta + eps))) * stretch

    rqs2Mul.values = np.round(rqs2Mul.values * coeff)
    rqs2.attrs['div'].values = rqs2.attrs['div'].values * stretch

    # Keep the div node's data input; reuse the requant's mul/add inputs.
    _inputs = [*integerDiv.inputs[:1], *rqs2.inputs[1:]]
    _outputs = rqs2.outputs

    newRQS = gs.Node(op = "RequantShift", name = rqs2.name + "_repl", attrs = {**rqs2.attrs})

    graph.replaceInsertNode(_inputs, _outputs, newRQS)

    return graph


@contextagnostic
class MergeTrueIntegerDivRequantShiftPass(ReplaceSequentialPatternPass):
    """Pattern pass matching TrueIntegerDiv -> RequantShift."""

    def __init__(self):
        # NOTE(review): `passes` is unused (also in several sibling passes).
        passes = []
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input],
                             outputs = ['integerdiv_out'],
                             op = 'TrueIntegerDiv',
                             name = 'integerdiv')
        output = graph.layer(inputs = output, outputs = ['rqs_2'], op = 'RequantShift', name = 'rqs2')

        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_TRUEINTEGERDIV_PASS"
        super().__init__(graph, _merge_trueintegerdiv_rq_fun, name)


def _merge_integerdiv_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Replace IntegerDiv -> RequantShift with a single RQIntegerDiv node."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    integerdiv = matched_nodes[0]
    rqs = matched_nodes[1]
    totalShift = np.round(np.log2(rqs.attrs['div'].values))

    rqs.inputs[-1].values = np.round(rqs.inputs[-1].values / rqs.inputs[-2].values + 1e-3)  # normalize add

    shiftNode = gs.Constant(f'{integerdiv.name}_shift', np.array(totalShift))
    _inputs = list(integerdiv.inputs) + list(rqs.inputs[1:]) + [shiftNode]
    _outputs = rqs.outputs

    rqsIntegerDiv = gs.Node(op = 'RQIntegerDiv', name = name, attrs = {**integerdiv.attrs, **rqs.attrs})
    graph.replaceInsertNode(_inputs, _outputs, rqsIntegerDiv)

    return graph


@contextagnostic
class IntegerDivRequantMergePass(ReplaceSequentialPatternPass):
    """Pattern pass matching IntegerDiv -> RequantShift."""

    def __init__(self):
        passes = []
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['integerdiv_out'], op = 'IntegerDiv', name = 'integerdiv')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_INTEGERDIV_PASS"
        super().__init__(graph, _merge_integerdiv_rq_fun, name)


def _merge_ihardswish_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Fuse iHardswish -> RequantShift into a RequantizediHardswish node.

    Requires the requant to have exactly three inputs with scalar constant
    mul and add.
    """
    matched_nodes = [m for k, m in match.nodes_map.items()]
    ihardswish = matched_nodes[0]
    rqs = matched_nodes[1]
    totalShift = np.round(np.log2(rqs.attrs['div'].values))

    # NOTE(review): the guards below bail out with a bare `return` (None),
    # while sibling merge functions `return graph` — confirm the pass driver
    # tolerates a None result, or make these `return graph` for consistency.
    if not len(rqs.inputs) == 3:
        return

    if not (rqs.inputs[1].shape == [] or np.prod(rqs.inputs[1].shape) == 1):
        return

    if not (rqs.inputs[2].shape == [] or np.prod(rqs.inputs[2].shape) == 1):
        return

    if not isinstance(rqs.inputs[1], gs.Constant):
        return

    if not isinstance(rqs.inputs[2], gs.Constant):
        return

    rqs.inputs[-1].values = np.round(rqs.inputs[-1].values / rqs.inputs[-2].values + 1e-3)  # normalize add

    requantArgs = {"mul": rqs.inputs[1].values.item(), "add": rqs.inputs[2].values.item(), "shift": totalShift}

    _inputs = list(ihardswish.inputs)
    _outputs = rqs.outputs

    rqsiHardswish = gs.Node(op = 'RequantizediHardswish',
                            name = name,
                            attrs = {
                                **ihardswish.attrs,
                                **rqs.attrs,
                                **requantArgs
                            })
    graph.replaceInsertNode(_inputs, _outputs, rqsiHardswish)

    return graph


@contextagnostic
class iHardswishRequantMergePass(ReplaceSequentialPatternPass):
    """Pattern pass matching iHardswish -> RequantShift."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['ihardswish_out'], op = 'iHardswish', name = 'ihardswish')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = f"_MERGE_iHARDSWISHRQ_PASS"
        super().__init__(graph, _merge_ihardswish_rq_fun, name)


def _merge_igelu_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Fuse iGELU -> RequantShift into a RequantizediGELU node."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    igelu = matched_nodes[0]
    rqs = matched_nodes[1]
    totalShift = np.round(np.log2(rqs.attrs['div'].values))

    rqs.inputs[-1].values = np.round(rqs.inputs[-1].values / rqs.inputs[-2].values + 1e-3)  # normalize add

    shiftNode = gs.Constant(f'{igelu.name}_shift', np.array(totalShift))
    _inputs = list(igelu.inputs) + list(rqs.inputs[1:]) + [shiftNode]
    _outputs = rqs.outputs

    rqsiGELU = gs.Node(op = 'RequantizediGELU', name = name, attrs = {**igelu.attrs, **rqs.attrs})
    graph.replaceInsertNode(_inputs, _outputs, rqsiGELU)

    return graph


@contextagnostic
class iGELURequantMergePass(ReplaceSequentialPatternPass):
    """Pattern pass matching iGELU -> RequantShift."""

    def __init__(self):
        passes = []
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['igelu_out'], op = 'iGELU', name = 'igelu')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = f"_MERGE_iGELURQ_PASS"
        super().__init__(graph, _merge_igelu_rq_fun, name)


def _merge_rqs_add_fun(graph: gs.Graph, match: Match, name: str):
    """Fold a constant Add operand into the following RequantShift's add.

    The constant is folded only when it does not broadcast against the Add
    output shape; the constant operand is then zeroed out.
    """
    matched_nodes = [m for k, m in match.nodes_map.items()]
    add = matched_nodes[0]
    rqs = matched_nodes[1]

    if (isinstance(add.inputs[0], gs.Constant) or isinstance(add.inputs[1], gs.Constant)) and isinstance(
            rqs.inputs[2], gs.Constant):
        if isinstance(add.inputs[0], gs.Constant):
            idx = 1  # Non-constant idx
            constantTensor = add.inputs[0]
        else:
            idx = 0  # non-constant idx
            constantTensor = add.inputs[1]
        if constantTensor.values.shape != tuple(add.outputs[0].shape):
            # Fold the constant through the requant: add' = mul*c + add.
            rqs.inputs[2].values = (rqs.inputs[1].values * constantTensor.values) + rqs.inputs[2].values
            add.inputs[(idx + 1) % 2].values = add.inputs[(idx + 1) % 2].values * 0
            rqs.inputs[0] = add.inputs[idx]
        return graph
    else:
        return graph


@contextagnostic
class MergeConstAddAndRequantPass(ReplaceSequentialPatternPass):
    """Pattern pass matching Add -> RequantShift."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['add_out'], op = 'Add', name = 'add1')
        output = graph.layer(inputs = output, outputs = ['rqs_out'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs = [_input]

        name = "_MERGE_RQS_ADD_PASS"
        super().__init__(graph, _merge_rqs_add_fun, name)


def _skip_rqs_fun(graph: gs.Graph, match: Match, name: str):
    """Drop a RequantShift that is numerically the identity (mul==div, add==0)."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    node = matched_nodes[0]
    rqs = matched_nodes[1]

    # Check if it is a unity requant
    mul = rqs.inputs[1].values
    add = rqs.inputs[2].values
    if (rqs.attrs['div'].values == mul).all() and (add == 0).all():
        # Remove the requant node
        graph.replaceInsertNode(node.inputs, rqs.outputs, node)

    return graph


@contextagnostic
class SkipUnityRequantPass(ReplaceSequentialPatternPass):
    """Pattern pass matching <previous_op_regex> -> RequantShift."""

    def __init__(self, previous_op_regex: str, num_inputs: int = 1):
        if previous_op_regex == "":
            raise ValueError('Operator not set!')

        graph = gs.Graph()
        inputs = [gs.Variable(name = f'input_{i}') for i in range(num_inputs)]
        output = graph.layer(inputs = inputs, outputs = ['op_out'], op = previous_op_regex)
        output = graph.layer(inputs = output, outputs = ['rqs_out'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs = inputs

        name = "_SKIP_RQS_PASS"
        super().__init__(graph, _skip_rqs_fun, name, NonBranchingMatcher(regex_op = True))


def _skip_emptyconcat_fun(graph: gs.Graph, match: Match, name: str):
    """Remove a Concat whenever one of its inputs has zero elements,
    rewiring downstream consumers to the remaining non-empty input."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    concat = matched_nodes[0]

    remove = False
    empty_inputs = []
    for inp in concat.inputs:
        # Check if one of the shapes is zero
        if np.prod(inp.shape) == 0:
            empty_inputs.append(inp)
            remove = True
            break

    if remove:
        # Check if one of the inputs is empty
        for inp in concat.inputs:
            # Check if one of the shapes is non-zero
            if np.prod(inp.shape) != 0:
                for outputNode in list(concat.outputs[0].outputs):
                    # Swap the outputTensor with inputTensor in the downstream nodes
                    outputNode.inputs[outputNode.inputs.index(concat.outputs[0])] = inp
        concat.inputs.clear()
        concat.outputs.clear()

        # Check if inputs are global inputs and remove them
        graph.inputs = [inp for inp in graph.inputs if inp not in empty_inputs]

        graph.cleanup().toposort()
        return graph

    return graph


@contextagnostic
class SkipEmptyConcatPass(ReplaceSequentialPatternPass):
    """Pattern pass matching a two-input Concat."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        _input2 = gs.Variable(name = 'input_2')
        output = graph.layer(inputs = [_input, _input2], outputs = ['concat_out'], op = 'Concat')
        graph.outputs.append(output)
        graph.inputs = [_input, _input2]

        name = "_SKIP_EMPTY_CONCAT_PASS"
        super().__init__(graph, _skip_emptyconcat_fun, name)


def _split_add_fun(graph: gs.Graph, match: Match, name: str):
    """Break an n-ary Add (n > 2) into a chain of binary Add nodes."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    add = matched_nodes[0]

    inputs = add.inputs
    if len(inputs) > 2:
        result = [inputs[0]]
        for i in range(0, len(inputs) - 1):
            result = graph.layer(op = "Add",
                                 name = name + f'_Add{i}',
                                 inputs = [result[0], inputs[i + 1]],
                                 outputs = [name + f'_Add{i}_out'])

        # Rewire the original consumer onto the last Add of the chain.
        add.outputs[0].outputs[0].inputs[0] = result[0]

        add.inputs.clear()
        add.outputs.clear()
        graph.cleanup().toposort()

    return graph


@contextagnostic
class SplitAddPass(ReplaceSequentialPatternPass):
    """Pattern pass matching a single Add node."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['add_out'], op = 'Add', name = 'add1')
        graph.outputs.append(output)
        graph.inputs = [_input]

        name = "_SPLIT_ADD_PASS"
        super().__init__(graph, _split_add_fun, name)


def _extract_padding_fun(graph: gs.Graph, match: Match, name: str, value = 0):
    """Pull the 'pads' attribute out of a Conv/Pool node into an explicit
    preceding Pad node filled with `value`."""

    matched_nodes = [m for k, m in match.nodes_map.items()]
    conv = matched_nodes[0]
    if 'pads' in conv.attrs and np.sum(conv.attrs['pads']) > 1:
        pads = copy.deepcopy(conv.attrs['pads'])
        shape = copy.deepcopy(conv.inputs[0].shape)
        newPads = np.zeros(2 * len(shape))
        assert len(shape) - 2 == len(pads) / 2, "Conv padding dims do not match!"
        newShape = shape

        # ONNX 'pads' lists all begin-pads followed by all end-pads; batch and
        # channel dims (first two) are never padded.
        beginPads = pads[0:len(pads) // 2]
        endPads = pads[len(pads) // 2:]
        for idx, i in enumerate(beginPads):
            newShape[2 + idx] = newShape[2 + idx] + i
            newPads[2 + idx] = i

        for idx, i in enumerate(endPads):
            newShape[2 + idx] = newShape[2 + idx] + i
            newPads[len(newPads) // 2 + 2 + idx] = i

        newConvInput = gs.Variable(name + '_padded_input', dtype = np.float32, shape = newShape)
        conv.attrs['pads'] = [0 for pad in conv.attrs['pads']]
        newPad = gs.Node(op = 'Pad',
                         name = name + '_pad',
                         attrs = {
                             'pads': newPads,
                             'mode': 'constant',
                             'value': value
                         },
                         inputs = [conv.inputs[0]],
                         outputs = [newConvInput])

        conv.inputs[0] = newConvInput
        graph.nodes.append(newPad)
        graph.cleanup().toposort()

    return graph


@contextagnostic
class ExtractPaddingFromPoolPass(ReplaceSequentialPatternPass):
    """Pattern pass matching MaxPool; pads with the minimum int8 value."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['pool_out'], op = 'MaxPool', name = 'maxpool1')
        graph.outputs.append(output)
        graph.inputs = [_input]

        name = "_EXTRACT_POOL_PASS"
        # SCHEREMO: This is a workaround!!!
        super().__init__(graph, partial(_extract_padding_fun, value = -128), name)


@contextagnostic
class ExtractPaddingFromConvPass(ReplaceSequentialPatternPass):
    """Pattern pass matching Conv; pads with zero."""

    def __init__(self):
        passes = []
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['conv_out'], op = 'Conv', name = 'conv1')
        graph.outputs.append(output)
        graph.inputs = [_input]

        name = "_EXTRACT_CONV_PASS"
        super().__init__(graph, _extract_padding_fun, name)


def _merge_matmul_add_fun(graph: gs.Graph, match: Match, name: str):
    """Fuse MatMul -> Add (with constant bias) into a single Gemm node."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    gemm = matched_nodes[0]
    add = matched_nodes[1]
    _bias = add.inputs[0] if isinstance(add.inputs[0], gs.Constant) else add.inputs[1]
    _inputs = gemm.inputs + [_bias]
    _outputs = add.outputs

    rqsGemm = gs.Node(op = 'Gemm', name = name, attrs = {'alpha': 1.0, 'beta': 1.0})
    graph.replaceInsertNode(_inputs, _outputs, rqsGemm)

    return graph


@contextagnostic
class MatMulAddMergePass(ReplaceSequentialPatternPass):
    """Pattern pass matching MatMul -> Add."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['gemm_out'], op = 'MatMul', name = 'gemm')
        output = graph.layer(inputs = output, outputs = ['add_out'], op = 'Add', name = 'add')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_MATMUL_ADD_PASS"
        super().__init__(graph, _merge_matmul_add_fun, name)


def _propagate_requant_fun(graph: gs.Graph, match: Match, name: str):
    """Move a RequantShift following an Add onto both Add operands,
    duplicating the requant parameters onto each branch."""

    matched_nodes = [m for k, m in match.nodes_map.items()]
    add = matched_nodes[0]
    rqs = matched_nodes[1]

    inputNode1 = add.inputs[0]
    inputNode2 = add.inputs[1]

    newAdd1 = gs.Constant(name = name + '_rqs1_add', values = rqs.inputs[2].values)
    newAdd2 = gs.Constant(name = name + '_rqs2_add', values = rqs.inputs[2].values)
    newMul1 = gs.Constant(name = name + '_rqs1_mul', values = rqs.inputs[1].values)
    newMul2 = gs.Constant(name = name + '_rqs2_mul', values = rqs.inputs[1].values)

    newAddInput1 = gs.Variable(name + '_add_in_1', dtype = np.float32, shape = inputNode1.shape)
    newAddInput2 = gs.Variable(name + '_add_in_2', dtype = np.float32, shape = inputNode2.shape)

    newRQS1 = gs.Node(op = 'RequantShift',
                      name = name + '_rqs1',
                      attrs = rqs.attrs,
                      inputs = [inputNode1, newMul1, newAdd1],
                      outputs = [newAddInput1])
    newRQS2 = gs.Node(op = 'RequantShift',
                      name = name + '_rqs2',
                      attrs = rqs.attrs,
                      inputs = [inputNode2, newMul2, newAdd2],
                      outputs = [newAddInput2])

    graph.nodes.append(newRQS1)
    graph.nodes.append(newRQS2)

    add.inputs = [newAddInput1, newAddInput2]
    graph.deleteNode(rqs)

    return graph


@contextagnostic
class PropagateRequantThroughAddPass(ReplaceSequentialPatternPass):
    """Pattern pass matching Add -> RequantShift."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        _input2 = gs.Variable(name = 'input_2')
        output = graph.layer(inputs = [_input, _input2], outputs = ['add_out'], op = 'Add', name = 'add1')
        output = graph.layer(inputs = output, outputs = ['r1_out'], op = 'RequantShift', name = 'r1')
        graph.outputs.append(output)
        graph.inputs = [_input, _input2]

        name = "_OPT_ADD_RQS_PASS"
        super().__init__(graph, _propagate_requant_fun, name)


def _merge_requant_fun(graph: gs.Graph, match: Match, name: str):
    """Collapse two back-to-back RequantShift nodes into one, when both
    carry scalar-constant mul/add inputs."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    attrs = {}
    rqs1 = matched_nodes[0]
    rqs2 = matched_nodes[1]

    div1 = rqs1.attrs['div'].values
    div2 = rqs2.attrs['div'].values
    newDiv = max(div1, div2)
    minDiv = min(div1, div2)
    nLevels = max(rqs1.attrs['n_levels_out'].values, rqs2.attrs['n_levels_out'].values)
    signed = max(rqs1.attrs['signed'].values, rqs2.attrs['signed'].values)

    attrs['div'] = gs.Constant(name = 'div', values = newDiv)
    # NOTE(review): read as 'n_levels_out' above but written back as
    # 'n_levels' — confirm downstream consumers expect this key.
    attrs['n_levels'] = gs.Constant(name = 'n_levels', values = nLevels)
    attrs['signed'] = gs.Constant(name = 'signed', values = signed)

    if isinstance(rqs1.inputs[1], gs.Constant) and isinstance(rqs1.inputs[2], gs.Constant) and \
       isinstance(rqs2.inputs[1], gs.Constant) and isinstance(rqs2.inputs[2], gs.Constant):
        mul1 = rqs1.inputs[1].values
        mul2 = rqs2.inputs[1].values
        add1 = rqs1.inputs[2].values
        add2 = rqs2.inputs[2].values

        # Compose the two affine requants; rescale by the smaller divisor.
        newMul = (mul1 * mul2)
        newAdd = (add1 * mul2) + (div1 * add2)

        newMul = gs.Constant(name = rqs1.name + name + '_mul', values = np.array(np.round(newMul / minDiv)))
        newAdd = gs.Constant(name = rqs1.name + name + '_add', values = np.array(np.round(newAdd / minDiv)))

        _inputs = [rqs1.inputs[0], newMul, newAdd]
        _outputs = rqs2.outputs
        newTrans = gs.Node(op = 'RequantShift', name = name, attrs = attrs)
        graph.replaceInsertNode(_inputs, _outputs, newTrans)
        return graph
    else:
        return graph


@contextagnostic
class MergeRequantPass(ReplaceSequentialPatternPass):
    """Pattern pass matching RequantShift -> RequantShift."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['r1_out'], op = 'RequantShift', name = 'r1')
        output = graph.layer(inputs = output, outputs = ['r2_out'], op = 'RequantShift', name = 'r2')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_OPT_RQS_PASS"
        super().__init__(graph, _merge_requant_fun, name)


def _merge_transposes_fun(graph: gs.Graph, match: Match, name: str):
    """Cancel or fuse two consecutive Transpose nodes."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    t1 = matched_nodes[0]
    t2 = matched_nodes[1]

    #Transpose forth and back - delete both nodes

    if (t1.inputs[0].shape == t2.outputs[0].shape):
        # Find Nodes-to-be-replaced
        graph.deleteNode(t2)
        graph.deleteNode(t1)
        graph.cleanup().toposort()
        return graph
    # Net the transpose
    else:
        p1 = t1.attrs['perm']
        p2 = t2.attrs['perm']
        newPerm = [p1[idx] for idx in p2]

        _inputs = list(t1.inputs)
        _outputs = list(t2.outputs)

        # Check if one of the intermediate nodes is an output node
        for node in t1.outputs:
            if node in
graph.outputs: + return graph + + newTrans = gs.Node(op = 'Transpose', name = name, attrs = {"perm": newPerm}) + graph.replaceInsertNode(_inputs, _outputs, newTrans) + + graph.cleanup().toposort() + return graph + + +@contextagnostic +class TransposeMergePass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['t1_out'], op = 'Transpose', name = 't1') + output = graph.layer(inputs = output, outputs = ['t2_out'], op = 'Transpose', name = 't2') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_MERGE_TRANSPOSES_PASS" + super().__init__(graph, _merge_transposes_fun, name) + + +def _split_transposes_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + t1 = matched_nodes[0] + + if len(t1.outputs[0].outputs) <= 1: + return graph + + perm = t1.attrs['perm'] + inputVar = t1.inputs[0] + inputNode = t1.inputs[0].inputs[0] + + originalNode = t1.outputs[0] + + postSplitOutput = gs.Variable(name = f"{t1.outputs[0].name}_split", dtype = np.float32, shape = t1.inputs[0].shape) + inputNode.outputs = [postSplitOutput] + + for node in originalNode.outputs.copy(): + nodeName = node.name + f"_transpose_in" + varName = node.name + f"_transpose_in_var" + newOutput = gs.Variable(name = varName, dtype = np.float32, shape = t1.outputs[0].shape) + + transposeNode = gs.Node(name = nodeName, + op = "Transpose", + inputs = [postSplitOutput], + outputs = [newOutput], + attrs = {'perm': perm}) + + graph.nodes.append(transposeNode) + + newNodeInputs = [] + for _input in node.inputs: + if _input != originalNode: + newNodeInputs.append(_input) + else: + newNodeInputs.append(newOutput) + + node.inputs = newNodeInputs + + t1.outputs = [] + t1.inputs = [] + + graph.cleanup().toposort() + return graph + + +@contextagnostic +class TransposeSplitPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph 
= gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['t1_out'], op = 'Transpose', name = 't1') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_SPLIT_TRANSPOSES_PASS" + super().__init__(graph, _split_transposes_fun, name) + + +def _const_perm_opt_transposes_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + t1 = matched_nodes[0] + + perm = t1.attrs['perm'] + if all([idx == val for idx, val in enumerate(perm)]): + graph.deleteNode(t1) + + return graph + + +@contextagnostic +class TransposeNoPermOptPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['t1_out'], op = 'Transpose', name = 't1') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_CONST_PERM_OPT_TRANSPOSES_PASS" + super().__init__(graph, _const_perm_opt_transposes_fun, name) + + +def _const_opt_transposes_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + t1 = matched_nodes[0] + + if isinstance(t1.inputs[0], gs.Constant): + t1.inputs[0].values = np.transpose(t1.inputs[0].values, t1.attrs['perm']) + graph.deleteNode(t1) + + return graph + + +@contextagnostic +class TransposeConstOptPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['t1_out'], op = 'Transpose', name = 't1') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_CONST_OPT_TRANSPOSES_PASS" + super().__init__(graph, _const_opt_transposes_fun, name) + + +def _const_opt_reshape_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + reshape = matched_nodes[0] + + if isinstance(reshape.inputs[0], gs.Constant): + 
reshape.inputs[0].values = reshape.inputs[0].values.reshape(reshape.inputs[1].values) + graph.deleteNode(reshape) + + return graph + + +@contextagnostic +class ReshapeConstOptPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['out'], op = 'Reshape', name = 'reshape') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_CONST_OPT_RESHAPE_PASS" + super().__init__(graph, _const_opt_reshape_fun, name) + + +def _merge_reshape_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + reshape1 = matched_nodes[0] + reshape2 = matched_nodes[1] + + graph.deleteNode(reshape1) + + graph.cleanup() + + return graph + + +@contextagnostic +class ReshapeMergePass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['out1'], op = 'Reshape', name = 'reshape1') + output = graph.layer(inputs = output, outputs = ['out2'], op = 'Reshape', name = 'reshape2') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_MERGE_RESHAPE_PASS" + super().__init__(graph, _merge_reshape_fun, name) + + +def _split_rqs_fun(graph: gs.Graph, match: Match, name: str, splitSet: List[str]): + matched_nodes = [m for k, m in match.nodes_map.items()] + t1 = matched_nodes[0] + + if len(t1.outputs[0].outputs) <= 1: + return graph + + outputOpNames = [node.op for node in t1.outputs[0].outputs] + + if not any([name in splitSet for name in outputOpNames]): + return graph + + inputVars = t1.inputs + inputNode = t1.inputs[0] + + originalNode = t1.outputs[0] + + userNodes = t1.outputs[0].outputs + + postSplitInputs = [] + for idx, var in enumerate(inputVars): + if isinstance(var, gs.Variable): + postSplitInput = var + else: + postSplitInput = gs.Constant(name = f"{t1.name}_split_{idx}", values = 
var.values.copy().reshape(-1,)) + postSplitInputs.append(postSplitInput) + + for idx, node in enumerate(originalNode.outputs.copy()): + + nodeName = node.name + f"_rqs" + varName = node.name + f"_rqs_var" + newOutput = gs.Variable(name = varName, dtype = np.float32, shape = t1.outputs[0].shape) + + RQSNode = gs.Node(name = nodeName, + op = "RequantShift", + inputs = postSplitInputs, + outputs = [newOutput], + attrs = t1.attrs) + + graph.nodes.append(RQSNode) + + newNodeInputs = [] + for _input in node.inputs: + if _input != originalNode: + newNodeInputs.append(_input) + else: + newNodeInputs.append(newOutput) + + node.inputs = newNodeInputs + + t1.outputs = [] + t1.inputs = [] + + graph.cleanup().toposort() + + return graph + + +@contextagnostic +class RQSSplitPass(ReplaceSequentialPatternPass): + + splitSet = ["Add", "Concat"] + + def __init__(self): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['t1_out'], op = 'RequantShift', name = 't1') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = "_SPLIT_RequantShift_PASS" + super().__init__(graph, partial(_split_rqs_fun, splitSet = self.splitSet), name) diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py new file mode 100644 index 0000000..d77d620 --- /dev/null +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -0,0 +1,508 @@ +# ---------------------------------------------------------------------- +# +# File: BasicCheckers.py +# +# Last edited: 16.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Authors: +# - Moritz Scherer, ETH Zurich +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import List, Optional, Sequence, Type

import numpy as np

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker
from Deeploy.DeeployTypes import ConstantBuffer, OperatorRepresentation, VariableBuffer


class ConcatChecker(SignPropTypeChecker):
    """Sign/level propagation for Concat: output levels are the max over all inputs."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]:
        return [max(i.nLevels for i in inputs)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]:
        # Bugfix: the original second clause was all([[...]]) — a non-empty
        # inner list is always truthy, so mixed signs could never trip the assert.
        assert (all(_inp._signed for _inp in inputs)
                or all(not _inp._signed for _inp in inputs)), "Some inputs in concat operation have different signs!"
        return [bool(inputs[0]._signed)]


class SliceChecker(SignPropTypeChecker):
    """Slice passes levels and signedness of its data input through unchanged."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]:
        return [bool(inputs[0]._signed)]


class TransposeChecker(SignPropTypeChecker):
    """Transpose passes levels and signedness through unchanged."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]:
        return [bool(inputs[0]._signed)]


class PadChecker(SignPropTypeChecker):
    """Pad passes levels and signedness through unchanged."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class AddChecker(SignPropTypeChecker):
    """Add: worst-case output levels are the sum of both input level counts."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels + inputs[1].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        # A constant second operand forces a signed output.
        return [bool(inputs[0]._signed or isinstance(inputs[1], ConstantBuffer))]


class GatherChecker(SignPropTypeChecker):
    """Gather passes levels and signedness through unchanged."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class ReshapeChecker(SignPropTypeChecker):
    """Reshape passes levels and signedness through unchanged."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class MHSAChecker(SignPropTypeChecker):
    """Multi-head self-attention: output levels come from the operator representation."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [operatorRepresentation['n_levels']]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [True]


class CLCAChecker(SignPropTypeChecker):
    """Cross-layer cross-attention: output spans the full input type width, signed."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [True]


class LinearAttentionChecker(SignPropTypeChecker):
    """Linear attention: output spans the full input type width, signed."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [True]


class GEMMChecker(SignPropTypeChecker):
    """Gemm: accumulator levels grow with the contracted dimension (transA selects it)."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [
            2**((self.input_types[0].referencedType.typeWidth) * 2) *
            inputs[0].shape[-1 - operatorRepresentation['transA']]
        ]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [True]


class iLayerNormChecker(SignPropTypeChecker):
    """Integer LayerNorm: output spans the full input type width, signed."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [True]


class MulChecker(SignPropTypeChecker):
    """Mul: output levels derived from the second input's type width."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        # NOTE(review): every sibling checker uses .referencedType.typeWidth;
        # this reads .typeWidth on the pointer type itself — confirm intended.
        return [2**(self.input_types[1].typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed or isinstance(inputs[1], ConstantBuffer))]


class IntegerDivChecker(SignPropTypeChecker):
    """Integer division: output spans the full output type width."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.output_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed or isinstance(inputs[1], ConstantBuffer))]


class RQIntegerDivChecker(SignPropTypeChecker):
    """Requantized integer division: output spans the full output type width."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.output_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed or isinstance(inputs[1], ConstantBuffer))]


class MatMulChecker(SignPropTypeChecker):
    """MatMul: accumulator levels bounded by the largest dims and input width."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [np.max(inputs[0].shape) * np.max(inputs[1].shape) * 2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        # WIESEP: Hack because previous kernel implementation assumed signed to always be true.
        return [True]


class RQMatMulChecker(SignPropTypeChecker):
    """Requantized MatMul: levels and sign come from the operator representation."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [operatorRepresentation['n_levels']]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(operatorRepresentation["signed"])]


class RQGEMMChecker(SignPropTypeChecker):
    """Requantized Gemm: levels and sign come from the operator representation."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [operatorRepresentation['n_levels']]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(operatorRepresentation["signed"])]


class ReduceMeanChecker(SignPropTypeChecker):
    """ReduceMean: output spans the input type width; sign passes through."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class ReduceSumChecker(SignPropTypeChecker):
    """ReduceSum: levels grow with the reduced axis length."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [operatorRepresentation['axisLength'] * 2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class SoftmaxChecker(SignPropTypeChecker):
    """Softmax: output spans the input type width and is always unsigned."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [False]


class GELUChecker(SignPropTypeChecker):
    """GELU: output spans the input type width; sign passes through."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class HardswishChecker(SignPropTypeChecker):
    """Hardswish: output levels span four times the input type width."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(4 * self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class RQHardswishChecker(SignPropTypeChecker):
    """Requantized Hardswish: output spans the input type width; sign passes through."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class MaxPoolChecker(SignPropTypeChecker):
    """MaxPool passes levels and signedness through unchanged."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class ConvChecker(SignPropTypeChecker):
    """Conv: accumulator levels bounded by kernel volume, weight levels, and input width."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        weight = inputs[1]
        return [
            np.prod(operatorRepresentation['kernel_shape']) * weight.nLevels * weight.shape[1] *
            2**(self.input_types[0].referencedType.typeWidth)
        ]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class RequantShiftChecker(SignPropTypeChecker):
    """RequantShift: levels and sign come from the operator representation."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [operatorRepresentation['n_levels']]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        # Consistency fix: wrap in bool() like RQMatMulChecker / RQGEMMChecker.
        return [bool(operatorRepresentation["signed"])]


class DummyChecker(SignPropTypeChecker):
    """Fallback checker: full input type width; no signedness inference."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)]


class DebugPrintChecker(SignPropTypeChecker):
    """DebugPrint passes levels and signedness through unchanged."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]

# ======================================================================
# File boundary (patch): Deeploy/Targets/Generic/__init__.py (new file)
# ======================================================================
new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/Generic/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/MemPool/Bindings.py b/Deeploy/Targets/MemPool/Bindings.py new file mode 100644 index 0000000..38157c6 --- /dev/null +++ b/Deeploy/Targets/MemPool/Bindings.py @@ -0,0 +1,95 @@ +# ---------------------------------------------------------------------- +# +# File: MemPoolBindings.py +# +# Last edited: 13.11.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
    MemoryManagementGeneration
from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, int8_t, int32_t
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.TypeCheckers import ConvChecker, GEMMChecker, MatMulChecker, MaxPoolChecker, MHSAChecker, \
    RequantShiftChecker, RQGEMMChecker, RQMatMulChecker, SoftmaxChecker
from Deeploy.Targets.MemPool.Templates import ConvTemplate, DWConvTemplate, GemmTemplate, ITAMaxTemplate, ITATemplate, \
    MatMulTemplate, MaxPoolTemplate, RequantShiftTemplate, RQGemmTemplate, RQMatMulTemplate

# Default code transformation shared by all MemPool bindings: allocate/free
# tensor memory, generate the argument structs, and resolve futures.
BasicTransformer = CodeTransformation([MemoryManagementGeneration(), ArgumentStructGeneration(), FutureGeneration()])

# Naming convention of the bindings below: the trailing numbers are the bit
# widths of the operands in order (inputs..., outputs...), e.g. _8_8_32 binds
# int8 x int8 -> int32.

# Convolution bindings (regular and depth-wise, 1D and 2D), int8 inputs and
# weights producing int32 accumulators.
MemPoolConv1D_8_8_32_Binding = NodeBinding(
    ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
    ConvTemplate.MemPoolParallel1DTemplate, BasicTransformer)
MemPoolConv2D_8_8_32_Binding = NodeBinding(
    ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
    ConvTemplate.MemPoolParallel2DTemplate, BasicTransformer)
MemPoolDWConv1D_8_8_32_Binding = NodeBinding(
    ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
    DWConvTemplate.MemPoolParallel1DTemplate, BasicTransformer)
MemPoolDWConv2D_8_8_32_Binding = NodeBinding(
    ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
    DWConvTemplate.MemPoolParallel2DTemplate, BasicTransformer)

# GEMM / MatMul bindings.
MemPoolGEMMBinding_8_8_32_32 = NodeBinding(
    GEMMChecker(
        [PointerClass(int8_t), PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int32_t)]),
    GemmTemplate.MemPoolParallelTemplate, BasicTransformer)
MemPoolITASoftmaxBinding_8_8 = NodeBinding(SoftmaxChecker([PointerClass(int8_t)], [PointerClass(int8_t)]),
                                           ITAMaxTemplate.MemPoolParallelTemplate, BasicTransformer)
MemPoolMatMul_8_8_32_Binding = NodeBinding(
    MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
    MatMulTemplate.MemPoolParallelTemplate, BasicTransformer)
MemPoolMaxPool2D_8_8_Binding = NodeBinding(MaxPoolChecker([PointerClass(int8_t)], [PointerClass(int8_t)]),
                                           MaxPoolTemplate.MemPoolParallelTemplate, BasicTransformer)

# Multi-head self-attention bindings for the ITA accelerator, one per head
# count (1, 2, 4). Inputs: q, k, v, then four (weight, bias) pairs.
MemPoolMHSA_1H_INT8_Binding = NodeBinding(
    MHSAChecker(
        [PointerClass(int8_t), PointerClass(int8_t), PointerClass(int8_t)] +
        [PointerClass(int8_t), PointerClass(int32_t)] * 4, [PointerClass(int8_t)]),
    ITATemplate.MemPoolParallelTemplate_1H, BasicTransformer)
MemPoolMHSA_2H_INT8_Binding = NodeBinding(
    MHSAChecker(
        [PointerClass(int8_t), PointerClass(int8_t), PointerClass(int8_t)] +
        [PointerClass(int8_t), PointerClass(int32_t)] * 4, [PointerClass(int8_t)]),
    ITATemplate.MemPoolParallelTemplate_2H, BasicTransformer)
MemPoolMHSA_4H_INT8_Binding = NodeBinding(
    MHSAChecker(
        [PointerClass(int8_t), PointerClass(int8_t), PointerClass(int8_t)] +
        [PointerClass(int8_t), PointerClass(int32_t)] * 4, [PointerClass(int8_t)]),
    ITATemplate.MemPoolParallelTemplate_4H, BasicTransformer)

# Requantizing GEMM / MatMul bindings (fused multiply + requantization).
MemPoolRQGEMMBinding_8_8_32_32_32_8 = NodeBinding(
    RQGEMMChecker([
        PointerClass(int8_t),
        PointerClass(int8_t),
        PointerClass(int32_t),
        PointerClass(int32_t),
        PointerClass(int32_t)
    ], [PointerClass(int8_t)]), RQGemmTemplate.MemPoolParallelTemplate, BasicTransformer)
MemPoolRQMatMul_8_8_32_32_Binding = NodeBinding(
    RQMatMulChecker(
        [PointerClass(int8_t), PointerClass(int8_t),
         PointerClass(int32_t),
         PointerClass(int32_t)], [PointerClass(int8_t)]), RQMatMulTemplate.MemPoolParallelTemplate, BasicTransformer)

# Requant-shift bindings: one per supported integer input type, all producing
# int8 output.
MemPoolRQSBindings_x_32_32_8 = [
    NodeBinding(
        RequantShiftChecker([PointerClass(_type), PointerClass(int32_t),
                             PointerClass(int32_t)], [PointerClass(int8_t)]),
        RequantShiftTemplate.MemPoolParallelTemplate, BasicTransformer) for _type in IntegerDataTypes
]
# ----------------------------------------------------------------------
#
# File: MemPoolDataTypes.py
#
# Last edited: 08.01.2024
#
# Copyright (C) 2024, ETH Zurich and University of Bologna.
#
# Author: Philip Wiese, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass

from Deeploy.AbstractDataTypes import PointerClass, Struct
from Deeploy.CommonExtensions.DataTypes import int32_t, uint8_t


class ita_quant_t(Struct):
    # C struct describing ITA requantization parameters: pointers to the
    # multiplier array, right-shift array and additive offset array.
    typeName = "ita_quant_t"
    structTypeDict = {
        'eps_mult': PointerClass(uint8_t),
        'right_shift': PointerClass(uint8_t),
        'add': PointerClass(int32_t)
    }


@dataclass
class MemPoolStructDataTypes():
    # Namespace collecting the MemPool-specific struct types.
    # NOTE(review): the unannotated class attribute below is NOT a dataclass
    # field, so @dataclass generates an empty __init__/__eq__/__repr__ here —
    # presumably the class is used purely as a namespace; confirm.
    ita_quant_t = ita_quant_t
# ----------------------------------------------------------------------
#
# File: MemPoolDeployer.py
#
# Last edited: 13.11.2022
#
# Copyright (C) 2022, ETH Zurich and University of Bologna.
#
# Author: Philip Wiese, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable, Dict, Optional, Type

import onnx_graphsurgeon as gs

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
    NCHWtoNHWCPass, TransposeMatmulInputsPass
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass


class MemPoolDeployer(SignPropDeployer):
    """Sign-propagating network deployer for the MemPool target.

    Extends the generic lowering pipeline with the transpose/layout passes
    the MemPool kernels require (matmul-input transposition, NCHW->NHWC
    conversion, transpose merging/constant folding, debug-print merging).
    """

    def __init__(self,
                 graph: gs.Graph,
                 deploymentPlatform: DeploymentPlatform,
                 inputTypes: Dict[str, Type[Pointer]],
                 loweringOptimizer: TopologyOptimizer,
                 scheduler: Callable = lambda x: x,
                 name: str = 'DeeployNetwork',
                 default_channels_first: bool = True,
                 deeployStateDir: str = "DeeployState",
                 inputOffsets: Optional[Dict[str, int]] = None):
        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
                         default_channels_first, deeployStateDir)

        # Fresh dict per instance: the previous mutable default argument
        # (`inputOffsets = {}`) was evaluated once and shared by every
        # deployer constructed without the argument, so a mutation in one
        # instance leaked into all others.
        self.inputOffsets = {} if inputOffsets is None else inputOffsets

        self.loweringOptimizer.passes += [
            TransposeMatmulInputsPass(),
            NCHWtoNHWCPass(self.default_channels_first),
            TransposeMergePass(),
            TransposeConstOptPass(),
            DebugPrintMergePass()
        ]
# ----------------------------------------------------------------------
#
# File: MemPoolLayers.py
#
# Last edited: 13.11.2022
#
# Copyright (C) 2021, ETH Zurich and University of Bologna.
+# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/Deeploy/Targets/MemPool/Parsers.py b/Deeploy/Targets/MemPool/Parsers.py new file mode 100644 index 0000000..6166d1c --- /dev/null +++ b/Deeploy/Targets/MemPool/Parsers.py @@ -0,0 +1,150 @@ +# ---------------------------------------------------------------------- +# +# File: MemPoolParsers.py +# +# Last edited: 13.11.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Tuple

import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import NetworkContext
from Deeploy.Targets.Generic.Parsers import MHSAParser

# Requantization-offset attributes the MemPool/ITA MHSA kernels require on
# top of the generic MHSA contract.
_REQUANT_ADD_ATTRS = (
    'preattn_requant_add',
    'postattn_requant_add',
    'wo_requant_add',
    'wq_requant_add',
    'wk_requant_add',
    'wv_requant_add',
)


class MemPoolMHSAParser(MHSAParser):
    """Parser for 8-bit multi-head self-attention nodes lowered to ITA."""

    def parseNode(self, node: gs.Node) -> bool:
        """Accept the node iff the generic MHSA check passes, activations are
        8-bit (256 levels), and every requantization-add attribute is present.
        On success, the attributes are copied into operatorRepresentation."""
        wellFormed = super().parseNode(node)
        if wellFormed:
            wellFormed = (self.operatorRepresentation['n_levels'] == 256 and
                          all(attr in node.attrs for attr in _REQUANT_ADD_ATTRS))
        if wellFormed:
            for attr in _REQUANT_ADD_ATTRS:
                self.operatorRepresentation[attr] = node.attrs[attr]
        return wellFormed

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)
        # Bail out early if the parent parser failed: the 'k'/'v' entries of
        # operatorRepresentation may not exist in that case. (The original
        # code ignored `ret` entirely and could report success on a node the
        # parent had rejected.)
        if not ret:
            return newCtxt, False

        K = ctxt.lookup(self.operatorRepresentation['k'])
        V = ctxt.lookup(self.operatorRepresentation['v'])

        self.operatorRepresentation['E'] = int(K.shape[-1])  # Embedding size

        # K and V have to be the same tensor for ITA.
        return newCtxt, K.name == V.name


class MemPoolM1HSAParser(MemPoolMHSAParser):
    """MHSA parser restricted to the single-head ITA kernel."""

    def parseNode(self, node: gs.Node) -> bool:
        return super().parseNode(node) and self.operatorRepresentation['heads'] == 1


class MemPoolM2HSAParser(MemPoolMHSAParser):
    """MHSA parser restricted to the two-head ITA kernel."""

    def parseNode(self, node: gs.Node) -> bool:
        return super().parseNode(node) and self.operatorRepresentation['heads'] == 2

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        # Work on a copy so a failed match does not pollute the caller's context.
        return super().parseNodeCtxt(ctxt.copy(), node, channels_first)


class MemPoolITAM4HSAParser(MemPoolMHSAParser):
    """MHSA parser for head counts that are a multiple of four (4-head ITA kernel)."""

    def parseNode(self, node: gs.Node) -> bool:
        return super().parseNode(node) and self.operatorRepresentation['heads'] % 4 == 0

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        # Work on a copy so a failed match does not pollute the caller's context.
        return super().parseNodeCtxt(ctxt.copy(), node, channels_first)
# ----------------------------------------------------------------------
#
# File: MemPoolPlatform.py
#
# Last edited: 17.12.2022
#
# Copyright (C) 2022, ETH Zurich and University of Bologna.
#
# Author:
# - Philip Wiese, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

import numpy as np

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
    StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConv1DBinding, BasicConv2DBinding, \
    BasicDebugPrintBindings, BasicDWConv1DBinding, BasicDWConv2DBinding, BasicGatherBindings, BasicGELUBinding, \
    BasicIntegerDivBinding, BasicLayerNormBinding, BasicMulBindings, BasicPad1DBindings, BasicPad2DBindings, \
    BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, BasicRQIntegerDivBinding, \
    BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBinding, BasicTransposeBindings, DummyBinding
from Deeploy.Targets.Generic.Layers import AddLayer, ConvLayer, DebugPrintLayer, GatherLayer, GEMMLayer, \
    IntegerDivLayer, ITAMaxLayer, MatMulLayer, MaxPoolLayer, MHSALayer, MulLayer, PadLayer, ReduceMeanLayer, \
    ReduceSumLayer, RequantShiftLayer, ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, RQMatMulLayer, RQSiGELULayer, \
    SliceLayer, TransposeLayer, iGELULayer, iLayerNormLayer, iSoftmaxLayer
from Deeploy.Targets.Generic.Parsers import AddParser, DebugParser, DummyParser, FlattenParser, GatherParser, \
    GenericConv1DParser, GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, \
    GenericMaxPool2DParser, IntegerDivParser, ITAMaxParser, MatMulParser, MulParser, Pad1DParser, Pad2DParser, \
    ReduceMeanParser, ReduceSumParser, RequantShiftParser, ReshapeParser, RQGEMMParser, RQIntegerDivParser, \
    RQMatMulParser, RQSiGELUParser, SliceParser, TransposeParser, UnsqueezeParser, iGELUParser, iLayerNormParser, \
    iSoftmaxParser
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ExtractPaddingFromConvPass, \
    ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, SplitAddPass, iGELURequantMergePass
from Deeploy.Targets.MemPool.Bindings import MemPoolConv1D_8_8_32_Binding, MemPoolConv2D_8_8_32_Binding, \
    MemPoolDWConv1D_8_8_32_Binding, MemPoolDWConv2D_8_8_32_Binding, MemPoolGEMMBinding_8_8_32_32, \
    MemPoolITASoftmaxBinding_8_8, MemPoolMatMul_8_8_32_Binding, MemPoolMaxPool2D_8_8_Binding, \
    MemPoolMHSA_1H_INT8_Binding, MemPoolMHSA_2H_INT8_Binding, MemPoolMHSA_4H_INT8_Binding, \
    MemPoolRQGEMMBinding_8_8_32_32_32_8, MemPoolRQMatMul_8_8_32_32_Binding, MemPoolRQSBindings_x_32_32_8
from Deeploy.Targets.MemPool.Parsers import MemPoolITAM4HSAParser, MemPoolM1HSAParser, MemPoolM2HSAParser
from Deeploy.Targets.MemPool.Templates import AllocateTemplate, FreeTemplate
from Deeploy.Targets.MemPool.TopologyOptimizationPasses.Passes import MemPoolFuseMHSAPass, \
    MemPoolGEMMRequantMergePass, MemPoolMatMulRequantMergePass, MemPoolSplitMHSAPass

# Fallback bindings from the generic platform
# (they support a wider range of attribute values)
GenericConv1D_Mapper = NodeMapper(GenericConv1DParser(), [BasicConv1DBinding])
GenericDWConv1D_Mapper = NodeMapper(GenericDWConv1DParser(), [BasicDWConv1DBinding])
GenericConv2D_Mapper = NodeMapper(GenericConv2DParser(), [BasicConv2DBinding])
GenericDWConv2D_Mapper = NodeMapper(GenericDWConv2DParser(), [BasicDWConv2DBinding])

GenericConv_Mappers = [GenericConv2D_Mapper, GenericDWConv2D_Mapper, GenericConv1D_Mapper, GenericDWConv1D_Mapper]

# Basic bindings
Add_Mapper = NodeMapper(AddParser(), BasicAddBindings)
DebugPrint_Mapper = NodeMapper(DebugParser(), BasicDebugPrintBindings)
Flatten_Mapper = NodeMapper(FlattenParser(), BasicReshapeBindings)
Gather_Mapper = NodeMapper(GatherParser(), BasicGatherBindings)
GELU_Mapper = NodeMapper(iGELUParser(), [BasicGELUBinding])
iLayerNorm_Mapper = NodeMapper(iLayerNormParser(), [BasicLayerNormBinding])
IntegerDiv_Mapper = NodeMapper(IntegerDivParser(), [BasicIntegerDivBinding])
ITAMaxMapper = NodeMapper(ITAMaxParser(), [MemPoolITASoftmaxBinding_8_8])
Mul_Mapper = NodeMapper(MulParser(), BasicMulBindings)
Pad1D_Mapper = NodeMapper(Pad1DParser(), BasicPad1DBindings)
Pad2D_Mapper = NodeMapper(Pad2DParser(), BasicPad2DBindings)
ReduceMean_Mapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings)
ReduceSum_Mapper = NodeMapper(ReduceSumParser(), BasicReduceSumBindings)
RequantShift_Mapper = NodeMapper(RequantShiftParser(), MemPoolRQSBindings_x_32_32_8)
Reshape_Mapper = NodeMapper(ReshapeParser(), BasicReshapeBindings)
RQGELU_Mapper = NodeMapper(RQSiGELUParser(), [BasicRQSGELUBinding])
RQIntegerDiv_Mapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding])
Softmax_Mapper = NodeMapper(iSoftmaxParser(), [BasicSoftmaxBinding])
Transpose_Mapper = NodeMapper(TransposeParser(), BasicTransposeBindings)
Unsqueeze_Mapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings)

# MemPool specific bindings
Conv1D_Mapper = NodeMapper(GenericConv1DParser(), [MemPoolConv1D_8_8_32_Binding])
Conv2D_Mapper = NodeMapper(GenericConv2DParser(), [MemPoolConv2D_8_8_32_Binding])
DWConv1D_Mapper = NodeMapper(GenericDWConv1DParser(), [MemPoolDWConv1D_8_8_32_Binding])
DWConv2D_Mapper = NodeMapper(GenericDWConv2DParser(), [MemPoolDWConv2D_8_8_32_Binding])
GEMM_Mapper = NodeMapper(GenericGEMMParser(), [MemPoolGEMMBinding_8_8_32_32])
MatMul_Mapper = NodeMapper(MatMulParser(), [MemPoolMatMul_8_8_32_Binding])
MaxPool_Mapper = NodeMapper(GenericMaxPool2DParser(), [MemPoolMaxPool2D_8_8_Binding])
M1HSA_Mapper = NodeMapper(MemPoolM1HSAParser(), [MemPoolMHSA_1H_INT8_Binding])
M2HSA_Mapper = NodeMapper(MemPoolM2HSAParser(), [MemPoolMHSA_2H_INT8_Binding])
M4HSA_Mapper = NodeMapper(MemPoolITAM4HSAParser(), [MemPoolMHSA_4H_INT8_Binding])
RQMatMul_Mapper = NodeMapper(RQMatMulParser(), [MemPoolRQMatMul_8_8_32_32_Binding])
RQGemm_Mapper = NodeMapper(RQGEMMParser(), [MemPoolRQGEMMBinding_8_8_32_32_32_8])

MHSA_Mappers = [M4HSA_Mapper, M2HSA_Mapper, M1HSA_Mapper]

Conv_Mappers = [Conv2D_Mapper, DWConv2D_Mapper, Conv1D_Mapper, DWConv1D_Mapper]

SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings)

# Dummy nodes are intended for development purposes only!
# They should always generate compiler errors to not accidentally end up in production code
DummyMapper = NodeMapper(DummyParser(), [DummyBinding])

# Operator-name -> layer mapping used by the MemPool engine.
MemPoolMapping = {
    'Add': AddLayer([Add_Mapper]),
    'Conv': ConvLayer(Conv_Mappers + GenericConv_Mappers),  # Mapper with higher priority should be placed first!
    'DebugPrint': DebugPrintLayer([DebugPrint_Mapper]),
    'Div': IntegerDivLayer([IntegerDiv_Mapper]),
    'Flatten': ReshapeLayer([Flatten_Mapper]),
    'Gather': GatherLayer([Gather_Mapper]),
    'Gemm': GEMMLayer([GEMM_Mapper]),
    'iGELU': iGELULayer([GELU_Mapper]),
    'iLayerNorm': iLayerNormLayer([iLayerNorm_Mapper]),
    'IntegerDiv': IntegerDivLayer([IntegerDiv_Mapper]),
    'IntegerMean': ReduceMeanLayer([ReduceMean_Mapper]),
    'iSoftmax': iSoftmaxLayer([Softmax_Mapper]),
    'ITAMax': ITAMaxLayer([ITAMaxMapper]),
    'MatMul': MatMulLayer([MatMul_Mapper]),
    'MatMulInteger': MatMulLayer([MatMul_Mapper]),
    'MaxPool': MaxPoolLayer([MaxPool_Mapper]),
    'MHSA': MHSALayer(MHSA_Mappers),
    'Mul': MulLayer([Mul_Mapper]),
    'Pad': PadLayer([Pad1D_Mapper, Pad2D_Mapper]),
    'ReduceMean': ReduceMeanLayer([ReduceMean_Mapper]),
    'ReduceSum': ReduceSumLayer([ReduceSum_Mapper]),
    'RequantizediGELU': RQSiGELULayer([RQGELU_Mapper]),
    'RequantShift': RequantShiftLayer([RequantShift_Mapper]),
    'Reshape': ReshapeLayer([Reshape_Mapper]),
    'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDiv_Mapper]),
    'RQGemm': RQGEMMLayer([RQGemm_Mapper]),
    'RQMatMul': RQMatMulLayer([RQMatMul_Mapper]),
    'Transpose': TransposeLayer([Transpose_Mapper]),
    'Unsqueeze': ReshapeLayer([Unsqueeze_Mapper]),
    'Slice': SliceLayer([SliceMapper])
}


class MemPoolVariableBuffer(VariableBuffer):
    # L1-resident variable buffer, allocated at runtime by core 0.

    initTemplate = AllocateTemplate.MemPoolInitTemplate
    allocTemplate = AllocateTemplate.MemPoolAllocateTemplate
    deallocTemplate = FreeTemplate.MemPoolLocalTemplate


class MemPoolTransientBuffer(TransientBuffer):
    # Scratch buffer with the same L1 allocation scheme as variable buffers.

    initTemplate = AllocateTemplate.MemPoolInitTemplate
    allocTemplate = AllocateTemplate.MemPoolAllocateTemplate
    deallocTemplate = FreeTemplate.MemPoolLocalTemplate


class MemPoolConstantBuffer(ConstantBuffer):
    # L2-resident constant buffer, emitted as a static initialized array.

    initTemplate = AllocateTemplate.MemPoolGlobalInitTemplate
    allocTemplate = AllocateTemplate.MemPoolGlobalAllocateTemplate
    deallocTemplate = FreeTemplate.MemPoolGlobalTemplate

    def _bufferRepresentation(self) -> Dict:
        """Return the buffer representation with 'size' padded so the array
        occupies a multiple of 4 bytes.

        WIESEP: Workaround for banshee simulations. Due to problems with
        wrongly copied bytes, we want arrays sized a multiple of 4.
        """
        retDict = super()._bufferRepresentation()
        # Use pure integer arithmetic and avoid shadowing the builtin `bytes`
        # (the original used float division and `int(bytes / 4 + 1)`).
        numBytes = int(np.prod(self.shape)) * (self._type.typeWidth // 8)
        numBytes = ((numBytes + 3) // 4) * 4  # round up to a multiple of 4
        retDict['size'] = (numBytes * 8) // self._type.typeWidth
        return retDict


class MemPoolStructBuffer(StructBuffer):
    # Struct-valued buffer, initialized in L1 and filled in by core 0.

    initTemplate = AllocateTemplate.MemPoolStructInitTemplate
    allocTemplate = AllocateTemplate.MemPoolStructAllocateTemplate
    deallocTemplate = NodeTemplate("")


# Graph-level optimization pipeline applied before lowering; order matters
# (MHSA fusion/splitting first, padding extraction last).
MemPoolOptimizer = TopologyOptimizer([
    MemPoolFuseMHSAPass(H = 8, bias = False, preSoftMaxRQ = True, integerDiv = False),
    MemPoolFuseMHSAPass(H = 1, bias = False, preSoftMaxRQ = True, integerDiv = False),
    MemPoolFuseMHSAPass(H = -1, bias = False, preSoftMaxRQ = True, integerDiv = False),
    MemPoolFuseMHSAPass(H = -1, bias = True, preSoftMaxRQ = True, integerDiv = False),
    MemPoolSplitMHSAPass(),
    iGELURequantMergePass(),
    MatMulAddMergePass(),
    SplitAddPass(),
    MergeConstAddAndRequantPass(),
    MemPoolMatMulRequantMergePass(),
    MemPoolGEMMRequantMergePass(),
    ExtractPaddingFromConvPass(),
    ExtractPaddingFromPoolPass(),
    # DebugPrintPass(r'.*[Mm]at[Mm]ul.*', position = 'after'),
])

includeList = ["DeeployMath.h", "runtime.h", "synchronization.h"]


class MemPoolEngine(DeploymentEngine):
    """Deployment engine mapping ONNX operators onto the MemPool kernels."""

    def __init__(self, name: str, Mapping = MemPoolMapping, initCode: str = "", includeList = includeList) -> None:
        # NOTE(review): Mapping/includeList default to shared module-level
        # objects; this is fine as long as engines treat them as read-only.
        super().__init__(name, Mapping, initCode, includeList)


class MemPoolPlatform(DeploymentPlatform):
    """Deployment platform describing the MemPool many-core target."""

    def __init__(self,
                 engines = None,
                 variableBuffer = MemPoolVariableBuffer,
                 constantBuffer = MemPoolConstantBuffer,
                 structBuffer = MemPoolStructBuffer,
                 transientBuffer = MemPoolTransientBuffer):
        # Build the default engine per instance: the previous mutable default
        # (`engines = [MemPoolEngine("MemPool")]`) was constructed once at
        # class-definition time, so every default-constructed platform shared
        # the very same engine object.
        if engines is None:
            engines = [MemPoolEngine("MemPool")]
        super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)
# ----------------------------------------------------------------------
#
# File: AllocateTemplate.py
#
# Last edited: 15.12.2021
#
# Copyright (C) 2021, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from Deeploy.DeeployTypes import NodeTemplate

# Declares the buffer pointer at file scope in the .l1 section; the pointer is
# filled in at runtime by the allocation template below.
MemPoolInitTemplate = NodeTemplate("${type.typeName} ${name} __attribute__((section(\".l1\")));\n")
# Runtime allocation from the L1 heap, performed by core 0 only. The
# `## ...` lines are Mako comments holding optional malloc-trace code.
MemPoolAllocateTemplate = NodeTemplate("""
if (core_id ==0) {
    ## #if DEEPLOY_TRACE_MALLOC
    ## deeploy_log("[Deeploy] Alloc ${name} (${type.referencedType.typeName} * ${size})\\r\\n");
    ## alloc_dump(get_alloc_l1());
    ## #endif

    ${name} = (${type.typeName}) deeploy_malloc(sizeof(${type.referencedType.typeName}) * ${size});

    ## #if DEEPLOY_TRACE_MALLOC
    ## deeploy_log(" -> @ %p\\r\\n", ${name});
    ## alloc_dump(get_alloc_l1());
    ## #endif
}
""")

# Constants live in L2 as statically initialized arrays, so no runtime
# allocation is emitted for them.
MemPoolGlobalInitTemplate = NodeTemplate(
    "static ${type.referencedType.typeName} ${name}[${size}] __attribute__((section(\".l2\"))) = {${values}};\n")
MemPoolGlobalAllocateTemplate = NodeTemplate("")

# Struct buffers: declared in L1, then assigned by core 0 from the generated
# struct literal.
MemPoolStructInitTemplate = NodeTemplate("""
static ${type.typeName} ${name} __attribute__((section(\".l1\")));
""")
#static const ${type}* ${name} = &${name}_UL;

MemPoolStructAllocateTemplate = NodeTemplate("""
if (core_id == 0) {
    ${name} = (${structDict.typeName}) ${str(structDict)};
}
""")
# ----------------------------------------------------------------------
#
# File: ConvTemplate.py
#
# Last edited: 02.12.2022
#
# Copyright (C) 2022, ETH Zurich and University of Bologna.
#
# Author: Philip Wiese, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _Conv2D_Template(NodeTemplate):
    # Template for the parallel MemPool convolution kernels. alignToContext
    # derives the input/output zero-point offsets from the sign/level
    # annotations on the buffers.

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        # Unsigned tensors are offset by half their level range so the kernel
        # can work on signed values; signed tensors need no offset. The
        # output offset has the opposite sign (shift back after compute).
        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels // 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2)

        return ctxt, operatorRepresentation, []


# 1D convolution expressed as a 2D kernel with the x-dimension fixed to 1.
# The loop advances the in/out pointers by one batch stride per iteration.
MemPoolParallel1DTemplate = _Conv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_y
%>

// 1D Conv Parallel (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);
${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

for (uint32_t n=0; n<${batch}; ++n) {
    Conv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW(
        ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y},
        ${weight}, ${ch_im_out}, 1, ${dim_kernel_y},
        1, ${stride_y},
        ref_${data_out}_${data_out}, ${input_offset}, ${output_offset},
        core_id,
        numThreads
    );
    ref_${data_out}_${data_in} += ${batchOffsetIn};
    ref_${data_out}_${data_out} += ${batchOffsetOut};
}
mempool_barrier(numThreads);
""")

MemPoolParallel2DTemplate = _Conv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y
%>

// 2D Conv Parallel (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);
${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

for (uint32_t n=0; n<${batch}; ++n) {
    Conv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW(
        ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},
        ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y},
        ${stride_x}, ${stride_y},
        ref_${data_out}_${data_out}, ${input_offset}, ${output_offset},
        core_id,
        numThreads
    );
    ref_${data_out}_${data_in} += ${batchOffsetIn};
    ref_${data_out}_${data_out} += ${batchOffsetOut};
}
mempool_barrier(numThreads);
""")
# ----------------------------------------------------------------------
#
# File: DWConvTemplate.py
#
# Last edited: 09.01.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Philip Wiese, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _DWConv2D_Template(NodeTemplate):
    """Node template for the MemPool parallel depth-wise convolution kernels.

    Before rendering, computes the zero-point offsets that the generated C
    kernel applies to the input and output tensors.
    """

    # NOTE: the redundant __init__ that only delegated to super() was removed;
    # NodeTemplate's constructor is inherited unchanged.

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Attach ``input_offset``/``output_offset`` to the representation.

        Unsigned tensors are shifted by half their level count so the kernel
        operates on signed values; buffers without ``_signed``/``nLevels``
        annotations default to offset 0. Returns the (unchanged) context, the
        updated representation, and an empty list of hoisted names.
        """
        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        data_out = ctxt.lookup(operatorRepresentation['data_out'])

        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels // 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2)

        return ctxt, operatorRepresentation, []


MemPoolParallel1DTemplate = _DWConv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_y
%>

// 1D Depth-Wise Conv Parallel (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);
${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

for (uint32_t n=0; n<${batch}; ++n) {
    DWConv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW(
        ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y},
        ${weight}, 1, ${dim_kernel_y},
        1, ${stride_y},
        ref_${data_out}_${data_out}, ${input_offset}, ${output_offset},
        core_id,
        numThreads
    );
    ref_${data_out}_${data_in} += ${batchOffsetIn};
    ref_${data_out}_${data_out} += ${batchOffsetOut};
}
mempool_barrier(numThreads);
""")

MemPoolParallel2DTemplate = _DWConv2D_Template("""
<%
batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y
batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y
%>

// 2D Depth-Wise Conv Parallel (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);
${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

for (uint32_t n=0; n<${batch}; ++n) {
    DWConv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW(
        ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},
        ${weight}, ${dim_kernel_x}, ${dim_kernel_y},
        ${stride_x}, ${stride_y},
        ref_${data_out}_${data_out}, ${input_offset}, ${output_offset},
        core_id,
        numThreads
    );
    ref_${data_out}_${data_in} += ${batchOffsetIn};
    ref_${data_out}_${data_out} += ${batchOffsetOut};
}
mempool_barrier(numThreads);
""")

# diff --git a/Deeploy/Targets/MemPool/Templates/FreeTemplate.py b/Deeploy/Targets/MemPool/Templates/FreeTemplate.py
# new file mode 100644 index 0000000..899afa9 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/FreeTemplate.py @@ -0,0 +1,56 @@
# ----------------------------------------------------------------------
#
# File: FreeTemplate.py
#
# Last edited: 15.12.2021
#
# Copyright (C) 2021, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from Deeploy.DeeployTypes import NodeTemplate

# The local (L1) and global (L2) free templates were byte-identical
# duplicates; keep a single source of truth for the generated C snippet.
# Only core 0 performs the free; the ## lines are Mako comments that keep
# the optional malloc-tracing code out of the rendered output.
_freeTemplateStr = """
if (core_id ==0) {
    ## #if DEEPLOY_TRACE_MALLOC
    ## deeploy_log("[Deeploy] Free ${name} @ %p\\r\\n", ${name});
    ## alloc_dump(get_alloc_l1());
    ## #endif

    simple_free(${name});

    ## #if DEEPLOY_TRACE_MALLOC
    ## alloc_dump(get_alloc_l1());
    ## #endif
}
"""

# Frees a buffer that lives in local (L1) memory.
MemPoolLocalTemplate = NodeTemplate(_freeTemplateStr)

# Frees a buffer that lives in global (L2) memory. Currently identical to
# the local variant; both call simple_free from core 0 only.
MemPoolGlobalTemplate = NodeTemplate(_freeTemplateStr)

# diff --git a/Deeploy/Targets/MemPool/Templates/GemmTemplate.py b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py
# new file mode 100644 index 0000000..d4852ba --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py @@ -0,0 +1,166 @@
# ----------------------------------------------------------------------
#
# File: GemmTemplate.py
#
# Last edited: 16.05.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Philip Wiese, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, OperatorRepresentation


class _GemmTemplate(NodeTemplate):
    """Node template for the MemPool parallel GEMM kernel.

    Computes the per-tensor zero-point offsets for A, B, C and the output,
    and hoists L1 staging buffers for any constant operand so the kernel can
    DMA it out of L2 before computing.
    """

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Attach A/B/C/Y zero-point offsets to the representation.

        Fix: the return annotation previously claimed a 2-tuple although a
        3-tuple (ctxt, representation, hoisted names) is returned.
        """
        A = ctxt.lookup(operatorRepresentation['A'])
        B = ctxt.lookup(operatorRepresentation['B'])
        C = ctxt.lookup(operatorRepresentation['C'])
        Y = ctxt.lookup(operatorRepresentation['data_out'])
        # NOTE(review): unlike the Conv templates, this assumes _signed and
        # nLevels always exist on the looked-up buffers — confirm upstream.
        operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2)
        operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2)
        operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2)
        operatorRepresentation['Y_offset'] = -(Y._signed == 0) * int(Y.nLevels / 2)

        return ctxt, operatorRepresentation, []

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist an L1 staging buffer for each constant GEMM operand.

        If an operand is a ConstantBuffer (i.e. its data lives in L2), a
        transient L1 buffer of matching size is hoisted so the template can
        first copy the data to L1 (DMA) and speed up the computation.
        Non-constant operands are used in place. Always records
        ``ctxtBuffer_<T>_size`` for the template's copy code.
        """
        M = operatorRepresentation['M']
        N = operatorRepresentation['N']
        O = operatorRepresentation['O']

        names = []
        # A is M x N, B is N x O, C is M x O.
        for tensor, numElements in (('A', M * N), ('B', N * O), ('C', M * O)):
            buffer = ctxt.lookup(operatorRepresentation[tensor])
            size = numElements * (buffer._type.referencedType.typeWidth // 8)
            name = operatorRepresentation['nodeName'] + "_buffer_" + tensor
            operatorRepresentation[f'ctxtBuffer_{tensor}_size'] = size
            if isinstance(buffer, ConstantBuffer):
                names.append(name)
                ctxt.hoistTransientBuffer(name, size)
                operatorRepresentation[f'ctxtBuffer_{tensor}'] = ctxt._mangle(name)
            else:
                # Operand already lives where the kernel can reach it; alias it.
                operatorRepresentation[f'ctxtBuffer_{tensor}'] = operatorRepresentation[tensor]

        return ctxt, operatorRepresentation, names


MemPoolParallelTemplate = _GemmTemplate("""
// GEMM Parallel (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);

%if ctxtBuffer_A != A:
// Fast copy data from L2 to L1
BEGIN_SINGLE_CORE
    #if USE_DMA
    dma_memcpy_blocking(${ctxtBuffer_A}, ${A}, ${ctxtBuffer_A_size});
    #else
    memcpy(${ctxtBuffer_A}, ${A}, ${ctxtBuffer_A_size});
    #endif
END_SINGLE_CORE
%endif

%if ctxtBuffer_B != B:
// Fast copy data from L2 to L1
BEGIN_SINGLE_CORE
    #if USE_DMA
    dma_memcpy_blocking(${ctxtBuffer_B}, ${B}, ${ctxtBuffer_B_size});
    #else
    memcpy(${ctxtBuffer_B}, ${B}, ${ctxtBuffer_B_size});
    #endif
END_SINGLE_CORE
%endif

%if ctxtBuffer_C != C:
// Fast copy data from L2 to L1
BEGIN_SINGLE_CORE
    #if USE_DMA
    dma_memcpy_blocking(${ctxtBuffer_C}, ${C}, ${ctxtBuffer_C_size});
    #else
    memcpy(${ctxtBuffer_C}, ${C}, ${ctxtBuffer_C_size});
    #endif
END_SINGLE_CORE
%endif

%if ctxtBuffer_A != A or ctxtBuffer_B != B or ctxtBuffer_C != C:
    mempool_barrier(numThreads);
%endif

${A_type.typeName} ref_${data_out}_${A} = ${ctxtBuffer_A};
${B_type.typeName} ref_${data_out}_${B} = ${ctxtBuffer_B};
${C_type.typeName} ref_${data_out}_${C} = ${ctxtBuffer_C};
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

for(uint32_t i=0;i<${batch};i++){
    Gemm_parallel_s${A_type.referencedType.typeWidth}(
        ref_${data_out}_${A},
        ref_${data_out}_${B},
        ref_${data_out}_${C},
        ref_${data_out}_${data_out},
        ${M},
        ${N},
        ${O},
        ${alpha},
        ${beta},
        ${transA},
        ${transB},
        ${A_offset},
        ${B_offset},
        ${C_offset},
        ${Y_offset},
        core_id,
        numThreads
    );

    ref_${data_out}_${A} += ${M} * ${N};
    ref_${data_out}_${B} += ${N} * ${O};
    ref_${data_out}_${data_out} += ${M} * ${O};
}
mempool_barrier(numThreads);
""")

# diff --git a/Deeploy/Targets/MemPool/Templates/ITAMaxTemplate.py b/Deeploy/Targets/MemPool/Templates/ITAMaxTemplate.py
# new file mode 100644 index 0000000..81d3cb1 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/ITAMaxTemplate.py @@ -0,0 +1,68 @@
# ----------------------------------------------------------------------
#
# File: ITAMaxTemplate.py
#
# Last edited: 27.03.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Philip Wiese, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation

# Scratch-buffer multiplier used by hoistTransientBuffers.
# NOTE(review): presumably the number of per-core scratch slots / cores on
# the MemPool cluster — confirm against the ITAMax kernel implementation.
_BUFFER_FACTOR = 192


class _ITAMaxTemplate(NodeTemplate):
    """Node template for the MemPool parallel ITAMax (softmax) kernel."""

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """No alignment needed; returns the context and representation as-is.

        Fix: the return annotation previously claimed a 2-tuple although a
        3-tuple (ctxt, representation, hoisted names) is returned.
        """
        return ctxt, operatorRepresentation, []

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the kernel's scratch buffer.

        WIESEP: Hack: Allocate a buffer for each core.
        """
        size = operatorRepresentation['lastDimLength'] * _BUFFER_FACTOR
        name = operatorRepresentation['nodeName'] + "_buffer"
        ctxt.hoistTransientBuffer(name, size)
        operatorRepresentation['ctxtBuffer'] = ctxt._mangle(name)
        operatorRepresentation['ctxtBufferSize'] = size

        return ctxt, operatorRepresentation, [name]


MemPoolParallelTemplate = _ITAMaxTemplate("""
// ITAMax Parallel (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);

ITAMax_parallel_s${data_in_type.referencedType.typeWidth}(
    ${data_in},
    ${data_out},
    ${ctxtBuffer},
    ${size},
    ${lastDimLength},
    ${n_levels},
    core_id,
    numThreads
);
mempool_barrier(numThreads);
""")

# diff --git
a/Deeploy/Targets/MemPool/Templates/ITATemplate.py b/Deeploy/Targets/MemPool/Templates/ITATemplate.py new file mode 100644 index 0000000..3a42435 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/ITATemplate.py @@ -0,0 +1,384 @@ +# ---------------------------------------------------------------------- +# +# File: ITATemplate.py +# +# Last edited: 16.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, Tuple

import numpy as np

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t
from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
from Deeploy.Targets.MemPool.DataTypes import MemPoolStructDataTypes

# ITA Configuration
# Number of processing elements per ITA compute unit (used to interleave
# the Q projection weights below).
_ITA_PE = 16


def _transformITAInputs(ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation):
    """Re-layout the attention projection weights/biases for the ITA accelerator.

    Looks up the four projection weight and bias buffers (Q, K, V, O),
    disables their standalone deployment (they are re-fused into a single
    constant buffer by the calling template class), and returns per-head
    arrays transposed/reshaped into the layout ITA expects, plus zero dummy
    arrays for Q and K inputs.
    """
    wq_bias = ctxt.lookup(operatorRepresentation['wq_bias'])
    wk_bias = ctxt.lookup(operatorRepresentation['wk_bias'])
    wv_bias = ctxt.lookup(operatorRepresentation['wv_bias'])
    wo_bias = ctxt.lookup(operatorRepresentation['wo_bias'])
    wq_weight = ctxt.lookup(operatorRepresentation['wq_weight'])
    wk_weight = ctxt.lookup(operatorRepresentation['wk_weight'])
    wv_weight = ctxt.lookup(operatorRepresentation['wv_weight'])
    wo_weight = ctxt.lookup(operatorRepresentation['wo_weight'])

    # Disable buffers: the fused ITA input buffer created by the caller
    # replaces these individual constants.
    wq_bias._deploy = False
    wk_bias._deploy = False
    wv_bias._deploy = False
    wo_bias._deploy = False
    wq_weight._deploy = False
    wk_weight._deploy = False
    wv_weight._deploy = False
    wo_weight._deploy = False

    # S = sequence length, P = head ("projection") dimension.
    operatorRepresentation['S'] = operatorRepresentation['dim']
    operatorRepresentation['P'] = operatorRepresentation['dim_head']

    N = operatorRepresentation['heads']
    S = operatorRepresentation['S']
    E = operatorRepresentation['E']
    P = operatorRepresentation['P']

    # Extract values and transform them to layout required by ITA
    wq_bias_ita = wq_bias.values.reshape(N, 1, P)

    wk_bias_ita = wk_bias.values.reshape(N, 1, P)

    wv_bias_ita = wv_bias.values.reshape(N, 1, P)

    wo_bias_ita = wo_bias.values.reshape(N, 1, E)

    # Q weights: per head, transpose (E,P)->(P,E) and interleave in
    # E/_ITA_PE column groups. NOTE(review): layout matches the ITA PE
    # tiling — confirm against the accelerator documentation.
    wq_weight_ita = wq_weight.values.reshape(N, E, P)
    wq_weight_ita = np.concatenate(
        [np.concatenate(np.split(np.transpose(wq_weight_ita[i]), E // _ITA_PE, axis = 1)) for i in range(N)])
    wq_weight_ita = np.reshape(wq_weight_ita, (N, P, E))

    # K/V weights: plain per-head transpose (E,P)->(P,E).
    wk_weight_ita = wk_weight.values.reshape(N, E, P)
    wk_weight_ita = np.concatenate([np.transpose(wk_weight_ita[i]) for i in range(N)])
    wk_weight_ita = np.reshape(wk_weight_ita, (N, P, E))

    wv_weight_ita = wv_weight.values.reshape(N, E, P)
    wv_weight_ita = np.concatenate([np.transpose(wv_weight_ita[i]) for i in range(N)])
    wv_weight_ita = np.reshape(wv_weight_ita, (N, P, E))

    # O weights: per-head transpose (P,E)->(E,P).
    wo_weight_ita = wo_weight.values.reshape(N, P, E)
    wo_weight_ita = np.concatenate([np.transpose(wo_weight_ita[i]) for i in range(N)])
    wo_weight_ita = np.reshape(wo_weight_ita, (N, E, P))

    # Dummy (zero) placeholders for the runtime Q and K inputs; they reserve
    # space in the fused constant buffer layout.
    q_ita = np.zeros((N, S, E))
    k_ita = np.zeros((N, S, E))

    return ctxt, operatorRepresentation, wq_bias_ita, wk_bias_ita, wv_bias_ita, wo_bias_ita, wq_weight_ita, wk_weight_ita, wv_weight_ita, wo_weight_ita, q_ita, k_ita


class _1HSATemplate(NodeTemplate):
    """Template for single-head self-attention on the ITA accelerator.

    Fuses all transformed weights/biases into one global constant buffer and
    hoists requantization parameter buffers plus an ita_quant_t struct.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    # NOTE(review): returns a 3-tuple (ctxt, representation, hoisted names);
    # the annotation below is stale.
    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        nameList = []

        ctxt, operatorRepresentation, wq_bias_ita, wk_bias_ita, wv_bias_ita, wo_bias_ita, wq_weight_ita, wk_weight_ita, wv_weight_ita, wo_weight_ita, q_ita, k_ita = _transformITAInputs(
            ctxt, operatorRepresentation)

        data_out = ctxt.lookup(operatorRepresentation['data_out'])
        q = ctxt.lookup(operatorRepresentation['q'])
        k = ctxt.lookup(operatorRepresentation['k'])

        nodeName = operatorRepresentation['nodeName']

        # Fuse all inputs together and store in L2. The concatenation order
        # below defines the memory layout the ITA kernel expects; 32-bit
        # biases are reinterpreted as int8 so everything shares one buffer.
        wo_weight_ita = np.reshape(wo_weight_ita, (-1,))
        wv_weight_ita = np.reshape(wv_weight_ita, (-1,))
        wk_weight_ita = np.reshape(wk_weight_ita, (-1,))
        q_ita = np.reshape(q_ita, (-1,))
        k_ita = np.reshape(k_ita, (-1,))
        wq_weight_ita = np.reshape(wq_weight_ita, (-1,))
        wo_bias_ita = np.reshape(wo_bias_ita.astype(np.int32), (-1,)).view(np.int8)
        wv_bias_ita = np.reshape(wv_bias_ita.astype(np.int32), (-1,)).view(np.int8)
        wk_bias_ita = np.reshape(wk_bias_ita.astype(np.int32), (-1,)).view(np.int8)
        wq_bias_ita = np.reshape(wq_bias_ita.astype(np.int32), (-1,)).view(np.int8)

        data = np.concatenate([
            wo_weight_ita,
            wv_weight_ita,
            wk_weight_ita,
            q_ita,
            k_ita,
            wq_weight_ita,
            wo_bias_ita,
            wv_bias_ita,
            wk_bias_ita,
            wq_bias_ita,
        ])

        data_in = ctxt.ConstantBuffer(name = f'{nodeName}_input', shape = data.shape, values = data)
        ctxt.add(data_in, 'global')
        data_in._type = PointerClass(int8_t)
        operatorRepresentation['data_in'] = data_in.name
        nameList += [data_in.name]

        # Requantization multipliers for the six ITA stages (Q, K, pre-attn,
        # V, post-attn, O); padded to 8 entries.
        requant_mult_data = np.array([
            int(operatorRepresentation['wq_requant_mul']),
            int(operatorRepresentation['wk_requant_mul']),
            int(operatorRepresentation['preattn_requant_mul']),
            int(operatorRepresentation['wv_requant_mul']),
            int(operatorRepresentation['postattn_requant_mul']),
            int(operatorRepresentation['wo_requant_mul']),
            0,
            0,
        ])
        requant_mult = ctxt.ConstantBuffer(name = f'{nodeName}_requant_mult',
                                           shape = requant_mult_data.shape,
                                           values = requant_mult_data)
        ctxt.add(requant_mult, 'global')
        requant_mult._type = PointerClass(uint8_t)
        operatorRepresentation['requant_mult'] = requant_mult.name
        nameList += [requant_mult.name]

        # Divisions are converted to right-shift amounts; requires the
        # requant divisors to be exact powers of two.
        requant_shift_data = np.array([
            int(np.log2(operatorRepresentation['wq_requant_div'])),
            int(np.log2(operatorRepresentation['wk_requant_div'])),
            int(np.log2(operatorRepresentation['preattn_requant_div'])),
            int(np.log2(operatorRepresentation['wv_requant_div'])),
            int(np.log2(operatorRepresentation['postattn_requant_div'])),
            int(np.log2(operatorRepresentation['wo_requant_div'])),
            0,
            0,
        ])
        requant_shift = ctxt.ConstantBuffer(name = f'{nodeName}_requant_shift',
                                            shape = requant_shift_data.shape,
                                            values = requant_shift_data)
        ctxt.add(requant_shift, 'global')
        requant_shift._type = PointerClass(uint8_t)
        operatorRepresentation['requant_shift'] = requant_shift.name
        nameList += [requant_shift.name]

        requant_add_data = np.array([
            int(operatorRepresentation['wq_requant_add']),
            int(operatorRepresentation['wk_requant_add']),
            int(operatorRepresentation['preattn_requant_add']),
            int(operatorRepresentation['wv_requant_add']),
            int(operatorRepresentation['postattn_requant_add']),
            int(operatorRepresentation['wo_requant_add']),
            0,
            0,
        ])
        requant_add = ctxt.ConstantBuffer(name = f'{nodeName}_requant_add',
                                          shape = requant_add_data.shape,
                                          values = requant_add_data)
        ctxt.add(requant_add, 'global')
        requant_add._type = PointerClass(int32_t)
        operatorRepresentation['requant_add'] = requant_add.name
        nameList += [requant_add.name]

        # Bundle the three requant buffers into the ita_quant_t struct the
        # kernel takes by pointer.
        quant_dict = {
            'eps_mult': operatorRepresentation['requant_mult'],
            'right_shift': operatorRepresentation['requant_shift'],
            'add': operatorRepresentation['requant_add']
        }
        nameList += [ctxt.hoistStruct(quant_dict, f'{nodeName}_quant_param', MemPoolStructDataTypes.ita_quant_t)]
        operatorRepresentation['quant_param'] = ctxt.lookup(f'{nodeName}_quant_param').name

        # Zero-point offsets; q/k are assumed to carry _signed/nLevels,
        # while the output is guarded.
        operatorRepresentation['q_offset'] = (q._signed == 0) * int(q.nLevels // 2)
        operatorRepresentation['k_offset'] = (k._signed == 0) * int(k.nLevels // 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2)

        return ctxt, operatorRepresentation, nameList


class _MHSATemplate(NodeTemplate):
    """Template for multi-head self-attention on the ITA accelerator.

    Same fusion as _1HSATemplate but per head: one fused constant buffer and
    one ita_quant_t struct per head, referenced from C arrays declared in
    the rendered template.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    # NOTE(review): returns a 3-tuple (ctxt, representation, hoisted names);
    # the annotation below is stale.
    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        nameList = []

        ctxt, operatorRepresentation, wq_bias_ita, wk_bias_ita, wv_bias_ita, wo_bias_ita, wq_weight_ita, wk_weight_ita, wv_weight_ita, wo_weight_ita, q_ita, k_ita = _transformITAInputs(
            ctxt, operatorRepresentation)

        data_out = ctxt.lookup(operatorRepresentation['data_out'])
        q = ctxt.lookup(operatorRepresentation['q'])
        k = ctxt.lookup(operatorRepresentation['k'])

        nodeName = operatorRepresentation['nodeName']
        N = operatorRepresentation['heads']

        data_in = N * [None]
        requant_mult = N * [None]
        requant_shift = N * [None]
        requant_add = N * [None]

        for h in range(N):
            # Create dummy array for key and values

            # Fuse all inputs together and store in L2 (same layout as the
            # single-head case, restricted to head h).
            data = np.concatenate([
                np.reshape(wo_weight_ita[h], -1),
                np.reshape(wv_weight_ita[h], -1),
                np.reshape(wk_weight_ita[h], -1),
                np.reshape(q_ita[h], -1),
                np.reshape(k_ita[h], -1),
                np.reshape(wq_weight_ita[h], -1),
                np.reshape(wo_bias_ita[h].astype(np.int32), (-1,)).view(np.int8),
                np.reshape(wv_bias_ita[h].astype(np.int32), (-1,)).view(np.int8),
                np.reshape(wk_bias_ita[h].astype(np.int32), (-1,)).view(np.int8),
                np.reshape(wq_bias_ita[h].astype(np.int32), (-1,)).view(np.int8),
            ])

            data_in[h] = ctxt.ConstantBuffer(name = f'{nodeName}_input_head{h}', shape = data.shape, values = data)
            ctxt.add(data_in[h], 'global')
            data_in[h]._type = PointerClass(int8_t)
            operatorRepresentation[f'data_in_head{h}'] = data_in[h].name
            nameList += [data_in[h].name]

            # Per-head requant parameters: here the requant entries are
            # indexed per head (lists/arrays), unlike the scalar single-head
            # variant.
            requant_mult_data = np.array([
                operatorRepresentation['wq_requant_mul'][h], operatorRepresentation['wk_requant_mul'][h],
                operatorRepresentation['preattn_requant_mul'][h], operatorRepresentation['wv_requant_mul'][h],
                operatorRepresentation['postattn_requant_mul'][h], operatorRepresentation['wo_requant_mul'][h], 0, 0
            ])
            requant_mult[h] = ctxt.ConstantBuffer(name = f'{nodeName}_requant_mult_head{h}',
                                                  shape = requant_mult_data.shape,
                                                  values = requant_mult_data)
            ctxt.add(requant_mult[h], 'global')
            requant_mult[h]._type = PointerClass(uint8_t)
            operatorRepresentation[f'requant_mult_head{h}'] = requant_mult[h].name
            nameList += [requant_mult[h].name]

            # Divisors must be exact powers of two (converted to shifts).
            requant_shift_data = np.array([
                int(np.log2(operatorRepresentation['wq_requant_div'][h])),
                int(np.log2(operatorRepresentation['wk_requant_div'][h])),
                int(np.log2(operatorRepresentation['preattn_requant_div'][h])),
                int(np.log2(operatorRepresentation['wv_requant_div'][h])),
                int(np.log2(operatorRepresentation['postattn_requant_div'][h])),
                int(np.log2(operatorRepresentation['wo_requant_div'][h])), 0, 0
            ])
            requant_shift[h] = ctxt.ConstantBuffer(name = f'{nodeName}_requant_shift_head{h}',
                                                   shape = requant_shift_data.shape,
                                                   values = requant_shift_data)
            ctxt.add(requant_shift[h], 'global')
            requant_shift[h]._type = PointerClass(uint8_t)
            operatorRepresentation[f'requant_shift_head{h}'] = requant_shift[h].name
            nameList += [requant_shift[h].name]

            requant_add_data = np.array([
                operatorRepresentation['wq_requant_add'][h], operatorRepresentation['wk_requant_add'][h],
                operatorRepresentation['preattn_requant_add'][h], operatorRepresentation['wv_requant_add'][h],
                operatorRepresentation['postattn_requant_add'][h], operatorRepresentation['wo_requant_add'][h], 0, 0
            ])
            requant_add[h] = ctxt.ConstantBuffer(name = f'{nodeName}_requant_add_head{h}',
                                                 shape = requant_add_data.shape,
                                                 values = requant_add_data)
            ctxt.add(requant_add[h], 'global')
            requant_add[h]._type = PointerClass(int32_t)
            operatorRepresentation[f'requant_add_head{h}'] = requant_add[h].name
            nameList += [requant_add[h].name]

            quant_dict = {
                'eps_mult': operatorRepresentation[f'requant_mult_head{h}'],
                'right_shift': operatorRepresentation[f'requant_shift_head{h}'],
                'add': operatorRepresentation[f'requant_add_head{h}']
            }

            nameList += [
                ctxt.hoistStruct(quant_dict, f'{nodeName}_quant_params_head{h}', MemPoolStructDataTypes.ita_quant_t)
            ]
            operatorRepresentation[f'quant_params_head{h}'] = f'{nodeName}_quant_params_head{h}'

        operatorRepresentation['q_offset'] = (q._signed == 0) * int(q.nLevels // 2)
        operatorRepresentation['k_offset'] = (k._signed == 0) * int(k.nLevels // 2)
        operatorRepresentation['output_offset'] = 0
        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
            operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2)

        # Mangled identifiers for the C arrays declared in the template body.
        operatorRepresentation['data_in_array'] = ctxt._mangle(operatorRepresentation['nodeName'] + f"_data_in_array")
        operatorRepresentation['quant_params_array'] = ctxt._mangle(operatorRepresentation['nodeName'] +
                                                                    f"_quant_params_array")

        return ctxt, operatorRepresentation, nameList


MemPoolParallelTemplate_1H = _1HSATemplate("""
// ITA M1HSA (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);
M1HSA_s8_ITA(
    ${q}, ${k}, ${data_in},
    ${S}, ${E}, ${P},
    &${quant_param},
    ${data_out},
    ${q_offset}, ${k_offset}, ${output_offset},
    core_id,
    numThreads
);
mempool_barrier(numThreads);
""")

MemPoolParallelTemplate_2H = _MHSATemplate("""
// ITA M2HSA (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);

int8_t *${data_in_array}[] = { ${data_in_head0}, ${data_in_head1} };
ita_quant_t const *${quant_params_array}[] = { &${quant_params_head0}, &${quant_params_head1}};

M2HSA_s8_ITA(
    ${q}, ${k}, ${data_in_array},
    ${S}, ${E}, ${P},
    ${quant_params_array},
    ${data_out},
    ${q_offset}, ${k_offset}, ${output_offset},
    core_id,
    numThreads
);
mempool_barrier(numThreads);
""")

MemPoolParallelTemplate_4H = _MHSATemplate("""
// ITA M4HSA (Name: ${nodeName}, Op: ${nodeOp})
mempool_barrier(numThreads);

int8_t *${data_in_array}[] = { ${data_in_head0}, ${data_in_head1}, ${data_in_head2}, ${data_in_head3} };
ita_quant_t const *${quant_params_array}[] = { &${quant_params_head0}, &${quant_params_head1}, &${quant_params_head2}, &${quant_params_head3}};

M4HSA_s8_ITA(
    ${q}, ${k}, ${data_in_array},
    ${S}, ${E}, ${P},
    ${quant_params_array},
    ${data_out},
    ${q_offset}, ${k_offset}, ${output_offset},
    core_id,
    numThreads
);
mempool_barrier(numThreads);
""")

# diff --git a/Deeploy/Targets/MemPool/Templates/MHSATemplate.py
b/Deeploy/Targets/MemPool/Templates/MHSATemplate.py new file mode 100644 index 0000000..726d184 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/MHSATemplate.py @@ -0,0 +1,241 @@ +# ---------------------------------------------------------------------- +# +# File: MHSATemplate.py +# +# Last edited: 30.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import int8_t +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + +# ITA Configuration +_SPLIT = 4 +_ITA_S = 64 +_ITA_E = 64 +_ITA_P = 64 + + +class _MHSATemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: + nameList = [] + + nodeName = operatorRepresentation['nodeName'] + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + wq_bias = ctxt.lookup(operatorRepresentation['wq_bias']) + wk_bias = ctxt.lookup(operatorRepresentation['wk_bias']) + wv_bias = ctxt.lookup(operatorRepresentation['wv_bias']) + wo_bias = ctxt.lookup(operatorRepresentation['wo_bias']) + wq_weight = ctxt.lookup(operatorRepresentation['wq_weight']) + wk_weight = ctxt.lookup(operatorRepresentation['wk_weight']) + wv_weight = ctxt.lookup(operatorRepresentation['wv_weight']) + wo_weight = ctxt.lookup(operatorRepresentation['wo_weight']) + q = ctxt.lookup(operatorRepresentation['q']) + k = ctxt.lookup(operatorRepresentation['k']) + + # Disable buffers + wq_bias._deploy = False + wk_bias._deploy = False + wv_bias._deploy = False + wo_bias._deploy = False + wq_weight._deploy = False + wk_weight._deploy = False + wv_weight._deploy = False + wo_weight._deploy = False + + operatorRepresentation['S'] = operatorRepresentation['dim'] + operatorRepresentation['P'] = operatorRepresentation['dim_head'] + + N = operatorRepresentation['heads'] + S = operatorRepresentation['S'] + E = operatorRepresentation['E'] + P = operatorRepresentation['P'] + + PAD_S = _ITA_S - S + PAD_E = _ITA_E - E + PAD_P = _ITA_P - P + + # Extract values and transform them to layout required by ITA + wq_bias_ita = wq_bias.values.reshape(N, S, P) + wq_bias_ita = 
np.pad(wq_bias_ita, ((0, 0), (0, PAD_S), (0, PAD_P))) + wq_bias_ita = np.reshape(np.split(wq_bias_ita, _SPLIT, axis = 2), (N, _ITA_S, _ITA_P)) + + wk_bias_ita = wk_bias.values.reshape(N, S, P) + wk_bias_ita = np.pad(wk_bias_ita, ((0, 0), (0, PAD_S), (0, PAD_P))) + wk_bias_ita = np.reshape(np.split(wk_bias_ita, _SPLIT, axis = 2), (N, _ITA_S, _ITA_P)) + + wv_bias_ita = wv_bias.values.reshape(N, S, P) + wv_bias_ita = np.pad(wv_bias_ita, ((0, 0), (0, PAD_S), (0, PAD_P))) + wv_bias_ita = np.reshape(np.split(np.reshape(np.transpose(wv_bias_ita), (N, _ITA_P, _ITA_S)), _SPLIT, axis = 2), + (N, _ITA_P, _ITA_S)) + + wo_bias_ita = wo_bias.values.reshape(N, S, E) + wo_bias_ita = np.pad(wo_bias_ita, ((0, 0), (0, PAD_S), (0, PAD_E))) + wo_bias_ita = np.reshape(np.split(wo_bias_ita, _SPLIT, axis = 2), (N, _ITA_S, _ITA_E)) + + wq_weight_ita = wq_weight.values.reshape(N, E, P) + wq_weight_ita = np.pad(wq_weight_ita, ((0, 0), (0, PAD_E), (0, PAD_P))) + wq_weight_ita = np.concatenate( + [np.concatenate(np.split(np.transpose(wq_weight_ita[i]), _SPLIT, axis = 1)) for i in range(N)]) + wq_weight_ita = np.reshape(wq_weight_ita, (N, _ITA_P, _ITA_E)) + + wk_weight_ita = wk_weight.values.reshape(N, E, P) + wk_weight_ita = np.pad(wk_weight_ita, ((0, 0), (0, PAD_E), (0, PAD_P))) + wk_weight_ita = np.concatenate([np.transpose(wk_weight_ita[i]) for i in range(N)]) + wk_weight_ita = np.reshape(wk_weight_ita, (N, _ITA_P, _ITA_E)) + + wv_weight_ita = wv_weight.values.reshape(N, E, P) + wv_weight_ita = np.pad(wv_weight_ita, ((0, 0), (0, PAD_E), (0, PAD_P))) + wv_weight_ita = np.concatenate([np.transpose(wv_weight_ita[i]) for i in range(N)]) + wv_weight_ita = np.reshape(wv_weight_ita, (N, _ITA_P, _ITA_E)) + + wo_weight_ita = wo_weight.values.reshape(N, P, E) + wo_weight_ita = np.pad(wo_weight_ita, ((0, 0), (0, PAD_P), (0, PAD_E))) + wo_weight_ita = np.concatenate([np.transpose(wo_weight_ita[i]) for i in range(N)]) + wo_weight_ita = np.reshape(wo_weight_ita, (N, _ITA_E, _ITA_P)) + + # Create dummy 
array for key and values + q_ita = np.zeros((1, _ITA_S, _ITA_E)) + k_ita = np.zeros((1, _ITA_S, _ITA_E)) + + # Fuse all inputs together and store in L2 + data = np.stack([ + wo_weight_ita, + wv_weight_ita, + wk_weight_ita, + q_ita, + k_ita, + wq_weight_ita, + wo_bias_ita, + wv_bias_ita, + wk_bias_ita, + wq_bias_ita, + ]) + + data_in = ctxt.ConstantBuffer(name = f'{nodeName}_input', shape = data.shape, values = data) + ctxt.add(data_in, 'global') + data_in._type = PointerClass(int8_t) + operatorRepresentation['data_in'] = data_in.name + nameList += [data_in.name] + + requant_mult_data = np.array([ + operatorRepresentation['wq_requant_mul'], + operatorRepresentation['wk_requant_mul'], + operatorRepresentation['preattn_requant_mul'], + operatorRepresentation['wv_requant_mul'], + operatorRepresentation['postattn_requant_mul'], + operatorRepresentation['wo_requant_mul'], + 0, + 0, + ]) + requant_mult = ctxt.ConstantBuffer(name = f'{nodeName}_requant_mult', + shape = requant_mult_data.shape, + values = requant_mult_data) + ctxt.add(requant_mult, 'global') + requant_mult._type = PointerClass(int8_t) + operatorRepresentation['requant_mult'] = requant_mult.name + nameList += [requant_mult.name] + + requant_shift_data = np.array([ + int(np.log2(operatorRepresentation['wq_requant_div'])), + int(np.log2(operatorRepresentation['wk_requant_div'])), + int(np.log2(operatorRepresentation['preattn_requant_div'])), + int(np.log2(operatorRepresentation['wv_requant_div'])), + int(np.log2(operatorRepresentation['postattn_requant_div'])), + int(np.log2(operatorRepresentation['wo_requant_div'])), + 0, + 0, + ]) + requant_shift = ctxt.ConstantBuffer(name = f'{nodeName}_requant_shift', + shape = requant_shift_data.shape, + values = requant_shift_data) + ctxt.add(requant_shift, 'global') + requant_shift._type = PointerClass(int8_t) + operatorRepresentation['requant_shift'] = requant_shift.name + nameList += [requant_shift.name] + + requant_add_data = np.array([ + 
operatorRepresentation['wq_requant_add'], + operatorRepresentation['wk_requant_add'], + operatorRepresentation['preattn_requant_add'], + operatorRepresentation['wv_requant_add'], + operatorRepresentation['postattn_requant_add'], + operatorRepresentation['wo_requant_add'], + 0, + 0, + ]) + requant_add = ctxt.ConstantBuffer(name = f'{nodeName}_requant_add', + shape = requant_add_data.shape, + values = requant_add_data) + ctxt.add(requant_add, 'global') + requant_add._type = PointerClass(int8_t) + operatorRepresentation['requant_add'] = requant_add.name + nameList += [requant_add.name] + + operatorRepresentation['q_offset'] = (q._signed == 0) * int(q.nLevels // 2) + operatorRepresentation['k_offset'] = (k._signed == 0) * int(k.nLevels // 2) + operatorRepresentation['output_offset'] = 0 + if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2) + + # import IPython; IPython.embed() + return ctxt, operatorRepresentation, nameList + + +MemPoolParallelTemplate = _MHSATemplate(""" +// ITA MHSA (Name: ${nodeName}, Op: ${nodeOp}) +mempool_barrier(numThreads); + +<% + ctxt = locals()['pageargs'] + data_in_strings = ", ".join([ctxt[f"data_in_head{h}"] for h in range(heads)]) + requant_mult_strings = ", ".join([ctxt[f"requant_mult_head{h}"] for h in range(heads)]) + requant_shift_strings = ", ".join([ctxt[f"requant_shift_head{h}"] for h in range(heads)]) + requant_add_strings = ", ".join([ctxt[f"requant_add_head{h}"] for h in range(heads)]) +%> +int8_t *data_in_array[] = { ${data_in_strings} }; +uint8_t const *requant_mult_array[] = { ${requant_mult_strings} }; +uint8_t const *requant_shift_array[] = { ${requant_shift_strings} }; +int8_t const *requant_add_array[] = { ${requant_add_strings} }; + +MHSA_s8_ITA( + ${q}, ${k}, data_in_array, + ${S}, ${E}, + requant_mult_array, + requant_shift_array, + requant_add_array, + ${data_out}, + ${q_offset}, ${k_offset}, 
${output_offset}, + core_id, + numThreads +); +mempool_barrier(numThreads); +""") diff --git a/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py b/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py new file mode 100644 index 0000000..400d556 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py @@ -0,0 +1,75 @@ +# ---------------------------------------------------------------------- +# +# File: MatMulTemplate.py +# +# Last edited: 13.11.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MatMulTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + operatorRepresentation['offset_output'] = -(data_out._signed == 0) * int(data_out.nLevels / 2) + + # import ipdb; ipdb.set_trace() + return ctxt, operatorRepresentation, [] + + +MemPoolParallelTemplate = _MatMulTemplate(""" +// MatMul Parallel (Name: ${nodeName}, Op: ${nodeOp}) +mempool_barrier(numThreads); +${A_type.typeName} ref_${data_out}_${A} = ${A}; +${B_type.typeName} ref_${data_out}_${B} = ${B}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for(uint32_t i=0;i<${batch};i++){ + MatMul_parallel_s${A_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${A_offset}, ${B_offset}, ${offset_output}, + core_id, + numThreads + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; +} +mempool_barrier(numThreads); +""") diff --git a/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py b/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py new file mode 100644 index 0000000..1665a6a --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py @@ -0,0 +1,76 @@ +# ---------------------------------------------------------------------- +# +# File: MaxPoolTemplate.py +# +# Last edited: 13.12.2022 
+# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MaxPool2DTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + operatorRepresentation['input_offset'] = 0 + if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels // 2) + operatorRepresentation['output_offset'] = 0 + if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2) + + # import IPython; IPython.embed() + return ctxt, operatorRepresentation, [] + + +MemPoolParallelTemplate = _MaxPool2DTemplate(""" +<% +batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y +%> + +// 2D MaxPool Parallel (Name: ${nodeName}, Op: 
${nodeOp}) +mempool_barrier(numThreads); +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + MaxPool2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW( + ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, + ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}, + core_id, + numThreads + ); + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; +} +mempool_barrier(numThreads); +""") diff --git a/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py new file mode 100644 index 0000000..b336af2 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py @@ -0,0 +1,217 @@ +# ---------------------------------------------------------------------- +# +# File: RQGemmTemplate.py +# +# Last edited: 17.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, OperatorRepresentation + + +class _RQGemmTemplate(NodeTemplate, OperatorRepresentation): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['C']) + Y = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2) + operatorRepresentation['Y_offset'] = -(Y._signed == 0) * int(Y.nLevels / 2) + + operatorRepresentation['output_min'] = -(operatorRepresentation['n_levels'] // 2) + operatorRepresentation['output_max'] = (operatorRepresentation['n_levels'] // 2) - 1 + + MUL = ctxt.lookup(operatorRepresentation['mul']) + # WIESEP: Per element and per column quantization is not supported for RQGemm + + if len(MUL.shape) == 1: + operatorRepresentation['perRowQuant'] = 0 + else: + operatorRepresentation['perRowQuant'] = int(MUL.shape[-2] != 1) + + return ctxt, operatorRepresentation, [] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Allocate buffer in L1 if original data lives in L2 to speed up the calculation, + # by first transferring it to L2 with the DMA. 
+ + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['C']) + + names = [] + size = operatorRepresentation['M'] * operatorRepresentation['N'] * (A._type.referencedType.typeWidth // 8) + name = operatorRepresentation['nodeName'] + f"_buffer_A" + operatorRepresentation['ctxtBuffer_A_size'] = size + if isinstance(A, ConstantBuffer): + names += [name] + ctxt.hoistTransientBuffer(name, size) + operatorRepresentation['ctxtBuffer_A'] = ctxt._mangle(name) + else: + operatorRepresentation['ctxtBuffer_A'] = operatorRepresentation['A'] + + size = operatorRepresentation['N'] * operatorRepresentation['O'] * (B._type.referencedType.typeWidth // 8) + name = operatorRepresentation['nodeName'] + f"_buffer_B" + operatorRepresentation['ctxtBuffer_B_size'] = size + if isinstance(B, ConstantBuffer): + names += [name] + ctxt.hoistTransientBuffer(name, size) + operatorRepresentation['ctxtBuffer_B'] = ctxt._mangle(name) + else: + operatorRepresentation['ctxtBuffer_B'] = operatorRepresentation['B'] + + size = operatorRepresentation['M'] * operatorRepresentation['O'] * (C._type.referencedType.typeWidth // 8) + name = operatorRepresentation['nodeName'] + f"_buffer_C" + operatorRepresentation['ctxtBuffer_C_size'] = size + if isinstance(C, ConstantBuffer): + names += [name] + ctxt.hoistTransientBuffer(name, size) + operatorRepresentation['ctxtBuffer_C'] = ctxt._mangle(name) + else: + operatorRepresentation['ctxtBuffer_C'] = operatorRepresentation['C'] + + return ctxt, operatorRepresentation, names + + +MemPoolParallelTemplate = _RQGemmTemplate(""" +<% +if isinstance(log2D, int): + log2Dstring = log2D +else: + log2Dstring = "*"+log2D +%> + +// RQGEMM Parallel (Name: ${nodeName}, Op: ${nodeOp}) +mempool_barrier(numThreads); + +%if ctxtBuffer_A != A: +// Fast copy data from L2 to L1 +BEGIN_SINGLE_CORE + #if USE_DMA + dma_memcpy_blocking(${ctxtBuffer_A}, ${A}, ${ctxtBuffer_A_size}); + #else + memcpy(${ctxtBuffer_A}, 
${A}, ${ctxtBuffer_A_size}); + #endif +END_SINGLE_CORE +%endif + +%if ctxtBuffer_B != B: +// Fast copy data from L2 to L1 +BEGIN_SINGLE_CORE + #if USE_DMA + dma_memcpy_blocking(${ctxtBuffer_B}, ${B}, ${ctxtBuffer_B_size}); + #else + memcpy(${ctxtBuffer_B}, ${B}, ${ctxtBuffer_B_size}); + #endif +END_SINGLE_CORE +%endif + +%if ctxtBuffer_C != C: +// Fast copy data from L2 to L1 +BEGIN_SINGLE_CORE + #if USE_DMA + dma_memcpy_blocking(${ctxtBuffer_C}, ${C}, ${ctxtBuffer_C_size}); + #else + memcpy(${ctxtBuffer_C}, ${C}, ${ctxtBuffer_C_size}); + #endif +END_SINGLE_CORE +%endif + +%if ctxtBuffer_A != A or ctxtBuffer_B != B or ctxtBuffer_C != C: + mempool_barrier(numThreads); +%endif + +${A_type.typeName} ref_${data_out}_${A} = ${ctxtBuffer_A}; +${B_type.typeName} ref_${data_out}_${B} = ${ctxtBuffer_B}; +${C_type.typeName} ref_${data_out}_${C} = ${ctxtBuffer_C}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for(uint32_t i=0;i<${batch};i++){ +%if M%4==0 and N%4==0 and O%4==0: + RQGemm_offset_unrolled_2x2_parallel_s${A_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${C}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${alpha}, + ${beta}, + ${transA}, + ${transB}, + ${mul}, + ${add}, + ${log2Dstring}, + 1, + ${perRowQuant}, + ${A_offset}, + ${B_offset}, + ${C_offset}, + ${Y_offset}, + core_id, + numThreads + ); +%else: + RQGemm_parallel_s${A_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${C}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${alpha}, + ${beta}, + ${transA}, + ${transB}, + ${mul}, + ${add}, + ${log2Dstring}, + 1, + ${perRowQuant}, + ${A_offset}, + ${B_offset}, + ${C_offset}, + ${Y_offset}, + ${output_min}, + ${output_max}, + core_id, + numThreads + ); +%endif + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; +} +mempool_barrier(numThreads); 
+""") diff --git a/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py new file mode 100644 index 0000000..d8165c6 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py @@ -0,0 +1,206 @@ +# ---------------------------------------------------------------------- +# +# File: RQMatMulTemplate.py +# +# Last edited: 02.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, OperatorRepresentation + + +class _RQMatMulTemplate(NodeTemplate, OperatorRepresentation): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + operatorRepresentation['offset_output'] = -(data_out._signed == 0) * int(data_out.nLevels / 2) + + operatorRepresentation['output_min'] = -(operatorRepresentation['n_levels'] // 2) + operatorRepresentation['output_max'] = (operatorRepresentation['n_levels'] // 2) - 1 + + MUL = ctxt.lookup(operatorRepresentation['mul']) + # WIESEP: Per element quantization is not supported for RQMatMul + if len(MUL.shape) == 4: + operatorRepresentation['perChannelQuant'] = int(MUL.shape[1] != 1) + operatorRepresentation['perRowQuant'] = int(MUL.shape[2] != 1) + elif len(MUL.shape) == 3: + operatorRepresentation['perChannelQuant'] = int(MUL.shape[0] != 1) + operatorRepresentation['perRowQuant'] = int(MUL.shape[1] != 1) + elif len(MUL.shape) == 2: + operatorRepresentation['perChannelQuant'] = 0 + operatorRepresentation['perRowQuant'] = int(MUL.shape[0] != 1) + elif len(MUL.shape) == 1: + operatorRepresentation['perChannelQuant'] = 0 + operatorRepresentation['perRowQuant'] = 0 + + # import ipdb; ipdb.set_trace() + return ctxt, operatorRepresentation, [] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Allocate buffer in L1 if original data lives in L2 to speed up the 
calculation, + # by first transferring it to L2 with the DMA. + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + + names = [] + size = operatorRepresentation['M'] * operatorRepresentation['N'] + name = operatorRepresentation['nodeName'] + f"_buffer_A" + operatorRepresentation['ctxtBuffer_A_size'] = size + if isinstance(A, ConstantBuffer): + names += [name] + ctxt.hoistTransientBuffer(name, size) + operatorRepresentation['ctxtBuffer_A'] = ctxt._mangle(name) + else: + operatorRepresentation['ctxtBuffer_A'] = operatorRepresentation['A'] + + size = operatorRepresentation['N'] * operatorRepresentation['O'] + name = operatorRepresentation['nodeName'] + f"_buffer_B" + operatorRepresentation['ctxtBuffer_B_size'] = size + if isinstance(B, ConstantBuffer): + names += [name] + ctxt.hoistTransientBuffer(name, size) + operatorRepresentation['ctxtBuffer_B'] = ctxt._mangle(name) + else: + operatorRepresentation['ctxtBuffer_B'] = operatorRepresentation['B'] + + return ctxt, operatorRepresentation, names + + +MemPoolParallelTemplate = _RQMatMulTemplate(""" +<% +if isinstance(log2D, int): + log2Dstring = log2D +else: + log2Dstring = "*"+log2D +%> + +// RQMatMul Parallel (Name: ${nodeName}, Op: ${nodeOp}) +mempool_barrier(numThreads); + +%if ctxtBuffer_A != A: +// Fast copy data from L2 to L1 +BEGIN_SINGLE_CORE + #if USE_DMA + dma_memcpy_blocking(${ctxtBuffer_A}, ${A}, ${ctxtBuffer_A_size}); + #else + memcpy(${ctxtBuffer_A}, ${A}, ${ctxtBuffer_A_size}); + #endif +END_SINGLE_CORE +%endif + +%if ctxtBuffer_B != B: +// Fast copy data from L2 to L1 +BEGIN_SINGLE_CORE + #if USE_DMA + dma_memcpy_blocking(${ctxtBuffer_B}, ${B}, ${ctxtBuffer_B_size}); + #else + memcpy(${ctxtBuffer_B}, ${B}, ${ctxtBuffer_B_size}); + #endif +END_SINGLE_CORE +%endif + +%if ctxtBuffer_A != A or ctxtBuffer_B != B: + mempool_barrier(numThreads); +%endif + +${A_type.typeName} ref_${data_out}_${A} = ${ctxtBuffer_A}; +${B_type.typeName} ref_${data_out}_${B} = 
${ctxtBuffer_B}; +${mul_type.typeName} ref_${mul} = ${mul}; +${add_type.typeName} ref_${add} = ${add}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for(uint32_t i=0;i<${batch};i++){ +%if A_offset==0 and B_offset==0 and offset_output==0 and M%4==0 and N%4==0 and O%4==0: + RQMatMul_unrolled_2x2_parallel_s${A_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ref_${mul}, + ref_${add}, + ${log2Dstring}, + 1, + ${perRowQuant}, + core_id, + numThreads + ); +%elif M%4==0 and N%4==0 and O%4==0: + RQMatMul_offset_unrolled_2x2_parallel_s${A_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ref_${mul}, + ref_${add}, + ${log2Dstring}, + 1, + ${perRowQuant}, + ${A_offset}, ${B_offset}, ${offset_output}, + core_id, + numThreads + ); +%else: + RQMatMul_parallel_s${A_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ref_${mul}, + ref_${add}, + ${log2Dstring}, + 1, + ${perRowQuant}, + ${A_offset}, ${B_offset}, ${offset_output}, + ${output_min}, + ${output_max}, + core_id, + numThreads + ); +%endif + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; +%if perChannelQuant: + ++ref_${mul}; + ++ref_${add}; +%endif +} +mempool_barrier(numThreads); +""") diff --git a/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py b/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py new file mode 100644 index 0000000..bc1ca14 --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py @@ -0,0 +1,112 @@ +# ---------------------------------------------------------------------- +# +# File: RequantShiftTemplate.py +# +# Last edited: 24.04.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
+# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _RequantShiftTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + # operatorRepresentation['input_offset'] = (data_in._signed == 0) * operatorRepresentation['n_levels']//2 + # operatorRepresentation['output_offset'] = -(data_out._signed == 0) * operatorRepresentation['n_levels']//2 + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * operatorRepresentation['n_levels'] // 2 + + operatorRepresentation['output_min'] = -(operatorRepresentation['n_levels'] // 2) + operatorRepresentation['output_max'] = (operatorRepresentation['n_levels'] // 2) - 1 + + return ctxt, operatorRepresentation, [] + + +MemPoolParallelTemplate = _RequantShiftTemplate(""" +<% +if isinstance(log2D, int): + log2Dstring = log2D +else: + log2Dstring = "*"+log2D +%> + +// RequantShift 
Parallel (Name: ${nodeName}, Op: ${nodeOp}) +mempool_barrier(numThreads); +% if channels_first: + %if output_min==-128 and output_max==127 and data_in_type.referencedType.typeWidth==32 and data_out_type.referencedType.typeWidth==8 and size%4==0: + RequantShift_unrolled_1x4_parallel_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW( + ${data_in}, + ${size}, + ${mul}, + ${add}, + ${data_out}, + ${log2Dstring}, + ${channel_width}, + ${input_offset}, + ${output_offset}, + 1, + core_id, + numThreads); + %else: + RequantShift_parallel_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW( + ${data_in}, + ${size}, + ${mul}, + ${add}, + ${data_out}, + ${log2Dstring}, + ${channel_width}, + ${input_offset}, + ${output_offset}, + ${output_min}, + ${output_max}, + 1, + core_id, + numThreads); + %endif +% else: + RequantShift_parallel_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NHWC( + ${data_in}, + ${size}, + ${mul}, ${add}, + ${data_out}, + ${log2Dstring}, + ${channels}, + ${input_offset}, + ${output_offset}, + ${output_min}, + ${output_max}, + 1, + core_id, + numThreads + ); +%endif +mempool_barrier(numThreads); +""") diff --git a/Deeploy/Targets/MemPool/Templates/__init__.py b/Deeploy/Targets/MemPool/Templates/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/MemPool/Templates/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py new file mode 100644 index 0000000..56b2683 --- /dev/null +++ b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py @@ -0,0 +1,532 @@ +# ---------------------------------------------------------------------- +# +# File: MemPoolPasses.py +# +# Last edited: 13.11.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import partial +from typing import Dict, Union + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import BranchingMatcher, Match, NonBranchingMatcher +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic + + +def merge_matmul_rq_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + matmul = matched_nodes[0] + rqs = matched_nodes[1] + + # WIESEP: Per element quantization is not supported for RQMatMul + if len(rqs.inputs[2].shape) > 0 and rqs.inputs[2].shape[-1] != 1: + return graph + + _inputs = list(matmul.inputs) + list(rqs.inputs[2:]) + list(rqs.inputs[1:2]) + _outputs = rqs.outputs + + attrs = {**matmul.attrs, **rqs.attrs} + rqsMatMul = gs.Node(op = 'RQMatMul', name = name, attrs = attrs) + graph.replaceInsertNode(_inputs, _outputs, rqsMatMul) + + return graph + + +@contextagnostic +class MemPoolMatMulRequantMergePass(ReplaceSequentialPatternPass): + + def __init__(self): + passes = [] + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['matmul_out'], op = 'MatMul', name = 'matmul') + output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = f"_MERGE_MATMUL_RQ_PASS" + super().__init__(graph, merge_matmul_rq_fun, name) + + +def merge_gemm_rq_fun(graph: gs.Graph, match: Match, name: str): + matched_nodes = [m for k, m in match.nodes_map.items()] + gemm = matched_nodes[0] + rqs = matched_nodes[1] + + # WIESEP: Per element quantization is not supported for RQGemm + if len(rqs.inputs[2].shape) > 0 and rqs.inputs[2].shape[-1] != 1: + return graph + + # WIESEP: Per column quantization is not supported for RQGemm + if len(rqs.inputs[2].shape) > 2 and rqs.inputs[2].shape[-3] != 1: + return graph + + _inputs = 
list(gemm.inputs) + list(rqs.inputs[2:]) + list(rqs.inputs[1:2]) + _outputs = rqs.outputs + + attrs = {**gemm.attrs, **rqs.attrs} + rqsGemm = gs.Node(op = 'RQGemm', name = name, attrs = attrs) + graph.replaceInsertNode(_inputs, _outputs, rqsGemm) + + return graph + + +@contextagnostic +class MemPoolGEMMRequantMergePass(ReplaceSequentialPatternPass): + + def __init__(self): + passes = [] + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['matmul_out'], op = 'Gemm', name = 'gemm') + output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs') + graph.outputs.append(output) + graph.inputs.append(_input) + + name = f"_MERGE_GEMM_RQ_PASS" + super().__init__(graph, merge_gemm_rq_fun, name) + + +def _fuse_mhsa_fun(graph: gs.Graph, match: Match, name: str, batchedMatMul = False): + # matched_nodes = [m for k, m in match.nodes_map.items()] + + def get_named_node(nodes_map: Dict, name: str) -> Union[gs.Node, None]: + if name in nodes_map: + return nodes_map[name] + return None + + Projection_q = get_named_node(match.nodes_map, 'Projection_q') + Bias_Pq = get_named_node(match.nodes_map, 'Bias_Pq') + RequantShift_Pq = get_named_node(match.nodes_map, 'RequantShift_Pq') + Reshape_Pq = get_named_node(match.nodes_map, 'Reshape_Pq') + Transpose_Pq = get_named_node(match.nodes_map, 'Transpose_Pq') + Projection_k = get_named_node(match.nodes_map, 'Projection_k') + Bias_Pk = get_named_node(match.nodes_map, 'Bias_Pk') + RequantShift_Pk = get_named_node(match.nodes_map, 'RequantShift_Pk') + # Reshape_Pk = get_named_node(match.nodes_map, 'Reshape_Pk') + # Transpose_Pk = get_named_node(match.nodes_map, 'Transpose_Pk') + Projection_v = get_named_node(match.nodes_map, 'Projection_v') + Bias_Pv = get_named_node(match.nodes_map, 'Bias_Pv') + RequantShift_Pv = get_named_node(match.nodes_map, 'RequantShift_Pv') + # Reshape_Pv = get_named_node(match.nodes_map, 'Reshape_Pv') + Transpose_Pv = 
get_named_node(match.nodes_map, 'Transpose_Pv') + # MatMul_a = get_named_node(match.nodes_map, 'MatMul_a') + RequantShift_a = get_named_node(match.nodes_map, 'RequantShift_a') + IntegerDiv_a = get_named_node(match.nodes_map, 'IntegerDiv_a') + # Softmax_a = get_named_node(match.nodes_map, 'Softmax_a') + # MatMul_o = get_named_node(match.nodes_map, 'MatMul_o') + RequantShift_o = get_named_node(match.nodes_map, 'RequantShift_o') + + # Check if we accidentally swapped Q and K + if Transpose_Pq.attrs['perm'] != Transpose_Pv.attrs['perm']: + Projection_q = get_named_node(match.nodes_map, 'Projection_k') + Bias_Pq = get_named_node(match.nodes_map, 'Bias_Pk') + RequantShift_Pq = get_named_node(match.nodes_map, 'RequantShift_Pk') + Reshape_Pq = get_named_node(match.nodes_map, 'Reshape_Pk') + Transpose_Pq = get_named_node(match.nodes_map, 'Transpose_Pk') + Projection_k = get_named_node(match.nodes_map, 'Projection_q') + Bias_Pk = get_named_node(match.nodes_map, 'Bias_Pq') + RequantShift_Pk = get_named_node(match.nodes_map, 'RequantShift_Pq') + # Reshape_Pk = get_named_node(match.nodes_map, 'Reshape_Pq') + # Transpose_Pk = get_named_node(match.nodes_map, 'Transpose_Pq') + + assert Transpose_Pq.attrs['perm'] == Transpose_Pv.attrs[ + 'perm'], "[MemPoolFuseMHSAPass] MHSA key and value permutation is not the same!" 
+ + attrs = {} + H = Reshape_Pq.inputs[1].values[2] + attrs['heads'] = H + attrs['dim_head'] = Reshape_Pq.inputs[1].values[-1] # Projection Size + attrs['dim'] = Projection_q.inputs[0].shape[-2] # Sequence Length + attrs['S'] = Projection_q.inputs[0].shape[-2] # Sequence Length + attrs['E'] = Projection_q.inputs[1].shape[0] # Embedding Size + attrs['P'] = Reshape_Pq.inputs[1].values[-1] # Projection Size + + attrs['wq_requant_mul'] = np.broadcast_to(RequantShift_Pq.inputs[1].values.reshape(-1), [H]) + attrs['wk_requant_mul'] = np.broadcast_to(RequantShift_Pk.inputs[1].values.reshape(-1), [H]) + attrs['wv_requant_mul'] = np.broadcast_to(RequantShift_Pv.inputs[1].values.reshape(-1), [H]) + + # WIESEP: We also have to handle the integer div node! + if IntegerDiv_a is not None: + attrs['preattn_requant_mul'] = np.broadcast_to( + np.round(RequantShift_a.inputs[1].values.reshape(-1) / IntegerDiv_a.inputs[1].values.reshape(-1)), [H]) + else: + attrs['preattn_requant_mul'] = np.broadcast_to(RequantShift_a.inputs[1].values.reshape(-1), [H]) + + attrs['postattn_requant_mul'] = np.broadcast_to(RequantShift_o.inputs[1].values.reshape(-1), [H]) + + attrs['wq_requant_div'] = np.broadcast_to(RequantShift_Pq.attrs['div'].values.reshape(-1), [H]) + attrs['wk_requant_div'] = np.broadcast_to(RequantShift_Pk.attrs['div'].values.reshape(-1), [H]) + attrs['wv_requant_div'] = np.broadcast_to(RequantShift_Pv.attrs['div'].values.reshape(-1), [H]) + attrs['preattn_requant_div'] = np.broadcast_to(RequantShift_a.attrs['div'].values.reshape(-1), [H]) + attrs['postattn_requant_div'] = np.broadcast_to(RequantShift_o.attrs['div'].values.reshape(-1), [H]) + + _inputs = [] + _inputs.append(Projection_q.inputs[0]) + _inputs.append(Projection_k.inputs[0]) + _inputs.append(Projection_v.inputs[0]) + + def separate_heads(x: np.ndarray, heads: int, dim_head: int): + return np.transpose(np.reshape(x, (-1, heads, dim_head)), (1, 0, 2)) + + def get_constant_input(n: gs.Node): + for input in n.inputs: + if 
isinstance(input, gs.Constant): + return input.values + assert False, f"Did not find constant input for {n} node" + + def get_constant_input_or_zeros(n: gs.Node, shape): + if n is None: + return np.zeros(shape) + else: + return get_constant_input(n) + + # Transform from MUL-DIV-ADD to MUL-ADD-DIV + attrs['wq_requant_add'] = np.broadcast_to(RequantShift_Pq.inputs[2].values.reshape(-1) // attrs['wq_requant_div'], + [H]) + attrs['wk_requant_add'] = np.broadcast_to(RequantShift_Pk.inputs[2].values.reshape(-1) // attrs['wk_requant_div'], + [H]) + attrs['wv_requant_add'] = np.broadcast_to(RequantShift_Pv.inputs[2].values.reshape(-1) // attrs['wv_requant_div'], + [H]) + attrs['preattn_requant_add'] = np.broadcast_to( + RequantShift_a.inputs[2].values.reshape(-1) // attrs['preattn_requant_div'], [H]) + attrs['postattn_requant_add'] = np.broadcast_to( + RequantShift_o.inputs[2].values.reshape(-1) // attrs['postattn_requant_div'], [H]) + + _inputs += [ + gs.Constant(name = name + '_wq_weight', + values = separate_heads(get_constant_input(Projection_q), attrs['heads'], attrs['P'])) + ] + _inputs += [ + gs.Constant(name = name + '_wq_bias', + values = separate_heads(get_constant_input_or_zeros(Bias_Pq, (1, attrs['heads'], 1, attrs['P'])), + attrs['heads'], attrs['P'])) + ] + + _inputs += [ + gs.Constant(name = name + '_wk_weight', + values = separate_heads(get_constant_input(Projection_k), attrs['heads'], attrs['P'])) + ] + _inputs += [ + gs.Constant(name = name + '_wk_bias', + values = separate_heads(get_constant_input_or_zeros(Bias_Pk, (1, attrs['heads'], 1, attrs['P'])), + attrs['heads'], attrs['P'])) + ] + + _inputs += [ + gs.Constant(name = name + '_wv_weight', + values = separate_heads(get_constant_input(Projection_v), attrs['heads'], attrs['P'])) + ] + _inputs += [ + gs.Constant(name = name + '_wv_bias', + values = separate_heads(get_constant_input_or_zeros(Bias_Pv, (1, attrs['heads'], 1, attrs['P'])), + attrs['heads'], attrs['P'])) + ] + + if batchedMatMul: + 
Projection_Po = get_named_node(match.nodes_map, 'Projection_Po') + Bias_Po = get_named_node(match.nodes_map, 'Bias_Po') + RequantShift_Po = get_named_node(match.nodes_map, 'RequantShift_Po') + + attrs['n_levels'] = RequantShift_Po.attrs['n_levels_out'].values.reshape(1) + attrs['signed'] = RequantShift_Po.attrs['signed'].values.reshape(1) + attrs['wo_requant_mul'] = np.broadcast_to(RequantShift_Po.inputs[1].values.reshape(-1), [H]) + attrs['wo_requant_div'] = np.broadcast_to(RequantShift_Po.attrs['div'].values.reshape(-1), [H]) + attrs['wo_requant_add'] = np.broadcast_to(RequantShift_Po.inputs[2].values.reshape(-1), + [H]) // attrs['wo_requant_div'] + + _inputs += [ + gs.Constant(name = name + '_wo_weight', + values = np.reshape(get_constant_input(Projection_Po), (attrs['heads'], attrs['P'], -1))) + ] + _inputs += [ + gs.Constant(name = name + '_wo_bias', + values = np.reshape(get_constant_input_or_zeros(Bias_Po, (1, attrs['heads'], 1, attrs['E'])), + (attrs['heads'], attrs['E']))) + ] + + _outputs = RequantShift_Po.outputs + mhsa = gs.Node(op = 'MHSA', name = name, attrs = attrs) + graph.replaceInsertNode(_inputs, _outputs, mhsa) + else: + # Extract ouptut projection for each head + attrs['wo_requant_mul'] = np.empty((H)) + attrs['wo_requant_div'] = np.empty((H)) + attrs['wo_requant_add'] = np.empty((H)) + wo_weight = np.empty((H, attrs['P'], attrs['E'])) + wo_bias = np.empty((H, 1, attrs['E'])) + outputs = [] + for h in range(H): + Gather_Po = get_named_node(match.nodes_map, f'Gather_o_{h}') + index_h = int(get_constant_input(Gather_Po)) + MatMul_Po = get_named_node(match.nodes_map, f'MatMul_Po_{h}') + Bias_Po = get_named_node(match.nodes_map, f'Bias_Po_{h}') + RequantShift_Po = get_named_node(match.nodes_map, f'RequantShift_Po_{h}') + outputs.append(RequantShift_Po.outputs[0]) + + attrs['wo_requant_mul'][index_h] = RequantShift_Po.inputs[1].values.reshape(-1) + attrs['wo_requant_div'][index_h] = RequantShift_Po.attrs['div'].values.reshape(-1) + 
@contextagnostic
class MemPoolFuseMHSAPass(ReplaceSequentialPatternPass):
    """Fuses a (requantized) multi-head self-attention subgraph into one MHSA node.

    Builds the pattern graph matched branch-aware by `BranchingMatcher` and
    rewritten by `_fuse_mhsa_fun`.

    Args:
        H: Number of heads in the per-head output-projection pattern;
           -1 selects the variant whose output projection is one batched MatMul.
        integerDiv: Match an extra IntegerDiv node after the pre-softmax requant.
        preSoftMaxRQ: Match a RequantShift node before the softmax.
        bias: Match an Add (bias) node after every projection MatMul.
    """

    def __init__(self, H, integerDiv = False, preSoftMaxRQ = True, bias = False):

        graph = gs.Graph()
        _input_q = gs.Variable(name = 'input_q')
        _input_k = gs.Variable(name = 'input_k')
        _input_v = gs.Variable(name = 'input_v')

        # Query Projection
        output_q = graph.layer(inputs = [_input_q], outputs = ['pQ'], op = 'MatMul', name = 'Projection_q')
        if bias:
            output_q = graph.layer(inputs = output_q, outputs = ['pQ_b'], op = 'Add', name = 'Bias_Pq')
        output_q = graph.layer(inputs = output_q, outputs = ['pQ_rq'], op = 'RequantShift', name = 'RequantShift_Pq')
        output_q = graph.layer(inputs = output_q, outputs = ['pQ_r'], op = 'Reshape', name = 'Reshape_Pq')
        output_q = graph.layer(inputs = output_q, outputs = ['pQ_t'], op = 'Transpose', name = 'Transpose_Pq')

        # Key Projection
        output_k = graph.layer(inputs = [_input_k], outputs = ['pK'], op = 'MatMul', name = 'Projection_k')
        if bias:
            output_k = graph.layer(inputs = output_k, outputs = ['pK_b'], op = 'Add', name = 'Bias_Pk')
        output_k = graph.layer(inputs = output_k, outputs = ['pK_rq'], op = 'RequantShift', name = 'RequantShift_Pk')
        output_k = graph.layer(inputs = output_k, outputs = ['pK_r'], op = 'Reshape', name = 'Reshape_Pk')
        output_k = graph.layer(inputs = output_k, outputs = ['pK_t'], op = 'Transpose', name = 'Transpose_Pk')

        # Value Projection
        output_v = graph.layer(inputs = [_input_v], outputs = ['pV'], op = 'MatMul', name = 'Projection_v')
        if bias:
            output_v = graph.layer(inputs = output_v, outputs = ['pV_b'], op = 'Add', name = 'Bias_Pv')
        output_v = graph.layer(inputs = output_v, outputs = ['pV_rq'], op = 'RequantShift', name = 'RequantShift_Pv')
        output_v = graph.layer(inputs = output_v, outputs = ['pV_r'], op = 'Reshape', name = 'Reshape_Pv')
        output_v = graph.layer(inputs = output_v, outputs = ['pV_t'], op = 'Transpose', name = 'Transpose_Pv')

        # Attention Matrix
        output_a = graph.layer(inputs = output_q + output_k, outputs = ['a'], op = 'MatMul', name = 'MatMul_a')
        if preSoftMaxRQ:
            output_a = graph.layer(inputs = output_a, outputs = ['a_rq'], op = 'RequantShift', name = 'RequantShift_a')

        if integerDiv:
            output_a = graph.layer(inputs = output_a, outputs = ['a_d'], op = 'IntegerDiv', name = 'IntegerDiv_a')

        output_a = graph.layer(inputs = output_a, outputs = ['a_s'], op = 'ITAPartialMax', name = 'Softmax_a')

        # Attention
        output = graph.layer(inputs = output_v + output_a, outputs = ['o'], op = 'MatMul', name = 'MatMul_o')

        if H == -1:
            # WIESEP: This only works if the output projection is a batched matrix multiplication
            output = graph.layer(inputs = output, outputs = ['o_rq'], op = 'RequantShift', name = 'RequantShift_o')
            output = graph.layer(inputs = output, outputs = ['pO'], op = 'MatMul', name = 'Projection_Po')
            if bias:
                output = graph.layer(inputs = output, outputs = ['pO_b'], op = 'Add', name = 'Bias_Po')
            output = graph.layer(inputs = output, outputs = ['pO_rq'], op = 'RequantShift', name = 'RequantShift_Po')
        else:
            attention = graph.layer(inputs = output, outputs = ['o_rq'], op = 'RequantShift', name = 'RequantShift_o')

            # One Gather -> MatMul (-> Add) -> RequantShift chain per head.
            projection_out = []
            for i in range(H):
                output = graph.layer(inputs = attention, outputs = [f'o_{i}'], op = 'Gather', name = f'Gather_o_{i}')
                output = graph.layer(inputs = output, outputs = [f'pO_{i}'], op = 'MatMul', name = f'MatMul_Po_{i}')
                if bias:
                    # Fix: this output name was the literal string 'pO_{i}_b'
                    # (missing f-prefix), so all heads' bias outputs in the
                    # pattern graph shared one tensor name.
                    output = graph.layer(inputs = output, outputs = [f'pO_{i}_b'], op = 'Add', name = f'Bias_Po_{i}')
                output = graph.layer(inputs = output,
                                     outputs = [f'pO_{i}_rq'],
                                     op = 'RequantShift',
                                     name = f'RequantShift_Po_{i}')
                projection_out.extend(output)

            # Accumulate the per-head projections with a chain of Adds.
            for i in range(H - 1):
                if i == 0:
                    inp = [projection_out[0], projection_out[i + 1]]
                else:
                    inp = [output[0], projection_out[i + 1]]

                output = graph.layer(inputs = inp, outputs = [f'pO_sum_{i}'], op = 'Add', name = f'Add_Po_{i}')

        graph.outputs.append(output[0])
        graph.inputs.append(_input_q)
        graph.inputs.append(_input_k)
        graph.inputs.append(_input_v)

        name = "_FUSE_MHSA_PASS"

        # WIESEP: Debug Export pattern graph to ONNX
        # model = gs.export_onnx(graph, False)
        # onnx.save(model, f'pattern_{name}.onnx')

        super().__init__(graph, partial(_fuse_mhsa_fun, batchedMatMul = (H == -1)), name, matcher = BranchingMatcher())
nodes_map[name] + raise KeyError(f"Did not find node with name {name}") + + MHSA = get_named_node(match.nodes_map, 'MHSA') + + input_q = MHSA.inputs[0] + input_kv = MHSA.inputs[1] + ReduceSum = get_named_node(match.nodes_map, 'ReduceSum') + RequantShift = get_named_node(match.nodes_map, 'RequantShift') + + _outputs = RequantShift.outputs + + # Calculate the number of heads + H = int(MHSA.attrs['heads']) + S = int(MHSA.attrs['dim']) + E = input_kv.shape[-1] + P = int(MHSA.attrs['dim_head']) + + if H == 1: + return graph + + # Create a list to hold the output nodes + mhsa_outputs = [] + + def extractHead(H: int, i: int): + _attrs = {} + _attrs['dim'] = MHSA.attrs['dim'] + _attrs['dim_head'] = MHSA.attrs['dim_head'] + _attrs['heads'] = H + _attrs['n_levels'] = MHSA.attrs['n_levels'] + _attrs['signed'] = MHSA.attrs['signed'] + + _attr_names = [ + "wq_requant_add", "wk_requant_add", "wv_requant_add", "wo_requant_add", "preattn_requant_add", + "postattn_requant_add", "wq_requant_mul", "wk_requant_mul", "wv_requant_mul", "wo_requant_mul", + "preattn_requant_mul", "postattn_requant_mul", "wq_requant_div", "wk_requant_div", "wv_requant_div", + "wo_requant_div", "preattn_requant_div", "postattn_requant_div" + ] + for att in _attr_names: + _attrs[att] = MHSA.attrs[att][i:i + H] + + _inputs = [input_q, input_kv, input_kv] + _inputs_names = ["wq_weight", "wq_bias", "wk_weight", "wk_bias", "wv_weight", "wv_bias", "wo_weight", "wo_bias"] + for idx, inp in enumerate(_inputs_names): + _inputs += [ + gs.Constant(name = name + f'_MHSA_H{i}_{i+H-1}_{inp}', values = MHSA.inputs[idx + 3].values[i:i + H]) + ] + + # Create a new MHSA node for the current set of 4 heads + mhsa_out = graph.layer(inputs = _inputs, + outputs = [name + f'_MHSA_H{i}_{i+H-1}_out'], + op = 'MHSA', + name = name + f'_MHSA_H{i}_{i+H-1}', + attrs = _attrs) + + # Append the new MHSA node to the output nodes list + output_sum = graph.layer(inputs = mhsa_out, + outputs = [name + f'_MHSA_H{i}_{i+H-1}_out_sum'], + op = 
@contextagnostic
class MemPoolSplitMHSAPass(ReplaceSequentialPatternPass):
    """Matches an MHSA -> ReduceSum -> RequantShift chain and splits the MHSA
    node into several smaller-head MHSA nodes (rewrite in `_split_mhsa_fun`)."""

    def __init__(self):
        # Linear three-node pattern; matched with the non-branching matcher.
        patternGraph = gs.Graph()
        queryInput = gs.Variable(name = 'input_q')
        keyValueInput = gs.Variable(name = 'input_kv')

        out = patternGraph.layer(inputs = [queryInput, keyValueInput],
                                 outputs = ['Out'],
                                 op = 'MHSA',
                                 name = 'MHSA')
        out = patternGraph.layer(inputs = out, outputs = ['Out_sum'], op = 'ReduceSum', name = 'ReduceSum')
        out = patternGraph.layer(inputs = out, outputs = ['Out_sum_rqs'], op = 'RequantShift', name = 'RequantShift')

        patternGraph.outputs.append(out)
        patternGraph.inputs.append(queryInput)
        patternGraph.inputs.append(keyValueInput)

        super().__init__(patternGraph, _split_mhsa_fun, "_SPLIT_MHSA_PASS", matcher = NonBranchingMatcher())
__init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/MemPool/__init__.py b/Deeploy/Targets/MemPool/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/MemPool/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/Deeploy/Targets/Neureka/Bindings.py b/Deeploy/Targets/Neureka/Bindings.py new file mode 100644 index 0000000..2a62cd5 --- /dev/null +++ b/Deeploy/Targets/Neureka/Bindings.py @@ -0,0 +1,125 @@ +# ---------------------------------------------------------------------- +# +# File: NeurekaBindings.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# Luka Macan, University of Bologna +# Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t
from Deeploy.DeeployTypes import NodeBinding
from Deeploy.MemoryLevelExtension.MemoryLevels import NodeMemoryLevelChecker, memoryAwareNodeBindingExtension
from Deeploy.Targets.Generic.TypeCheckers import ConvChecker
from Deeploy.Targets.Neureka.Templates.ConvTemplate import NeurekaDenseConv2D_Template, NeurekaDWConv2D_Template, \
    NeurekaPWConv2D_Template, NeurekaRqntDenseConv2D_Template, NeurekaRqntDWConv2D_Template, \
    NeurekaRqntPWConv2D_Template
from Deeploy.Targets.PULPOpen.Bindings import ClusterTransformer
from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker


# The PW/DW/Dense binding lists below only differ in the template they bind;
# the construction itself was sixfold copy-paste and is factored into helpers.
def _makeRqntConvBindings(template):
    """Requantized conv bindings: one per signed/unsigned combination of
    input, output and weight type; mul/add parameters are int32."""
    return [
        NodeBinding(
            PULPConvChecker(
                [PointerClass(data_in_type),
                 PointerClass(weight_type),
                 PointerClass(int32_t),
                 PointerClass(int32_t)], [PointerClass(data_out_type)]), template, ClusterTransformer)
        for data_in_type in [uint8_t, int8_t]
        for data_out_type in [uint8_t, int8_t]
        for weight_type in [uint8_t, int8_t]
    ]


def _makeConvBindings(template):
    """Non-requantized conv bindings: int32 third input and int32 output."""
    return [
        NodeBinding(
            ConvChecker(
                [PointerClass(data_in_type), PointerClass(weight_type),
                 PointerClass(int32_t)], [PointerClass(int32_t)]), template, ClusterTransformer)
        for data_in_type in [uint8_t, int8_t]
        for weight_type in [uint8_t, int8_t]
    ]


def _makeWmemBindings(bindings, inputMemoryLevels):
    """Memory-aware variants requiring the weight input in WeightMemory_SRAM."""
    return [
        memoryAwareNodeBindingExtension(binding, NodeMemoryLevelChecker(inputMemoryLevels, [None]))
        for binding in bindings
    ]


NeurekaRQSPWConv2DBindings = _makeRqntConvBindings(NeurekaRqntPWConv2D_Template)
NeurekaPWConv2DBindings = _makeConvBindings(NeurekaPWConv2D_Template)

NeurekaWmemRQSPWConv2DBindings = _makeWmemBindings(NeurekaRQSPWConv2DBindings, [None, "WeightMemory_SRAM", None, None])
NeurekaWmemPWConv2DBindings = _makeWmemBindings(NeurekaPWConv2DBindings, [None, "WeightMemory_SRAM"])

NeurekaRQSDWConv2DBindings = _makeRqntConvBindings(NeurekaRqntDWConv2D_Template)
NeurekaDWConv2DBindings = _makeConvBindings(NeurekaDWConv2D_Template)

NeurekaWmemRQSDWConv2DBindings = _makeWmemBindings(NeurekaRQSDWConv2DBindings, [None, "WeightMemory_SRAM", None, None])
NeurekaWmemDWConv2DBindings = _makeWmemBindings(NeurekaDWConv2DBindings, [None, "WeightMemory_SRAM"])

NeurekaRQSDenseConv2DBindings = _makeRqntConvBindings(NeurekaRqntDenseConv2D_Template)
NeurekaDenseConv2DBindings = _makeConvBindings(NeurekaDenseConv2D_Template)

NeurekaWmemRQSDenseConv2DBindings = _makeWmemBindings(NeurekaRQSDenseConv2DBindings,
                                                     [None, "WeightMemory_SRAM", None, None])
NeurekaWmemDenseConv2DBindings = _makeWmemBindings(NeurekaDenseConv2DBindings, [None, "WeightMemory_SRAM"])
b/Deeploy/Targets/Neureka/Deployer.py new file mode 100644 index 0000000..c14d1ab --- /dev/null +++ b/Deeploy/Targets/Neureka/Deployer.py @@ -0,0 +1,62 @@ +# ---------------------------------------------------------------------- +# +# File: Deployer.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class NeurekaDeployer(PULPDeployer):
    """PULP deployer specialized for the N-EUREKA accelerator.

    When the first platform engine enables 3x3 convolutions, the generic
    PULP NCHW-to-NHWC lowering pass is swapped for the N-EUREKA-aware
    variant; the engine-coloring and N-EUREKA optimization passes are
    always appended.
    """

    def __init__(self,
                 graph: gs.Graph,
                 deploymentPlatform: DeploymentPlatform,
                 inputTypes: Dict[str, Type[Pointer]],
                 loweringOptimizer: TopologyOptimizer,
                 scheduler: Callable = lambda graph: list(graph.nodes),
                 name: str = 'DeeployNetwork',
                 default_channels_first = False,
                 deeployStateDir: str = "DeeployStateDir",
                 inputOffsets = None):
        # Fix: the original used a mutable dict ({}) as default argument,
        # shared across all instances. None keeps {} as the effective default.
        if inputOffsets is None:
            inputOffsets = {}
        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
                         default_channels_first, deeployStateDir, inputOffsets)

        if self.Platform.engines[0].enable3x3:
            # Replace the generic layout-lowering pass in place with the
            # N-EUREKA-aware one (idiomatic enumerate instead of range(len(...))).
            for idx, loweringPass in enumerate(self.loweringOptimizer.passes):
                if isinstance(loweringPass, PULPNCHWtoNHWCPass):
                    self.loweringOptimizer.passes[idx] = NeurekaNCHWtoNHWCPass(self.default_channels_first)

        self.loweringOptimizer.passes += [
            ConvEngineDiscolorationPass(),
            NeurekaOptimizationPass(self.default_channels_first, "Neureka")
        ]
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper +from Deeploy.Targets.Generic.Layers import ConvLayer +from Deeploy.Targets.Neureka.Parsers import NeurekaDenseConv2DParser, NeurekaDWConv2DParser, NeurekaPWConv2DParser, \ + NeurekaRQSDenseConv2DParser, NeurekaRQSDWConv2DParser, NeurekaRQSPWConv2DParser +from Deeploy.Targets.Neureka.Tiler import NeurekaDenseConv2DTilingReadyBindings, NeurekaDWConv2DTilingReadyBindings, \ + NeurekaPWConv2DTilingReadyBindings, NeurekaRQSDenseConv2DTilingReadyBindings, \ + NeurekaRQSDWConv2DTilingReadyBindings, NeurekaRQSPWConv2DTilingReadyBindings, \ + NeurekaWmemDenseConv2DTilingReadyBindings, NeurekaWmemDWConv2DTilingReadyBindings, \ + NeurekaWmemPWConv2DTilingReadyBindings, NeurekaWmemRQSDenseConv2DTilingReadyBindings, \ + NeurekaWmemRQSDWConv2DTilingReadyBindings, NeurekaWmemRQSPWConv2DTilingReadyBindings +from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer + +NeurekaRqntPWConv2DMapper = NodeMapper( + NeurekaRQSPWConv2DParser(), NeurekaWmemRQSPWConv2DTilingReadyBindings + NeurekaRQSPWConv2DTilingReadyBindings) +NeurekaPWConv2DMapper = NodeMapper(NeurekaPWConv2DParser(), + NeurekaWmemPWConv2DTilingReadyBindings + NeurekaPWConv2DTilingReadyBindings) + 
class NeurekaEngine(DeploymentEngine):
    """Deployment engine for the N-EUREKA convolution accelerator.

    By default only pointwise (1x1) convolutions are offloaded; `enable3x3`
    additionally offloads dense and depthwise 3x3 convolutions, and
    `enableStrides` lifts the strides == [1, 1] restriction.
    """

    def __init__(self,
                 name: str,
                 Mapping = NeurekaMapping,
                 initCode: str = _neurekaInitCode,
                 includeList: List[str] = _includeList,
                 enable3x3: bool = False,
                 enableStrides: bool = False) -> None:
        # NOTE(review): `Mapping` defaults to the shared module-level dict
        # NeurekaMapping — mutating it would affect every engine instance.
        super().__init__(name, Mapping, initCode, includeList)

        self.enable3x3 = enable3x3
        self.enableStrides = enableStrides

    def _isSupportedConv(self, node, kernelShape) -> bool:
        # Constraints shared by all three predicates below: a (requantized)
        # convolution with a constant weight tensor, the given kernel size,
        # no dilation, and unit strides unless strided execution is enabled.
        return node.op in ["Conv", "RequantizedConv"] and \
            isinstance(node.inputs[1], gs.Constant) and \
            node.attrs['kernel_shape'] == kernelShape and \
            node.attrs['dilations'] == [1, 1] and \
            (node.attrs['strides'] == [1, 1] or self.enableStrides)

    def isDenseConv(self, node) -> bool:
        """3x3 convolution over all input channels (group == 1)."""
        return self._isSupportedConv(node, [3, 3]) and node.attrs['group'] == 1

    def isPWConv(self, node) -> bool:
        """1x1 (pointwise) convolution; no group restriction, as before."""
        return self._isSupportedConv(node, [1, 1])

    def isDWConv(self, node) -> bool:
        """3x3 grouped (depthwise-style) convolution (group != 1)."""
        return self._isSupportedConv(node, [3, 3]) and node.attrs['group'] != 1

    def canExecute(self, node: gs.Node) -> bool:
        """Return True if this engine can offload `node`."""
        if self.enable3x3:
            return self.isPWConv(node) or self.isDWConv(node) or self.isDenseConv(node)
        else:
            return self.isPWConv(node)
+ +from typing import Tuple + +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import NetworkContext +from Deeploy.Targets.Generic.Parsers import ConvParser, RQSParserInterface + + +class NeurekaConv2DBaseParser(ConvParser): + + def __init__(self, noBiasHoisting: bool = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False + + if not all([ + len(node.attrs['pads']) == 4, + # No dilation support + self.operatorRepresentation['dilations'] == [1, 1], + # Channels have to be last + 'channels_first' in self.operatorRepresentation and not self.operatorRepresentation['channels_first'], + # Expect "weight_offset" attribute in the node + "weight_offset" in node.attrs, + ]): + return False + + self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) + self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) + self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) + self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) + self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) + self.operatorRepresentation['bias_shift'] = int(0) + self.operatorRepresentation['out_shift'] = int(0) + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = 
int(self.operatorRepresentation['pads'][3]) + self.operatorRepresentation['weight_offset'] = int(node.attrs["weight_offset"]) + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) + data_out = newCtxt.lookup(self.operatorRepresentation['data_out']) + weight = newCtxt.lookup(self.operatorRepresentation['weight']) + + self.operatorRepresentation['batch'] = data_in.shape[0] + if channels_first: + self.operatorRepresentation['ch_im_in'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['ch_im_out'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3] + else: + self.operatorRepresentation['ch_im_in'] = data_in.shape[3] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['ch_im_out'] = data_out.shape[3] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + + # No requantization + self.operatorRepresentation['mul'] = 'NULL' + self.operatorRepresentation['add'] = 'NULL' + self.operatorRepresentation['shift'] = 'NULL' + + return newCtxt, True + + +class NeurekaDWConv2DParser(NeurekaConv2DBaseParser): + + def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False + + if not self.operatorRepresentation['kernel_shape'] == [3, 3]: + return False + if self.operatorRepresentation['group'] == 1: + return False + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + 
node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + data_in = ctxt.lookup(self.operatorRepresentation['data_in']) + weight = ctxt.lookup(self.operatorRepresentation['weight']) + + if len(data_in.shape) != 4 or len(weight.shape) != 4: + return ctxt, False + + return newCtxt, True + + +class NeurekaRQSDWConv2DParser(NeurekaDWConv2DParser, RQSParserInterface): + + def parseNode(self, node: gs.Node) -> bool: + ret = all([ + RQSParserInterface.parseNode(self, node), + NeurekaDWConv2DParser.parseNode(self, node), + ]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + inputs = ['data_in', 'weight', 'mul', 'add'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + return newCtxt, True + + +class NeurekaPWConv2DParser(NeurekaConv2DBaseParser): + + def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False + + if not self.operatorRepresentation['kernel_shape'] == [1, 1]: + return False + + # if not self.operatorRepresentation['strides'] == [1, 1]: + # return False + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) + weight = newCtxt.lookup(self.operatorRepresentation['weight']) + + if len(data_in.shape) != 4 or len(weight.shape) != 3: + return ctxt, False + + return newCtxt, True + + +class NeurekaRQSPWConv2DParser(NeurekaPWConv2DParser, RQSParserInterface): + + def parseNode(self, node: gs.Node) -> bool: + ret = all([ + 
RQSParserInterface.parseNode(self, node), + NeurekaPWConv2DParser.parseNode(self, node), + ]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + inputs = ['data_in', 'weight', 'mul', 'add'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + return newCtxt, True + + +class NeurekaDenseConv2DParser(NeurekaConv2DBaseParser): + + def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False + + if not self.operatorRepresentation['kernel_shape'] == [3, 3]: + return False + + if not self.operatorRepresentation['group'] == 1: + return False + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) + weight = newCtxt.lookup(self.operatorRepresentation['weight']) + + if len(data_in.shape) != 4 or len(weight.shape) != 4: + return ctxt, False + + return newCtxt, True + + +class NeurekaRQSDenseConv2DParser(NeurekaDenseConv2DParser, RQSParserInterface): + + def parseNode(self, node: gs.Node) -> bool: + ret = all([ + RQSParserInterface.parseNode(self, node), + NeurekaDenseConv2DParser.parseNode(self, node), + ]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + inputs = ['data_in', 'weight', 'mul', 'add'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + return newCtxt, 
True diff --git a/Deeploy/Targets/Neureka/Platform.py b/Deeploy/Targets/Neureka/Platform.py new file mode 100644 index 0000000..b618cab --- /dev/null +++ b/Deeploy/Targets/Neureka/Platform.py @@ -0,0 +1,104 @@ +# ---------------------------------------------------------------------- +# +# File: Platform.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + RequantizedGemmToPwPass +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, TopologyOptimizer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.Targets.Neureka.Engine import NeurekaEngine +from Deeploy.Targets.Neureka.Templates.AllocateTemplate import neurekaGenericGlobalInitTemplate +from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPClusterEngine, \ + PULPOptimizer, PULPPlatform, PULPStructBuffer, PULPTransientBuffer, PULPVariableBuffer + +NeurekaOptimizer = TopologyOptimizer([ + *PULPOptimizer.passes, + RequantizedGemmToPwPass(), +]) + + +class NeurekaConstantBuffer(ConstantBuffer): + + initTemplate = neurekaGenericGlobalInitTemplate + allocTemplate = NodeTemplate("") + deallocTemplate = NodeTemplate("") + + def _bufferRepresentation(self): + operatorRepresentation = super()._bufferRepresentation() + operatorRepresentation["_memoryLevel"] = getattr(self, "_memoryLevel", None) + return operatorRepresentation + + +class NeurekaPlatform(PULPPlatform): + + def __init__(self, + engines = [NeurekaEngine("Neureka"), PULPClusterEngine("PULPCluster")], + variableBuffer = PULPVariableBuffer, + constantBuffer = NeurekaConstantBuffer, + structBuffer = PULPStructBuffer, + transientBuffer = PULPTransientBuffer) -> None: + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class MemoryNeurekaPlatform(MemoryPULPPlatform): + + def __init__(self, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + weightMemoryLevel: Optional[MemoryLevel] = None, + engines = [NeurekaEngine("Neureka"), PULPClusterEngine("PULPCluster")], + variableBuffer = PULPVariableBuffer, + constantBuffer = NeurekaConstantBuffer, + structBuffer 
= PULPStructBuffer, + transientBuffer = PULPTransientBuffer) -> None: + super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer, + structBuffer, transientBuffer) + self.weightMemoryLevel = weightMemoryLevel + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + if self.weightMemoryLevel is not None and ctxt.lookup(tensorName)._memoryLevel == self.weightMemoryLevel.name: + return self.weightMemoryLevel.name + return super().getTargetMemoryLevel(node, tensorName, ctxt) + + +class MemoryNeurekaPlatformWrapper(MemoryPULPPlatformWrapper): + + def __init__(self, + platform: NeurekaPlatform, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + weightMemoryLevel: Optional[MemoryLevel] = None): + assert isinstance(platform, NeurekaPlatform), \ + f"Given platform is not an instance of NeurekaPlatform. Platform type: {type(platform).__name__}" + super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel) + self.weightMemoryLevel = weightMemoryLevel + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + if self.weightMemoryLevel is not None and ctxt.lookup(tensorName)._memoryLevel == self.weightMemoryLevel.name: + return self.weightMemoryLevel.name + return super().getTargetMemoryLevel(node, tensorName, ctxt) diff --git a/Deeploy/Targets/Neureka/Templates/AllocateTemplate.py b/Deeploy/Targets/Neureka/Templates/AllocateTemplate.py new file mode 100644 index 0000000..5a39360 --- /dev/null +++ b/Deeploy/Targets/Neureka/Templates/AllocateTemplate.py @@ -0,0 +1,40 @@ +# ---------------------------------------------------------------------- +# +# File: AllocateTemplate.py +# +# Last edited: 09.03.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
+#
+# Author: Moritz Scherer, ETH Zurich
+#         Luka Macan, University of Bologna
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+# Global-buffer init template dispatching on the buffer's `_memoryLevel`
+# annotation: L1/L2 use the PULP section attributes, L3 buffers only get an
+# L2 pointer declared (the data itself lives off-chip), and
+# "WeightMemory_SRAM" places the array in N-EUREKA's dedicated weight
+# memory section. An unannotated buffer (None) defaults to L2.
+neurekaGenericGlobalInitTemplate = NodeTemplate("""
+% if _memoryLevel == "L1":
+static PI_L1 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n
+% elif _memoryLevel == "L2" or _memoryLevel is None:
+static PI_L2 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n
+% elif _memoryLevel == "L3":
+// ${name} is allocated in L3 \n
+static PI_L2 ${type.referencedType.typeName}* ${name};
+% elif _memoryLevel == "WeightMemory_SRAM":
+static __attribute__((section(".weightmem_sram"))) ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n
+% endif
+""")
diff --git a/Deeploy/Targets/Neureka/Templates/ConvTemplate.py b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py
new file mode 100644
index 0000000..2d658cc
--- /dev/null
+++ b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py
@@ -0,0 +1,377 @@
+# ----------------------------------------------------------------------
+#
+# File: ConvTemplate.py
+#
+# Last edited: 26.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, OperatorRepresentation + + +def _getNumTiles(fullDim: int, tileDim: int) -> int: + return int(np.ceil(fullDim / tileDim)) + + +def _getBorderTileSize(fullDim: int, tileDim: int) -> int: + return fullDim % tileDim if fullDim % tileDim > 0 else tileDim + + +def ioStridesFromDimensions(width: int, channel: int, bits: int) -> Tuple[int, int]: + """stridesFromDimensions + Returns strides in bytes. 
+ """ + width_stride = channel * bits // 8 + height_stride = width * width_stride + return height_stride, width_stride + + +def getNormQuantConf0(use_relu: bool, layerwise_output_shift: int, scale_bits: int, use_bias: bool, + use_shift: bool) -> int: + conf0 = 0 + conf0 |= 1 << 4 # Use Normalization and quantization + if scale_bits == 32: + conf0 |= 2 << 12 + conf0 |= layerwise_output_shift << 16 + if not use_relu: + conf0 |= 1 << 23 + if use_shift: + conf0 |= 1 << 24 + if use_bias: + conf0 |= 1 << 25 + return conf0 + + +def getInputAddrOffset(width_in: int, width_in_stride: int, padding_top: int, padding_left: int) -> int: + return (padding_top * width_in + padding_left) * width_in_stride + + +class NeurekaConvTemplate(NodeTemplate): + + def __init__(self, templateStr: str): + super().__init__(templateStr) + + @classmethod + @abstractmethod + def getCounters( + cls, channel_in: int, height_out: int, width_out: int, channel_out: int, padding_bottom: int, + padding_right: int, + operatorRepresentation: OperatorRepresentation) -> Tuple[int, int, int, int, int, int, int, int, int, int]: + pass + + @classmethod + @abstractmethod + def getWeightStrides(cls, channel_in: int) -> Tuple[int, int, int]: + pass + + @classmethod + @abstractmethod + def getConf0(cls, output_bits: int, weight_bits: int, input_signed: bool, use_wmem: bool) -> int: + pass + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + data_in: ConstantBuffer = ctxt.lookup(operatorRepresentation['data_in']) + data_out: ConstantBuffer = ctxt.lookup(operatorRepresentation['data_out']) + weight: ConstantBuffer = ctxt.lookup(operatorRepresentation['weight']) + + operatorRepresentation['input_signed'] = data_in._type.referencedType.typeMin < 0 + operatorRepresentation['use_relu'] = data_out._type.referencedType.typeMin >= 0 + + operatorRepresentation['input_bits'] = data_in._type.referencedType.typeWidth + 
operatorRepresentation['output_bits'] = data_out._type.referencedType.typeWidth + operatorRepresentation['weight_bits'] = weight._type.referencedType.typeWidth + + operatorRepresentation["input_typeWidth_bytes"] = int(np.ceil(data_in._type.referencedType.typeWidth / 8)) + operatorRepresentation["output_typeWidth_bytes"] = int(np.ceil(data_out._type.referencedType.typeWidth / 8)) + + operatorRepresentation["weight_addr_offset"] = 0 + + operatorRepresentation["use_wmem"] = hasattr(weight, + "_memoryLevel") and weight._memoryLevel == "WeightMemory_SRAM" + + dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(operatorRepresentation["dim_im_in_y"], + operatorRepresentation["ch_im_in"], + operatorRepresentation["input_bits"]) + operatorRepresentation["dim_im_in_y_stride"] = dim_im_in_y_stride + operatorRepresentation["dim_im_in_x_stride"] = dim_im_in_x_stride + + dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(operatorRepresentation["dim_im_out_y"], + operatorRepresentation["ch_im_out"], + operatorRepresentation["output_bits"]) + operatorRepresentation["dim_im_out_y_stride"] = dim_im_out_y_stride + operatorRepresentation["dim_im_out_x_stride"] = dim_im_out_x_stride + + operatorRepresentation["input_addr_offset"] = getInputAddrOffset(operatorRepresentation["dim_im_in_y"], + operatorRepresentation["dim_im_in_y_stride"], + operatorRepresentation["padding_y_top"], + operatorRepresentation["padding_x_left"]) + + nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = self.getCounters( + operatorRepresentation["ch_im_in"], operatorRepresentation["dim_im_out_x"], + operatorRepresentation["dim_im_out_y"], operatorRepresentation["ch_im_out"], + operatorRepresentation["padding_y_bottom"], operatorRepresentation["padding_x_right"], + operatorRepresentation) + + operatorRepresentation["nKo"] = nKo + operatorRepresentation["nKi"] = nKi + operatorRepresentation["nHo"] = nHo + operatorRepresentation["nWo"] = nWo + operatorRepresentation["bKo"] = bKo + 
operatorRepresentation["bKi"] = bKi + operatorRepresentation["bHo"] = bHo + operatorRepresentation["bWo"] = bWo + operatorRepresentation["bHi"] = bHi + operatorRepresentation["bWi"] = bWi + + weightStrideD0, weightStrideD1, weightStrideD2 = self.getWeightStrides(operatorRepresentation["ch_im_in"]) + + operatorRepresentation["weightStrideD0"] = weightStrideD0 + operatorRepresentation["weightStrideD1"] = weightStrideD1 + operatorRepresentation["weightStrideD2"] = weightStrideD2 + + operatorRepresentation["conf0"] = self.getConf0(operatorRepresentation["output_bits"], + operatorRepresentation["weight_bits"], + operatorRepresentation["input_signed"], + operatorRepresentation["use_wmem"]) + + operatorRepresentation["wmem_addr_offset"] = 0x10400000 if operatorRepresentation["use_wmem"] else 0 + + # If requantized + if operatorRepresentation["mul"] != "NULL": + mulBuff = ctxt.lookup(operatorRepresentation["mul"]) + mulBits = mulBuff._type.referencedType.typeWidth + operatorRepresentation["conf0"] |= getNormQuantConf0(operatorRepresentation["use_relu"], + operatorRepresentation["log2D"], mulBits, "add" + in operatorRepresentation, False) + return ctxt, operatorRepresentation, [] + + +class Neureka2DPWConvTemplate(NeurekaConvTemplate): + + def __init__(self, templateStr: str): + super().__init__(templateStr) + + @classmethod + def getCounters( + cls, channel_in: int, height_out: int, width_out: int, channel_out: int, padding_bottom: int, + padding_right: int, + operatorRepresentation: OperatorRepresentation) -> Tuple[int, int, int, int, int, int, int, int, int, int]: + n_channel_out_subtiles = _getNumTiles(channel_out, 32) + n_channel_in_subtiles = _getNumTiles(channel_in, 32) + n_height_out_subtiles = _getNumTiles(height_out, 6) + n_width_out_subtiles = _getNumTiles(width_out, 6) + + channel_out_border = _getBorderTileSize(channel_out, 32) + channel_in_border = _getBorderTileSize(channel_in, 32) + height_out_border = _getBorderTileSize(height_out, 6) + width_out_border = 
_getBorderTileSize(width_out, 6) + height_in_border = height_out_border - padding_bottom + width_in_border = width_out_border - padding_right + + return (n_channel_out_subtiles, n_channel_in_subtiles, n_height_out_subtiles, n_width_out_subtiles, + channel_out_border, channel_in_border, height_out_border, width_out_border, height_in_border, + width_in_border) + + @classmethod + def getWeightStrides(cls, channel_in: int) -> Tuple[int, int, int]: + n_channel_in = _getNumTiles(channel_in, 32) + _NEUREKA_WEIGHT_BANDWIDTH_BYTES = 32 + return _NEUREKA_WEIGHT_BANDWIDTH_BYTES, _NEUREKA_WEIGHT_BANDWIDTH_BYTES * n_channel_in, 0 + + @classmethod + def getConf0(cls, output_bits: int, weight_bits: int, input_signed: bool, use_wmem: bool) -> int: + conf0 = 0 + conf0 |= weight_bits - 1 + conf0 |= 2 << 5 # PW MODE + if use_wmem: + conf0 |= 1 << 9 + conf0 |= 1 << 15 # Layerwise weight offset mode + if output_bits == 32: + conf0 |= 2 << 21 + if input_signed: + conf0 |= 1 << 26 + return conf0 + + +class Neureka2DDWConvTemplate(NeurekaConvTemplate): + + def __init__(self, templateStr: str): + super().__init__(templateStr) + + @classmethod + def getCounters( + cls, channel_in: int, height_out: int, width_out: int, channel_out: int, padding_bottom: int, + padding_right: int, + operatorRepresentation: OperatorRepresentation) -> Tuple[int, int, int, int, int, int, int, int, int, int]: + _ = operatorRepresentation # operatorRepresentation not accessed for now because it's just for pointwise kernels + + n_channel_out_subtiles = _getNumTiles(channel_out, 28) + n_channel_in_subtiles = n_channel_out_subtiles + n_height_out_subtiles = _getNumTiles(height_out, 6) + n_width_out_subtiles = _getNumTiles(width_out, 6) + + channel_out_border = _getBorderTileSize(channel_out, 28) + channel_in_border = channel_out_border + height_out_border = _getBorderTileSize(height_out, 6) + width_out_border = _getBorderTileSize(width_out, 6) + height_in_border = height_out_border + 2 - padding_bottom + 
width_in_border = width_out_border + 2 - padding_right + + return (n_channel_out_subtiles, n_channel_in_subtiles, n_height_out_subtiles, n_width_out_subtiles, + channel_out_border, channel_in_border, height_out_border, width_out_border, height_in_border, + width_in_border) + + @classmethod + def getWeightStrides(cls, channel_in: int) -> Tuple[int, int, int]: + n_channel_in = _getNumTiles(channel_in, 28) + _NEUREKA_WEIGHT_BANDWIDTH_BYTES = 32 + return _NEUREKA_WEIGHT_BANDWIDTH_BYTES, 0, 0 + + @classmethod + def getConf0(cls, output_bits: int, weight_bits: int, input_signed: bool, use_wmem: bool) -> int: + conf0 = 0 + conf0 |= weight_bits - 1 + conf0 |= 1 << 5 # DW MODE + if use_wmem: + conf0 |= 1 << 9 + conf0 |= 1 << 15 # Layerwise weight offset mode + if output_bits == 32: + conf0 |= 2 << 21 + if input_signed: + conf0 |= 1 << 26 + return conf0 + + +class Neureka2DDenseConvTemplate(NeurekaConvTemplate): + + def __init__(self, templateStr: str): + super().__init__(templateStr) + + @classmethod + def getCounters( + cls, channel_in: int, height_out: int, width_out: int, channel_out: int, padding_bottom: int, + padding_right: int, + operatorRepresentation: OperatorRepresentation) -> Tuple[int, int, int, int, int, int, int, int, int, int]: + _ = operatorRepresentation # operatorRepresentation not accessed for now because it's just for pointwise kernels + + n_channel_out_subtiles = _getNumTiles(channel_out, 28) + n_channel_in_subtiles = _getNumTiles(channel_in, 28) + n_height_out_subtiles = _getNumTiles(height_out, 6) + n_width_out_subtiles = _getNumTiles(width_out, 6) + + channel_out_border = _getBorderTileSize(channel_out, 28) + channel_in_border = _getBorderTileSize(channel_in, 28) + height_out_border = _getBorderTileSize(height_out, 6) + width_out_border = _getBorderTileSize(width_out, 6) + height_in_border = height_out_border + 2 - padding_bottom + width_in_border = width_out_border + 2 - padding_right + + return (n_channel_out_subtiles, n_channel_in_subtiles, 
n_height_out_subtiles, n_width_out_subtiles, + channel_out_border, channel_in_border, height_out_border, width_out_border, height_in_border, + width_in_border) + + @classmethod + def getWeightStrides(cls, channel_in: int) -> Tuple[int, int, int]: + n_channel_in = _getNumTiles(channel_in, 28) + _NEUREKA_WEIGHT_BANDWIDTH_BYTES = 32 + return _NEUREKA_WEIGHT_BANDWIDTH_BYTES, _NEUREKA_WEIGHT_BANDWIDTH_BYTES * 8 * n_channel_in, 0 + + @classmethod + def getConf0(cls, output_bits: int, weight_bits: int, input_signed: bool, use_wmem: bool) -> int: + conf0 = 0 + conf0 |= weight_bits - 1 + if use_wmem: + conf0 |= 1 << 9 + conf0 |= 1 << 15 # Layerwise weight offset mode + if output_bits == 32: + conf0 |= 2 << 21 + if input_signed: + conf0 |= 1 << 26 + return conf0 + + +NeurekaTaskInitTemplateStr = """ +// N-EUREKA Task Init +neureka_task_t task = { + .data = (neureka_task_data_t) { + .weights_addr = (uint32_t)${weight} - ${wmem_addr_offset} + ${weight_addr_offset}, + .infeat_addr = (uint32_t)${data_in} - ${input_addr_offset}, + .outfeat_addr = (uint32_t)${data_out}, + .scale_addr = (uint32_t)${mul}, + .scale_shift_addr = (uint32_t)${shift}, + .scale_bias_addr = (uint32_t)${add}, + .cfg = (neureka_cfg_t) { + .input_stride = (neureka_stride_t) { + .d0 = ${dim_im_in_y_stride}, + .d1 = ${dim_im_in_x_stride}, + .d2 = 0 + }, + .output_stride = (neureka_stride_t) { + .d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES, + .d1 = ${dim_im_out_y_stride}, + .d2 = ${dim_im_out_x_stride} + }, + task.data.cfg.weights_stride = (neureka_stride_t) { + .d0 = ${weightStrideD0}, + .d1 = ${weightStrideD1}, + .d2 = ${weightStrideD2} + }, + .subtile = (neureka_subtile_t) { + .number = { + .KoKi = nnx_concat_half(${nKo}, ${nKi}), + .HoWo = nnx_concat_half(${nHo}, ${nWo}) + }, + .remainder = { + .KoKi = nnx_concat_half(${bKo}, ${bKi}), + .HoWo = nnx_concat_half(${bHo}, ${bWo}), + .HiWi = nnx_concat_half(${bHi}, ${bWi}) + } + }, + .padding = (${padding_y_top} << 28) + (${padding_x_right} << 24) + (${padding_y_bottom} 
<< 20) + (${padding_x_left} << 16), + .weight_offset_factor = ${weight_offset}, + .filter_mask = 0, + .conf0 = ${conf0}, + } + } +}; +""" + +NeurekaTaskExecutionTemplateStr = """ +// N-EUREKA Task Execution +neureka_nnx_dispatch_wait(neureka_siracusa_get_dev()); +neureka_nnx_dispatch(neureka_siracusa_get_dev(), &task); +neureka_nnx_resolve_wait(neureka_siracusa_get_dev(), &task); +""" + +NeurekaRqntPWConv2D_Template = Neureka2DPWConvTemplate(NeurekaTaskInitTemplateStr + NeurekaTaskExecutionTemplateStr) +NeurekaPWConv2D_Template = Neureka2DPWConvTemplate(NeurekaTaskInitTemplateStr + NeurekaTaskExecutionTemplateStr) + +NeurekaRqntDWConv2D_Template = Neureka2DDWConvTemplate(NeurekaTaskInitTemplateStr + NeurekaTaskExecutionTemplateStr) +NeurekaDWConv2D_Template = Neureka2DDWConvTemplate(NeurekaTaskInitTemplateStr + NeurekaTaskExecutionTemplateStr) + +NeurekaRqntDenseConv2D_Template = Neureka2DDenseConvTemplate(NeurekaTaskInitTemplateStr + + NeurekaTaskExecutionTemplateStr) +NeurekaDenseConv2D_Template = Neureka2DDenseConvTemplate(NeurekaTaskInitTemplateStr + NeurekaTaskExecutionTemplateStr) diff --git a/Deeploy/Targets/Neureka/Templates/__init__.py b/Deeploy/Targets/Neureka/Templates/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/Neureka/Templates/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py new file mode 100644 index 0000000..8457c17 --- /dev/null +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py @@ -0,0 +1,559 @@ +# ---------------------------------------------------------------------- +# +# File: NeurekaDenseConstraint.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t, uint32_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Targets.Neureka.Templates.ConvTemplate import Neureka2DDenseConvTemplate, getInputAddrOffset, \ + ioStridesFromDimensions +from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme, calculateRectangleOffset + + +class NeurekaDenseConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = 
weightBufferName, dimIdx = 1) + weightBitsVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (3 - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (3 - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = 
outputBuffer.name, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + tilerModel.addConstraint(inputChannelVar == inputChannelVar.Max()) + + tilerModel.addConstraint(inputHeightVar == inputHeightVar.Max(), strategy = PerformanceHint(1)) + tilerModel.addConstraint(inputWidthVar == inputWidthVar.Max(), strategy = PerformanceHint(1)) + + tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) + tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + inputWeightCubes = [] + replacements: Dict[str, List[int]] = { + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + "dim_im_in_x_stride": [], + "dim_im_in_y_stride": [], + "dim_im_out_x_stride": [], + "dim_im_out_y_stride": [], + "input_addr_offset": [], + "nKo": [], + "nKi": [], + "nHo": [], + "nWo": [], + "bKo": [], + "bKi": [], + "bHo": [], + "bWo": [], + "bHi": [], + "bWi": [], + } + + replacementTypes = { + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + 
"padding_x_right": PointerClass(uint8_t), + "dim_im_in_x_stride": PointerClass(uint32_t), + "dim_im_in_y_stride": PointerClass(uint32_t), + "dim_im_out_x_stride": PointerClass(uint32_t), + "dim_im_out_y_stride": PointerClass(uint32_t), + "input_addr_offset": PointerClass(uint32_t), + "nKo": PointerClass(uint16_t), + "nKi": PointerClass(uint16_t), + "nHo": PointerClass(uint16_t), + "nWo": PointerClass(uint16_t), + "bKo": PointerClass(uint16_t), + "bKi": PointerClass(uint16_t), + "bHo": PointerClass(uint16_t), + "bWo": PointerClass(uint16_t), + "bHi": PointerClass(uint16_t), + "bWi": PointerClass(uint16_t), + } + + weightH = operatorRepresentation['dim_kernel_y'] + weightW = operatorRepresentation['dim_kernel_x'] + weightC = operatorRepresentation['ch_im_in'] + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inBSize, inHSize, inWSize, inCSize = InCube.dims + + dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, + operatorRepresentation["input_bits"]) + replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) + replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) + dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, + operatorRepresentation["output_bits"]) + replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) + replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) + 
+ replacements['input_addr_offset'].append( + getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) + + nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DDenseConvTemplate.getCounters( + inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) + + replacements["nKo"].append(nKo) + replacements["nKi"].append(nKi) + replacements["nHo"].append(nHo) + replacements["nWo"].append(nWo) + replacements["bKo"].append(bKo) + replacements["bKi"].append(bKi) + replacements["bHo"].append(bHo) + replacements["bWo"].append(bWo) + replacements["bHi"].append(bHi) + replacements["bWi"].append(bWi) + + inputInCubes.append(InCube) + + weightShape = ctxt.lookup(varWeight).shape + WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + + inputWeightCubes.append(WeightCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule + + +class NeurekaRQSDenseConv2DTileConstraint(NeurekaDenseConv2DTileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = NeurekaDenseConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + + outputBufferName = parseDict['data_out'] + mulBufferName = parseDict['mul'] + addBufferName = parseDict['add'] + + # Add I/O dimensions to the model as variables + for bufferName in [mulBufferName, addBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + outputChannelVar = tilerModel.getTensorDimVar(tensorName = 
outputBufferName, dimIdx = 3) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( + tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) + + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['mul', 'add'] + inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, + addrNames) + newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} + + inputRequantCubes = [] + for cube in outputCubes: + (_, _, _, COffset) = cube.offset + (_, _, _, CSize) = cube.dims + inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + newInputLoadSchedule = [{ + **schedule, "add": requant, + "mul": requant + } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + + newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, + tilingSchedule.outputLoadSchedule) + + return variableReplacementSchedule, newTilingSchedule + + +class NeurekaWmemDenseConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = 
parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + + tilerModel.addConstraint(inputHeightVar >= 3) + tilerModel.addConstraint(inputWidthVar >= 3) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (3 - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (3 - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = 
inputBuffer.name, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + tilerModel.addConstraint(inputChannelVar == inputChannelVar.Max()) + + tilerModel.addConstraint(inputHeightVar == inputHeightVar.Max(), strategy = PerformanceHint(1)) + tilerModel.addConstraint(inputWidthVar == inputWidthVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + replacements: Dict[str, List[int]] = { + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + "weight_addr_offset": [], + "dim_im_in_x_stride": [], + 
"dim_im_in_y_stride": [], + "dim_im_out_x_stride": [], + "dim_im_out_y_stride": [], + "input_addr_offset": [], + "nKo": [], + "nKi": [], + "nHo": [], + "nWo": [], + "bKo": [], + "bKi": [], + "bHo": [], + "bWo": [], + "bHi": [], + "bWi": [], + } + + replacementTypes = { + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t), + "weight_addr_offset": PointerClass(uint32_t), + "dim_im_in_x_stride": PointerClass(uint32_t), + "dim_im_in_y_stride": PointerClass(uint32_t), + "dim_im_out_x_stride": PointerClass(uint32_t), + "dim_im_out_y_stride": PointerClass(uint32_t), + "input_addr_offset": PointerClass(uint32_t), + "nKo": PointerClass(uint16_t), + "nKi": PointerClass(uint16_t), + "nHo": PointerClass(uint16_t), + "nWo": PointerClass(uint16_t), + "bKo": PointerClass(uint16_t), + "bKi": PointerClass(uint16_t), + "bHo": PointerClass(uint16_t), + "bWo": PointerClass(uint16_t), + "bHi": PointerClass(uint16_t), + "bWi": PointerClass(uint16_t), + } + + weightH = operatorRepresentation['dim_kernel_y'] + weightW = operatorRepresentation['dim_kernel_x'] + weightC = operatorRepresentation['ch_im_in'] + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for absoluteCube in absoluteOutputCubes: + cube = absoluteCube.rectangle + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inBSize, inHSize, inWSize, inCSize = InCube.dims + + 
dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, + operatorRepresentation["input_bits"]) + replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) + replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) + dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, + operatorRepresentation["output_bits"]) + replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) + replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) + + replacements['input_addr_offset'].append( + getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) + + nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DDenseConvTemplate.getCounters( + inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) + + replacements["nKo"].append(nKo) + replacements["nKi"].append(nKi) + replacements["nHo"].append(nHo) + replacements["nWo"].append(nWo) + replacements["bKo"].append(bKo) + replacements["bKi"].append(bKi) + replacements["bHo"].append(bHo) + replacements["bWo"].append(bWo) + replacements["bHi"].append(bHi) + replacements["bWi"].append(bWi) + + inputInCubes.append(InCube) + + _, _, _, absoluteCOffset = absoluteCube.absoluteOffset + weightShape = ctxt.lookup(varWeight).shape + WeightCube = HyperRectangle((absoluteCOffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + replacements['weight_addr_offset'].append(calculateRectangleOffset(WeightCube, ctxt.lookup(varWeight))) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a in inputInCubes: + inputLoadSchedule.append({"data_in": a}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule + + +class 
NeurekaWmemRQSDenseConv2DTileConstraint(NeurekaWmemDenseConv2DTileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = NeurekaWmemDenseConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + + outputBufferName = parseDict['data_out'] + mulBufferName = parseDict['mul'] + addBufferName = parseDict['add'] + + # Add I/O dimensions to the model as variables + for bufferName in [mulBufferName, addBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( + tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) + + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['mul', 'add'] + inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, + addrNames) + newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} + + inputRequantCubes = [] + for cube in outputCubes: + (_, _, _, COffset) = cube.offset + (_, _, _, CSize) = cube.dims + inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + newInputLoadSchedule = [{ + **schedule, "add": requant, + 
"mul": requant + } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + + newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, + tilingSchedule.outputLoadSchedule) + + return variableReplacementSchedule, newTilingSchedule diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py new file mode 100644 index 0000000..6364afc --- /dev/null +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py @@ -0,0 +1,557 @@ +# ---------------------------------------------------------------------- +# +# File: NeurekaDepthwiseConstraint.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t, uint32_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Targets.Neureka.Templates.ConvTemplate import Neureka2DDWConvTemplate, getInputAddrOffset, \ + ioStridesFromDimensions +from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme, calculateRectangleOffset + + +class NeurekaDWConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = 
weightBufferName, dimIdx = 1) + weightBitsVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + tilerModel.addConstraint(outputChannelVar == inputChannelVar) # Output Channel + + tilerModel.addConstraint(inputHeightVar >= 3) + tilerModel.addConstraint(inputWidthVar >= 3) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (3 - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (3 - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = 
inputBuffer.name, dimIdx = 3) + + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + tilerModel.addConstraint(inputHeightVar == inputHeightVar.Max(), strategy = PerformanceHint(1)) + tilerModel.addConstraint(inputWidthVar == inputWidthVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + inputWeightCubes = [] + replacements: Dict[str, List[int]] = { + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + "dim_im_in_x_stride": [], + "dim_im_in_y_stride": [], + "dim_im_out_x_stride": [], + "dim_im_out_y_stride": [], + "input_addr_offset": [], + "nKo": [], + "nKi": [], + "nHo": [], + "nWo": [], + "bKo": [], + "bKi": [], + "bHo": [], + "bWo": [], + "bHi": [], + "bWi": [], + } + + replacementTypes = { + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": 
PointerClass(uint8_t), + "dim_im_in_x_stride": PointerClass(uint32_t), + "dim_im_in_y_stride": PointerClass(uint32_t), + "dim_im_out_x_stride": PointerClass(uint32_t), + "dim_im_out_y_stride": PointerClass(uint32_t), + "input_addr_offset": PointerClass(uint32_t), + "nKo": PointerClass(uint16_t), + "nKi": PointerClass(uint16_t), + "nHo": PointerClass(uint16_t), + "nWo": PointerClass(uint16_t), + "bKo": PointerClass(uint16_t), + "bKi": PointerClass(uint16_t), + "bHo": PointerClass(uint16_t), + "bWo": PointerClass(uint16_t), + "bHi": PointerClass(uint16_t), + "bWi": PointerClass(uint16_t), + } + + weightH = operatorRepresentation['dim_kernel_y'] + weightW = operatorRepresentation['dim_kernel_x'] + weightC = operatorRepresentation['ch_im_in'] + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inBSize, inHSize, inWSize, inCSize = InCube.dims + + dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, + operatorRepresentation["input_bits"]) + replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) + replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) + dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, + operatorRepresentation["output_bits"]) + replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) + replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) + + 
replacements['input_addr_offset'].append( + getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) + + nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DDWConvTemplate.getCounters( + inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) + + replacements["nKo"].append(nKo) + replacements["nKi"].append(nKi) + replacements["nHo"].append(nHo) + replacements["nWo"].append(nWo) + replacements["bKo"].append(bKo) + replacements["bKi"].append(bKi) + replacements["bHo"].append(bHo) + replacements["bWo"].append(bWo) + replacements["bHi"].append(bHi) + replacements["bWi"].append(bWi) + + inputInCubes.append(InCube) + + weightShape = ctxt.lookup(varWeight).shape + WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + + inputWeightCubes.append(WeightCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule + + +class NeurekaRQSDWConv2DTileConstraint(NeurekaDWConv2DTileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = NeurekaDWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + + outputBufferName = parseDict['data_out'] + mulBufferName = parseDict['mul'] + addBufferName = parseDict['add'] + + # Add I/O dimensions to the model as variables + for bufferName in [mulBufferName, addBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, 
dimIdx = 3) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( + tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) + + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['mul', 'add'] + inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, + addrNames) + newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} + + inputRequantCubes = [] + for cube in outputCubes: + (_, _, _, COffset) = cube.offset + (_, _, _, CSize) = cube.dims + inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + newInputLoadSchedule = [{ + **schedule, "add": requant, + "mul": requant + } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + + newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, + tilingSchedule.outputLoadSchedule) + + return variableReplacementSchedule, newTilingSchedule + + +class NeurekaWmemDWConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] 
+ padding = parseDict["pads"] + dilation = parseDict["dilations"] + + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + tilerModel.addConstraint(outputChannelVar == inputChannelVar) + + tilerModel.addConstraint(inputHeightVar >= 3) + tilerModel.addConstraint(inputWidthVar >= 3) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (3 - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (3 - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + inputBatchVar 
= tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + tilerModel.addConstraint(inputHeightVar == inputHeightVar.Max(), strategy = PerformanceHint(1)) + tilerModel.addConstraint(inputWidthVar == inputWidthVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + replacements: Dict[str, List[int]] = { + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + "weight_addr_offset": [], + "dim_im_in_x_stride": [], + "dim_im_in_y_stride": [], + 
"dim_im_out_x_stride": [], + "dim_im_out_y_stride": [], + "input_addr_offset": [], + "nKo": [], + "nKi": [], + "nHo": [], + "nWo": [], + "bKo": [], + "bKi": [], + "bHo": [], + "bWo": [], + "bHi": [], + "bWi": [], + } + + replacementTypes = { + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t), + "weight_addr_offset": PointerClass(uint32_t), + "dim_im_in_x_stride": PointerClass(uint32_t), + "dim_im_in_y_stride": PointerClass(uint32_t), + "dim_im_out_x_stride": PointerClass(uint32_t), + "dim_im_out_y_stride": PointerClass(uint32_t), + "input_addr_offset": PointerClass(uint32_t), + "nKo": PointerClass(uint16_t), + "nKi": PointerClass(uint16_t), + "nHo": PointerClass(uint16_t), + "nWo": PointerClass(uint16_t), + "bKo": PointerClass(uint16_t), + "bKi": PointerClass(uint16_t), + "bHo": PointerClass(uint16_t), + "bWo": PointerClass(uint16_t), + "bHi": PointerClass(uint16_t), + "bWi": PointerClass(uint16_t), + } + + weightH = operatorRepresentation['dim_kernel_y'] + weightW = operatorRepresentation['dim_kernel_x'] + weightC = operatorRepresentation['ch_im_in'] + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for absoluteCube in absoluteOutputCubes: + cube = absoluteCube.rectangle + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inBSize, inHSize, inWSize, inCSize = InCube.dims + + dim_im_in_x_stride, dim_im_in_y_stride 
= ioStridesFromDimensions(inWSize, inCSize, + operatorRepresentation["input_bits"]) + replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) + replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) + dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, + operatorRepresentation["output_bits"]) + replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) + replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) + + replacements['input_addr_offset'].append( + getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) + + nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DDWConvTemplate.getCounters( + inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) + + replacements["nKo"].append(nKo) + replacements["nKi"].append(nKi) + replacements["nHo"].append(nHo) + replacements["nWo"].append(nWo) + replacements["bKo"].append(bKo) + replacements["bKi"].append(bKi) + replacements["bHo"].append(bHo) + replacements["bWo"].append(bWo) + replacements["bHi"].append(bHi) + replacements["bWi"].append(bWi) + + inputInCubes.append(InCube) + + _, _, _, absoluteCOffset = absoluteCube.absoluteOffset + weightShape = ctxt.lookup(varWeight).shape + WeightCube = HyperRectangle((absoluteCOffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + replacements['weight_addr_offset'].append(calculateRectangleOffset(WeightCube, ctxt.lookup(varWeight))) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a in inputInCubes: + inputLoadSchedule.append({"data_in": a}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule + + +class NeurekaWmemRQSDWConv2DTileConstraint(NeurekaWmemDWConv2DTileConstraint): + + 
@staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = NeurekaWmemDWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + + outputBufferName = parseDict['data_out'] + mulBufferName = parseDict['mul'] + addBufferName = parseDict['add'] + + # Add I/O dimensions to the model as variables + for bufferName in [mulBufferName, addBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( + tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) + + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['mul', 'add'] + inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, + addrNames) + newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} + + inputRequantCubes = [] + for cube in outputCubes: + (_, _, _, COffset) = cube.offset + (_, _, _, CSize) = cube.dims + inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + newInputLoadSchedule = [{ + **schedule, "add": requant, + "mul": requant + } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, 
inputRequantCubes)] + + newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, + tilingSchedule.outputLoadSchedule) + + return variableReplacementSchedule, newTilingSchedule diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaPointwiseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaPointwiseConstraint.py new file mode 100644 index 0000000..303cc6a --- /dev/null +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaPointwiseConstraint.py @@ -0,0 +1,606 @@ +# ---------------------------------------------------------------------- +# +# File: NeurekaPointwiseConstraint.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t, uint32_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Targets.Neureka.Templates.ConvTemplate import Neureka2DPWConvTemplate, getInputAddrOffset, \ + ioStridesFromDimensions +from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme, calculateRectangleOffset + + +class NeurekaPWConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = 
weightBufferName, dimIdx = 2) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + tilerModel.addConstraint(outputHeightVar == inputHeightVar) + tilerModel.addConstraint(outputWidthVar == inputWidthVar) + + tilerModel.addConstraint(inputHeightVar >= 1) + tilerModel.addConstraint(inputWidthVar >= 1) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) + weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) + weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2) + + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + + 
strides = parseDict["strides"] + padding = parseDict["pads"] + + # LMACAN: Force full input channel to avoid partial results + tilerModel.addConstraint(inputChannelVar == inputChannelVar.Max()) + tilerModel.addConstraint(weightInChannelMajorVar == weightInChannelMajorVar.Max()) + tilerModel.addConstraint(weightBandwidthVar == weightBandwidthVar.Max()) + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + # N-EUREKA tile constraints to align with N-EUREKA's hardware subtiling + if parseDict["dim_im_out_x"] > 6: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "dim_im_out_x", + outputHeightVar, + 6, + strategy = PerformanceHint(priority = 3)) + else: + tilerModel.addConstraint(outputHeightVar == outputHeightVar.Max(), strategy = PerformanceHint(priority = 3)) + + if parseDict["dim_im_out_y"] > 6: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "dim_im_out_y", + outputWidthVar, + 6, + strategy = PerformanceHint(priority = 2)) + else: + tilerModel.addConstraint(outputWidthVar == outputWidthVar.Max(), strategy = PerformanceHint(priority = 2)) + + if parseDict["ch_im_out"] > 32: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "ch_im_out", + outputChannelVar, + 32, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(outputChannelVar == outputChannelVar.Max(), + strategy = PerformanceHint(priority = 1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, 
addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + inputWeightCubes = [] + replacements: Dict[str, List[int]] = { + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + "dim_im_in_x_stride": [], + "dim_im_in_y_stride": [], + "dim_im_out_x_stride": [], + "dim_im_out_y_stride": [], + "input_addr_offset": [], + "nKo": [], + "nKi": [], + "nHo": [], + "nWo": [], + "bKo": [], + "bKi": [], + "bHo": [], + "bWo": [], + "bHi": [], + "bWi": [], + } + + replacementTypes = { + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t), + "dim_im_in_x_stride": PointerClass(uint32_t), + "dim_im_in_y_stride": PointerClass(uint32_t), + "dim_im_out_x_stride": PointerClass(uint32_t), + "dim_im_out_y_stride": PointerClass(uint32_t), + "input_addr_offset": PointerClass(uint32_t), + "nKo": PointerClass(uint16_t), + "nKi": PointerClass(uint16_t), + "nHo": PointerClass(uint16_t), + "nWo": PointerClass(uint16_t), + "bKo": PointerClass(uint16_t), + "bKi": PointerClass(uint16_t), + "bHo": PointerClass(uint16_t), + "bWo": PointerClass(uint16_t), + "bHi": PointerClass(uint16_t), + "bWi": PointerClass(uint16_t), + } + + weightH = operatorRepresentation['dim_kernel_y'] + weightW = operatorRepresentation['dim_kernel_x'] + weightC = operatorRepresentation['ch_im_in'] + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['padding_y_top'].append(padding_top) + 
replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inBSize, inHSize, inWSize, inCSize = InCube.dims + + dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, + operatorRepresentation["input_bits"]) + replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) + replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) + dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, + operatorRepresentation["output_bits"]) + replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) + replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) + + replacements['input_addr_offset'].append( + getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) + + nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DPWConvTemplate.getCounters( + inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) + + replacements["nKo"].append(nKo) + replacements["nKi"].append(nKi) + replacements["nHo"].append(nHo) + replacements["nWo"].append(nWo) + replacements["bKo"].append(bKo) + replacements["bKi"].append(bKi) + replacements["bHo"].append(bHo) + replacements["bWo"].append(bWo) + replacements["bHi"].append(bHi) + replacements["bWi"].append(bWi) + + inputInCubes.append(InCube) + + weightShape = ctxt.lookup(varWeight).shape + WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + + inputWeightCubes.append(WeightCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = 
VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule + + +class NeurekaRQSPWConv2DTileConstraint(NeurekaPWConv2DTileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = NeurekaPWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + + outputBufferName = parseDict['data_out'] + mulBufferName = parseDict['mul'] + addBufferName = parseDict['add'] + + # Add I/O dimensions to the model as variables + for bufferName in [mulBufferName, addBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( + tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) + + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['mul', 'add'] + inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, + addrNames) + newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} + + inputRequantCubes = [] + for cube in outputCubes: + (_, _, _, COffset) = cube.offset + (_, _, _, CSize) = cube.dims + 
inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + newInputLoadSchedule = [{ + **schedule, "add": requant, + "mul": requant + } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + + newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, + tilingSchedule.outputLoadSchedule) + + return variableReplacementSchedule, newTilingSchedule + + +class NeurekaWmemPWConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + tilerModel.addConstraint(outputHeightVar == inputHeightVar) + tilerModel.addConstraint(outputWidthVar == inputWidthVar) + + # Don't tile weights in weight memory + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max()) + + tilerModel.addConstraint(inputHeightVar >= 1) + tilerModel.addConstraint(inputWidthVar >= 1) + + return tilerModel + + @staticmethod + def 
addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) + weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) + weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2) + + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + # LMACAN: Force full input channel to avoid partial results + tilerModel.addConstraint(inputChannelVar == inputChannelVar.Max()) + tilerModel.addConstraint(weightInChannelMajorVar == weightInChannelMajorVar.Max()) + tilerModel.addConstraint(weightBandwidthVar == weightBandwidthVar.Max()) + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + # N-EUREKA tile constraints to align with N-EUREKA's hardware subtiling + if parseDict["dim_im_out_x"] > 6: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "dim_im_out_x", + outputHeightVar, + 6, + strategy = PerformanceHint(priority = 3)) + else: + tilerModel.addConstraint(outputHeightVar == outputHeightVar.Max(), strategy = PerformanceHint(priority = 
3)) + + if parseDict["dim_im_out_y"] > 6: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "dim_im_out_y", + outputWidthVar, + 6, + strategy = PerformanceHint(priority = 2)) + else: + tilerModel.addConstraint(outputWidthVar == outputWidthVar.Max(), strategy = PerformanceHint(priority = 2)) + + if parseDict["ch_im_out"] > 32: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "ch_im_out", + outputChannelVar, + 32, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(outputChannelVar == outputChannelVar.Max(), + strategy = PerformanceHint(priority = 1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + replacements: Dict[str, List[int]] = { + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + "weight_addr_offset": [], + "dim_im_in_x_stride": [], + "dim_im_in_y_stride": [], + "dim_im_out_x_stride": [], + "dim_im_out_y_stride": [], + "input_addr_offset": [], + "nKo": [], + "nKi": [], + "nHo": [], + "nWo": [], + "bKo": [], + "bKi": [], + "bHo": [], + "bWo": [], + "bHi": [], + "bWi": [], + } + + replacementTypes = { + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t), + "weight_addr_offset": PointerClass(uint32_t), + "dim_im_in_x_stride": PointerClass(uint32_t), 
+ "dim_im_in_y_stride": PointerClass(uint32_t), + "dim_im_out_x_stride": PointerClass(uint32_t), + "dim_im_out_y_stride": PointerClass(uint32_t), + "input_addr_offset": PointerClass(uint32_t), + "nKo": PointerClass(uint16_t), + "nKi": PointerClass(uint16_t), + "nHo": PointerClass(uint16_t), + "nWo": PointerClass(uint16_t), + "bKo": PointerClass(uint16_t), + "bKi": PointerClass(uint16_t), + "bHo": PointerClass(uint16_t), + "bWo": PointerClass(uint16_t), + "bHi": PointerClass(uint16_t), + "bWi": PointerClass(uint16_t), + } + + weightH = operatorRepresentation['dim_kernel_y'] + weightW = operatorRepresentation['dim_kernel_x'] + weightC = operatorRepresentation['ch_im_in'] + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for absoluteCube in absoluteOutputCubes: + cube = absoluteCube.rectangle + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inBSize, inHSize, inWSize, inCSize = InCube.dims + + dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, + operatorRepresentation["input_bits"]) + replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) + replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) + dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, + operatorRepresentation["output_bits"]) + replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) + replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) + + 
replacements['input_addr_offset'].append( + getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) + + nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DPWConvTemplate.getCounters( + inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) + + replacements["nKo"].append(nKo) + replacements["nKi"].append(nKi) + replacements["nHo"].append(nHo) + replacements["nWo"].append(nWo) + replacements["bKo"].append(bKo) + replacements["bKi"].append(bKi) + replacements["bHo"].append(bHo) + replacements["bWo"].append(bWo) + replacements["bHi"].append(bHi) + replacements["bWi"].append(bWi) + + inputInCubes.append(InCube) + + _, _, _, absoluteCOffset = absoluteCube.absoluteOffset + weightShape = ctxt.lookup(varWeight).shape + WeightCube = HyperRectangle((absoluteCOffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + replacements['weight_addr_offset'].append(calculateRectangleOffset(WeightCube, ctxt.lookup(varWeight))) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a in inputInCubes: + inputLoadSchedule.append({"data_in": a}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule + + +class NeurekaWmemRQSPWConv2DTileConstraint(NeurekaWmemPWConv2DTileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = NeurekaWmemPWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + + outputBufferName = parseDict['data_out'] + mulBufferName = parseDict['mul'] + addBufferName = parseDict['add'] + + # Add I/O dimensions to the model as variables + for bufferName in [mulBufferName, addBufferName]: + 
tilerModel.addTensorDimToModel(ctxt, bufferName) + + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( + tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) + + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['mul', 'add'] + inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, + addrNames) + newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} + + inputRequantCubes = [] + for cube in outputCubes: + (_, _, _, COffset) = cube.offset + (_, _, _, CSize) = cube.dims + inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + newInputLoadSchedule = [{ + **schedule, "add": requant, + "mul": requant + } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + + newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, + tilingSchedule.outputLoadSchedule) + + return variableReplacementSchedule, newTilingSchedule diff --git a/Deeploy/Targets/Neureka/TileConstraints/__init__.py b/Deeploy/Targets/Neureka/TileConstraints/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ 
b/Deeploy/Targets/Neureka/TileConstraints/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Neureka/Tiler.py b/Deeploy/Targets/Neureka/Tiler.py new file mode 100644 index 0000000..e4f07c1 --- /dev/null +++ b/Deeploy/Targets/Neureka/Tiler.py @@ -0,0 +1,68 @@ +# ---------------------------------------------------------------------- +# +# File: Tiler.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from Deeploy.Targets.Neureka.Bindings import NeurekaDenseConv2DBindings, NeurekaDWConv2DBindings, \ + NeurekaPWConv2DBindings, NeurekaRQSDenseConv2DBindings, NeurekaRQSDWConv2DBindings, NeurekaRQSPWConv2DBindings, \ + NeurekaWmemDenseConv2DBindings, NeurekaWmemDWConv2DBindings, NeurekaWmemPWConv2DBindings, \ + NeurekaWmemRQSDenseConv2DBindings, NeurekaWmemRQSDWConv2DBindings, NeurekaWmemRQSPWConv2DBindings +from Deeploy.Targets.Neureka.TileConstraints.NeurekaDenseConstraint import NeurekaDenseConv2DTileConstraint, \ + NeurekaRQSDenseConv2DTileConstraint, NeurekaWmemDenseConv2DTileConstraint, \ + NeurekaWmemRQSDenseConv2DTileConstraint +from Deeploy.Targets.Neureka.TileConstraints.NeurekaDepthwiseConstraint import NeurekaDWConv2DTileConstraint, \ + NeurekaRQSDWConv2DTileConstraint, NeurekaWmemDWConv2DTileConstraint, NeurekaWmemRQSDWConv2DTileConstraint +from Deeploy.Targets.Neureka.TileConstraints.NeurekaPointwiseConstraint import NeurekaPWConv2DTileConstraint, \ + NeurekaRQSPWConv2DTileConstraint, NeurekaWmemPWConv2DTileConstraint, NeurekaWmemRQSPWConv2DTileConstraint +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + +NeurekaRQSPWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaRQSPWConv2DBindings, + tileConstraint = NeurekaRQSPWConv2DTileConstraint()) +NeurekaPWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaPWConv2DBindings, + tileConstraint = NeurekaPWConv2DTileConstraint()) + +NeurekaWmemRQSPWConv2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = NeurekaWmemRQSPWConv2DBindings, tileConstraint = NeurekaWmemRQSPWConv2DTileConstraint()) +NeurekaWmemPWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaWmemPWConv2DBindings, + tileConstraint = NeurekaWmemPWConv2DTileConstraint()) + +NeurekaRQSDWConv2DTilingReadyBindings = 
TilingReadyNodeBindings(nodeBindings = NeurekaRQSDWConv2DBindings, + tileConstraint = NeurekaRQSDWConv2DTileConstraint()) +NeurekaDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaDWConv2DBindings, + tileConstraint = NeurekaDWConv2DTileConstraint()) + +NeurekaWmemRQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = NeurekaWmemRQSDWConv2DBindings, tileConstraint = NeurekaWmemRQSDWConv2DTileConstraint()) +NeurekaWmemDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaWmemDWConv2DBindings, + tileConstraint = NeurekaWmemDWConv2DTileConstraint()) + +NeurekaRQSDenseConv2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = NeurekaRQSDenseConv2DBindings, tileConstraint = NeurekaRQSDenseConv2DTileConstraint()) +NeurekaDenseConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaDenseConv2DBindings, + tileConstraint = NeurekaDenseConv2DTileConstraint()) + +NeurekaWmemRQSDenseConv2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = NeurekaWmemRQSDenseConv2DBindings, tileConstraint = NeurekaWmemRQSDenseConv2DTileConstraint()) +NeurekaWmemDenseConv2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = NeurekaWmemDenseConv2DBindings, tileConstraint = NeurekaWmemDenseConv2DTileConstraint()) diff --git a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py new file mode 100644 index 0000000..f3fdeaf --- /dev/null +++ b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py @@ -0,0 +1,333 @@ +# ---------------------------------------------------------------------- +# +# File: Passes.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
def _weightEncode(weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False) -> npt.NDArray[np.uint8]:
    """Unroll a convolution weight tensor into N-EUREKA's expected memory format.

    Expected weight shape is (cout, cin, H, W).
    The produced memory layout depends on the weight kernel shape:
    - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits),
    - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits),
    where cinMajor = ceil(cin / cin_subtile) and cinMinor is cin zero-padded up to
    a multiple of the cin subtile.
    """
    _NEUREKA_WEIGHT_BANDWIDTH = 256
    _NEUREKA_CIN_SUBTILE_1x1 = 32
    _NEUREKA_CIN_SUBTILE_3x3 = 28

    if depthwise:
        weight = weight.transpose(1, 0, 2, 3)  # Swap cout and cin

    cout, cin, height, width = weight.shape
    cinSubtile = (_NEUREKA_CIN_SUBTILE_3x3 if height == 3 else _NEUREKA_CIN_SUBTILE_1x1)

    # Pad cin to be divisible with CIN_SUBTILE
    if cin % cinSubtile != 0:
        cinPad = cinSubtile - cin % cinSubtile
        weight = np.pad(
            weight,
            ((0, 0), (0, cinPad), (0, 0), (0, 0)),
            "constant",
            constant_values = 0,
        )

    # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1)
    # The trailing 1 is required by the bit unpacking below
    cinMajor = int(np.ceil(cin / cinSubtile))
    weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1)

    # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
    # (cout, cinMajor, cinSubtile, Flattened spatial, Bits)
    weight = np.unpackbits(weight, axis = -1, count = bits, bitorder = "little")

    # Shuffle bits so that the final shape is:
    # (cout, cinMajor, Bits, Flattened spatial, cinSubtile)
    weight = weight.transpose(0, 1, 4, 3, 2)

    # Pack dimensions to fit into weight bandwidth
    if height == 3 and width == 3:
        # (cout * cinMajor * Bits, H * W * cinSubtile)
        weight = weight.reshape(-1, height * width * cinSubtile)
        # Pad only the last dimension to weight bandwidth size
        # (-1, Weight Bandwidth)
        weight = np.pad(
            weight,
            ((0, 0), (0, _NEUREKA_WEIGHT_BANDWIDTH - weight.shape[-1])),
            "constant",
            constant_values = 0,
        )
    elif height == 1 and width == 1:
        # Tile cinSubtile into tiles of size 4
        # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
        weight = weight.reshape(cout, cinMajor, bits, height * width, cinSubtile // 4,
                                4)  # cout, cinMajor, bits, 1, 8, 4
        # Pad bits to 8
        if bits < 8:
            # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
            weight = np.pad(
                weight,
                ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
                mode = "constant",
                constant_values = 0,
            )
        # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
        weight = weight.transpose(0, 1, 3, 4, 2, 5)
        # (-1, Weight Bandwidth)
        weight = weight.reshape(cout * cinMajor, _NEUREKA_WEIGHT_BANDWIDTH)  # cout*cinMajor, 256b

    # Prepare for packing
    # (-1, Weight Bandwidth Bytes, 8)
    weightBandwidthBytes = int(np.ceil(_NEUREKA_WEIGHT_BANDWIDTH / 8))
    weight = np.stack(np.split(weight, weightBandwidthBytes, axis = -1), axis = -2)

    # Pack bits
    # (-1, Weight Bandwidth Bytes)
    weight = np.packbits(weight, axis = -1, bitorder = "little")

    if height == 1 and width == 1:
        # (cout, cinMajor, Weight Bandwidth Bytes)
        return weight.reshape(cout, cinMajor, weightBandwidthBytes)
    # 3x3 case. Fix: the previous `elif depthwise:` and `else:` branches returned
    # the exact same expression, so the dead duplicate branch is merged away.
    # (cout, cinMajor, Bits, Weight Bandwidth Bytes)
    return weight.reshape(cout, cinMajor, bits, weightBandwidthBytes)


def _neureka_adjust_weight_memory_layout_fun(graph: "gs.Graph", match: "Match", name: str,
                                             default_channels_first: bool, neurekaEngineName: str):
    """Re-encode a matched Conv/RequantizedConv constant weight into N-EUREKA's layout.

    No-op unless the node is colored for the N-EUREKA engine and its weight input
    is a constant. Stores the extracted weight offset in node.attrs["weight_offset"]
    and replaces the weight values in place with the hardware memory layout.
    """
    matched_nodes = list(match.nodes_map.values())
    node = matched_nodes[0]

    if not ("engine" in node.attrs and node.attrs["engine"] == neurekaEngineName):
        return graph

    weightTensor = node.inputs[1]

    if not isinstance(weightTensor, gs.Constant):
        return graph

    # Adjust N-EUREKA's weights
    values = weightTensor.values

    # Extract weight offset and translate weights by the offset so they are non-negative
    weight_offset = values.min()
    values = values - weight_offset
    node.attrs["weight_offset"] = weight_offset

    if "channels_first" in node.attrs:
        channels_first = node.attrs["channels_first"]
    else:
        channels_first = default_channels_first

    # Weight encode expects channels first
    if not channels_first:
        values = values.transpose(0, 3, 1, 2)

    bits = 8  # Support only 8 bit weights for now
    if node.attrs['group'] == 1:
        weightTensor.values = _weightEncode(values.astype(np.uint8), bits, depthwise = False)
    else:
        # group != 1 is treated as depthwise (cin/cout swapped inside the encoder)
        weightTensor.values = _weightEncode(values.astype(np.uint8), bits, depthwise = True)
    weightTensor.name = f"{name}_{weightTensor.name}"

    return graph
def _findAllMultiplicands(x: int) -> List[int]:
    """Return the prime factorization of `x`, factors in ascending order.

    Fix: the previous loop bound `math.ceil(math.sqrt(x))` excluded the integer
    square root itself, so perfect squares were returned unfactored (e.g.
    9 -> [9] instead of [3, 3], 25 -> [25]), which hid the optimal square
    reshape options downstream. Use `math.isqrt(x) + 1` and keep the running
    cofactor in integer arithmetic (the old code used float division).
    """
    multiplicands: List[int] = []
    remainder = x
    for candidate in range(2, math.isqrt(x) + 1):  # isqrt(x) itself must be included
        while remainder % candidate == 0:
            multiplicands.append(candidate)
            remainder //= candidate

    # Whatever is left (> 1) is a prime factor larger than sqrt(x)
    if remainder > 1:
        multiplicands.append(remainder)

    return multiplicands


def _findAllReshapeOptions(dim: int) -> Generator[Tuple[int, int], None, None]:
    """Yield factor pairs (a, b) with a * b == dim, built from subsets of dim's prime factors."""
    multiplicands = _findAllMultiplicands(dim)
    for combLen in range(1, 1 + (len(multiplicands) // 2)):
        for comb in itertools.combinations(multiplicands, combLen):
            a = math.prod(comb)
            b = dim // a
            yield a, b


def _nSubtiles(dims: Tuple[int, int]) -> int:
    """Number of 6x6 hardware subtiles needed to cover a (H, W) spatial extent."""
    return math.ceil(dims[0] / 6) * math.ceil(dims[1] / 6)


def _findLowestNumberOfSubtilesReshapeOptions(dim: int) -> List[Tuple[int, int]]:
    """Return all factorizations of `dim` that minimize the 6x6 subtile count.

    (dim, 1) is the fallback when `dim` is prime or has no better split.
    """
    lowestNumberOfSubtiles = dim
    bestOptions: List[Tuple[int, int]] = [(dim, 1)]
    for option in _findAllReshapeOptions(dim):
        nSubtiles = _nSubtiles(option)
        if nSubtiles < lowestNumberOfSubtiles:
            lowestNumberOfSubtiles = nSubtiles
            bestOptions = [option]
        elif nSubtiles == lowestNumberOfSubtiles:
            bestOptions.append(option)
    return bestOptions


def _bestReshapeOption(dim: int) -> Tuple[int, int]:
    """Pick the most square-ish of the optimal factorizations, returned as (bigger, smaller)."""
    smallestDim = dim
    biggestDim = 1
    for option in _findLowestNumberOfSubtilesReshapeOptions(dim):
        if option[0] < smallestDim:
            smallestDim = option[0]
            biggestDim = option[1]
        elif option[1] < smallestDim:
            smallestDim = option[1]
            biggestDim = option[0]
    return biggestDim, smallestDim


def _neureka_reshape_pointwise_convolution_fun(graph: "gs.Graph", match: "Match", name: str,
                                               default_channels_first: bool, neurekaEngineName: str):
    """Insert reshapes around a 1x1 convolution so its flattened spatial extent
    factors into a shape that minimizes N-EUREKA's 6x6 subtile count.

    No-op unless the node is colored for the N-EUREKA engine and has a 1x1 kernel.
    """
    matched_nodes = list(match.nodes_map.values())
    node = matched_nodes[0]

    if not ("engine" in node.attrs and node.attrs["engine"] == neurekaEngineName):
        return graph

    if not (node.attrs["kernel_shape"] == [1, 1]):
        return graph

    if "channels_first" in node.attrs:
        channels_first = node.attrs["channels_first"]
    else:
        channels_first = default_channels_first

    def extractSpatialDims(shape: List[int]) -> List[int]:
        # Spatial dims are the trailing two (channels-first) or the two before the channel dim
        if channels_first:
            return shape[-2:]
        else:
            return shape[-3:-1]

    def replaceSpatialDims(shape: List[int], newSpatialDims: Tuple[int, int]) -> List[int]:
        if channels_first:
            return shape[:-2] + list(newSpatialDims)
        else:
            return shape[:-3] + list(newSpatialDims) + shape[-1:]

    _input = node.inputs[0]
    spatialDims = extractSpatialDims(_input.shape)
    newSpatialDims = _bestReshapeOption(math.prod(spatialDims))
    newInputShape = replaceSpatialDims(_input.shape, newSpatialDims)

    inputReshapeNode, reshapedInput = _createReshape(_input, name, newInputShape)
    graph.nodes.append(inputReshapeNode)
    node.inputs[0] = reshapedInput

    output = node.outputs[0]
    newOutputShape = replaceSpatialDims(output.shape, newSpatialDims)
    reshapedOutput = gs.Variable(output.name + "_Reshaped", dtype = output.dtype, shape = newOutputShape)
    outputReshapeNode, _ = _createReshape(reshapedOutput, name, output.shape, output)
    graph.nodes.append(outputReshapeNode)
    node.outputs[0] = reshapedOutput

    return graph
True)) + + +class ConvEngineDiscolorationPass(EngineDiscolorationPass): + + def __init__(self): + pattern = gs.Graph() + _input = gs.Variable(name = 'input') + output = pattern.layer(inputs = [_input], outputs = ['output'], op = 'RequantizedConv|Conv', name = 'conv') + pattern.outputs.append(output) + pattern.inputs.append(_input) + super().__init__(pattern, "_CONV_ENGINE_DISCOLORATION_PASS", matcher = NonBranchingMatcher(regex_op = True)) + + +@contextagnostic +class NeurekaOptimizationPass(SequentialPass): + + def __init__(self, default_channels_first: bool, neurekaEngineName: str): + super().__init__(NeurekaAdjustWeightMemoryLayoutPass(default_channels_first, neurekaEngineName), + NeurekaReshapePointwiseConvolutionPass(default_channels_first, neurekaEngineName), + ReshapeMergePass(), + ReshapeConstOptPass(), + RemoveGlobalOutputReshapePass(), + name_prefix = '') diff --git a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import *
diff --git a/Deeploy/Targets/Neureka/__init__.py b/Deeploy/Targets/Neureka/__init__.py
new file mode 100644
index 0000000..b50445f
--- /dev/null
+++ b/Deeploy/Targets/Neureka/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 10.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import *
diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
new file mode 100644
index 0000000..5d23620
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -0,0 +1,309 @@
+# ----------------------------------------------------------------------
+#
+# File: PULPBindings.py
+#
+# Last edited: 10.03.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Authors:
+# - Moritz Scherer, ETH Zurich
+# - Victor Jung, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from functools import partial + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ + MemoryManagementGeneration +from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, int8_t, int32_t, uint8_t +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate +from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding +from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.Targets.Generic.Templates import ConcatTemplate, RQSiGELUTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, GELUChecker, HardswishChecker, MatMulChecker, \ + MulChecker, ReduceMeanChecker, RQHardswishChecker, SliceChecker, SoftmaxChecker, TransposeChecker, \ + iLayerNormChecker +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling +from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \ + MulTemplate, ReduceMeanTemplate, 
RequantShiftTemplate, RQAddTemplate, RQSiHardswishTemplate, SliceTemplate, \ + TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ + PULPRequantShiftChecker, PULPRQAddChecker +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement + +_clusterEntryClosureCallTemplate = NodeTemplate(""" +// ${closureName} CLOSURE CALL +static struct pi_cluster_task cluster_task; + +pi_cluster_task(&cluster_task, ${closureName}, &${closureStructArgName}); +cluster_task.stack_size = 5000; +cluster_task.slave_stack_size = 3800; +pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); +//pi_cluster_close(&cluster_dev); +""") + +_clusterForkClosureCallTemplate = NodeTemplate(""" +pi_cl_team_fork(NUM_CORES, (void*)${closureName}, &${closureStructArgName}); +""") + +FunctionCallClosure = partial(ClosureGeneration, closureSuffix = "_closure") +ClusterClosure = partial(ClosureGeneration, + closureSuffix = "_cluster_entry", + closureCallTemplate = _clusterEntryClosureCallTemplate) +ForkClosure = partial(ClosureGeneration, + closureSuffix = "_cluster_fork", + closureCallTemplate = _clusterForkClosureCallTemplate) + +TilingCallClosure = partial(ClosureGeneration, closureSuffix = "_tiling_closure") +FunctionCallClosure = partial(ClosureGeneration, closureSuffix = "_closure") +ForkClosure = partial(ClosureGeneration, + closureSuffix = "_cluster_fork", + closureCallTemplate = _clusterForkClosureCallTemplate) + +MemoryAwareClusterClosure = partial(MemoryAwareClosureGeneration, + closureSuffix = "_cluster_entry", + closureCallTemplate = _clusterEntryClosureCallTemplate, + startRegion = "L2", + endRegion = "L1") +MemoryAwareFunctionCallClosure = partial(MemoryAwareClosureGeneration, + closureSuffix = "_closure", + startRegion = "L2", + endRegion = "L1") + +L3MemoryAwareFunctionCallClosure 
= partial(ClosureGeneration, closureSuffix = "_closure_L3") + +MemoryAwareForkTransformer = CodeTransformation([ + ArgumentStructGeneration(), + ForkClosure(generateStruct = False), + FutureGeneration(), + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + FunctionCallClosure(writeback = True), + MemoryManagementGeneration("L2"), + MemoryManagementGeneration() +]) + +ForkTransformer = CodeTransformation([ + TilingVariableReplacement("L1"), + TilingCallClosure(writeback = False), + PULPSynchCoresPass(), + ForkClosure(writeback = False, generateStruct = True), + PULPClusterTiling("L1"), + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + TilingVariableReplacement("L2"), + PULPL3Tiling("L2"), + ArgumentStructGeneration(), + L3MemoryAwareFunctionCallClosure(writeback = False), + MemoryManagementGeneration("L3.*"), + MemoryManagementGeneration("L2"), + MemoryManagementGeneration(), +]) + +ClusterTransformer = CodeTransformation([ + TilingVariableReplacement("L1"), + TilingCallClosure(writeback = False, generateStruct = True), + PULPClusterTiling("L1"), + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + TilingVariableReplacement("L2"), + PULPL3Tiling("L2"), + ArgumentStructGeneration(), + L3MemoryAwareFunctionCallClosure(writeback = False), + MemoryManagementGeneration("L2"), + MemoryManagementGeneration("L3.*"), + MemoryManagementGeneration(), +]) + +SimpleTransformer = CodeTransformation([ + MemoryManagementGeneration("L2"), + MemoryManagementGeneration("L3.*"), + MemoryManagementGeneration(), +]) + +PULPDMASliceBindings = [ + AutoFutureBinding( + SliceChecker([ + PointerClass(type), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t) + ], [PULPDMAFuture(underlyingType = type)]), SliceTemplate.referenceTemplate, 
MemoryAwareForkTransformer) + for type in IntegerDataTypes +] + +PULPRQAddBindings = [ + NodeBinding(PULPRQAddChecker([PointerClass(_type), PointerClass(_type2)], [PointerClass(_type3)]), + RQAddTemplate.RQAddTemplate, ForkTransformer) + for _type in [int8_t, uint8_t] + for _type2 in [int8_t, uint8_t] + for _type3 in [int8_t, uint8_t] +] + +PULPRQSConv2DBindings = [ + NodeBinding( + PULPConvChecker([ + PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(type2)]), ConvTemplate.PULPConv2D_8_Template, ForkTransformer) + for type1, type2 in zip([int8_t, int8_t, uint8_t, uint8_t], [int8_t, uint8_t, int8_t, uint8_t]) +] + +PULPRQSDWConv2DBindings = [ + NodeBinding( + PULPConvChecker([ + PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(type2)]), ConvTemplate.PULPDWConv2D_8_Template, ForkTransformer) + for type1, type2 in zip([int8_t, int8_t, uint8_t, uint8_t], [int8_t, uint8_t, int8_t, uint8_t]) +] + +PULPRQSGEMM_8_Binding = [ + NodeBinding( + PULPLinearChecker([PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(type2)]), GEMMTemplate.PULPGEMM_8_Template, + ForkTransformer) for type1, type2 in zip([int8_t, uint8_t, int8_t, uint8_t], [int8_t, uint8_t, uint8_t, int8_t]) +] + +PULPRQSMatrixVecBindings = [ + NodeBinding( + PULPLinearChecker([PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(type2)]), MatrixVectorTemplate.referenceTemplate, + ForkTransformer) for type1, type2 in zip([int8_t], [int8_t]) +] + +PULPRQSTallGEMMBindings = [ + NodeBinding( + PULPLinearChecker([PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(type2)]), TallGEMMTemplate.referenceTemplate, + ForkTransformer) for type1, type2 in zip([int8_t], [int8_t]) +] 
# --- PULPOpen kernel bindings (module level) ---
# Each *Bindings object pairs a type checker with a code template and a code
# transformer; lists enumerate one NodeBinding per supported input type.

# RQS-GEMM: single 8-bit binding re-exported under the plural naming scheme.
PULPRQSGEMMBindings = PULPRQSGEMM_8_Binding

# NOTE(review): the comprehensions below bind the name `type`, shadowing the
# builtin; left untouched since it is this file's pervasive convention.
PULPMaxPool2DBindings = [
    NodeBinding(PULPMaxPoolChecker([PointerClass(type)], [PointerClass(type)]),
                MaxPool2DTemplate.PULPMaxPool2D_8_Template, ForkTransformer) for type in [int8_t, uint8_t]
]

# 1D convolution: inputs (data, weights, mul, add) -> int8 output.
PULPConv1DBinding = NodeBinding(
    PULPConvChecker(
        [PointerClass(int8_t), PointerClass(int8_t),
         PointerClass(int32_t),
         PointerClass(int32_t)], [PointerClass(int8_t)]), ConvTemplate.PULPConv1D_8_Template, ForkTransformer)

# Depthwise variant of the 1D convolution binding above.
PULPDWConv1DBinding = NodeBinding(
    PULPConvChecker(
        [PointerClass(int8_t), PointerClass(int8_t),
         PointerClass(int32_t),
         PointerClass(int32_t)], [PointerClass(int8_t)]), ConvTemplate.PULPDWConv1D_8_Template, ForkTransformer)

# int8 x int8 -> int32 matrix multiplication, run on the cluster.
PULPMatMulBinding = NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
                               GEMMTemplate.PULPMM_8_Template, ClusterTransformer)

PULPReduceMeanBindings = [
    NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate,
                ClusterTransformer) for type in IntegerDataTypes
]

# Requant-shift with uniform (per-tensor) parameters; any integer input, int8 output.
PULPUniformRQSBindings = [
    NodeBinding(
        PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t),
                                 PointerClass(int32_t)], [PointerClass(int8_t)]),
        UniformRequantShiftTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes
]

# General requant-shift: one list per output signedness (int8 followed by uint8).
PULPRQSBindings = [
    NodeBinding(
        PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t),
                                 PointerClass(int32_t)], [PointerClass(int8_t)]),
        RequantShiftTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes
] + [
    NodeBinding(
        PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t),
                                 PointerClass(int32_t)], [PointerClass(uint8_t)]),
        RequantShiftTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes
]

# Integer softmax always produces an unsigned 8-bit output.
PULPSoftmaxBindings = [
    NodeBinding(SoftmaxChecker([PointerClass(_type)], [PointerClass(uint8_t)]), iSoftmaxTemplate.referenceTemplate,
                ForkTransformer) for _type in [int8_t, uint8_t]
]

PULPTransposeBindings = [
    NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate,
                ForkTransformer) for type in IntegerDataTypes
]

PULPConcatBindings = [
    NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
                ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes
]

# iRMSNorm: (data, weight) -> int8 output.
PULPiRMSNormBindings = [
    NodeBinding(iLayerNormChecker([PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int8_t)]),
                iRMSNormTemplate.referenceTemplate, ForkTransformer)
]

PULPiHardswishBindings = [
    NodeBinding(HardswishChecker([PointerClass(int8_t)], [PointerClass(int32_t)]), iHardswishTemplate.referenceTemplate,
                ClusterTransformer)
]
PULPRQSiHardswishBindings = [
    NodeBinding(
        RQHardswishChecker([PointerClass(int8_t),
                            PointerClass(int32_t),
                            PointerClass(int32_t),
                            PointerClass(int32_t)], [PointerClass(int8_t)]), RQSiHardswishTemplate.referenceTemplate,
        ForkTransformer)
]

PULPiRQSGELUBindings = [
    NodeBinding(
        GELUChecker([PointerClass(int8_t),
                     PointerClass(int32_t),
                     PointerClass(int32_t),
                     PointerClass(int32_t)], [PointerClass(int8_t)]), RQSiGELUTemplate.referenceTemplate,
        ClusterTransformer)
]

# Elementwise multiply over every signed x signed input-type combination,
# accumulating into int32.
PULPMulBindings = [
    NodeBinding(MulChecker([PointerClass(typeA), PointerClass(typeB)], [PointerClass(int32_t)]),
                MulTemplate.referenceTemplate, ForkTransformer)
    for typeA, typeB in itertools.product(SignedIntegerDataTypes, SignedIntegerDataTypes)
]
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/AutoTransposeUtils.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/AutoTransposeUtils.py
new file mode 100644
index 0000000..47d19cb
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/AutoTransposeUtils.py
# ----------------------------------------------------------------------
#
# File: AutoTransposeUtils.py
#
# Last edited: 11.12.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from typing import Dict, List, Literal, Tuple

from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
    _invertPermutation, _permuteList
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes
from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, minimizeRectangleDims


def _transposedDMAStrides(ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL1", "FromL1"],
                          perm: List[int], L1Name: str, L2Name: str) -> Tuple[HyperRectangle, List[int], List[int]]:
    # Derive the DMA transfer geometry for a transposed tile move.
    # Returns (minimized transfer rectangle, element strides usable by the
    # hardware "mindims" transfer, leftover strides that must be iterated in
    # software by the caller).
    # NOTE(review): `direction` and `L1Name` are not read in this function —
    # the geometry is derived from the L2 buffer alone; confirm this is
    # intentional (the parameters mirror generateTransposedDMAStruct's).

    # Express the tile in the un-permuted (L2) index space.
    _invPerm = _invertPermutation(perm)
    rectangle = HyperRectangle(_permuteList(rectangle.offset, _invPerm), _permuteList(rectangle.dims, _invPerm))

    # A dimension stays DMA-contiguous iff the permutation leaves it in place;
    # every other dimension collapses to extent 1 in the candidate transfer.
    contiguousDims = [permIdx == rangeIdx for permIdx, rangeIdx in zip(perm, range(len(perm)))]
    workList = []

    for idx, dim in enumerate(contiguousDims):
        if dim:
            workList.append(rectangle.dims[idx])
        else:
            workList.append(1)

    maxTransferRect = copy.copy(rectangle)
    maxTransferRect.dims = tuple(workList)

    # Shallow-copy the L2 buffer so its shape can be permuted without
    # mutating the context entry.
    referenceBuffer = copy.copy(ctxt.lookup(L2Name))
    referenceBuffer.shape = _permuteList(referenceBuffer.shape, _invPerm)
    minRect, referenceRect = minimizeRectangleDims(maxTransferRect, referenceBuffer)

    # Axes that are trivial (extent 1) or fully covered by the transfer can be
    # dropped from the permutation.
    droppedIdx = [
        idx for idx in range(len(perm))
        if (referenceBuffer.shape[idx] == 1 or referenceBuffer.shape[idx] == maxTransferRect.dims[idx])
    ]

    # Remove dropped axes from the permutation ...
    _newPerm = []
    for p in perm:
        if p not in droppedIdx:
            _newPerm.append(p)

    # ... and re-index the surviving axes to a dense 0..k-1 range.
    newPerm = []
    for p in _newPerm:
        sub = sum([p > idx for idx in droppedIdx])
        newPerm.append(p - sub)

    # Row-major strides (in elements) of the minimized reference rectangle.
    strides = [1]
    for dim in reversed(referenceRect.dims[1:]):
        strides.insert(0, strides[0] * dim)

    # Permute the strides; keep the suffix whose strides grow monotonically
    # (hardware-transferable) and return the rest as software remainder.
    permStrides = [strides[idx] for idx in newPerm]
    fixedPermStrides = []
    maxStride = 0
    remainderStrides = []
    for stride in reversed(permStrides):
        if stride < maxStride:
            remainderStrides.append(stride)
            continue
        maxStride = max(stride, maxStride)
        fixedPermStrides.insert(0, stride)

    return minRect, fixedPermStrides, remainderStrides


def allNumTransfers(ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation,
                    loadSchedule: List[Dict[str, HyperRectangle]], direction: Literal["ToL1",
                                                                                     "FromL1"]) -> List[List[int]]:
    # Collect, for every scheduled load step, the software-iterated outer
    # dimensions of each tensor that carries a transpose permutation
    # (an "in<idx>_perm" entry in the operator representation).

    allNumTransfer: List[List[int]] = []

    for stepIdx, loadStep in enumerate(loadSchedule):
        for idx, (key, rectangle) in enumerate(loadStep.items()):
            permName = f"in{idx}_perm"
            externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName)
            internalPtr = ctxt.lookup(operatorRepresentation[key])

            # NOTE(review): tensorName/nodeName are assigned but unused here.
            tensorName = key
            nodeName = operatorRepresentation['nodeName']

            # Only tiles moving towards L1 with an attached permutation
            # contribute iterated transfers.
            if permName in operatorRepresentation and direction == "ToL1":
                perm = operatorRepresentation[permName]
                _, _, numTransfers = generateTransposedDMAStruct(ctxt, rectangle, direction, perm, internalPtr.name,
                                                                 externalPtr.name)

                allNumTransfer.append(numTransfers)

    return allNumTransfer


def generateTransposedDMAStruct(ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL1", "FromL1"],
                                perm: List[int], L1Name: str,
                                L2Name: str) -> Tuple[PULPStructDataTypes.DMA_copy, List[int], List[int]]:
    # Build the DMA_copy struct describing the (at most 2D) hardware transfer
    # of a transposed tile. Returns (struct, remainder strides, outer tile
    # dimensions that the caller must iterate in software).

    #rect, referenceRect = minimizeRectangleDims(maxTransferRect, referenceBuffer)
    referenceBuffer = ctxt.lookup(L2Name)

    _invPerm = _invertPermutation(perm)

    # Same contiguity analysis as in _transposedDMAStrides (kept in sync).
    contiguousDims = [permIdx == rangeIdx for permIdx, rangeIdx in zip(perm, range(len(perm)))]
    workList = []

    for idx, dim in enumerate(contiguousDims):
        if dim:
            workList.append(rectangle.dims[idx])
        else:
            workList.append(1)

    maxTransferRect = copy.copy(rectangle)
    maxTransferRect.dims = tuple(workList)

    droppedIdx = [
        idx for idx in range(len(perm))
        if (referenceBuffer.shape[idx] == 1 or referenceBuffer.shape[idx] == maxTransferRect.dims[idx])
    ]

    # Tile rectangle with the dropped axes removed.
    permOffset = [rectangle.offset[idx] for idx, dims in enumerate(rectangle.dims) if (idx not in droppedIdx)]
    permDims = [dims for idx, dims in enumerate(rectangle.dims) if (idx not in droppedIdx)]

    rect = HyperRectangle(offset = permOffset, dims = permDims)
    minRect, fixedPermStrides, remainderStrides = _transposedDMAStrides(ctxt, rectangle, direction, perm, L1Name,
                                                                        L2Name)

    assert len(fixedPermStrides) <= 2, "PULP: Only 2D transfers are supported!"

    # dir == 1 moves data into L1, dir == 0 moves data out of L1.
    if direction == "ToL1":
        _dir = 1
    else:
        _dir = 0

    # Lengths and strides below are converted from elements to bytes.
    length_1d_copy = minRect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8)

    if len(fixedPermStrides) >= 1:
        number_of_1d_copies = rect.dims[-1]
        stride_1d = fixedPermStrides[-1] * (referenceBuffer._type.referencedType.typeWidth // 8)

    else:
        number_of_1d_copies = 1
        stride_1d = 0

    if len(fixedPermStrides) >= 2:
        number_of_2d_copies = rect.dims[-2]
        stride_2d = fixedPermStrides[-2] * (referenceBuffer._type.referencedType.typeWidth // 8)
    else:
        number_of_2d_copies = 1
        stride_2d = 0

    struct = PULPStructDataTypes.DMA_copy(
        {
            "ext": referenceBuffer.name,
            "loc": L1Name,
            "hwc_to_chw": 0,
            "stride_2d": stride_2d,
            "number_of_2d_copies": number_of_2d_copies,
            "stride_1d": stride_1d,
            "number_of_1d_copies": number_of_1d_copies,
            "length_1d_copy": length_1d_copy,
            "dir": _dir,
            "tid": 0
        }, ctxt)

    return struct, remainderStrides, rect.dims[:-len(fixedPermStrides)]
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterSynch.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterSynch.py
new file mode 100644
index 0000000..275c8ba
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterSynch.py
# ----------------------------------------------------------------------
#
# File: PULPClusterSynch.py
#
# Last edited: 30.10.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \
    NodeTemplate, _NoVerbosity

# Barrier across the PULP cluster cores (pi_cl_team_barrier from the PULP SDK).
_synchTemplate = NodeTemplate("""
    pi_cl_team_barrier();
    """)


class PULPSynchCoresPass(CodeTransformationPass):
    # Appends a cluster-wide barrier to the right (end) of the execution
    # block, so all cores synchronize before subsequent code runs.

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
        # The barrier template needs no operator representation -> empty dict.
        executionBlock.addRight(_synchTemplate, {})
        return ctxt, executionBlock
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
new file mode 100644
index 0000000..c372320
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
# ----------------------------------------------------------------------
#
# File: PULPClusterTiling.py
#
# Last edited: 19.04.2024
#
# Copyright (C) 2024, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity

from .PULPClusterTilingDB import ProfilingPULPClusterTilingGenerationDB, PULPClusterTilingGenerationDB
from .PULPClusterTilingSB import ProfilingPULPClusterTilingGenerationSB, PULPClusterTilingGenerationSB


class PULPClusterTiling(CodeTransformationPass):
    # Facade pass: applies the single-buffering (SB) generator, then the
    # double-buffering (DB) generator. When L2 tiling profiling is requested,
    # the profiling variants are used instead.
    # NOTE(review): each generator pair is applied unconditionally in
    # sequence; presumably each one only fires when its buffering scheme
    # matches the tiling schedule — confirm against the SB/DB generators.

    def __init__(self, targetMemLevel: str):
        # targetMemLevel: name of the memory level tiles are staged in.
        self.SB = PULPClusterTilingGenerationSB(targetMemLevel)
        self.profilingSB = ProfilingPULPClusterTilingGenerationSB(targetMemLevel)
        self.DB = PULPClusterTilingGenerationDB(targetMemLevel)
        self.profilingDB = ProfilingPULPClusterTilingGenerationDB(targetMemLevel)

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

        if verbose.tilingProfiling == "L2":
            ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
            ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
        else:
            ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
            ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)

        return ctxt, executionBlock
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingDB.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingDB.py
new file mode 100644
index 0000000..8b86fff
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingDB.py
#
# ----------------------------------------------------------------------
#
# File: PULPClusterTilingDB.py
#
# Last edited: 25.10.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTilingSB import PULPClusterTilingSB, _DMAUpdate
from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import DoubleBufferingTilingMixIn, \
    ProfilingDoubleBufferingTilingMixIn, TilingMetaInfo
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme

# Import the next tile while the current one is processed; guarded so the
# last iteration does not prefetch past the end of the schedule.
_moveTileInTemplate = NodeTemplate("""

// IMPORT TILE ${innerTilePtr} from ${outerTilePtr}
if (${tileNum} < ${numTiles}[*${tileIdxPtr}+1]){
dory_dma_memcpy_mindims_async(&${stateReference});
}

""")

# Double-buffered export: even tiles use the primary DMA state, odd tiles the
# secondary (_stateReference) one.
_moveTileOutTemplate = NodeTemplate("""

// EXPORT TILE ${innerTilePtr} to ${outerTilePtr}
if((${tileNum}) % 2 == 0){
dory_dma_memcpy_mindims_async(&${stateReference});
} else {
dory_dma_memcpy_mindims_async(&${_stateReference});
}
""")

# Wait for the export issued two iterations earlier (same-parity buffer).
_blockTileOutTemplate = NodeTemplate("""

// BLOCKING EXPORT TILE ${innerTilePtr}
if((${tileNum}) > 1){
if((${tileNum}) % 2 == 0){
dory_dma_barrier(&${stateReference});
} else {
dory_dma_barrier(&${_stateReference});
}
}

""")

# Drain both in-flight exports at the end of the tiling loop.
_finalBlockTileOutTemplate = NodeTemplate("""

// BLOCKING EXPORT TILE ${innerTilePtr}
dory_dma_barrier(&${stateReference});
dory_dma_barrier(&${_stateReference});
""")

# Ingress struct update; ${locPtr} points one tile behind (tileNum-1), i.e.
# the buffer the compute code consumes while tileNum is being fetched.
_updateDMATransferStructTemplate = NodeTemplate("""

// UPDATE DMA STRUCT ${stateReference}, ${_stateReference}
${stateReference}.ext = (((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]);
${stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}];
${stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}];
${stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}];
${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}];
${stateReference}.loc = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);
${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}-1]);
""")

# Egress struct update: update the DMA state of the buffer matching the tile's
# parity, and point ${locPtr} at the current tile's local buffer.
_outUpdateDMATransferStructTemplate = NodeTemplate("""

if ((${tileNum}) % 2 == 0){
// UPDATE DMA STRUCT ${stateReference}
${stateReference}.ext = ((char*)${extPtr} + ${extOffsetPtr}[${tileNum}]);
${stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}];
${stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}];
${stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}];
${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}];
${stateReference}.loc = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);
} else {
${_stateReference}.ext = ((char*)${extPtr} + ${extOffsetPtr}[${tileNum}]);
${_stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}];
${_stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}];
${_stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}];
${_stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}];
${_stateReference}.loc = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);
}
${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);

""")


class PULPClusterTilingDB(PULPClusterTilingSB):
    # Double-buffered variant of the single-buffered cluster tiling code
    # generator: egress transfers alternate between two DMA states so the
    # export of tile i can overlap with the computation of tile i+1.

    _blockTileOutTemplate = _blockTileOutTemplate
    _updateDMATransferStructTemplate = _updateDMATransferStructTemplate
    _moveTileOutTemplate = _moveTileOutTemplate
    _moveTileInTemplate = _moveTileInTemplate

    def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate],
                         operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        # Extends the SB hoisting with the double-buffering extras: a base
        # pointer for the local buffer, the secondary DMA state name, and a
        # constant table of per-tile local-buffer offsets.
        nodeName = operatorRepresentation['nodeName']

        # Work on a copy so the caller's representation is not mutated.
        operatorRepresentation = operatorRepresentation.copy()

        dmaName = self._DMAStructName(tensorName, nodeName)
        # operatorRepresentation['stateReference'] = dmaName
        # operatorRepresentation['tileNum'] = "TILING_I"
        operatorRepresentation['locPtr'] = ctxt.lookup(operatorRepresentation[tensorName]).name
        operatorRepresentation['baseLocPtr'] = ctxt.hoistReference(operatorRepresentation['locPtr'],
                                                                   operatorRepresentation['locPtr'] + "_ref")
        operatorRepresentation['_stateReference'] = self._DMAStructName(tensorName, nodeName) + "_1"
        ctxt.lookup(operatorRepresentation['baseLocPtr'])._memoryLevel = self.targetMemLevel

        namePrefix = self.prefix + f"{nodeName}_{tensorName}"

        ctxt, operatorRepresentation = super()._hoistDMAUpdates(ctxt, tensorName, updateList, operatorRepresentation)

        # Per-tile local offsets, relative to the first tile's offset.
        locOffsetList = []
        locBaseOffset = updateList[0].locOffset
        for update in updateList:
            locOffsetList.append(int(update.locOffset) - locBaseOffset)

        name = namePrefix + "_locOffset"
        cb = ctxt.ConstantBuffer(name, [len(updateList)], locOffsetList)
        ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName,
                                                                       'locOffsetPtr')

        return ctxt, operatorRepresentation

    def _generateEgressPointerUpdates(
            self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]:
        # Emit one double-buffered struct-update snippet per output tensor.

        updates = []
        newCtxt = ctxt.copy()

        updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule,
                                                  nodeMemoryConstraint, tilingSchedule)

        for key, updateList in updateDict.items():

            newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation)
            updates.append(CodeSnippet(_outUpdateDMATransferStructTemplate, newNodeRep))

        return newCtxt, updates

    def _generateEgressDMACode(
            self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]:
        # Hoist TWO DMA_copy structs per output tensor (primary and "_1"
        # secondary) and emit the export / barrier snippets that alternate
        # between them.

        egressDMATransferCalls = []
        egressDMAWaitStatements = []

        exportLoadStep = tilingSchedule.outputLoadSchedule[0]
        for key, rectangle in exportLoadStep.items():
            externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName)
            internalPtr = ctxt.lookup(operatorRepresentation[key])

            tensorName = key
            nodeName = operatorRepresentation['nodeName']
            dmaName = self._DMAStructName(tensorName, nodeName)

            finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, internalPtr)
            struct = self._rectToDMAStruct(ctxt, rectangle, "FromL1", internalPtr.name, externalPtr.name,
                                           finalMemoryLevel)
            _ = ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.DMA_copy)
            ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']]

            # Secondary (odd-tile) DMA state, named "<tensor>_1".
            tensorName = key + "_1"
            nodeName = operatorRepresentation['nodeName']
            _dmaName = self._DMAStructName(tensorName, nodeName)

            struct = self._rectToDMAStruct(ctxt, rectangle, "FromL1", internalPtr.name, externalPtr.name,
                                           finalMemoryLevel)
            _ = ctxt.hoistStruct(struct, _dmaName, PULPStructDataTypes.DMA_copy)
            ctxt.lookup(_dmaName)._users += [operatorRepresentation['nodeName']]

            egressDMATransferCalls.append(
                CodeSnippet(
                    self._moveTileOutTemplate, {
                        'innerTilePtr': str(internalPtr._instance),
                        "outerTilePtr": str(externalPtr._instance),
                        "stateReference": dmaName,
                        "_stateReference": _dmaName
                    }))

            egressDMAWaitStatements.append(
                CodeSnippet(
                    self._blockTileOutTemplate, {
                        'innerTilePtr': str(internalPtr._instance),
                        "outerTilePtr": str(externalPtr._instance),
                        "stateReference": dmaName,
                        "_stateReference": _dmaName
                    }))

        return egressDMATransferCalls, egressDMAWaitStatements

    def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
                    nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule,
                    variableReplacement: VariableReplacementScheme,
                    operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]:
        # Assemble the double-buffered tiling loop: ingress prefetches tile
        # TILING_I+1 while egress drains tile TILING_I.

        tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation)

        ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode(
            tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation)

        egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode(
            tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation)

        ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt,
                                                                      operatorRepresentation)
        ctxt, egressDMAUpdates = self._generateEgressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt,
                                                                    operatorRepresentation)

        variableUpdates = self._generateVariableUpdates(tilingSchedule, variableReplacement, ctxt,
                                                        operatorRepresentation)

        # Ingress works one tile ahead (TILING_I+1); egress on the current tile.
        for transaction in ingressDMATransferCalls:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I+1"
            _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles']
            _operatorRepresentation["tileIdxPtr"] = tileIdxPtr

        for transaction in ingressDMAUpdates:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I+1"

        for transaction in egressDMATransferCalls:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I"

        for transaction in egressDMAWaitStatements:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation['tileNum'] = "TILING_I"

        for transaction in egressDMAUpdates:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I"

        for transaction in variableUpdates:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I"

        openLoopStatement = [
            CodeSnippet(self._openTileLoopTemplate, {
                "numTiles": operatorRepresentation["numTiles"],
                "tileIdxPtr": tileIdxPtr
            })
        ]

        closeLoopStatement = [
            CodeSnippet(self._closeTileLoopTemplate, {
                "numTiles": operatorRepresentation["numTiles"],
                "tileIdxPtr": tileIdxPtr
            })
        ]

        setupStatements = []
        teardownStatements = []

        teardownStatements += [
            CodeSnippet(self._releaseDMATemplate,
                        {"stateReference": ingressDMAUpdates[0].operatorRepresentation["stateReference"]})
        ]

        # One shared DMA channel; assign it to every ingress state and to both
        # egress states (primary and secondary).
        setupStatements += [CodeSnippet(self._initDMATemplate, {"channelName": "dma_channel"})]
        setupStatements += [
            CodeSnippet(self._setDMAChannelTemplate, {
                **transaction.operatorRepresentation, "channelName": "dma_channel"
            }) for transaction in ingressDMAUpdates
        ]

        for transaction in egressDMAUpdates:
            _operatorRepresentation = transaction.operatorRepresentation.copy()
            _operatorRepresentation["channelName"] = "dma_channel"
            setupStatements.append(CodeSnippet(self._setDMAChannelTemplate, _operatorRepresentation.copy()))
            _operatorRepresentation["channelName"] = "dma_channel"
            _operatorRepresentation["stateReference"] = _operatorRepresentation["_stateReference"]
            setupStatements.append(CodeSnippet(self._setDMAChannelTemplate, _operatorRepresentation.copy()))

        # Prime the pipeline: fetch tile 0 before the loop starts.
        for transaction in ingressDMATransferCalls:
            _operatorRepresentation = transaction.operatorRepresentation.copy()
            _operatorRepresentation["tileNum"] = 0
            _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles']
            _operatorRepresentation["tileIdxPtr"] = tileIdxPtr
            setupStatements.append(CodeSnippet(transaction.template, _operatorRepresentation))

        # Drain both egress buffers after the last tile.
        for transaction in egressDMAWaitStatements:
            _operatorRepresentation = transaction.operatorRepresentation.copy()
            _operatorRepresentation['tileNum'] = ctxt.lookup(operatorRepresentation["numTiles"]).values[-1]
            teardownStatements.append(CodeSnippet(_finalBlockTileOutTemplate, _operatorRepresentation))

        metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L2",
                                  nodeOps = operatorRepresentation['nodeOps'],
                                  numTiles = len(tilingSchedule.outputLoadSchedule),
                                  tileIdxVar = "TILING_I")

        newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls,
                                                       ingressDMAWaitStatements[-1:], ingressDMAUpdates,
                                                       egressDMATransferCalls, egressDMAWaitStatements[-1:],
                                                       egressDMAUpdates, variableUpdates, openLoopStatement,
                                                       closeLoopStatement, setupStatements, teardownStatements)

        return ctxt, newExecutionBlock, True

    def generateTilingLoop(
            self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint,
            tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme,
            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]:
        # Entry point: flatten the per-pattern schedules and bail out (False)
        # unless every tensor has exactly two base offsets, i.e. the schedule
        # is actually double-buffered.

        flatTilingSchedule = copy.copy(tilingSchedules[0])
        for tilingSchedule in tilingSchedules[1:]:
            flatTilingSchedule += tilingSchedule

        offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values())

        if len(offsetLists) == 0:
            return ctxt, executionBlock, False

        for offsetList in offsetLists:
            if not len(offsetList) == 2:
                return ctxt, executionBlock, False

        # NOTE(review): allNumTiles is computed but never used.
        allNumTiles = [len(schedule.outputLoadSchedule) for schedule in tilingSchedules]
        operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'],
                                                                 tilingSchedules)

        return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement,
                                operatorRepresentation)


class PULPClusterTilingGenerationDB(PULPClusterTilingDB, DoubleBufferingTilingMixIn):
    pass


class ProfilingPULPClusterTilingGenerationDB(PULPClusterTilingDB, ProfilingDoubleBufferingTilingMixIn):
    pass
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingSB.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingSB.py
new file mode 100644
index 0000000..bd43d8b
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingSB.py
# ----------------------------------------------------------------------
#
# File: PULPClusterTiling.py
#
# Last edited: 17.10.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from collections import namedtuple
from typing import Dict, List, Literal, Optional, Tuple, Type

import numpy as np

import Deeploy.CommonExtensions.DataTypes as BasicDataTypes
from Deeploy.AbstractDataTypes import Immediate, PointerClass
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
    _invertPermutation, _permuteList
from Deeploy.DeeployTypes import CodeSnippet, ConstantBuffer, ExecutionBlock, NetworkContext, NodeTemplate, \
    OperatorRepresentation
from Deeploy.Targets.PULPOpen.CodeTransformationPasses import AutoTransposeUtils
from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingSingleBufferingTilingMixIn, \
    SingleBufferingTilingMixIn, TilingMetaInfo
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \
    calculateRectangleOffset, minimizeRectangleDims

# Opens the per-node tiling loop; ${tileIdxPtr} indexes into the numTiles
# prefix-sum array so each invocation runs its own slice of tiles.
_openTileLoopTemplate = NodeTemplate("""

// TILING LOOP
for (int TILING_I=${numTiles}[*${tileIdxPtr}]; TILING_I<${numTiles}[(*${tileIdxPtr})+1]; TILING_I++){
""")

_closeTileLoopTemplate = NodeTemplate("""

// CLOSE TILING LOOP
}
*${tileIdxPtr} += 1;

""")

# Plain (single-buffered) asynchronous tile import.
_moveTileInTemplate = NodeTemplate("""

// IMPORT TILE ${innerTilePtr} from ${outerTilePtr}
dory_dma_memcpy_mindims_async(&${stateReference});

""")

# Transposed tile import: wraps the 2D hardware transfer in software loops
# over the remainder dimensions (dimLens), stepping ext by the remainder
# strides and loc by the size of one hardware transfer. The original
# loc/ext pointers are saved to bu_* and restored afterwards.
_iteratedMoveTileInTemplate = NodeTemplate("""

// IMPORT TILE ${innerTilePtr} from ${outerTilePtr}
// ITERATED

<%
_extStrides = [stride * stateStruct.value['length_1d_copy'].value for stride in remainderStrides]
_locStride = f"{stateReference}.length_1d_copy * {stateReference}.number_of_1d_copies * {stateReference}.number_of_2d_copies"

stateStruct.value['ext'] = str(stateReference) + ".ext"
stateStruct.value['loc'] = str(stateReference) + ".loc"
stateStruct.value['tid'] = str(stateReference) + ".tid"
stateStruct.value['stride_2d'] = str(stateReference) + ".stride_2d"
stateStruct.value['stride_1d'] = str(stateReference) + ".stride_1d"
stateStruct.value['number_of_2d_copies'] = str(stateReference) + ".number_of_2d_copies"
stateStruct.value['number_of_1d_copies'] = str(stateReference) + ".number_of_1d_copies"
stateStruct.value['length_1d_copy'] = str(stateReference) + ".length_1d_copy"
%>

int8_t * bu_${stateReference}_loc = ${stateReference}.loc;
int8_t * bu_${stateReference}_ext = ${stateReference}.ext;

% for idx, dimLen in enumerate(dimLens):
uint16_t ${nodeName}_${tensorName}_dimLen_${idx} = ${dimLen}[${tileNum}];
for(int i_${idx} = 0; i_${idx} < ${nodeName}_${tensorName}_dimLen_${idx}; i_${idx}++){
%endfor
${stateStruct.typeName} trans_${stateReference} = (${stateStruct.typeName}) ${str(stateStruct)};
dory_dma_memcpy_mindims_async(&trans_${stateReference});
${stateStruct.value['loc']} = (((int8_t*) ${stateStruct.value['loc']}) + ${_locStride});
% for idx, _ in enumerate(dimLens):
${stateStruct.value['ext']} = (((int8_t*) ${stateStruct.value['ext']}) + (${_extStrides[idx]}));
}
${stateStruct.value['ext']} = (((int8_t*) ${stateStruct.value['ext']}) - ${nodeName}_${tensorName}_dimLen_${len(dimLens) -1 - idx} * ${_extStrides[idx]});
%endfor

${stateStruct.value['loc']} = bu_${stateReference}_loc;
${stateStruct.value['ext']} = bu_${stateReference}_ext;

""")

_blockTileInTemplate = NodeTemplate("""

// BLOCKING IMPORT TILE ${innerTilePtr}
dory_dma_barrier(&${stateReference});

""")

_moveTileOutTemplate = NodeTemplate("""

// EXPORT TILE ${innerTilePtr} to ${outerTilePtr}
dory_dma_memcpy_mindims_async(&${stateReference});

""")

_blockTileOutTemplate = NodeTemplate("""

// BLOCKING EXPORT TILE ${innerTilePtr}
dory_dma_barrier(&${stateReference});

+""") + +_updateDMATransferStructTemplate = NodeTemplate(""" + +// UPDATE DMA STRUCT ${stateReference} +${stateReference}.ext = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]; +${stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}]; +${stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}]; +${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}]; + +${stateReference}.stride_1d = ${stride1dPtr}[${tileNum}]; +${stateReference}.stride_2d = ${stride2dPtr}[${tileNum}]; + +${stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}]; +""") + +_updateReferenceTemplate = NodeTemplate(""" + +// UPDATE VARIABLE ${reference} +*${reference} = ${baseReference}[${tileNum}]; +""") + +_initDMATemplate = NodeTemplate(""" +int32_t ${channelName} = dory_dma_allocate(); +""") + +_setDMAChannelTemplate = NodeTemplate(""" +${stateReference}.tid = ${channelName}; +""") + +_releaseDMATemplate = NodeTemplate(""" +dory_dma_free(&${stateReference}); +""") + +# ADD NUM TRANSFERS VARIABLE + +_DMAUpdate = namedtuple( + "_DMAUpdate", + "extOffset locOffset length_1d_copy number_of_1d_copies number_of_2d_copies stride_1d stride_2d mchan_cmd") + + +class PULPClusterTilingSB(TilingCodeGeneration): + + _prefix = "TILING_REPLACED_" + + _openTileLoopTemplate = _openTileLoopTemplate + _closeTileLoopTemplate = _closeTileLoopTemplate + + _moveTileInTemplate = _moveTileInTemplate + _iteratedMoveTileInTemplate = _iteratedMoveTileInTemplate + _blockTileInTemplate = _blockTileInTemplate + + _moveTileOutTemplate = _moveTileOutTemplate + _blockTileOutTemplate = _blockTileOutTemplate + + _updateDMATransferStructTemplate = _updateDMATransferStructTemplate + _updateReferenceTemplate = _updateReferenceTemplate + + _initDMATemplate = _initDMATemplate + _setDMAChannelTemplate = _setDMAChannelTemplate + _releaseDMATemplate = _releaseDMATemplate + + @property + def prefix(self): + return self._prefix + self.targetMemLevel + "_" + + def _DMAStructName(self, tensorName: str, nodeName: str) -> 
str: + return f"{self.prefix}_DMA_{nodeName}_{tensorName}" + + @classmethod + def _generatePointerUpdates(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + loadSchedule: List[Dict[str, + HyperRectangle]], nodeMemoryConstraint: NodeMemoryConstraint, + tilingSchedule: TilingSchedule) -> Dict[str, _DMAUpdate]: + updateDict = {} + deltaOffsets = {} + + for idx, loadStep in enumerate(loadSchedule): + for stepIdx, (key, rect) in enumerate(loadStep.items()): + + if key in tilingSchedule.outputBaseOffsets.keys(): + baseOffsets = tilingSchedule.outputBaseOffsets[key] + direction = "FromL1" + else: + baseOffsets = tilingSchedule.inputBaseOffsets[key] + direction = "ToL1" + + if key not in updateDict.keys(): + updateDict[key] = [] + if key not in deltaOffsets.keys(): + deltaOffsets[key] = 0 + + referenceBuffer = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) + l1Buffer = ctxt.lookup(operatorRepresentation[key]) + + finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, l1Buffer) + + if (f"in{stepIdx}_perm" in operatorRepresentation + and key in tilingSchedule.inputBaseOffsets.keys()) and (finalMemoryLevel == False): + perm = operatorRepresentation[f"in{stepIdx}_perm"] + struct, _, _ = AutoTransposeUtils.generateTransposedDMAStruct(ctxt, rect, direction, perm, + l1Buffer.name, + l1Buffer._referenceName) + + _invPerm = _invertPermutation(perm) + _rect = copy.copy(rect) + _referenceBuffer = copy.copy(referenceBuffer) + _rect.offset = _permuteList(rect.offset, _invPerm) + _rect.dims = _permuteList(rect.dims, _invPerm) + _referenceBuffer.shape = _permuteList(referenceBuffer.shape, _invPerm) + + accOffset = calculateRectangleOffset(_rect, _referenceBuffer) + + else: + struct = cls._rectToDMAStruct(ctxt, rect, direction, l1Buffer.name, l1Buffer._referenceName, + finalMemoryLevel) + accOffset = calculateRectangleOffset(rect, referenceBuffer) + + length_1d_copy = struct.value['length_1d_copy'].value + 
number_of_1d_copies = struct.value['number_of_1d_copies'].value + number_of_2d_copies = struct.value['number_of_2d_copies'].value + stride_1d = struct.value['stride_1d'].value + stride_2d = struct.value['stride_2d'].value + mchan_cmd = struct.value['mchan_cmd'].value + + lIdx = idx % len(baseOffsets) + + sol = _DMAUpdate(accOffset, baseOffsets[lIdx], length_1d_copy, number_of_1d_copies, number_of_2d_copies, + stride_1d, stride_2d, mchan_cmd) + + deltaOffsets[key] = accOffset + updateDict[key].append(sol) + + return updateDict + + @classmethod + def _rectToDMAStruct(cls, ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL1", "FromL1"], + L1Name: str, L2Name: str, finalMemoryLevel: bool) -> PULPStructDataTypes.DMA_copy: + + referenceBuffer = ctxt.lookup(L2Name) + + rect, referenceRect = minimizeRectangleDims(rectangle, referenceBuffer) + assert len(rect.dims) <= 3, "PULP: Only 2D transfers are supported!" + + if direction == "ToL1": + _dir = 1 + else: + _dir = 0 + + length_1d_copy = rect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) + + number_of_1d_copies = 1 + stride_1d = 0 + + if len(rect.dims) > 1: + number_of_1d_copies = rect.dims[-2] + stride_1d = referenceRect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) + + if not finalMemoryLevel: + stride_1d = length_1d_copy + + number_of_2d_copies = 1 + stride_2d = 0 + + if len(rect.dims) > 2: + number_of_2d_copies = rect.dims[-3] + stride_2d = referenceRect.dims[-2] * stride_1d + + length_2d_copy = number_of_1d_copies * length_1d_copy + mchan_flags = _dir + 0x2 + 0x8 + if number_of_1d_copies > 1 or number_of_2d_copies > 1: + mchan_flags += 0x4 + mchan_cmd = length_2d_copy + (mchan_flags << 17) + + struct = PULPStructDataTypes.DMA_copy( + { + "ext": referenceBuffer.name, + "loc": L1Name, + "hwc_to_chw": 0, + "stride_2d": stride_2d, + "number_of_2d_copies": number_of_2d_copies, + "stride_1d": stride_1d, + "number_of_1d_copies": number_of_1d_copies, + 
"length_1d_copy": length_1d_copy, + "mchan_cmd": mchan_cmd, + "dir": _dir, + "tid": 0 + }, ctxt) + + return struct + + def _hoistConstantAndReference(self, + ctxt: NetworkContext, + constBuf: ConstantBuffer, + operatorRepresentation: OperatorRepresentation, + nodeName: str, + operatorRepresentationName: str, + immediateType: Optional[Type[Immediate]] = None) -> Tuple[NetworkContext, Dict]: + + if immediateType is None: + _type = PointerClass(BasicDataTypes.int32_t) + else: + _type = PointerClass(immediateType) + + name = constBuf.name + + ctxt.add(constBuf, "global") + constBuf._type = _type + constBuf._instance = constBuf._type(name, ctxt) + constBuf._users = [nodeName] + constBuf._memoryLevel = self.targetMemLevel + + refName = name + "_ref" + reference = ctxt.hoistReference(name, refName) + ctxt.lookup(reference)._memoryLevel = self.targetMemLevel + + operatorRepresentation[operatorRepresentationName] = refName + + return ctxt, operatorRepresentation + + def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate], + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: + + operatorRepresentation = operatorRepresentation.copy() + + nodeName = operatorRepresentation['nodeName'] + + offsetList = [] + mchanCmdList = [] + len1dList = [] + num1dList = [] + num2dList = [] + stride1dList = [] + stride2dList = [] + for update in updateList: + offsetList.append(int(update.extOffset)) + mchanCmdList.append(int(update.mchan_cmd)) + len1dList.append(int(update.length_1d_copy)) + num1dList.append(int(update.number_of_1d_copies)) + num2dList.append(int(update.number_of_2d_copies)) + stride1dList.append(int(update.stride_1d)) + stride2dList.append(int(update.stride_2d)) + + dmaName = self._DMAStructName(tensorName, nodeName) + operatorRepresentation['stateReference'] = dmaName + operatorRepresentation['tileNum'] = "TILING_I" + operatorRepresentation['extPtr'] = 
ctxt.lookup(operatorRepresentation[tensorName])._referenceName + + namePrefix = self.prefix + f"{nodeName}_{tensorName}" + + name = namePrefix + "_offset" + cb = ctxt.ConstantBuffer(name, [len(updateList)], offsetList) + ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, + 'extOffsetPtr') + + name = namePrefix + "_mchan_cmd" + cb = ctxt.ConstantBuffer(name, [len(updateList)], mchanCmdList) + ctxt, operatorRepresentation = self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'mchanCmdPtr', + PULPStructDataTypes.DMA_copy.structTypeDict['mchan_cmd']) + + name = namePrefix + "_length_1d_copy" + cb = ctxt.ConstantBuffer(name, [len(updateList)], len1dList) + ctxt, operatorRepresentation = self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'length1dPtr', + PULPStructDataTypes.DMA_copy.structTypeDict['length_1d_copy']) + + name = namePrefix + "_number_of_1d_copies" + cb = ctxt.ConstantBuffer(name, [len(updateList)], num1dList) + ctxt, operatorRepresentation = self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'number1dPtr', + PULPStructDataTypes.DMA_copy.structTypeDict['number_of_1d_copies']) + + name = namePrefix + "_number_of_2d_copies" + cb = ctxt.ConstantBuffer(name, [len(updateList)], num2dList) + ctxt, operatorRepresentation = self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'number2dPtr', + PULPStructDataTypes.DMA_copy.structTypeDict['number_of_2d_copies']) + + name = namePrefix + "_stride_1d" + cb = ctxt.ConstantBuffer(name, [len(updateList)], stride1dList) + ctxt, operatorRepresentation = self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'stride1dPtr', + PULPStructDataTypes.DMA_copy.structTypeDict['stride_1d']) + + name = namePrefix + "_stride_2d" + cb = ctxt.ConstantBuffer(name, [len(updateList)], stride2dList) + ctxt, operatorRepresentation = 
self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'stride2dPtr', + PULPStructDataTypes.DMA_copy.structTypeDict['stride_2d']) + + return ctxt, operatorRepresentation + + def _generateEgressPointerUpdates( + self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: + + updates = [] + newCtxt = ctxt.copy() + + updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, + nodeMemoryConstraint, tilingSchedule) + + for key, updateList in updateDict.items(): + + newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) + updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) + + return newCtxt, updates + + def _generateIngressPointerUpdates( + self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: + + updates = [] + newCtxt = ctxt.copy() + + updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.inputLoadSchedule, + nodeMemoryConstraint, tilingSchedule) + + for key, updateList in updateDict.items(): + + newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) + updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) + + return newCtxt, updates + + def _generateVariableUpdates(self, tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[CodeSnippet]: + + updates = [] + + for key in variableReplacement.perTileReplacements.keys(): + + buf = ctxt.lookup(operatorRepresentation[key]) + reference = str(buf._instance) + + updates.append( + 
CodeSnippet(self._updateReferenceTemplate, { + "reference": reference, + "tileNum": "TILING_I", + "baseReference": buf._referenceName + })) + + return updates + + def _generateDMACode(self, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, loadSchedule: List[Dict[str, HyperRectangle]], + direction: Literal["ToL1", "FromL1"]) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: + + DMATransferCalls = [] + DMAWaitStatements = [] + + allNumTransfers = AutoTransposeUtils.allNumTransfers(ctxt, operatorRepresentation, loadSchedule, direction) + + transferNodeRep = {} + + if allNumTransfers != []: + + dimLens = [] + + for dim in range(len(allNumTransfers[0])): + dimVec = [transfer[dim] for transfer in allNumTransfers] + namePrefix = operatorRepresentation["nodeName"] + "_" + vecName = f"dimLen_{dim}" + + cb = ctxt.ConstantBuffer(namePrefix + vecName, [len(dimVec)], dimVec) + ctxt, transferNodeRep = self._hoistConstantAndReference(ctxt, cb, transferNodeRep, + operatorRepresentation['nodeName'], vecName) + + dimLens.append(str(cb._instance)) + + transferNodeRep['nodeName'] = operatorRepresentation['nodeName'] + transferNodeRep['dimLens'] = dimLens + transferNodeRep['tileNum'] = "TILING_I" + + loadStep = loadSchedule[0] + + for idx, (key, rectangle) in enumerate(loadStep.items()): + + permName = f"in{idx}_perm" + + externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) + internalPtr = ctxt.lookup(operatorRepresentation[key]) + + tensorName = key + nodeName = operatorRepresentation['nodeName'] + dmaName = self._DMAStructName(tensorName, nodeName) + + transferNodeRep = { + **transferNodeRep, + **{ + 'innerTilePtr': str(internalPtr._instance), + "outerTilePtr": str(externalPtr._instance), + "stateReference": dmaName + } + } + + if permName in operatorRepresentation and direction == "ToL1": + perm = operatorRepresentation[permName] + struct, remainderStrides, numTransfers = 
AutoTransposeUtils.generateTransposedDMAStruct( + ctxt, rectangle, direction, perm, internalPtr.name, externalPtr.name) + locStride = np.prod( + rectangle.dims) // np.prod(numTransfers) * (externalPtr._type.referencedType.typeWidth // 8) + + transferNodeRep['tensorName'] = operatorRepresentation[key] + + transferNodeRep = {**transferNodeRep, **{"remainderStrides": remainderStrides, "locStride": locStride}} + + else: + finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, internalPtr) + + struct = self._rectToDMAStruct(ctxt, rectangle, direction, internalPtr.name, externalPtr.name, + finalMemoryLevel) + + transferNodeRep["stateStruct"] = struct + _ = ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.DMA_copy) + ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']] + + if permName in operatorRepresentation and direction == "ToL1": + + DMATransferCalls.append(CodeSnippet(self._iteratedMoveTileInTemplate, transferNodeRep)) + else: + DMATransferCalls.append(CodeSnippet(self._moveTileInTemplate, transferNodeRep)) + + DMAWaitStatements.append(CodeSnippet(self._blockTileInTemplate, transferNodeRep)) + + return DMATransferCalls, DMAWaitStatements + + def _generateIngressDMACode( + self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: + + importLoadStep = tilingSchedule.inputLoadSchedule + ingressDMATransferCalls, ingressDMAWaitStatements = self._generateDMACode(nodeMemoryConstraint, ctxt, + operatorRepresentation, + importLoadStep, "ToL1") + return ingressDMATransferCalls, ingressDMAWaitStatements + + def _generateEgressDMACode( + self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: + + exportLoadStep = tilingSchedule.outputLoadSchedule + 
egressDMATransferCalls, egressDMAWaitStatements = self._generateDMACode(nodeMemoryConstraint, ctxt, + operatorRepresentation, exportLoadStep, + "FromL1") + + return egressDMATransferCalls, egressDMAWaitStatements + + def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, + nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, + variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + + tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation) + + ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode( + tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) + + egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode( + tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) + + ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, + operatorRepresentation) + ctxt, egressDMAUpdates = self._generateEgressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, + operatorRepresentation) + + openLoopStatement = [ + CodeSnippet(self._openTileLoopTemplate, { + "numTiles": operatorRepresentation["numTiles"], + "tileIdxPtr": tileIdxPtr + }) + ] + + closeLoopStatement = [ + CodeSnippet(self._closeTileLoopTemplate, { + "numTiles": operatorRepresentation["numTiles"], + "tileIdxPtr": tileIdxPtr + }) + ] + + setupStatements = [CodeSnippet(self._initDMATemplate, {"channelName": "dma_channel"})] + setupStatements += [ + CodeSnippet(self._setDMAChannelTemplate, { + **transaction.operatorRepresentation, "channelName": "dma_channel" + }) for transaction in ingressDMAUpdates + egressDMAUpdates + ] + + teardownStatements = [ + CodeSnippet(self._releaseDMATemplate, + {"stateReference": ingressDMAUpdates[0].operatorRepresentation["stateReference"]}) + ] + + variableUpdates = self._generateVariableUpdates(tilingSchedule, 
variableReplacement, ctxt, + operatorRepresentation) + + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L2", + nodeOps = operatorRepresentation['nodeOps'], + numTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxVar = "TILING_I") + + newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls, + ingressDMAWaitStatements, ingressDMAUpdates, + egressDMATransferCalls, egressDMAWaitStatements, + egressDMAUpdates, variableUpdates, openLoopStatement, + closeLoopStatement, setupStatements, teardownStatements) + + return ctxt, newExecutionBlock, True + + def generateTilingLoop( + self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, + tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + + flatTilingSchedule = copy.copy(tilingSchedules[0]) + for tilingSchedule in tilingSchedules[1:]: + flatTilingSchedule += tilingSchedule + + # SCHEREMO: hoist numTiles + + offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) + + if len(offsetLists) == 0: + return ctxt, executionBlock, False + + for offsetList in offsetLists: + if not len(offsetList) == 1: + return ctxt, executionBlock, False + + operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'], + tilingSchedules) + + return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, + operatorRepresentation) + + +class PULPClusterTilingGenerationSB(PULPClusterTilingSB, SingleBufferingTilingMixIn): + pass + + +class ProfilingPULPClusterTilingGenerationSB(PULPClusterTilingSB, ProfilingSingleBufferingTilingMixIn): + pass diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py 
b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py new file mode 100644 index 0000000..ac32eeb --- /dev/null +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py @@ -0,0 +1,55 @@ +# ---------------------------------------------------------------------- +# +# File: PULPL3Tiling.py +# +# Last edited: 19.04.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Tuple

from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity

from .PULPL3TilingDB import ProfilingPULPL3TilingGenerationDB, PULPL3TilingGenerationDB
from .PULPL3TilingSB import ProfilingPULPL3TilingGenerationSB, PULPL3TilingGenerationSB


class PULPL3Tiling(CodeTransformationPass):
    """Composite L3 tiling pass: runs the single-buffered generator first and
    the double-buffered generator second, optionally in profiling variants.

    Each sub-generator decides internally whether it applies to a given
    execution block, so chaining both is safe.
    """

    def __init__(self, targetMemLevel: str):
        # targetMemLevel: name of the memory level this pass tiles towards.
        self.SB = PULPL3TilingGenerationSB(targetMemLevel)
        self.profilingSB = ProfilingPULPL3TilingGenerationSB(targetMemLevel)
        self.DB = PULPL3TilingGenerationDB(targetMemLevel)
        self.profilingDB = ProfilingPULPL3TilingGenerationDB(targetMemLevel)

    def apply(self,
              ctxt: NetworkContext,
              executionBlock: ExecutionBlock,
              name: str,
              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
        """Apply SB then DB tiling; profiling variants when tilingProfiling == "L3"."""
        if verbose.tilingProfiling == "L3":
            ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
            ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
        else:
            ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
            ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)

        return ctxt, executionBlock
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingDB.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingDB.py
new file mode 100644
index 0000000..307f959
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingDB.py
@@ -0,0 +1,328 @@
+# ----------------------------------------------------------------------
+#
+# File: PULPL3TilingDB.py
+#
+# Last edited: 17.10.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

import copy
from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3TilingSB import PULPL3TilingSB, _DMAUpdate
from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import DoubleBufferingTilingMixIn, \
    ProfilingDoubleBufferingTilingMixIn, TilingMetaInfo
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme

# Double-buffered L3<->L2 templates built on the pi_cl_ram driver. Egress uses
# two descriptors (ping/pong) selected by tile-index parity; ingress prefetches
# tile ${tileNum} (= TILING_I+1) while the current tile is being computed.
_moveTileInTemplate = NodeTemplate("""

// IMPORT TILE ${innerTilePtr} from ${outerTilePtr}
if (${tileNum} < ${numTiles}[*${tileIdxPtr}+1]){
pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, ${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference});
}

""")

_moveTileOutTemplate = NodeTemplate("""

// EXPORT TILE ${innerTilePtr} to ${outerTilePtr}
if((${tileNum}) % 2 == 0){
pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, ${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference});
} else {
pi_cl_ram_copy_2d(get_ram_ptr(), ${_stateReference}.pi_ram_addr, ${_stateReference}.addr, ${_stateReference}.size, ${_stateReference}.stride, ${_stateReference}.length, ${_stateReference}.ext2loc, &${_stateReference});
}

""")

# Wait for the export issued two tiles ago (same-parity descriptor); nothing to
# wait on before two exports are in flight.
_blockTileOutTemplate = NodeTemplate("""

// BLOCKING EXPORT TILE ${innerTilePtr}
if((${tileNum}) > 1){
if((${tileNum}) % 2 == 0){
pi_cl_ram_copy_wait(&${stateReference});
} else {
pi_cl_ram_copy_wait(&${_stateReference});
}
}

""")

# Teardown: drain whichever export descriptors are still in flight.
_finalBlockTileOutTemplate = NodeTemplate("""

// BLOCKING EXPORT TILE ${innerTilePtr}
pi_cl_ram_copy_wait(&${stateReference});
% if numTiles > 1:
pi_cl_ram_copy_wait(&${_stateReference});
% endif
""")

# Ingress struct update; the kernel pointer ${locPtr} is pointed at the buffer
# of the *previous* tile (tileNum-1), i.e. the one whose import has completed.
_updateDMATransferStructTemplate = NodeTemplate("""

// UPDATE DMA STRUCT ${stateReference}
${stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}];
${stateReference}.size = ${length1dPtr}[${tileNum}];
${stateReference}.length = ${number1dPtr}[${tileNum}];
${stateReference}.addr = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);
${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}-1]);

""")

# Egress struct update: parity selects the ping or pong descriptor.
_outUpdateDMATransferStructTemplate = NodeTemplate("""

if ((${tileNum}) % 2 == 0){
// UPDATE DMA STRUCT ${stateReference}
${stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}];
${stateReference}.size = ${length1dPtr}[${tileNum}];
${stateReference}.length = ${number1dPtr}[${tileNum}];
${stateReference}.addr = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);
} else {
${_stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}];
${_stateReference}.size = ${length1dPtr}[${tileNum}];
${_stateReference}.length = ${number1dPtr}[${tileNum}];
${_stateReference}.addr = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);
}
${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]);

""")


class PULPL3TilingDB(PULPL3TilingSB):
    """Double-buffered L3 tiling code generation (pi_cl_ram based).

    Extends the single-buffered base with ping/pong output descriptors and a
    one-tile-ahead input prefetch.
    """

    _prefix = "TILING_REPLACED_"
    _blockTileOutTemplate = _blockTileOutTemplate
    _updateDMATransferStructTemplate = _updateDMATransferStructTemplate
    _moveTileOutTemplate = _moveTileOutTemplate
    _moveTileInTemplate = _moveTileInTemplate

    def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate],
                         operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Hoist the base-class field arrays plus the per-tile L2 buffer offsets
        (relative to the first tile's offset) needed for double buffering.
        """
        nodeName = operatorRepresentation['nodeName']

        operatorRepresentation = operatorRepresentation.copy()

        dmaName = self._DMAStructName(tensorName, nodeName)
        # operatorRepresentation['stateReference'] = dmaName
        # operatorRepresentation['tileNum'] = "TILING_I"
        operatorRepresentation['locPtr'] = ctxt.lookup(operatorRepresentation[tensorName]).name
        operatorRepresentation['baseLocPtr'] = ctxt.hoistReference(operatorRepresentation['locPtr'],
                                                                   operatorRepresentation['locPtr'] + "_ref")
        # "_1" suffix names the second (pong) descriptor.
        operatorRepresentation['_stateReference'] = self._DMAStructName(tensorName, nodeName) + "_1"
        ctxt.lookup(operatorRepresentation['baseLocPtr'])._memoryLevel = self.targetMemLevel

        namePrefix = self.prefix + f"{nodeName}_{tensorName}"

        ctxt, operatorRepresentation = super()._hoistDMAUpdates(ctxt, tensorName, updateList, operatorRepresentation)

        locOffsetList = []
        locBaseOffset = updateList[0].locOffset
        for update in updateList:
            locOffsetList.append(int(update.locOffset) - locBaseOffset)

        name = namePrefix + "_locOffset"
        cb = ctxt.ConstantBuffer(name, [len(updateList)], locOffsetList)
        ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName,
                                                                       'locOffsetPtr')

        return ctxt, operatorRepresentation

    def _generateEgressPointerUpdates(
            self, tilingSchedule: TilingSchedule, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]:
        """Per-output-tensor struct updates using the parity-switching template."""
        updates = []
        newCtxt = ctxt.copy()

        updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule,
                                                  tilingSchedule)

        for key, updateList in updateDict.items():

            newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation)
            updates.append(CodeSnippet(_outUpdateDMATransferStructTemplate, newNodeRep))

        return newCtxt, updates

    def _generateEgressDMACode(
            self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]:
        """Hoist two pi_cl_ram descriptors (ping + pong) per output tensor and
        emit the parity-switched export call and its delayed wait.
        """
        egressDMATransferCalls = []
        egressDMAWaitStatements = []
        exportLoadStep = tilingSchedule.outputLoadSchedule[0]

        for key, rectangle in exportLoadStep.items():
            externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName)
            internalPtr = ctxt.lookup(operatorRepresentation[key])

            tensorName = key
            nodeName = operatorRepresentation['nodeName']
            dmaName = self._DMAStructName(tensorName, nodeName)

            struct = self._rectToDMAStruct(ctxt, rectangle, "FromL2", internalPtr.name, externalPtr.name)
            _ = ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.pi_cl_ram_req_t)
            ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']]

            tensorName = key + "_1"
            nodeName = operatorRepresentation['nodeName']
            _dmaName = self._DMAStructName(tensorName, nodeName)

            struct = self._rectToDMAStruct(ctxt, rectangle, "FromL2", internalPtr.name, externalPtr.name)
            _ = ctxt.hoistStruct(struct, _dmaName, PULPStructDataTypes.pi_cl_ram_req_t)
            ctxt.lookup(_dmaName)._users += [operatorRepresentation['nodeName']]

            egressDMATransferCalls.append(
                CodeSnippet(
                    self._moveTileOutTemplate, {
                        'innerTilePtr': str(internalPtr._instance),
                        "outerTilePtr": str(externalPtr._instance),
                        "stateReference": dmaName,
                        "_stateReference": _dmaName
                    }))

            egressDMAWaitStatements.append(
                CodeSnippet(
                    self._blockTileOutTemplate, {
                        'innerTilePtr': str(internalPtr._instance),
                        "outerTilePtr": str(externalPtr._instance),
                        "stateReference": dmaName,
                        "_stateReference": _dmaName
                    }))

        return egressDMATransferCalls, egressDMAWaitStatements

    def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
                    nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule,
                    variableReplacement: VariableReplacementScheme,
                    operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]:
        """Assemble the double-buffered loop: ingress works on tile TILING_I+1
        (prefetch), egress on TILING_I; tile 0's import is issued in setup and
        outstanding exports are drained in teardown.
        """
        tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation)

        ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode(
            tilingSchedule, ctxt, operatorRepresentation)

        egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode(
            tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation)

        ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation)
        ctxt, egressDMAUpdates = self._generateEgressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation)

        variableUpdates = []

        # Retarget the in-loop snippets to the correct tile index per role.
        for transaction in ingressDMATransferCalls:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I+1"
            _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles']
            _operatorRepresentation["tileIdxPtr"] = tileIdxPtr

        for transaction in ingressDMAUpdates:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I+1"

        for transaction in egressDMATransferCalls:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I"

        for transaction in egressDMAWaitStatements:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation['tileNum'] = "TILING_I"

        for transaction in egressDMAUpdates:
            _operatorRepresentation = transaction.operatorRepresentation
            _operatorRepresentation["tileNum"] = "TILING_I"

        openLoopStatement = [
            CodeSnippet(self._openTileLoopTemplate, {
                "numTiles": operatorRepresentation["numTiles"],
                "tileIdxPtr": tileIdxPtr
            })
        ]

        closeLoopStatement = [
            CodeSnippet(self._closeTileLoopTemplate, {
                "numTiles": operatorRepresentation["numTiles"],
                "tileIdxPtr": tileIdxPtr
            })
        ]

        setupStatements = []
        teardownStatements = []

        # Prologue: issue tile 0's import before entering the loop.
        for transaction in ingressDMATransferCalls:
            _operatorRepresentation = transaction.operatorRepresentation.copy()
            _operatorRepresentation["tileNum"] = 0
            _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles']
            _operatorRepresentation["tileIdxPtr"] = tileIdxPtr
            setupStatements.append(CodeSnippet(transaction.template, _operatorRepresentation))

        # Epilogue: wait on the last export(s) with the final tile count.
        for transaction in egressDMAWaitStatements:
            _operatorRepresentation = transaction.operatorRepresentation.copy()
            _operatorRepresentation['tileNum'] = ctxt.lookup(operatorRepresentation["numTiles"]).values[-1]
            _operatorRepresentation['numTiles'] = len(tilingSchedule.outputLoadSchedule)
            teardownStatements.append(CodeSnippet(_finalBlockTileOutTemplate, _operatorRepresentation))

        metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L3",
                                  nodeOps = operatorRepresentation['nodeOps'],
                                  numTiles = len(tilingSchedule.outputLoadSchedule),
                                  tileIdxVar = "TILING_I")

        newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls,
                                                       ingressDMAWaitStatements, ingressDMAUpdates,
                                                       egressDMATransferCalls, egressDMAWaitStatements,
                                                       egressDMAUpdates, variableUpdates, openLoopStatement,
                                                       closeLoopStatement, setupStatements, teardownStatements)

        return ctxt, newExecutionBlock, True

    def generateTilingLoop(
            self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint,
            tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme,
            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]:
        """Entry point: applies only when every tensor has exactly two L1 base
        offsets (the ping/pong pair); otherwise defers (returns False).
        """
        flatTilingSchedule = copy.copy(tilingSchedules[0])
        for tilingSchedule in tilingSchedules[1:]:
            flatTilingSchedule += tilingSchedule

        offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values())

        if len(offsetLists) == 0:
            return ctxt, executionBlock, False

        for offsetList in offsetLists:
            if not len(offsetList) == 2:
                return ctxt, executionBlock, False

        # NOTE(review): allNumTiles is computed but never used — dead code?
        allNumTiles = [len(schedule.outputLoadSchedule) for schedule in tilingSchedules]
        operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'],
                                                                 tilingSchedules)

        return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement,
                                operatorRepresentation)


class PULPL3TilingGenerationDB(PULPL3TilingDB, DoubleBufferingTilingMixIn):
    # Concrete pass: plain double buffering.
    pass


class ProfilingPULPL3TilingGenerationDB(PULPL3TilingDB, ProfilingDoubleBufferingTilingMixIn):
    # Concrete pass: double buffering with per-tile profiling instrumentation.
    pass
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingSB.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingSB.py
new file mode 100644
index 0000000..10aed52
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingSB.py
@@ -0,0 +1,462 @@
+# ----------------------------------------------------------------------
+#
+# File: PULPL3TilingSB.py
+#
+# Last edited: 19.04.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from collections import namedtuple +from typing import Dict, List, Literal, Optional, Tuple, Type + +import Deeploy.CommonExtensions.DataTypes as BasicDataTypes +from Deeploy.AbstractDataTypes import Immediate, PointerClass +from Deeploy.DeeployTypes import CodeSnippet, ConstantBuffer, ExecutionBlock, NetworkContext, NodeTemplate, \ + OperatorRepresentation +from Deeploy.Targets.PULPOpen.CodeTransformationPasses import AutoTransposeUtils +from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes +from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingSingleBufferingTilingMixIn, \ + SingleBufferingTilingMixIn, TilingMetaInfo +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \ + calculateRectangleOffset, minimizeRectangleDims + +_openTileLoopTemplate = NodeTemplate(""" + +// TILING LOOP +// for (int TILING_I=0; TILING_I<${numTiles}; TILING_I++){ +for (int TILING_I=${numTiles}[*${tileIdxPtr}]; TILING_I<${numTiles}[(*${tileIdxPtr})+1]; TILING_I++){ +""") + +_closeTileLoopTemplate = NodeTemplate(""" + +// CLOSE TILING LOOP +} +*${tileIdxPtr} += 1; + +""") + +_moveTileInTemplate = NodeTemplate(""" + +// IMPORT TILE ${innerTilePtr} from ${outerTilePtr} +pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, 
${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference}); + +""") + +_blockTileInTemplate = NodeTemplate(""" + +// BLOCKING IMPORT TILE ${innerTilePtr} +pi_cl_ram_copy_wait(&${stateReference}); +""") + +_moveTileOutTemplate = NodeTemplate(""" + +// EXPORT TILE ${innerTilePtr} to ${outerTilePtr} +pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, ${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference}); + +""") + +_blockTileOutTemplate = NodeTemplate(""" + +// BLOCKING EXPORT TILE ${innerTilePtr} +pi_cl_ram_copy_wait(&${stateReference}); + +""") + +_updateDMATransferStructTemplate = NodeTemplate(""" + +// UPDATE DMA STRUCT ${stateReference} +${stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]; +${stateReference}.size = ${length1dPtr}[${tileNum}]; +${stateReference}.length = ${number1dPtr}[${tileNum}]; + +""") + +# ${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}]; + +_updateReferenceTemplate = NodeTemplate(""" + +// UPDATE VARIABLE ${reference} +*${reference} = ${baseReference}[${tileNum}]; +""") + +# ADD NUM TRANSFERS VARIABLE + +_DMAUpdate = namedtuple("_DMAUpdate", "extOffset locOffset length_1d_copy number_of_1d_copies number_of_2d_copies") + + +class PULPL3TilingSB(TilingCodeGeneration): + + _prefix = "TILING_REPLACED_" + + _openTileLoopTemplate = _openTileLoopTemplate + _closeTileLoopTemplate = _closeTileLoopTemplate + + _moveTileInTemplate = _moveTileInTemplate + _blockTileInTemplate = _blockTileInTemplate + + _moveTileOutTemplate = _moveTileOutTemplate + _blockTileOutTemplate = _blockTileOutTemplate + + _updateDMATransferStructTemplate = _updateDMATransferStructTemplate + _updateReferenceTemplate = _updateReferenceTemplate + + @property + def prefix(self): + return self._prefix + self.targetMemLevel + "_" + + def _DMAStructName(self, tensorName: str, nodeName: str) -> 
str: + return f"{self.prefix}_DMA_{nodeName}_{tensorName}" + + @classmethod + def _generatePointerUpdates(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + loadSchedule: List[Dict[str, HyperRectangle]], + tilingSchedule: TilingSchedule) -> Dict[str, _DMAUpdate]: + updateDict = {} + deltaOffsets = {} + + for idx, loadStep in enumerate(loadSchedule): + for stepIdx, (key, rect) in enumerate(loadStep.items()): + + if key in tilingSchedule.outputBaseOffsets.keys(): + baseOffsets = tilingSchedule.outputBaseOffsets[key] + direction = "FromL2" + else: + baseOffsets = tilingSchedule.inputBaseOffsets[key] + direction = "ToL2" + + if key not in updateDict.keys(): + updateDict[key] = [] + if key not in deltaOffsets.keys(): + deltaOffsets[key] = 0 + + referenceBuffer = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) + l1Buffer = ctxt.lookup(operatorRepresentation[key]) + + struct = cls._rectToDMAStruct(ctxt, rect, direction, l1Buffer.name, l1Buffer._referenceName) + accOffset = calculateRectangleOffset(rect, referenceBuffer) + + length_1d_copy = struct.value['size'].value + number_of_1d_copies = struct.value['length'].value + + lIdx = idx % len(baseOffsets) + + sol = _DMAUpdate(accOffset, baseOffsets[lIdx], length_1d_copy, number_of_1d_copies, 0) + + deltaOffsets[key] = accOffset + updateDict[key].append(sol) + + return updateDict + + @classmethod + def _rectToDMAStruct(cls, ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL2", "FromL2"], + L1Name: str, L2Name: str) -> PULPStructDataTypes.pi_cl_ram_req_t: + + referenceBuffer = ctxt.lookup(L2Name) + + rect, referenceRect = minimizeRectangleDims(rectangle, referenceBuffer) + assert len(rect.dims) <= 2, "PULP: Only 2D transfers are supported!" 
        # Direction flag for the pi_cl_ram request: usage elsewhere in this
        # pass pairs "ToL2" with ingress (import) transfers and "FromL2" with
        # egress, so 1 presumably means ext->loc (RAM to L2) — TODO confirm
        # against the PULP SDK pi_cl_ram_copy_2d semantics.
        if direction == "ToL2":
            _dir = 1
        else:
            _dir = 0

        # Bytes of one contiguous (innermost-dimension) copy.
        length_1d_copy = rect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8)

        # 2D transfer: second-innermost dim gives the number of 1D copies and
        # the untiled reference row pitch (in bytes) gives the stride.
        # A purely 1D rectangle degenerates to a single copy with no stride.
        if len(rect.dims) > 1:
            number_of_1d_copies = rect.dims[-2]
            stride_1d = referenceRect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8)
        else:
            number_of_1d_copies = 1
            stride_1d = 0

        # NOTE(review): "length" carries the single-copy byte count and "size"
        # the total transfer size; _generatePointerUpdates reads them back
        # under the opposite local names — the struct layout, not the local
        # naming, is authoritative.
        struct = PULPStructDataTypes.pi_cl_ram_req_t(
            {
                "pi_ram_addr": referenceBuffer.name,
                "addr": L1Name,
                "stride": stride_1d,
                "length": length_1d_copy,
                "size": number_of_1d_copies * length_1d_copy,
                "ext2loc": _dir,
                "is_2d": 1
            }, ctxt)

        return struct

    def _hoistConstantAndReference(self,
                                   ctxt: NetworkContext,
                                   constBuf: ConstantBuffer,
                                   operatorRepresentation: OperatorRepresentation,
                                   nodeName: str,
                                   operatorRepresentationName: str,
                                   immediateType: Optional[Type[Immediate]] = None) -> Tuple[NetworkContext, Dict]:
        """Hoist ``constBuf`` into the context and register a pointer to it.

        The buffer is tagged with this pass's target memory level and with
        ``nodeName`` as its sole user, then hoisted via
        ``ctxt.hoistConstantAndReference``; the resulting reference name is
        stored in ``operatorRepresentation[operatorRepresentationName]``.
        ``immediateType`` selects the pointee type (defaults to int32).
        Returns the (mutated) context and operator representation.
        """
        if immediateType is None:
            _type = PointerClass(BasicDataTypes.int32_t)
        else:
            _type = PointerClass(immediateType)

        constBuf._users = [nodeName]
        constBuf._memoryLevel = self.targetMemLevel

        refName = ctxt.hoistConstantAndReference(constBuf, _type)

        operatorRepresentation[operatorRepresentationName] = refName

        return ctxt, operatorRepresentation

    def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate],
                         operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]:
        """Materialize per-tile DMA update parameters as constant lookup tables.

        Flattens the per-tile external offsets, 1D copy lengths and 1D copy
        counts from ``updateList`` into parallel lists (indexed by the runtime
        tile counter) for hoisting as constant buffers. Works on a copy of
        ``operatorRepresentation``; the caller receives the augmented copy.
        """
        operatorRepresentation = operatorRepresentation.copy()

        nodeName = operatorRepresentation['nodeName']

        # One entry per tile; indexed by TILING_I in the generated update code.
        offsetList = []
        len1dList = []
        num1dList = []
        num2dList = []
        for update in updateList:
            offsetList.append(int(update.extOffset))
            len1dList.append(int(update.length_1d_copy))
            num1dList.append(int(update.number_of_1d_copies))
            num2dList.append(int(update.number_of_2d_copies))

        dmaName = self._DMAStructName(tensorName, nodeName)
        operatorRepresentation['stateReference'] = dmaName
operatorRepresentation['tileNum'] = "TILING_I" + operatorRepresentation['extPtr'] = ctxt.lookup(operatorRepresentation[tensorName])._referenceName + + namePrefix = self.prefix + f"{nodeName}_{tensorName}" + + name = namePrefix + "_offset" + cb = ctxt.ConstantBuffer(name, [len(updateList)], offsetList) + ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, + 'extOffsetPtr') + + name = namePrefix + "_length_1d_copy" + cb = ctxt.ConstantBuffer(name, [len(updateList)], len1dList) + ctxt, operatorRepresentation = self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'length1dPtr', + PULPStructDataTypes.pi_cl_ram_req_t.structTypeDict['size']) + + name = namePrefix + "_number_of_1d_copies" + cb = ctxt.ConstantBuffer(name, [len(updateList)], num1dList) + ctxt, operatorRepresentation = self._hoistConstantAndReference( + ctxt, cb, operatorRepresentation, nodeName, 'number1dPtr', + PULPStructDataTypes.pi_cl_ram_req_t.structTypeDict['length']) + + return ctxt, operatorRepresentation + + def _generateEgressPointerUpdates( + self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: + + updates = [] + newCtxt = ctxt.copy() + + updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, + tilingSchedule) + + for key, updateList in updateDict.items(): + + newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) + updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) + + return newCtxt, updates + + def _generateIngressPointerUpdates( + self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: + + updates = [] + newCtxt = ctxt.copy() + + updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, 
tilingSchedule.inputLoadSchedule, + tilingSchedule) + + for key, updateList in updateDict.items(): + + newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) + updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) + + return newCtxt, updates + + def _generateVariableUpdates(self, tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[CodeSnippet]: + + updates = [] + + for key in variableReplacement.perTileReplacements.keys(): + + buf = ctxt.lookup(operatorRepresentation[key]) + reference = str(buf._instance) + + updates.append( + CodeSnippet(self._updateReferenceTemplate, { + "reference": reference, + "tileNum": "TILING_I", + "baseReference": buf._referenceName + })) + + return updates + + def _generateDMACode(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + loadSchedule: List[Dict[str, HyperRectangle]], + direction: Literal["ToL2", "FromL2"]) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: + + DMATransferCalls = [] + DMAWaitStatements = [] + + allNumTransfers = AutoTransposeUtils.allNumTransfers(ctxt, operatorRepresentation, loadSchedule, direction) + + transferNodeRep = {} + + loadStep = loadSchedule[0] + + for idx, (key, rectangle) in enumerate(loadStep.items()): + + externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) + internalPtr = ctxt.lookup(operatorRepresentation[key]) + + tensorName = key + nodeName = operatorRepresentation['nodeName'] + dmaName = self._DMAStructName(tensorName, nodeName) + + transferNodeRep = { + **transferNodeRep, + **{ + 'innerTilePtr': str(internalPtr._instance), + "outerTilePtr": str(externalPtr._instance), + "stateReference": dmaName + } + } + + struct = self._rectToDMAStruct(ctxt, rectangle, direction, internalPtr.name, externalPtr.name) + transferNodeRep["stateStruct"] = struct + _ = 
ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.pi_cl_ram_req_t) + ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']] + + DMATransferCalls.append(CodeSnippet(self._moveTileInTemplate, transferNodeRep)) + + DMAWaitStatements.append(CodeSnippet(self._blockTileInTemplate, transferNodeRep)) + + return DMATransferCalls, DMAWaitStatements + + def _generateIngressDMACode( + self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: + + importLoadStep = tilingSchedule.inputLoadSchedule + ingressDMATransferCalls, ingressDMAWaitStatements = self._generateDMACode(ctxt, operatorRepresentation, + importLoadStep, "ToL2") + return ingressDMATransferCalls, ingressDMAWaitStatements + + def _generateEgressDMACode( + self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: + + exportLoadStep = tilingSchedule.outputLoadSchedule + egressDMATransferCalls, egressDMAWaitStatements = self._generateDMACode(ctxt, operatorRepresentation, + exportLoadStep, "FromL2") + + return egressDMATransferCalls, egressDMAWaitStatements + + def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, + nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, + variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + + tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation) + + ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode( + tilingSchedule, ctxt, operatorRepresentation) + + egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode( + tilingSchedule, ctxt, operatorRepresentation) + + ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation) + ctxt, 
egressDMAUpdates = self._generateEgressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation) + + setupStatements: List[CodeSnippet] = [] + teardownStatements: List[CodeSnippet] = [] + variableUpdates: List[CodeSnippet] = [] + + openLoopStatement = [ + CodeSnippet(self._openTileLoopTemplate, { + "numTiles": operatorRepresentation["numTiles"], + "tileIdxPtr": tileIdxPtr + }) + ] + + closeLoopStatement = [ + CodeSnippet(self._closeTileLoopTemplate, { + "numTiles": operatorRepresentation["numTiles"], + "tileIdxPtr": tileIdxPtr + }) + ] + + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'], + nodeOps = operatorRepresentation['nodeOps'], + numTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxVar = "TILING_I") + + newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls, + ingressDMAWaitStatements, ingressDMAUpdates, + egressDMATransferCalls, egressDMAWaitStatements, + egressDMAUpdates, variableUpdates, openLoopStatement, + closeLoopStatement, setupStatements, teardownStatements) + + return ctxt, newExecutionBlock, True + + def generateTilingLoop( + self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, + tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + + flatTilingSchedule = copy.copy(tilingSchedules[0]) + for tilingSchedule in tilingSchedules[1:]: + flatTilingSchedule += tilingSchedule + + offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) + + if len(offsetLists) == 0: + return ctxt, executionBlock, False + + for offsetList in offsetLists: + if not len(offsetList) == 1: + return ctxt, executionBlock, False + + operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'], + tilingSchedules) + + return 
self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, + operatorRepresentation) + + +class PULPL3TilingGenerationSB(PULPL3TilingSB, SingleBufferingTilingMixIn): + pass + + +class ProfilingPULPL3TilingGenerationSB(PULPL3TilingSB, ProfilingSingleBufferingTilingMixIn): + pass diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/__init__.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/PULPOpen/DataTypes.py b/Deeploy/Targets/PULPOpen/DataTypes.py new file mode 100644 index 0000000..ec526c3 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/DataTypes.py @@ -0,0 +1,84 @@ +# ---------------------------------------------------------------------- +# +# File: PULPDataTypes.py +# +# Last edited: 01.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from functools import partial

from Deeploy.AbstractDataTypes import PointerClass, Struct, VoidType
from Deeploy.CommonExtensions.DataTypes import int32_t, uint8_t, uint16_t, uint32_t
from Deeploy.DeeployTypes import NodeTemplate
from Deeploy.FutureExtension.Future import FutureClass

# Resolve template for PULPDMAFuture: a DMA-backed future is considered
# resolved once dory_dma_barrier returns on its DMA_copy state struct.
_DMAResolveTemplate = NodeTemplate("""
// PULP CLUSTER DMA Resolve
dory_dma_barrier(&${stateReference});
""")

# Dispatch template is intentionally a no-op: the transfer is started
# elsewhere, only resolution is synchronized through the future.
_DMADispatchTemplate = NodeTemplate("""
// PULP CLUSTER DMA Dispatch
// No dispatch necessary
""")


class DMA_copy(Struct):
    # Mirror of the cluster-DMA copy descriptor used by dory_dma_barrier.
    # NOTE(review): field order and widths presumably must match the C struct
    # definition in the target runtime headers — confirm before editing.
    typeName = "DMA_copy"
    structTypeDict = {
        "ext": PointerClass(VoidType),
        "loc": PointerClass(VoidType),
        "hwc_to_chw": uint16_t,
        "stride_2d": uint16_t,
        "number_of_2d_copies": uint16_t,
        "stride_1d": uint16_t,
        "number_of_1d_copies": uint16_t,
        "length_1d_copy": uint16_t,
        "mchan_cmd": uint32_t,
        "dir": int32_t,
        "tid": int32_t
    }


class pi_cl_ram_req_t(Struct):
    # Mirror of the PULP SDK cluster-RAM request descriptor consumed by
    # pi_cl_ram_copy_2d / pi_cl_ram_copy_wait in the L3 tiling templates.
    # NOTE(review): layout must stay in sync with the SDK header — confirm.
    typeName = "pi_cl_ram_req_t"
    structTypeDict = {
        "addr": PointerClass(VoidType),
        "pi_ram_addr": PointerClass(VoidType),
        "size": uint32_t,
        "stride": uint32_t,
        "length": uint32_t,
        "is_2d": uint8_t,
        "ext2loc": uint8_t,
    }


@dataclass
class PULPStructDataTypes():
    # Namespace bundling the PULP-specific struct types for lookup by name
    # (e.g. PULPStructDataTypes.pi_cl_ram_req_t in the tiling passes).
    DMA_copy = DMA_copy
    pi_cl_ram_req_t = pi_cl_ram_req_t


# Future specialization whose state is a DMA_copy struct and whose resolution
# is the dory_dma_barrier call above.
PULPDMAFuture = partial(FutureClass,
                        stateReferenceType = PULPStructDataTypes.DMA_copy,
                        resolveCheckTemplate = _DMAResolveTemplate,
                        dispatchCheckTemplate = _DMADispatchTemplate)
diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py
new file mode 100644
index 0000000..df34afd
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Deployer.py
@@ -0,0 +1,123 @@
# ----------------------------------------------------------------------
#
# File: PULPDeployer.py
#
# Last edited: 08.03.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +from typing import Callable, Dict, Type + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.CommonExtensions.OptimizationPasses.BindingsOptimizationPasses.AutoTranspose import AutoTransposeMergePass +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + PULPNCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer +from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ + TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass +from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import RQAddTransposeSquashPass + +_L3AllocTemplate = NodeTemplate(""" +${locPtr} = cl_ram_malloc(${size}); +""") + +_L3InitTemplate = NodeTemplate(""" +load_file_to_ram(${locPtr}, "${extName}.hex"); +""") + + +class PULPDeployer(SignPropDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets = {}): + super().__init__(graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets) + + self.loweringOptimizer.passes += [ + TransposeMatmulInputsPass(), + PULPNCHWtoNHWCPass(self.default_channels_first), + TransposeSplitPass(), + RQAddTransposeSquashPass(), + TransposeSplitPass(), + TransposeMergePass(), + TransposeConstOptPass(), + ReshapeConstOptPass(), + 
TransposeNoPermOptPass(), + RemoveGlobalOutputReshapePass(), + ] + + def bind(self): + # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. + # SCHEREMO: The BindingOptimizationPass system is fairly fragile; + # it was designed this way because implementing further topology optimizations after + # parsing is very involved. If there are further use-cases, we should consider making this effort, + # but if there is only very few cases, this solution is okay. + autoTransposePass = AutoTransposeMergePass() + #self.ctxt, self.layerBinding = autoTransposePass.apply(self.ctxt, self.graph, self.layerBinding) + # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. + ret = super().bind() + if ret: + self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") + return ret + + def generateBufferAllocationCode(self) -> str: + retStr = super().generateBufferAllocationCode() + + L3FileStr = "" + globalConstBuffers = [ + buf for key, buf in self.ctxt.globalObjects.items() if isinstance(buf, VariableBuffer) and buf._deploy + ] + nonArenaBuffers = [buf for buf in globalConstBuffers if buf._users != []] + l3ConstBuffer = [buf for buf in nonArenaBuffers if hasattr(buf, "_memoryLevel") and buf._memoryLevel == "L3"] + + for idx, buf in enumerate(l3ConstBuffer): + + locPtr = str(buf._instance) + extName = str(idx) + buf.extName = extName + size = np.prod(buf.shape) * (buf._type.referencedType.typeWidth // 8) + + if isinstance(buf, ConstantBuffer): + L3FileStr += _L3AllocTemplate.generate({"locPtr": locPtr, "extName": extName, "size": size}) + + L3FileStr += _L3InitTemplate.generate({"locPtr": locPtr, "extName": extName, "size": size}) + + retStr = retStr + L3FileStr + + return retStr diff --git a/Deeploy/Targets/PULPOpen/Layers.py b/Deeploy/Targets/PULPOpen/Layers.py new file mode 100644 index 0000000..d291078 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Layers.py @@ -0,0 +1,64 @@ +# 
----------------------------------------------------------------------
#
# File: CMSISLayers.py
#
# Last edited: 22.12.2021
#
# Copyright (C) 2021, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple

from Deeploy.DeeployTypes import NodeMapper, Shape
from Deeploy.Targets.Generic.Layers import RQGEMMLayer, RQSConvLayer


class PULPRQSConvLayer(RQSConvLayer):
    """Requantized convolution layer for PULP targets.

    Specializes shape inference so the per-channel requantization parameter
    inputs track the number of output channels.
    """

    def __init__(self, maps: List[NodeMapper]):
        super().__init__(maps)

    def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
                      channels_first) -> Tuple[Shape, Shape]:
        # Inputs 2 and 3 (requantization mul/add vectors) are forced to one
        # entry per output channel; the channel axis of the output is 1 for
        # channels-first (NCHW) layouts and -1 for channels-last (NHWC).
        if channels_first:
            inputShapes[2] = [outputShapes[0][1]]  # Channels out dimension of Kernel
            inputShapes[3] = [outputShapes[0][1]]  # Channels out dimension of Kernel
        else:
            inputShapes[2] = [outputShapes[0][-1]]  # Channels out dimension of Kernel
            inputShapes[3] = [outputShapes[0][-1]]  # Channels out dimension of Kernel
        return (inputShapes, outputShapes)


class PULPRQSGEMMLayer(RQGEMMLayer):
    """Requantized GEMM layer for PULP targets.

    Specializes shape inference so the per-channel requantization parameter
    inputs track the output-channel dimension of the weight operand.
    """

    def __init__(self, maps: List[NodeMapper]):
        super().__init__(maps)

    def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
                      channels_first) -> Tuple[Shape, Shape]:

        # The weight operand (input 1) carries the output-channel dimension at
        # axis -2 when it is transposed (transB) and at axis -1 otherwise.
        if operatorRepresentation['transB']:
            channelDim = -2
        else:
            channelDim = -1

        # Inputs 2 and 3 (requantization mul/add vectors): one entry per
        # output channel of the weight matrix.
        inputShapes[2] = [inputShapes[1][channelDim]]  # Channels out dimension of Kernel
        inputShapes[3] = [inputShapes[1][channelDim]]  # Channels out dimension of Kernel

        return (inputShapes, outputShapes)
diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py
new file mode 100644
index 0000000..6878237
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Parsers.py
@@ -0,0 +1,423 @@
# ----------------------------------------------------------------------
#
# File: PULPParsers.py
#
# Last edited: 10.03.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Tuple

import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import NetworkContext
from Deeploy.Targets.Generic.Parsers import AddParser, GEMMParser, RQSConv1DParser, RQSConv2DParser, RQSParserInterface


class PULPRQAddParser(AddParser):
    """Parser for requantized elementwise additions on PULP.

    Accepts an Add node only when all three requantization parameter sets
    (prefixes 'rqs1', 'rqs2', 'rqsOut': mul, add, div, signed, n_levels) are
    present as node attributes, and lowers them into the operator
    representation.
    """

    @staticmethod
    def _rqsAttrsPresent(node: gs.Node, prefix: str) -> bool:
        # Check that a complete requant parameter set exists for `prefix`.
        # Different quantization flows emit either `<p>_n_levels` or
        # `<p>_n_levels_out`; accept both spellings.
        return all([
            f'{prefix}_mul' in node.attrs,
            f'{prefix}_add' in node.attrs,
            f'{prefix}_div' in node.attrs,
            f'{prefix}_signed' in node.attrs,
            any([f'{prefix}_n_levels' in node.attrs, f'{prefix}_n_levels_out' in node.attrs]),
        ])

    def _lowerRQSAttrs(self, node: gs.Node, prefix: str) -> None:
        # Copy one requant parameter set into the operator representation.
        if f'{prefix}_n_levels' in node.attrs:
            self.operatorRepresentation[f'{prefix}_n_levels'] = int(node.attrs[f'{prefix}_n_levels'].values)
        else:
            self.operatorRepresentation[f'{prefix}_n_levels'] = int(node.attrs[f'{prefix}_n_levels_out'].values)
        self.operatorRepresentation[f'{prefix}_mul'] = int(node.attrs[f'{prefix}_mul'])
        self.operatorRepresentation[f'{prefix}_add'] = int(node.attrs[f'{prefix}_add'])
        self.operatorRepresentation[f'{prefix}_signed'] = int(node.attrs[f'{prefix}_signed'].values)
        # div is assumed to be a power of two; store its log2 as the shift.
        self.operatorRepresentation[f'{prefix}_log2D'] = int(math.log2(node.attrs[f'{prefix}_div'].values))

    def parseNode(self, node: gs.Node) -> bool:
        if not super().parseNode(node):
            return False

        ret = all(self._rqsAttrsPresent(node, prefix) for prefix in ('rqs1', 'rqs2', 'rqsOut'))

        if ret:
            for prefix in ('rqs1', 'rqs2', 'rqsOut'):
                self._lowerRQSAttrs(node, prefix)

        return ret


class PULPConv2DParser(RQSConv2DParser):
    """Parser for dense (group == 1) requantized 2D convolutions on PULP."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            return False

        ret = all([
            self.operatorRepresentation['group'] == 1,
            # Only symmetric padding is supported by the PULP kernels
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],
            self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            # data_in, weight, mul, add
            len(node.inputs) == 4,
            'shift' in node.attrs,
        ])

        # Flatten the ONNX list attributes into scalar entries for the
        # kernel templates. (Assignments are harmless even when ret is False.)
        self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0])
        self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1])
        self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0])
        self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1])
        self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0])
        self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1])
        self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2])
        self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3])
        self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0])
        self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1])

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            inputs = ['data_in', 'weight', 'mul', 'add']
            for idx, inputNode in enumerate(node.inputs):
                # Look up in newCtxt (the context returned by the base
                # parser), consistent with the depthwise parsers below.
                self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

            return newCtxt, True

        return ctxt, False


class PULPDWConv1DParser(RQSConv1DParser):
    """Parser for depthwise requantized 1D convolutions on PULP."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            # Explicit False instead of the implicit None of the original
            return False

        ret = all([
            # Only symmetric padding is supported by the PULP kernels
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            # data_in, weight, mul, add
            len(node.inputs) == 4,
            # Guard the attribute accesses below (mirrors PULPDWConv2DParser)
            any(['n_levels' in node.attrs, 'n_levels_out' in node.attrs]),
            'signed' in node.attrs,
        ])

        if ret:
            self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][0])
            self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][0])
            self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0])
            self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][1])
            self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][0])

            if 'n_levels' in node.attrs:
                self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values)
            else:
                self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels_out'].values)

            self.operatorRepresentation['signed'] = int(node.attrs['signed'].values)
            self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['div'].values))

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:

            inputs = ['data_in', 'weight', 'mul', 'add']
            for idx, inputNode in enumerate(node.inputs):
                self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

            # Depthwise: the group count must equal the number of filters
            if not self.operatorRepresentation['group'] == newCtxt.lookup(
                    self.operatorRepresentation['weight']).shape[0]:
                return ctxt, False

            return newCtxt, True

        return ctxt, False


class PULPDWConv2DParser(RQSConv2DParser):
    """Parser for depthwise requantized 2D convolutions on PULP."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            return False

        ret = all([
            node.op == 'RequantizedConv',
            # Only symmetric padding is supported by the PULP kernels
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],
            self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            # data_in, weight, mul, add
            len(node.inputs) == 4,
            'shift' in node.attrs,
            any(['n_levels' in node.attrs, 'n_levels_out' in node.attrs]),
            'signed' in node.attrs,
        ])

        if ret:
            self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0])
            self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1])
            self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0])
            self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1])
            self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0])
            self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1])
            self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2])
            self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3])
            self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0])
            self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1])

            if 'n_levels' in node.attrs:
                self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values)
            else:
                self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels_out'].values)
            self.operatorRepresentation['signed'] = int(node.attrs['signed'].values)
            self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['div'].values))

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        # BUGFIX: forward channels_first to the base parser; the original
        # dropped it, silently defaulting to channels-first layouts.
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:

            inputs = ['data_in', 'weight', 'mul', 'add']
            for idx, inputNode in enumerate(node.inputs):
                self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

            # Depthwise: the group count must equal the number of filters
            if not self.operatorRepresentation['group'] == newCtxt.lookup(
                    self.operatorRepresentation['weight']).shape[0]:
                return ctxt, False

            data_in = newCtxt.lookup(self.operatorRepresentation['data_in'])
            data_out = newCtxt.lookup(self.operatorRepresentation['data_out'])
            _ = newCtxt.lookup(self.operatorRepresentation['weight'])

            if channels_first:
                # NCHW: channel axis 1, spatial axes 2/3
                self.operatorRepresentation['ch_im_in'] = data_in.shape[1]
                self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2]
                self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3]
                self.operatorRepresentation['ch_im_out'] = data_out.shape[1]
                self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2]
                self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3]
            else:
                # NHWC output indexing below matches the original code.
                # NOTE(review): the input is still indexed NCHW-style here;
                # confirm whether data_in is expected pre-transposed in the
                # channels-last path.
                self.operatorRepresentation['ch_im_in'] = data_in.shape[1]
                self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2]
                self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3]
                self.operatorRepresentation['ch_im_out'] = data_out.shape[3]
                self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1]
                self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2]

            return newCtxt, True

        return ctxt, False


class PULPConv1DParser(RQSConv1DParser):
    """Parser for dense (group == 1) requantized 1D convolutions on PULP."""

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)
        if not wellFormed:
            # Explicit False instead of the implicit None of the original
            return False

        ret = all([
            self.operatorRepresentation['group'] == 1,
            # Only symmetric padding is supported by the PULP kernels
            self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
            # data_in, weight, mul, add
            len(node.inputs) == 4,
        ])

        self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][0])
        self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][0])
        self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0])
        self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][1])
        self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][0])

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            inputs = ['data_in', 'weight', 'mul', 'add']
            for idx, inputNode in enumerate(node.inputs):
                # Look up in newCtxt for consistency with the other parsers
                self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

            return newCtxt, True

        return ctxt, False


class PULPGEMMParser(GEMMParser, RQSParserInterface):
    """Parser for requantized GEMM nodes on PULP.

    A node must satisfy both the requantization interface and the plain GEMM
    parser, carry a 'shift' attribute, and have exactly four inputs
    (A, B, C, mul).
    """

    def __init__(self):
        super().__init__(noBiasHoisting = True)

    def parseNode(self, node: gs.Node) -> bool:

        ret_rqs = RQSParserInterface.parseNode(self, node)
        ret_matmul = GEMMParser.parseNode(self, node)

        ret = all([
            ret_rqs == True,
            ret_matmul == True,
            'shift' in node.attrs,
            len(node.inputs) == 4,
        ])

        if ret:
            self.operatorRepresentation['shift'] = int(node.attrs['shift'].values)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            inputs = ['A', 'B', 'C', 'mul']
            for idx, inputNode in enumerate(node.inputs):
                self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name

            return newCtxt, True

        return ctxt, False


class PULPMatrixVecParser(PULPGEMMParser):
    """GEMM specialization: matrix-vector products (M == 1) with batch >= 8."""

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        if not (self.operatorRepresentation['M'] == 1 and self.operatorRepresentation['batch'] >= 8):
            return ctxt, False

        return newCtxt, True


class PULPTallGEMMParser(PULPGEMMParser):
    """GEMM specialization: tall matrices (M >= 8) with small batch (< 8)."""

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if not ret:
            return ctxt, False

        ret = all([
            self.operatorRepresentation['batch'] < 8,
            self.operatorRepresentation['M'] >= 8,
            # Heuristic tie-breaker against the plain GEMM kernel
            self.operatorRepresentation['M'] % 8 < self.operatorRepresentation['O'] % 8,
        ])

        if not ret:
            return ctxt, False

        return newCtxt, True
import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \
    NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
from Deeploy.Targets.CortexM.Parsers import CMSISMaxPool2DParser
from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicPad1DBindings, BasicPad2DBindings, \
    BasicReshapeBindings, BasicRQIntegerDivBinding
from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, GatherLayer, MatMulLayer, MaxPoolLayer, MulLayer, \
    PadLayer, ReduceMeanLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, \
    SliceLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer, iSoftmaxLayer
from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, FlattenParser, GatherParser, MatMulParser, \
    MulParser, Pad1DParser, Pad2DParser, ReduceMeanParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, \
    RQSiGELUParser, RQSiHardswishParser, SliceParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, \
    iHardswishParser, iRMSNormParser, iSoftmaxParser
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
    MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
    SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass
from Deeploy.Targets.PULPOpen.Bindings import PULPConv1DBinding, PULPDMASliceBindings, PULPDWConv1DBinding, \
    PULPReduceMeanBindings
from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer
from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \
    PULPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, PULPRQAddParser, PULPTallGEMMParser
from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate
from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \
    PULPFlattenTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \
    PULPiRQSGELUTilingReadyBindings, PULPiSoftmaxTilingReadyBindings, PULPMatMulTilingReadyBindings, \
    PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, PULPRQAddTilingReadyBindings, \
    PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \
    PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \
    PULPRQSTilingReadyBindings, PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings
from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \
    PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass

# Node mappers: pair each parser with the bindings it can be lowered to.
RQAddMapper = NodeMapper(PULPRQAddParser(), PULPRQAddTilingReadyBindings)
AddMapper = NodeMapper(AddParser(), PULPAddTilingReadyBindings)
FlattenMapper = NodeMapper(FlattenParser(), PULPFlattenTilingReadyBindings)
GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings)
MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings)
Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings)
Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings)
ReshapeMapper = NodeMapper(ReshapeParser(), PULPFlattenTilingReadyBindings)
TransposeMapper = NodeMapper(TransposeParser(), PULPTransposeTilingReadyBindings)
UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings)

RequantShiftMapper = NodeMapper(RequantShiftParser(), PULPRQSTilingReadyBindings)
UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), PULPUniformRQSTilingReadyBindings)

ReduceMeanMapper = NodeMapper(ReduceMeanParser(), PULPReduceMeanBindings)
MatMulMapper = NodeMapper(MatMulParser(), PULPMatMulTilingReadyBindings)
RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding])
RQGELU_int8_Mapper = NodeMapper(RQSiGELUParser(), PULPiRQSGELUTilingReadyBindings)

Conv1DMapper = NodeMapper(PULPConv1DParser(), [PULPConv1DBinding])
DWConv1DMapper = NodeMapper(PULPDWConv1DParser(), [PULPDWConv1DBinding])

Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings)
DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings)
GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings)
MatrixVecMapper = NodeMapper(PULPMatrixVecParser(), PULPRQSMatrixVecTilingReadyBindings)
TallGEMMMapper = NodeMapper(PULPTallGEMMParser(), PULPRQSTallGEMMTilingReadyBindings)
MaxPool2DMapper = NodeMapper(CMSISMaxPool2DParser(), PULPMaxPool2DTilingReadyBindings)
Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), PULPiSoftmaxTilingReadyBindings)

ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings)

SliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings)

iRMSNormMapper = NodeMapper(iRMSNormParser(), PULPiRMSNormTilingReadyBindings)

iHardswishMapper = NodeMapper(iHardswishParser(), PULPiHardswishTilingReadyBindings)
RQSiHardswishMapper = NodeMapper(RQSiHardswishParser(), PULPRQSiHardswishTilingReadyBindings)

# Dispatch table: ONNX op name -> layer with its candidate mappers.
# Mapper order matters: the first mapper whose parser accepts the node wins
# (e.g. specialized GEMM kernels are tried before the generic one).
PULPMapping = {
    'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]),
    'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]),
    'MaxPool': MaxPoolLayer([MaxPool2DMapper]),
    'RequantizediGELU': RQSiGELULayer([RQGELU_int8_Mapper]),
    'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]),
    'MatMul': MatMulLayer([MatMulMapper]),
    'IntegerMean': ReduceMeanLayer([ReduceMeanMapper]),
    'iSoftmax': iSoftmaxLayer([Softmax_int8_Mapper]),
    'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]),
    'RequantShift': RequantShiftLayer([UniformRequantShiftMapper, RequantShiftMapper]),
    'Add': AddLayer([AddMapper]),
    'Flatten': ReshapeLayer([FlattenMapper]),
    'Gather': GatherLayer([GatherMapper]),
    'Mul': MulLayer([MulMapper]),
    'Pad': PadLayer([Pad1DMapper, Pad2DMapper]),
    'Reshape': ReshapeLayer([ReshapeMapper]),
    'Transpose': TransposeLayer([TransposeMapper]),
    'Unsqueeze': ReshapeLayer([UnsqueezeMapper]),
    'Slice': SliceLayer([SliceMapper]),
    'RequantizedAdd': AddLayer([RQAddMapper]),
    'Concat': ConcatLayer([ConcatMapper]),
    'iRMSNorm': iRMSNormLayer([iRMSNormMapper]),
    'iHardswish': iHardswishLayer([iHardswishMapper]),
    'RequantizediHardswish': RQSiHardswishLayer([RQSiHardswishMapper])
}


class PULPVariableBuffer(VariableBuffer):
    """Variable buffer whose allocation template dispatches on memory level."""

    initTemplate = AllocateTemplate.pulpL2InitTemplate
    allocTemplate = AllocateTemplate.pulpGenericAllocate
    deallocTemplate = FreeTemplate.pulpGenericFree

    def _bufferRepresentation(self):
        # _memoryLevel is only present once the memory-level annotation
        # extension has run; fall back to None (templates treat None as L2).
        memoryLevel = getattr(self, "_memoryLevel", None)

        return {
            "type": self._instance,
            "name": self.name,
            "size": int(np.prod(self.shape)),
            "_memoryLevel": memoryLevel
        }


class PULPTransientBuffer(TransientBuffer):
    """Transient (scratch) buffer with memory-level-aware allocation."""

    initTemplate = AllocateTemplate.pulpL2InitTemplate
    allocTemplate = AllocateTemplate.pulpGenericAllocate
    deallocTemplate = FreeTemplate.pulpGenericFree

    def _bufferRepresentation(self):
        # See PULPVariableBuffer: memory level is optional metadata.
        memoryLevel = getattr(self, "_memoryLevel", None)

        return {"type": self._type, "name": self.name, "size": self.size, "_memoryLevel": memoryLevel}


class PULPConstantBuffer(ConstantBuffer):
    """Constant (weight) buffer, emitted as a static global initializer."""

    initTemplate = AllocateTemplate.pulpGenericGlobalInitTemplate
    allocTemplate = AllocateTemplate.pulpL2GlobalAllocateTemplate
    deallocTemplate = FreeTemplate.pulpL2GlobalTemplate

    def _bufferRepresentation(self):
        operatorRepresentation = super()._bufferRepresentation()

        # Augment the base representation with the (optional) memory level.
        operatorRepresentation["_memoryLevel"] = getattr(self, "_memoryLevel", None)

        return operatorRepresentation


class PULPStructBuffer(StructBuffer):
    """Struct buffer; reuses the generic reference templates."""

    initTemplate = BasicAllocateTemplate.referenceStructInitTemplate
    allocTemplate = BasicAllocateTemplate.referenceStructAllocateTemplate
    deallocTemplate = NodeTemplate("")


# Graph-level rewrites applied before lowering. Order is significant:
# requant splitting/merging passes feed the PULP-specific merge passes.
PULPOptimizer = TopologyOptimizer([
    SkipEmptyConcatPass(),
    SkipUnityRequantPass(previous_op_regex = "Concat", num_inputs = 2),
    SkipUnityRequantPass(previous_op_regex = "Reshape|Transpose", num_inputs = 1),
    # NOTE(review): the pass below is a duplicate of the previous line in the
    # original source; kept as-is in case running it twice is intentional
    # (e.g. to catch patterns exposed by the first application) — confirm.
    SkipUnityRequantPass(previous_op_regex = "Reshape|Transpose", num_inputs = 1),
    RQSSplitPass(),
    MergeTrueIntegerDivRequantShiftPass(),
    IntegerDivRequantMergePass(),
    iGELURequantMergePass(),
    iHardswishRequantMergePass(),
    PULPConvRequantMergePass(),
    MergeConstAddAndRequantPass(),
    PULPGEMMRequantMergePass(),
    PULPMatMulRequantMergePass(),
    PULPAddRequantMergePass()
])

# SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't...
_includeList = [
    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployBasicMath.h", "dory_dma.h", "dory_mem.h", "bsp/ram.h"
]


class PULPClusterEngine(DeploymentEngine):
    """Deployment engine targeting the PULP compute cluster."""

    def __init__(self, name: str, Mapping = PULPMapping, initCode = "", includeList = _includeList) -> None:
        super().__init__(name, Mapping, initCode, includeList)


class PULPPlatform(DeploymentPlatform):
    """Deployment platform bundling the PULP cluster engine and buffers."""

    def __init__(self,
                 engines = None,
                 variableBuffer = PULPVariableBuffer,
                 constantBuffer = PULPConstantBuffer,
                 structBuffer = PULPStructBuffer,
                 transientBuffer = PULPTransientBuffer) -> None:
        # BUGFIX: the original used a mutable default
        # (engines = [PULPClusterEngine("PULPCluster")]), which constructs a
        # single engine at definition time shared by every platform instance.
        if engines is None:
            engines = [PULPClusterEngine("PULPCluster")]
        super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)


class MemoryPULPPlatform(MemoryPlatform):
    """PULP platform variant carrying an explicit memory hierarchy."""

    # Ops that are never tiled; their tensors keep their annotated level.
    untiledOps = ["add"]

    def __init__(self,
                 memoryHierarchy: MemoryHierarchy,
                 defaultTargetMemoryLevel: MemoryLevel,
                 engines = None,
                 variableBuffer = PULPVariableBuffer,
                 constantBuffer = PULPConstantBuffer,
                 structBuffer = PULPStructBuffer,
                 transientBuffer = PULPTransientBuffer) -> None:
        # BUGFIX: same mutable-default fix as in PULPPlatform above.
        if engines is None:
            engines = [PULPClusterEngine("PULPCluster")]
        super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer,
                         structBuffer, transientBuffer)

    def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str:
        # Untiled ops keep whatever level the tensor was annotated with.
        if node.op in self.untiledOps:
            return ctxt.lookup(tensorName)._memoryLevel
        return super().getTargetMemoryLevel(node, tensorName, ctxt)


class MemoryPULPPlatformWrapper(MemoryPlatformWrapper):
    """Wraps an existing PULPPlatform with memory-hierarchy awareness."""

    # Ops that are never tiled; their tensors keep their annotated level.
    untiledOps = ["add"]

    def __init__(self, platform: PULPPlatform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel):
        assert isinstance(platform, PULPPlatform), \
            f"Given platform is not an instance of PULPPlatform. Platform type: {type(platform).__name__}"
        super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel)

    def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str:
        # Untiled ops keep whatever level the tensor was annotated with.
        if node.op in self.untiledOps:
            return ctxt.lookup(tensorName)._memoryLevel
        return super().getTargetMemoryLevel(node, tensorName, ctxt)
from Deeploy.DeeployTypes import NodeTemplate

# Templates emitting C declarations/allocations for PULP buffers.
# Naming convention: pulpL1*/pulpL2* pin a buffer to one memory level;
# pulpGeneric* dispatch at code-generation time on the buffer's
# `_memoryLevel` annotation (L1 / L2 / L3).

# Plain (uninitialized) pointer/variable declaration, one per buffer.
pulpL2InitTemplate = NodeTemplate("${type.typeName} ${name};\n")

# NOTE(review): identical to the L2 variant — declaration carries no memory
# attribute; placement happens at allocation time.
pulpL1InitTemplate = NodeTemplate("${type.typeName} ${name};\n")
#pulpL2AllocateTemplate = NodeTemplate("${name} = (${type.typeName}) pi_l2_malloc(${type.referencedType.typeWidth//8} * ${size});\n")
# Heap allocation in L2 via the PMSIS L2 allocator.
pulpL2AllocateTemplate = NodeTemplate(
    "${name} = (${type.typeName}) pi_l2_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n")

# Heap allocation in cluster L1 (TCDM) via the PMSIS L1 allocator.
pulpL1AllocateTemplate = NodeTemplate(
    "${name} = (${type.typeName}) pmsis_l1_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n")

# Static global array with initializer, placed in L2 via the PI_L2 attribute.
pulpL2GlobalInitTemplate = NodeTemplate(
    "static PI_L2 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n")

# Static global array with initializer, placed in L1 via the PI_L1 attribute.
pulpL1GlobalInitTemplate = NodeTemplate(
    "static PI_L1 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n")

#pulpL2GlobalInitTemplate = NodeTemplate("static const ${type} ${name}[${size}];\n")
# Globals are fully defined at init time, so their allocate steps are no-ops.
pulpL2GlobalAllocateTemplate = NodeTemplate("")

pulpL1GlobalAllocateTemplate = NodeTemplate("")

# Static struct instance in L2.
pulpL2StructInitTemplate = NodeTemplate("""static PI_L2 ${type.typeName} ${name};
""")
#static const ${type}* ${name} = &${name}_UL;

# Field-by-field struct initialization from the `structDict` mapping.
pulpL2StructAllocateTemplate = NodeTemplate(""" % for key, value in structDict.items():
    ${name}.${key} = ${value};
% endfor """)

# Memory-level-dispatching struct declaration: L1/L2 get a static instance,
# L3 gets no declaration (allocated externally).
pulpGenericStructInitTemplate = NodeTemplate("""
% if _memoryLevel == "L1":
static PI_L1 ${type.typeName} ${name};\n
% elif _memoryLevel == "L2" or _memoryLevel is None:
static PI_L2 ${type.typeName} ${name};\n
% elif _memoryLevel == "L3":
// ${name} is allocated in L3 \n
% endif
""")

# Memory-level-dispatching global initializer. Unannotated buffers
# (_memoryLevel is None) default to L2. For L3, only an L2-resident pointer
# is declared; the data itself lives off-chip.
pulpGenericGlobalInitTemplate = NodeTemplate("""
% if _memoryLevel == "L1":
static PI_L1 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n
% elif _memoryLevel == "L2" or _memoryLevel is None:
static PI_L2 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n
% elif _memoryLevel == "L3":
// ${name} is allocated in L3 \n
static PI_L2 ${type.referencedType.typeName}* ${name};
% endif
""")

# Memory-level-dispatching heap allocation: L1 via pmsis_l1_malloc, L2 via
# pi_l2_malloc, L3 via cl_ram_malloc. Unknown levels fall back to L2 with a
# diagnostic comment emitted into the generated code.
pulpGenericAllocate = NodeTemplate("""
% if _memoryLevel == "L1":
${name} = (${type.typeName}) pmsis_l1_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n
% elif _memoryLevel == "L2" or _memoryLevel is None:
${name} = (${type.typeName}) pi_l2_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n
% elif _memoryLevel == "L3":
${name} = (${type.typeName}) cl_ram_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n
% else:
//COMPILER BLOCK - MEMORYLEVEL ${_memoryLevel} NOT FOUND \n
${name} = (${type.typeName}) pi_l2_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n
// ${name} with size ${size} allocated in L2!
% endif
""")
# ConvTemplate.py -- PULP-NN 1D/2D (depthwise) convolution node templates.
# SPDX-License-Identifier: Apache-2.0

from typing import Dict, List, Tuple, Union

from ortools.constraint_solver.pywrapcp import IntVar

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class PULP2DConvTemplate(NodeTemplate):
    """2D requantized convolution template for PULP-NN kernels.

    Adds operand signedness flags to the operator representation (they select the
    `_i8`/`_u8` kernel variant in the template string) and hoists the im2col
    scratch buffer required by the kernel.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # A signed operand is recognized by a negative lower bound of its element type.
        for tensorKey, flagKey in (("weight", "weight_signed"), ("data_in", "input_signed"),
                                   ("data_out", "output_signed")):
            refType = ctxt.lookup(operatorRepresentation[tensorKey])._type.referencedType
            operatorRepresentation[flagKey] = refType.typeMin < 0

        return ctxt, operatorRepresentation, []

    @staticmethod
    def computeTransientBuffersSize(
            ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
        """Return [(name, size)] of the im2col scratch buffer.

        Size scales with the kernel window volume (ch_im_in * kx * ky); the 2*8
        factor mirrors the PULP-NN kernel's scratch requirement -- TODO confirm
        against the kernel implementation.
        """
        im2col_dim = 2 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] *
                              operatorRepresentation['dim_kernel_y'])
        im2col_name = operatorRepresentation['nodeName'] + "_buffer"
        return [(im2col_name, im2col_dim)]

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Register the im2col scratch buffer in the context and expose it to the template."""
        im2col_name, im2col_dim = PULP2DConvTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
        ctxt.hoistTransientBuffer(im2col_name, im2col_dim)

        operatorRepresentation['ctxtBuffer'] = im2col_name
        operatorRepresentation['ctxtBufferSize'] = im2col_dim
        return ctxt, operatorRepresentation, [im2col_name]


class PULP2DDWConvTemplate(PULP2DConvTemplate):
    """Depthwise 2D variant.

    The original duplicated the parent's `alignToContext` verbatim; the copy was
    removed -- behavior is unchanged via inheritance.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)


class PULP1DConvTemplate(NodeTemplate):
    """1D requantized convolution template for PULP-NN kernels (width dimension folded to 1)."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # Signedness flags, as in the 2D case.
        for tensorKey, flagKey in (("weight", "weight_signed"), ("data_in", "input_signed"),
                                   ("data_out", "output_signed")):
            refType = ctxt.lookup(operatorRepresentation[tensorKey])._type.referencedType
            operatorRepresentation[flagKey] = refType.typeMin < 0

        # Flatten the ONNX pads/strides lists into the scalar names the template uses.
        operatorRepresentation['pad_x_left'] = operatorRepresentation['pads'][0]
        operatorRepresentation['pad_x_right'] = operatorRepresentation['pads'][1]
        operatorRepresentation['stride_x'] = operatorRepresentation['strides'][0]

        return ctxt, operatorRepresentation, []

    @staticmethod
    def computeTransientBuffersSize(
            ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
        """Return [(name, size)] of the 1D im2col scratch buffer.

        NOTE(review): the sizing mixes pads[0] with dim_kernel_y -- presumably
        intentional for the width-1 layout used below; confirm against the kernel.
        """
        im2col_dim = 8 * (1 * (1 + operatorRepresentation['pads'][0]) + operatorRepresentation['dim_kernel_y'])
        im2col_name = operatorRepresentation['nodeName'] + "_buffer"
        return [(im2col_name, im2col_dim)]

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Register the im2col scratch buffer in the context and expose it to the template."""
        im2col_name, im2col_dim = PULP1DConvTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
        ctxt.hoistTransientBuffer(im2col_name, im2col_dim)
        operatorRepresentation['ctxtBuffer'] = im2col_name
        operatorRepresentation['ctxtBufferSize'] = im2col_dim
        return ctxt, operatorRepresentation, [im2col_name]


class PULP1DDWConvTemplate(PULP1DConvTemplate):
    """Depthwise 1D variant; inherits all alignment logic from PULP1DConvTemplate."""

    def __init__(self, templateStr):
        super().__init__(templateStr)


# NOTE(review): in the templates below, the final `operatorString = 'conv'`
# overrides the pointwise selection computed just above it -- presumably a
# deliberate disable of the pointwise kernel; confirm before removing.
PULPConv2D_8_Template = PULP2DConvTemplate("""
// PULP NN CONV
<%
signatureString = ''
if input_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if output_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if weight_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
%>

<%
operatorString = ''
if dim_kernel_x == 1 and dim_kernel_y == 1:
    operatorString = 'pointwise'
else:
    operatorString = 'conv'
operatorString = 'conv'
%>

pulp_nn_${operatorString}${signatureString}(${data_in}, ${ctxtBuffer}, NULL, ${data_out}, ${weight}, ${mul}, ${add}, 1, ${log2D}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, ${dim_im_out_y}, ${dim_im_out_x}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, ${stride_y}, ${stride_x}, 1, 1);
""")

PULPDWConv2D_8_Template = PULP2DDWConvTemplate("""
// PULP NN CONV
<%
signatureString = ''
if input_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if output_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if weight_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
%>
pulp_nn_depthwise${signatureString}(${data_in}, ${ctxtBuffer}, NULL, ${data_out}, ${weight}, NULL, ${mul}, ${add}, 1, ${log2D}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, ${dim_im_out_y}, ${dim_im_out_x}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, ${stride_y}, ${stride_x}, 1, 1);
""")

PULPConv1D_8_Template = PULP1DConvTemplate("""
// PULP NN CONV
<%
signatureString = ''
if input_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if output_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if weight_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
%>

<%
operatorString = ''
if dim_kernel_x == 1 and dim_kernel_y == 1:
    operatorString = 'pointwise'
else:
    operatorString = 'conv'
operatorString = 'conv'
%>

pulp_nn_${operatorString}${signatureString}(${data_in}, ${ctxtBuffer}, NULL, ${data_out}, ${weight}, ${mul}, ${add}, 1, ${log2D}, 1, ${dim_im_in_y}, ${ch_im_in}, 1, ${dim_im_out_y}, ${ch_im_out}, 1, ${dim_kernel_y}, ${padding_y_top}, ${padding_y_bottom}, 0, 0, 1, ${stride_y}, 1, 1);
""")

PULPDWConv1D_8_Template = PULP1DDWConvTemplate("""
// PULP NN CONV
<%
signatureString = ''
if input_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if output_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if weight_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
%>
pulp_nn_depthwise${signatureString}(${data_in}, ${ctxtBuffer}, NULL, ${data_out}, ${weight}, NULL, ${mul}, ${add}, 1, ${log2D}, 1, ${dim_im_in_y}, ${ch_im_in}, 1, ${dim_im_out_y}, ${ch_im_out}, 1, ${dim_kernel_y}, ${padding_y_top}, ${padding_y_bottom}, 0, 0, 1, ${stride_y}, 1, 1);
""")
# FreeTemplate.py -- PULP memory-deallocation code templates.
# SPDX-License-Identifier: Apache-2.0

from Deeploy.DeeployTypes import NodeTemplate

# L2 buffers (local and global) are released through pi_l2_free.
pulpL2LocalTemplate = NodeTemplate("pi_l2_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});")
pulpL2GlobalTemplate = NodeTemplate("pi_l2_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});")
# L1 buffers come from pmsis_l1_malloc (see AllocateTemplate) and use its free.
pulpL1FreeTemplate = NodeTemplate("pmsis_l1_malloc_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});\n")
# Static L1 globals are never heap-allocated, so there is nothing to free.
pulpL1GlobalFreeTemplate = NodeTemplate("")

# Memory-level-aware free, mirroring pulpGenericAllocate in AllocateTemplate.py.
# Fix: the `% else` fallback previously freed nothing, leaking the L2 buffer that
# pulpGenericAllocate's fallback mallocs for unknown memory levels.
pulpGenericFree = NodeTemplate("""
% if _memoryLevel == "L1":
pmsis_l1_malloc_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});
% elif _memoryLevel == "L2" or _memoryLevel is None:
pi_l2_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});
% elif _memoryLevel == "L3":
cl_ram_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});
% else:
//COMPILER BLOCK - MEMORYLEVEL ${_memoryLevel} NOT FOUND \n
pi_l2_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});
% endif
""")
# GEMMTemplate.py -- PULP-NN GEMM and reference MatMul node templates.
# SPDX-License-Identifier: Apache-2.0

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class PULPGEMMTemplate(NodeTemplate):
    """Requantized GEMM template for PULP-NN linear kernels.

    Adds operand signedness flags to the operator representation; they select
    the `_i8`/`_u8` kernel variant inside the template string.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # An operand is signed iff the lower bound of its element type is negative.
        for tensorKey, flagKey in (("B", "weight_signed"), ("A", "input_signed"), ("data_out", "output_signed")):
            elementType = ctxt.lookup(operatorRepresentation[tensorKey])._type.referencedType
            operatorRepresentation[flagKey] = elementType.typeMin < 0

        return ctxt, operatorRepresentation, []


PULPGEMM_8_Template = PULPGEMMTemplate("""
<%
signatureString = ''
if input_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if output_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if weight_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
%>
// PULP NN GEMM
int8_t* ref_${data_out}_${A} = ${A};
int8_t* ref_${data_out}_${B} = ${B};
int8_t* ref_${data_out}_${data_out} = ${data_out};
for(int i=0;i<${batch};i++){
for(int j=0;j<${M};j++){
// LMACAN: In some edge cases sporadic errors happen if this loop is not added.
// We believe this is due to missing bubbles in the pipeline that break operator forwarding.
// Breaking test:
// `python testRunner_tiled_siracusa.py -t=Tests/Transformer --defaultMemLevel=L3 --doublebuffer --l1=30000`
#pragma unroll 1
for(int k=0;k<3;k++){
    asm volatile("nop" ::);
}
pulp_nn_linear${signatureString}(ref_${data_out}_${A}, NULL, ref_${data_out}_${data_out}, ref_${data_out}_${B}, ${mul}, ${C}, 1, ${log2D}, ${N}, ${O}, 1, 1);
ref_${data_out}_${A} += ${N};
ref_${data_out}_${data_out} += ${O};
}
% if W_batched:
ref_${data_out}_${B} += ${N} * ${O};
% endif
}
""")


class _MatMulTemplate(NodeTemplate):
    """Reference batched MatMul template.

    Computes zero-point offsets for A, B and the output: unsigned buffers
    (typeMin == 0) are shifted by half their level count; the output offset is
    negated so the kernel re-centers its result.
    """

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

        # (tensor key, offset key, sign of the applied offset)
        for tensorKey, offsetKey, sign in (("A", "A_offset", 1), ("B", "B_offset", 1), ("data_out", "C_offset", -1)):
            buffer = ctxt.lookup(operatorRepresentation[tensorKey])
            offset = 0
            # Only quantized buffers carry nLevels; others keep a zero offset.
            if hasattr(buffer, "nLevels"):
                offset = sign * (buffer._type.referencedType.typeMin == 0) * int(buffer.nLevels / 2)
            operatorRepresentation[offsetKey] = offset

        return ctxt, operatorRepresentation, []


PULPMM_8_Template = _MatMulTemplate("""
// MatMul (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
    ${A_type.typeName} ref_${data_out}_${A} = ${A};
    ${B_type.typeName} ref_${data_out}_${B} = ${B};
    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

    for(uint32_t i=0;i<${batch};i++){
        MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(
            ref_${data_out}_${A},
            ref_${data_out}_${B},
            ref_${data_out}_${data_out},
            ${M},
            ${N},
            ${O},
            0, 0, ${C_offset}
        );

        ref_${data_out}_${A} += ${M} * ${N};
        ref_${data_out}_${B} += ${N} * ${O};
        ref_${data_out}_${data_out} += ${M} * ${O};
    }
END_SINGLE_CORE
""")
# MatrixVectorTemplate.py -- multicore batched matrix-vector (GEMV) template.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _PULPMatrixVectorTemplate(NodeTemplate):
    """Batched GEMV template; batches are chunked across the PULP cluster cores."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # Signedness flags (negative type lower bound => signed); currently only
        # recorded -- the template below always calls the s8 kernel.
        signedW = ctxt.lookup(operatorRepresentation['B'])._type.referencedType.typeMin < 0
        signedI = ctxt.lookup(operatorRepresentation['A'])._type.referencedType.typeMin < 0
        signedO = ctxt.lookup(operatorRepresentation['data_out'])._type.referencedType.typeMin < 0
        operatorRepresentation['weight_signed'] = signedW
        operatorRepresentation['input_signed'] = signedI
        operatorRepresentation['output_signed'] = signedO

        return ctxt, operatorRepresentation, []


# Per-core chunking: chunk = ceil(batch / NUM_CORES); each core processes
# [chunk_start, chunk_stop). Fix: chunk_stop is clamped to batch (the original
# clamped to batch + 1, letting the last active core process batch index
# `batch` -- an out-of-bounds access whenever batch is not a multiple of the
# per-core chunk, e.g. batch=9 on 8 cores).
referenceTemplate = _PULPMatrixVectorTemplate("""
// MatrixVec (Name: ${nodeName}, Op: ${nodeOp})

int8_t ${nodeName}_core_id = pi_core_id();
int8_t ${nodeName}_log2Core = log2(NUM_CORES);
int16_t ${nodeName}_chunk = (${int(batch)} >> ${nodeName}_log2Core) + ((${int(batch)} & (NUM_CORES-1))!=0);
int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${int(batch)});
int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${int(batch)});

int8_t* ref_${nodeName}_${A} = ${A} + (${nodeName}_chunk_start * ${M}*${N});
% if W_batched:
int8_t* ref_${nodeName}_${B} = ${B} + (${nodeName}_chunk_start * ${O}*${N});
% else:
int8_t* ref_${nodeName}_${B} = ${B};
% endif

int8_t* ref_${nodeName}_${data_out} = ${data_out} + (${nodeName}_chunk_start * ${O}*${M});

for (uint32_t i=${nodeName}_chunk_start;i<${nodeName}_chunk_stop;i++){
    gemv_s8_s8_plp(ref_${nodeName}_${A}, NULL, ref_${nodeName}_${data_out}, ref_${nodeName}_${B}, ${mul}, ${C}, 1, ${log2D}, ${N}, ${O}, 1, 1);
    ref_${nodeName}_${A} += (${M}*${N});
    ref_${nodeName}_${data_out} += (${O}*${M});

% if W_batched:
    ref_${nodeName}_${B} += ${N} * ${O};
% endif

}
""")
# MaxPool2DTemplate.py -- PULP-NN 2D max-pooling node template.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class PULPMaxPoolTemplate(NodeTemplate):
    """2D max-pooling template; only the input's signedness matters, since
    max-pooling never changes the value range."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # Signed iff the element type's lower bound is negative; selects the
        # _i8/_u8 kernel variant below.
        inputElementType = ctxt.lookup(operatorRepresentation['data_in'])._type.referencedType
        operatorRepresentation['input_signed'] = inputElementType.typeMin < 0
        return ctxt, operatorRepresentation, []


PULPMaxPool2D_8_Template = PULPMaxPoolTemplate("""
// PULP NN MaxPool 2D
<%
signatureString = ''
if input_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
%>
pulp_nn_maxpool${signatureString}(${data_in}, ${data_out}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, ${dim_im_out_y}, ${dim_im_out_x}, ${dim_kernel_y}, ${dim_kernel_x}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, ${stride_y}, ${stride_x});
""")
# MulTemplate.py -- multicore elementwise multiplication template.

from Deeploy.DeeployTypes import NodeTemplate, OperatorRepresentation


class _MulTemplate(NodeTemplate):
    """Elementwise multiply template; no context alignment needed.

    Fix: the original also inherited from OperatorRepresentation, which is the
    operator-representation mapping type, not a template base -- every sibling
    template in this package subclasses NodeTemplate only.
    """
    pass


# Per-core chunking: chunk = ceil(size / NUM_CORES); each core handles
# [chunk_start, chunk_stop). Fix: chunk_stop is clamped to size (the original
# clamped to size + 1, letting the last active core write C[size] out of
# bounds whenever size is not a multiple of the per-core chunk).
referenceTemplate = _MulTemplate("""
// Mul (Name: ${nodeName}, Op: ${nodeOp})

int8_t ${nodeName}_core_id = pi_core_id();
int8_t ${nodeName}_log2Core = log2(NUM_CORES);
int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size});
int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size});

#pragma unroll 2
for (uint32_t i=${nodeName}_chunk_start;i<${nodeName}_chunk_stop;i++){
    ${C}[i] = ${A}[i] * ${B}[i];
}

""")
# RQAddTemplate.py -- PULP-NN requantized elementwise addition template.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class PULPRQAddTemplate(NodeTemplate):
    """Requantized add template; signedness of both inputs and the output
    selects the `_i8`/`_u8` kernel variant in the template string."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        # Signed iff the element type's lower bound is negative.
        for tensorKey, flagKey in (("data_in_2", "input_2_signed"), ("data_in_1", "input_signed"),
                                   ("data_out", "output_signed")):
            elementType = ctxt.lookup(operatorRepresentation[tensorKey])._type.referencedType
            operatorRepresentation[flagKey] = elementType.typeMin < 0

        return ctxt, operatorRepresentation, []


RQAddTemplate = PULPRQAddTemplate("""

<%
signatureString = ''
if input_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if input_2_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
if output_signed:
    signatureString += '_i8'
else:
    signatureString += '_u8'
%>

// PULP NN RQADD
pulp_nn_add${signatureString}(${data_in_1}, ${data_in_2}, ${data_out}, ${rqs1_mul}, ${rqs1_add}, ${rqs1_log2D}, ${rqs2_mul}, ${rqs2_add}, ${rqs2_log2D}, ${rqsOut_mul}, ${rqsOut_add}, ${rqsOut_log2D}, 1, ${size}, 1, 1);
""")
# RQSiHardswishTemplate.py -- requantized integer Hardswish template.
# SPDX-License-Identifier: Apache-2.0

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _RQSiHardswishTemplate(NodeTemplate):
    """Requantized integer Hardswish; computes zero-point offsets for the
    unsigned-tensor case (offset = nLevels/2 when the buffer is unsigned)."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

        # NOTE(review): assumes both buffers carry `_signed` and `nLevels`
        # (sibling templates guard these with hasattr) -- confirm with the parser.
        for tensorKey, offsetKey in (("data_in", "input_offset"), ("data_out", "output_offset")):
            buffer = ctxt.lookup(operatorRepresentation[tensorKey])
            operatorRepresentation[offsetKey] = (buffer._signed == 0) * int(buffer.nLevels / 2)

        return ctxt, operatorRepresentation, []


referenceTemplate = _RQSiHardswishTemplate("""
// RequantizediHardswish (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE RQiHardswish_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_plp(${data_in}, ${data_out}, ${size}, ${one_over_six}, ${three},${six}, ${int(mul)}, ${int(add)}, ${int(shift)});
""")
# ReduceMeanTemplate.py -- single-core integer ReduceMean template.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _ReduceMeanTemplate(NodeTemplate):
    """ReduceMean over an arbitrary axis set; accumulates in int32 and divides
    (or shifts, when the reduction length is a power of two) with rounding."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

        data_in = ctxt.lookup(operatorRepresentation['data_in'])
        # Unsigned quantized inputs are re-centered by nLevels/2; other buffers
        # (no _signed/nLevels attributes) get no offset.
        operatorRepresentation['input_offset'] = 0
        if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"):
            operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2)
        # Output is currently emitted without re-centering.
        operatorRepresentation['output_offset'] = 0  #-(data_out._signed==0) * int(data_in.nLevels/2)

        return ctxt, operatorRepresentation, []


# Fixes vs. the original template:
#  * `shift` is now initialized to None before the power-of-two check; the
#    original only bound it inside the `if`, so `% if shift is not None:`
#    raised a NameError whenever reduceLength was not a power of two.
#  * the `keepdims` branch now declares and computes `${data_out}_sgn` before
#    using it; the original referenced it there without any declaration, so the
#    generated C did not compile.
referenceTemplate = _ReduceMeanTemplate("""
// ReduceMean (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
int32_t ${data_out}_accumulator = 0;
<%
reduceLength = 1
for i in axes:
    reduceLength = reduceLength * data_in_shape[i]
%>
<%
shapeStr = ''
accessStr = ''
%>
% for idx, i in enumerate(data_in_shape[1:]):
<%
shapeStr += '['+str(i)+']'
%>
% endfor
% for j in range(len(data_in_shape)):
<%
accessStr += '[i_'+str(j)+']'
%>
% endfor
${data_out_type.typeName} dummy_${data_out} = ${data_out};

<%
restDims = set(list(range(len(data_in_shape)))).difference(set(axes))
%>
% for i in list(restDims):
for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){
% endfor
${data_out}_accumulator = ${input_offset}*${reduceLength};
% for i in list(axes):
for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){
% endfor
${data_out}_accumulator += ((${data_in_type.typeName} ${shapeStr})${data_in})${accessStr};

% for i in range(len(axes)):
}
% endfor
% if keepdims:
int8_t ${data_out}_sgn = 0;
${data_out}_sgn = -(${data_out}_accumulator<0) + (${data_out}_accumulator >= 0);
*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) ((${data_out}_accumulator + ${data_out}_sgn*(${reduceLength}>>1)) / ${reduceLength} + ${output_offset});
% else:
<%

import numpy as np

shift = None
if (np.log2(reduceLength) - int(np.log2(reduceLength))) == 0:
    shift = int(np.log2(reduceLength))
%>
% if shift is not None:
*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) (((${data_out}_accumulator + (1<<(${shift}-1))) >> ${shift}) + ${output_offset});
% else:
int8_t ${data_out}_sgn = 0;
${data_out}_sgn = -(${data_out}_accumulator<0) + (${data_out}_accumulator >= 0);
*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) ((${data_out}_accumulator + ${data_out}_sgn*(${reduceLength}>>1)) / ${reduceLength} + ${output_offset});
% endif
% endif
% for i in range(len(restDims)):
}
% endfor
END_SINGLE_CORE
""")
# RequantShiftTemplate.py -- requantization (multiply-add-shift + clamp) template.

from typing import Dict, List, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class _RequantShiftTemplate(NodeTemplate):
    """RequantShift template: derives input/output signedness, zero-point
    offsets and the output clamping range for the generated kernel call."""

    def __init__(self, templateStr):
        super().__init__(templateStr)

    def alignToContext(self, ctxt: NetworkContext,
                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

        inBuffer = ctxt.lookup(operatorRepresentation['data_in'])
        outBuffer = ctxt.lookup(operatorRepresentation['data_out'])

        # Element-type signedness picks the s/u kernel name in the template.
        operatorRepresentation["signedI"] = inBuffer._type.referencedType.typeMin < 0
        operatorRepresentation["signedO"] = outBuffer._type.referencedType.typeMin < 0

        # Zero-point offsets: only quantized buffers (with _signed/nLevels) are
        # re-centered; the output offset is negative to undo the re-centering.
        inputOffset = 0
        if hasattr(inBuffer, "_signed") and hasattr(inBuffer, "nLevels"):
            inputOffset = (inBuffer._signed == 0) * int(inBuffer.nLevels / 2)
        operatorRepresentation['input_offset'] = inputOffset

        outputOffset = 0
        if hasattr(outBuffer, "_signed") and hasattr(outBuffer, "nLevels"):
            outputOffset = -(outBuffer._signed == 0) * operatorRepresentation['n_levels'] // 2
        operatorRepresentation['output_offset'] = outputOffset

        # NOTE(review): 'signed' is read here but never written in this file --
        # presumably supplied by the node parser; confirm (and note that the
        # computed signedO is not used for the clamp range).
        if operatorRepresentation["signed"]:
            halfLevels = operatorRepresentation['n_levels'] // 2
            operatorRepresentation['output_min'] = -halfLevels
            operatorRepresentation['output_max'] = halfLevels - 1
        else:
            operatorRepresentation['output_min'] = 0
            operatorRepresentation['output_max'] = operatorRepresentation['n_levels'] - 1

        return ctxt, operatorRepresentation, []


referenceTemplate = _RequantShiftTemplate("""
<%
if isinstance(log2D, int):
    log2Dstring = log2D
else:
    log2Dstring = "*"+log2D

inSignage = "s" if signedI else "u"
outSignage = "s" if signedO else "u"
%>

// RequantShift (Name: ${nodeName}, Op: ${nodeOp})
 % if channels_first:
 RequantShift_${inSignage}${data_in_type.referencedType.typeWidth}_${outSignage}${data_out_type.referencedType.typeWidth}_NCHW(${data_in}, ${size}, ${mul}, ${add}, ${data_out}, ${log2Dstring}, ${channel_width}, 0, 0 , ${output_min}, ${output_max}, 1);
 % else:
 RequantShift_${inSignage}${data_in_type.referencedType.typeWidth}_${outSignage}${data_out_type.referencedType.typeWidth}_NHWC(${data_in}, ${size}, ${mul}, ${add}, ${data_out}, ${log2Dstring}, ${channels}, 0, 0, ${output_min}, ${output_max}, 1);
 %endif
""")
+ +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes + + +class _SliceTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + assert ctxt.lookup(operatorRepresentation['data_in'])._memoryLevel in ["L2", + "L1"], "input data needs to be on-chip!" + assert ctxt.lookup(operatorRepresentation['data_out'])._memoryLevel in ["L2", "L1" + ], "output data needs to be on-chip!" + assert ctxt.lookup(operatorRepresentation['data_in'])._memoryLevel != ctxt.lookup( + operatorRepresentation['data_out'])._memoryLevel, "Can't move on same memory level with Cluster DMA!" + + bufferList = [] + + def _downSample(starts, ends, axes, steps, data_in_shape, idx) -> bool: + return steps[idx] != 1 or starts[idx] > 0 or ends[idx] < data_in_shape[axes[idx]] + + # Immediate-ify start + startsBuffer = ctxt.lookup(operatorRepresentation['starts']) + axesBuffer = ctxt.lookup(operatorRepresentation['axes']) + endsBuffer = ctxt.lookup(operatorRepresentation['ends']) + stepsBuffer = ctxt.lookup(operatorRepresentation['steps']) + + startsBuffer._deploy = False + axesBuffer._deploy = False + endsBuffer._deploy = False + stepsBuffer._deploy = False + + operatorRepresentation['starts'] = startsBuffer.values + operatorRepresentation['ends'] = endsBuffer.values + operatorRepresentation['axes'] = axesBuffer.values + operatorRepresentation['steps'] = stepsBuffer.values + + operatorRepresentation['data_in_size'] = np.prod(operatorRepresentation['data_in_shape']) + + data_in_shape = operatorRepresentation['data_in_shape'] + data_in_size = operatorRepresentation['data_in_size'] + axes = operatorRepresentation['axes'] + starts = operatorRepresentation['starts'] + 
ends = operatorRepresentation['ends'] + steps = operatorRepresentation['steps'] + + dimSteps = [] + dimSteps.append(data_in_size // data_in_shape[0]) + for dim in data_in_shape[1:]: + dimSteps.append(dimSteps[-1] // dim) + + number_of_1d_copies = 1 + number_of_2d_copies = 1 + stride_1d = 0 + stride_2d = 0 + + numCopies = [] + strides = [] + downSample = [] + + switchIdx = 0 + + for i in range(len(axes)): + numCopies.append(ends[i] - starts[i]) + strides.append(dimSteps[axes[i]]) + downSample.append(_downSample(starts, ends, axes, steps, data_in_shape, i)) + + for idx, switch in enumerate(downSample): + if switch == True: + switchIdx = idx + break + switchIdx = axes[idx] + 1 + + operatorRepresentation["offset"] = starts[switchIdx] * dimSteps[axes[switchIdx]] + + operatorRepresentation['numberIterations'] = np.prod(data_in_shape[:axes[switchIdx]]) + + inputOffset = dimSteps[axes[switchIdx]] * data_in_shape[axes[switchIdx]] + outputOffset = int(inputOffset * ((ends[switchIdx] - starts[switchIdx]) / data_in_shape[axes[switchIdx]])) + consecutiveCopies = outputOffset + transferSize1D = consecutiveCopies * operatorRepresentation['data_in_type'].referencedType.typeWidth // 8 + + if ctxt.lookup(operatorRepresentation['data_in'])._memoryLevel == "L2": + # Target address: + ext = operatorRepresentation['data_in'] + # Source address: + loc = operatorRepresentation['data_out'] + _dir = 1 + operatorRepresentation["extOffset"] = inputOffset + operatorRepresentation["locOffset"] = outputOffset + else: + # Target address: + ext = operatorRepresentation['data_out'] + # Source address: + loc = operatorRepresentation['data_in'] + _dir = 0 + operatorRepresentation["locOffset"] = inputOffset + operatorRepresentation["extOffset"] = outputOffset + + operatorRepresentation["dir"] = _dir + + length_2d_copy = number_of_1d_copies * transferSize1D + mchan_flags = _dir + 0x2 + 0x8 + if number_of_1d_copies > 1 or number_of_2d_copies > 1: + mchan_flags += 0x4 + mchan_cmd = length_2d_copy + 
(mchan_flags << 17) + + bufferList += [ + ctxt.hoistStruct( + { + "ext": ext, + "loc": loc, + "hwc_to_chw": 0, + "stride_2d": stride_2d, + "number_of_2d_copies": number_of_2d_copies, + "stride_1d": stride_1d, + "number_of_1d_copies": number_of_1d_copies, + "length_1d_copy": transferSize1D, + "mchan_cmd": mchan_cmd, + "dir": _dir, + "tid": 0 + }, operatorRepresentation['nodeName'] + "_stateReference", PULPStructDataTypes.DMA_copy) + ] + + operatorRepresentation['stateReference'] = operatorRepresentation['nodeName'] + "_stateReference" + + return ctxt, operatorRepresentation, bufferList + + +referenceTemplate = _SliceTemplate(""" +// Slice (Name: ${nodeName}, Op: ${nodeOp}) +// data_in : ${data_in_shape} +// data_out : ${data_out_shape} +% if dir == 1: +${stateReference}.ext += ${offset}; +% else: +${stateReference}.loc += ${offset}; +% endif +for(int j=0;j<${numberIterations};j++){ +dory_dma_memcpy_async(&${stateReference}); +${stateReference}.ext += ${extOffset}; +${stateReference}.loc += ${locOffset}; +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py new file mode 100644 index 0000000..d0f4e22 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py @@ -0,0 +1,77 @@ +# ---------------------------------------------------------------------- +# +# File: TallGEMMTemplate.py +# +# Last edited: 21.03.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _PULPTallGEMMTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + signedW = ctxt.lookup(operatorRepresentation['B'])._type.referencedType.typeMin < 0 + signedI = ctxt.lookup(operatorRepresentation['A'])._type.referencedType.typeMin < 0 + signedO = ctxt.lookup(operatorRepresentation['data_out'])._type.referencedType.typeMin < 0 + operatorRepresentation['weight_signed'] = signedW + operatorRepresentation['input_signed'] = signedI + operatorRepresentation['output_signed'] = signedO + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PULPTallGEMMTemplate(""" +// TallGEMM (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_chunk = (${int(M)} >> ${nodeName}_log2Core) + ((${int(M)} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${int(M)}); +int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${int(M)} + 1); + +int8_t* ref_${nodeName}_${A}; +int8_t* ref_${nodeName}_${B}; +int8_t* ref_${nodeName}_${data_out}; + +for(int b=0; b<${batch}; b++){ + + for (uint32_t i=${nodeName}_chunk_start; i<${nodeName}_chunk_stop; i++){ + + int8_t* ref_${nodeName}_${A} = ${A} + (b * ${M} * ${N}) + (i * ${N}); + % 
if W_batched: + int8_t* ref_${nodeName}_${B} = ${B} + (b * ${N} * ${O}); + % else: + int8_t* ref_${nodeName}_${B} = ${B}; + % endif + int8_t* ref_${nodeName}_${data_out} = ${data_out} + (b * ${M} * ${O}) + (i * ${O}); + + gemv_s8_s8_plp(ref_${nodeName}_${A}, NULL, ref_${nodeName}_${data_out}, ref_${nodeName}_${B}, ${mul}, ${C}, 1, ${log2D}, ${N}, ${O}, 1, 1); + } +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py new file mode 100644 index 0000000..06562e1 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py @@ -0,0 +1,126 @@ +# ---------------------------------------------------------------------- +# +# File: TransposeTemplate.py +# +# Last edited: 28.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, _Template + +_tileHeader = NodeTemplate(""" +const uint32_t coreId = pi_core_id(); + +% for i in range(numDims): +uint16_t dimLen_${i} = <%text>${${dimLenPtr[i]}<%text>};\n +% endfor +""") + +_tileForLoop = NodeTemplate(""" +const uint32_t baseChunk = dimLen_${i} / NUM_CORES; +const uint32_t leftover = dimLen_${i} - baseChunk * NUM_CORES; +const uint32_t offset = baseChunk * coreId + (coreId < leftover ? coreId : leftover); +const uint32_t chunk = coreId < leftover ? baseChunk + 1 : baseChunk; +for(uint32_t i_${i} = offset; i_${i} < offset + chunk; i_${i}++ ) { +""") + +_forLoop = NodeTemplate(""" +for(uint32_t i_${i} = 0; i_${i} < dimLen_${i} ; i_${i}++){ +""") + + +class PULPTransposeTemplate(NodeTemplate): + + def __init__(self, templateStr: str): + self._indirectTemplate = _Template(templateStr) + self.subTemplates = {} + self.subTemplateGenerators = {} + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + shapeStr = "" + dimStr = "" + accessStr = "" + outAccessStr = "" + outShapeStr = "" + perm = operatorRepresentation['perm'] + data_in_shape = ctxt.lookup(operatorRepresentation['data_in']).shape + data_out_shape = ctxt.lookup(operatorRepresentation['data_out']).shape + + for idx, i in enumerate(perm[:-1]): + shapeStr += '[' + f"dimLen_{idx+1}" + ']' + outShapeStr += '[' + f"dimLen_{perm[idx+1]}" + ']' + + for dim in data_in_shape: + dimStr += '[' + str(dim) + ']' + + for idx, i in enumerate(perm): + accessStr += '[i_' + str(idx) + ']' + outAccessStr += '[i_' + str(i) + ']' + + fRep = operatorRepresentation.copy() + + fRep['shapeStr'] = shapeStr + fRep['outShapeStr'] = outShapeStr + fRep['outAccessStr'] = outAccessStr + fRep['dimStr'] = dimStr + fRep['accessStr'] = accessStr + fRep['data_out_shape'] = data_out_shape + + parallelDims = [idx 
for idx, dim in enumerate(data_out_shape) if dim >= 8] + if len(parallelDims) > 0: + parallelDim = parallelDims[0] + else: + parallelDim = data_out_shape.index(max(data_out_shape)) + + forLoops = [] + dimLenPtrs = [] + for idx, i in enumerate(perm): + operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[idx] + dimLenPtrs.append(f"dimLen_{idx}") + if idx != parallelDim: + forLoops.append(_forLoop.generate({"i": i, "dimLenPtr": f"dimLen_{i}"})) + else: + forLoops.append(_tileForLoop.generate({"i": i, "dimLenPtr": f"dimLen_{i}"})) + + fRep['forLoops'] = forLoops + fRep['tileHeader'] = _tileHeader.generate({"numDims": len(perm), "dimLenPtr": dimLenPtrs}) + fRep['parallelDim'] = parallelDim + + self.template = _Template(self._indirectTemplate.render(**fRep)) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = PULPTransposeTemplate(""" +// Transpose ${data_in_shape} -> ${data_out_shape} (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE +${tileHeader} +% for idx, i in enumerate(perm): +${forLoops[idx]} +% endfor +((${data_in_type.referencedType.typeName} (*)${outShapeStr})<%text>${data_out})${outAccessStr} = ((${data_in_type.referencedType.typeName} (*)${shapeStr})<%text>${data_in})${accessStr}; +% for idx, i in enumerate(perm): +} +% endfor +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py b/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py new file mode 100644 index 0000000..92def86 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py @@ -0,0 +1,84 @@ +# ---------------------------------------------------------------------- +# +# File: UniformRequantShiftTemplate.py +# +# Last edited: 12.03.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _UniformRequantShiftTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + operatorRepresentation["signedI"] = data_in._type.referencedType.typeMin < 0 + operatorRepresentation["signedO"] = data_out._type.referencedType.typeMin < 0 + + operatorRepresentation['input_offset'] = 0 + if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + operatorRepresentation['output_offset'] = 0 + if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * operatorRepresentation['n_levels'] // 2 + + if operatorRepresentation["signed"]: + operatorRepresentation['output_min'] = -(operatorRepresentation['n_levels'] // 2) + operatorRepresentation['output_max'] = (operatorRepresentation['n_levels'] 
// 2) - 1 + else: + operatorRepresentation['output_min'] = 0 + operatorRepresentation['output_max'] = operatorRepresentation['n_levels'] - 1 + + operatorRepresentation['mul_immediate'] = ctxt.lookup(operatorRepresentation['mul']).values.flatten()[0] + operatorRepresentation['add_immediate'] = ctxt.lookup(operatorRepresentation['add']).values.flatten()[0] + + # JUNGVI: Don't tile the mul and add tensors in case of uniform requantization + mul_buffer = ctxt.lookup(operatorRepresentation['mul']) + add_buffer = ctxt.lookup(operatorRepresentation['add']) + mul_buffer._deploy = False + add_buffer._deploy = False + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _UniformRequantShiftTemplate(""" +<% +if isinstance(log2D, int): + log2Dstring = log2D +else: + log2Dstring = "*"+log2D + +inSignage = "s" if signedI else "u" +outSignage = "s" if signedO else "u" +%> + +// UniformRequantShift (Name: ${nodeName}, Op: ${nodeOp}) +UniformRequantShift_${inSignage}${data_in_type.referencedType.typeWidth}_${outSignage}${data_out_type.referencedType.typeWidth}(${data_in}, ${size}, ${mul_immediate}, ${add_immediate}, ${data_out}, ${log2Dstring}, ${channel_width}, 0, 0 , ${output_min}, ${output_max}, 1); +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/__init__.py b/Deeploy/Targets/PULPOpen/Templates/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/PULPOpen/Templates/iRMSNormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/iRMSNormTemplate.py new file mode 100644 index 0000000..815143b --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/iRMSNormTemplate.py @@ -0,0 +1,45 @@ +# ---------------------------------------------------------------------- +# +# File: iRMSNormTemplate.py +# +# Last edited: 20.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _iRMSNormTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _iRMSNormTemplate(""" +// iRMSnorm (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE iRMSnorm_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_plp(${data_in}, ${data_out}, ${weight}, ${size}, ${lastDimLength}, ${log2D}); +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py b/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py new file mode 100644 index 0000000..804db3b --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py @@ -0,0 +1,79 @@ +# ---------------------------------------------------------------------- +# +# File: iSoftmaxTemplate.py +# +# Last edited: 13.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class PULPiSoftmaxTemplate(NodeTemplate): + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + + lastDimBuffer_dim = 8 * 4 * operatorRepresentation['lastDimLength'] + lastDimBuffer_name = operatorRepresentation['nodeName'] + "_lastDimBuffer" + return [(lastDimBuffer_name, lastDimBuffer_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + lastDimBuffer_name, lastDimBuffer_dim = PULPiSoftmaxTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(lastDimBuffer_name, lastDimBuffer_dim) + + operatorRepresentation['lastDimBuffer'] = lastDimBuffer_name + operatorRepresentation['lastDimBufferSize'] = lastDimBuffer_dim + return ctxt, operatorRepresentation, [lastDimBuffer_name] + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + signedI = ctxt.lookup(operatorRepresentation['data_in'])._type.referencedType.typeMin < 0 + signedO = ctxt.lookup(operatorRepresentation['data_out'])._type.referencedType.typeMin < 0 + + operatorRepresentation['input_signed'] = signedI + operatorRepresentation['output_signed'] = signedO + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = PULPiSoftmaxTemplate(""" +<% +signatureString = '' +if input_signed: + signatureString += '_i8' +else: + signatureString += '_u8' +if output_signed: + signatureString += '_i8' +else: + signatureString += '_u8' +%> +PULPSoftmax${signatureString}(${data_in}, ${data_out}, ${lastDimBuffer}, ${size}, ${lastDimLength}, ${coeffB}, ${coeffC}, ${log2}); 
+""") diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py new file mode 100644 index 0000000..a4d403d --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -0,0 +1,289 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTileConstraint.py +# +# Last edited: 09.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class Conv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + ''' + This function add geometrical constraints for a PULP Im2Col Convolution Tilling. + ''' + + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + mulBufferName = parseDict['mul'] + addBufferName = parseDict['add'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, mulBufferName, addBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + 
weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + 
inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3)
+
+        outputChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0)
+        weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1)
+        weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2)
+        weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3)
+
+        strides = parseDict["strides"]
+        padding = parseDict["pads"]
+
+        # VIC: Force at least one row of A and one col of B in the GEMM (since it's a im2col Conv) to avoid partial results
+        tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in'])
+
+        if (parseDict["ch_im_out"] >= 8):
+            tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8)
+
+        tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x'])
+        tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y'])
+        tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in'])
+
+        # VIC: Constraint the minimum tile size such that we can apply at least one kernel on it
+        tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x'])
+        tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y'])
+
+        tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x'])
+        tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y'])
+
+        tilerModel.addConstraint((inputHeightVar % strides[0]) == 0)
+        tilerModel.addConstraint((inputWidthVar % strides[1]) == 0)
+
+        return tilerModel
+
+    @staticmethod
+    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
+                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
+
+        inputBuffer = ctxt.lookup(name = parseDict['data_in'])
+        weightBuffer = ctxt.lookup(name = parseDict['weight'])
+
+        symbolicParseDict = parseDict.copy()
+        symbolicParseDict['dim_im_in_x'] = tilerModel.getTensorDimVar(inputBuffer.name, 1)
+        symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 1)
+        symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 2)
+
+        return symbolicParseDict
+
+    @staticmethod
+    def computeMargins(kernelShape: Tuple[int, ...]) -> Tuple[int, ...]:
+        if kernelShape[1] % 2 == 0:
+            leftMargin = 0
+            rightMargin = 0
+        else:
+            leftMargin = ((kernelShape[1]) // 2)
+            rightMargin = ((kernelShape[1]) // 2)
+
+        if kernelShape[0] % 2 == 0:
+            topMargin = 0
+            bottomMargin = 0
+        else:
+            topMargin = ((kernelShape[0]) // 2)
+            bottomMargin = ((kernelShape[0]) // 2)
+
+        return leftMargin, rightMargin, topMargin, bottomMargin
+
+    @staticmethod
+    def computeInputCube(kernelShape: Tuple[int, ...], pads: Tuple[int, ...], strides: Tuple[int, ...],
+                         weightChannels: int, outputCube: HyperRectangle,
+                         outputDims: Tuple[int, ...]) -> Tuple[HyperRectangle, Tuple[int, ...]]:
+
+        (BatchOffset, HOffset, WOffset, COffset) = outputCube.offset
+        (BatchSize, HSize, WSize, CSize) = outputCube.dims
+
+        leftMargin, rightMargin, topMargin, bottomMargin = Conv2DTileConstraint.computeMargins(kernelShape)
+
+        padding_top = (HOffset == 0) * pads[0]
+        padding_bottom = (HOffset + HSize == outputDims[1]) * pads[2]
+
+        padding_left = (WOffset == 0) * pads[1]
+        padding_right = (WOffset + WSize == outputDims[2]) * pads[3]
+
+        inputHOffset = HOffset * strides[0] - topMargin * (HOffset != 0)
+        inputWOffset = WOffset * strides[1] - leftMargin * (WOffset != 0)
+
+        inputHSize = HSize * strides[0] + (topMargin + bottomMargin) - (padding_top + padding_bottom)
+        inputWSize = WSize * strides[1] + (leftMargin + rightMargin) - (padding_left + padding_right)
+
+        InCube = HyperRectangle((BatchOffset, inputHOffset, inputWOffset, 0),
+                                (BatchSize, inputHSize, inputWSize, weightChannels))
+
+        return InCube, (padding_left, padding_right, padding_top, padding_bottom)
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['data_in', 'weight', 'mul', 'add', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        varWeight = operatorRepresentation['weight']
+        varOut = operatorRepresentation['data_out']
+
+        inputInCubes = []
+        inputAddCubes = []
+        inputMulCubes = []
+        inputWeightCubes = []
+        replacements: Dict[str, List[int]] = {
+            "dim_im_in_x": [],
+            "dim_im_in_y": [],
+            "dim_im_out_x": [],
+            "dim_im_out_y": [],
+            "ch_im_out": [],
+            "padding_y_top": [],
+            "padding_y_bottom": [],
+            "padding_x_left": [],
+            "padding_x_right": []
+        }
+
+        replacementTypes = {
+            "dim_im_in_x": PointerClass(uint16_t),
+            "dim_im_in_y": PointerClass(uint16_t),
+            "dim_im_out_x": PointerClass(uint16_t),
+            "dim_im_out_y": PointerClass(uint16_t),
+            "ch_im_out": PointerClass(uint16_t),
+            "padding_y_top": PointerClass(uint8_t),
+            "padding_y_bottom": PointerClass(uint8_t),
+            "padding_x_left": PointerClass(uint8_t),
+            "padding_x_right": PointerClass(uint8_t)
+        }
+
+        weightH = ctxt.lookup(varWeight).shape[1]
+        weightW = ctxt.lookup(varWeight).shape[2]
+        weightC = ctxt.lookup(varWeight).shape[3]
+
+        pads = operatorRepresentation['pads']
+        strides = operatorRepresentation['strides']
+
+        for cube in outputCubes:
+            (BatchOffset, HOffset, WOffset, COffset) = cube.offset
+            (BatchSize, HSize, WSize, CSize) = cube.dims
+
+            InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC,
+                                                                          cube,
+                                                                          ctxt.lookup(varOut).shape)
+            padding_left, padding_right, padding_top, padding_bottom = padding_tuple
+
+            replacements['dim_im_in_x'].append(InCube.dims[1])
+            replacements['dim_im_in_y'].append(InCube.dims[2])
+            replacements['dim_im_out_x'].append(HSize)
+            replacements['dim_im_out_y'].append(WSize)
+            replacements['ch_im_out'].append(CSize)
+
+            replacements['padding_y_top'].append(padding_top)
+            replacements['padding_y_bottom'].append(padding_bottom)
+            replacements['padding_x_left'].append(padding_left)
+            replacements['padding_x_right'].append(padding_right)
+
+            inputInCubes.append(InCube)
+
+            RequantCube = HyperRectangle((COffset,), (CSize,))
+            WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, weightC))
+
+            inputWeightCubes.append(WeightCube)
+            inputAddCubes.append(RequantCube)
+            inputMulCubes.append(RequantCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        for a, b, add, mul in zip(inputInCubes, inputWeightCubes, inputAddCubes, inputMulCubes):
+            inputLoadSchedule.append({"data_in": a, "weight": b, "add": add, "mul": mul})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
+
+        return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py
new file mode 100644
index 0000000..d4e1989
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py
@@ -0,0 +1,257 @@
+# ----------------------------------------------------------------------
+#
+# File: DWConvTileConstraint.py
+#
+# Last edited: 09.05.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
+# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class DWConv2DTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        '''
+        This function add geometrical constraints for a PULP Im2Col Convolution Tilling.
+        '''
+
+        # Get to-be-tiled tensor's buffers
+        inputBufferName = parseDict['data_in']
+        weightBufferName = parseDict['weight']
+        mulBufferName = parseDict['mul']
+        addBufferName = parseDict['add']
+        outputBufferName = parseDict['data_out']
+
+        strides = parseDict["strides"]
+        padding = parseDict["pads"]
+        dilation = parseDict["dilations"]
+
+        # Add I/O dimensions to the model as variables
+        for bufferName in [inputBufferName, weightBufferName, mulBufferName, addBufferName, outputBufferName]:
+            tilerModel.addTensorDimToModel(ctxt, bufferName)
+
+        # SCHEREMO: NCHW layout
+
+        inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0)
+        inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2)
+        inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3)
+        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1)
+
+        # SCHEREMO: CHW layout
+
+        weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0)
+        weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1)
+        weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2)
+
+        # SCHEREMO: NHWC layout
+
+        outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0)
+        outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1)
+        outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2)
+        outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3)
+
+        addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0)
+        mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0)
+
+        # map output dims to inputs dims
+        tilerModel.addConstraint(outputBatchVar == inputBatchVar)  # Batch
+        tilerModel.addConstraint(outputChannelVar == weightOutChannelVar)  # Output Channel
+        tilerModel.addConstraint(outputChannelVar == inputChannelVar)  # Input Channel
+
+        tilerModel.addConstraint(outputChannelVar == addChannelVar)
+        tilerModel.addConstraint(outputChannelVar == mulChannelVar)
+
+        inputBuffer = ctxt.lookup(inputBufferName)
+
+        effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[2]))
+        effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[3]))
+
+        tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1))
+        tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1))
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        # Get to-be-tiled tensor's buffers
+        inputBuffer = ctxt.lookup(name = parseDict['data_in'])
+        outputBuffer = ctxt.lookup(name = parseDict['data_out'])
+        weightBuffer = ctxt.lookup(name = parseDict['weight'])
+
+        outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1)
+        outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2)
+
+        inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2)
+        inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3)
+        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1)
+
+        weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1)
+        weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2)
+        weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0)
+
+        # SCHEREMO: Work around tiling issue with non-wordaligned accesses
+        if "L3" in ctxt.lookup(parseDict['data_in'])._memoryLevel:
+            tilerModel.addTileSizeDivisibleConstraint(parseDict, 'ch_im_in', inputChannelVar, 4)
+
+        strides = parseDict["strides"]
+        pads = parseDict["pads"]
+
+        tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x'])
+        tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y'])
+
+        # VIC: Constraint the minimum tile size such that we can apply at least one kernel on it
+        # SCHEREMO: Account for padding - needed for MobileNetv1
+        tilerModel.addConstraint(outputHeightVar >= 1 + max([pads[0], pads[2]]))
+        tilerModel.addConstraint(outputWidthVar >= 1 + max([pads[1], pads[3]]))
+
+        tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x'] + pads[0], strategy = PerformanceHint(1))
+        tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y'] + pads[1], strategy = PerformanceHint(1))
+
+        tilerModel.addConstraint((inputHeightVar % strides[0]) == 0)
+        tilerModel.addConstraint((inputWidthVar % strides[1]) == 0)
+
+        return tilerModel
+
+    @staticmethod
+    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
+                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
+
+        inputBuffer = ctxt.lookup(name = parseDict['data_in'])
+        weightBuffer = ctxt.lookup(name = parseDict['weight'])
+
+        symbolicParseDict = parseDict.copy()
+        symbolicParseDict['ch_im_in'] = tilerModel.getTensorDimVar(inputBuffer.name, 1)
+        symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 1)
+        symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 2)
+
+        return symbolicParseDict
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['data_in', 'weight', 'mul', 'add', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        varWeight = operatorRepresentation['weight']
+        varOut = operatorRepresentation['data_out']
+
+        inputInCubes = []
+        inputAddCubes = []
+        inputMulCubes = []
+        inputWeightCubes = []
+        replacements: Dict[str, List[int]] = {
+            "dim_im_in_x": [],
+            "dim_im_in_y": [],
+            "dim_im_out_x": [],
+            "dim_im_out_y": [],
+            "ch_im_out": [],
+            "ch_im_in": [],
+            "padding_y_top": [],
+            "padding_y_bottom": [],
+            "padding_x_left": [],
+            "padding_x_right": []
+        }
+
+        replacementTypes = {
+            "dim_im_in_x": PointerClass(uint16_t),
+            "dim_im_in_y": PointerClass(uint16_t),
+            "dim_im_out_x": PointerClass(uint16_t),
+            "dim_im_out_y": PointerClass(uint16_t),
+            "ch_im_out": PointerClass(uint16_t),
+            "ch_im_in": PointerClass(uint16_t),
+            "padding_y_top": PointerClass(uint8_t),
+            "padding_y_bottom": PointerClass(uint8_t),
+            "padding_x_left": PointerClass(uint8_t),
+            "padding_x_right": PointerClass(uint8_t)
+        }
+
+        weightH = ctxt.lookup(varWeight).shape[1]
+        weightW = ctxt.lookup(varWeight).shape[2]
+
+        pads = operatorRepresentation['pads']
+        strides = operatorRepresentation['strides']
+
+        for cube in outputCubes:
+
+            (BatchOffset, HOffset, WOffset, COffset) = cube.offset
+            (BatchSize, HSize, WSize, CSize) = cube.dims
+
+            NHWCInCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, CSize,
+                                                                              cube,
+                                                                              ctxt.lookup(varOut).shape)
+            padding_left, padding_right, padding_top, padding_bottom = padding_tuple
+
+            NCHWInCube = HyperRectangle((NHWCInCube.offset[0], COffset, NHWCInCube.offset[1], NHWCInCube.offset[2]),
+                                        (NHWCInCube.dims[0], CSize, NHWCInCube.dims[1], NHWCInCube.dims[2]))
+
+            RequantCube = HyperRectangle((COffset,), (CSize,))
+            WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, 1))
+
+            replacements['dim_im_in_x'].append(NCHWInCube.dims[2])
+            replacements['dim_im_in_y'].append(NCHWInCube.dims[3])
+            replacements['dim_im_out_x'].append(HSize)
+            replacements['dim_im_out_y'].append(WSize)
+            replacements['ch_im_out'].append(CSize)
+            replacements['ch_im_in'].append(CSize)
+
+            replacements['padding_y_top'].append(padding_top)
+            replacements['padding_y_bottom'].append(padding_bottom)
+            replacements['padding_x_left'].append(padding_left)
+            replacements['padding_x_right'].append(padding_right)
+
+            inputInCubes.append(NCHWInCube)
+            inputAddCubes.append(RequantCube)
+            inputMulCubes.append(RequantCube)
+            inputWeightCubes.append(WeightCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        for a, b, add, mul in zip(inputInCubes, inputWeightCubes, inputAddCubes, inputMulCubes):
+            inputLoadSchedule.append({"data_in": a, "weight": b, "add": add, "mul": mul})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
+
+        return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py
new file mode 100644
index 0000000..437b647
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py
@@ -0,0 +1,218 @@
+# ----------------------------------------------------------------------
+#
+# File: GEMMTileConstraint.py
+#
+# Last edited: 02.06.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
+# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class GEMMTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        # Get to-be-tiled tensor's buffers
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+        bufferC = ctxt.lookup(name = parseDict['C'])  # Add from RequantShift
+        mulBuffer = ctxt.lookup(name = parseDict['mul'])
+        outputBuffer = ctxt.lookup(name = parseDict['data_out'])
+
+        # Add I/O dimensions to the model as variables
+        for bufferName in [bufferA.name, bufferB.name, bufferC.name, mulBuffer.name, outputBuffer.name]:
+            tilerModel.addTensorDimToModel(ctxt, bufferName)
+
+        dimOffsetA = len(bufferA.shape) - 2
+        dimOffsetB = len(bufferB.shape) - 2
+        dimOffsetOut = len(outputBuffer.shape) - 2
+
+        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA'])
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                   dimIdx = dimOffsetA + 1 - parseDict['transA'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB'])
+        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                   dimIdx = dimOffsetB + 1 - parseDict['transB'])
+        outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut)
+        outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut + 1)
+
+        # Map output dims to inputs dims
+        tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar)
+        tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar)
+
+        # Add GEMM Geometrical constraints
+        tilerModel.addConstraint(ASecondDimVar == BFirstDimVar)
+
+        mulDimVar = tilerModel.getTensorDimVar(tensorName = mulBuffer.name, dimIdx = 0)
+        addDimVar = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = 0)
+
+        tilerModel.addConstraint(outputSecondDimVar == mulDimVar)
+        tilerModel.addConstraint(outputSecondDimVar == addDimVar)
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+
+        dimOffsetA = len(bufferA.shape) - 2
+        dimOffsetB = len(bufferB.shape) - 2
+
+        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA'])
+
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                   dimIdx = dimOffsetA + 1 - parseDict['transA'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB'])
+        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                   dimIdx = dimOffsetB + 1 - parseDict['transB'])
+
+        # VIC: We don't want to deal with intermediate results between kernel calls
+        tilerModel.addConstraint(ASecondDimVar == parseDict['N'])
+        tilerModel.addConstraint(BFirstDimVar == parseDict['N'])
+
+        if (parseDict["O"] >= 16):
+            #modulus = tilerModel.addMinTileSizeConstraint(parseDict, 'O', BSecondDimVar, 8, prefix = "8_")
+            modulus = tilerModel.addTileSizeDivisibleConstraint(parseDict, 'O', BSecondDimVar, 16, prefix = "16_")
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['A', 'B', 'mul', 'C', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+        varA = operatorRepresentation['A']
+
+        NSize = ctxt.lookup(varA).shape[-1]
+        NOffset = 0
+
+        inputACubes = []
+        inputBCubes = []
+        inputMulCubes = []
+        inputAddCubes = []
+
+        replacements = {"M": [], "O": [], "batch": []}
+
+        # Every output is constructed by a pair of inputs. Reconstruct this pair.
+        for cube in outputCubes:
+
+            BSize = 1
+            BOffset = 0
+            BatchSize = 1
+            BatchOffset = 0
+
+            if len(cube.offset) == 2:
+                (MOffset, OOffset) = cube.offset
+                (MSize, OSize) = cube.dims
+            elif len(cube.offset) == 3:
+                (BatchOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, MSize, OSize) = cube.dims
+            else:
+                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, BSize, MSize, OSize) = cube.dims
+
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+
+            ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize))
+
+            RequantCube = HyperRectangle((OOffset,), (OSize,))
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+            inputMulCubes.append(RequantCube)
+            inputAddCubes.append(RequantCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint8_t)
+        }
+
+        for a, b, c, mul in zip(inputACubes, inputBCubes, inputAddCubes, inputMulCubes):
+            inputLoadSchedule.append({"A": a, "B": b, "C": c, "mul": mul})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
+
+
+class MatrixVecTileConstraint(GEMMTileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        tm = GEMMTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt)
+
+        return tm
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        tm = GEMMTileConstraint.addPolicyConstraint(tilerModel, parseDict, ctxt)
+
+        return tm
+
+
+class TallGEMMTileConstraint(GEMMTileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        tm = GEMMTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt)
+
+        return tm
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        tm = GEMMTileConstraint.addPolicyConstraint(tilerModel, parseDict, ctxt)
+
+        return tm
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
new file mode 100644
index 0000000..85773c4
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
@@ -0,0 +1,163 @@
+# ----------------------------------------------------------------------
+#
+# File: MatMulTileConstraint.py
+#
+# Last edited: 04.07.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
+# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import int8_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class MatMulTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        # Get to-be-tiled tensor's buffers
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+        outputBuffer = ctxt.lookup(name = parseDict['data_out'])
+
+        # Add I/O dimensions to the model as variables
+        for _buffer in [bufferA, bufferB, outputBuffer]:
+            tilerModel.addTensorDimToModel(ctxt, _buffer.name)
+
+        tensorsShapeLen = len(bufferA.shape)
+
+        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transA'])
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
+        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transB'])
+        outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2))
+        outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1))
+
+        # Map output dims to inputs dims
+        for idx in range(tensorsShapeLen - 2):
+            tilerModel.addConstraint(
+                tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar(
+                    tensorName = bufferA.name, dimIdx = idx))
+            tilerModel.addConstraint(
+                tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar(
+                    tensorName = bufferB.name, dimIdx = idx))
+
+        tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar)
+        tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar)
+
+        # Add GEMM Geometrical constraints
+        tilerModel.addConstraint(ASecondDimVar == BFirstDimVar)
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+
+        tensorsShapeLen = len(bufferA.shape)
+
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
+        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transB'])
+
+        # VIC: We don't want to deal with intermediate results between kernel calls
+        tilerModel.addConstraint(ASecondDimVar == parseDict['N'])
+        tilerModel.addConstraint(BFirstDimVar == parseDict['N'])
+
+        # VIC: For now we tile only one of the inputs
+        tilerModel.addConstraint(BSecondDimVar == parseDict['O'])
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['A', 'B', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        varA = operatorRepresentation['A']
+
+        NSize = ctxt.lookup(varA).shape[-1]
+        NOffset = 0
+
+        inputACubes = []
+        inputBCubes = []
+
+        replacements = {"M": [], "O": [], "batch": []}
+
+        # Every output is constructed by a pair of inputs. Reconstruct this pair.
+        for cube in outputCubes:
+            (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+            (BatchSize, BSize, MSize, OSize) = cube.dims
+
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+
+            ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize))
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(int8_t),
+            "N": PointerClass(int8_t),
+            "O": PointerClass(int8_t),
+            "batch": PointerClass(int8_t)
+        }
+
+        for a, b in zip(inputACubes, inputBCubes):
+            inputLoadSchedule.append({"A": a, "B": b})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolTileConstraint.py
new file mode 100644
index 0000000..d2ecf5a
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolTileConstraint.py
@@ -0,0 +1,175 @@
+# ----------------------------------------------------------------------
+#
+# File: MaxPoolTileConstraint.py
+#
+# Last edited: 09.05.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
# SPDX-License-Identifier: Apache-2.0
# MaxPoolTileConstraint.py — Copyright (C) 2023, ETH Zurich and University of Bologna.
# Authors: Victor Jung, Moritz Scherer (ETH Zurich). Licensed under the Apache License 2.0.

from typing import Dict, List, Tuple

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerModel import TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme


class MaxPoolTileConstraint(TileConstraint):
    """Tiling constraints for the PULP 2D max-pooling kernel.

    Dimension indices 0..3 of both the input and output tensor are treated
    as (batch, height, width, channel), matching the variable names used
    throughout this class.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Couple input- and output-tile dimensions of a MaxPool node.

        Batch and channel sizes pass through unchanged; the spatial output
        sizes follow the standard pooling formula
        ``out = (in + pad - (kernel - 1) - 1) // stride + 1``.
        """

        # Get to-be-tiled tensor's buffers
        inputBuffer = ctxt.lookup(name = parseDict['data_in'])
        outputBuffer = ctxt.lookup(name = parseDict['data_out'])

        strides = parseDict["strides"]
        padding = parseDict["pads"]
        kernelShape = parseDict['kernel_shape']

        # Add I/O dimensions to the model as variables
        for bufferName in [inputBuffer.name, outputBuffer.name]:
            tilerModel.addTensorDimToModel(ctxt, bufferName)

        inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 0)
        inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1)
        inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2)
        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3)

        outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 0)
        outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1)
        outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2)
        outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3)

        # Map output dims to input dims: pooling never changes batch or channel count.
        tilerModel.addConstraint(outputBatchVar == inputBatchVar)  # Batch
        tilerModel.addConstraint(outputChannelVar == inputChannelVar)  # Channel

        # The (var == shape) comparison is a 0/1 solver boolean, so the border
        # padding is only counted when the tile spans the full input extent in
        # that spatial dimension.
        effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1]))
        effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2]))

        tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (kernelShape[0] - 1) - 1) // strides[0] + 1))
        tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (kernelShape[1] - 1) - 1) // strides[1] + 1))

        return tilerModel

    @staticmethod
    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Restrict tile shapes so each tile is a valid pooling workload."""

        # Get to-be-tiled tensor's buffers
        inputBuffer = ctxt.lookup(name = parseDict['data_in'])

        inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1)
        inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2)
        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3)

        strides = parseDict["strides"]

        # VIC: Constrain the minimum tile size such that we can apply at least one kernel on it.
        # The channel dimension is never tiled (fixed to the full channel count).
        tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in'])
        tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x'])
        tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y'])

        # Spatial tile sizes must be stride-aligned so tiles start on window boundaries.
        tilerModel.addConstraint((inputHeightVar % strides[0]) == 0)
        tilerModel.addConstraint((inputWidthVar % strides[1]) == 0)

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Turn the solved output tiles into per-tile input cubes and kernel parameters.

        For every output cube the matching input cube and border padding are
        derived via ``Conv2DTileConstraint.computeInputCube`` (pooling windows
        move exactly like convolution windows), and the per-tile kernel
        parameters are collected as replacement lists.
        """
        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]

        addrNames = ['data_in', 'data_out']
        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, addrNames)
        varOut = operatorRepresentation['data_out']

        inputInCubes = []
        replacements: Dict[str, List[int]] = {
            "dim_im_in_x": [],
            "dim_im_in_y": [],
            "dim_im_out_x": [],
            "dim_im_out_y": [],
            "ch_im_in": [],
            "padding_y_top": [],
            "padding_y_bottom": [],
            "padding_x_left": [],
            "padding_x_right": []
        }

        replacementTypes = {
            "dim_im_in_x": PointerClass(uint16_t),
            "dim_im_in_y": PointerClass(uint16_t),
            "dim_im_out_x": PointerClass(uint16_t),
            "dim_im_out_y": PointerClass(uint16_t),
            "ch_im_in": PointerClass(uint16_t),
            "padding_y_top": PointerClass(uint8_t),
            "padding_y_bottom": PointerClass(uint8_t),
            "padding_x_left": PointerClass(uint8_t),
            "padding_x_right": PointerClass(uint8_t)
        }

        kernelShape = operatorRepresentation['kernel_shape']
        pads = operatorRepresentation['pads']
        strides = operatorRepresentation['strides']

        for cube in outputCubes:
            # Output cubes are (batch, height, width, channel).
            (BatchOffset, HOffset, WOffset, COffset) = cube.offset
            (BatchSize, HSize, WSize, CSize) = cube.dims

            InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((kernelShape[0], kernelShape[1]), pads,
                                                                          strides, CSize, cube,
                                                                          ctxt.lookup(varOut).shape)
            padding_left, padding_right, padding_top, padding_bottom = padding_tuple

            replacements['dim_im_in_x'].append(InCube.dims[1])
            replacements['dim_im_in_y'].append(InCube.dims[2])
            replacements['dim_im_out_x'].append(HSize)
            replacements['dim_im_out_y'].append(WSize)
            replacements['ch_im_in'].append(CSize)

            replacements['padding_y_top'].append(padding_top)
            replacements['padding_y_bottom'].append(padding_bottom)
            replacements['padding_x_left'].append(padding_left)
            replacements['padding_x_right'].append(padding_right)

            inputInCubes.append(InCube)

        inputLoadSchedule = []
        outputLoadSchedule = []

        for a in inputInCubes:
            inputLoadSchedule.append({"data_in": a})

        for out in outputCubes:
            outputLoadSchedule.append({"data_out": out})

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)

        return variableReplacementSchedule, tilingSchedule
# SPDX-License-Identifier: Apache-2.0
# RequantShiftTileConstraint.py — Copyright (C) 2023, ETH Zurich and University of Bologna.
# Author: Moritz Scherer, ETH Zurich. Licensed under the Apache License 2.0.

from typing import Dict, List, Tuple

import numpy as np

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import uint16_t
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerModel import TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
    VariableReplacementScheme


class RequantShiftTileConstraint(TileConstraint):
    """Tiling constraints for the RequantShift (per-channel mul/add/shift) kernel.

    The op is elementwise over `data_in`, with `mul` and `add` broadcast along
    the channel axis; channel position depends on the `channels_first` flag.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Tie the requant-parameter tiles to the data tiles' channel dimension."""

        inputBufferName = parseDict['data_in']
        mulBufferName = parseDict['mul']
        addBufferName = parseDict['add']
        outputBufferName = parseDict['data_out']

        # Add I/O dimensions to the model as variables
        for bufferName in [inputBufferName, mulBufferName, addBufferName, outputBufferName]:
            tilerModel.addTensorDimToModel(ctxt, bufferName)

        inputShape = ctxt.lookup(inputBufferName).shape

        mulBufferShapeLen = len(ctxt.lookup(mulBufferName).shape)
        addBufferShapeLen = len(ctxt.lookup(addBufferName).shape)

        # The per-channel parameters live in the last dimension of mul/add.
        mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = mulBufferShapeLen - 1)
        addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = addBufferShapeLen - 1)

        tilerModel.addConstraint(mulChannelVar == addChannelVar)

        # Channel axis of the data tensor: dim 1 for channels-first, last dim otherwise.
        channels_first = parseDict['channels_first']
        if not channels_first:
            inChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = len(inputShape) - 1)
        else:
            inChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1)

        tilerModel.addConstraint(mulChannelVar == inChannelVar)

        # Elementwise op: the output tile mirrors the input tile in every dimension.
        for dim in range(len(inputShape)):
            inputDimVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = dim)
            outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim)
            tilerModel.addConstraint(inputDimVar == outputDimVar)

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Serialize the solved tiles: input tiles equal output tiles; each
        requant-parameter tile is the 1-D channel slice of its data tile."""
        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]

        addrNames = ['data_in', 'mul', 'add', 'data_out']
        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, addrNames)

        # Elementwise: load exactly the cube that is produced.
        inputCubes = outputCubes

        rqCubes = []

        replacements = {"size": [], "channel_width": [], "channels": []}
        replacementTypes = {
            "size": PointerClass(uint16_t),
            "channel_width": PointerClass(uint16_t),
            "channels": PointerClass(uint16_t)
        }

        for cube in inputCubes:

            # 1-D slice over the channel axis for mul/add.
            if operatorRepresentation['channels_first']:
                rqCube = HyperRectangle((cube.offset[1],), (cube.dims[1],))
                channelDim = cube.dims[1]
            else:
                rqCube = HyperRectangle((cube.offset[-1],), (cube.dims[-1],))
                channelDim = cube.dims[-1]

            rqCubes.append(rqCube)

            # size: elements per batch item; channel_width: elements sharing one
            # requant parameter.
            size = np.prod(cube.dims[1:])
            channelWidth = size // channelDim
            channels = channelDim

            replacements['size'].append(size)
            replacements['channel_width'].append(channelWidth)
            replacements['channels'].append(channels)

        inputLoadSchedule = []
        outputLoadSchedule = []

        # mul and add share the same channel slice per tile.
        for a, rq in zip(inputCubes, rqCubes):
            inputLoadSchedule.append({"data_in": a, "add": rq, "mul": rq})

        for out in outputCubes:
            outputLoadSchedule.append({"data_out": out})

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)

        return variableReplacementSchedule, tilingSchedule
import * diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/iSoftmaxTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/iSoftmaxTileConstraint.py new file mode 100644 index 0000000..9fb2a6f --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/iSoftmaxTileConstraint.py @@ -0,0 +1,124 @@ +# ---------------------------------------------------------------------- +# +# File: iSoftmaxTileConstraint.py +# +# Last edited: 13.11.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# SPDX-License-Identifier: Apache-2.0
# iSoftmaxTileConstraint.py — Copyright (C) 2023, ETH Zurich and University of Bologna.
# Author: Moritz Scherer, ETH Zurich. Licensed under the Apache License 2.0.

from typing import Dict, List, Tuple, Union

import numpy as np
from ortools.constraint_solver.pywrapcp import IntVar

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import uint32_t
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerModel import TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme


class iSoftmaxTileConstraint(TileConstraint):
    """Tiling constraints for the integer softmax kernel.

    Softmax reduces over the last axis, so tiles may split any dimension
    except the last one, which must stay at its full length.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Elementwise shape coupling: every output tile dim equals the input tile dim."""
        inputBufferName = parseDict['data_in']
        outputBufferName = parseDict['data_out']

        shapeLen = len(ctxt.lookup(inputBufferName).shape)

        # Add I/O dimensions to the model as variables
        for bufferName in [inputBufferName, outputBufferName]:
            tilerModel.addTensorDimToModel(ctxt, bufferName)

        for idx in range(shapeLen):
            outputDim = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx)
            inputDim = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx)
            tilerModel.addConstraint(outputDim == inputDim)

        return tilerModel

    @staticmethod
    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Forbid tiling of the reduction axis (last dim fixed to full length)."""
        inputBufferName = parseDict['data_in']
        inputBuffer = ctxt.lookup(inputBufferName)

        lastDimLength = inputBuffer.shape[-1]
        lastDimIdx = len(inputBuffer.shape) - 1
        lastDimVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = lastDimIdx)

        # The softmax normalization runs over the last axis — it must never be split.
        tilerModel.addConstraint(lastDimVar == lastDimLength)

        return tilerModel

    @staticmethod
    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
        """Expose `lastDimLength` as a solver variable in the symbolic node representation."""

        inputBufferName = parseDict['data_in']
        inputBuffer = ctxt.lookup(inputBufferName)

        lastDimIdx = len(inputBuffer.shape) - 1

        symbolicParseDict = parseDict.copy()
        symbolicParseDict['lastDimLength'] = tilerModel.getTensorDimVar(inputBuffer.name, lastDimIdx)

        return symbolicParseDict

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Serialize tiles: input tiles equal output tiles (elementwise schedule);
        per-tile `size` (total elements) and `lastDimLength` become replacements."""
        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]

        addrNames = ['data_in', 'data_out']
        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, addrNames)

        replacements = {"lastDimLength": [], "size": []}

        replacementTypes = {"lastDimLength": PointerClass(uint32_t), "size": PointerClass(uint32_t)}

        for cube in outputCubes:
            lastDimLength = cube.dims[-1]
            size = np.prod(cube.dims)

            replacements['lastDimLength'].append(lastDimLength)
            replacements['size'].append(size)

        inputLoadSchedule = []
        outputLoadSchedule = []

        for out in outputCubes:
            inputLoadSchedule.append({"data_in": out})
            outputLoadSchedule.append({"data_out": out})

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)

        return variableReplacementSchedule, tilingSchedule
# SPDX-License-Identifier: Apache-2.0
# PULPTiler.py — Copyright (C) 2023, ETH Zurich and University of Bologna.
# Authors: Moritz Scherer, Victor Jung (ETH Zurich). Licensed under the Apache License 2.0.
#
# Declarative registry pairing each set of PULPOpen node bindings with its
# tile constraint, producing the tiling-ready bindings consumed by the
# TilerExtension.

import copy

from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import MemoryPassthroughGeneration
from Deeploy.DeeployTypes import CodeTransformation
from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicReshapeBindings
from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint
from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint
from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint
from Deeploy.Targets.Generic.TileConstraints.iRMSNormTileConstraint import iRMSNormTileConstraint
from Deeploy.Targets.Generic.TileConstraints.MulTileConstraint import MulTileConstraint
from Deeploy.Targets.Generic.TileConstraints.NOPTileConstraint import NOPTileConstraint
from Deeploy.Targets.Generic.TileConstraints.RQSiGELUTileConstraint import RQSiGELUTileConstraint
from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint
from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint
from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint
from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint
from Deeploy.Targets.PULPOpen.Bindings import PULPConcatBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \
    PULPiRQSGELUBindings, PULPMatMulBinding, PULPMaxPool2DBindings, PULPMulBindings, PULPRQAddBindings, \
    PULPRQSBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \
    PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSoftmaxBindings, PULPTransposeBindings, \
    PULPUniformRQSBindings, SimpleTransformer
from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint
from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint
from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import GEMMTileConstraint, MatrixVecTileConstraint, \
    TallGEMMTileConstraint
from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import iSoftmaxTileConstraint
from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint
from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolTileConstraint
from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint
from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings

# --- Convolution / GEMM family ------------------------------------------------

PULPRQSConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSConv2DBindings,
                                                           tileConstraint = Conv2DTileConstraint())

PULPRQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSDWConv2DBindings,
                                                             tileConstraint = DWConv2DTileConstraint())

PULPRQSGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSGEMMBindings,
                                                         tileConstraint = GEMMTileConstraint())

PULPRQSMatrixVecTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSMatrixVecBindings,
                                                              tileConstraint = MatrixVecTileConstraint())

PULPRQSTallGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSTallGEMMBindings,
                                                             tileConstraint = TallGEMMTileConstraint())

PULPMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPMatMulBinding],
                                                        tileConstraint = MatMulTileConstraint())

# --- Elementwise / activation family -----------------------------------------

PULPRQAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQAddBindings,
                                                       tileConstraint = AddTileConstraint())

PULPiHardswishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPiHardswishBindings,
                                                            tileConstraint = iHardswishTileConstraint())

PULPRQSiHardswishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSiHardswishBindings,
                                                               tileConstraint = RQSiHardswishTileConstraint())

# Flatten is a no-op at runtime: reuse the generic reshape bindings but swap in
# a passthrough memory transformer so no data is actually moved.
_BasicFlattenBindings = copy.deepcopy(BasicReshapeBindings)
for binding in _BasicFlattenBindings:
    binding.codeTransformer = CodeTransformation([MemoryPassthroughGeneration("L.*"), MemoryPassthroughGeneration()])

PULPFlattenTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _BasicFlattenBindings,
                                                         tileConstraint = NOPTileConstraint())

PULPMaxPool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings,
                                                           tileConstraint = MaxPoolTileConstraint())

PULPRQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSBindings,
                                                     tileConstraint = RequantShiftTileConstraint())

PULPUniformRQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPUniformRQSBindings,
                                                            tileConstraint = UnaryTileConstraint())

PULPTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPTransposeBindings,
                                                           tileConstraint = TransposeTileConstraint())

# Plain Add reuses the generic bindings with the PULP code transformer; it is
# registered as untiled.
_PULPAddBindings = copy.deepcopy(BasicAddBindings)
for binding in _PULPAddBindings:
    binding.codeTransformer = SimpleTransformer

PULPAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _PULPAddBindings,
                                                     tileConstraint = UntiledTileConstraint())

PULPiSoftmaxTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSoftmaxBindings,
                                                          tileConstraint = iSoftmaxTileConstraint())

PULPConcatTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPConcatBindings,
                                                        tileConstraint = ConcatTileConstraint())

PULPiRMSNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPiRMSNormBindings,
                                                          tileConstraint = iRMSNormTileConstraint())

PULPiRQSGELUTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPiRQSGELUBindings,
                                                          tileConstraint = RQSiGELUTileConstraint())

PULPMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMulBindings,
                                                     tileConstraint = MulTileConstraint())
# SPDX-License-Identifier: Apache-2.0
# PULPPasses.py — Copyright (C) 2023, ETH Zurich and University of Bologna.
# Author: Moritz Scherer, ETH Zurich. Licensed under the Apache License 2.0.
#
# Topology-optimization passes for the PULPOpen target: merging RequantShift
# nodes into their producer (Conv/GEMM/MatMul/Add) and squashing transposes
# around requantized adds.

import copy
from collections import OrderedDict

import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.CommonExtensions.OptimizationPasses.Matchers import BranchingMatcher, Match
from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic


def _squash_transpose_add_fun(graph: gs.Graph, match: Match, name: str):
    """Rewrite Transpose(x) + Transpose(y) -> Transpose(x + y).

    Only fires when both transposes share the same permutation and the add is
    requantized layerwise (scalar rqs1 parameters); otherwise the graph is
    returned unchanged.
    """

    nodes_map = match.nodes_map

    # SCHEREMO: Check that perms are equal
    if not nodes_map['transpose1'].attrs['perm'] == nodes_map['transpose2'].attrs['perm']:
        return graph

    # SCHEREMO: Make sure we are requantizing layerwise
    if not (isinstance(nodes_map['add'].attrs['rqs1_add'], int)
            and isinstance(nodes_map['add'].attrs['rqs1_mul'], int)):
        return graph

    addNode = nodes_map['add']

    transposeAttrs = copy.deepcopy(nodes_map['transpose1'].attrs)
    newInputs = [nodes_map['transpose1'].inputs[0], nodes_map['transpose2'].inputs[0]]
    newOutputs = [addNode.outputs[0]]

    graph.deleteNode(nodes_map['transpose1'])
    graph.deleteNode(nodes_map['transpose2'])

    # The add now produces an intermediate tensor which a single trailing
    # transpose maps onto the original output.
    newAddOut = gs.Variable(name = addNode.outputs[0].name + "_tp")
    newAddOut.shape = newInputs[0].shape
    newAddOut.dtype = newOutputs[0].dtype

    addNode.outputs = [newAddOut]
    graph.layer(inputs = [newAddOut],
                outputs = [newOutputs[0]],
                op = "Transpose",
                name = addNode.name + "_transpose",
                attrs = transposeAttrs)

    return graph


@contextagnostic
class RQAddTransposeSquashPass(ReplaceSequentialPatternPass):
    """Match Transpose -> RequantizedAdd <- Transpose and squash the transposes."""

    def __init__(self):
        _input1 = gs.Variable(name = 'input_1')
        _input2 = gs.Variable(name = 'input_2')
        _addIn1 = gs.Variable(name = 'addIn1')
        _addIn2 = gs.Variable(name = 'addIn2')
        _addOut = gs.Variable(name = 'addOut')
        _rqs = gs.Variable(name = 'rqs')

        anyIn1 = gs.Node(inputs = [_input1], outputs = [_addIn1], op = r'Transpose', name = 'transpose1')
        anyIn2 = gs.Node(inputs = [_input2], outputs = [_addIn2], op = r'Transpose', name = 'transpose2')

        addOut = gs.Node(inputs = [_addIn1, _addIn2], outputs = [_addOut], op = 'RequantizedAdd', name = 'add')

        graph = gs.Graph(nodes = [anyIn1, anyIn2, addOut], inputs = [_input1, _input2], outputs = [_rqs])

        super().__init__(graph,
                         replacement_fn = _squash_transpose_add_fun,
                         name = "_SQUASH_TRANSPOSE_RQADD_PASS",
                         matcher = BranchingMatcher(regex_op = True))


def _merge_add_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Fold the RequantShift nodes around an Add into one RequantizedAdd.

    Per-input (rqs1/rqs2) and per-output (rqsOut) requantization parameters
    are flattened into prefixed node attributes. Bails out (returns the graph
    unchanged) when any requant parameter is not a scalar.
    """

    nodes_map = match.nodes_map
    addNode = nodes_map['add']

    rqDict = OrderedDict([("rqs1", None), ("rqs2", None), ("rqsOut", None)])

    # Identify which matched node feeds which add input / consumes the add output.
    for key, node in nodes_map.items():

        if node.outputs[0].name == addNode.inputs[0].name:
            rqDict['rqs1'] = node
        elif node.outputs[0].name == addNode.inputs[1].name:
            rqDict['rqs2'] = node
        elif node.inputs[0].name == addNode.outputs[0].name:
            rqDict['rqsOut'] = node

    newAttrs = copy.copy(addNode.attrs)
    newInputs = []

    if rqDict['rqsOut'] is not None:
        newOutputs = rqDict['rqsOut'].outputs
    else:
        newOutputs = addNode.outputs

    # Identity requantization used for any slot that is not a RequantShift node.
    defaultAttrs = {
        "mul": 1,
        "add": 0,
        "div": gs.Constant('div', np.array(1)),
        'shift': gs.Constant('shift', np.array(0))
    }
    guessAttrs = {"n_levels_out": 256, "signed": np.array([True])}

    # NOTE(review): assumes every slot in rqDict was populated by the matcher;
    # a None entry would raise AttributeError below — confirm matcher guarantees.
    for idx, (rqKey, node) in enumerate(rqDict.items()):
        if node.op == "RequantShift":
            for key, attr in node.attrs.items():
                newAttrs[f"{rqKey}_{key}"] = attr

            # Only layerwise (scalar) requantization can be merged.
            if np.prod(node.inputs[1].values.shape) != 1:
                return graph

            if np.prod(node.inputs[2].values.shape) != 1:
                return graph

            if rqKey != 'rqsOut':
                newInputs.append(node.inputs[0])

            # Fold rounding (half the divisor) into the add constant and turn the
            # power-of-two division into a shift.
            newAttrs[f"{rqKey}_mul"] = int(node.inputs[1].values.item())
            newAttrs[f"{rqKey}_add"] = int(node.inputs[2].values.item() + newAttrs[f"{rqKey}_div"].values.item() // 2)
            newAttrs[f"{rqKey}_shift"] = int(np.log2(newAttrs[f"{rqKey}_div"].values.item()))

        else:
            for key, attr in defaultAttrs.items():
                newAttrs[f"{rqKey}_{key}"] = attr

            for key, attr in guessAttrs.items():
                if key not in node.attrs:
                    newAttrs[f"{rqKey}_{key}"] = attr
                else:
                    newAttrs[f"{rqKey}_{key}"] = node.attrs[key]
            if rqKey != 'rqsOut':
                newInputs.append(addNode.inputs[idx])

    rqAdd = gs.Node(op = "RequantizedAdd", name = name, attrs = newAttrs)
    graph.replaceInsertNode(newInputs, newOutputs, rqAdd)

    return graph


@contextagnostic
class PULPAddRequantMergePass(ReplaceSequentialPatternPass):
    """Match (any, any) -> Add -> RequantShift and merge into RequantizedAdd."""

    def __init__(self):
        _input1 = gs.Variable(name = 'input_1')
        _input2 = gs.Variable(name = 'input_2')
        _addIn1 = gs.Variable(name = 'addIn1')
        _addIn2 = gs.Variable(name = 'addIn2')
        _addOut = gs.Variable(name = 'addOut')
        _rqs = gs.Variable(name = 'rqs')

        anyIn1 = gs.Node(inputs = [_input1], outputs = [_addIn1], op = r'.*', name = 'any1')
        anyIn2 = gs.Node(inputs = [_input2], outputs = [_addIn2], op = r'.*', name = 'any2')

        addOut = gs.Node(inputs = [_addIn1, _addIn2], outputs = [_addOut], op = 'Add', name = 'add')
        output = gs.Node(inputs = [_addOut], outputs = [_rqs], op = r'RequantShift', name = 'rqsOut')

        graph = gs.Graph(nodes = [anyIn1, anyIn2, addOut, output], inputs = [_input1, _input2], outputs = [_rqs])

        super().__init__(graph,
                         replacement_fn = _merge_add_rq_fun,
                         name = "_MERGE_ADDRQ_PASS",
                         matcher = BranchingMatcher(regex_op = True))


def _merge_conv_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Merge Conv -> RequantShift into a single RequantizedConv node."""
    matched_nodes = [m for k, m in match.nodes_map.items()]
    conv = matched_nodes[0]
    rqs = matched_nodes[1]

    # The requant divisor is assumed to be a power of two; it becomes a shift.
    totalShift = int(np.log2(rqs.attrs['div'].values))

    # Artificially add half the shift division value to implement rounding
    rqs.inputs[-1].values = copy.deepcopy(rqs.inputs[-1].values) + 2**(totalShift - 1)

    _inputs = list(conv.inputs) + list(rqs.inputs[1:])
    _outputs = rqs.outputs

    rqsConv = gs.Node(op = 'RequantizedConv', name = name, attrs = {**conv.attrs, **rqs.attrs, "shift": totalShift})
    graph.replaceInsertNode(_inputs, _outputs, rqsConv)

    return graph


@contextagnostic
class PULPConvRequantMergePass(ReplaceSequentialPatternPass):
    """Match Conv -> RequantShift and merge into RequantizedConv."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['conv_out'], op = 'Conv', name = 'conv1')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_CONVRQ_PASS"
        super().__init__(graph, _merge_conv_rq_fun, name)


def _merge_gemm_rq_fun(graph: gs.Graph, match: Match, name: str):
    """Merge Gemm/MatMul -> RequantShift into a single RequantizedGemm node.

    If the GEMM carries a bias, the bias is rescaled by the requant multiplier
    and folded together with the requant add constant.
    """
    matched_nodes = [m for k, m in match.nodes_map.items()]
    gemm = matched_nodes[0]
    rqs = matched_nodes[1]

    # Power-of-two divisor becomes a shift; add half of it for rounding.
    totalShift = int(np.log2(rqs.attrs['div'].values))
    rqs.inputs[-1].values = copy.deepcopy(rqs.inputs[-1].values) + 2**(totalShift - 1)

    # GEMM has add (bias present)
    if len(list(gemm.inputs)) == 3:
        gemm.inputs[2].values = np.round(gemm.inputs[2].values * (rqs.inputs[1].values)) + rqs.inputs[2].values

        # Keep input, weight, folded bias from GEMM; take mul from RQS.
        _inputs = list(gemm.inputs) + list(rqs.inputs[1:2])
    else:
        _inputs = list(gemm.inputs) + list(rqs.inputs[2:]) + list(rqs.inputs[1:2])
    _outputs = rqs.outputs
    attrs = {**gemm.attrs, **rqs.attrs}
    attrs['shift'] = gs.Constant(name = 'shift', values = np.array(totalShift))
    rqsGemm = gs.Node(op = 'RequantizedGemm', name = name, attrs = attrs)
    graph.replaceInsertNode(_inputs, _outputs, rqsGemm)

    return graph


@contextagnostic
class PULPGEMMRequantMergePass(ReplaceSequentialPatternPass):
    """Match Gemm -> RequantShift and merge into RequantizedGemm."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['gemm_out'], op = 'Gemm', name = 'gemm')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_GEMMRQ_PASS"
        super().__init__(graph, _merge_gemm_rq_fun, name)


@contextagnostic
class PULPMatMulRequantMergePass(ReplaceSequentialPatternPass):
    """Match MatMul -> RequantShift and merge via the same GEMM merge function."""

    def __init__(self):
        graph = gs.Graph()
        _input = gs.Variable(name = 'input_1')
        output = graph.layer(inputs = [_input], outputs = ['gemm_out'], op = 'MatMul', name = 'gemm')
        output = graph.layer(inputs = output, outputs = ['rqs'], op = 'RequantShift', name = 'rqs1')
        graph.outputs.append(output)
        graph.inputs.append(_input)

        name = "_MERGE_GEMM_MATMUL_RQ_PASS"
        super().__init__(graph, _merge_gemm_rq_fun, name)
import * diff --git a/Deeploy/Targets/PULPOpen/TypeCheckers.py b/Deeploy/Targets/PULPOpen/TypeCheckers.py new file mode 100644 index 0000000..2685f4d --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TypeCheckers.py @@ -0,0 +1,159 @@ +# ---------------------------------------------------------------------- +# +# File: PULPCheckers.py +# +# Last edited: 03.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Sequence, Type + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker +from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer + + +class PULPDMASliceChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [inputs[0].nLevels] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + if inputs[0]._signed: + return [True] + else: + return [False] + + +class PULPRQAddChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [operatorRepresentation['rqsOut_n_levels']] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [bool(operatorRepresentation["rqsOut_signed"])] + + # Override this. 
This should compute the signednes of each output node of the Layer + def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool: + outputTypeSigned = self.output_types[0].referencedType.typeMin < 0 + if operatorRepresentation['rqsOut_signed'] and outputTypeSigned: + return True + if (not operatorRepresentation['rqsOut_signed']) and (not outputTypeSigned): + return True + return False + + +class PULPRequantShiftChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [operatorRepresentation['n_levels']] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [operatorRepresentation["signed"]] + + def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool: + outputTypeSigned = self.output_types[0].referencedType.typeMin < 0 + if operatorRepresentation['signed'] and outputTypeSigned: + return True + if (not operatorRepresentation['signed']) and (not outputTypeSigned): + return True + return False + + +class PULPConvChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [operatorRepresentation['n_levels']] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [bool(operatorRepresentation["signed"])] + + # Override this. 
This should compute the signednes of each output node of the Layer + def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool: + outputTypeSigned = self.output_types[0].referencedType.typeMin < 0 + if operatorRepresentation['signed'] and outputTypeSigned: + return True + if (not operatorRepresentation['signed']) and (not outputTypeSigned): + return True + return False + + +class PULPLinearChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [operatorRepresentation['n_levels']] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [bool(operatorRepresentation["signed"])] + + # Override this. This should compute the signednes of each output node of the Layer + def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool: + outputTypeSigned = self.output_types[0].referencedType.typeMin < 0 + if operatorRepresentation['signed'] and outputTypeSigned: + return True + if (not operatorRepresentation['signed']) and (not outputTypeSigned): + return True + return False + + +class PULPMaxPoolChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [inputs[0].nLevels] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + if inputs[0]._signed: + return [True] + else: + return [False] + + # Override this. 
This should compute the signednes of each output node of the Layer + def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool: + return True diff --git a/Deeploy/Targets/PULPOpen/__init__.py b/Deeploy/Targets/PULPOpen/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/PULPOpen/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/__init__.py b/Deeploy/Targets/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/Targets/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py new file mode 100644 index 0000000..e19d1ca --- /dev/null +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -0,0 +1,190 @@ +# ---------------------------------------------------------------------- +# +# File: TilingCodeGeneration.py +# +# Last edited: 24.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from abc import abstractmethod +from typing import List, Tuple + +import Deeploy.CommonExtensions.DataTypes as BasicDataTypes +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureExecutionBlock +from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \ + IntrospectiveCodeTransformationMixIn +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, OperatorRepresentation, VariableBuffer, _NoVerbosity +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PrototypeTilingMixIn +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement + + +class TilingCodeGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, PrototypeTilingMixIn): + + def __init__(self, targetMemLevel: str): + self.targetMemLevel = targetMemLevel + self.argStructGeneration = ArgumentStructGeneration() + + @abstractmethod + def generateTilingLoop( + self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, + tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + + return ctxt, executionBlock, False + + # SCHEREMO: internalPtr refers to the HIGHER memory level of a transfer, + # e.g. in both an L2 -> L1 and L1 -> L2 transfer, the internalPtr is in L1. 
+ @staticmethod + def isFinalMemoryLevel(nodeMemoryConstraint: NodeMemoryConstraint, internalPtr: VariableBuffer) -> bool: + externalName = internalPtr._referenceName + tensorMemoryConstraint = nodeMemoryConstraint.tensorMemoryConstraints[externalName] + if len(tensorMemoryConstraint.memoryConstraints.keys()) <= 2: + return True + + finalMemoryLevels = list(tensorMemoryConstraint.memoryConstraints.keys())[:2] + memoryLevel = internalPtr._memoryLevel + + return memoryLevel in finalMemoryLevels + + def _hoistTileIdxPtr(self, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, + sourceMemoryLevel: str = "L2") -> str: + + newPtrName = self.prefix + operatorRepresentation['nodeName'] + "_tileIdxPtr" + + tilePtrBuffer = ctxt.VariableBuffer(newPtrName, shape = [1]) + ctxt.add(tilePtrBuffer, "local") + + _type = ctxt.lookup(self.prefix + operatorRepresentation['nodeName'] + "_numTiles")._type + + tilePtrBuffer._type = _type + tilePtrBuffer._instance = tilePtrBuffer._type(newPtrName, ctxt) + tilePtrBuffer._memoryLevel = sourceMemoryLevel + + tilePtrBuffer.allocTemplate = NodeTemplate("") + tilePtrBuffer.deallocTemplate = NodeTemplate("") + tilePtrBuffer.initTemplate = NodeTemplate(""" + ${type.referencedType.typeName} bu_${name} = 0; + ${type.referencedType.typeName}* ${name} = &bu_${name};""") + + return newPtrName + + def _hoistNumTiles(self, + ctxt: NetworkContext, + nodeName: str, + tilingSchedules: List[TilingSchedule], + sourceMemoryLevel: str = "L2") -> str: + + newPtrName = self.prefix + nodeName + "_numTiles" + + numTiles = [len(tilingSchedule.outputLoadSchedule) for tilingSchedule in tilingSchedules] + cumNumTiles = [0] + for idx in list(range(len(numTiles))): + cumNumTiles.append(cumNumTiles[-1] + numTiles[idx]) + + cb = ctxt.ConstantBuffer(newPtrName, [len(cumNumTiles)], values = cumNumTiles) + ctxt.add(cb, "global") + + minType = None + if BasicDataTypes.uint8_t.checkValue(cumNumTiles): + minType = BasicDataTypes.uint8_t + elif 
BasicDataTypes.uint16_t.checkValue(cumNumTiles): + minType = BasicDataTypes.uint16_t + else: + minType = BasicDataTypes.uint32_t + + cb._type = PointerClass(minType) + cb._instance = cb._type(newPtrName, ctxt) + cb._memoryLevel = sourceMemoryLevel + + return newPtrName + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + + def unravelReference(ctxt: NetworkContext, name: str) -> str: + + if name not in ctxt.localObjects.keys() and name not in ctxt.globalObjects.keys(): + return name + + refBuffer = ctxt.lookup(name) + if not hasattr(refBuffer, "_referenceName"): + return name + + return unravelReference(ctxt, refBuffer._referenceName) + + if isinstance(executionBlock, ClosureExecutionBlock): + baseExecutionBlock = executionBlock.baseBlock + else: + baseExecutionBlock = executionBlock + + patternMemoryConstraint = baseExecutionBlock.patternMemoryConstraint + + if patternMemoryConstraint is None: + return ctxt, executionBlock + + assert len(patternMemoryConstraint.nodeConstraints) == 1, "Only layerwise supported for now!" + #assert len(baseExecutionBlock.codeSnippets) == 1, "Only layerwise supported for now!" 
+ + nodeMemoryConstraint = patternMemoryConstraint.nodeConstraints[0] + + possibleTemplateNodes = [ + node for node in baseExecutionBlock.codeSnippets if hasattr(node.template, 'tileConstraint') + ] + + assert len(possibleTemplateNodes) == 1, "More than one template node with TCF found" + + templateNode = possibleTemplateNodes[0] + + operatorRepresentation = templateNode.operatorRepresentation + unravelRep = operatorRepresentation.copy() + for key in unravelRep.keys(): + + val = unravelRep[key] + if not isinstance(val, str): + continue + + unravelRep[key] = unravelReference(ctxt, val) + + template = templateNode.template + + variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( + nodeMemoryConstraint, self.targetMemLevel, ctxt, unravelRep) + + minimalVariableReplacement, newNodeRep = minimizeVariableReplacement(variableReplacement, + templateNode.operatorRepresentation) + for key, value in newNodeRep.items(): + templateNode.operatorRepresentation[key] = value + + ctxt, executionBlock, applicable = self.generateTilingLoop(ctxt, executionBlock, nodeMemoryConstraint, + tilingSchedules, minimalVariableReplacement, + operatorRepresentation) + if applicable: + ctxt, executionBlock = self.argStructGeneration.apply(ctxt, executionBlock, name) + + return ctxt, executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py new file mode 100644 index 0000000..eb063e2 --- /dev/null +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py @@ -0,0 +1,463 @@ +# ---------------------------------------------------------------------- +# +# File: TilingPrototypes.py +# +# Last edited: 17.04.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List + +from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NodeTemplate + + +@dataclass +class TilingMetaInfo: + nodeName: str + nodeOps: int + numTiles: int + tileIdxVar: str + + +_CodeSegmentType = List[CodeSnippet] + +_measureCycles = NodeTemplate(""" +${nodeName}_${measurementName}_measurements[${tileIdx}] = getCycles(); +""") + +_measurementArrayDeclaration = NodeTemplate(""" +uint32_t ${nodeName}_${measurementName}_measurements[${numTiles}]; +""") + +_printPrefixAndSufixDeclaration = NodeTemplate(""" +char ${nodeName}_prefix[] = "[${nodeName}][${buffering}][${nodeOps} ops][Tile "; +char ${nodeName}_suffix[] = " cycles \\n"; +""") + +_measureConditionSetup = NodeTemplate(""" +if(${cond}){ +""") + +_measureConditionEnd = NodeTemplate(""" +} +""") + +_printLoopSetup = NodeTemplate(""" +StopTimer(); +for (int printLoopIdx = 0; printLoopIdx < ${numTiles}; printLoopIdx++){ +""") + +_printCycleDifference = NodeTemplate(r""" +printf("%s%u] %s%u%s", ${nodeName}_prefix,${tileIdx},"${flavorStr}", \ +${nodeName}_${endMeasurementName}_measurements[${tileIdx}] - ${nodeName}_${startMeasurementName}_measurements[${tileIdx}],${nodeName}_suffix); +""") + +_printLoopTeardown = 
NodeTemplate(""" +} +StartTimer(); +""") + + +class PrototypeTilingMixIn(ABC): + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: _CodeSegmentType, + teardownStatements: _CodeSegmentType) -> ExecutionBlock: + + for transaction in reversed(setupStatements): + executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) + + for transaction in teardownStatements: + executionBlock.addRight(transaction.template, transaction.operatorRepresentation) + + return executionBlock + + @classmethod + def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + openLoopStatements: _CodeSegmentType, closeLoopStatements: _CodeSegmentType) -> ExecutionBlock: + + for transaction in reversed(openLoopStatements): + executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) + + for transaction in closeLoopStatements: + executionBlock.addRight(transaction.template, transaction.operatorRepresentation) + + return executionBlock + + @classmethod + def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, + ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, + egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, + variableUpdates: _CodeSegmentType, openLoopStatements: _CodeSegmentType, + closeLoopStatements: _CodeSegmentType, setupStatements: _CodeSegmentType, + teardownStatements: _CodeSegmentType) -> ExecutionBlock: + + if not hasattr(cls, "generateInnerCode"): + raise Exception("You need to mix in a code gen strategy!") + + newExecutionBlock = cls.generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls, + ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls, + egressDMAWaitStatements, egressDMAUpdates, variableUpdates) + + newExecutionBlock = 
cls.generateLoopCode(newExecutionBlock, metaInfo, openLoopStatements, closeLoopStatements) + + newExecutionBlock = cls.generateSetupAndTeardownCode(newExecutionBlock, metaInfo, setupStatements, + teardownStatements) + + return newExecutionBlock + + +class TilingCodeGenMixin(ABC): + + @abstractmethod + def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, + ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, + egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, + variableUpdates: _CodeSegmentType) -> ExecutionBlock: + + return executionBlock + + +class SingleBufferingTilingMixIn(PrototypeTilingMixIn, TilingCodeGenMixin): + + @classmethod + def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, + ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, + egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, + variableUpdates: _CodeSegmentType) -> ExecutionBlock: + + # Structure: + # Update DMA Structs + # Transfer in tiles (async) + # Update tile variables + # Wait for tiles + + # Kernel execution + + # Update DMA Structs + # Transfer out tiles (async) + # Wait for out transfers + + for transaction in reversed(ingressDMAUpdates + ingressDMATransferCalls + variableUpdates + + ingressDMAWaitStatements): + executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) + + for transaction in (egressDMAUpdates + egressDMATransferCalls + egressDMAWaitStatements): + executionBlock.addRight(transaction.template, transaction.operatorRepresentation) + + return executionBlock + + +class ProfilingSingleBufferingTilingMixIn(SingleBufferingTilingMixIn): + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: 
ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: _CodeSegmentType, + teardownStatements: _CodeSegmentType) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + nodeOps = metaInfo.nodeOps + numTiles = metaInfo.numTiles + + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, + teardownStatements) + + for measurementName in [ + "kernel_start", "kernel_end", "ingress_dma_wait_start", "ingress_dma_wait_end", "egress_dma_wait_start", + "egress_dma_wait_end" + ]: + executionBlock.addLeft(_measurementArrayDeclaration, { + "nodeName": nodeName, + "measurementName": measurementName, + "numTiles": numTiles + }) + + executionBlock.addLeft(_printPrefixAndSufixDeclaration, { + "nodeName": nodeName, + "nodeOps": nodeOps, + "buffering": "SB" + }) + + executionBlock.addRight(_printLoopSetup, {"numTiles": numTiles}) + + executionBlock.addRight( + _printCycleDifference, { + "nodeName": nodeName, + "flavorStr": "Input DMA took ", + "startMeasurementName": "ingress_dma_wait_start", + "endMeasurementName": "ingress_dma_wait_end", + "tileIdx": "printLoopIdx" + }) + executionBlock.addRight( + _printCycleDifference, { + "nodeName": nodeName, + "flavorStr": "Kernel took ", + "startMeasurementName": "kernel_start", + "endMeasurementName": "kernel_end", + "tileIdx": "printLoopIdx" + }) + executionBlock.addRight( + _printCycleDifference, { + "nodeName": nodeName, + "flavorStr": "Output DMA took ", + "startMeasurementName": "egress_dma_wait_start", + "endMeasurementName": "egress_dma_wait_end", + "tileIdx": "printLoopIdx" + }) + + executionBlock.addRight(_printLoopTeardown, {}) + + return executionBlock + + @classmethod + def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, + ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, + egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: 
_CodeSegmentType, + variableUpdates: _CodeSegmentType) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + numTiles = metaInfo.numTiles + tileIdxVar = metaInfo.tileIdxVar + + executionBlock.addLeft(_measureCycles, { + "nodeName": nodeName, + "measurementName": "kernel_start", + "tileIdx": tileIdxVar + }) + executionBlock.addRight(_measureCycles, { + "nodeName": nodeName, + "measurementName": "kernel_end", + "tileIdx": tileIdxVar + }) + + _ingressDMAWaitStatements = [] + _ingressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": nodeName, + "measurementName": "ingress_dma_wait_start", + "tileIdx": tileIdxVar + })) + _ingressDMAWaitStatements += ingressDMAWaitStatements + _ingressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": nodeName, + "measurementName": "ingress_dma_wait_end", + "tileIdx": tileIdxVar + })) + + _egressDMAWaitStatements = [] + _egressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": nodeName, + "measurementName": "egress_dma_wait_start", + "tileIdx": tileIdxVar + })) + _egressDMAWaitStatements += egressDMAWaitStatements + _egressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": nodeName, + "measurementName": "egress_dma_wait_end", + "tileIdx": tileIdxVar + })) + + executionBlock = super().generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls, + _ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls, + _egressDMAWaitStatements, egressDMAUpdates, variableUpdates) + + return executionBlock + + +class DoubleBufferingTilingMixIn(PrototypeTilingMixIn, TilingCodeGenMixin): + + @classmethod + def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, + ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, + egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, + 
variableUpdates: _CodeSegmentType) -> ExecutionBlock: + + # Structure: + + # Update input DMA Structs + # Update tile variables + # Wait for current input tiles + # Transfer in next input tiles (async) + # Update output DMA Structs + # Wait for current output tiles + + # Kernel execution + + # Transfer out tiles (async) + + for transaction in reversed(ingressDMAWaitStatements + ingressDMAUpdates + ingressDMATransferCalls + + variableUpdates + egressDMAWaitStatements + egressDMAUpdates): + executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) + + for transaction in egressDMATransferCalls: + executionBlock.addRight(transaction.template, transaction.operatorRepresentation) + + return executionBlock + + +class ProfilingDoubleBufferingTilingMixIn(DoubleBufferingTilingMixIn): + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: _CodeSegmentType, + teardownStatements: _CodeSegmentType) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + nodeOps = metaInfo.nodeOps + numTiles = metaInfo.numTiles + + executionBlock.addLeft(_measureCycles, { + "nodeName": nodeName, + "measurementName": "ingress_dma_wait_start", + "tileIdx": 0 + }) + + for measurementName in [ + "kernel_start", "kernel_end", "ingress_dma_wait_start", "ingress_dma_wait_end", "egress_dma_wait_start", + "egress_dma_wait_end" + ]: + executionBlock.addLeft(_measurementArrayDeclaration, { + "nodeName": nodeName, + "measurementName": measurementName, + "numTiles": numTiles + }) + + executionBlock.addLeft(_printPrefixAndSufixDeclaration, { + "nodeName": nodeName, + "nodeOps": nodeOps, + "buffering": "DB" + }) + + executionBlock.addRight(_measureCycles, { + "nodeName": nodeName, + "measurementName": "egress_dma_wait_start", + "tileIdx": numTiles - 1 + }) + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, + teardownStatements) + 
executionBlock.addRight(_measureCycles, { + "nodeName": nodeName, + "measurementName": "egress_dma_wait_end", + "tileIdx": numTiles - 1 + }) + + executionBlock.addRight(_printLoopSetup, {"numTiles": numTiles}) + + executionBlock.addRight( + _printCycleDifference, { + "nodeName": nodeName, + "flavorStr": "Input DMA took ", + "startMeasurementName": "ingress_dma_wait_start", + "endMeasurementName": "ingress_dma_wait_end", + "tileIdx": "printLoopIdx" + }) + executionBlock.addRight( + _printCycleDifference, { + "nodeName": nodeName, + "flavorStr": "Kernel took ", + "startMeasurementName": "kernel_start", + "endMeasurementName": "kernel_end", + "tileIdx": "printLoopIdx" + }) + executionBlock.addRight( + _printCycleDifference, { + "nodeName": nodeName, + "flavorStr": "Output DMA took ", + "startMeasurementName": "egress_dma_wait_start", + "endMeasurementName": "egress_dma_wait_end", + "tileIdx": "printLoopIdx" + }) + + executionBlock.addRight(_printLoopTeardown, {}) + + return executionBlock + + @classmethod + def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, + ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, + egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, + variableUpdates: _CodeSegmentType) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + numTiles = metaInfo.numTiles + tileIdxVar = metaInfo.tileIdxVar + + executionBlock.addLeft(_measureCycles, { + "nodeName": nodeName, + "measurementName": "kernel_start", + "tileIdx": tileIdxVar + }) + executionBlock.addRight(_measureCycles, { + "nodeName": nodeName, + "measurementName": "kernel_end", + "tileIdx": tileIdxVar + }) + + _ingressDMAWaitStatements = [] + _ingressDMAWaitStatements.append(CodeSnippet(_measureConditionSetup, {"cond": f"{tileIdxVar} > 0"})) + _ingressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": 
nodeName, + "measurementName": "ingress_dma_wait_start", + "tileIdx": tileIdxVar + })) + _ingressDMAWaitStatements.append(CodeSnippet(_measureConditionEnd, {})) + _ingressDMAWaitStatements += ingressDMAWaitStatements + _ingressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": nodeName, + "measurementName": "ingress_dma_wait_end", + "tileIdx": tileIdxVar + })) + + _egressDMAWaitStatements = [] + _egressDMAWaitStatements.append(CodeSnippet(_measureConditionSetup, {"cond": f"{tileIdxVar} > 0"})) + _egressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": nodeName, + "measurementName": "egress_dma_wait_start", + "tileIdx": f"{tileIdxVar} - 1" + })) + _egressDMAWaitStatements += egressDMAWaitStatements + _egressDMAWaitStatements.append( + CodeSnippet(_measureCycles, { + "nodeName": nodeName, + "measurementName": "egress_dma_wait_end", + "tileIdx": f"{tileIdxVar} - 1" + })) + _egressDMAWaitStatements.append(CodeSnippet(_measureConditionEnd, {})) + + executionBlock = super().generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls, + _ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls, + _egressDMAWaitStatements, egressDMAUpdates, variableUpdates) + + return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py new file mode 100644 index 0000000..fd910cc --- /dev/null +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py @@ -0,0 +1,281 @@ +# ---------------------------------------------------------------------- +# +# File: TilingVariableReplacement.py +# +# Last edited: 28.09.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import Dict, List, Tuple, Type + +from mako.parsetree import Expression, Node, Text + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureExecutionBlock +from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \ + IntrospectiveCodeTransformationMixIn +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeSnippet, CodeTransformationPass, ExecutionBlock, \ + NetworkContext, NodeTemplate, OperatorRepresentation, TransientBuffer, _NoVerbosity +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement + + +class TilingVariableReplacement(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): + + _prefix = "TILING_REPLACED_" + + def __init__(self, targetMemLevel: str): + self.targetMemLevel = targetMemLevel + self._name: str + + @property + def prefix(self): + return self._prefix + f"{self._name}_" + self.targetMemLevel + "_" + + def _dereferencePointer(self, nodes: List[Node], name: str) -> List[Node]: + instanceIdxs = [idx for idx, node in enumerate(nodes) if isinstance(node, Expression) and node.text 
== name] + + for offset, idx in enumerate(instanceIdxs): + text = Text("*", source = "*", lineno = 0, pos = 0, filename = None) + nodes.insert(offset + idx, text) + + return nodes + + def _replaceImmediate(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + variableReplacement: Tuple[str, + List], dataType: Type[Pointer]) -> Tuple[NetworkContext, Dict]: + + varName = variableReplacement[0] + varVal = variableReplacement[1] + + newConstName = self.prefix + varName + newRefName = self.prefix + "ref_" + varName + + cb = ctxt.ConstantBuffer(newConstName, shape = (len(varVal),), values = varVal) + ctxt.add(cb, "global") + + cb._type = dataType + cb._instance = dataType(newConstName, ctxt) + cb._memoryLevel = self.targetMemLevel + + reference = ctxt.hoistReference(newConstName, newRefName) + ctxt.lookup(reference)._memoryLevel = self.targetMemLevel + + operatorRepresentation[varName] = reference + + return ctxt, operatorRepresentation + + def _hoistTileReference(self, ctxt: NetworkContext, reference: str, name: str, offset: int) -> NetworkContext: + + refName = ctxt.hoistReference(reference, name) + refBuf = ctxt.lookup(refName) + + staticBuf = ctxt.lookup(f"MEMORYARENA_{self.targetMemLevel}") + + refBuf.allocTemplate = NodeTemplate(" \ + ${type.typeName} ${name} = (${type.typeName}) " + f"((char*){str(staticBuf._instance)} + {offset});") + refBuf._memoryLevel = self.targetMemLevel + + return ctxt + + def _replaceReferences(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + tilingSchedule: TilingSchedule, name: str) -> Tuple[NetworkContext, Dict]: + + def unravelOldRef(refName): + oldBuf = ctxt.lookup(refName) + if hasattr(oldBuf, "_referenceName"): + return unravelOldRef(oldBuf._referenceName) + return oldBuf.name + + newRefName = self.prefix + "ref_" + name + oldRefName = operatorRepresentation[name] + + if name in tilingSchedule.inputBaseOffsets: + offset = tilingSchedule.inputBaseOffsets[name] + elif name in 
tilingSchedule.outputBaseOffsets: + offset = tilingSchedule.outputBaseOffsets[name] + else: + raise RuntimeError(f"Name {name} not found in TilingSchedule {tilingSchedule}") + + unravelRef = unravelOldRef(oldRefName) + + ctxt = self._hoistTileReference(ctxt, unravelRef, newRefName, offset[0]) + operatorRepresentation[name] = newRefName + + return ctxt, operatorRepresentation + + def _replaceTransients(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + nodeMemoryConstraint: NodeMemoryConstraint, name: str) -> Tuple[NetworkContext, Dict]: + + memoryConstraints = nodeMemoryConstraint.tensorMemoryConstraints[operatorRepresentation[name]].memoryConstraints + assert len(memoryConstraints + ) == 1, f"Tiled transient buffer {operatorRepresentation[name]} has more than one memory level!" + key = list(memoryConstraints.keys())[0] + constraint = memoryConstraints[key] + assert constraint.addrSpace is not None, f"Address space of {constraint} cannot be None!" + offset = constraint.addrSpace[0] + + refBuf = ctxt.lookup(operatorRepresentation[name]) + + if refBuf._memoryLevel != self.targetMemLevel: + return ctxt, operatorRepresentation + + staticBuf = ctxt.lookup(f"MEMORYARENA_{self.targetMemLevel}") + + refBuf.allocTemplate = NodeTemplate(" \ + ${type.typeName} ${name} = (${type.typeName}) " + f"((char*){str(staticBuf._instance)} + {offset});") + refBuf.deallocTemplate = NodeTemplate("") + refBuf._memoryLevel = self.targetMemLevel + + return ctxt, operatorRepresentation + + def _replaceTiledExpressions(self, ctxt: NetworkContext, templateNode: CodeSnippet, + variableReplacement: VariableReplacementScheme, tilingSchedule: TilingSchedule, + nodeMemoryConstraint: NodeMemoryConstraint) -> NetworkContext: + + operatorRepresentation = templateNode.operatorRepresentation + template = templateNode.template + + immediateList = [(key, value) + for key, value in variableReplacement.perTileReplacements.items() + if type(operatorRepresentation[key]) != str] + + 
inoutSchedule = {**tilingSchedule.inputBaseOffsets, **tilingSchedule.outputBaseOffsets} + variableList = [key for key, value in inoutSchedule.items() if type(operatorRepresentation[key]) == str] + + transientBufferList = [] + for key, value in operatorRepresentation.items(): + if not isinstance(value, str): + continue + if (ctxt.is_local(value) and isinstance(ctxt.lookup(value), TransientBuffer)): + transientBufferList.append(key) + + parseTree = IntrospectiveCodeTransformationMixIn._generateParseTree(template) + newParseTree = copy.copy(parseTree) + nodes = parseTree.nodes + + newNodes = copy.copy(nodes) + + for rep in immediateList: + ctxt, operatorRepresentation = self._replaceImmediate(ctxt, operatorRepresentation, rep, + variableReplacement.replacementTypes[rep[0]]) + newNodes = self._dereferencePointer(newNodes, rep[0]) + + for rep in variableList: + ctxt, operatorRepresentation = self._replaceReferences(ctxt, operatorRepresentation, tilingSchedule, rep) + + for rep in transientBufferList: + ctxt, operatorRepresentation = self._replaceTransients(ctxt, operatorRepresentation, nodeMemoryConstraint, + rep) + + newParseTree.nodes = newNodes + IntrospectiveCodeTransformationMixIn._reconstructCode(template, newParseTree) + + return ctxt + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + + def unravelReference(ctxt: NetworkContext, name: str) -> str: + + if name not in ctxt.localObjects.keys() and name not in ctxt.globalObjects.keys(): + return name + + refBuffer = ctxt.lookup(name) + if not hasattr(refBuffer, "_referenceName"): + return name + + return unravelReference(ctxt, refBuffer._referenceName) + + self._name = name + + if isinstance(executionBlock, ClosureExecutionBlock): + baseExecutionBlock = executionBlock.baseBlock + else: + baseExecutionBlock = executionBlock + + patternMemoryConstraint = 
baseExecutionBlock.patternMemoryConstraint + + if patternMemoryConstraint is None: + return ctxt, executionBlock + + assert len(patternMemoryConstraint.nodeConstraints) == 1, "Only layerwise supported for now!" + #assert len(executionBlock.codeSnippets) == 1, "Only layerwise supported for now!" + + nodeMemoryConstraint = patternMemoryConstraint.nodeConstraints[0] + + possibleTemplateNodes = [ + node for node in baseExecutionBlock.codeSnippets if hasattr(node.template, 'tileConstraint') + ] + + assert len(possibleTemplateNodes) == 1, "More than one template node with TCF found" + + templateNode = possibleTemplateNodes[0] + operatorRepresentation = templateNode.operatorRepresentation + + unravelRep = operatorRepresentation.copy() + for key in unravelRep.keys(): + + val = unravelRep[key] + if not isinstance(val, str): + continue + + unravelRep[key] = unravelReference(ctxt, val) + + template = templateNode.template + + variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( + nodeMemoryConstraint, self.targetMemLevel, ctxt, unravelRep) + + minimalVariableReplacement, newNodeRep = minimizeVariableReplacement(variableReplacement, + templateNode.operatorRepresentation) + for key, value in newNodeRep.items(): + templateNode.operatorRepresentation[key] = value + + flatTilingSchedule = copy.copy(tilingSchedules[0]) + for tilingSchedule in tilingSchedules[1:]: + flatTilingSchedule += tilingSchedule + + ctxt = self._replaceTiledExpressions(ctxt, templateNode, minimalVariableReplacement, flatTilingSchedule, + nodeMemoryConstraint) + + for codeSnippet in executionBlock.codeSnippets: + + template, nRep = codeSnippet.template, codeSnippet.operatorRepresentation + + if not "closureStructArgs" in nRep: + continue + + keyList = {} + + for key in list(flatTilingSchedule.inputBaseOffsets.keys()) + list( + flatTilingSchedule.outputBaseOffsets.keys()): + keyList[unravelRep[key]] = operatorRepresentation[key] + + for key in 
copy.copy(nRep['closureStructArgs'].value).keys():
+                if nRep['closureStructArgs'].value[key].referenceName in keyList.keys():
+                    nRep['closureStructArgs'].value[key] = type(nRep['closureStructArgs'].value[key])(
+                        keyList[nRep['closureStructArgs'].value[key].referenceName], ctxt)
+
+        return ctxt, executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/__init__.py b/Deeploy/TilingExtension/CodeTransformationPasses/__init__.py
new file mode 100644
index 0000000..b50445f
--- /dev/null
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 10.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import *
diff --git a/Deeploy/TilingExtension/GenericFlow.py b/Deeploy/TilingExtension/GenericFlow.py
new file mode 100644
index 0000000..626bef0
--- /dev/null
+++ b/Deeploy/TilingExtension/GenericFlow.py
@@ -0,0 +1,105 @@
+# ----------------------------------------------------------------------
+#
+# File: GenericFlow.py
+#
+# Last edited: 28.07.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Generic, Iterable, List, Optional, Set, TypeVar + +flowType = TypeVar("flowType") +iteratorType = TypeVar("iteratorType") + + +@dataclass +class GenericFlowState(Generic[flowType]): + liveSet: Set[flowType] + killSet: Set[flowType] + genSet: Set[flowType] + + def __repr__(self) -> str: + retStr = "" + retStr += "\nliveSet:\n" + retStr += str(self.liveSet) + retStr += "\nkillSet:\n" + retStr += str(self.killSet) + retStr += "\ngenSet:\n" + retStr += str(self.genSet) + return retStr + + +# SCHEREMO: Checkout data flow analysis (https://en.wikipedia.org/wiki/Data-flow_analysis) +class GenericFlow(Generic[flowType, iteratorType]): + + def flowStep(self, liveSet: Set[flowType], killSet: Set[flowType], genSet: Set[flowType]) -> Set[flowType]: + + # SCHEREMO: Assert general flow invariants + assert (genSet & killSet) == set( + ), f"ERROR: Spawning and killing {flowType} instance in same step: \ngenSet = {genSet}\n killSet = {killSet}" + assert (genSet & liveSet) == set( + ), f"ERROR: Spawning an already live {flowType} instance : \ngenSet = {genSet}\n liveSet = {liveSet}" + assert (killSet - liveSet) == set( + ), f"ERROR: Killing a non-live {flowType} instance, \nkillSet = {killSet}, \nliveSet = 
{liveSet}" + + liveSet = (liveSet | genSet) - killSet + + return liveSet + + def flow(self, + iterator: Iterable[iteratorType], + initialLiveSet: Optional[Set[flowType]] = None) -> List[GenericFlowState[flowType]]: + + flowStates: List[GenericFlowState[flowType]] = [] + + liveSet: Set[flowType] = set() + if initialLiveSet is not None: + liveSet = initialLiveSet + + killSet: Set[flowType] + genSet: Set[flowType] + + for step in iterator: + + self.preComputeStep(step) + + genSet = self.computeGenSet(step) + killSet = self.computeKillSet(step) + + flowStates.append(GenericFlowState[flowType](liveSet, killSet, genSet)) + + liveSet = self.flowStep(liveSet, killSet, genSet) + + flowStates.append(GenericFlowState[flowType](liveSet, set(), set())) + + return flowStates + + def preComputeStep(self, step: iteratorType) -> None: + pass + + @abstractmethod + def computeGenSet(self, step: iteratorType) -> Set[flowType]: + pass + + @abstractmethod + def computeKillSet(self, step: iteratorType) -> Set[flowType]: + pass diff --git a/Deeploy/TilingExtension/MemoryConstraintFlows.py b/Deeploy/TilingExtension/MemoryConstraintFlows.py new file mode 100644 index 0000000..4696498 --- /dev/null +++ b/Deeploy/TilingExtension/MemoryConstraintFlows.py @@ -0,0 +1,263 @@ +# ---------------------------------------------------------------------- +# +# File: MemoryConstraintFlows.py +# +# Last edited: 01.08.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from collections import namedtuple +from typing import Iterable, List, Optional, Set, Tuple + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import TargetMemoryLevelMapping +from Deeploy.TilingExtension.GenericFlow import GenericFlow, GenericFlowState +from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, TensorMemoryConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel + +TensorMemLevelTuple = namedtuple("TensorMemLevelTuple", "tensorName targetMemoryLevel") + + +class PatternMemoryConstraintFlow(GenericFlow[TensorMemLevelTuple, gs.Node]): + + def __init__(self, ctxt: NetworkContext, pattern: List[gs.Node], + targetMemoryLevelMapping: TargetMemoryLevelMapping): + super().__init__() + self.ctxt = ctxt + self.pattern = pattern + self.patternNodeNames: Set[str] = {node.name for node in pattern} + self.targetMemoryLevelMapping = targetMemoryLevelMapping + + def flow(self, + iterator: Iterable[gs.Node], + initialLiveSet: Optional[Set[TensorMemLevelTuple]] = None) -> List[GenericFlowState[TensorMemLevelTuple]]: + + if initialLiveSet is None: + _initialLiveSet: Set[TensorMemLevelTuple] = self._getIOConstraints()[0] + else: + _initialLiveSet = initialLiveSet | self._getIOConstraints()[0] + + flowStates = super().flow(iterator, _initialLiveSet) + return flowStates + + def _getInputTensorTuples(self, inputTensorNames: List[str]) -> Set[TensorMemLevelTuple]: + + 
inputTensorTuples: Set[TensorMemLevelTuple] = set() + + for tensorName in inputTensorNames: + tensorUserSet = set(copy.deepcopy(self.ctxt.lookup(tensorName)._users)) + patternTensorUserSet = [node for node in self.pattern if node.name in tensorUserSet] + + for requiredUser in patternTensorUserSet: + targetMemoryLevel = self.targetMemoryLevelMapping.lookup(requiredUser.name, tensorName) + inputTensorTuples |= {TensorMemLevelTuple(tensorName, targetMemoryLevel)} + + return inputTensorTuples + + def _getOutputTensorTuples(self, outputTensorNames: List[str]) -> Set[TensorMemLevelTuple]: + + outputTensorTuples: Set[TensorMemLevelTuple] = set() + + for node in self.pattern: + outputNames = [tensor.name for tensor in node.outputs] + + for tensorName in outputNames: + if tensorName in copy.deepcopy(outputTensorNames): + targetMemoryLevel = self.targetMemoryLevelMapping.lookup(node.name, tensorName) + outputTensorTuples |= {TensorMemLevelTuple(tensorName, targetMemoryLevel)} + + return outputTensorTuples + + def _getIONames(self) -> Tuple[List[str], List[str]]: + + def _containsAll(listA: List, listB: List) -> bool: + if not len(listB) < len(listA): + return False + + return all([entry in listA for entry in listB]) + + producedTensors = [] + inputTensors = [] + outputTensors = [] + + patternNodeNames = [node.name for node in self.pattern] + + for node in self.pattern: + inTensorNames = [node.name for node in node.inputs] + outTensorNames = [node.name for node in node.outputs] + + for tensor in inTensorNames: + if tensor not in producedTensors: + inputTensors.append(tensor) + + for tensor in outTensorNames: + producedTensors.append(tensor) + if not _containsAll(patternNodeNames, self.ctxt.lookup(tensor)._users): + outputTensors.append(tensor) + + return inputTensors, outputTensors + + def _getIOConstraints(self) -> Tuple[Set[TensorMemLevelTuple], Set[TensorMemLevelTuple]]: + + inputTensorNames, outputTensorNames = self._getIONames() + + patternInputConstraints = 
self._getInputTensorTuples(inputTensorNames) + patternOutputConstraints = self._getOutputTensorTuples(outputTensorNames) + + return patternInputConstraints, patternOutputConstraints + + def computeGenSet(self, step: gs.Node) -> Set[TensorMemLevelTuple]: + + returnSet: Set[TensorMemLevelTuple] = set() + for tensor in step.outputs: + targetMemoryLevel = self.targetMemoryLevelMapping.lookup(step.name, tensor.name) + returnSet.add(TensorMemLevelTuple(tensor.name, targetMemoryLevel)) + + return returnSet + + def computeKillSet(self, step: gs.Node) -> Set[TensorMemLevelTuple]: + + returnSet: Set[TensorMemLevelTuple] = set() + killTensorNames: List[str] = [] + + _, outputTensorNames = self._getIONames() + + intermediateTensorNames = [tensor.name for tensor in step.inputs if tensor.name not in outputTensorNames] + for tensorName in intermediateTensorNames: + patternUsers = [node for node in self.ctxt.lookup(tensorName)._users if node in self.patternNodeNames] + assert patternUsers != [], f"Tensor {tensorName} has no users in this pattern and is not an output!" 
+ if step.name == patternUsers[-1]: + killTensorNames.append(tensorName) + + for tensorName in killTensorNames: + targetMemoryLevel = self.targetMemoryLevelMapping.lookup(step.name, tensorName) + returnSet.add(TensorMemLevelTuple(tensorName, targetMemoryLevel)) + + return returnSet + + +class GraphMemoryConstraintFlow(GenericFlow[TensorMemLevelTuple, List[gs.Node]]): + + def __init__(self, ctxt: NetworkContext, targetMemoryLevelMapping: TargetMemoryLevelMapping): + self.ctxt = ctxt + self._patternFlowStates: List[List[GenericFlowState[TensorMemLevelTuple]]] = [] + self.targetMemoryLevelMapping = targetMemoryLevelMapping + + @property + def patternFlowState(self): + if not len(self._patternFlowStates) > 0: + return None + + return self._patternFlowStates[-1] + + def preComputeStep(self, step: List[gs.Node]) -> None: + + constraintFlow = PatternMemoryConstraintFlow(self.ctxt, step, self.targetMemoryLevelMapping) + if self.patternFlowState is not None: + flowStates = constraintFlow.flow(step, self.patternFlowState[-1].liveSet) + else: + flowStates = constraintFlow.flow(step, None) + + self._patternFlowStates.append(flowStates) + + def computeGenSet(self, step: List[gs.Node]) -> Set[TensorMemLevelTuple]: + + genSet = set() + outputConstraints = self.patternFlowState[-1].liveSet - self.patternFlowState[0].liveSet + for constraint in outputConstraints: + genSet.add(TensorMemLevelTuple(constraint.tensorName, self.ctxt.lookup(constraint.tensorName)._memoryLevel)) + + return genSet + + def computeKillSet(self, step: List[gs.Node]) -> Set[TensorMemLevelTuple]: + + killSet = set() + + # SCHEREMO: pretty straightforward, just use the current patternFlow liveSet + liveSet = [ + tensorTuple for tensorTuple in self.patternFlowState[0].liveSet + if not isinstance(self.ctxt.lookup(tensorTuple.tensorName), ConstantBuffer) + ] + + inputConstraints = liveSet + + patternNodeNames = [node.name for node in step] + + for constraint in inputConstraints: + refBuffer = 
self.ctxt.lookup(constraint.tensorName) + + userList = refBuffer._users + + if len(userList) == 0: + continue + + if userList[-1] in patternNodeNames and not isinstance(refBuffer, ConstantBuffer): + + killConstraint = TensorMemLevelTuple(constraint.tensorName, + self.ctxt.lookup(constraint.tensorName)._memoryLevel) + killSet.add(killConstraint) + + return killSet + + +def convertFlowState2NodeMemoryConstraint(tilerModel: TilerModel, + ctxt: NetworkContext, + flowState: GenericFlowState[TensorMemLevelTuple], + useMax: bool = False) -> NodeMemoryConstraint: + + nodeMemoryConstraint = NodeMemoryConstraint() + + memoryOccupyingSet = flowState.liveSet | flowState.genSet + _inputs = [item.tensorName for item in flowState.liveSet] + _outputs = [item.tensorName for item in flowState.genSet] + + for tensorName, memoryLevel in memoryOccupyingSet: + + if tilerModel.existsCopyIdx(tensorName): + tilerModel.addTensorNumOfEltToModel(ctxt, tensorName) + memorySize = tilerModel.getTensorNumberOfEltVar(tensorName) + + if useMax: + _memorySize = memorySize.Max() + else: + _memorySize = memorySize + + else: + # SCHEREMO: This means the tensor is passed through, we don't tile it + _memorySize = int(np.prod(ctxt.lookup(tensorName).shape)) + + elementMemorySize = _memorySize + memLevelConstraint = MemoryConstraint(memoryLevel, elementMemorySize) + tensorConstraint = TensorMemoryConstraint(tensorName, {memoryLevel: memLevelConstraint}, ctxt) + + if tensorName in _inputs: + ioDir = "input" + elif tensorName in _outputs: + ioDir = "output" + else: + ioDir = "intermediate" + + nodeMemoryConstraint.addTensorConstraint(tensorConstraint, ioDir) + + return nodeMemoryConstraint diff --git a/Deeploy/TilingExtension/MemoryConstraints.py b/Deeploy/TilingExtension/MemoryConstraints.py new file mode 100644 index 0000000..0c12368 --- /dev/null +++ b/Deeploy/TilingExtension/MemoryConstraints.py @@ -0,0 +1,221 @@ +# ---------------------------------------------------------------------- +# +# File: 
MemoryConstraints.py +# +# Last edited: 27.07.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import copy +from collections import OrderedDict +from typing import Dict, List, Literal, Optional, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext + + +class MemoryConstraint(): + __slots__ = ["memoryLevel", "size", "multiBufferCoefficient", "shape", "addrSpace"] + + def __init__(self, memoryLevel: str, size: Union[IntVar, int]): + self.memoryLevel: str = memoryLevel + self.size: Union[int, IntVar] = size + self.multiBufferCoefficient: Union[int, IntVar] = 1 + + self.shape: Optional[Tuple[int]] = None + self.addrSpace: Optional[Tuple[int, int]] = None + + def __repr__(self) -> str: + retStr = f"MemoryLevel: {self.memoryLevel}, Size: {self.size}, MultiBuffer: {self.multiBufferCoefficient}" + + if self.shape is not None: + retStr += f" Shape: {self.shape}" + + if self.addrSpace is not None: + retStr += f" Address Space: {self.addrSpace}" + + return retStr + + def __deepcopy__(self, memo = {}): + new = self.__class__(self.memoryLevel, self.size) + new.multiBufferCoefficient = self.multiBufferCoefficient + new.shape = self.shape + new.addrSpace = 
self.addrSpace + memo[id(self)] = new + return new + + +class TensorMemoryConstraint(): + __slots__ = ["tensorName", "memoryConstraints"] + + def __init__(self, tensorName: str, constraints: Dict[str, MemoryConstraint], ctxt: NetworkContext): + # SCHEREMO: Asserts the tensor is registered in the context + _ = ctxt.lookup(tensorName) + self.tensorName: str = tensorName + self.memoryConstraints: OrderedDict[str, MemoryConstraint] = copy.deepcopy( + constraints) # Lists are mutable, so copy for persistence + + def _amendMemoryConstraints(self, memoryConstraints: Dict[str, MemoryConstraint]): + + _cleanConstraints = [] + for key, new in memoryConstraints.items(): + + if not key in self.memoryConstraints.keys(): + _cleanConstraints.append(new) + continue + + old = self.memoryConstraints[key] + + if old.memoryLevel == new.memoryLevel: + assert old.size == new.size, "Tried to override existing constraints!" + continue + + _cleanConstraints.append(new) + + for constraint in _cleanConstraints: + self.addMemoryConstraint(constraint) + + def addMemoryConstraint(self, memoryConstraint: MemoryConstraint): + name = memoryConstraint.memoryLevel + self.memoryConstraints[name] = memoryConstraint + + def __repr__(self) -> str: + retStr = f"{self.tensorName}: " + retStr += "{\n" + for i in self.memoryConstraints.values(): + line = str(i) + retLines = line.split("\n") + retLine = "" + for line in retLines: + retLine += ("\t" + line + "\n") + retStr += retLine + retStr += "}" + return retStr + + +class NodeMemoryConstraint(): + __slots__ = ["inputTensorMemoryConstraints", "intermediateTensorMemoryConstraints", "outputTensorMemoryConstraints"] + + def __init__(self): + self.inputTensorMemoryConstraints: Dict[str, TensorMemoryConstraint] = {} + self.intermediateTensorMemoryConstraints: Dict[str, TensorMemoryConstraint] = {} + self.outputTensorMemoryConstraints: Dict[str, TensorMemoryConstraint] = {} + + @property + def tensorMemoryConstraints(self): + return { + 
**self.inputTensorMemoryConstraints, + **self.intermediateTensorMemoryConstraints, + **self.outputTensorMemoryConstraints + } + + def _amendTensorConstraint(self, tensorMemoryConstraint: TensorMemoryConstraint): + name = tensorMemoryConstraint.tensorName + if name in self.tensorMemoryConstraints.keys(): + self.tensorMemoryConstraints[name]._amendMemoryConstraints(tensorMemoryConstraint.memoryConstraints) + + def getIO(self, tensorName: str) -> Optional[Literal["input", "intermediate", "output"]]: + if tensorName in self.inputTensorMemoryConstraints.keys(): + return "input" + elif tensorName in self.outputTensorMemoryConstraints.keys(): + return "output" + elif tensorName in self.intermediateTensorMemoryConstraints.keys(): + return "intermediate" + else: + return None + + def addTensorConstraint(self, tensorMemoryConstraint: TensorMemoryConstraint, io: Literal["input", "output", + "intermediate"]): + name = tensorMemoryConstraint.tensorName + if name in self.tensorMemoryConstraints.keys(): + self._amendTensorConstraint(tensorMemoryConstraint) + return + + if io == "input": + _dict = self.inputTensorMemoryConstraints + elif io == "output": + _dict = self.outputTensorMemoryConstraints + else: + _dict = self.intermediateTensorMemoryConstraints + + _dict[name] = tensorMemoryConstraint + + def __add__(self, other): + assert isinstance(other, NodeMemoryConstraint), f"Can't add {other} to {self}, expected NodeMemoryConstraint!" 
+ + new = NodeMemoryConstraint() + new.inputTensorMemoryConstraints = copy.deepcopy(self.inputTensorMemoryConstraints) + new.intermediateTensorMemoryConstraints = copy.deepcopy(self.intermediateTensorMemoryConstraints) + new.outputTensorMemoryConstraints = copy.deepcopy(self.outputTensorMemoryConstraints) + + for key, constraint in other.tensorMemoryConstraints.items(): + ioDir = other.getIO(key) + new.addTensorConstraint(constraint, ioDir) + + return new + + def __repr__(self) -> str: + retStr = "" + retStr += "{\n" + for i in self.tensorMemoryConstraints.values(): + line = str(i) + retLines = line.split("\n") + retLine = "" + for line in retLines: + retLine += ("\t" + line + "\n") + retStr += retLine + retStr += "}" + return retStr + + +class PatternMemoryConstraints(): + __slots__ = ["nodeConstraints"] + + def __init__(self): + self.nodeConstraints: List[NodeMemoryConstraint] = [] + + def addConstraint(self, nodeConstraint: NodeMemoryConstraint): + self.nodeConstraints.append(nodeConstraint) + + def __add__(self, other): + + assert isinstance(other, + PatternMemoryConstraints), f"Can't add {other} to {self}, expected PatternMemoryConstraints!" 
+ + newConst = PatternMemoryConstraints() + for old, new in zip(self.nodeConstraints, other.nodeConstraints): + newConst.addConstraint(old + new) + return newConst + + def __repr__(self) -> str: + retStr = "" + retStr += "{\n" + for i in self.nodeConstraints: + line = str(i) + retLines = line.split("\n") + retLine = "" + for line in retLines: + retLine += ("\t" + line + "\n") + retStr += retLine + retStr += "}" + return retStr diff --git a/Deeploy/TilingExtension/MemoryScheduler.py b/Deeploy/TilingExtension/MemoryScheduler.py new file mode 100644 index 0000000..d8fcf69 --- /dev/null +++ b/Deeploy/TilingExtension/MemoryScheduler.py @@ -0,0 +1,640 @@ +# ---------------------------------------------------------------------- +# +# File: MemoryScheduler.py +# +# Last edited: 06.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class MemoryBlock:
    """A named buffer bound to a memory level, with an inclusive lifetime
    (in schedule steps) and an optional half-open address range."""

    name: str
    level: str
    _lifetime: Tuple[int, int]
    _addrSpace: Optional[Tuple[int, int]] = None

    @property
    def addrSpace(self) -> Optional[Tuple[int, int]]:
        return self._addrSpace

    @addrSpace.setter
    def addrSpace(self, addrSpace: Optional[Tuple[int, int]]):
        # None means "not yet placed"; otherwise the range must be ascending.
        if addrSpace is None:
            self._addrSpace = None
        else:
            assert addrSpace[0] < addrSpace[1], "Address space range needs to be ordered from lesser to greater!"
            self._addrSpace = addrSpace

    @property
    def lifetime(self) -> Tuple[int, int]:
        return self._lifetime

    @lifetime.setter
    def lifetime(self, lifetime: Tuple[int, int]):
        assert lifetime[0] <= lifetime[1], "Lifetime range needs to be ordered from lesser to greater!"
        self._lifetime = lifetime

    def __init__(self, name: str, level: str, lifetime: Tuple[int, int], addrSpace: Optional[Tuple[int, int]]):
        self.name = name
        self.level = level
        self.lifetime = lifetime

        # Route through the setter only for concrete ranges; the class-level
        # default keeps _addrSpace = None otherwise.
        if addrSpace is not None:
            self.addrSpace = addrSpace

    def collides(self, other: MemoryBlock) -> bool:
        """True iff both blocks are placed and overlap in time AND in address space."""

        assert (isinstance(other, MemoryBlock)), f"{other} is not a MemoryBlock!"

        if self.addrSpace is None or other.addrSpace is None:
            return False

        timeOverlap = self.lifetime[0] <= other.lifetime[1] and self.lifetime[1] >= other.lifetime[0]
        addrOverlap = self.addrSpace[0] < other.addrSpace[1] and self.addrSpace[1] > other.addrSpace[0]

        return timeOverlap and addrOverlap
class MemoryScheduler():
    """Schedules memory blocks of one memory level through a constraint model.

    Buffer lifetimes are turned into an interference graph; the allocation
    order is either permuted exactly (solver-side boolean permutation
    matrices) or heuristically (seeded random shuffle), and solver
    constraints are generated that bound the peak memory cost per pattern.
    """

    # Name stems for the solver variables this scheduler creates.
    _ROWSUMNAME = "rowSum"
    _COLSUMNAME = "colSum"
    _PERMUTATIONIDXNAME = "permutationIdx"
    _INTERMEDIATEADJPRODUCTNAME = "intermediateAdjProduct"
    _FINALADJPRODUCTNAME = "AdjProduct"
    _COSTVARIABLENAME = "H"
    _COSTPRODUCTNAME = "costProduct"

    # Tile sizes are rounded up to multiples of this many bytes.
    byteAlignment = 4

    @staticmethod
    def overlap(lifetimeA: Tuple[int, int], lifetimeB: Tuple[int, int]) -> bool:
        """Return True iff the two inclusive lifetime intervals intersect."""
        overlap: bool = False
        overlap |= (lifetimeA[0] >= lifetimeB[0] and lifetimeA[0] <= lifetimeB[1])
        overlap |= (lifetimeB[0] >= lifetimeA[0] and lifetimeB[0] <= lifetimeA[1])

        return overlap

    def __init__(self, stringSuffix: str, tileScheduler: bool, seed: int = 19960801):
        # Base suffix identifying this scheduler's variables; the memory-level
        # specific suffix is appended in scheduleMemoryConstraints.
        self._stringSuffix = stringSuffix
        self.stringSuffix = ""
        # True for last-level (tile) schedulers, False for home-level ones.
        self.tileScheduler = tileScheduler

        # Fixed seed so heuristic permutations are deterministic across runs.
        self.seed = seed
        self.memoryMap: Dict[str, List[List[MemoryBlock]]] = {}

        # Per "<level>_<patternIdx>": either a matrix of solver variables or a
        # concrete numpy permutation matrix.
        self._permutationState: Dict[str, Union[List[List[Union[IntVar]]], np.ndarray]] = {}

    def _addPermutationMatrix(self, tilerModel: TilerModel, numVars: int,
                              patternIdx: int) -> List[List[Union[IntVar, int]]]:
        """Add a numVars x numVars boolean permutation matrix to the model
        (every row and every column sums to exactly 1)."""

        permMat: List[List[Union[IntVar, int]]] = []

        for i in range(numVars):
            rowSumName = f"{self._ROWSUMNAME}_{i}" + self.stringSuffix
            jSum = tilerModel.addVariable(rowSumName, 0, 1, patternIdx)
            permMat.append([])
            for j in range(numVars):
                name = f"{self._PERMUTATIONIDXNAME}_{i}_{j}" + self.stringSuffix
                jVar = tilerModel.addVariable(name, 0, 1, patternIdx)
                permMat[i].append(jVar)
            tilerModel.addConstraint(tilerModel._model.SumEquality(permMat[i], jSum))
            tilerModel.addConstraint(jSum == 1)

        for i in range(numVars):
            colSumName = f"{self._COLSUMNAME}_{i}" + self.stringSuffix
            jSum = tilerModel.addVariable(colSumName, 0, 1, patternIdx)
            constraintVec = []
            for j in range(numVars):
                name = f"{self._PERMUTATIONIDXNAME}_{j}_{i}" + self.stringSuffix
                jVar = tilerModel.getVariable(name, patternIdx)
                constraintVec.append(jVar)
            tilerModel.addConstraint(tilerModel._model.SumEquality(constraintVec, jSum))
            tilerModel.addConstraint(jSum == 1)

        return permMat

    def _permuteMatrices(self, tilerModel: TilerModel, permutationMatrix: List[List[Union[IntVar, int]]],
                         adjacencyMatrix: List[List[int]], costVector: List[Union[int, IntVar]], patternIdx: int):
        """Express P*A*P^T and P*cost symbolically, entry by entry, as solver
        variables tied to the permutation matrix P."""

        def boolMatMulSingle(A, B, row, col, transposeB = False):
            # Single entry of a boolean matrix product A @ B (or A @ B^T).
            constr = 0
            numVars = len(A)

            for j in range(numVars):
                if not transposeB:
                    constr += A[row][j] * B[j][col]
                else:
                    constr += A[row][j] * B[col][j]

            return constr

        def boolMatVecMulSingle(A, B, row):
            # Single entry of a matrix-vector product A @ B.
            constr = 0
            numVars = len(B)

            for j in range(numVars):
                constr += A[row][j] * B[j]

            return constr

        permAdj_intermediate: List[List[Union[IntVar, int]]] = []
        permAdj: List[List[Union[IntVar, int]]] = []
        permCost: List[Union[IntVar, int]] = []

        numVars = len(costVector)

        # Intermediate product P @ A.
        for i in range(numVars):
            permAdj_intermediate.append([])
            for j in range(numVars):
                name = f"{self._INTERMEDIATEADJPRODUCTNAME}_{i}_{j}" + self.stringSuffix
                jVar = tilerModel.addVariable(name, 0, 1, patternIdx)
                constr = boolMatMulSingle(permutationMatrix, adjacencyMatrix, i, j, False)
                tilerModel.addConstraint(jVar == constr)
                permAdj_intermediate[i].append(jVar)

        # Final product (P @ A) @ P^T.
        for i in range(numVars):
            permAdj.append([])
            for j in range(numVars):
                name = f"{self._FINALADJPRODUCTNAME}_{i}_{j}" + self.stringSuffix
                jVar = tilerModel.addVariable(name, 0, 1, patternIdx)
                constr = boolMatMulSingle(permAdj_intermediate, permutationMatrix, i, j, True)
                tilerModel.addConstraint(jVar == constr)
                permAdj[i].append(jVar)

        # Upper bound for the permuted cost entries.
        costMax = 0
        for cost in costVector:
            if isinstance(cost, int):
                newCost = cost
            else:
                newCost = cost.Max()
            costMax = max(costMax, newCost)

        # Permuted cost vector P @ cost.
        for j in range(numVars):
            name = f"{self._COSTPRODUCTNAME}_{j}" + self.stringSuffix
            jVar = tilerModel.addVariable(name, 0, costMax, patternIdx)
            constr = boolMatVecMulSingle(permutationMatrix, costVector, j)
            tilerModel.addConstraint(jVar == constr)
            permCost.append(jVar)

        return permAdj, permCost

    def _generateCost(self, tilerModel: TilerModel, adjMatrix: List[List[Union[int, IntVar]]],
                      costVector: List[Union[int, IntVar]], patternIdx: int):
        """Build the running-height variables H_i and the pattern's total cost
        variable; returns the cost variable."""

        def maxVal(val) -> int:
            # Upper bound of either a constant or a solver variable.
            if isinstance(val, int):
                return val
            else:
                return val.Max()

        hVector = []
        numVars = len(costVector)

        name = f"{self._COSTVARIABLENAME}_0" + self.stringSuffix

        hVar = tilerModel.addVariable(name, 0, maxVal(costVector[0]), patternIdx)
        constr = hVar == costVector[0]
        tilerModel.addConstraint(constr)
        hVector.append(hVar)

        for i in range(1, numVars):
            name = f"{self._COSTVARIABLENAME}_{i}" + self.stringSuffix
            # SCHEREMO: Check for overlap here!
            hVar = tilerModel.addVariable(name, 0, maxVal(hVector[i - 1]) + maxVal(costVector[i]), patternIdx)
            prod = []
            for j in range(i):
                name = f"{self._COSTVARIABLENAME}_{i}_maxEntry_{j}" + self.stringSuffix
                pVar = tilerModel.addVariable(name, 0, maxVal(hVector[j]) + maxVal(costVector[i]), patternIdx)
                # H_i is the max over all adjacent predecessors' heights plus
                # this block's own cost.
                constr = (pVar == (adjMatrix[i][j] * hVector[j] + costVector[i]))
                tilerModel.addConstraint(constr)
                prod.append(pVar)
            tilerModel.addConstraint(tilerModel._model.MaxEquality(prod, hVar))
            hVector.append(hVar)

        name = "cost" + self.stringSuffix
        costMax = max([maxVal(entry) for entry in hVector])
        cost = tilerModel.addVariable(name, 0, costMax, patternIdx)
        tilerModel.addConstraint(tilerModel._model.MaxEquality(hVector, cost))

        return cost

    def _buildInterferenceGraph(self, lifetimeMap):
        """Map every tensor to the list of tensors whose lifetimes overlap its own."""

        interferenceGraph: Dict[str, List[str]] = {}
        for name, lifetime in lifetimeMap.items():
            neighbors: List[str] = []
            for neighborName, neighborLifetime in lifetimeMap.items():
                if neighborName == name:
                    continue

                if self.overlap(lifetime, neighborLifetime):
                    neighbors.append(neighborName)

            interferenceGraph[name] = neighbors

        return interferenceGraph

    def _calculateLifetimes(self, ctxt: NetworkContext, patternMemoryConstraint: PatternMemoryConstraints,
                            memoryLevel: str):
        """Compute, per tensor relevant to this scheduler and memory level, the
        (firstStep, lastStep) lifetime and the matching constraint object."""

        def filterTensorMemoryConstraint(ctxt: NetworkContext, tensorMemoryConstraint: TensorMemoryConstraint) -> bool:

            if ctxt.lookup(tensorMemoryConstraint.tensorName)._deploy == False:
                return False

            # Bugfix: resolve the tensor's home level from the constraint's own
            # tensor name instead of relying on the enclosing loop variable
            # `tensorName` leaking in through the closure (which would raise a
            # NameError if this helper were ever called outside that loop).
            # Also hoists the loop-invariant lookup out of the loop below.
            homeLevel = ctxt.lookup(tensorMemoryConstraint.tensorName)._memoryLevel

            for level in tensorMemoryConstraint.memoryConstraints.values():

                if not level.memoryLevel == memoryLevel:
                    continue

                # SCHEREMO: Transient buffers are only considered by last-level schedulers
                if isinstance(ctxt.lookup(tensorMemoryConstraint.tensorName), TransientBuffer) and self.tileScheduler:
                    return True

                elif isinstance(ctxt.lookup(tensorMemoryConstraint.tensorName), TransientBuffer):
                    return False

                # SCHEREMO: The original level is only considered by "home-level" schedulers
                if level.memoryLevel == homeLevel and not self.tileScheduler:

                    # SCHEREMO: ConstantBuffers are assigned and allocated at compile time, Global Var Buffers are assigned at init time
                    if isinstance(ctxt.lookup(tensorMemoryConstraint.tensorName), ConstantBuffer) or ctxt.is_global(
                            tensorMemoryConstraint.tensorName):
                        return False
                    return True

                if level.memoryLevel != homeLevel and self.tileScheduler:
                    return True

            return False

        tensorMap = OrderedDict()
        tensorLifetimeMap: Dict[str, Tuple[int, int]] = dict()

        for stepIdx, nodeConstraint in enumerate(patternMemoryConstraint.nodeConstraints):
            for tensorName, tensorMemoryConstraint in nodeConstraint.tensorMemoryConstraints.items():

                if not filterTensorMemoryConstraint(ctxt, tensorMemoryConstraint):
                    continue

                # Extend the lifetime to the latest step the tensor is seen in.
                if tensorName in tensorLifetimeMap.keys():
                    prevLifetime = tensorLifetimeMap[tensorName]
                    tensorLifetimeMap[tensorName] = tuple((prevLifetime[0], stepIdx))
                else:
                    tensorLifetimeMap[tensorName] = tuple((stepIdx, stepIdx))
                tensorMap[tensorName] = tensorMemoryConstraint

        return tensorLifetimeMap, tensorMap

    def _buildAdjacencyMatrix(self, graph, tensorMap):
        """Turn the interference graph into a 0/1 numpy adjacency matrix,
        indexed by tensorMap key order."""
        numVars = len(graph)

        adjacencyMatrix = np.zeros((numVars, numVars), dtype = int)

        for node, neighbors in graph.items():
            nodeIdx = list(tensorMap.keys()).index(node)
            for neighbor in neighbors:
                adjacencyIdx = list(tensorMap.keys()).index(neighbor)
                adjacencyMatrix[nodeIdx, adjacencyIdx] = 1

        return adjacencyMatrix

    def _buildCostVector(self, ctxt, graph, tensorMap, memoryLevel):
        """Compute each tensor's byte cost (word-aligned, times its
        multi-buffering factor) at the given memory level."""
        costVector: List[Union[int, IntVar]] = []
        numVars = len(graph)

        if numVars == 0:
            costVector.append(0)
            return costVector

        for node, neighbors in graph.items():

            constraints = tensorMap[node].memoryConstraints
            cost = 0

            for c in constraints.values():
                if c.memoryLevel == memoryLevel:

                    if not isinstance(ctxt.lookup(node), TransientBuffer):
                        typeWidth = max(1, ctxt.lookup(node)._type.referencedType.typeWidth // 8)
                    else:
                        typeWidth = 1

                    # SCHEREMO: Make sure each tile is word-aligned for better access performance
                    # and to comply with implicit PULP L3 tiling bugs
                    wordCost = (((c.size * typeWidth) + type(self).byteAlignment - 1) //
                                type(self).byteAlignment) * type(self).byteAlignment
                    cost = wordCost * c.multiBufferCoefficient

            # SCHEREMO: In-place operator outputs are "costless" whenever their input is in the same pattern
            if hasattr(ctxt.lookup(node), "_alias") and ctxt.lookup(node)._alias in neighbors:
                cost = 0

            costVector.append(cost)

        return costVector

    def heuristicPermutation(self, adjacencyMatrix, costVector) -> List[int]:
        """Return a deterministic (seeded) random permutation of block indices."""
        permutationList = list(range(len(costVector)))
        random.seed(self.seed)
        random.shuffle(permutationList)

        return permutationList

    def _stablePermutation(self, adjacencyMatrix, costVector, permutationList):
        """Apply a concrete permutation to the adjacency matrix and cost
        vector; returns (P A P^T, P cost, P)."""

        if len(costVector) == 1:
            return adjacencyMatrix, costVector, np.ones_like(adjacencyMatrix)

        permutationMatrix = np.zeros_like(adjacencyMatrix)
        newCostVector = []

        for i in permutationList:
            newCostVector.append(costVector[i])

        for idx, i in enumerate(permutationList):
            permutationMatrix[idx, i] = 1

        newAdjacencyMatrix = permutationMatrix @ adjacencyMatrix @ np.transpose(permutationMatrix)

        return newAdjacencyMatrix, newCostVector, permutationMatrix

    # SCHEREMO: Set the end of the lifetime of in-place operator inputs to the lifetime of their outputs
    def _dealiasLifetimeMap(self, ctxt: NetworkContext,
                            tensorLifetimeMap: Dict[str, Tuple[int, int]]) -> Dict[str, Tuple[int, int]]:

        tensorLifetimeMap = tensorLifetimeMap.copy()

        if not self.tileScheduler:
            for key, lifetime in tensorLifetimeMap.items():
                alias = ctxt.dealiasBuffer(key)

                if alias == key:
                    continue

                # Aliases of global buffers live from the start of the schedule.
                if ctxt.is_global(alias):
                    tensorLifetime = (0, lifetime[1])
                    tensorLifetimeMap[key] = tensorLifetime
                    continue

                aliasLifetime = tensorLifetimeMap[alias]
                tensorLifetime = (aliasLifetime[0], max(aliasLifetime[1], lifetime[1]))
                tensorLifetimeMap[alias] = tensorLifetime

        return tensorLifetimeMap

    def _scheduleMemoryConstraints(self,
                                   tilerModel: TilerModel,
                                   ctxt: NetworkContext,
                                   allMemoryConstraints: List[PatternMemoryConstraints],
                                   memoryHierarchy: MemoryHierarchy,
                                   memoryLevel: str = "L1",
                                   optimizeSchedule: bool = False):
        """Add, per pattern, the constraints bounding the peak memory of
        memoryLevel; records MemoryBlocks and the chosen permutation."""

        if memoryLevel not in self.memoryMap:
            self.memoryMap[memoryLevel] = []

        for patternIdx, patternMemoryConstraint in enumerate(allMemoryConstraints):

            # SCHEREMO: Calculate lifetimes
            tensorLifetimeMap, tensorMap = self._calculateLifetimes(ctxt, patternMemoryConstraint, memoryLevel)

            tensorLifetimeMap = self._dealiasLifetimeMap(ctxt, tensorLifetimeMap)

            # SCHEREMO: Build interference graph
            graph = self._buildInterferenceGraph(tensorLifetimeMap)

            numVars = len(graph)

            # SCHEREMO: Build adjacency matrices for memoryLevel
            adjacencyMatrix = self._buildAdjacencyMatrix(graph, tensorMap)
            costVector = self._buildCostVector(ctxt, graph, tensorMap, memoryLevel)

            blockList = []

            for node, neighbors in graph.items():
                relativeLifeTime = tensorLifetimeMap[node]
                # Shift lifetimes by the pattern index to make them absolute.
                absoluteLifetime = (relativeLifeTime[0] + patternIdx, relativeLifeTime[1] + patternIdx)

                memBlock = MemoryBlock(node, memoryLevel, absoluteLifetime, None)
                blockList.append(memBlock)

            self.memoryMap[memoryLevel].append(blockList)

            # SCHEREMO: Build permutation matrix
            if optimizeSchedule:
                if numVars > 1:

                    permutationMatrix = self._addPermutationMatrix(tilerModel, numVars, patternIdx)
                    permAdj, permCost = self._permuteMatrices(tilerModel, permutationMatrix, adjacencyMatrix,
                                                              costVector, patternIdx)

                else:
                    permutationMatrix = np.ones((1,))
                    permAdj, permCost = adjacencyMatrix, costVector

            else:
                permutationList = self.heuristicPermutation(adjacencyMatrix, costVector)
                permAdj, permCost, permutationMatrix = self._stablePermutation(adjacencyMatrix, costVector,
                                                                               permutationList)

            self._permutationState[memoryLevel + f"_{patternIdx}"] = permutationMatrix

            cost = self._generateCost(tilerModel, permAdj, permCost, patternIdx)
            constr = cost < memoryHierarchy.memoryLevels[memoryLevel].size
            tilerModel.addConstraint(constr)

        return

    def scheduleMemoryConstraints(self,
                                  tilerModel: TilerModel,
                                  ctxt: NetworkContext,
                                  allMemoryConstraints: List[PatternMemoryConstraints],
                                  memoryHierarchy: MemoryHierarchy,
                                  memoryLevel: str = "L1",
                                  optimizeSchedule: bool = False):
        """Public entry point; fixes the variable-name suffix for memoryLevel
        before delegating to _scheduleMemoryConstraints."""

        self.stringSuffix = self._stringSuffix + f"_{memoryLevel}"
        return self._scheduleMemoryConstraints(tilerModel, ctxt, allMemoryConstraints, memoryHierarchy, memoryLevel,
                                               optimizeSchedule)

    def getSymbolicCostName(self, patternIdx: int, memoryLevel: str) -> str:
        """Name of the symbolic cost variable for a pattern at a memory level."""
        stringSuffix = self._stringSuffix + f"_{memoryLevel}"

        name = f"cost{stringSuffix}"
        return name

    def getCost(self, tilerModel, patternIdx: int, memoryLevel: str) -> int:
        """Resolve and return the solved peak-memory cost for one pattern."""

        stringSuffix = self._stringSuffix + f"_{memoryLevel}"

        # Invoked for its side effect of producing a solution to resolve against.
        tilerModel._solveModel("max")

        name = f"cost{stringSuffix}_copyIdx_{patternIdx}"
        symVar = tilerModel._variables[name]

        return tilerModel._resolveVariable(symVar)

    def getHVector(self, tilerModel, patternIdx: int, memoryLevel: str) -> np.ndarray:
        """Resolve the running-height variables H_i for one pattern."""

        stringSuffix = self._stringSuffix + f"_{memoryLevel}"

        # Invoked for its side effect of producing a solution to resolve against.
        tilerModel._solveModel("max")
        numVars = len(self.memoryMap[memoryLevel][patternIdx])

        hVec = np.zeros((numVars))

        for i in range(numVars):
            name = f"{self._COSTVARIABLENAME}_{i}{stringSuffix}_copyIdx_{patternIdx}"
            symVar = tilerModel._variables[name]
            var = tilerModel._resolveVariable(symVar)
            hVec[i] = var

        return hVec

    def getBlockVector(self, patternIdx: int, memoryLevel: str) -> List[MemoryBlock]:
        """Return the MemoryBlocks recorded for one pattern at a memory level."""

        return self.memoryMap[memoryLevel][patternIdx]

    def getPMatrix(self, tilerModel, patternIdx: int, memoryLevel: str) -> np.ndarray:
        """Resolve the solver-optimized permutation matrix for one pattern."""

        stringSuffix = self._stringSuffix + f"_{memoryLevel}"

        # Invoked for its side effect of producing a solution to resolve against.
        tilerModel._solveModel("max")
        numVars = len(self.memoryMap[memoryLevel][patternIdx])

        permMat = np.zeros((numVars, numVars))

        for i in range(numVars):
            for j in range(numVars):
                name = f"{self._PERMUTATIONIDXNAME}_{i}_{j}{stringSuffix}_copyIdx_{patternIdx}"
                symVar = tilerModel._variables[name]
                var = tilerModel._resolveVariable(symVar)
                permMat[i, j] = var

        return permMat

    def annotateSolution(self, ctxt: NetworkContext, tilerModel: TilerModel):
        """Assign concrete address ranges to all recorded MemoryBlocks from the
        solved model, resolving in-pattern aliases afterwards."""

        def permMatrix2permList(permMatrix: np.ndarray) -> List[int]:
            # Convert a 0/1 permutation matrix into an index list.
            _permMatrix = []

            if len(permMatrix) == 0:
                return []

            if len(permMatrix) == 1:
                return [0]

            for i in range(permMatrix.shape[0]):
                rowVec = list(permMatrix[i])
                _permMatrix.append(rowVec)

            return [row.index(1) for row in _permMatrix]

        for memoryLevel, patternList in self.memoryMap.items():
            for patternIdx, pattern in enumerate(patternList):

                permutationMatrix = self._permutationState[memoryLevel + f"_{patternIdx}"]

                # Heuristic permutations are already concrete; solver-side ones
                # must be resolved first.
                if not isinstance(permutationMatrix, np.ndarray):
                    _permutationMatrix = self.getPMatrix(tilerModel, patternIdx, memoryLevel)
                else:
                    _permutationMatrix = permutationMatrix

                permList = permMatrix2permList(_permutationMatrix)

                if pattern != [] and len(pattern) > 1:
                    permPattern = _permuteList(pattern, permList)
                else:
                    permPattern = pattern

                aliasedBlocks = []

                for blockIdx, memoryBlock in enumerate(permPattern):

                    blockNames = [block.name for block in permPattern]
                    _buffer = ctxt.lookup(memoryBlock.name)

                    alias = ctxt.dealiasBuffer(memoryBlock.name)

                    # SCHEREMO: If we're handling an active alias to a global buffer in their home memory level, we don't need to resolve addresses
                    if all([alias != memoryBlock.name, ctxt.is_global(alias), _buffer._memoryLevel == memoryLevel]):
                        continue

                    # SCHEREMO: Don't fully unroll aliases here - this is pattern-sensitive!
                    if hasattr(_buffer, "_alias") and _buffer._alias in blockNames:
                        _alias = ctxt.lookup(memoryBlock.name)._alias
                        aliasedBlocks.append((memoryBlock, _alias))
                        continue

                    upperIdx = blockIdx

                    upperEndVar = tilerModel.getVariable(
                        f"{self._COSTVARIABLENAME}_{upperIdx}{self._stringSuffix}_{memoryLevel}", patternIdx)
                    upperEnd = tilerModel._resolveVariable(upperEndVar)

                    # The block starts right above the highest already-placed
                    # block whose lifetime overlaps its own.
                    maxAddr = 0
                    for oldBlock in permPattern:
                        if self.overlap(oldBlock.lifetime, memoryBlock.lifetime):
                            if oldBlock.addrSpace is not None:
                                maxAddr = max(maxAddr, oldBlock.addrSpace[1])

                    lowerEnd = maxAddr
                    memoryBlock.addrSpace = (lowerEnd, upperEnd)

                # Aliased blocks inherit the address space of their earliest
                # reference block.
                for block, alias in aliasedBlocks:
                    for refBlock in sorted(permPattern, key = lambda x: x.lifetime[0]):
                        if refBlock.name == alias:
                            block.addrSpace = refBlock.addrSpace
                            break
import copy
from abc import abstractmethod
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
from ortools.constraint_solver.pywrapcp import IntVar

#from Deeploy import TilerModel
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, TensorMemoryConstraint
from Deeploy.TilingExtension.TilerModel import TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, MemoryTransfer, \
    TilingSchedule, VariableReplacementScheme, computeHyperRectangleList


class TileConstraint():
    """Base class describing how one operator type may be tiled.

    Subclasses override the constraint hooks (addGeometricalConstraint,
    addPolicyConstraint) and serializeTilingSolution; wrapTilingSolution then
    turns a solved NodeMemoryConstraint into concrete tiling schedules.
    """

    # Override this
    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        '''
        Override this function to add your geometric constraints.
        Each dimension of the output tensors should be determinable through a linear equation that utilizes the dimensions of the input tensors and the attributes of the nodes.
        '''
        return tilerModel

    # Override this
    @staticmethod
    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        '''
        Override this function to add your custom constraints to your node.
        '''
        return tilerModel

    @staticmethod
    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
        # Default: no symbolic entries are contributed to the node representation.
        return {}

    @staticmethod
    def getBaseAddr(tilingSolution: NodeMemoryConstraint, targetMemLevel: str, name: str) -> List[Optional[int]]:
        """Return the per-buffer-copy base addresses of tensor `name` at
        `targetMemLevel`, or [None] if the tensor was not placed."""

        block = tilingSolution.tensorMemoryConstraints[name].memoryConstraints[targetMemLevel]

        if block.addrSpace is None:
            return [None]

        baseAddr = block.addrSpace[0]
        endAddr = block.addrSpace[1]
        sol = []
        # Split the address range evenly between the multi-buffer copies.
        for it in range(block.multiBufferCoefficient):
            addr = ((endAddr - baseAddr) // block.multiBufferCoefficient) * it + baseAddr
            sol.append(addr)
        return sol

    @staticmethod
    def extractBaseAddr(tilingSolution: NodeMemoryConstraint, targetMemLevel: str,
                        operatorRepresentation: OperatorRepresentation,
                        addrNames: List[str]) -> Tuple[Dict[str, int], Dict[str, int]]:
        """Resolve the base addresses of the operator's tensors named by
        `addrNames` and split them into (input offsets, output offsets)."""

        # addrNames are keys into the operator representation; the values are
        # the actual tensor names.
        varList = list(map(lambda x: operatorRepresentation[x], addrNames))
        addrList = list(map(lambda x: TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, x), varList))

        inputBaseOffsets = {}
        outputBaseOffsets = {}

        for addr, addrName, varName in zip(addrList, addrNames, varList):
            if varName in tilingSolution.outputTensorMemoryConstraints.keys():
                outputBaseOffsets[addrName] = addr
            elif varName in tilingSolution.inputTensorMemoryConstraints.keys():
                inputBaseOffsets[addrName] = addr
            else:
                raise Exception(f"{addrName} not in input or output!")

        return inputBaseOffsets, outputBaseOffsets

    @staticmethod
    def sanitizeTilingSchedule(tilingSchedule: TilingSchedule) -> TilingSchedule:
        """Drop unplaced ([None]) base offsets and their per-step load entries
        from the schedule (mutates and returns the same schedule object)."""

        _tilingSchedule = tilingSchedule

        for baseOffsetName, baseOffsetValue in tilingSchedule.inputBaseOffsets.copy().items():
            if baseOffsetValue == [None]:
                for step in tilingSchedule.inputLoadSchedule:
                    del step[baseOffsetName]
                del tilingSchedule.inputBaseOffsets[baseOffsetName]

        for baseOffsetName, baseOffsetValue in tilingSchedule.outputBaseOffsets.copy().items():
            if baseOffsetValue == [None]:
                for step in tilingSchedule.outputLoadSchedule:
                    del step[baseOffsetName]
                del tilingSchedule.outputBaseOffsets[baseOffsetName]

        return _tilingSchedule

    @classmethod
    def wrapTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]:
        """Tile the (single) output tensor level-by-level down to
        targetMemLevel and serialize one TilingSchedule per cube group.

        Returns the merged VariableReplacementScheme and the list of sanitized
        TilingSchedules produced by the subclass's serializeTilingSolution.
        """

        def getMemoryTransfer(tensorConstraint: TensorMemoryConstraint, sourceCube: HyperRectangle,
                              sourceMemoryLevel: str, targetMemoryLevel: str) -> MemoryTransfer:
            # Describe one source-cube -> destination-level transfer; the
            # destination shape is clipped to the source shape if larger.
            size = np.prod(sourceCube.dims)
            sourceConstraint = MemoryConstraint(sourceMemoryLevel, size)
            sourceConstraint.shape = sourceCube.dims

            destConstraint = copy.copy(tensorConstraint.memoryConstraints[targetMemoryLevel])

            if any(dim1 > dim2 for dim1, dim2 in zip(destConstraint.shape, sourceConstraint.shape)):
                destConstraint.shape = sourceConstraint.shape

            return MemoryTransfer(sourceConstraint, destConstraint)

        def _offsetAdd(offsetA: Tuple[int, ...], offsetB: Tuple[int, ...]) -> Tuple[int, ...]:
            # Element-wise addition of two offset tuples.
            return tuple(dimA + dimB for dimA, dimB in zip(offsetA, offsetB))

        def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List[AbsoluteHyperRectangle],
                             sourceMemoryLevel: str,
                             targetMemoryLevel: str) -> Tuple[List[AbsoluteHyperRectangle], List[int]]:
            # Tile every source cube for the next level; also record how many
            # sub-cubes each source cube produced.
            solution = []
            solutionLengths = []

            for sourceCube in sourceCubes:
                memTransfer = getMemoryTransfer(tensorConstraint, sourceCube.rectangle, sourceMemoryLevel,
                                                targetMemoryLevel)
                solutionCubes = computeHyperRectangleList(memTransfer)
                solutionAbsoluteCubes = [
                    AbsoluteHyperRectangle(rectangle = cube,
                                           absoluteOffset = _offsetAdd(sourceCube.absoluteOffset, cube.offset))
                    for cube in solutionCubes
                ]
                solution += solutionAbsoluteCubes
                solutionLengths.append(len(solutionAbsoluteCubes))

            return solution, solutionLengths

        assert len(tilingSolution.outputTensorMemoryConstraints.keys()) == 1, "Expected node to have only one output!"
        varOut = list(tilingSolution.outputTensorMemoryConstraints.keys())[0]

        outTensorConstraint = tilingSolution.tensorMemoryConstraints[varOut]
        outTensorMemoryLevelPath = list(outTensorConstraint.memoryConstraints.keys())
        targetIdxs = [idx for idx, key in enumerate(outTensorMemoryLevelPath) if key == targetMemLevel]

        assert len(targetIdxs) == 1, f"Received more than one spec for memoryLevel {targetMemLevel}"
        targetIdx = targetIdxs[0]

        if targetIdx == 0:
            # SCHEREMO: Watch out - this happens if inputs are in L(N+1) but outputs only in L(N)
            targetIdx = 1

        # Start from a single cube covering the whole output tensor.
        fullShape = ctxt.lookup(varOut).shape
        initialOffset = tuple([0] * len(fullShape))
        outputCubes = [
            AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)),
                                   absoluteOffset = initialOffset)
        ]

        # Walk the memory-level path, tiling the cubes at each hop; the final
        # solutionLengths groups cubes by their last-hop source cube.
        for targetIdx in list(range(targetIdx + 1))[1:]:
            sourceMemoryLevel = outTensorMemoryLevelPath[targetIdx - 1]
            targetMemoryLevel = outTensorMemoryLevelPath[targetIdx]
            outputCubes, solutionLengths = getCubeTransfers(outTensorConstraint, outputCubes, sourceMemoryLevel,
                                                            targetMemoryLevel)

        arrayOfCubes = []
        _idx = 0
        for idxLen in solutionLengths:
            arrayOfCubes += [outputCubes[_idx:_idx + idxLen]]
            _idx += idxLen

        varReplacements = []
        tilingSchedules = []

        for _outputCubes in arrayOfCubes:

            varReplacement, tilingSchedule = cls.serializeTilingSolution(tilingSolution, _outputCubes, targetMemLevel,
                                                                         ctxt, operatorRepresentation)
            sanitizedTilingSchedule = cls.sanitizeTilingSchedule(tilingSchedule)

            varReplacements.append(varReplacement)
            tilingSchedules.append(sanitizedTilingSchedule)

        # Merge all replacement schemes into a single flat scheme.
        flatReplacement = varReplacements[0]
        for replacement in varReplacements[1:]:
            flatReplacement += replacement

        return flatReplacement, tilingSchedules

    @classmethod
    @abstractmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Compute the required input tiles as a sequence of HyperRectangles

        Parameters
        ----------
        tilingSolution : NodeMemoryConstraint
            The final tiling solution computed in the midend
        absoluteOutputCubes : List[AbsoluteHyperRectangle]
            A list of HyperRectangles that represent tiles of the
            operator's outputs with absolute offsets
        targetMemLevel : str
            The name of the MemoryLevel registered within the
            Platform's MemoryHierarchy where tiles should be
            transferred into (e.g.: L2, L1,... )
        ctxt : NetworkContext
            The current NetworkContext
        operatorRepresentation : Dict
            The operator's node representation dictionary

        Returns
        -------
        Tuple[VariableReplacementScheme, TilingSchedule]
            Return a VariableReplacementScheme to express which
            expressions within the target template might have to be
            replaced due to tiling. Also return a TilingSchedule to
            define one input HyperRectangle tuple for each output tile

        Raises
        ------
        Exception
            Raises an exception unless overridden in the calling class

        """

        raise Exception(f"serializeTilingSolution not implemented for class {cls.__name__}!")
+# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Create Monad that take a Deployer and make it TilerAware +# Define Tiler Obj centralize all tilling related functionalities for a given deployer. +# Like Template-T-Obj mapping, propagate cst, graph edition, etc + +import copy +from typing import Callable, Dict, List, Optional, Tuple, Type, Union + +import numpy as np +import onnx_graphsurgeon as gs +from ortools.constraint_solver.pywrapcp import IntVar, SolutionCollector + +import Deeploy.CommonExtensions.DataTypes as BasicDataTypes +from Deeploy.AbstractDataTypes import Pointer, PointerClass +from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper +from Deeploy.DeeployTypes import ConstantBuffer, GlobalDefinition, NetworkContext, NetworkOptimizationPass, \ + NodeBinding, NodeTemplate, ONNXLayer, Schedule, SubGraph, TopologyOptimizer, TransientBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper, \ + MemoryLevelAwareDeployer, MemoryPlatform, MemoryPlatformWrapper, TargetMemoryLevelMapping +from Deeploy.TilingExtension.GenericFlow import GenericFlowState +from Deeploy.TilingExtension.MemoryConstraintFlows import 
GraphMemoryConstraintFlow, TensorMemLevelTuple, \ + convertFlowState2NodeMemoryConstraint +from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, \ + PatternMemoryConstraints, TensorMemoryConstraint +from Deeploy.TilingExtension.MemoryScheduler import MemoryBlock, MemoryScheduler +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel + +TilingSolution = List[PatternMemoryConstraints] + +_deallocTemplate = NodeTemplate("") + + +class Tiler(): + + arenaName = "MEMORYARENA" + memorySchedulerClass: Type[MemoryScheduler] = MemoryScheduler + + # Initialize with the list of TemplateTCFbinding + def __init__(self, memoryHierarchy: MemoryHierarchy): + + self.memoryHierarchy = memoryHierarchy + self.tilerModel: Optional[TilerModel] = None + self.innerMemoryScheduler = self.memorySchedulerClass("_inner", tileScheduler = True) + self.outerMemoryScheduler = self.memorySchedulerClass("_outer", tileScheduler = False) + self.symbolicMemoryConstraints: Optional[List[PatternMemoryConstraints]] = None + + self._worstCaseBufferSize: Dict[str, int] = {} + + @property + def worstCaseBufferSize(self): + return self._worstCaseBufferSize + + def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, + memoryMap: Dict[str, List[List[MemoryBlock]]]) -> NetworkContext: + + maxAddr: Dict[str, int] = {} + + for memoryLevel, patternList in memoryMap.items(): + currentMax = 0 + for nodeList in patternList: + blockNames = [block.name for block in nodeList] + for node in nodeList: + + _buffer = ctxt.lookup(node.name) + # SCHEREMO: If alias buffers have zero cost, they don't contribute to the currentMax and their addrSpace is None + if hasattr(_buffer, "_alias") and (ctxt.is_global(_buffer._alias) or _buffer._alias in blockNames): + continue + + currentMax = max(currentMax, node._addrSpace[1]) + + maxAddr[memoryLevel] = currentMax + self._worstCaseBufferSize[memoryLevel] = currentMax + + 
        # --- continuation of _convertCtxtToStaticSchedule ---
        # For every memory level that actually needs space, materialize one
        # static arena buffer (int8 byte array) sized to the worst-case peak.
        for level, addrSpace in maxAddr.items():
            if addrSpace == 0:
                continue

            arenaName = f"{self.arenaName}_{level}"

            scratchBuffer = ctxt.VariableBuffer(arenaName, [addrSpace])
            scratchBuffer._type = PointerClass(BasicDataTypes.int8_t)
            ctxt.add(scratchBuffer, "global")
            scratchBuffer._instance = scratchBuffer._type(arenaName, ctxt)
            scratchBuffer._memoryLevel = level

        # SCHEREMO: Adapt homelevel tensors to their respective arena
        # Rewrite each tensor's alloc/dealloc templates so it becomes a pointer
        # at a fixed offset into its level's arena instead of a heap allocation.
        for memoryLevel, patternList in memoryMap.items():
            if not ctxt.is_global(f"{self.arenaName}_{memoryLevel}"):
                continue
            staticBuf = ctxt.lookup(f"{self.arenaName}_{memoryLevel}")
            for nodeList in patternList:
                blockNames = [block.name for block in nodeList]
                for node in nodeList:
                    tensorName = node.name
                    _buffer = ctxt.lookup(tensorName)

                    # Only patch tensors whose home level is this arena's level.
                    if _buffer._memoryLevel != memoryLevel:
                        continue

                    # Aliases of global buffers keep their original storage.
                    if hasattr(_buffer, "_alias") and ctxt.is_global(_buffer._alias):
                        continue

                    if hasattr(_buffer, "_alias") and _buffer._alias in blockNames:
                        # Alias within the same schedule: point at the aliased
                        # node's address range instead of allocating new space.
                        alias = ctxt.dealiasBuffer(tensorName)
                        aliasNodes = [node for node in nodeList if node.name == alias]

                        assert len(aliasNodes) == 1, f"alias {alias} references more than one node!"

                        aliasNode = aliasNodes[0]

                        _buffer.allocTemplate = NodeTemplate(
                            " \
${name} = (${type.typeName}) " + f"((char*){str(staticBuf._instance)} + {aliasNode.addrSpace[0]});")
                        _buffer.deallocTemplate = _deallocTemplate

                        continue

                    # Regular tensor: bind it to its scheduled offset in the arena.
                    offset = node.addrSpace[0]

                    _buffer.allocTemplate = NodeTemplate(" \
${name} = (${type.typeName}) " + f"((char*){str(staticBuf._instance)} + {offset});")
                    _buffer.deallocTemplate = _deallocTemplate

        return ctxt

    def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution:
        """Solve the previously set-up tiler model and annotate the solution.

        Requires setupModel to have been called first. Solves the constraint
        model, resolves each symbolic memory constraint into concrete sizes
        and address ranges, and rewrites the context to use static arenas.
        Returns the resolved TilingSolution (one PatternMemoryConstraints per
        scheduled pattern).
        """

        assert self.tilerModel is not None and self.symbolicMemoryConstraints is not None, "Set up the model before trying to compute a schedule!"

        collector = self.tilerModel.trySolveModel()
        tilingSchedule = self._getTilingSolution(self.tilerModel, ctxt, collector, self.symbolicMemoryConstraints)

        self.innerMemoryScheduler.annotateSolution(ctxt, self.tilerModel)
        self.outerMemoryScheduler.annotateSolution(ctxt, self.tilerModel)

        # Merge inner (tile) and outer (static) memory maps per level; both
        # schedulers are expected to expose the same set of level keys.
        memoryMap = {}

        for key in self.innerMemoryScheduler.memoryMap.keys():
            memoryMap[key] = [*self.innerMemoryScheduler.memoryMap[key], *self.outerMemoryScheduler.memoryMap[key]]

        # Back-annotate the solved address space of every tile onto its
        # MemoryConstraint, matching memory blocks by tensor name.
        for idx, pattern in enumerate(tilingSchedule):
            for nodeIdx, nodeConstraint in enumerate(pattern.nodeConstraints):
                for tensorConstraint in nodeConstraint.tensorMemoryConstraints.values():
                    for memoryConstraint in tensorConstraint.memoryConstraints.values():
                        patternList = memoryMap[memoryConstraint.memoryLevel]
                        blockPattern = patternList[idx]

                        # SCHEREMO: Don't try to annotate home base of tensor
                        if ctxt.lookup(tensorConstraint.tensorName
                                      )._memoryLevel == memoryConstraint.memoryLevel and not isinstance(
                                          ctxt.lookup(tensorConstraint.tensorName), TransientBuffer):
                            continue

                        _block = [memBlock for memBlock in blockPattern if memBlock.name == tensorConstraint.tensorName]

                        assert len(
                            _block
                        ) == 1, f"Missing or superfluous memory block {tensorConstraint.tensorName} allocation found in {_block}!"
                        block = _block[0]
                        memoryConstraint.addrSpace = block.addrSpace

        self._convertCtxtToStaticSchedule(ctxt, memoryMap)

        return tilingSchedule

    def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: 'OrderedDict[str, ONNXLayer]',
                   targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NetworkContext:
        """Build the full tiler constraint model for a schedule.

        Wraps single nodes into one-element patterns, then adds geometric,
        dimension-product, heuristic and memory constraints. Stores the
        resulting model and symbolic constraints on self for a later
        computeTilingSchedule call; returns the (unmodified) context.
        """

        # Normalize the schedule: every entry becomes a SubGraph (list of nodes).
        wrapSchedule: List[SubGraph] = []
        for entry in schedule:
            if isinstance(entry, gs.Node):
                wrapSchedule.append([entry])
            else:
                wrapSchedule.append(entry)

        tilerModel = TilerModel()
        tilerModel = self._setupGeometricConstraints(tilerModel, ctxt, wrapSchedule, layerBinding)
        tilerModel = self._setupTensorDimensionProducts(tilerModel, ctxt, wrapSchedule)
        tilerModel = self._setupHeuristics(tilerModel, ctxt, wrapSchedule)
        tilerModel, allSymbolicMemoryConstraints = self._setupMemoryConstraints(tilerModel, ctxt, wrapSchedule,
                                                                               layerBinding, targetMemoryLevelMapping)

        self.tilerModel = tilerModel
        self.symbolicMemoryConstraints = allSymbolicMemoryConstraints

        return ctxt

    # SCHEREMO: Return a integer factor or IntVar variable for the multi Buffer coefficient given the tiling path, hop and tensorName.
    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
                            hop: str, tensorName: str) -> Union[int, IntVar]:
        """Choose the multi-buffering factor for one tensor at one memory hop.

        Default policy: transient buffers are single-buffered, everything else
        is double-buffered. Subclasses may override to return an IntVar.
        """

        varBuffer = ctxt.lookup(tensorName)

        generalCoeff = 2

        # NOTE(review): the ConstantBuffer branch currently yields the same
        # coefficient as the else branch; the split is kept as an override
        # point / for readability, not for behavior.
        if isinstance(varBuffer, TransientBuffer):
            coefficient = 1
        elif isinstance(varBuffer, ConstantBuffer):
            coefficient = generalCoeff
        else:
            coefficient = generalCoeff

        # if tensorName == pattern[-1].outputs[0].name:
        #     maxVal = (np.prod(varBuffer.shape) // (coefficient)).item()
        #     numElt = tilerModel.getTensorNumberOfEltVar(tensorName)
        #     constr = numElt <= maxVal

        #     if (constr != True):
        #         tilerModel.addConstraint(constr)

        return coefficient

    # SCHEREMO: Given a PatternMemoryConstraints object, propagate the IOBuffer freeing strategy.
    # Input: Single-buffered Liveness analysis of the input/output IO buffers that should be tiled
    # Output: Buffering-strategy aware liveness analysis of the input/output IO buffers

    # This version implements "static n-ple buffering"

    def propagateIOBufferStrategy(self, tileConstraintPattern: PatternMemoryConstraints, pattern: SubGraph,
                                  ctxt: NetworkContext) -> PatternMemoryConstraints:
        """Make every step of the pattern carry the union of all steps' constraints.

        Mutates and returns tileConstraintPattern: each node constraint is
        widened by the sum of all node constraints, i.e. IO tiles stay live
        for the whole pattern (static n-ple buffering).
        """

        borderTensorStep = NodeMemoryConstraint()
        for patternStep in tileConstraintPattern.nodeConstraints:
            borderTensorStep += patternStep

        for idx in range(len(tileConstraintPattern.nodeConstraints)):
            tileConstraintPattern.nodeConstraints[idx] += borderTensorStep

        return tileConstraintPattern

    def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector,
                                       tensorConstraint: TensorMemoryConstraint) -> TensorMemoryConstraint:
        """Resolve one tensor's symbolic memory constraints to concrete values.

        Reads solved sizes, multi-buffer coefficients and per-dimension tile
        shapes from the solver solution and returns a new, fully numeric
        TensorMemoryConstraint.
        """
        assert self.tilerModel is not None, "Can't resolve tensor memory constraints, tilerModel is None!"

        tensorName = tensorConstraint.tensorName
        solvedTensorConstraint = TensorMemoryConstraint(tensorName, {}, ctxt)

        for memoryLevel, memoryConstraint in tensorConstraint.memoryConstraints.items():
            size = self.tilerModel._resolveVariable(memoryConstraint.size)

            newMemoryConstraint: MemoryConstraint = MemoryConstraint(memoryLevel, size)
            multiBufferCoefficient = self.tilerModel._resolveVariable(memoryConstraint.multiBufferCoefficient)
            newMemoryConstraint.multiBufferCoefficient = multiBufferCoefficient

            # Transient buffers are shapeless scratch space; only real tensors
            # get a tile shape annotation.
            if not isinstance(ctxt.lookup(tensorName), TransientBuffer):

                tensorShapeLen = len(ctxt.lookup(tensorName).shape)
                newShape: List[int] = []

                if isinstance(memoryConstraint.size, int):
                    # Constant size: the tensor is untiled at this level.
                    newShape = ctxt.lookup(tensorName).shape
                else:
                    # Symbolic size: recover the per-dimension tile extents via
                    # the copyIdx encoded in the solver variable's name.
                    _, copyIdx = tilerModel.getNameCopyIdx(memoryConstraint.size.Name())
                    for i in range(tensorShapeLen):
                        newShape.append(
                            self.tilerModel._resolveVariable(tilerModel.getTensorDimVar(tensorName, i, copyIdx)))
newMemoryConstraint.shape = tuple(newShape) + + solvedTensorConstraint.addMemoryConstraint(newMemoryConstraint) + + return solvedTensorConstraint + + def _getTilingSolution(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector, + allConstraints: List[PatternMemoryConstraints]) -> List[PatternMemoryConstraints]: + + retList = [] + + def _checkResolve(ctxt, tensorName, tensorConstraint): + + if ctxt.is_global(tensorName) and len(tensorConstraint.memoryConstraints.values()) <= 1: + return False + if len(tensorConstraint.memoryConstraints.values()) <= 1 and not isinstance( + ctxt.lookup(tensorName), TransientBuffer): + return False + return True + + for patternConstraints in allConstraints: + newMemoryConstraint = PatternMemoryConstraints() + for stepConstraints in patternConstraints.nodeConstraints: + newStepMemoryConstraint = NodeMemoryConstraint() + for tensorName, tensorConstraint in stepConstraints.tensorMemoryConstraints.items(): + if _checkResolve(ctxt, tensorName, tensorConstraint): + solvedTensorConstraint = self._resolveTensorMemoryConstraint( + tilerModel, ctxt, collector, tensorConstraint) + ioDir = stepConstraints.getIO(tensorName) + newStepMemoryConstraint.addTensorConstraint(solvedTensorConstraint, ioDir) + + newMemoryConstraint.addConstraint(newStepMemoryConstraint) + retList.append(newMemoryConstraint) + + return retList + + def _setupTensorDimensionProducts(self, tilerModel: TilerModel, ctxt: NetworkContext, + schedule: List[SubGraph]) -> TilerModel: + + for idx, pattern in enumerate(schedule): + subGraph = gs.Graph(nodes = pattern) + subgraphTensors: 'OrderedDict[str, gs.Tensor]' = subGraph.tensors(check_duplicates = True) + + for _, tensor in subgraphTensors.items(): + if not ctxt.lookup(tensor.name)._deploy: + continue + + tilerModel.addTensorNumOfEltToModel(ctxt, tensor.name, idx) + + return tilerModel + + def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], + 
layerBinding: 'OrderedDict[str, ONNXLayer]') -> TilerModel: + + # SCHEREMO: Each pattern is a decoupled sub-problem w.r.t the geometric constraints. + # We need to regenerate dimension variables for each tensor + # This is done by setting the copyIdx in the tilerModel + + for idx, pattern in enumerate(schedule): + tilerModel.copyIdx = idx + + for node in pattern: + + if node.name not in layerBinding.keys(): + continue + + parseDict = layerBinding[node.name].mapper.parser.operatorRepresentation + template = layerBinding[node.name].mapper.binder.template + + tilerModel = template.tileConstraint.addGeometricalConstraint(tilerModel, + parseDict = parseDict, + ctxt = ctxt) + + tilerModel = template.tileConstraint.addPolicyConstraint(tilerModel, parseDict = parseDict, ctxt = ctxt) + + return tilerModel + + def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph]) -> TilerModel: + + for idx, pattern in enumerate(schedule): + + patternTensorList = [] + seenTensorNameList = [] + for node in pattern: + for gsTensor in node.inputs + node.outputs: + ctxtTensor = ctxt.lookup(gsTensor.name) + if ctxtTensor.name not in seenTensorNameList: + seenTensorNameList.append(ctxtTensor.name) + patternTensorList.append(ctxtTensor) + + patternMemSizeExpr: IntVar = 0 + for tensor in patternTensorList: + if not ctxt.lookup(tensor.name)._deploy: + continue + + patternMemSizeExpr += tilerModel.getTensorNumberOfEltVar( + tensorName = tensor.name, copyIdx = idx) * (tensor._type.referencedType.typeWidth // 8) + + if isinstance(patternMemSizeExpr, int): + _max = patternMemSizeExpr + else: + _max = patternMemSizeExpr.Max() + + patternVariable = tilerModel.addVariable(name = "DEEPLOY_PATTERN_MEM", + lowerBound = 1, + upperBound = _max, + copyIdx = idx) + tilerModel.addConstraint(patternVariable == patternMemSizeExpr) + + tilerModel.addObjective(patternVariable, 'maximize') + + return tilerModel + + def _setupMemoryConstraints( + self, tilerModel: TilerModel, 
ctxt: NetworkContext, schedule: List[SubGraph], + layerBinding: 'OrderedDict[str, ONNXLayer]', + targetMemoryLevelMapping: TargetMemoryLevelMapping) -> Tuple[TilerModel, List[PatternMemoryConstraints]]: + + allMemoryConstraints = self._generateAllMemoryConstraints(tilerModel, ctxt, schedule, layerBinding, + targetMemoryLevelMapping) + + outerMemoryConstraints = PatternMemoryConstraints() + for constraint in allMemoryConstraints: + for nodeConstraint in constraint.nodeConstraints: + outerMemoryConstraints.addConstraint(nodeConstraint) + + for level in self.memoryHierarchy.memoryLevels.keys(): + self.outerMemoryScheduler.scheduleMemoryConstraints(tilerModel, ctxt, [outerMemoryConstraints], + self.memoryHierarchy, level) + + # Update inner memoryHierarchy with outer constraints + innerMemoryHierarchy = MemoryHierarchy([]) + for level, memLevel in self.memoryHierarchy.memoryLevels.items(): + newMemLevel = copy.copy(memLevel) + outerConstraint = tilerModel.getVariable(self.outerMemoryScheduler.getSymbolicCostName(0, level), 0) + + newMemLevel.size = newMemLevel.size - outerConstraint + innerMemoryHierarchy._add(newMemLevel) + + for level in innerMemoryHierarchy.memoryLevels.keys(): + self.innerMemoryScheduler.scheduleMemoryConstraints(tilerModel, ctxt, allMemoryConstraints, + innerMemoryHierarchy, level) + + return tilerModel, allMemoryConstraints + + def _generateAllMemoryConstraints( + self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], + layerBinding: 'OrderedDict[str, ONNXLayer]', + targetMemoryLevelMapping: TargetMemoryLevelMapping) -> List[PatternMemoryConstraints]: + + dynamicTensorConstraints, constantTensorConstraints = self._generateMemoryConstraints( + tilerModel, ctxt, schedule, layerBinding, targetMemoryLevelMapping) + + allConstraints: List[PatternMemoryConstraints] = [] + # Initialize structures + + for pattern in dynamicTensorConstraints: + allPattern = PatternMemoryConstraints() + for step in pattern.nodeConstraints: + allStep 
= step + constantTensorConstraints + allPattern.addConstraint(allStep) + allConstraints.append(allPattern) + + return allConstraints + + def _generateMemoryConstraints( + self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], + layerBinding: 'OrderedDict[str, ONNXLayer]', targetMemoryLevelMapping: TargetMemoryLevelMapping + ) -> Tuple[List[PatternMemoryConstraints], NodeMemoryConstraint]: + + # SCHEREMO: Construct non-double-buffered constraints of local variable buffers + + outerVariableConstraints, innerVariableConstraints = self._generateVariableBufferConstraints( + tilerModel, ctxt, schedule, layerBinding, targetMemoryLevelMapping) + + # SCHEREMO: Construct global buffer constraints + + globalVariableConstraint = self._generateBufferConstraints(ctxt) + + # SCHEREMO: Construct first-level constraint set (all global buffers + tensors stored in higher level) + + firstLevelConstraints: List[PatternMemoryConstraints] = copy.copy(outerVariableConstraints) + for patternConstraint in firstLevelConstraints: + for idx in range(len(patternConstraint.nodeConstraints)): + patternConstraint.nodeConstraints[idx] += globalVariableConstraint + + # SCHEREMO: Construct constraint set for tiled tensors (including double buffering, excluding static global constraints) + tiledTensorConstraints: List[PatternMemoryConstraints] = self._generateTilePathConstraints( + tilerModel, ctxt, firstLevelConstraints, innerVariableConstraints, schedule) + + # SCHEREMO: Construct constraint set for tiled tensors + local-only tensors (dynamic tensor set) + dynamicTensorConstraints: List[PatternMemoryConstraints] = [] + for tilingConstraints, innerConstraints in zip(tiledTensorConstraints, innerVariableConstraints): + + dynamicTensorPattern = PatternMemoryConstraints() + for tilingPatternStep, innerPatternStep in zip(tilingConstraints.nodeConstraints, + innerConstraints.nodeConstraints): + dynamicTensorPatternStep = copy.copy(tilingPatternStep) + + # Pick all constraints that 
are purely internal + for innerTensorName, innerTensor in innerPatternStep.tensorMemoryConstraints.items(): + if not any([ + innerTensorName == dynamicTensorName + for dynamicTensorName, tensor in dynamicTensorPatternStep.tensorMemoryConstraints.items() + ]): + ioDir = tilingPatternStep.getIO(innerTensorName) + dynamicTensorPatternStep.addTensorConstraint(innerTensor, ioDir) + dynamicTensorPattern.addConstraint(dynamicTensorPatternStep) + dynamicTensorConstraints.append(dynamicTensorPattern) + + # SCHEREMO: Construct unkilled tensor set + inplaceTensorConstraints: List[PatternMemoryConstraints] = [] + for tilingConstraints, outerConstraints in zip(dynamicTensorConstraints, firstLevelConstraints): + dynamicTensorPattern = PatternMemoryConstraints() + for tilingPatternStep, outerPatternStep in zip(tilingConstraints.nodeConstraints, + outerConstraints.nodeConstraints): + dynamicTensorPatternStep = copy.copy(tilingPatternStep) + + # Pick all constraints that are purely internal + for outerTensorName, outerTensor in outerPatternStep.tensorMemoryConstraints.items(): + if not any( + [(outerTensorName == dynamicTensorName) or (ctxt.is_global(outerTensorName)) + for dynamicTensorName, tensor in dynamicTensorPatternStep.tensorMemoryConstraints.items()]): + dynamicTensorPatternStep.addTensorConstraint(outerTensor, "intermediate") + dynamicTensorPattern.addConstraint(dynamicTensorPatternStep) + inplaceTensorConstraints.append(dynamicTensorPattern) + + return inplaceTensorConstraints, globalVariableConstraint + + def _generateTilePath(self, tilerModel: TilerModel, ctxt: NetworkContext, + tensorMemoryConstraint: TensorMemoryConstraint, pattern: SubGraph) -> TensorMemoryConstraint: + + assert len(tensorMemoryConstraint.memoryConstraints.keys() + ) == 2, "Can't generate a tile path for more than one hierarchy level!" 
+ + tensorName = tensorMemoryConstraint.tensorName + + valList = list(tensorMemoryConstraint.memoryConstraints.values()) + constraintA = valList[0] + constraintB = valList[1] + + # SCHEREMO : Base is whichever constraint is constant + base = constraintA if isinstance(constraintA.size, int) else constraintB + end = constraintA if base == constraintB else constraintB + + path = self.memoryHierarchy.bfs(base.memoryLevel, end.memoryLevel) + requiredHops = path[1:] + + returnTensorConstraint = TensorMemoryConstraint(tensorName, {}, ctxt) + returnTensorConstraint.addMemoryConstraint(base) + + for hop in requiredHops: + factor = self.multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName) + assert factor >= 1 and isinstance(factor, int), "Invalid factor!" + + memConstraint = MemoryConstraint(hop, end.size) + memConstraint.multiBufferCoefficient = factor + returnTensorConstraint.addMemoryConstraint(memConstraint) + + return returnTensorConstraint + + def _generateIntermediateTilingSteps(self, tilerModel: TilerModel, ctxt: NetworkContext, + sourceStep: NodeMemoryConstraint, destinationStep: NodeMemoryConstraint, + pattern: SubGraph) -> NodeMemoryConstraint: + tileConstraintStep = NodeMemoryConstraint() + + mergedStep = sourceStep + destinationStep + tileTensorConstraints = [ + tensor for tensor in mergedStep.tensorMemoryConstraints.values() + if len(tensor.memoryConstraints.values()) > 1 + ] + + for tileTensor in tileTensorConstraints: + tiledTensor = self._generateTilePath(tilerModel, ctxt, tileTensor, pattern) + ioDir = mergedStep.getIO(tileTensor.tensorName) + tileConstraintStep.addTensorConstraint(tiledTensor, ioDir) + + return tileConstraintStep + + def _generateTilePathConstraints(self, tilerModel: TilerModel, ctxt: NetworkContext, + sourceConstraints: List[PatternMemoryConstraints], + destinationConstraints: List[PatternMemoryConstraints], + schedule: List[SubGraph]) -> List[PatternMemoryConstraints]: + + tileConstraints = [] + + for idx, 
(sourceConstraint, destinationConstraint) in enumerate(zip(sourceConstraints, destinationConstraints)): + + tilerModel.copyIdx = idx + + assert (len(sourceConstraint.nodeConstraints) == 1 + ), "source pattern must be constant and single step, since it's live throughout the pattern!" + sourcePatternStep = sourceConstraint.nodeConstraints[0] + + tileConstraint = PatternMemoryConstraints() + + for destinationConstraintStep in destinationConstraint.nodeConstraints: + tileConstraintStep = self._generateIntermediateTilingSteps(tilerModel, ctxt, sourcePatternStep, + destinationConstraintStep, schedule[idx]) + tileConstraint.addConstraint(tileConstraintStep) + + propagatedTileConstraint = self.propagateIOBufferStrategy(tileConstraint, schedule[idx], ctxt) + assert len(propagatedTileConstraint.nodeConstraints) == len(tileConstraint.nodeConstraints) + + tileConstraints.append(propagatedTileConstraint) + + return tileConstraints + + def _generateBufferConstraints(self, ctxt: NetworkContext) -> NodeMemoryConstraint: + + constantGlobalConstraint: NodeMemoryConstraint = NodeMemoryConstraint() + constantGlobalBuffers = [ + node for node in ctxt.globalObjects.values() + if not isinstance(node, GlobalDefinition) and node._deploy == True + ] + + for constantBuffer in constantGlobalBuffers: + + tensorName = constantBuffer.name + + memorySize = int(np.prod(ctxt.lookup(tensorName).shape)) + + elementMemorySize = memorySize + memoryConstraint = MemoryConstraint(constantBuffer._memoryLevel, elementMemorySize) + tensorConstraint = TensorMemoryConstraint(constantBuffer.name, + {memoryConstraint.memoryLevel: memoryConstraint}, ctxt) + constantGlobalConstraint.addTensorConstraint(tensorConstraint, "input") + + return constantGlobalConstraint + + def _generateVariableBufferConstraints( + self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], + layerBinding: 'OrderedDict[str, ONNXLayer]', targetMemoryLevelMapping: TargetMemoryLevelMapping + ) -> 
Tuple[List[PatternMemoryConstraints], List[PatternMemoryConstraints]]: + + def deltaFlow( + patternFlow: List[GenericFlowState[TensorMemLevelTuple]]) -> GenericFlowState[TensorMemLevelTuple]: + + initialFlow = patternFlow[0] + endFlow = patternFlow[1] + + # SCHEREMO: The genset and killset of the innerflow are correct; however, since we now pass the initialliveset of the pattern to the constraint flow. we need to remove bypassed tensors + mergedLiveSet = initialFlow.liveSet - endFlow.liveSet + mergedGenSet = initialFlow.genSet + mergedKillSet = initialFlow.killSet + + mergedFlow = GenericFlowState[TensorMemLevelTuple](mergedLiveSet, mergedKillSet, mergedGenSet) + + return mergedFlow + + initialLiveBuffers = { + value.name + for value in ctxt.globalObjects.values() + if (isinstance(value, ctxt.VariableBuffer) and value._users != []) + } + + producedBuffers = {layer.node.outputs[0].name for layer in layerBinding.values()} + inputBufferNames = initialLiveBuffers - producedBuffers + inputBuffers = [ctxt.lookup(name) for name in inputBufferNames] + + initialLiveTensors = {TensorMemLevelTuple(buf.name, buf._memoryLevel) for buf in inputBuffers} + + constraintFlow = GraphMemoryConstraintFlow(ctxt, targetMemoryLevelMapping) + graphFlowStates = constraintFlow.flow(schedule, initialLiveTensors) + + innerMemConstraints: List[PatternMemoryConstraints] = [] + outerMemConstraints: List[PatternMemoryConstraints] = [] + + for idx, pattern in enumerate(schedule): + + tilerModel.copyIdx = idx + + innerPatternMemoryConstraints = PatternMemoryConstraints() + outerPatternMemoryConstraints = PatternMemoryConstraints() + + outerFlowState = graphFlowStates[idx] + patternFlow = constraintFlow._patternFlowStates[idx] + + dynamicOuterBufferConstraints = convertFlowState2NodeMemoryConstraint(tilerModel, + ctxt, + outerFlowState, + useMax = True) + + outerPatternMemoryConstraints.addConstraint(dynamicOuterBufferConstraints) + outerMemConstraints.append(outerPatternMemoryConstraints) + + 
mergedFlow = [deltaFlow(patternFlow)] + + for step, innerFlowState in zip(pattern, mergedFlow): + transientBufferConstraints = self._generatePatternStepTransientBufferConstraints( + tilerModel, ctxt, layerBinding, step, targetMemoryLevelMapping) + + dynamicInnerBufferConstraints = convertFlowState2NodeMemoryConstraint(tilerModel, + ctxt, + innerFlowState, + useMax = False) + + innerPatternMemoryConstraints.addConstraint(transientBufferConstraints + dynamicInnerBufferConstraints) + + innerMemConstraints.append(innerPatternMemoryConstraints) + + return outerMemConstraints, innerMemConstraints + + def _generatePatternStepTransientBufferConstraints( + self, tilerModel: TilerModel, ctxt: NetworkContext, layerBinding: 'OrderedDict[str, ONNXLayer]', + step: gs.Node, targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NodeMemoryConstraint: + + patternStepTransientBufferSizes = NodeMemoryConstraint() + + template = layerBinding[step.name].mapper.binder.template + + symbolicNodeRep = template.tileConstraint.constructSymbolicNodeRep( + tilerModel, parseDict = layerBinding[step.name].mapper.parser.operatorRepresentation, ctxt = ctxt) + + transientBufferList: List[Tuple[str, + Union[int, + IntVar]]] = template.computeTransientBuffersSize(ctxt, symbolicNodeRep) + + for tensorName, memorySize in transientBufferList: + + # SCHEREMO: Assume transientbuffers end up in the same level as their user's main input + memoryLevelName = targetMemoryLevelMapping.lookup(step.name, step.inputs[0].name) + ctxt.lookup(tensorName)._memoryLevel = memoryLevelName + + transientSize = tilerModel.addTransientBufferSizeToModel(tensorName, memorySize) + + #memoryLevelName = self.memoryHierarchy.getDefaultMemoryLevel().name + + transientMemoryConstraint = MemoryConstraint(memoryLevelName, transientSize) + transientBufferConstraint = TensorMemoryConstraint(tensorName, {memoryLevelName: transientMemoryConstraint}, + ctxt) + patternStepTransientBufferSizes.addTensorConstraint(transientBufferConstraint, 
"intermediate") + + return patternStepTransientBufferSizes + + +class TilerAwareDeployer(MemoryLevelAwareDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: Union[MemoryPlatform, MemoryPlatformWrapper], + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), + name: str = 'DeeployNetwork', + default_channels_first: bool = True, + deeployStateDir: str = "DeeployState", + memoryLevelAnnotationPasses: List[NetworkOptimizationPass] = [], + tilerCls: Type[Tiler] = Tiler): + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + default_channels_first, deeployStateDir, memoryLevelAnnotationPasses) + self.tiler = tilerCls(deploymentPlatform.memoryHierarchy) + + @property + def worstCaseBufferSize(self): + maxAddr: Dict[str, int] = self.tiler.worstCaseBufferSize + + # WIESEP: Memory map form tiler does not include inputs and outputs + for node in (self.inputs() + self.outputs()): + maxAddr[node._memoryLevel] += np.prod(node.shape) * node._type.referencedType.typeWidth // 8 + + return maxAddr + + def tile(self, tilingSolution: Optional[TilingSolution] = None): + if tilingSolution is None: + schedule = self.scheduler(self.graph) + + self.tiler.setupModel(ctxt = self.ctxt, + schedule = schedule, + layerBinding = self.layerBinding, + targetMemoryLevelMapping = self.getTargetMemoryLevelMapping()) + tilingSolution = self.tiler.computeTilingSchedule(self.ctxt) + + # SCHEREMO: Annotate execution block with solution + for layer, pattern in zip(self.layerBinding.values(), tilingSolution): + layer.mapper.binder.executionBlock.patternMemoryConstraint = pattern + + # SCHEREMO: Code generation STUB + + def bind(self): + if not super().bind(): + return False + + self.tile() + return True + + +class TilerDeployerWrapper(NetworkDeployerWrapper): + + def __init__(self, deployer: Union[MemoryLevelAwareDeployer, 
MemoryDeployerWrapper], tilerCls: Type[Tiler] = Tiler): + super().__init__(deployer) + assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ + f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" + self.tiler = tilerCls(self.Platform.memoryHierarchy) + + @property + def worstCaseBufferSize(self): + maxAddr: Dict[str, int] = self.tiler.worstCaseBufferSize + + # WIESEP: Memory map form tiler does not include inputs and outputs + for node in (self.inputs() + self.outputs()): + maxAddr[node._memoryLevel] += np.prod(node.shape) * node._type.referencedType.typeWidth // 8 + + return maxAddr + + def tile(self, tilingSolution: Optional[TilingSolution] = None): + if tilingSolution is None: + schedule = self.scheduler(self.graph) + + self.tiler.setupModel(ctxt = self.ctxt, + schedule = schedule, + layerBinding = self.layerBinding, + targetMemoryLevelMapping = self.getTargetMemoryLevelMapping()) + tilingSolution = self.tiler.computeTilingSchedule(self.ctxt) + + # SCHEREMO: Annotate execution block with solution + for layer, pattern in zip(self.layerBinding.values(), tilingSolution): + layer.mapper.binder.executionBlock.patternMemoryConstraint = pattern + + # SCHEREMO: Code generation STUB + + def bind(self): + if not super().bind(): + return False + + self.tile() + return True + + +def TilingReadyNodeBindings(nodeBindings: List[NodeBinding], tileConstraint: TileConstraint) -> List[NodeBinding]: + ''' + Apply the TillingReadyNodeTemplate to the template of each NodeBinding. 
+ ''' + nodeBindingsCopy = copy.deepcopy(nodeBindings) #.copy() + for binding in nodeBindingsCopy: + binding.template.tileConstraint = tileConstraint + + return nodeBindingsCopy diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py new file mode 100644 index 0000000..40e596d --- /dev/null +++ b/Deeploy/TilingExtension/TilerModel.py @@ -0,0 +1,396 @@ +# ---------------------------------------------------------------------- +# +# File: TilerModel.py +# +# Last edited: 25.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

from dataclasses import dataclass
from pprint import pformat
from typing import Dict, List, Literal, Optional, Tuple, Union

import numpy as np
from ortools.constraint_solver.pywrapcp import IntExpr, IntVar, SolutionCollector, Solver

from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel

# Suffix used to disambiguate per-pattern copies of the same named variable.
_COPYIDXSUFFIX = "_copyIdx_"
# Solver time limit in milliseconds.
_SOLVERTIMEOUT = 60000


@dataclass
class AddConstraintStrategy:
    """Base class for strategies of adding constraints"""


@dataclass
class PerformanceHint(AddConstraintStrategy):
    """Constraints marked with PerformanceHint will be added only if the TilerModel is still valid after adding them.
    Constraints with higher priority will be tried first.
    """
    priority: int = 0


class TilerModel():
    """Thin wrapper around an or-tools CP solver for tiling problems.

    Collects variables, constraints (hard, per-memory-level, and optional
    performance hints) and objectives; variables are namespaced per pattern
    via a copy-index suffix so each pattern is an independent sub-problem.
    """

    def __init__(self, copyIdxSuffix: Optional[str] = None):

        self._model: Solver = Solver('CPSimple')
        self._objectives: List[Tuple[IntVar, bool]] = []  # (var, minimize?)
        self._constraints: List[IntExpr] = []
        self._memoryConstraints: List[Tuple[MemoryLevel, IntExpr]] = []
        self._performanceConstraints: List[Tuple[int, IntExpr]] = []  # (priority, expr)
        self._performanceMemoryConstraints: List[Tuple[int, Tuple[MemoryLevel, IntExpr]]] = []
        self._variables: Dict[str, IntVar] = {}

        # Current pattern index; used as the default variable-name suffix.
        self.copyIdx: int = 0
        self._copyIdxSuffix: str = copyIdxSuffix if copyIdxSuffix is not None else _COPYIDXSUFFIX
        # Populated once the model has been solved.
        self._collector: Optional[SolutionCollector] = None

    def _resolveVariable(self, var) -> int:
        """Return the solved value of var (ints pass through; 0 if unsolved)."""
        if isinstance(var, int):
            return var

        if self._collector is None:
            return 0

        # Read the variable from the last (best) recorded solution.
        return self._collector.Value(self._collector.SolutionCount() - 1, var)

    def _addVariable(self, name: str, lowerBound: int, upperBound: int) -> IntVar:
        """Create and register a new solver IntVar; name must be unique."""

        assert name not in self._variables.keys(), \
            f"Error while adding {name} variable in {self}, variable already exists."

        self._variables[name] = self._model.IntVar(int(lowerBound), int(upperBound), name)
        return self._variables[name]

    def _getSuffix(self, copyIdx: Optional[int]) -> str:
        """Build the copy-index suffix, defaulting to the current copyIdx."""
        if copyIdx is not None:
            varName = f"{self._copyIdxSuffix}{copyIdx}"
        else:
            varName = f"{self._copyIdxSuffix}{self.copyIdx}"
        return varName

    def getNameCopyIdx(self, variableName: str) -> Tuple[str, int]:
        """Split a suffixed variable name into (base name, copy index)."""
        splitList = variableName.split(self._copyIdxSuffix)
        varName = splitList[0]
        copyIdx = splitList[1]

        return (varName, int(copyIdx))

    def existsCopyIdx(self, name: str, copyIdx: Optional[int] = None) -> bool:
        """Check whether tensor `name` already has variables for this copy index."""

        if copyIdx is None:
            _copyIdx = self.copyIdx
        else:
            _copyIdx = copyIdx

        # A tensor is registered if either its element-count or its first
        # dimension variable exists under this copy index.
        varName1 = name + "_num_elements" + self._getSuffix(_copyIdx)
        varName2 = name + "_dim_0" + self._getSuffix(_copyIdx)
        ret = (varName1 in self._variables) or (varName2 in self._variables)
        return ret

    def addObjective(self, objective: IntVar, objectiveType: Union[Literal['maximize'], Literal['minimize']]):
        """Queue an objective; stored as (var, minimize-flag)."""
        if objectiveType == 'maximize':
            self._objectives.append((objective, False))
        else:
            self._objectives.append((objective, True))

    def addConstraint(self,
                      constraintExpression: IntExpr,
                      memoryLevel: Optional[MemoryLevel] = None,
                      strategy: Optional[AddConstraintStrategy] = None):
        """Queue a constraint, routed by optionality (PerformanceHint) and level."""
        if isinstance(strategy, PerformanceHint):
            if memoryLevel is None:
                self._performanceConstraints.append((strategy.priority, constraintExpression))
            else:
                self._performanceMemoryConstraints.append((strategy.priority, (memoryLevel, constraintExpression)))
        else:
            if memoryLevel is None:
                self._constraints.append(constraintExpression)
            else:
                self._memoryConstraints.append((memoryLevel, constraintExpression))

    def addVariable(self, name: str, lowerBound: int, upperBound: int, copyIdx: Optional[int] = None) -> IntVar:
        """Create an IntVar namespaced with the (current or given) copy index."""

        varName = name + self._getSuffix(copyIdx)
        return self._addVariable(varName, lowerBound, upperBound)

    def getVariable(self, name: str, copyIdx: Optional[int]
= None) -> IntVar: + varName = name + self._getSuffix(copyIdx) + return self._variables[varName] + + def getTensorDimVar(self, tensorName: str, dimIdx: int, copyIdx: Optional[int] = None): + + varName = f"{tensorName}_dim_{dimIdx}" + self._getSuffix(copyIdx) + + return self._variables[varName] + + def getTensorNumberOfEltVar(self, tensorName: str, copyIdx: Optional[int] = None): + + varName = f"{tensorName}_num_elements" + self._getSuffix(copyIdx) + + return self._variables[varName] + + def addTensorDimToModel(self, ctxt: NetworkContext, tensorName: str, copyIdx: Optional[int] = None): + ''' + Add every dimensions of an unseen tensors in the given list as Integer Variable of the Model and the context. + Namespace of added variables is: f"{tensor.name}_dim_{idx}". + ''' + tensor = ctxt.lookup(tensorName) + + for idx, dim in enumerate(tensor.shape): + + varName = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) + + if varName in self._variables: + continue + + self._addVariable(name = varName, lowerBound = 1, upperBound = dim) + + def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyIdx: Optional[int] = None): + ''' + For each tensor in the given list, add a variable equal to the product of dimension variables of this tensor. + Namespace of those new variables are f"{tensor.name}_num_elements". 
+ ''' + + varNameNumElt = f"{tensorName}_num_elements" + self._getSuffix(copyIdx) + if varNameNumElt in self._variables: + return + + tensor = ctxt.lookup(tensorName) + + tensorDimProductExpr = 1 + + for idx, _ in enumerate(tensor.shape): + + varNameIdx = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) + tensorDimProductExpr *= self._variables[varNameIdx] + + tensorDimProductVar = self._addVariable(name = varNameNumElt, + lowerBound = 1, + upperBound = np.prod(tensor.shape)) + + self._model.Add(tensorDimProductVar == tensorDimProductExpr) + + def addTransientBufferSizeToModel(self, tensorName: str, memorySizeExpr: Union[IntExpr, IntVar, int]) -> IntVar: + + transientName = tensorName + + if isinstance(memorySizeExpr, int): + lowerBound = memorySizeExpr + upperBound = memorySizeExpr + else: + lowerBound = memorySizeExpr.Min() + upperBound = memorySizeExpr.Max() + + transientSize = self._addVariable(name = transientName, lowerBound = lowerBound, upperBound = upperBound) + self._model.Add(transientSize == memorySizeExpr) + + return transientSize + + def addMinTileSizeConstraint(self, + operatorRepresentation: OperatorRepresentation, + variableName: str, + intvar: IntVar, + modulo: int, + prefix: str = "", + strategy: Optional[AddConstraintStrategy] = None) -> IntVar: + + tileSizeVar = self.addVariable(prefix + operatorRepresentation["nodeName"] + f"_{variableName}" + "_tileSize", + 1, operatorRepresentation[variableName]) + mulVar = self.addVariable(prefix + operatorRepresentation["nodeName"] + f"_{variableName}" + "_mul", 1, + operatorRepresentation[variableName]) + addVar = self.addVariable(prefix + operatorRepresentation["nodeName"] + f"_{variableName}" + "_add", 0, + operatorRepresentation[variableName]) + self.addConstraint(addVar <= tileSizeVar, strategy = strategy) + self.addConstraint(addVar >= (modulo * (tileSizeVar < intvar.Max())), strategy = strategy) + self.addConstraint(operatorRepresentation[variableName] == mulVar * tileSizeVar + addVar, strategy 
= strategy) + self.addConstraint(intvar == tileSizeVar, strategy = strategy) + + return addVar + + def addTileSizeDivisibleConstraint(self, + operatorRepresentation: OperatorRepresentation, + variableName: str, + intvar: IntVar, + modulo: int, + prefix: str = "", + strategy: Optional[AddConstraintStrategy] = None) -> IntVar: + + tileSizeVar = self.addVariable(prefix + operatorRepresentation["nodeName"] + f"_{variableName}" + "_tileSize", + 1, operatorRepresentation[variableName]) + mulVar = self.addVariable(prefix + operatorRepresentation["nodeName"] + f"_{variableName}" + "_mul", 1, + operatorRepresentation[variableName]) + + mulMulVar = self.addVariable(prefix + operatorRepresentation["nodeName"] + f"_{variableName}" + "_mulmul", 1, + operatorRepresentation[variableName]) + + addVar = self.addVariable(prefix + operatorRepresentation["nodeName"] + f"_{variableName}" + "_add", 0, + operatorRepresentation[variableName]) + + self.addConstraint(addVar <= tileSizeVar, + strategy = strategy) # Reminder tile has to be smaller than the regular tile + self.addConstraint(tileSizeVar == mulMulVar * modulo, strategy = strategy) + self.addConstraint(operatorRepresentation[variableName] == mulVar * tileSizeVar + addVar, strategy = strategy) + self.addConstraint(intvar == tileSizeVar, strategy = strategy) + + return addVar + + def debugConstraints(self) -> bool: + + offendingGeometricalConstraints: List[IntExpr] = [] + offendingMemoryConstraints: List[Tuple[MemoryLevel, IntVar, IntExpr]] = [] + + for constraint in self._constraints: + if self._model.CheckConstraint(constraint): + self._model.Add(constraint) + continue + + offendingGeometricalConstraints.append(constraint) + + if offendingGeometricalConstraints != []: + + errorMsg = [""] + errorMsg += ["ERROR: Some geometrical constraints are infeasible. 
A minimal set is this one:"] + errorMsg += [pformat(offendingGeometricalConstraints, indent = 2)] + raise RuntimeError(("\n").join(errorMsg)) + + self.copyIdx = 0 + + for idx, (memoryLevel, constraint) in enumerate(self._memoryConstraints): + constrExpr = constraint <= memoryLevel.size + + if self._model.CheckConstraint(constrExpr): + self._model.Add(constrExpr) + continue + + self.copyIdx += 1 + + offendingConstraint = self.addVariable(name = f"constraint", lowerBound = 0, upperBound = 2**63 - 1) + self._model.Add(offendingConstraint == constraint) + + offendingMemoryConstraints.append((memoryLevel, offendingConstraint, constraint)) + + collector = self._solveModel('min') + + minimumRequirement: Dict[str, int] = {} + for memLevel, memRequirement, _ in offendingMemoryConstraints: + value = collector.Value(collector.SolutionCount() - 1, memRequirement) + + if memLevel.name in minimumRequirement.keys(): + minimumRequirement[memLevel.name] = max(minimumRequirement[memLevel.name], value) + else: + minimumRequirement[memLevel.name] = value + + errorMsg = [""] + + for key, val in minimumRequirement.items(): + levelError = "" + levelError += f"ERROR: minimal memory requirement violated, please increase {key} to at least {val} or change constraints" + errorMsg.append(levelError) + + errorMsg.append(f"Offending constraints were") + for memLevel, _, constr in offendingMemoryConstraints: + errorMsg.append(f"{memLevel.size} >= {str(constr)}") + + if len(errorMsg) > 1: + raise RuntimeError(("\n").join(errorMsg)) + + return True + + def _trySetupConstraints(self,) -> bool: + for constraint in self._constraints: + self._model.Add(constraint) + + for memLevel, constraint in self._memoryConstraints: + constrExpr = constraint <= memLevel.size + self._model.Add(constrExpr) + + for _, performanceConstraint in sorted(self._performanceConstraints, reverse = True): + if self._model.CheckConstraint(performanceConstraint): + self._model.Add(performanceConstraint) + + for _, (memLevel, 
performanceConstraint) in sorted(self._performanceMemoryConstraints, reverse = True): + constrExpr = performanceConstraint <= memLevel.size + if self._model.CheckConstraint(constrExpr): + self._model.Add(constrExpr) + + return self._model.CheckConstraint(self._model.TrueConstraint()) + + def _setupObjective(self, patternIdx: Optional[int] = None): + + _patternIdx: int + + if patternIdx is None: + _patternIdx = 0 + else: + _patternIdx = patternIdx + + assert _patternIdx <= len( + self._objectives), f"patternIdx {_patternIdx} is larger than list of _objectives, {len(self._objectives)}" + + _objective = self._objectives[_patternIdx] + + if _objective[1] == False: + objective = self._model.Maximize(_objective[0], step = 1) + else: + objective = self._model.Minimize(_objective[0], step = 1) + + return objective + + def trySolveModel(self): + + solvable: bool = self._trySetupConstraints() + if not solvable: + self.debugConstraints() + + return self._solveModel() + + def _solveModel(self, solType: Union[Literal['min'], Literal['max']] = 'max') -> SolutionCollector: + variablesList = [var for varName, var in self._variables.items()] + + if solType == 'max': + decision_builder = self._model.Phase(variablesList, self._model.CHOOSE_FIRST_UNBOUND, + self._model.ASSIGN_MAX_VALUE) + else: + decision_builder = self._model.Phase(variablesList, self._model.CHOOSE_FIRST_UNBOUND, + self._model.ASSIGN_MIN_VALUE) + + collector = self._model.LastSolutionCollector() + + for var in variablesList: + collector.Add(var) + + objective = self._setupObjective() + + timelimit = self._model.TimeLimit(_SOLVERTIMEOUT) + + log = self._model.SearchLog(1000000) + + _ = self._model.Solve(decision_builder, [objective, collector, log, timelimit]) + + assert collector.SolutionCount() > 0, "Error in Tiler: No solution found" + + self._collector = collector + return self._collector diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py new file mode 100644 index 
0000000..6a2ff26 --- /dev/null +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -0,0 +1,370 @@ +# ---------------------------------------------------------------------- +# +# File: TilingCodegen.py +# +# Last edited: 11.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple, Type + +import numpy as np + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint + + +@dataclass +class MemoryTransfer(): + source: MemoryConstraint + destination: MemoryConstraint + + +@dataclass +class HyperRectangle(): + # position of the hyperrectangle in feature map space + offset: Tuple[int, ...] + # size of the hyperrectangle + dims: Tuple[int, ...] + + def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]): + assert len(offset) == len( + dims), f"HyperRectangle offset and dims for mismatching dimensions {offset} and {dims}" + + self.offset = offset + self.dims = dims + + +@dataclass +class AbsoluteHyperRectangle: + rectangle: HyperRectangle + absoluteOffset: Tuple[int, ...] 
+ + def __init__(self, rectangle: HyperRectangle, absoluteOffset: Tuple[int, ...]): + assert len(absoluteOffset) == len( + rectangle.offset + ), f"AsoluteHyperRectangle's absoluteOffset and rectangle's offset for mismatching dimensions {absoluteOffset} and {rectangle.offset}" + + self.rectangle = rectangle + self.absoluteOffset = absoluteOffset + + +@dataclass +class TilingSchedule(): + # the places to store input tiles + # Should have length numTiles + inputBaseOffsets: Dict[str, List[int]] + + # the places to store output tiles + # Should have length numTiles + outputBaseOffsets: Dict[str, List[int]] + + # the hypercubes to load in each step + # Should have length numTiles + inputLoadSchedule: List[Dict[str, HyperRectangle]] + + # the hypercubes to store in each step + # Should have length numTiles + outputLoadSchedule: List[Dict[str, HyperRectangle]] + + def __init__(self, inputBaseOffsets: Dict[str, List[int]], outputBaseOffsets: Dict[str, List[int]], + inputLoadSchedule: List[Dict[str, HyperRectangle]], outputLoadSchedule: List[Dict[str, + HyperRectangle]]): + + # assert len(inputLoadSchedule) == len(outputLoadSchedule), "Didn't get equal amount of input and output tiles!" 
+ + for scheduleStep in inputLoadSchedule: + for key in inputBaseOffsets: + assert key in scheduleStep.keys(), f"Key {key} is not in scheduleStep {scheduleStep}" + + for scheduleStep in outputLoadSchedule: + for key in outputBaseOffsets: + assert key in scheduleStep.keys(), f"Key {key} is not in scheduleStep {scheduleStep}" + + self.inputBaseOffsets = inputBaseOffsets + self.outputBaseOffsets = outputBaseOffsets + self.inputLoadSchedule = inputLoadSchedule + self.outputLoadSchedule = outputLoadSchedule + + def __repr__(self) -> str: + outStr = "" + outStr += f"inputBaseOffsets: \n{str(self.inputBaseOffsets)} \n" + outStr += f"outputBaseOffsets: \n{str(self.outputBaseOffsets)} \n" + + inSched = ("\n").join([str(step) for step in self.inputLoadSchedule]) + outSched = ("\n").join([str(step) for step in self.outputLoadSchedule]) + + outStr += f"inputLoadSchedule: \n{inSched} \n" + outStr += f"outputLoadSchedule: \n{outSched} \n" + + return outStr + + def __add__(self, other: TilingSchedule) -> TilingSchedule: + + assert isinstance(other, TilingSchedule), f"Other {other} is not a TilingSchedule" + + for key in self.inputBaseOffsets.keys(): + assert key in other.inputBaseOffsets.keys(), f"Other {other} has no key {key}" + for key in self.outputBaseOffsets.keys(): + assert key in other.outputBaseOffsets.keys(), f"Other {other} has no key {key}" + + for key in other.inputBaseOffsets.keys(): + assert key in self.inputBaseOffsets.keys(), f"Other {other} has no key {key}" + for key in other.outputBaseOffsets.keys(): + assert key in self.outputBaseOffsets.keys(), f"Other {other} has no key {key}" + + new = TilingSchedule(self.inputBaseOffsets.copy(), self.outputBaseOffsets.copy(), self.inputLoadSchedule.copy(), + self.outputLoadSchedule.copy()) + + new.inputLoadSchedule += other.inputLoadSchedule + new.outputLoadSchedule += other.outputLoadSchedule + + return new + + +@dataclass +class VariableReplacementScheme(): + perTileReplacements: Dict[str, List] + replacementTypes: 
Dict[str, Type[Pointer]] + + def __init__(self, perTileReplacements: Dict[str, List], replacementTypes: Dict[str, Type[Pointer]]): + assert len(perTileReplacements.keys()) == len( + replacementTypes.keys()), "Exactly all replacements must have one type" + + for key in perTileReplacements.keys(): + assert key in replacementTypes.keys(), "Keys must match!" + + self.perTileReplacements = perTileReplacements + self.replacementTypes = replacementTypes + + def __add__(self, other: VariableReplacementScheme) -> VariableReplacementScheme: + + assert isinstance(other, VariableReplacementScheme), f"Other {other} is not a VariableReplacementScheme" + + for key in self.perTileReplacements.keys(): + assert key in other.perTileReplacements.keys(), f"key {key} not in other {other}!" + for key in self.replacementTypes.keys(): + assert key in other.replacementTypes.keys(), f"key {key} not in other {other}!" + + for key in other.perTileReplacements.keys(): + assert key in self.perTileReplacements.keys(), f"key {key} not in other {other}!" + for key in other.replacementTypes.keys(): + assert key in self.replacementTypes.keys(), f"key {key} not in other {other}!" 
+ + new = VariableReplacementScheme(self.perTileReplacements.copy(), self.replacementTypes.copy()) + for key in self.perTileReplacements.keys(): + new.perTileReplacements[key] += other.perTileReplacements[key] + + return new + + +def minimizeVariableReplacement( + scheme: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, Dict]: + newPerTileRep = {} + newRepTypes = {} + + for key, value in scheme.perTileReplacements.items(): + if len(set(value)) > 1: + newPerTileRep[key] = scheme.perTileReplacements[key] + newRepTypes[key] = scheme.replacementTypes[key] + else: + operatorRepresentation[key] = value[0] + + return VariableReplacementScheme(newPerTileRep, newRepTypes), operatorRepresentation + + +def minimizeRectangleDims(hyperRectangle: HyperRectangle, + referenceBuffer: VariableBuffer) -> Tuple[HyperRectangle, HyperRectangle]: + + rectDims = hyperRectangle.dims + rectOffset = hyperRectangle.offset + shape = referenceBuffer.shape + newDims: List[int] = [] + newOffset: List[int] = [] + + newBaseline = [] + + reversedRectOffset = list(reversed(rectOffset)) + + # SCHEREMO: Collapse dimensions right to left + acc = 0 + for idx, (tileDim, bufDim) in enumerate(zip(reversed(rectDims), reversed(shape))): + + if tileDim == bufDim: + assert reversedRectOffset[idx] == 0, "Can't not tile a dimension and have an offset, tf" + + # SCHEREMO: Collapse if equal + if tileDim == bufDim and acc != 0: + acc *= tileDim + elif tileDim == bufDim and acc == 0: + acc = tileDim + elif tileDim != bufDim and acc != 0: + newDims.insert(0, acc * tileDim) + newBaseline.insert(0, acc * bufDim) + newOffset.insert(0, acc * reversedRectOffset[idx]) + acc = 0 + else: + newDims.insert(0, tileDim) + newBaseline.insert(0, bufDim) + newOffset.insert(0, reversedRectOffset[idx]) + + if acc > 1: + newDims.insert(0, acc) + newBaseline.insert(0, acc) + newOffset.insert(0, acc * reversedRectOffset[idx]) + + # JUNGVI: If the function collapsed all 
dimensions of the tensor, set it to dim 1 and offset 0 + if len(newDims) == 0: + newDims = [1] + newBaseline = [1] + newOffset = [0] + + newRect = HyperRectangle(tuple(newOffset), tuple(newDims)) + newBaseline = HyperRectangle(tuple([0] * len(newOffset)), tuple(newBaseline)) + + return newRect, newBaseline + + +def calculateRectangleOffset(hyperRectangle: HyperRectangle, referenceBuffer: VariableBuffer) -> int: + + minimalRect, baselineRect = minimizeRectangleDims(hyperRectangle, referenceBuffer) + + offsetMult = [1] + for dim in reversed(baselineRect.dims[1:]): + offsetMult.insert(0, dim * np.prod(offsetMult)) + + accOffset = 0 + for offsetIdx, mult in zip(minimalRect.offset, offsetMult): + accOffset += offsetIdx * mult + + return int(accOffset * (referenceBuffer._type.referencedType.typeWidth // 8)) + + +def extractTilingTransfer(tilingSolution: NodeMemoryConstraint, targetMemLevel: str, + tensorName: str) -> Optional[MemoryTransfer]: + + for name, constraint in tilingSolution.tensorMemoryConstraints.items(): + if not name == tensorName: + continue + + sourceIdx = 0 + + for idx, memConstraint in enumerate(constraint.memoryConstraints.values()): + if memConstraint.memoryLevel != targetMemLevel: + continue + + sourceIdx = idx + targetIdx = idx - 1 + + if sourceIdx == 0: + return None + + return MemoryTransfer( + list(constraint.memoryConstraints.values())[targetIdx], + list(constraint.memoryConstraints.values())[sourceIdx]) + + raise RuntimeError(f"{tensorName} not found in tilingSolution!") + + +def computeHyperRectangleList(memTrans: MemoryTransfer) -> List[HyperRectangle]: + + def nextElement(idxVec: List[int], targetVector: List[int]) -> Optional[List[int]]: + nextIdx = [] + + countUp = True + for vecIdx, maxIdx in zip(reversed(idxVec), reversed(targetVector)): + if countUp: + if vecIdx == maxIdx: + nextIdx.append(1) + else: + nextIdx.append(vecIdx + 1) + countUp = False + else: + nextIdx.append(vecIdx) + + nextIdx.reverse() + + if countUp: + return None + + 
return nextIdx + + def calculateCost(idxVec: Iterable[int], smallShape: Tuple[int]) -> List[int]: + outVec = [] + for idx, step in zip(idxVec, smallShape): + outVec.append((idx - 1) * step) + + return outVec + + def calculateDim(idxVec: List[int], numTiles: List[int], smallShape: Tuple[int], + largeShape: Tuple[int]) -> List[int]: + + dimVec = [] + + for idx, (vecIdx, maxIdx) in enumerate(zip(idxVec, numTiles)): + if vecIdx != maxIdx: + dimVec.append(smallShape[idx]) + continue + if largeShape[idx] % smallShape[idx] == 0: + dimVec.append(smallShape[idx]) + continue + dimVec.append(largeShape[idx] % smallShape[idx]) + + return dimVec + + src = memTrans.source + dst = memTrans.destination + + largeShape = src.shape + smallShape = dst.shape + + assert largeShape is not None, "Transfer shapes cannot be undefined!" + assert smallShape is not None, "Transfer shapes cannot be undefined!" + + assert len(smallShape) == len( + largeShape), f"Source and target of memory transfer {memTrans} don't have the same number of dimensions!" 
+ for idx, (dim1, dim2) in enumerate(zip(smallShape, largeShape)): + assert dim1 <= dim2, f"Large shape is smaller in dimension {idx}" + + totNumTiles = 1 + numTiles: List[int] = [] + + for (dim1, dim2) in zip(smallShape, largeShape): + totNumTiles *= np.ceil(dim2 / dim1) + numTiles.append(int(np.ceil(dim2 / dim1))) + + cubeList: List[HyperRectangle] = [] + idxVec = [1] * len(smallShape) + + for i in range(int(totNumTiles)): + offsetVec = calculateCost(idxVec, smallShape) + dimVec = calculateDim(idxVec, numTiles, smallShape, largeShape) + cubeList.append(HyperRectangle(tuple(offsetVec), tuple(dimVec))) + + nextVec = nextElement(idxVec, numTiles) + if nextVec is None: + break + idxVec = nextVec + + return cubeList diff --git a/Deeploy/TilingExtension/__init__.py b/Deeploy/TilingExtension/__init__.py new file mode 100644 index 0000000..b50445f --- /dev/null +++ b/Deeploy/TilingExtension/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/Deeploy/__init__.py b/Deeploy/__init__.py new file mode 100644 index 0000000..65ec809 --- /dev/null +++ b/Deeploy/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt new file mode 100644 index 0000000..68fe107 --- /dev/null +++ b/DeeployTest/CMakeLists.txt @@ -0,0 +1,46 @@ +include_directories(${GENERATED_SOURCE}) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_library(network OBJECT ${GENERATED_SOURCE}/Network.c) +target_link_libraries(network PUBLIC deeploylib) + +if(platform STREQUAL MemPool) + add_subdirectory(Platforms/MemPool) + +elseif(platform STREQUAL Generic) + add_subdirectory(Platforms/Generic) + +elseif(DEEPLOY_ARCH STREQUAL CMSIS) + if(platform STREQUAL QEMU-ARM) + add_subdirectory(Platforms/QEMU_ARM) + endif() + +elseif(DEEPLOY_ARCH STREQUAL PULP) + + + file(GLOB_RECURSE HEXLIST + "${GENERATED_SOURCE}/hex/**" + ) + list(TRANSFORM HEXLIST PREPEND "--config-opt=flash/content/partitions/readfs/files=") + set(GVSOCHEXINCLUDE ${HEXLIST}) + + if (NOT HEXLIST) + target_compile_options(network PUBLIC + -DNOFLASH + ) + endif() + # SCHEREMO: Waive warnings + # Pointer sign warnings are caused by the data width abstraction used in Deeploy. 
Signedness is not explicitly modelled, as this is handled by kernels + target_compile_options(network PRIVATE + -Wno-pointer-sign + ) + + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + set(USE_NEUREKA ON) + add_subdirectory(Platforms/Siracusa) + elseif(platform STREQUAL PULPOpen) + set(USE_NEUREKA OFF) + add_subdirectory(Platforms/PULPOpen) + endif() +endif() diff --git a/DeeployTest/Platforms/Generic/CMakeLists.txt b/DeeployTest/Platforms/Generic/CMakeLists.txt new file mode 100644 index 0000000..4f951ca --- /dev/null +++ b/DeeployTest/Platforms/Generic/CMakeLists.txt @@ -0,0 +1,12 @@ +set(ProjectId ${TESTNAME}) + +file(GLOB_RECURSE SOURCES + main.c +) + +link_directories(${ProjectId}/../../${GENERATED_SOURCE}) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES} ) +target_link_libraries(${ProjectId} PRIVATE network deeploylib) + +link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/Generic/main.c b/DeeployTest/Platforms/Generic/main.c new file mode 100644 index 0000000..ca8c665 --- /dev/null +++ b/DeeployTest/Platforms/Generic/main.c @@ -0,0 +1,74 @@ +/* ===================================================================== + * Title: main.c + * Description: + * + * Date: 15.03.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except pSrcA compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to pSrcA writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "Network.h" +#include "testinputs.h" +#include "testoutputs.h" + +int main() { + + printf("Initializing network...\r\n"); + + InitNetwork(0, 1); + + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } + + printf("Running network...\r\n"); + RunNetwork(0, 1); + + int32_t tot_err = 0; + uint32_t tot = 0; + int32_t diff; + int32_t expected, actual; + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + tot += DeeployNetwork_outputs_bytes[buf]; + for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) { + expected = ((char *)testOutputVector[buf])[i]; + actual = ((char *)DeeployNetwork_outputs[buf])[i]; + diff = expected - actual; + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); + } + } + } + + printf("Errors: %d out of %d \r\n", tot_err, tot); + + return tot_err; +} diff --git a/DeeployTest/Platforms/MemPool/CMakeLists.txt b/DeeployTest/Platforms/MemPool/CMakeLists.txt new file mode 100644 index 0000000..0b4b94b --- /dev/null +++ b/DeeployTest/Platforms/MemPool/CMakeLists.txt @@ -0,0 +1,15 @@ +set(ProjectId ${TESTNAME}) + +file(GLOB_RECURSE SOURCES + main.c +) + +link_directories(${ProjectId}/../../${GENERATED_SOURCE}) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES} ) +add_dependencies(${ProjectId} linkerscript) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) +add_banshee_simulation(${ProjectId}) + +link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/MemPool/main.c b/DeeployTest/Platforms/MemPool/main.c new file mode 100644 index 0000000..5603bf2 --- /dev/null +++ b/DeeployTest/Platforms/MemPool/main.c @@ -0,0 +1,230 @@ 
+/* ===================================================================== + * Title: main.c + * Description: + * + * Date: 15.03.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except pSrcA compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to pSrcA writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "Network.h" +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "testinputs.h" +#include "testoutputs.h" + +#ifndef BANSHEE_SIMULATION +dump(timer_cycle, 0); +dump(timer_instr, 1); +dump(expected, 2); +dump(actual, 3); +dump(diff, 4); +dump(info, 5); +dump(input0, 6); +dump(input1, 7); +#endif + +#ifndef NUM_THREADS +#define NUM_THREADS 1 +#endif + +static int8_t inference_done __attribute__((section(".l1"))) = 0; + +int main() { + uint32_t const core_id = mempool_get_core_id(); + + mempool_timer_t instr_init, instr_end; + mempool_timer_t timer_init, timer_end; +#ifdef BANSHEE_SIMULATION + uint32_t const num_cores = NUM_THREADS; +#else + uint32_t const num_cores = mempool_get_core_count(); +#endif + + mempool_init(core_id); + + // Initialize synchronization variables + mempool_barrier_init(core_id, num_cores); + +#ifdef BANSHEE_SIMULATION + if (core_id == num_cores - 1) { + printf("Network running on %ld of %ld cores\r\n", num_cores, + 
mempool_get_core_count()); + } +#endif + + // Wait until initialization is done + mempool_barrier(num_cores); + +#ifdef BANSHEE_SIMULATION + if (core_id == 0) { + printf("Init network...\r\n"); + } +#endif + + if (core_id == 0) { + InitNetwork(core_id, num_cores); + } + + // Wait until initialization is done + mempool_barrier(num_cores); + + if (core_id == 0) { +#if BANSHEE_SIMULATION + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } +#else + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + dma_memcpy_nonblocking(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } + do { + mempool_wait(16); + } while (!dma_done()); +#endif + } + + // Wait until initialization is done + mempool_barrier(num_cores); + +#ifdef BANSHEE_SIMULATION + if (core_id == 0) { + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + printf("testInputVector%d @ %p\r\n", buf, testInputVector[buf]); + printf("DeeployNetwork_input_%d @ %p and %lu elements\r\n", buf, + DeeployNetwork_inputs[buf], DeeployNetwork_inputs_bytes[buf]); + } + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + printf("testInputVector%d @ %p\r\n", buf, testOutputVector[buf]); + printf("DeeployNetwork_output_%d @ %p and %lu elements\r\n", buf, + DeeployNetwork_outputs[buf], DeeployNetwork_outputs_bytes[buf]); + } + printf("Running network...\r\n"); + } +#endif + + instr_init = read_csr(minstret); + timer_init = read_csr(mcycle); + if (core_id < NUM_THREADS) + RunNetwork(core_id, NUM_THREADS); + timer_end = read_csr(mcycle); + instr_end = read_csr(minstret); + + if (core_id == 0) { + inference_done = 1; + mempool_wait(64); + wake_up_all(); + } else { + while (inference_done == 0) { + mempool_wfi(); + } + } + + // Wait until all cores are done + mempool_barrier(num_cores); + + int32_t tot_err = 0; + int32_t diff; + int32_t 
expected, actual; + +#ifdef BANSHEE_SIMULATION + uint32_t tot = 0; + // Sequential part executed by all cores + if (core_id != 0) { + mempool_wfi(); + } + printf("RunNetwork(%3ld, %3ld) Runtime: %6ld cycles, %6ld instr\r\n", core_id, + num_cores, timer_end - timer_init, instr_end - instr_init - 2); + wake_up(core_id + 1); + + // Wait until all cores are done + mempool_barrier(num_cores); + + if (core_id == 0) { + printf("Done. Checking outputs...\r\n"); + + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + tot += DeeployNetwork_outputs_bytes[buf]; + for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) { + expected = ((char *)testOutputVector[buf])[i]; + actual = ((char *)DeeployNetwork_outputs[buf])[i]; + diff = expected - actual; + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); + } + } + } + printf("Errors: %ld out of %lu \r\n", tot_err, tot); + } +#else + if (core_id != 0) { + mempool_wfi(); + } + dump_timer_cycle(timer_end - timer_init); + dump_timer_instr(instr_end - instr_init - 2); + // printf("RunNetwork(%3ld, %3ld) Runtime: %6ld cycles, %6ld instr\r\n", + // core_id, num_cores, timer_end - timer_init, + // instr_end - instr_init - 2); + wake_up(core_id + 1); + + // Wait until all cores are done + mempool_barrier(num_cores); + + if (core_id == 0) { + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) { + expected = ((char *)testOutputVector[buf])[i]; + actual = ((char *)DeeployNetwork_outputs[buf])[i]; + + diff = expected - actual; + + if (diff) { + dump_expected((uint32_t)expected); + dump_actual((uint32_t)actual); + dump_diff((uint32_t)diff); + tot_err += 1; + } + } + } + + dump_info((uint32_t)tot_err); + } +#endif + + // Wait until all cores have finished + mempool_barrier(num_cores); + + return tot_err; +} diff --git 
a/DeeployTest/Platforms/PULPOpen/CMakeLists.txt b/DeeployTest/Platforms/PULPOpen/CMakeLists.txt new file mode 100644 index 0000000..d43eb1d --- /dev/null +++ b/DeeployTest/Platforms/PULPOpen/CMakeLists.txt @@ -0,0 +1,14 @@ +set(ProjectId ${TESTNAME}) + +file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c +) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) +target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) +add_gvsoc_emulation(${ProjectId}) + +link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/PULPOpen/inc/CycleCounter.h b/DeeployTest/Platforms/PULPOpen/inc/CycleCounter.h new file mode 100644 index 0000000..21d501e --- /dev/null +++ b/DeeployTest/Platforms/PULPOpen/inc/CycleCounter.h @@ -0,0 +1,42 @@ +/* ===================================================================== + * Title: CycleCounter.h + * Description: + * + * $Date: 26.07.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef CYCLECOUNTER +#define CYCLECOUNTER + +// Resets the internal cycle counter to zero +void ResetTimer(void); + +// Starts the internal cycle counter +void StartTimer(void); + +// Stops the internal cycle counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +unsigned int getCycles(void); + +#endif diff --git a/DeeployTest/Platforms/PULPOpen/src/CycleCounter.c b/DeeployTest/Platforms/PULPOpen/src/CycleCounter.c new file mode 100644 index 0000000..1f35d31 --- /dev/null +++ b/DeeployTest/Platforms/PULPOpen/src/CycleCounter.c @@ -0,0 +1,39 @@ +/* ===================================================================== + * Title: CycleCounter.c + * Description: + * + * $Date: 26.07.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "CycleCounter.h" +#include "pmsis.h" + +void ResetTimer() { + pi_perf_conf(PI_PERF_CYCLES); + pi_perf_cl_reset(); +} + +void StartTimer() { pi_perf_cl_start(); } + +void StopTimer() { pi_perf_cl_stop(); } + +unsigned int getCycles() { return pi_perf_cl_read(PI_PERF_CYCLES); } diff --git a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c new file mode 100644 index 0000000..525852d --- /dev/null +++ b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c @@ -0,0 +1,113 @@ +/* ===================================================================== + * Title: deeploytest.c + * Description: + * + * $Date: 26.12.2021 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CycleCounter.h" +#include "Network.h" +#include "dory_mem.h" +#include "pmsis.h" +#include "testinputs.h" +#include "testoutputs.h" + +struct pi_device cluster_dev; + +void main(void) { +#ifndef CI + printf("HELLO WORLD:\r\n"); +#endif + + struct pi_cluster_conf conf; + + pi_cluster_conf_init(&conf); + conf.id = 0; + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return; + + struct pi_cluster_task cluster_task_mem_init; + + pi_cluster_task(&cluster_task_mem_init, mem_init, NULL); + cluster_task_mem_init.stack_size = 5000; + cluster_task_mem_init.slave_stack_size = 3800; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task_mem_init); + + struct pi_cluster_task cluster_task; + + pi_cluster_task(&cluster_task, InitNetwork, NULL); + cluster_task.stack_size = 5000; + cluster_task.slave_stack_size = 3800; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + +#ifndef CI + printf("Initialized\r\n"); +#endif + for (int buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } +#ifndef CI + printf("Input copied\r\n"); +#endif + ResetTimer(); + StartTimer(); + + // RunNetwork(0, 1); + pi_cluster_task(&cluster_task, RunNetwork, NULL); + cluster_task.stack_size = 5000; + cluster_task.slave_stack_size = 3800; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); +#ifndef CI + printf("Run\r\n"); +#endif + StopTimer(); +#ifndef CI + printf("Output:\r\n"); +#endif + int32_t diff, tot_err; + tot_err = 0; + for (int buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + for (int i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) { + diff = ((char *)testOutputVector[buf])[i] - + ((char *)DeeployNetwork_outputs[buf])[i]; + if (diff) { + tot_err += 1; +#ifndef CI + printf("Expected: %i\t\t", ((int8_t *)testOutputVector[buf])[i]); + printf("Actual: %i \t\t", ((int8_t *)DeeployNetwork_outputs[buf])[i]); +#endif +#ifndef CI + 
printf("Diff: %i at Index %u \r\n", diff, i); +#endif + } else { + /* #ifndef CI */ + /* printf("\r\n"); */ + /* #endif */ + } + } + } + printf("Runtime: %u cycles\r\n", getCycles()); + printf("Errors: %u out of %u \r\n", tot_err, DeeployNetwork_output_0_len); +} diff --git a/DeeployTest/Platforms/QEMU_ARM/CMakeLists.txt b/DeeployTest/Platforms/QEMU_ARM/CMakeLists.txt new file mode 100644 index 0000000..9820396 --- /dev/null +++ b/DeeployTest/Platforms/QEMU_ARM/CMakeLists.txt @@ -0,0 +1,20 @@ +set(ProjectId ${TESTNAME}) + +file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c +) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES} ) +target_compile_options(${ProjectId} PRIVATE + -fno-inline-functions +) + +target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) + +add_binary_dump(${ProjectId}) +add_qemu_emulation(${ProjectId}) + +link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/QEMU_ARM/inc/CycleCounter.h b/DeeployTest/Platforms/QEMU_ARM/inc/CycleCounter.h new file mode 100644 index 0000000..e9ca1c1 --- /dev/null +++ b/DeeployTest/Platforms/QEMU_ARM/inc/CycleCounter.h @@ -0,0 +1,46 @@ +/* ===================================================================== + * Title: CycleCounter.h + * Description: + * + * $Date: 26.07.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CYCLECOUNTER +#define CYCLECOUNTER + +extern volatile unsigned int *DWT_CYCCNT; +extern volatile unsigned int *DWT_CONTROL; +extern volatile unsigned int *SCB_DEMCR; + +// Resets the internal cycle counter to zero +void ResetTimer(void); + +// Starts the internal cycle counter +void StartTimer(void); + +// Stops the internal cycle counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +unsigned int getCycles(void); + +#endif diff --git a/DeeployTest/Platforms/QEMU_ARM/src/CycleCounter.c b/DeeployTest/Platforms/QEMU_ARM/src/CycleCounter.c new file mode 100644 index 0000000..3fbdc12 --- /dev/null +++ b/DeeployTest/Platforms/QEMU_ARM/src/CycleCounter.c @@ -0,0 +1,64 @@ +/* ===================================================================== + * Title: CycleCounter.c + * Description: + * + * $Date: 26.07.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "CycleCounter.h" + +volatile unsigned int *DWT_CYCCNT = + (unsigned int *)0xE0001004; // address of the register +volatile unsigned int *DWT_CONTROL = + (unsigned int *)0xE0001000; // address of the register +volatile unsigned int *SCB_DEMCR = + (unsigned int *)0xE000EDFC; // address of the register + +static unsigned int prev_val = 0; +static int stopped = 0; + +void ResetTimer() { + + *SCB_DEMCR = *SCB_DEMCR | 0x01000000; + *DWT_CYCCNT = 0; // reset the counter + *DWT_CONTROL = 1; + stopped = 1; + prev_val = 0; +} + +void StartTimer() { + prev_val = *DWT_CYCCNT; + stopped = 0; +} + +void StopTimer() { + prev_val = *DWT_CYCCNT - prev_val; + stopped = 1; +} + +unsigned int getCycles() { + if (stopped) { + return prev_val; + } else { + return *DWT_CYCCNT - prev_val; + } +} diff --git a/DeeployTest/Platforms/QEMU_ARM/src/deeploytest.c b/DeeployTest/Platforms/QEMU_ARM/src/deeploytest.c new file mode 100644 index 0000000..0043941 --- /dev/null +++ b/DeeployTest/Platforms/QEMU_ARM/src/deeploytest.c @@ -0,0 +1,67 @@ +/* ===================================================================== + * Title: deeploytest.c + * Description: + * + * Date: 15.03.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Network.h" +#include "testinputs.h" +#include "testoutputs.h" +#include +#include + +int main(void) { + InitNetwork(0, 1); + + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } + + RunNetwork(0, 1); + + int32_t tot_err = 0; + uint32_t tot = 0; + int32_t diff; + int32_t expected, actual; + + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + tot += DeeployNetwork_outputs_bytes[buf]; + for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) { + expected = ((char *)testOutputVector[buf])[i]; + actual = ((char *)DeeployNetwork_outputs[buf])[i]; + diff = expected - actual; + + if (diff) { + tot_err += 1; + printf("Expected: %4ld ", expected); + printf("Actual: %4ld ", actual); + printf("Diff: %4ld at Index %12lu in Output %lu\r\n", diff, i, buf); + } + } + } + printf("Errors: %ld out of %ld \r\n", tot_err, tot); + + return tot_err; +} diff --git a/DeeployTest/Platforms/Siracusa/CMakeLists.txt b/DeeployTest/Platforms/Siracusa/CMakeLists.txt new file mode 100644 index 0000000..4fb009e --- /dev/null +++ b/DeeployTest/Platforms/Siracusa/CMakeLists.txt @@ -0,0 +1,15 @@ +set(ProjectId ${TESTNAME}) + +file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c +) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) +target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) +target_compile_options(${ProjectId} INTERFACE network) +add_gvsoc_emulation(${ProjectId}) + +link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/Siracusa/inc/CycleCounter.h b/DeeployTest/Platforms/Siracusa/inc/CycleCounter.h new file mode 100644 index 0000000..21d501e --- /dev/null +++ 
b/DeeployTest/Platforms/Siracusa/inc/CycleCounter.h @@ -0,0 +1,42 @@ +/* ===================================================================== + * Title: CycleCounter.h + * Description: + * + * $Date: 26.07.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CYCLECOUNTER +#define CYCLECOUNTER + +// Resets the internal cycle counter to zero +void ResetTimer(void); + +// Starts the internal cycle counter +void StartTimer(void); + +// Stops the internal cycle counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +unsigned int getCycles(void); + +#endif diff --git a/DeeployTest/Platforms/Siracusa/src/CycleCounter.c b/DeeployTest/Platforms/Siracusa/src/CycleCounter.c new file mode 100644 index 0000000..1f35d31 --- /dev/null +++ b/DeeployTest/Platforms/Siracusa/src/CycleCounter.c @@ -0,0 +1,39 @@ +/* ===================================================================== + * Title: CycleCounter.c + * Description: + * + * $Date: 26.07.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. 
+ * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "CycleCounter.h" +#include "pmsis.h" + +void ResetTimer() { + pi_perf_conf(PI_PERF_CYCLES); + pi_perf_cl_reset(); +} + +void StartTimer() { pi_perf_cl_start(); } + +void StopTimer() { pi_perf_cl_stop(); } + +unsigned int getCycles() { return pi_perf_cl_read(PI_PERF_CYCLES); } diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c new file mode 100644 index 0000000..c111055 --- /dev/null +++ b/DeeployTest/Platforms/Siracusa/src/deeploytest.c @@ -0,0 +1,129 @@ +/* ===================================================================== + * Title: deeploytest.c + * Description: + * + * $Date: 26.12.2021 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CycleCounter.h" +#include "Network.h" +#include "dory_mem.h" +#include "pmsis.h" +#include "testinputs.h" +#include "testoutputs.h" + +#define MAINSTACKSIZE 8000 +#define SLAVESTACKSIZE 3800 + +struct pi_device cluster_dev; + +void main(void) { +#ifndef CI + printf("HELLO WORLD:\r\n"); +#endif + struct pi_cluster_conf conf; + + pi_cluster_conf_init(&conf); + conf.id = 0; + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return; + + mem_init(); +#ifndef NOFLASH + open_fs(); +#endif + + printf("Intializing\r\n"); + + struct pi_cluster_task cluster_task; + + pi_cluster_task(&cluster_task, InitNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + +#ifndef CI + printf("Initialized\r\n"); +#endif + for (int buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + if (DeeployNetwork_inputs[buf] >= 0x10000000) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } + } + +#ifndef CI + printf("Input copied\r\n"); +#endif + // RunNetwork(0, 1); + pi_cluster_task(&cluster_task, RunNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + ResetTimer(); + StartTimer(); + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + StopTimer(); + +#ifndef CI + printf("Output:\r\n"); +#endif + int32_t diff, tot_err, tot_tested; + tot_err = 0; + tot_tested = 0; + char *compbuf; + for (int buf = 0; buf < 
DeeployNetwork_num_outputs; buf++) { + + if (DeeployNetwork_outputs[buf] < 0x1000000) { + compbuf = pi_l2_malloc(DeeployNetwork_outputs_bytes[buf]); + ram_read(compbuf, DeeployNetwork_outputs[buf], + DeeployNetwork_outputs_bytes[buf]); + } else { + compbuf = DeeployNetwork_outputs[buf]; + } + + for (int i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) { + diff = ((char *)testOutputVector[buf])[i] - ((char *)compbuf)[i]; + tot_tested++; + if (diff) { + tot_err += 1; +#ifndef CI + printf("Expected: %i\t\t", ((int8_t *)testOutputVector[buf])[i]); + printf("Actual: %i \t\t", ((int8_t *)compbuf)[i]); +#endif +#ifndef CI + printf("Diff: %i at Index %u \r\n", diff, i); +#endif + } else { + /* #ifndef CI */ + /* printf("\r\n"); */ + /* #endif */ + } + } + if (DeeployNetwork_outputs[buf] < 0x1000000) { + pi_l2_free(compbuf, DeeployNetwork_outputs_bytes[buf]); + } + } + printf("Runtime: %u cycles\r\n", getCycles()); + printf("Errors: %u out of %u \r\n", tot_err, tot_tested); +} diff --git a/DeeployTest/Tests/Adder/inputs.npz b/DeeployTest/Tests/Adder/inputs.npz new file mode 100644 index 0000000..e74c7e5 Binary files /dev/null and b/DeeployTest/Tests/Adder/inputs.npz differ diff --git a/DeeployTest/Tests/Adder/network.onnx b/DeeployTest/Tests/Adder/network.onnx new file mode 100644 index 0000000..851801a --- /dev/null +++ b/DeeployTest/Tests/Adder/network.onnx @@ -0,0 +1,28 @@ +pytorch1.11.0: +) + onnx::Add_0 + onnx::Add_12Add_0"Addtorch-jit-exportZ% + onnx::Add_0 + + + + +Z% + onnx::Add_1 + + + + +b +2 + + + + +j +2 + + + + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Adder/outputs.npz b/DeeployTest/Tests/Adder/outputs.npz new file mode 100644 index 0000000..9a8d668 Binary files /dev/null and b/DeeployTest/Tests/Adder/outputs.npz differ diff --git a/DeeployTest/Tests/Attention/activations.npz b/DeeployTest/Tests/Attention/activations.npz new file mode 100644 index 0000000..334682d Binary files /dev/null and b/DeeployTest/Tests/Attention/activations.npz 
differ diff --git a/DeeployTest/Tests/Attention/inputs.npz b/DeeployTest/Tests/Attention/inputs.npz new file mode 100644 index 0000000..2d7f202 Binary files /dev/null and b/DeeployTest/Tests/Attention/inputs.npz differ diff --git a/DeeployTest/Tests/Attention/network.onnx b/DeeployTest/Tests/Attention/network.onnx new file mode 100644 index 0000000..ace3b2e Binary files /dev/null and b/DeeployTest/Tests/Attention/network.onnx differ diff --git a/DeeployTest/Tests/Attention/outputs.npz b/DeeployTest/Tests/Attention/outputs.npz new file mode 100644 index 0000000..7ab305d Binary files /dev/null and b/DeeployTest/Tests/Attention/outputs.npz differ diff --git a/DeeployTest/Tests/EEGFormer/activations.npz b/DeeployTest/Tests/EEGFormer/activations.npz new file mode 100644 index 0000000..a18da9a Binary files /dev/null and b/DeeployTest/Tests/EEGFormer/activations.npz differ diff --git a/DeeployTest/Tests/EEGFormer/inputs.npz b/DeeployTest/Tests/EEGFormer/inputs.npz new file mode 100644 index 0000000..098b45d Binary files /dev/null and b/DeeployTest/Tests/EEGFormer/inputs.npz differ diff --git a/DeeployTest/Tests/EEGFormer/network.onnx b/DeeployTest/Tests/EEGFormer/network.onnx new file mode 100644 index 0000000..d81aba7 Binary files /dev/null and b/DeeployTest/Tests/EEGFormer/network.onnx differ diff --git a/DeeployTest/Tests/EEGFormer/outputs.npz b/DeeployTest/Tests/EEGFormer/outputs.npz new file mode 100644 index 0000000..d1b16fd Binary files /dev/null and b/DeeployTest/Tests/EEGFormer/outputs.npz differ diff --git a/DeeployTest/Tests/Hardswish/inputs.npz b/DeeployTest/Tests/Hardswish/inputs.npz new file mode 100644 index 0000000..b557b46 Binary files /dev/null and b/DeeployTest/Tests/Hardswish/inputs.npz differ diff --git a/DeeployTest/Tests/Hardswish/network.onnx b/DeeployTest/Tests/Hardswish/network.onnx new file mode 100644 index 0000000..070b36b Binary files /dev/null and b/DeeployTest/Tests/Hardswish/network.onnx differ diff --git 
a/DeeployTest/Tests/Hardswish/outputs.npz b/DeeployTest/Tests/Hardswish/outputs.npz new file mode 100644 index 0000000..c07ca45 Binary files /dev/null and b/DeeployTest/Tests/Hardswish/outputs.npz differ diff --git a/DeeployTest/Tests/ICCT/activations.npz b/DeeployTest/Tests/ICCT/activations.npz new file mode 100644 index 0000000..c8754c2 Binary files /dev/null and b/DeeployTest/Tests/ICCT/activations.npz differ diff --git a/DeeployTest/Tests/ICCT/inputs.npz b/DeeployTest/Tests/ICCT/inputs.npz new file mode 100644 index 0000000..15137ca Binary files /dev/null and b/DeeployTest/Tests/ICCT/inputs.npz differ diff --git a/DeeployTest/Tests/ICCT/network.onnx b/DeeployTest/Tests/ICCT/network.onnx new file mode 100644 index 0000000..ca200f7 Binary files /dev/null and b/DeeployTest/Tests/ICCT/network.onnx differ diff --git a/DeeployTest/Tests/ICCT/outputs.npz b/DeeployTest/Tests/ICCT/outputs.npz new file mode 100644 index 0000000..f0eef5d Binary files /dev/null and b/DeeployTest/Tests/ICCT/outputs.npz differ diff --git a/DeeployTest/Tests/ICCT_8/activations.npz b/DeeployTest/Tests/ICCT_8/activations.npz new file mode 100644 index 0000000..0c536d7 Binary files /dev/null and b/DeeployTest/Tests/ICCT_8/activations.npz differ diff --git a/DeeployTest/Tests/ICCT_8/inputs.npz b/DeeployTest/Tests/ICCT_8/inputs.npz new file mode 100644 index 0000000..15137ca Binary files /dev/null and b/DeeployTest/Tests/ICCT_8/inputs.npz differ diff --git a/DeeployTest/Tests/ICCT_8/network.onnx b/DeeployTest/Tests/ICCT_8/network.onnx new file mode 100644 index 0000000..9eab1e9 Binary files /dev/null and b/DeeployTest/Tests/ICCT_8/network.onnx differ diff --git a/DeeployTest/Tests/ICCT_8/outputs.npz b/DeeployTest/Tests/ICCT_8/outputs.npz new file mode 100644 index 0000000..e12ca13 Binary files /dev/null and b/DeeployTest/Tests/ICCT_8/outputs.npz differ diff --git a/DeeployTest/Tests/ICCT_ITA/activations.npz b/DeeployTest/Tests/ICCT_ITA/activations.npz new file mode 100644 index 0000000..fc1892a 
Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA/activations.npz differ diff --git a/DeeployTest/Tests/ICCT_ITA/inputs.npz b/DeeployTest/Tests/ICCT_ITA/inputs.npz new file mode 100644 index 0000000..15137ca Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA/inputs.npz differ diff --git a/DeeployTest/Tests/ICCT_ITA/network.onnx b/DeeployTest/Tests/ICCT_ITA/network.onnx new file mode 100644 index 0000000..73eaff6 Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA/network.onnx differ diff --git a/DeeployTest/Tests/ICCT_ITA/outputs.npz b/DeeployTest/Tests/ICCT_ITA/outputs.npz new file mode 100644 index 0000000..9532922 Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA/outputs.npz differ diff --git a/DeeployTest/Tests/ICCT_ITA_8/activations.npz b/DeeployTest/Tests/ICCT_ITA_8/activations.npz new file mode 100644 index 0000000..73ca450 Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA_8/activations.npz differ diff --git a/DeeployTest/Tests/ICCT_ITA_8/inputs.npz b/DeeployTest/Tests/ICCT_ITA_8/inputs.npz new file mode 100644 index 0000000..15137ca Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA_8/inputs.npz differ diff --git a/DeeployTest/Tests/ICCT_ITA_8/network.onnx b/DeeployTest/Tests/ICCT_ITA_8/network.onnx new file mode 100644 index 0000000..23594c5 Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA_8/network.onnx differ diff --git a/DeeployTest/Tests/ICCT_ITA_8/outputs.npz b/DeeployTest/Tests/ICCT_ITA_8/outputs.npz new file mode 100644 index 0000000..72c9498 Binary files /dev/null and b/DeeployTest/Tests/ICCT_ITA_8/outputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/activations.npz b/DeeployTest/Tests/MLPerf/AnomalyDetection/activations.npz new file mode 100644 index 0000000..63007a4 Binary files /dev/null and b/DeeployTest/Tests/MLPerf/AnomalyDetection/activations.npz differ diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/inputs.npz b/DeeployTest/Tests/MLPerf/AnomalyDetection/inputs.npz new file 
mode 100644 index 0000000..085c92c Binary files /dev/null and b/DeeployTest/Tests/MLPerf/AnomalyDetection/inputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/network.onnx b/DeeployTest/Tests/MLPerf/AnomalyDetection/network.onnx new file mode 100644 index 0000000..a153e2e Binary files /dev/null and b/DeeployTest/Tests/MLPerf/AnomalyDetection/network.onnx differ diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/outputs.npz b/DeeployTest/Tests/MLPerf/AnomalyDetection/outputs.npz new file mode 100644 index 0000000..ee0ac69 Binary files /dev/null and b/DeeployTest/Tests/MLPerf/AnomalyDetection/outputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/activations.npz b/DeeployTest/Tests/MLPerf/ImageClassification/activations.npz new file mode 100644 index 0000000..de8375b Binary files /dev/null and b/DeeployTest/Tests/MLPerf/ImageClassification/activations.npz differ diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/inputs.npz b/DeeployTest/Tests/MLPerf/ImageClassification/inputs.npz new file mode 100644 index 0000000..6fe7c5e Binary files /dev/null and b/DeeployTest/Tests/MLPerf/ImageClassification/inputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/network.onnx b/DeeployTest/Tests/MLPerf/ImageClassification/network.onnx new file mode 100644 index 0000000..58b297b Binary files /dev/null and b/DeeployTest/Tests/MLPerf/ImageClassification/network.onnx differ diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/outputs.npz b/DeeployTest/Tests/MLPerf/ImageClassification/outputs.npz new file mode 100644 index 0000000..c84aef6 Binary files /dev/null and b/DeeployTest/Tests/MLPerf/ImageClassification/outputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/KeywordSpotting/activations.npz b/DeeployTest/Tests/MLPerf/KeywordSpotting/activations.npz new file mode 100644 index 0000000..eca4e8a Binary files /dev/null and b/DeeployTest/Tests/MLPerf/KeywordSpotting/activations.npz differ diff --git 
a/DeeployTest/Tests/MLPerf/KeywordSpotting/inputs.npz b/DeeployTest/Tests/MLPerf/KeywordSpotting/inputs.npz new file mode 100644 index 0000000..dfd8fdf Binary files /dev/null and b/DeeployTest/Tests/MLPerf/KeywordSpotting/inputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/KeywordSpotting/network.onnx b/DeeployTest/Tests/MLPerf/KeywordSpotting/network.onnx new file mode 100644 index 0000000..50bb46f Binary files /dev/null and b/DeeployTest/Tests/MLPerf/KeywordSpotting/network.onnx differ diff --git a/DeeployTest/Tests/MLPerf/KeywordSpotting/outputs.npz b/DeeployTest/Tests/MLPerf/KeywordSpotting/outputs.npz new file mode 100644 index 0000000..22b48e5 Binary files /dev/null and b/DeeployTest/Tests/MLPerf/KeywordSpotting/outputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/activations.npz b/DeeployTest/Tests/MLPerf/VisualWakeWords/activations.npz new file mode 100644 index 0000000..db73ea7 Binary files /dev/null and b/DeeployTest/Tests/MLPerf/VisualWakeWords/activations.npz differ diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/inputs.npz b/DeeployTest/Tests/MLPerf/VisualWakeWords/inputs.npz new file mode 100644 index 0000000..9536601 Binary files /dev/null and b/DeeployTest/Tests/MLPerf/VisualWakeWords/inputs.npz differ diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/network.onnx b/DeeployTest/Tests/MLPerf/VisualWakeWords/network.onnx new file mode 100644 index 0000000..2e6740c Binary files /dev/null and b/DeeployTest/Tests/MLPerf/VisualWakeWords/network.onnx differ diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/outputs.npz b/DeeployTest/Tests/MLPerf/VisualWakeWords/outputs.npz new file mode 100644 index 0000000..f57b9b7 Binary files /dev/null and b/DeeployTest/Tests/MLPerf/VisualWakeWords/outputs.npz differ diff --git a/DeeployTest/Tests/MobileNetv2/activations.npz b/DeeployTest/Tests/MobileNetv2/activations.npz new file mode 100644 index 0000000..e0ee097 Binary files /dev/null and 
b/DeeployTest/Tests/MobileNetv2/activations.npz differ diff --git a/DeeployTest/Tests/MobileNetv2/inputs.npz b/DeeployTest/Tests/MobileNetv2/inputs.npz new file mode 100644 index 0000000..3be4fb4 Binary files /dev/null and b/DeeployTest/Tests/MobileNetv2/inputs.npz differ diff --git a/DeeployTest/Tests/MobileNetv2/network.onnx b/DeeployTest/Tests/MobileNetv2/network.onnx new file mode 100644 index 0000000..d71eb29 Binary files /dev/null and b/DeeployTest/Tests/MobileNetv2/network.onnx differ diff --git a/DeeployTest/Tests/MobileNetv2/outputs.npz b/DeeployTest/Tests/MobileNetv2/outputs.npz new file mode 100644 index 0000000..2737b78 Binary files /dev/null and b/DeeployTest/Tests/MobileNetv2/outputs.npz differ diff --git a/DeeployTest/Tests/MultIO/inputs.npz b/DeeployTest/Tests/MultIO/inputs.npz new file mode 100644 index 0000000..1da7119 Binary files /dev/null and b/DeeployTest/Tests/MultIO/inputs.npz differ diff --git a/DeeployTest/Tests/MultIO/network.onnx b/DeeployTest/Tests/MultIO/network.onnx new file mode 100644 index 0000000..fd11ca8 --- /dev/null +++ b/DeeployTest/Tests/MultIO/network.onnx @@ -0,0 +1,55 @@ +pytorch1.11.0: +) + onnx::Add_0 + onnx::Add_34Add_0"Add +) + onnx::Add_1 + onnx::Add_25Add_1"Addtorch-jit-exportZ% + onnx::Add_0 + + + + +Z% + onnx::Add_1 + + + + +Z% + onnx::Add_2 + + + + +Z% + onnx::Add_3 + + + + +b +4 + + + + +b +5 + + + + +j +4 + + + + +j +5 + + + + +B \ No newline at end of file diff --git a/DeeployTest/Tests/MultIO/outputs.npz b/DeeployTest/Tests/MultIO/outputs.npz new file mode 100644 index 0000000..4c93c92 Binary files /dev/null and b/DeeployTest/Tests/MultIO/outputs.npz differ diff --git a/DeeployTest/Tests/RQHardswish/inputs.npz b/DeeployTest/Tests/RQHardswish/inputs.npz new file mode 100644 index 0000000..b557b46 Binary files /dev/null and b/DeeployTest/Tests/RQHardswish/inputs.npz differ diff --git a/DeeployTest/Tests/RQHardswish/network.onnx b/DeeployTest/Tests/RQHardswish/network.onnx new file mode 100644 index 
0000000..799e25f Binary files /dev/null and b/DeeployTest/Tests/RQHardswish/network.onnx differ diff --git a/DeeployTest/Tests/RQHardswish/outputs.npz b/DeeployTest/Tests/RQHardswish/outputs.npz new file mode 100644 index 0000000..18c3cec Binary files /dev/null and b/DeeployTest/Tests/RQHardswish/outputs.npz differ diff --git a/DeeployTest/Tests/Transformer/activations.npz b/DeeployTest/Tests/Transformer/activations.npz new file mode 100644 index 0000000..5d4522e Binary files /dev/null and b/DeeployTest/Tests/Transformer/activations.npz differ diff --git a/DeeployTest/Tests/Transformer/inputs.npz b/DeeployTest/Tests/Transformer/inputs.npz new file mode 100644 index 0000000..763ab22 Binary files /dev/null and b/DeeployTest/Tests/Transformer/inputs.npz differ diff --git a/DeeployTest/Tests/Transformer/network.onnx b/DeeployTest/Tests/Transformer/network.onnx new file mode 100644 index 0000000..458c93f Binary files /dev/null and b/DeeployTest/Tests/Transformer/network.onnx differ diff --git a/DeeployTest/Tests/Transformer/outputs.npz b/DeeployTest/Tests/Transformer/outputs.npz new file mode 100644 index 0000000..c98520e Binary files /dev/null and b/DeeployTest/Tests/Transformer/outputs.npz differ diff --git a/DeeployTest/Tests/WaveFormer/inputs.npz b/DeeployTest/Tests/WaveFormer/inputs.npz new file mode 100644 index 0000000..9856065 Binary files /dev/null and b/DeeployTest/Tests/WaveFormer/inputs.npz differ diff --git a/DeeployTest/Tests/WaveFormer/network.onnx b/DeeployTest/Tests/WaveFormer/network.onnx new file mode 100644 index 0000000..3123441 Binary files /dev/null and b/DeeployTest/Tests/WaveFormer/network.onnx differ diff --git a/DeeployTest/Tests/WaveFormer/outputs.npz b/DeeployTest/Tests/WaveFormer/outputs.npz new file mode 100644 index 0000000..aa86f3a Binary files /dev/null and b/DeeployTest/Tests/WaveFormer/outputs.npz differ diff --git a/DeeployTest/Tests/iSoftmax/activations.npz b/DeeployTest/Tests/iSoftmax/activations.npz new file mode 100644 index 
0000000..15cb0ec Binary files /dev/null and b/DeeployTest/Tests/iSoftmax/activations.npz differ diff --git a/DeeployTest/Tests/iSoftmax/inputs.npz b/DeeployTest/Tests/iSoftmax/inputs.npz new file mode 100644 index 0000000..1b3e56a Binary files /dev/null and b/DeeployTest/Tests/iSoftmax/inputs.npz differ diff --git a/DeeployTest/Tests/iSoftmax/network.onnx b/DeeployTest/Tests/iSoftmax/network.onnx new file mode 100644 index 0000000..34b8cb8 Binary files /dev/null and b/DeeployTest/Tests/iSoftmax/network.onnx differ diff --git a/DeeployTest/Tests/iSoftmax/outputs.npz b/DeeployTest/Tests/iSoftmax/outputs.npz new file mode 100644 index 0000000..f178e2e Binary files /dev/null and b/DeeployTest/Tests/iSoftmax/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama1/activations.npz b/DeeployTest/Tests/microLlama/microLlama1/activations.npz new file mode 100644 index 0000000..4eced1b Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama1/inputs.npz b/DeeployTest/Tests/microLlama/microLlama1/inputs.npz new file mode 100644 index 0000000..a5016cd Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama1/network.onnx b/DeeployTest/Tests/microLlama/microLlama1/network.onnx new file mode 100644 index 0000000..72b1566 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama1/outputs.npz b/DeeployTest/Tests/microLlama/microLlama1/outputs.npz new file mode 100644 index 0000000..91a04c2 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama128/activations.npz b/DeeployTest/Tests/microLlama/microLlama128/activations.npz new file mode 100644 index 0000000..2c1389a Binary files /dev/null and 
b/DeeployTest/Tests/microLlama/microLlama128/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama128/inputs.npz b/DeeployTest/Tests/microLlama/microLlama128/inputs.npz new file mode 100644 index 0000000..92b238e Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama128/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama128/network.onnx b/DeeployTest/Tests/microLlama/microLlama128/network.onnx new file mode 100644 index 0000000..81dafbf Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama128/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama128/outputs.npz b/DeeployTest/Tests/microLlama/microLlama128/outputs.npz new file mode 100644 index 0000000..6910aea Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama128/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama16/activations.npz b/DeeployTest/Tests/microLlama/microLlama16/activations.npz new file mode 100644 index 0000000..00f0440 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama16/inputs.npz b/DeeployTest/Tests/microLlama/microLlama16/inputs.npz new file mode 100644 index 0000000..0d3be3c Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama16/network.onnx b/DeeployTest/Tests/microLlama/microLlama16/network.onnx new file mode 100644 index 0000000..1bba124 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama16/outputs.npz b/DeeployTest/Tests/microLlama/microLlama16/outputs.npz new file mode 100644 index 0000000..c9b9f37 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/activations.npz 
b/DeeployTest/Tests/microLlama/microLlama16_parallel/activations.npz new file mode 100644 index 0000000..97390f9 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16_parallel/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/inputs.npz b/DeeployTest/Tests/microLlama/microLlama16_parallel/inputs.npz new file mode 100644 index 0000000..fb64a02 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16_parallel/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/network.onnx b/DeeployTest/Tests/microLlama/microLlama16_parallel/network.onnx new file mode 100644 index 0000000..eb88cb4 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16_parallel/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/outputs.npz b/DeeployTest/Tests/microLlama/microLlama16_parallel/outputs.npz new file mode 100644 index 0000000..3821b10 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama16_parallel/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/activations.npz b/DeeployTest/Tests/microLlama/microLlama1_parallel/activations.npz new file mode 100644 index 0000000..d448495 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1_parallel/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/inputs.npz b/DeeployTest/Tests/microLlama/microLlama1_parallel/inputs.npz new file mode 100644 index 0000000..5c13ef4 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1_parallel/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/network.onnx b/DeeployTest/Tests/microLlama/microLlama1_parallel/network.onnx new file mode 100644 index 0000000..e64307b Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1_parallel/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/outputs.npz 
b/DeeployTest/Tests/microLlama/microLlama1_parallel/outputs.npz new file mode 100644 index 0000000..2a0210b Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama1_parallel/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama2/activations.npz b/DeeployTest/Tests/microLlama/microLlama2/activations.npz new file mode 100644 index 0000000..5afcee7 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama2/inputs.npz b/DeeployTest/Tests/microLlama/microLlama2/inputs.npz new file mode 100644 index 0000000..a55454c Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama2/network.onnx b/DeeployTest/Tests/microLlama/microLlama2/network.onnx new file mode 100644 index 0000000..54ac223 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama2/outputs.npz b/DeeployTest/Tests/microLlama/microLlama2/outputs.npz new file mode 100644 index 0000000..88b5373 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama256/activations.npz b/DeeployTest/Tests/microLlama/microLlama256/activations.npz new file mode 100644 index 0000000..6ded3cd Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama256/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama256/inputs.npz b/DeeployTest/Tests/microLlama/microLlama256/inputs.npz new file mode 100644 index 0000000..e149a13 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama256/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama256/network.onnx b/DeeployTest/Tests/microLlama/microLlama256/network.onnx new file mode 100644 index 0000000..b49bd11 Binary files /dev/null and 
b/DeeployTest/Tests/microLlama/microLlama256/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama256/outputs.npz b/DeeployTest/Tests/microLlama/microLlama256/outputs.npz new file mode 100644 index 0000000..cd3ec47 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama256/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/activations.npz b/DeeployTest/Tests/microLlama/microLlama2_parallel/activations.npz new file mode 100644 index 0000000..71303d7 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2_parallel/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/inputs.npz b/DeeployTest/Tests/microLlama/microLlama2_parallel/inputs.npz new file mode 100644 index 0000000..a27aad2 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2_parallel/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/network.onnx b/DeeployTest/Tests/microLlama/microLlama2_parallel/network.onnx new file mode 100644 index 0000000..eda2dc7 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2_parallel/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/outputs.npz b/DeeployTest/Tests/microLlama/microLlama2_parallel/outputs.npz new file mode 100644 index 0000000..c940e96 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama2_parallel/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama32/activations.npz b/DeeployTest/Tests/microLlama/microLlama32/activations.npz new file mode 100644 index 0000000..b64fd15 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama32/inputs.npz b/DeeployTest/Tests/microLlama/microLlama32/inputs.npz new file mode 100644 index 0000000..4ac1e10 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32/inputs.npz differ diff --git 
a/DeeployTest/Tests/microLlama/microLlama32/network.onnx b/DeeployTest/Tests/microLlama/microLlama32/network.onnx new file mode 100644 index 0000000..029720f Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama32/outputs.npz b/DeeployTest/Tests/microLlama/microLlama32/outputs.npz new file mode 100644 index 0000000..c22b1ca Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/activations.npz b/DeeployTest/Tests/microLlama/microLlama32_parallel/activations.npz new file mode 100644 index 0000000..3dc6f83 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32_parallel/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/inputs.npz b/DeeployTest/Tests/microLlama/microLlama32_parallel/inputs.npz new file mode 100644 index 0000000..110ef3a Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32_parallel/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/network.onnx b/DeeployTest/Tests/microLlama/microLlama32_parallel/network.onnx new file mode 100644 index 0000000..f947f1d Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32_parallel/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/outputs.npz b/DeeployTest/Tests/microLlama/microLlama32_parallel/outputs.npz new file mode 100644 index 0000000..8358f7c Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama32_parallel/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama4/activations.npz b/DeeployTest/Tests/microLlama/microLlama4/activations.npz new file mode 100644 index 0000000..c3a388e Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama4/inputs.npz 
b/DeeployTest/Tests/microLlama/microLlama4/inputs.npz new file mode 100644 index 0000000..fdca163 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama4/network.onnx b/DeeployTest/Tests/microLlama/microLlama4/network.onnx new file mode 100644 index 0000000..44bcffb Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama4/outputs.npz b/DeeployTest/Tests/microLlama/microLlama4/outputs.npz new file mode 100644 index 0000000..d3ea38c Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/activations.npz b/DeeployTest/Tests/microLlama/microLlama4_parallel/activations.npz new file mode 100644 index 0000000..d343816 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4_parallel/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/inputs.npz b/DeeployTest/Tests/microLlama/microLlama4_parallel/inputs.npz new file mode 100644 index 0000000..bc07090 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4_parallel/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/network.onnx b/DeeployTest/Tests/microLlama/microLlama4_parallel/network.onnx new file mode 100644 index 0000000..fd5bf7d Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4_parallel/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/outputs.npz b/DeeployTest/Tests/microLlama/microLlama4_parallel/outputs.npz new file mode 100644 index 0000000..304696b Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama4_parallel/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama64/activations.npz b/DeeployTest/Tests/microLlama/microLlama64/activations.npz new file mode 100644 index 
0000000..257b492 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama64/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama64/inputs.npz b/DeeployTest/Tests/microLlama/microLlama64/inputs.npz new file mode 100644 index 0000000..bf27ed6 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama64/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama64/network.onnx b/DeeployTest/Tests/microLlama/microLlama64/network.onnx new file mode 100644 index 0000000..5a8e6b5 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama64/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama64/outputs.npz b/DeeployTest/Tests/microLlama/microLlama64/outputs.npz new file mode 100644 index 0000000..b9aa7cd Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama64/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/activations.npz b/DeeployTest/Tests/microLlama/microLlama64_parallel/activations.npz new file mode 100644 index 0000000..3c5365a Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama64_parallel/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/inputs.npz b/DeeployTest/Tests/microLlama/microLlama64_parallel/inputs.npz new file mode 100644 index 0000000..0919189 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama64_parallel/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/network.onnx b/DeeployTest/Tests/microLlama/microLlama64_parallel/network.onnx new file mode 100644 index 0000000..baf409e Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama64_parallel/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/outputs.npz b/DeeployTest/Tests/microLlama/microLlama64_parallel/outputs.npz new file mode 100644 index 0000000..4d22226 Binary files /dev/null and 
b/DeeployTest/Tests/microLlama/microLlama64_parallel/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama8/activations.npz b/DeeployTest/Tests/microLlama/microLlama8/activations.npz new file mode 100644 index 0000000..455a65e Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama8/inputs.npz b/DeeployTest/Tests/microLlama/microLlama8/inputs.npz new file mode 100644 index 0000000..6e7a07f Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama8/network.onnx b/DeeployTest/Tests/microLlama/microLlama8/network.onnx new file mode 100644 index 0000000..a00fd38 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8/network.onnx differ diff --git a/DeeployTest/Tests/microLlama/microLlama8/outputs.npz b/DeeployTest/Tests/microLlama/microLlama8/outputs.npz new file mode 100644 index 0000000..d9ce479 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8/outputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama8_parallel/activations.npz b/DeeployTest/Tests/microLlama/microLlama8_parallel/activations.npz new file mode 100644 index 0000000..802daca Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8_parallel/activations.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama8_parallel/inputs.npz b/DeeployTest/Tests/microLlama/microLlama8_parallel/inputs.npz new file mode 100644 index 0000000..fd10e8b Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8_parallel/inputs.npz differ diff --git a/DeeployTest/Tests/microLlama/microLlama8_parallel/network.onnx b/DeeployTest/Tests/microLlama/microLlama8_parallel/network.onnx new file mode 100644 index 0000000..0862963 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8_parallel/network.onnx differ diff --git 
a/DeeployTest/Tests/microLlama/microLlama8_parallel/outputs.npz b/DeeployTest/Tests/microLlama/microLlama8_parallel/outputs.npz new file mode 100644 index 0000000..5f08387 Binary files /dev/null and b/DeeployTest/Tests/microLlama/microLlama8_parallel/outputs.npz differ diff --git a/DeeployTest/Tests/miniMobileNet/activations.npz b/DeeployTest/Tests/miniMobileNet/activations.npz new file mode 100644 index 0000000..afafe6b Binary files /dev/null and b/DeeployTest/Tests/miniMobileNet/activations.npz differ diff --git a/DeeployTest/Tests/miniMobileNet/inputs.npz b/DeeployTest/Tests/miniMobileNet/inputs.npz new file mode 100644 index 0000000..c513978 Binary files /dev/null and b/DeeployTest/Tests/miniMobileNet/inputs.npz differ diff --git a/DeeployTest/Tests/miniMobileNet/network.onnx b/DeeployTest/Tests/miniMobileNet/network.onnx new file mode 100644 index 0000000..9a07df6 Binary files /dev/null and b/DeeployTest/Tests/miniMobileNet/network.onnx differ diff --git a/DeeployTest/Tests/miniMobileNet/outputs.npz b/DeeployTest/Tests/miniMobileNet/outputs.npz new file mode 100644 index 0000000..f9ae6b9 Binary files /dev/null and b/DeeployTest/Tests/miniMobileNet/outputs.npz differ diff --git a/DeeployTest/Tests/miniMobileNetv2/activations.npz b/DeeployTest/Tests/miniMobileNetv2/activations.npz new file mode 100644 index 0000000..5b7cb1e Binary files /dev/null and b/DeeployTest/Tests/miniMobileNetv2/activations.npz differ diff --git a/DeeployTest/Tests/miniMobileNetv2/inputs.npz b/DeeployTest/Tests/miniMobileNetv2/inputs.npz new file mode 100644 index 0000000..7b265cb Binary files /dev/null and b/DeeployTest/Tests/miniMobileNetv2/inputs.npz differ diff --git a/DeeployTest/Tests/miniMobileNetv2/network.onnx b/DeeployTest/Tests/miniMobileNetv2/network.onnx new file mode 100644 index 0000000..0ae5760 Binary files /dev/null and b/DeeployTest/Tests/miniMobileNetv2/network.onnx differ diff --git a/DeeployTest/Tests/miniMobileNetv2/outputs.npz 
b/DeeployTest/Tests/miniMobileNetv2/outputs.npz new file mode 100644 index 0000000..0766c30 Binary files /dev/null and b/DeeployTest/Tests/miniMobileNetv2/outputs.npz differ diff --git a/DeeployTest/Tests/simpleCNN/activations.npz b/DeeployTest/Tests/simpleCNN/activations.npz new file mode 100644 index 0000000..d6d4ccb Binary files /dev/null and b/DeeployTest/Tests/simpleCNN/activations.npz differ diff --git a/DeeployTest/Tests/simpleCNN/inputs.npz b/DeeployTest/Tests/simpleCNN/inputs.npz new file mode 100644 index 0000000..2e044e1 Binary files /dev/null and b/DeeployTest/Tests/simpleCNN/inputs.npz differ diff --git a/DeeployTest/Tests/simpleCNN/network.onnx b/DeeployTest/Tests/simpleCNN/network.onnx new file mode 100644 index 0000000..0706f3d Binary files /dev/null and b/DeeployTest/Tests/simpleCNN/network.onnx differ diff --git a/DeeployTest/Tests/simpleCNN/outputs.npz b/DeeployTest/Tests/simpleCNN/outputs.npz new file mode 100644 index 0000000..d332629 Binary files /dev/null and b/DeeployTest/Tests/simpleCNN/outputs.npz differ diff --git a/DeeployTest/Tests/simpleRegression/activations.npz b/DeeployTest/Tests/simpleRegression/activations.npz new file mode 100644 index 0000000..675a720 Binary files /dev/null and b/DeeployTest/Tests/simpleRegression/activations.npz differ diff --git a/DeeployTest/Tests/simpleRegression/inputs.npz b/DeeployTest/Tests/simpleRegression/inputs.npz new file mode 100644 index 0000000..8ef8ffa Binary files /dev/null and b/DeeployTest/Tests/simpleRegression/inputs.npz differ diff --git a/DeeployTest/Tests/simpleRegression/network.onnx b/DeeployTest/Tests/simpleRegression/network.onnx new file mode 100644 index 0000000..903c745 Binary files /dev/null and b/DeeployTest/Tests/simpleRegression/network.onnx differ diff --git a/DeeployTest/Tests/simpleRegression/outputs.npz b/DeeployTest/Tests/simpleRegression/outputs.npz new file mode 100644 index 0000000..fc9e21d Binary files /dev/null and b/DeeployTest/Tests/simpleRegression/outputs.npz 
differ diff --git a/DeeployTest/Tests/test1DConvolution/inputs.npz b/DeeployTest/Tests/test1DConvolution/inputs.npz new file mode 100644 index 0000000..62d222d Binary files /dev/null and b/DeeployTest/Tests/test1DConvolution/inputs.npz differ diff --git a/DeeployTest/Tests/test1DConvolution/network.onnx b/DeeployTest/Tests/test1DConvolution/network.onnx new file mode 100644 index 0000000..e779a04 Binary files /dev/null and b/DeeployTest/Tests/test1DConvolution/network.onnx differ diff --git a/DeeployTest/Tests/test1DConvolution/outputs.npz b/DeeployTest/Tests/test1DConvolution/outputs.npz new file mode 100644 index 0000000..620ac3d Binary files /dev/null and b/DeeployTest/Tests/test1DConvolution/outputs.npz differ diff --git a/DeeployTest/Tests/test1DDWConvolution/inputs.npz b/DeeployTest/Tests/test1DDWConvolution/inputs.npz new file mode 100644 index 0000000..7d11dca Binary files /dev/null and b/DeeployTest/Tests/test1DDWConvolution/inputs.npz differ diff --git a/DeeployTest/Tests/test1DDWConvolution/network.onnx b/DeeployTest/Tests/test1DDWConvolution/network.onnx new file mode 100644 index 0000000..568a10f Binary files /dev/null and b/DeeployTest/Tests/test1DDWConvolution/network.onnx differ diff --git a/DeeployTest/Tests/test1DDWConvolution/outputs.npz b/DeeployTest/Tests/test1DDWConvolution/outputs.npz new file mode 100644 index 0000000..19b37df Binary files /dev/null and b/DeeployTest/Tests/test1DDWConvolution/outputs.npz differ diff --git a/DeeployTest/Tests/test1DPad/inputs.npz b/DeeployTest/Tests/test1DPad/inputs.npz new file mode 100644 index 0000000..3f7a34f Binary files /dev/null and b/DeeployTest/Tests/test1DPad/inputs.npz differ diff --git a/DeeployTest/Tests/test1DPad/network.onnx b/DeeployTest/Tests/test1DPad/network.onnx new file mode 100644 index 0000000..51602d5 Binary files /dev/null and b/DeeployTest/Tests/test1DPad/network.onnx differ diff --git a/DeeployTest/Tests/test1DPad/outputs.npz b/DeeployTest/Tests/test1DPad/outputs.npz new file mode 
100644 index 0000000..009ac43 Binary files /dev/null and b/DeeployTest/Tests/test1DPad/outputs.npz differ diff --git a/DeeployTest/Tests/test2DConvolution/inputs.npz b/DeeployTest/Tests/test2DConvolution/inputs.npz new file mode 100644 index 0000000..a341927 Binary files /dev/null and b/DeeployTest/Tests/test2DConvolution/inputs.npz differ diff --git a/DeeployTest/Tests/test2DConvolution/network.onnx b/DeeployTest/Tests/test2DConvolution/network.onnx new file mode 100644 index 0000000..fe9b85d Binary files /dev/null and b/DeeployTest/Tests/test2DConvolution/network.onnx differ diff --git a/DeeployTest/Tests/test2DConvolution/outputs.npz b/DeeployTest/Tests/test2DConvolution/outputs.npz new file mode 100644 index 0000000..31029fb Binary files /dev/null and b/DeeployTest/Tests/test2DConvolution/outputs.npz differ diff --git a/DeeployTest/Tests/test2DDWConvolution/inputs.npz b/DeeployTest/Tests/test2DDWConvolution/inputs.npz new file mode 100644 index 0000000..150eac0 Binary files /dev/null and b/DeeployTest/Tests/test2DDWConvolution/inputs.npz differ diff --git a/DeeployTest/Tests/test2DDWConvolution/network.onnx b/DeeployTest/Tests/test2DDWConvolution/network.onnx new file mode 100644 index 0000000..dd49570 Binary files /dev/null and b/DeeployTest/Tests/test2DDWConvolution/network.onnx differ diff --git a/DeeployTest/Tests/test2DDWConvolution/outputs.npz b/DeeployTest/Tests/test2DDWConvolution/outputs.npz new file mode 100644 index 0000000..2f6b323 Binary files /dev/null and b/DeeployTest/Tests/test2DDWConvolution/outputs.npz differ diff --git a/DeeployTest/Tests/test2DPad/inputs.npz b/DeeployTest/Tests/test2DPad/inputs.npz new file mode 100644 index 0000000..65d1aa8 Binary files /dev/null and b/DeeployTest/Tests/test2DPad/inputs.npz differ diff --git a/DeeployTest/Tests/test2DPad/network.onnx b/DeeployTest/Tests/test2DPad/network.onnx new file mode 100644 index 0000000..97284cf Binary files /dev/null and b/DeeployTest/Tests/test2DPad/network.onnx differ diff --git 
a/DeeployTest/Tests/test2DPad/outputs.npz b/DeeployTest/Tests/test2DPad/outputs.npz new file mode 100644 index 0000000..2b94f87 Binary files /dev/null and b/DeeployTest/Tests/test2DPad/outputs.npz differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/activations.npz b/DeeployTest/Tests/test2DRequantizedConv/activations.npz new file mode 100644 index 0000000..7b4b8c7 Binary files /dev/null and b/DeeployTest/Tests/test2DRequantizedConv/activations.npz differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/inputs.npz b/DeeployTest/Tests/test2DRequantizedConv/inputs.npz new file mode 100644 index 0000000..e49b356 Binary files /dev/null and b/DeeployTest/Tests/test2DRequantizedConv/inputs.npz differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/network.onnx b/DeeployTest/Tests/test2DRequantizedConv/network.onnx new file mode 100644 index 0000000..c4ba5aa Binary files /dev/null and b/DeeployTest/Tests/test2DRequantizedConv/network.onnx differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/outputs.npz b/DeeployTest/Tests/test2DRequantizedConv/outputs.npz new file mode 100644 index 0000000..c9fdd5e Binary files /dev/null and b/DeeployTest/Tests/test2DRequantizedConv/outputs.npz differ diff --git a/DeeployTest/Tests/testBacktracking/activations.npz b/DeeployTest/Tests/testBacktracking/activations.npz new file mode 100644 index 0000000..15cb0ec Binary files /dev/null and b/DeeployTest/Tests/testBacktracking/activations.npz differ diff --git a/DeeployTest/Tests/testBacktracking/inputs.npz b/DeeployTest/Tests/testBacktracking/inputs.npz new file mode 100644 index 0000000..6753b07 Binary files /dev/null and b/DeeployTest/Tests/testBacktracking/inputs.npz differ diff --git a/DeeployTest/Tests/testBacktracking/network.onnx b/DeeployTest/Tests/testBacktracking/network.onnx new file mode 100644 index 0000000..be7cc56 Binary files /dev/null and b/DeeployTest/Tests/testBacktracking/network.onnx differ diff --git a/DeeployTest/Tests/testBacktracking/outputs.npz 
b/DeeployTest/Tests/testBacktracking/outputs.npz new file mode 100644 index 0000000..4fc89de Binary files /dev/null and b/DeeployTest/Tests/testBacktracking/outputs.npz differ diff --git a/DeeployTest/Tests/testConcat/activations.npz b/DeeployTest/Tests/testConcat/activations.npz new file mode 100644 index 0000000..15cb0ec Binary files /dev/null and b/DeeployTest/Tests/testConcat/activations.npz differ diff --git a/DeeployTest/Tests/testConcat/inputs.npz b/DeeployTest/Tests/testConcat/inputs.npz new file mode 100644 index 0000000..9bebd93 Binary files /dev/null and b/DeeployTest/Tests/testConcat/inputs.npz differ diff --git a/DeeployTest/Tests/testConcat/network.onnx b/DeeployTest/Tests/testConcat/network.onnx new file mode 100644 index 0000000..01977a4 --- /dev/null +++ b/DeeployTest/Tests/testConcat/network.onnx @@ -0,0 +1,39 @@ +onnxruntime.transformers1.16.1: +G +input +onnx::Concat_1outputConcat_0"Concat* +axis torch_jitZ +input + + + + +@Z( +onnx::Concat_1 + + + + +@b +output + + + + +@j +output + + + + +@B +B + +ai.onnx.mlB +ai.onnx.trainingB +com.ms.internal.nhwcB +ai.onnx.preview.trainingB + com.microsoftB +com.microsoft.experimentalB +com.microsoft.nchwcB +org.pytorch.aten \ No newline at end of file diff --git a/DeeployTest/Tests/testConcat/outputs.npz b/DeeployTest/Tests/testConcat/outputs.npz new file mode 100644 index 0000000..2daf7a6 Binary files /dev/null and b/DeeployTest/Tests/testConcat/outputs.npz differ diff --git a/DeeployTest/Tests/testGEMM/inputs.npz b/DeeployTest/Tests/testGEMM/inputs.npz new file mode 100644 index 0000000..d94a87e Binary files /dev/null and b/DeeployTest/Tests/testGEMM/inputs.npz differ diff --git a/DeeployTest/Tests/testGEMM/network.onnx b/DeeployTest/Tests/testGEMM/network.onnx new file mode 100644 index 0000000..23adcef Binary files /dev/null and b/DeeployTest/Tests/testGEMM/network.onnx differ diff --git a/DeeployTest/Tests/testGEMM/outputs.npz b/DeeployTest/Tests/testGEMM/outputs.npz new file mode 100644 index 
0000000..c2cb4e2 Binary files /dev/null and b/DeeployTest/Tests/testGEMM/outputs.npz differ diff --git a/DeeployTest/Tests/testMatMul/inputs.npz b/DeeployTest/Tests/testMatMul/inputs.npz new file mode 100644 index 0000000..4659950 Binary files /dev/null and b/DeeployTest/Tests/testMatMul/inputs.npz differ diff --git a/DeeployTest/Tests/testMatMul/network.onnx b/DeeployTest/Tests/testMatMul/network.onnx new file mode 100644 index 0000000..44217ee --- /dev/null +++ b/DeeployTest/Tests/testMatMul/network.onnx @@ -0,0 +1,22 @@ +onnx1.12.0: ++ +input_0 +input_1outputMatMul1"MatMulMatMulZ! +input_0 + + + + +0Z! +input_1 + + + +0 +@b +output + + + + +@B \ No newline at end of file diff --git a/DeeployTest/Tests/testMatMul/outputs.npz b/DeeployTest/Tests/testMatMul/outputs.npz new file mode 100644 index 0000000..3f942b9 Binary files /dev/null and b/DeeployTest/Tests/testMatMul/outputs.npz differ diff --git a/DeeployTest/Tests/testMatMulAdd/inputs.npz b/DeeployTest/Tests/testMatMulAdd/inputs.npz new file mode 100644 index 0000000..96b77bf Binary files /dev/null and b/DeeployTest/Tests/testMatMulAdd/inputs.npz differ diff --git a/DeeployTest/Tests/testMatMulAdd/network.onnx b/DeeployTest/Tests/testMatMulAdd/network.onnx new file mode 100644 index 0000000..8faca08 Binary files /dev/null and b/DeeployTest/Tests/testMatMulAdd/network.onnx differ diff --git a/DeeployTest/Tests/testMatMulAdd/outputs.npz b/DeeployTest/Tests/testMatMulAdd/outputs.npz new file mode 100644 index 0000000..2ec35c5 Binary files /dev/null and b/DeeployTest/Tests/testMatMulAdd/outputs.npz differ diff --git a/DeeployTest/Tests/testMaxPool/inputs.npz b/DeeployTest/Tests/testMaxPool/inputs.npz new file mode 100644 index 0000000..f2af42d Binary files /dev/null and b/DeeployTest/Tests/testMaxPool/inputs.npz differ diff --git a/DeeployTest/Tests/testMaxPool/network.onnx b/DeeployTest/Tests/testMaxPool/network.onnx new file mode 100644 index 0000000..6c2f6ce Binary files /dev/null and 
b/DeeployTest/Tests/testMaxPool/network.onnx differ diff --git a/DeeployTest/Tests/testMaxPool/outputs.npz b/DeeployTest/Tests/testMaxPool/outputs.npz new file mode 100644 index 0000000..fe2f6dc Binary files /dev/null and b/DeeployTest/Tests/testMaxPool/outputs.npz differ diff --git a/DeeployTest/Tests/testPointwise/activations.npz b/DeeployTest/Tests/testPointwise/activations.npz new file mode 100644 index 0000000..2066f8c Binary files /dev/null and b/DeeployTest/Tests/testPointwise/activations.npz differ diff --git a/DeeployTest/Tests/testPointwise/inputs.npz b/DeeployTest/Tests/testPointwise/inputs.npz new file mode 100644 index 0000000..17b0f07 Binary files /dev/null and b/DeeployTest/Tests/testPointwise/inputs.npz differ diff --git a/DeeployTest/Tests/testPointwise/network.onnx b/DeeployTest/Tests/testPointwise/network.onnx new file mode 100644 index 0000000..c5e3d10 Binary files /dev/null and b/DeeployTest/Tests/testPointwise/network.onnx differ diff --git a/DeeployTest/Tests/testPointwise/outputs.npz b/DeeployTest/Tests/testPointwise/outputs.npz new file mode 100644 index 0000000..7160bb4 Binary files /dev/null and b/DeeployTest/Tests/testPointwise/outputs.npz differ diff --git a/DeeployTest/Tests/testPointwiseConvBNReLU/activations.npz b/DeeployTest/Tests/testPointwiseConvBNReLU/activations.npz new file mode 100644 index 0000000..2fabc2b Binary files /dev/null and b/DeeployTest/Tests/testPointwiseConvBNReLU/activations.npz differ diff --git a/DeeployTest/Tests/testPointwiseConvBNReLU/inputs.npz b/DeeployTest/Tests/testPointwiseConvBNReLU/inputs.npz new file mode 100644 index 0000000..b4ffd09 Binary files /dev/null and b/DeeployTest/Tests/testPointwiseConvBNReLU/inputs.npz differ diff --git a/DeeployTest/Tests/testPointwiseConvBNReLU/network.onnx b/DeeployTest/Tests/testPointwiseConvBNReLU/network.onnx new file mode 100644 index 0000000..fbfcc33 Binary files /dev/null and b/DeeployTest/Tests/testPointwiseConvBNReLU/network.onnx differ diff --git 
a/DeeployTest/Tests/testPointwiseConvBNReLU/outputs.npz b/DeeployTest/Tests/testPointwiseConvBNReLU/outputs.npz new file mode 100644 index 0000000..577d8bf Binary files /dev/null and b/DeeployTest/Tests/testPointwiseConvBNReLU/outputs.npz differ diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/activations.npz b/DeeployTest/Tests/testPointwiseUnsignedWeights/activations.npz new file mode 100644 index 0000000..d640993 Binary files /dev/null and b/DeeployTest/Tests/testPointwiseUnsignedWeights/activations.npz differ diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/inputs.npz b/DeeployTest/Tests/testPointwiseUnsignedWeights/inputs.npz new file mode 100644 index 0000000..149666a Binary files /dev/null and b/DeeployTest/Tests/testPointwiseUnsignedWeights/inputs.npz differ diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/network.onnx b/DeeployTest/Tests/testPointwiseUnsignedWeights/network.onnx new file mode 100644 index 0000000..f8cd916 Binary files /dev/null and b/DeeployTest/Tests/testPointwiseUnsignedWeights/network.onnx differ diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/outputs.npz b/DeeployTest/Tests/testPointwiseUnsignedWeights/outputs.npz new file mode 100644 index 0000000..975612b Binary files /dev/null and b/DeeployTest/Tests/testPointwiseUnsignedWeights/outputs.npz differ diff --git a/DeeployTest/Tests/testRMSNorm/activations.npz b/DeeployTest/Tests/testRMSNorm/activations.npz new file mode 100644 index 0000000..5cb1c32 Binary files /dev/null and b/DeeployTest/Tests/testRMSNorm/activations.npz differ diff --git a/DeeployTest/Tests/testRMSNorm/inputs.npz b/DeeployTest/Tests/testRMSNorm/inputs.npz new file mode 100644 index 0000000..e84844b Binary files /dev/null and b/DeeployTest/Tests/testRMSNorm/inputs.npz differ diff --git a/DeeployTest/Tests/testRMSNorm/network.onnx b/DeeployTest/Tests/testRMSNorm/network.onnx new file mode 100644 index 0000000..18b0385 Binary files /dev/null and 
b/DeeployTest/Tests/testRMSNorm/network.onnx differ diff --git a/DeeployTest/Tests/testRMSNorm/outputs.npz b/DeeployTest/Tests/testRMSNorm/outputs.npz new file mode 100644 index 0000000..9a3ff0a Binary files /dev/null and b/DeeployTest/Tests/testRMSNorm/outputs.npz differ diff --git a/DeeployTest/Tests/testRQConv/inputs.npz b/DeeployTest/Tests/testRQConv/inputs.npz new file mode 100644 index 0000000..ac1bdc9 Binary files /dev/null and b/DeeployTest/Tests/testRQConv/inputs.npz differ diff --git a/DeeployTest/Tests/testRQConv/network.onnx b/DeeployTest/Tests/testRQConv/network.onnx new file mode 100644 index 0000000..308bcbd Binary files /dev/null and b/DeeployTest/Tests/testRQConv/network.onnx differ diff --git a/DeeployTest/Tests/testRQConv/outputs.npz b/DeeployTest/Tests/testRQConv/outputs.npz new file mode 100644 index 0000000..843fbc0 Binary files /dev/null and b/DeeployTest/Tests/testRQConv/outputs.npz differ diff --git a/DeeployTest/Tests/testRQGEMM/inputs.npz b/DeeployTest/Tests/testRQGEMM/inputs.npz new file mode 100644 index 0000000..e2458d6 Binary files /dev/null and b/DeeployTest/Tests/testRQGEMM/inputs.npz differ diff --git a/DeeployTest/Tests/testRQGEMM/network.onnx b/DeeployTest/Tests/testRQGEMM/network.onnx new file mode 100644 index 0000000..3958135 Binary files /dev/null and b/DeeployTest/Tests/testRQGEMM/network.onnx differ diff --git a/DeeployTest/Tests/testRQGEMM/outputs.npz b/DeeployTest/Tests/testRQGEMM/outputs.npz new file mode 100644 index 0000000..9bebf88 Binary files /dev/null and b/DeeployTest/Tests/testRQGEMM/outputs.npz differ diff --git a/DeeployTest/Tests/testRQMatMul/inputs.npz b/DeeployTest/Tests/testRQMatMul/inputs.npz new file mode 100644 index 0000000..c2ce614 Binary files /dev/null and b/DeeployTest/Tests/testRQMatMul/inputs.npz differ diff --git a/DeeployTest/Tests/testRQMatMul/network.onnx b/DeeployTest/Tests/testRQMatMul/network.onnx new file mode 100644 index 0000000..af514a1 Binary files /dev/null and 
b/DeeployTest/Tests/testRQMatMul/network.onnx differ diff --git a/DeeployTest/Tests/testRQMatMul/outputs.npz b/DeeployTest/Tests/testRQMatMul/outputs.npz new file mode 100644 index 0000000..22a8238 Binary files /dev/null and b/DeeployTest/Tests/testRQMatMul/outputs.npz differ diff --git a/DeeployTest/Tests/testReduceMean/inputs.npz b/DeeployTest/Tests/testReduceMean/inputs.npz new file mode 100644 index 0000000..0f4d1a1 Binary files /dev/null and b/DeeployTest/Tests/testReduceMean/inputs.npz differ diff --git a/DeeployTest/Tests/testReduceMean/network.onnx b/DeeployTest/Tests/testReduceMean/network.onnx new file mode 100644 index 0000000..f96045f Binary files /dev/null and b/DeeployTest/Tests/testReduceMean/network.onnx differ diff --git a/DeeployTest/Tests/testReduceMean/outputs.npz b/DeeployTest/Tests/testReduceMean/outputs.npz new file mode 100644 index 0000000..68d1cd2 Binary files /dev/null and b/DeeployTest/Tests/testReduceMean/outputs.npz differ diff --git a/DeeployTest/Tests/testReduceSum/inputs.npz b/DeeployTest/Tests/testReduceSum/inputs.npz new file mode 100644 index 0000000..7acf0c9 Binary files /dev/null and b/DeeployTest/Tests/testReduceSum/inputs.npz differ diff --git a/DeeployTest/Tests/testReduceSum/network.onnx b/DeeployTest/Tests/testReduceSum/network.onnx new file mode 100644 index 0000000..b31e609 Binary files /dev/null and b/DeeployTest/Tests/testReduceSum/network.onnx differ diff --git a/DeeployTest/Tests/testReduceSum/outputs.npz b/DeeployTest/Tests/testReduceSum/outputs.npz new file mode 100644 index 0000000..d7b3101 Binary files /dev/null and b/DeeployTest/Tests/testReduceSum/outputs.npz differ diff --git a/DeeployTest/Tests/testRequantizedDWConv/activations.npz b/DeeployTest/Tests/testRequantizedDWConv/activations.npz new file mode 100644 index 0000000..a96d09b Binary files /dev/null and b/DeeployTest/Tests/testRequantizedDWConv/activations.npz differ diff --git a/DeeployTest/Tests/testRequantizedDWConv/inputs.npz 
b/DeeployTest/Tests/testRequantizedDWConv/inputs.npz new file mode 100644 index 0000000..1b72afd Binary files /dev/null and b/DeeployTest/Tests/testRequantizedDWConv/inputs.npz differ diff --git a/DeeployTest/Tests/testRequantizedDWConv/network.onnx b/DeeployTest/Tests/testRequantizedDWConv/network.onnx new file mode 100644 index 0000000..d7f9513 Binary files /dev/null and b/DeeployTest/Tests/testRequantizedDWConv/network.onnx differ diff --git a/DeeployTest/Tests/testRequantizedDWConv/outputs.npz b/DeeployTest/Tests/testRequantizedDWConv/outputs.npz new file mode 100644 index 0000000..5609467 Binary files /dev/null and b/DeeployTest/Tests/testRequantizedDWConv/outputs.npz differ diff --git a/DeeployTest/Tests/testRequantizedLinear/activations.npz b/DeeployTest/Tests/testRequantizedLinear/activations.npz new file mode 100644 index 0000000..d6adda6 Binary files /dev/null and b/DeeployTest/Tests/testRequantizedLinear/activations.npz differ diff --git a/DeeployTest/Tests/testRequantizedLinear/inputs.npz b/DeeployTest/Tests/testRequantizedLinear/inputs.npz new file mode 100644 index 0000000..defca6e Binary files /dev/null and b/DeeployTest/Tests/testRequantizedLinear/inputs.npz differ diff --git a/DeeployTest/Tests/testRequantizedLinear/network.onnx b/DeeployTest/Tests/testRequantizedLinear/network.onnx new file mode 100644 index 0000000..9282ce5 Binary files /dev/null and b/DeeployTest/Tests/testRequantizedLinear/network.onnx differ diff --git a/DeeployTest/Tests/testRequantizedLinear/outputs.npz b/DeeployTest/Tests/testRequantizedLinear/outputs.npz new file mode 100644 index 0000000..f407d79 Binary files /dev/null and b/DeeployTest/Tests/testRequantizedLinear/outputs.npz differ diff --git a/DeeployTest/Tests/testSlice/activations.npz b/DeeployTest/Tests/testSlice/activations.npz new file mode 100644 index 0000000..15cb0ec Binary files /dev/null and b/DeeployTest/Tests/testSlice/activations.npz differ diff --git a/DeeployTest/Tests/testSlice/inputs.npz 
b/DeeployTest/Tests/testSlice/inputs.npz new file mode 100644 index 0000000..99cadcf Binary files /dev/null and b/DeeployTest/Tests/testSlice/inputs.npz differ diff --git a/DeeployTest/Tests/testSlice/network.onnx b/DeeployTest/Tests/testSlice/network.onnx new file mode 100644 index 0000000..5d7d412 Binary files /dev/null and b/DeeployTest/Tests/testSlice/network.onnx differ diff --git a/DeeployTest/Tests/testSlice/outputs.npz b/DeeployTest/Tests/testSlice/outputs.npz new file mode 100644 index 0000000..f349879 Binary files /dev/null and b/DeeployTest/Tests/testSlice/outputs.npz differ diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/activations.npz b/DeeployTest/Tests/trueIntegerDivSandwich/activations.npz new file mode 100644 index 0000000..cdc9c31 Binary files /dev/null and b/DeeployTest/Tests/trueIntegerDivSandwich/activations.npz differ diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/inputs.npz b/DeeployTest/Tests/trueIntegerDivSandwich/inputs.npz new file mode 100644 index 0000000..b256df2 Binary files /dev/null and b/DeeployTest/Tests/trueIntegerDivSandwich/inputs.npz differ diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/network.onnx b/DeeployTest/Tests/trueIntegerDivSandwich/network.onnx new file mode 100644 index 0000000..fb8469f Binary files /dev/null and b/DeeployTest/Tests/trueIntegerDivSandwich/network.onnx differ diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/outputs.npz b/DeeployTest/Tests/trueIntegerDivSandwich/outputs.npz new file mode 100644 index 0000000..f48da9d Binary files /dev/null and b/DeeployTest/Tests/trueIntegerDivSandwich/outputs.npz differ diff --git a/DeeployTest/deeployStateEqualityTest.py b/DeeployTest/deeployStateEqualityTest.py new file mode 100644 index 0000000..297e52e --- /dev/null +++ b/DeeployTest/deeployStateEqualityTest.py @@ -0,0 +1,130 @@ +# ---------------------------------------------------------------------- +# +# File: deeployStateEqualityTest.py +# +# Last edited: 04.05.2022 +# +# Copyright 
(C) 2022, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import copy +import os + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.typeMapping import inferInputType + +from Deeploy.DeeployTypes import NetworkContext, StructBuffer, VariableBuffer, _backendPostBindingFilename, \ + _middlewarePreLoweringFilename +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description = "Test Utility for the State Equality Check.") + parser.add_argument('-t', + metavar = 'testdir', + dest = 'dir', + type = str, + default = './Tests/simpleRegression', + help = 'Set the regression test\n') + parser.add_argument('-d', + metavar = 'dumpdir', + dest = 'dumpdir', + type = str, + default = './TestFiles', + help = 'Set the output dump folder\n') + parser.add_argument('-p', + metavar = 'platform', + dest = 'platform', + type = str, + default = "QEMU-ARM", + help = 'Choose the target Platform\n') + args = parser.parse_args() + + _DEEPLOYSTATEDIR = 
os.path.join("./TEST_STATE_EQUALITY_DeeployState", args.platform, args.dir) + + onnx_graph = onnx.load_model(f'./{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputTypes = {} + inputOffsets = {} + + inputs = np.load(f'./{args.dir}/inputs.npz') + tensors = graph.tensors() + + # Load as int64 and infer types later + test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files] + + platform, signProp = mapPlatform(args.platform) + + for index, num in enumerate(test_inputs): + _type, offset = inferInputType(num, signProp)[0] + inputTypes[f"input_{index}"] = _type + inputOffsets[f"input_{index}"] = offset + + deployer = mapDeployer(platform, graph, inputTypes, deeployStateDir = _DEEPLOYSTATEDIR, inputOffsets = inputOffsets) + + # Instantiate Classes Requried for Memory Level Annotation Extension + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 1024000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 512000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 128000) + + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel("L3") + + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemoryLevel = L1) + deployer = MemoryDeployerWrapper(deployer) + + deployer.generateFunction() + + ctxt_post_binding_imported = NetworkContext.importNetworkContext(_DEEPLOYSTATEDIR, _backendPostBindingFilename) + ctxt_pre_lowering_imported = NetworkContext.importNetworkContext(_DEEPLOYSTATEDIR, _middlewarePreLoweringFilename) + + memoryHierarchy = deployer.Platform.memoryHierarchy + defaultMemoryLevel = deployer.Platform.memoryHierarchy.getDefaultMemoryLevel() + + for bufferName, buffer in ctxt_post_binding_imported.globalObjects.items(): + if isinstance(buffer, VariableBuffer) and not isinstance(buffer, StructBuffer): + assert buffer._memoryLevel == defaultMemoryLevel.name, f"Tensor {bufferName} in global scope is not annotated with the 
default memory level" + + for bufferName, buffer in ctxt_post_binding_imported.localObjects.items(): + if isinstance(buffer, VariableBuffer) and not isinstance(buffer, StructBuffer): + assert buffer._memoryLevel == defaultMemoryLevel.name, f"Tensor {bufferName} in local scope is not annotated with the default memory level" + + assert not ctxt_pre_lowering_imported == deployer.ctxt, "Contexts are not supposed to be equal but are, test failed!" + assert ctxt_post_binding_imported == deployer.ctxt, "Contexts are supposed to be equal but are not, test failed!" + + # Test if the equality fails correctly if we add a new buffer to the context + dummyBuffer = VariableBuffer('dummyBuffer') + alteredCtxt1 = copy.deepcopy(deployer.ctxt) + alteredCtxt1.globalObjects[dummyBuffer.name] = dummyBuffer + assert not ctxt_post_binding_imported == alteredCtxt1, "Contexts are not supposed to be equal but are, test failed!" + + # Test if the equality fails correctly if we modify a buffer of the context + alteredCtxt2 = copy.deepcopy(deployer.ctxt) + bufferName = list(alteredCtxt2.globalObjects.keys())[0] + alteredCtxt2.globalObjects[bufferName].name = "meme" + assert not ctxt_post_binding_imported == alteredCtxt2, "Contexts are not supposed to be equal but are, test failed!" + + print("Contexts equality test passed!") diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py new file mode 100644 index 0000000..a1c7476 --- /dev/null +++ b/DeeployTest/generateNetwork.py @@ -0,0 +1,151 @@ +# ---------------------------------------------------------------------- +# +# File: generateNetwork.py +# +# Last edited: 08.01.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. 
+# +# Author: +# - Moritz Scherer, ETH Zurich +# - Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerate import generateTestInputsHeader, generateTestNetworkHeader, \ + generateTestNetworkImplementation, generateTestOutputsHeader +from testUtils.graphDebug import generateDebugConfig +from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.typeMapping import inferInputType + +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import EmulateCMSISRequantPass +from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Targets.CortexM.Platform import CMSISPlatform + +_TEXT_ALIGN = 30 + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description = "Deeploy Code Generation Utility.") + parser.add_argument('--debug', + dest = 'debug', + action = 'store_true', + default = False, + help = 'Enable debugging mode\n') + parser.add_argument('--overwriteRecentState', + action = 'store_true', + help = 'Copy the recent deeply state to the ./deeployStates folder\n') + + args = parser.parse_args() + + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputTypes = {} + 
inputOffsets = {} + + inputs = np.load(f'{args.dir}/inputs.npz') + outputs = np.load(f'{args.dir}/outputs.npz') + if os.path.isfile(f'{args.dir}/activations.npz'): + activations = np.load(f'{args.dir}/activations.npz') + else: + activations = None + + tensors = graph.tensors() + + if args.debug: + test_inputs, test_outputs, graph = generateDebugConfig(inputs, outputs, activations, graph) + + else: + # Load as int64 and infer types later + test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files] + test_outputs = [outputs[x].reshape(-1).astype(np.int64) for x in outputs.files] + + # WIESEP: Hack to get CI running because only one specific array is used + if "WaveFormer" in args.dir: + test_inputs = [test_inputs[0]] + test_outputs = [test_outputs[-2]] + + platform, signProp = mapPlatform(args.platform) + + for index, num in enumerate(test_inputs): + # WIESP: Do not infer types and offset of empty arrays + if np.prod(num.shape) == 0: + continue + _type, offset = inferInputType(num, signProp)[0] + inputTypes[f"input_{index}"] = _type + inputOffsets[f"input_{index}"] = offset + + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, graph, inputTypes, deeployStateDir = _DEEPLOYSTATEDIR, inputOffsets = inputOffsets) + + if not isinstance( + platform, CMSISPlatform + ) and not "simpleCNN" in args.dir and not "testRQMatMul" in args.dir and not "testRQGEMM" in args.dir: + deployer.loweringOptimizer.passes.insert(0, EmulateCMSISRequantPass()) + + # Parse graph and infer output levels and signedness + _ = deployer.generateFunction(verbose = _NoVerbosity) + + if args.overwriteRecentState: + os.makedirs(f'./deeployStates/', exist_ok = True) + os.system(f'cp -r {_DEEPLOYSTATEDIR}/* ./deeployStates/') + + # Create input and output vectors + os.makedirs(f'{args.dumpdir}', exist_ok = True) + + testInputStr = generateTestInputsHeader(deployer, test_inputs, inputTypes, inputOffsets) + f = 
open(f'{args.dumpdir}/testinputs.h', "w") + f.write(testInputStr) + f.close() + + testOutputStr = generateTestOutputsHeader(deployer, test_outputs, signProp, verbose = args.verbose) + f = open(f'{args.dumpdir}/testoutputs.h', "w") + f.write(testOutputStr) + f.close() + + # Generate code for Network + testNetworkHeaderStr = generateTestNetworkHeader(deployer, platform) + f = open(f'{args.dumpdir}/Network.h', "w") + f.write(testNetworkHeaderStr) + f.close() + + testNetworkImplementationStr = generateTestNetworkImplementation(deployer, platform, verbose = args.verbose) + f = open(f'{args.dumpdir}/Network.c', "w") + f.write(testNetworkImplementationStr) + f.close() + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.c') + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.h') + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testoutputs.h') + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testinputs.h') + + if args.verbose: + print() + print("=" * 80) + num_ops = deployer.numberOfOps(args.verbose) + print("=" * 80) + print() + print(f"{'Number of Ops:' :<{_TEXT_ALIGN}} {num_ops}") + print(f"{'Model Parameters: ' :<{_TEXT_ALIGN}} {deployer.getParameterSize()}") diff --git a/DeeployTest/profiling2csv.py b/DeeployTest/profiling2csv.py new file mode 100644 index 0000000..7d8feb5 --- /dev/null +++ b/DeeployTest/profiling2csv.py @@ -0,0 +1,86 @@ +# ---------------------------------------------------------------------- +# +# File: profiling2csv.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import csv +import dataclasses + +from prettytable import PrettyTable +from testUtils.ProfilingTraceParser import LayerProfiling, ProfilingTraceParser + + +@dataclasses.dataclass +class LayerInfo: + name: str + bufferingMode: str + ops: int + totalKernelCycles: int + totalInputDmaCycles: int + totalOutputDmaCycles: int + + +def layerInfoFromProfiling(name: str, profiling: LayerProfiling) -> LayerInfo: + return LayerInfo(name = name, + bufferingMode = profiling.bufferingMode, + ops = profiling.ops, + totalKernelCycles = sum(profiling.kernelCycles), + totalInputDmaCycles = sum(profiling.inputDmaCycles), + totalOutputDmaCycles = sum(profiling.outputDmaCycles)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = 'Parse and visualize profiling results') + parser.add_argument('trace_path', type = str, help = 'Path to the profiling trace file') + parser.add_argument('-o', + '--output_path', + type = str, + default = "profile.csv", + help = 'Path to the output CSV file') + parser.add_argument('--table', + action = 'store_true', + default = False, + help = 'Print a table of the profiled results.') + args = parser.parse_args() + + profilingParser = ProfilingTraceParser() + + with open(args.trace_path, "r") as f: + layerProfilings = 
profilingParser.parse(f.read()) + + fieldnames = [field.name for field in dataclasses.fields(LayerInfo)] + layerInfos = [layerInfoFromProfiling(name, profiling) for name, profiling in layerProfilings.items()] + + with open(args.output_path, 'w', newline = '') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames = fieldnames) + + writer.writeheader() + for info in layerInfos: + writer.writerow(dataclasses.asdict(info)) + + if args.table: + table = PrettyTable(field_names = fieldnames) + for info in layerInfos: + table.add_row(dataclasses.astuple(info)) + print(table) diff --git a/DeeployTest/testComponentGraph.py b/DeeployTest/testComponentGraph.py new file mode 100644 index 0000000..e091e0b --- /dev/null +++ b/DeeployTest/testComponentGraph.py @@ -0,0 +1,100 @@ +# ---------------------------------------------------------------------- +# +# File: testComponentGraph.py +# +# Last edited: 10.10.2023. +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Luka Macan, University of Bologna +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os

import onnx
import onnx_graphsurgeon as gs
from testUtils.graphColoring import graph_coloring

from Deeploy.ComponentGraph import extractComponentGraph, extractComponentsFromComponentGraph

if __name__ == "__main__":
    test_dir = "Tests/WaveFormer"
    colors = ["red", "green", "blue", "yellow"]
    component_color = "red"
    color_attr = "color"
    color_frequency = 10

    # Import the test network and bring it into topological order.
    graph = gs.import_onnx(onnx.load(os.path.join(test_dir, "network.onnx"))).toposort()

    # Color the graph randomly
    graph = graph_coloring(graph, colors = colors, frequency = color_frequency, color_attr = color_attr)

    # Force a small cone of nodes feeding the first graph output to the component color.
    N_OUTPUT_NODES = 5
    worklist = [graph.outputs[0].inputs[0]]
    for _ in range(N_OUTPUT_NODES):
        node = worklist.pop()
        node.attrs[color_attr] = component_color
        worklist += [tensor.inputs[0] for tensor in node.inputs if isinstance(tensor, gs.Variable)]

    # Check that all the nodes have been colored
    for node in graph.nodes:
        assert color_attr in node.attrs

    componentGraph = extractComponentGraph(graph, lambda node: node.attrs[color_attr] == component_color)

    onnx.save_model(gs.export_onnx(componentGraph), "component_graph.onnx")

    # Check that all the nodes in the components are of the component_color
    for node in componentGraph.nodes:
        assert node.attrs[
            color_attr] == component_color, f"Node {node.name} is not of {component_color} but {node.attrs['color']}"

    # Check that all the component_color nodes from the original graph exist in the components
    componentNodeNames = {componentNode.name for componentNode in componentGraph.nodes}
    for node in graph.nodes:
        if node.attrs[color_attr] == component_color:
            assert node.name in componentNodeNames, \
                f"Node {node.name} of color {component_color} does not exist in any of the components"

    # Check for duplicates in the inputs
    inputNames = [tensor.name for tensor in componentGraph.inputs]
    assert len(inputNames) == len(set(inputNames))

    # Check for duplicates in the outputs
    outputNames = [tensor.name for tensor in componentGraph.outputs]
    assert len(outputNames) == len(set(outputNames))

    components = extractComponentsFromComponentGraph(componentGraph)

    componentNodes = []
    for component in components:
        componentNodes += list(component.nodes)

    # Check components contain all the nodes from the componentGraph
    for node in componentGraph.nodes:
        assert any(node.name == componentNode.name
                   for componentNode in componentNodes), f"Node {node.name} is not present in any of the components"

    # Check if there are nodes that are in multiple components
    nodeNames = [node.name for node in componentNodes]
    assert len(nodeNames) == len(set(nodeNames))

    print("Test passed")
+ +import os + +import onnx +import onnx_graphsurgeon as gs +from testUtils.graphColoring import graph_coloring +from testUtils.graphDebug import graphDiff + +from Deeploy.DeeployTypes import TopologyOptimizationPass, TopologyOptimizer +from Deeploy.DeploymentEngine import EngineAwareOptimizerWrapper +from Deeploy.OptimizationPasses.TopologyOptimizationPasses.BasicPasses import IntegerDivRequantMergePass, \ + MergeConstAddAndRequantPass, TransposeConstOptPass, TransposeMergePass, iGELURequantMergePass +from Deeploy.OptimizationPasses.TopologyOptimizationPasses.CMSISPasses import ConvRequantMergePass, \ + GEMMRequantMergePass, LinearAttentionAlignmentPass, MatMulRequantMergePass, MHSAAlignmentPass +from Deeploy.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass +from Deeploy.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import NCHWtoNHWCPass, \ + TransposeMatmulInputsPass +from Deeploy.OptimizationPasses.TopologyOptimizationPasses.PULPPasses import PULPConvRequantMergePass + + +def _test_partial_coloring(): + test_dir = "Tests/simpleRegression" + + model = onnx.load(os.path.join(test_dir, "network.onnx")) + graph = gs.import_onnx(model).toposort() + + graph = graph_coloring(graph, ["mockedEngine0", "mockedEngine1"], frequency = [3, 20], color_attr = "engine") + + original_optimizer = TopologyOptimizer([]) + optimizer = EngineAwareOptimizerWrapper(original_optimizer, engineName = "mockedEngine0") + + assert id(original_optimizer.passes) == id( + optimizer.passes), "The wrapped optimizer does not expose original optimizers passes attribute." + + assert len(original_optimizer.passes) == 0 + optimizer.passes = [PULPConvRequantMergePass()] + assert len( + original_optimizer.passes) == 1, "Wrapped optimizer does not modify the original optimizers passes attribute." 
def _test_pass(_pass: TopologyOptimizationPass, graph: gs.Graph, engineName: str) -> gs.Graph:
    """Check that an engine-aware optimizer applies `_pass` identically to a plain one.

    Colors every node of `graph` with `engineName`, runs the pass once through a
    plain TopologyOptimizer and once through an EngineAwareOptimizerWrapper, and
    asserts the two resulting graphs are identical.

    Fix: the original shadowed the `engineName` parameter with a hard-coded
    "mockEngine" on the first line, making the parameter dead. The shadowing is
    removed; the only caller passes "mockEngine", so behavior is unchanged.

    Returns the graph produced by the plain TopologyOptimizer.
    """
    # Mock coloring: attribute every node to the engine under test.
    for node in graph.nodes:
        node.attrs["engine"] = engineName

    topologyOptimizer = TopologyOptimizer([_pass])
    engineOptimizer = EngineAwareOptimizerWrapper(topologyOptimizer, engineName)

    topologyOptimizedGraph = topologyOptimizer.optimize(graph.copy())

    # Mock recoloring: nodes created by the pass must also belong to the engine.
    for node in topologyOptimizedGraph.nodes:
        node.attrs["engine"] = engineName

    engineOptimizedGraph = engineOptimizer.optimize(graph.copy())

    diffTree = graphDiff(topologyOptimizedGraph, engineOptimizedGraph)
    assert diffTree.root is None, f"Failed at pass {type(_pass).__name__}\n{diffTree.message}"

    return topologyOptimizedGraph


def _test_passes():
    """Run every supported topology optimization pass through `_test_pass`.

    Passes are applied cumulatively: each pass operates on the graph returned
    by the previous `_test_pass` call.
    """
    test_dir = "Tests/simpleRegression"
    model = onnx.load(os.path.join(test_dir, "network.onnx"))
    graph = gs.import_onnx(model).toposort()
    passes = [
        IntegerDivRequantMergePass(),
        iGELURequantMergePass(),
        LinearAttentionAlignmentPass(),
        MHSAAlignmentPass(),
        MergeConstAddAndRequantPass(),
        ConvRequantMergePass(),
        GEMMRequantMergePass(),
        MatMulRequantMergePass(),
        TransposeMatmulInputsPass(),
        NCHWtoNHWCPass(False),
        TransposeMergePass(),
        TransposeConstOptPass(),
        DebugPrintMergePass()
    ]

    for _pass in passes:
        graph = _test_pass(_pass, graph, "mockEngine")
print("Test passed") diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py new file mode 100644 index 0000000..a2e1971 --- /dev/null +++ b/DeeployTest/testMVP.py @@ -0,0 +1,421 @@ +# ---------------------------------------------------------------------- +# +# File: testMVP.py +# +# Last edited: 31.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import random +from collections import OrderedDict +from typing import List, Union + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +import pytest +from ortools.constraint_solver.pywrapcp import IntVar +from testUtils.codeGenerate import generateL3HexDump, generateTestInputsHeader, generateTestNetworkHeader, \ + generateTestNetworkImplementation, generateTestOutputsHeader +from testUtils.graphDebug import generateDebugConfig +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.typeMapping import inferInputType + +from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, NetworkContext, NetworkDeployer, ONNXLayer, \ + SubGraph, TransientBuffer +from Deeploy.EngineExtension.NetworkDeployers.EngineColoringDeployer import EngineColoringDeployerWrapper +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ + AnnotateIOMemoryLevel, AnnotateNeurekaWeightMemoryLevel +from Deeploy.TilingExtension.MemoryScheduler import MemoryScheduler +from Deeploy.TilingExtension.TilerExtension import Tiler, TilerDeployerWrapper +from Deeploy.TilingExtension.TilerModel import TilerModel + +_TEXT_ALIGN = 30 + + +class DBOnlyL3Tiler(Tiler): + + def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], + hop: str, tensorName: str) -> Union[int, IntVar]: + + varBuffer = ctxt.lookup(tensorName) + + generalCoeff = 2 + + if isinstance(varBuffer, TransientBuffer): + coefficient = 1 + elif isinstance(varBuffer, ConstantBuffer): + coefficient = generalCoeff + else: + coefficient = generalCoeff + + if args.defaultMemLevel == "L2": + return coefficient + + 
class DBTiler(Tiler):
    """Tiler that double-buffers every tensor except transient scratch buffers."""

    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
                            hop: str, tensorName: str) -> Union[int, IntVar]:
        """Return the buffer-count multiplier for `tensorName` at memory level `hop`.

        Transient buffers are single-buffered (coefficient 1); all other
        buffers (constants and variables alike) are double-buffered
        (coefficient 2). The original if/elif chain assigned the same value on
        its constant and default branches, so it is collapsed here.
        """
        varBuffer = ctxt.lookup(tensorName)
        return 1 if isinstance(varBuffer, TransientBuffer) else 2


class SBTiler(DBTiler):
    """Tiler that single-buffers every tensor (coefficient is always 1)."""

    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
                            hop: str, tensorName: str) -> Union[int, IntVar]:
        """Return 1 for every tensor: single-buffering throughout.

        Every branch of the original decision tree yielded 1. The context
        lookup is kept so that an unknown tensor name fails exactly as before.
        """
        ctxt.lookup(tensorName)
        return 1


class RandomizedMemoryScheduler(MemoryScheduler):
    """Memory scheduler that replaces the permutation heuristic with a seeded shuffle."""

    def heuristicPermutation(self, adjacencyMatrix, costVector) -> List[int]:
        """Return a reproducible (seeded by `self.seed`) random permutation of the cost entries.

        `adjacencyMatrix` is deliberately ignored: this scheduler exists to
        stress the tiler with arbitrary orderings.
        """
        permutation = list(range(len(costVector)))
        random.seed(self.seed)
        random.shuffle(permutation)
        return permutation


class RandomizedSBTiler(DBTiler):
    """Single-buffering tiler that schedules memory with RandomizedMemoryScheduler."""

    memorySchedulerClass = RandomizedMemoryScheduler

    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
                            hop: str, tensorName: str) -> Union[int, IntVar]:
        """Return 1 for every tensor: single-buffering throughout (see SBTiler)."""
        ctxt.lookup(tensorName)
        return 1
node in graph.nodes] + + return schedule + + +def _filterSchedule(schedule: List[List[gs.Node]], layerBinding: 'OrderedDict[str, ONNXLayer]') -> List[List[gs.Node]]: + + filteredSchedule = [] + + for pattern in schedule: + + filteredSchedulePattern = [] + for node in pattern: + if node.name in layerBinding.keys(): + filteredSchedulePattern.append(node) + filteredSchedule.append(filteredSchedulePattern) + + return filteredSchedule + + +def setupDeployer(graph: gs.Graph, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + defaultIoMemoryLevel: MemoryLevel, + verbose: CodeGenVerbosity, + overwriteRecentState = False) -> NetworkDeployer: + + inputTypes = {} + inputOffsets = {} + + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + inputs = np.load(f'{args.dir}/inputs.npz') + tensors = graph.tensors() + + # Load as int64 and infer types later + test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files] + + platform, signProp = mapPlatform(args.platform) + + if args.enable_3x3: + platform.engines[0].enable3x3 = True + if args.enableStrides: + platform.engines[0].enableStrides = True + + for index, num in enumerate(test_inputs): + # WIESP: Do not infer types and offset of empty arrays + if np.prod(num.shape) == 0: + continue + _type, offset = inferInputType(num, signProp)[0] + inputTypes[f"input_{index}"] = _type + inputOffsets[f"input_{index}"] = offset + + deployer = mapDeployer(platform, + graph, + inputTypes, + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets, + scheduler = _mockScheduler) + + # Make the deployer engine-color-aware + if args.platform == "Siracusa_w_neureka": + deployer = EngineColoringDeployerWrapper(deployer) + + # Make platform memory-aware after mapDeployer because it requires the platform to be an instance of an unwrapped platform + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemoryLevel) + + memoryLevelAnnotationPasses = [ + 
AnnotateIOMemoryLevel(defaultIoMemoryLevel.name), + AnnotateDefaultMemoryLevel(memoryHierarchy) + ] + + if args.neureka_wmem: + weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] + memoryLevelAnnotationPasses.append( + AnnotateNeurekaWeightMemoryLevel(neurekaEngineName = deployer.Platform.engines[0].name, + weightMemoryLevel = weightMemoryLevel)) + + # Make the deployer memory-level aware + deployer = MemoryDeployerWrapper(deployer, memoryLevelAnnotationPasses) + + # Make the deployer tiler aware + if args.doublebuffer: + deployer = TilerDeployerWrapper(deployer, DBOnlyL3Tiler) + elif args.randomizedMemoryScheduler: + deployer = TilerDeployerWrapper(deployer, RandomizedSBTiler) + else: + deployer = TilerDeployerWrapper(deployer, SBTiler) + + deployer.frontEnd() + deployer.midEnd() + + # Decomposed Backend to mock the scheduler + deployer.backEnd(verbose) + + deployer.prepared = True + + if overwriteRecentState: + os.makedirs(f'./deeployStates/', exist_ok = True) + os.system(f'cp -r {_DEEPLOYSTATEDIR}/* ./deeployStates/') + + return deployer + + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser( + description = "Deeploy Code Generation Utility with Memory Level Annotation and Tiling Extension.") + + parser.add_argument('--debug', + dest = 'debug', + action = 'store_true', + default = False, + help = 'Enable debugging mode\n') + parser.add_argument('--defaultMemLevel', + metavar = 'defaultMemLevel', + dest = 'defaultMemLevel', + type = str, + default = "L2", + help = 'Set default memory level\n') + parser.add_argument('--neureka-wmem', + dest = "neureka_wmem", + action = "store_true", + default = False, + help = 'Adds weight memory and neureka engine color\n') + parser.add_argument('--enable-3x3', + dest = "enable_3x3", + action = "store_true", + default = False, + help = 'Adds EXPERIMENTAL support for 3x3 convolutions on N-EUREKA\n') + parser.add_argument('--enableStrides', + dest = "enableStrides", + action = "store_true", + 
default = False, + help = 'Adds EXPERIMENTAL support for strided convolutions on N-EUREKA\n') + parser.add_argument('--randomizedMemoryScheduler', action = "store_true") + parser.add_argument('--doublebuffer', action = 'store_true') + parser.add_argument('--l1', metavar = 'l1', dest = 'l1', type = int, default = 64000, help = 'Set L1 size\n') + parser.add_argument('--shouldFail', action = 'store_true') + parser.add_argument('--profileTiling', + metavar = 'profileTiling', + dest = 'profileTiling', + type = str, + default = None) + parser.add_argument('--overwriteRecentState', + action = 'store_true', + help = 'Copy the recent deeply state to the ./deeployStates folder\n') + + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + verbosityCfg = CodeGenVerbosity(None) + + if args.profileTiling is not None: + verbosityCfg.tilingProfiling = args.profileTiling + + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputTypes = {} + inputOffsets = {} + + inputs = np.load(f'{args.dir}/inputs.npz') + outputs = np.load(f'{args.dir}/outputs.npz') + if os.path.isfile(f'{args.dir}/activations.npz'): + activations = np.load(f'{args.dir}/activations.npz') + else: + activations = None + + tensors = graph.tensors() + + if args.debug: + test_inputs, test_outputs, graph = generateDebugConfig(inputs, outputs, activations, graph) + else: + # Load as int64 and infer types later + test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files] + test_outputs = [outputs[x].reshape(-1).astype(np.int64) for x in outputs.files] + + # WIESEP: Hack to get CI running because only one specific array is used + if "WaveFormer" in args.dir: + test_inputs = [test_inputs[0]] + test_outputs = [test_outputs[-2]] + + # Instantiate Classes Requried for Memory Level Annotation Extension + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 
512000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryLevels = [L3, L2, L1] + + if args.neureka_wmem: + memoryLevels.append(MemoryLevel(name = "WeightMemory_SRAM", neighbourNames = [], size = 4 * 1024 * 1024)) + + memoryHierarchy = MemoryHierarchy(memoryLevels) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + + deployer = setupDeployer(graph, + memoryHierarchy, + defaultTargetMemoryLevel = L1, + defaultIoMemoryLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel], + verbose = verbosityCfg, + overwriteRecentState = args.overwriteRecentState) + + platform = deployer.Platform + signProp = False + + for index, num in enumerate(test_inputs): + # WIESP: Do not infer types and offset of empty arrays + if np.prod(num.shape) == 0: + continue + _type, offset = inferInputType(num, signProp)[0] + inputTypes[f"input_{index}"] = _type + inputOffsets[f"input_{index}"] = offset + + schedule = _filterSchedule(_mockScheduler(graph), deployer.layerBinding) + + if args.shouldFail: + with pytest.raises(Exception): + tilingSchedule = deployer.tiler.computeTilingSchedule(deployer.ctxt) + + print("Tiler test ended, failed as expected!") + else: + + _ = deployer.generateFunction(verbosityCfg) + + # Create input and output vectors + os.makedirs(f'{args.dumpdir}', exist_ok = True) + + testInputStr = generateTestInputsHeader(deployer, test_inputs, inputTypes, inputOffsets) + f = open(f'{args.dumpdir}/testinputs.h', "w") + f.write(testInputStr) + f.close() + + testOutputStr = generateTestOutputsHeader(deployer, test_outputs, signProp, args.verbose) + f = open(f'{args.dumpdir}/testoutputs.h', "w") + f.write(testOutputStr) + f.close() + + # Generate code for Network + testNetworkHeaderStr = generateTestNetworkHeader(deployer, platform) + f = open(f'{args.dumpdir}/Network.h', "w") + f.write(testNetworkHeaderStr) + f.close() + + testNetworkImplementationStr = generateTestNetworkImplementation(deployer, platform) + f = 
open(f'{args.dumpdir}/Network.c', "w") + f.write(testNetworkImplementationStr) + f.close() + + generateL3HexDump(deployer, os.path.join(f'{args.dumpdir}', 'hex'), test_inputs, test_outputs) + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.c') + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.h') + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testoutputs.h') + os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testinputs.h') + + if args.verbose: + print() + print("=" * 80) + num_ops = deployer.numberOfOps(args.verbose) + print("=" * 80) + print() + print(f"{'Number of Ops:' :<{_TEXT_ALIGN}} {num_ops}") + print('Worst Case Buffer Size:') + for level in deployer.worstCaseBufferSize.keys(): + print(f"{' ' + str(level) + ':' :<{_TEXT_ALIGN}} {deployer.worstCaseBufferSize[level]}") + print(f"{'Model Parameters: ' :<{_TEXT_ALIGN}} {deployer.getParameterSize()}") + + print("Tiler test ended, no memory violations!") diff --git a/DeeployTest/testMemoryLevelExtension.py b/DeeployTest/testMemoryLevelExtension.py new file mode 100644 index 0000000..6a4917e --- /dev/null +++ b/DeeployTest/testMemoryLevelExtension.py @@ -0,0 +1,221 @@ +# ---------------------------------------------------------------------- +# +# File: testMemoryLevelExtension.py +# +# Last edited: 04.05.2022 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.platformMapping import defaultScheduler, mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser, getPaths +from testUtils.typeMapping import inferInputType + +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + NCHWtoNHWCPass, TransposeMatmulInputsPass +from Deeploy.DeeployTypes import StructBuffer, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper, \ + MemoryLevelAwareSignPropDeployer +from Deeploy.Targets.CortexM.Platform import CMSISEngine, CMSISMapping, CMSISOptimizer, CMSISPlatform +from Deeploy.Targets.Generic.Platform import GenericEngine, GenericMapping, GenericOptimizer, GenericPlatform +from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass +from Deeploy.Targets.MemPool.Platform import MemPoolEngine, MemPoolMapping, MemPoolOptimizer, MemPoolPlatform +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPMapping, PULPOptimizer, PULPPlatform + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description = "Test Utility for the Memory Level Extension.") + args = parser.parse_args() + + inputTypes = {} + inputOffsets = {} + + _GENDIRROOT = f'TEST_{args.platform.upper()}' + _GENDIR, _TESTDIR, _TESTNAME = 
getPaths(args.dir, _GENDIRROOT) + + print("GENDIR : ", _GENDIR) + print("TESTDIR : ", _TESTDIR) + print("TESTNAME : ", _TESTNAME) + + _DEEPLOYSTATEDIR = os.path.join(_GENDIR, "TEST_MEMORYLEVEL", "deeployStates") + _DEEPLOYSTATEDIRMOCK = os.path.join(_GENDIR, "TEST_MEMORYLEVEL", "deeployStatesMock") + + onnx_graph = onnx.load_model(f'{_TESTDIR}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputs = np.load(f'{_TESTDIR}/inputs.npz') + tensors = graph.tensors() + + # Load as int64 and infer types later + test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files] + + # Instantiate Classes Requried for Memory Level Annotation Extension + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 1024000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 512000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 128000) + + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel("L3") + defaultTargetMemoryLevel = L1 + + platform, signProp = mapPlatform(args.platform) + + for index, num in enumerate(test_inputs): + _type, offset = inferInputType(num, signProp)[0] + inputTypes[f"input_{index}"] = _type + inputOffsets[f"input_{index}"] = offset + if "simpleRegression" in args.dir: + inputOffsets[f"input_{index}"] = 0 + + deployer = mapDeployer(platform, graph, inputTypes, deeployStateDir = _DEEPLOYSTATEDIR, inputOffsets = inputOffsets) + + if args.platform == "QEMU-ARM": + + MockEngine = CMSISEngine("MockCmsis", Mapping = copy.copy(CMSISMapping)) + MockPlatform = CMSISPlatform(engines = [MockEngine]) + MockPlatform = setupMemoryPlatform(MockPlatform, memoryHierarchy, defaultTargetMemoryLevel) + + mockDeployer = MemoryLevelAwareSignPropDeployer(graph, + MockPlatform, + inputTypes, + CMSISOptimizer, + defaultScheduler, + name = "DeeployNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + default_channels_first = False) + + # # Manually add the necessary optimization passes to parse 
WaveFormer + mockDeployer.loweringOptimizer.passes += [ + TransposeMatmulInputsPass(), + NCHWtoNHWCPass(deployer.default_channels_first), + TransposeMergePass(), + TransposeConstOptPass() + ] + + elif args.platform == "MemPool": + + MockEngine = MemPoolEngine("MockMemPool", Mapping = copy.copy(MemPoolMapping)) + MockPlatform = MemPoolPlatform(engines = [MockEngine]) + MockPlatform = setupMemoryPlatform(MockPlatform, memoryHierarchy, defaultTargetMemoryLevel) + + mockDeployer = MemoryLevelAwareSignPropDeployer(graph, + MockPlatform, + inputTypes, + MemPoolOptimizer, + defaultScheduler, + name = "DeeployNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + default_channels_first = True) + + elif args.platform == "Generic": + + MockEngine = GenericEngine("MockGeneric", Mapping = copy.copy(GenericMapping)) + MockPlatform = GenericPlatform(engines = [MockEngine]) + MockPlatform = setupMemoryPlatform(MockPlatform, memoryHierarchy, defaultTargetMemoryLevel) + + mockDeployer = MemoryLevelAwareSignPropDeployer(graph, + MockPlatform, + inputTypes, + GenericOptimizer, + defaultScheduler, + name = "DeeployNetworkMock", + deeployStateDir = _DEEPLOYSTATEDIRMOCK, + default_channels_first = True) + + elif args.platform == "Siracusa": + + MockEngine = PULPClusterEngine("MockPulpCluster", Mapping = copy.copy(PULPMapping)) + MockPlatform = PULPPlatform(engines = [MockEngine]) + MockPlatform = setupMemoryPlatform(MockPlatform, memoryHierarchy, defaultTargetMemoryLevel) + + mockDeployer = MemoryLevelAwareSignPropDeployer(graph, + MockPlatform, + inputTypes, + PULPOptimizer, + defaultScheduler, + name = "DeeployNetworkMock", + deeployStateDir = _DEEPLOYSTATEDIRMOCK, + default_channels_first = False) + + # Manually add the necessary optimization pass to parse WaveFormer + mockDeployer.loweringOptimizer.passes += [ + TransposeMatmulInputsPass(), + NCHWtoNHWCPass(mockDeployer.default_channels_first), + TransposeMergePass(), + TransposeConstOptPass() + ] + + else: + raise 
RuntimeError(f"Deployment platform {args.platform} is not implemented") + + # Make the deployer memory-level aware + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemoryLevel) + deployer = MemoryDeployerWrapper(deployer) + + # Run the middleware and backend + mockDeployer.generateFunction() + deployer.generateFunction() + + # Test if the Contexts are correctly annotated for both the deployer and the mockDeployer + defaultMemoryLevel = deployer.Platform.memoryHierarchy.getDefaultMemoryLevel() + + for bufferName, buffer in deployer.ctxt.globalObjects.items(): + if isinstance(buffer, VariableBuffer) and not isinstance(buffer, StructBuffer): + assert buffer._memoryLevel == defaultMemoryLevel.name, f"Tensor {bufferName} in global scope of the deployer is not annotated with the default memory level" + + for bufferName, buffer in deployer.ctxt.localObjects.items(): + if isinstance(buffer, VariableBuffer) and not isinstance(buffer, StructBuffer): + assert buffer._memoryLevel == defaultMemoryLevel.name, f"Tensor {bufferName} in local scope of the deployer is not annotated with the default memory level" + + for bufferName, buffer in mockDeployer.ctxt.globalObjects.items(): + if isinstance(buffer, VariableBuffer) and not isinstance(buffer, StructBuffer): + assert buffer._memoryLevel == defaultMemoryLevel.name, f"Tensor {bufferName} in global scope of the mock deployer is not annotated with the default memory level" + + for bufferName, buffer in mockDeployer.ctxt.localObjects.items(): + if isinstance(buffer, VariableBuffer) and not isinstance(buffer, StructBuffer): + assert buffer._memoryLevel == defaultMemoryLevel.name, f"Tensor {bufferName} in local scope of the mock deployer is not annotated with the default memory level" + + # Test if the memoryHierarchy attribute of the deployer and mockDeployer are equal + assert mockDeployer.Platform.memoryHierarchy == deployer.Platform.memoryHierarchy, "Memory hierarchy of the deployer and 
mock deployer are not equal" + + # Test if the equality fails correctly if the memory hierarchy does not contain the same nodes + L3 = MemoryLevel(name = "L3", neighbourNames = [], size = 1024000) + memoryHierarchyAltered1 = MemoryHierarchy([L3]) + memoryHierarchyAltered1.setDefaultMemoryLevel("L3") + mockDeployer.Platform.memoryHierarchy = memoryHierarchyAltered1 + assert not mockDeployer.Platform.memoryHierarchy == deployer.Platform.memoryHierarchy, "Memory hierarchy of the deployer and mock deployer are equal but are not supposed to be" + + # Test if the equality fails correctly if the default memory hierarchy is not the same + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 1024000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 752000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 128000) + memoryHierarchyAltered2 = MemoryHierarchy([L3, L2, L1]) + memoryHierarchyAltered2.setDefaultMemoryLevel("L3") + mockDeployer.Platform.memoryHierarchy = memoryHierarchyAltered2 + assert not mockDeployer.Platform.memoryHierarchy == deployer.Platform.memoryHierarchy, "Memory hierarchy of the deployer and mock deployer are equal but are not supposed to be" + + print("Memory Level Extension test passed!") diff --git a/DeeployTest/testRegexMatching.py b/DeeployTest/testRegexMatching.py new file mode 100644 index 0000000..45ecd51 --- /dev/null +++ b/DeeployTest/testRegexMatching.py @@ -0,0 +1,86 @@ +# ---------------------------------------------------------------------- +# +# File: testRegexMatching.py +# +# Last edited: 10.10.2023. +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Luka Macan, University of Bologna +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
# Match any operation that contains conv in it's name
test_regex = r'.*[Cc]onv.*'
test_op_name = 'TestConv'


def _rename_conv_to_test_conv(graph: gs.Graph, match: Match, name: str):
    """Replacement callback: swap the single matched node for a TestConv node.

    The replacement node keeps the matched node's attributes and is rewired
    onto its inputs and outputs.
    """
    matched = list(match.nodes_map.values())
    assert len(matched) == 1
    conv = matched[0]
    replacement = gs.Node(op = test_op_name, name = name, attrs = {**conv.attrs})
    graph.replaceInsertNode(conv.inputs, conv.outputs, replacement)
    return graph


# Match all nodes with 'conv' in their name and add a `test` attribute
@contextagnostic
class ConvTestPass(ReplaceSequentialPatternPass):
    """Sequential-pattern pass matching a single node by the `test_regex` op regex."""

    def __init__(self):
        graphInput = gs.Variable(name = 'input_1')
        pattern = gs.Graph()
        patternOutput = pattern.layer(inputs = [graphInput], outputs = ['conv_out'], op = test_regex, name = 'conv1')
        pattern.outputs.append(patternOutput)
        pattern.inputs.append(graphInput)
        super().__init__(pattern, _rename_conv_to_test_conv, "_CONV_TEST_PASS", NonBranchingMatcher(regex_op = True))
optimized_graph = optimizer.optimize(graph) + + test_op_name_count = 0 + + for node in optimized_graph.nodes: + if node.op == test_op_name: + test_op_name_count += 1 + + assert match_count == test_op_name_count, "Didn't match all the operations." + + print("Test passed") diff --git a/DeeployTest/testReplaceInsertSubgraph.py b/DeeployTest/testReplaceInsertSubgraph.py new file mode 100644 index 0000000..c6a8aa5 --- /dev/null +++ b/DeeployTest/testReplaceInsertSubgraph.py @@ -0,0 +1,66 @@ +# ---------------------------------------------------------------------- +# +# File: testReplaceInsertSubgraph.py +# +# Last edited: 10.10.2023. +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Luka Macan, University of Bologna +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import onnx +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import TopologyOptimizer +from Deeploy.OptimizationPasses.TopologyOptimizationPasses.PULPPasses import PULPConvRequantMergePass + +if __name__ == "__main__": + test_dir = "Tests/simpleRegression" + + model = onnx.load(os.path.join(test_dir, "network.onnx")) + graph = gs.import_onnx(model).toposort() + graph_len = len(graph.nodes) + + subgraph = graph.copy() + # Make the subgraph the first 3 nodes + subgraph_len = 3 + subgraph.outputs = [subgraph.nodes[subgraph_len - 1].outputs[0]] + subgraph.cleanup() + + assert len(subgraph.nodes) == subgraph_len + assert len(graph.nodes) == graph_len + + optimizer = TopologyOptimizer([PULPConvRequantMergePass()]) + + subgraph = optimizer.optimize(subgraph) + graph.replaceInsertSubgraph(subgraph) + graph = graph.cleanup().toposort() + + assert graph.nodes[ + 0].op == "RequantizedConv", f"First 2 nodes should have been replaced with the RequantizedConv node. Got {graph.nodes[0].op}" + + non_optimized = [2, 5, 8] + for i in non_optimized: + assert graph.nodes[i].op == "Conv" and graph.nodes[ + i + + 1].op == "RequantShift", f"Nodes outside of the subgraph shouldn't have been optimized. Failed on nodes {i}:{graph.nodes[i].op} and {i+1}:{graph.nodes[i+1].op}" + + print("Test passed") diff --git a/DeeployTest/testRunner_cortexm.py b/DeeployTest/testRunner_cortexm.py new file mode 100644 index 0000000..efa927f --- /dev/null +++ b/DeeployTest/testRunner_cortexm.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: testRunner_qemu.py +# +# Last edited: 17.03.2023 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. 
+# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = False, + description = "Deeploy Code Generation Utility for the ARM (QEMU) Platform (no Tiling).") + args = parser.parse_args() + + testRunner = TestRunner(platform = "QEMU-ARM", simulator = "qemu", tiling = False, argument_parser = parser) + + testRunner.run() diff --git a/DeeployTest/testRunner_generic.py b/DeeployTest/testRunner_generic.py new file mode 100644 index 0000000..70909bf --- /dev/null +++ b/DeeployTest/testRunner_generic.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: testRunner_generic.py +# +# Last edited: 17.03.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = False, + description = "Deeploy Code Generation Utility for the Generic Platform (Host Machine, no Tiling).") + args = parser.parse_args() + + testRunner = TestRunner(platform = "Generic", simulator = "host", tiling = False, argument_parser = parser) + + testRunner.run() diff --git a/DeeployTest/testRunner_mempool.py b/DeeployTest/testRunner_mempool.py new file mode 100644 index 0000000..c3bbd4f --- /dev/null +++ b/DeeployTest/testRunner_mempool.py @@ -0,0 +1,44 @@ +# ---------------------------------------------------------------------- +# +# File: testRunner_mempool.py +# +# Last edited: 17.03.2023 +# +# Copyright (C) 2022, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + parser = TestRunnerArgumentParser( + tiling_arguments = False, description = "Deeploy Code Generation Utility for the MemPool Platform (no Tiling).") + + parser.add_argument('-n', + metavar = 'num_threads', + dest = 'num_threads', + type = int, + default = 16, + help = 'Number of parallel threads\n') + args = parser.parse_args() + + testRunner = TestRunner(platform = "MemPool", simulator = "banshee", tiling = False, argument_parser = parser) + + testRunner.cmake_args += f" -D num_threads={args.num_threads}" + + testRunner.run() diff --git a/DeeployTest/testRunner_siracusa.py b/DeeployTest/testRunner_siracusa.py new file mode 100644 index 0000000..b23287d --- /dev/null +++ b/DeeployTest/testRunner_siracusa.py @@ -0,0 +1,46 @@ +# ---------------------------------------------------------------------- +# +# File: testRunner_siracusa.py +# +# Last edited: 11.04.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = False, + description = "Deeploy Code Generation Utility for the Siracusa Platform (no Tiling).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 1, + help = 'Set number of cluster cores') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa", simulator = "gvsoc", tiling = False, argument_parser = parser) + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + testRunner.run() diff --git a/DeeployTest/testRunner_tiled_siracusa.py b/DeeployTest/testRunner_tiled_siracusa.py new file mode 100644 index 0000000..4609df8 --- /dev/null +++ b/DeeployTest/testRunner_tiled_siracusa.py @@ -0,0 +1,46 @@ +# ---------------------------------------------------------------------- +# +# File: testRunner_tiled_siracusa.py +# +# Last edited: 31.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = True, + description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & NEureka).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 1, + help = 'Set number of cluster cores') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa", simulator = "gvsoc", tiling = True, argument_parser = parser) + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + + testRunner.run() diff --git a/DeeployTest/testRunner_tiled_siracusa_w_neureka.py b/DeeployTest/testRunner_tiled_siracusa_w_neureka.py new file mode 100644 index 0000000..95c3eec --- /dev/null +++ b/DeeployTest/testRunner_tiled_siracusa_w_neureka.py @@ -0,0 +1,70 @@ +# ---------------------------------------------------------------------- +# +# File: testRunner_tiled_siracusa_w_neureka.py +# +# Last edited: 31.10.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = True, + description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & NEureka).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 1, + help = 'Set number of cluster cores') + parser.add_argument('--neureka-wmem', + dest = "neureka_wmem", + action = "store_true", + default = False, + help = 'Adds weight memory and neureka engine color\n') + parser.add_argument('--enable-3x3', + dest = "enable_3x3", + action = "store_true", + default = False, + help = 'Adds EXPERIMENTAL support for 3x3 convolutions on N-EUREKA\n') + parser.add_argument('--enableStrides', + dest = "enableStrides", + action = "store_true", + default = False, + help = 'Adds EXPERIMENTAL support for strided convolutions on N-EUREKA\n') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa_w_neureka", + simulator = "gvsoc", + tiling = True, + argument_parser = parser) + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + if args.neureka_wmem: + testRunner.gen_args += " --neureka-wmem" + if args.enable_3x3: + testRunner.gen_args += " --enable-3x3" + if args.enableStrides: + testRunner.gen_args += " --enableStrides" + + testRunner.run() diff --git a/DeeployTest/testRunner_tiled_snitch.py b/DeeployTest/testRunner_tiled_snitch.py new file mode 100644 index 0000000..34ebefb --- /dev/null +++ b/DeeployTest/testRunner_tiled_snitch.py @@ -0,0 +1,46 @@ +# ---------------------------------------------------------------------- +# +# File: testRunner_tiled_snitch.py +# +# Last edited: 23.04.2024 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
+# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser(tiling_arguments = True, + description = "Deeploy Code Generation Utility for the Snitch Platform (Tiling).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 9, + help = 'Set number of cluster cores') + parser.set_defaults(toolchain_install_dir = "/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0") + args = parser.parse_args() + + testRunner = TestRunner(platform = "Snitch", simulator = "banshee", tiling = True, argument_parser = parser) + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + + testRunner.run() diff --git a/DeeployTest/testSchedulingExtension.py b/DeeployTest/testSchedulingExtension.py new file mode 100644 index 0000000..d6372de --- /dev/null +++ b/DeeployTest/testSchedulingExtension.py @@ -0,0 +1,384 @@ +# ---------------------------------------------------------------------- +# +# File: tilerExtensionTest.py +# +# Last edited: 09.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
+# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from collections import OrderedDict +from typing import Dict, List + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +import pytest +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.typeMapping import inferInputType + +from Deeploy.DeeployTypes import NetworkContext, NetworkDeployer, ONNXLayer, Schedule, StructBuffer, TransientBuffer, \ + VariableBuffer +from Deeploy.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, AnnotateIOMemoryLevel +from Deeploy.TilingExtension.MemoryScheduler import MemoryBlock +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper, TilingSolution + + +# Mock of the Global Scheduler's inteface +# Returns a list of list of nodes instead of simply a list +# Inner list represent the patter over which we tile +def _mockScheduler(graph: gs.Graph) -> List[List[gs.Node]]: + + schedule = [[node] for node in graph.nodes] + + return schedule + + +def _filterSchedule(schedule: List[List[gs.Node]], layerBinding: 'OrderedDict[str, ONNXLayer]') -> 
List[List[gs.Node]]: + + filteredSchedule = [] + + for pattern in schedule: + + filteredSchedulePattern = [] + for node in pattern: + if node.name in layerBinding.keys(): + filteredSchedulePattern.append(node) + filteredSchedule.append(filteredSchedulePattern) + + return filteredSchedule + + +def getMemoryOccupation(ctxt, tiledTensors, memoryLevel): + + occupation = 0 + + for tensor in tiledTensors.values(): + for memoryConstraint in tensor.memoryConstraints.values(): + if memoryConstraint.memoryLevel == memoryLevel: + + if not isinstance(ctxt.lookup(tensor.tensorName), TransientBuffer): + typeWidth = (ctxt.lookup(tensor.tensorName)._type.referencedType.typeWidth // 8) + else: + typeWidth = 1 + + delta = memoryConstraint.multiBufferCoefficient * memoryConstraint.size * typeWidth + occupation += delta + + return occupation + + +def validateTilingTopologySolution(schedule: Schedule, tilingSchedule: Schedule, memoryHierarchy: MemoryHierarchy): + + assert len(schedule) == len(tilingSchedule), "ERROR: schedule and tilingSchedule don't have the same length" + + for pattern, tilingPattern in zip(schedule, tilingSchedule): + subGraph = gs.Graph(nodes = pattern) + patternTensors = set([key for key, value in subGraph.tensors().items() if ctxt.lookup(key)._deploy]) + + # intermediateTensors are all tensors that are used and produced by the pattern. + # Including transient Buffers! 
+ usedTensors = set() + producedTensors = set() + transientTensors = set() + + for tensor in patternTensors: + users = ctxt.lookup(tensor)._users + + for node in pattern: + if node.name in users: + usedTensors.add(tensor) + break + + for node in pattern: + outputTensors = {node.name for node in node.outputs} + producedTensors |= outputTensors + + for tensorName, varBuffer in ctxt.localObjects.items(): + if isinstance(varBuffer, TransientBuffer): + assert len(varBuffer._users) == 1 + if varBuffer._users[0] in patternTensors: + transientTensors.add(tensorName) + + for tilingStep in tilingPattern.nodeConstraints: + borderTensors = { + tensor.tensorName + for tensor in tilingStep.tensorMemoryConstraints.values() + if len(tensor.memoryConstraints.keys()) > 1 + } + + intermediateTensors = patternTensors - borderTensors + assert intermediateTensors == ((usedTensors & producedTensors) | + transientTensors), "ERROR in tilingSchedule!" + + assert borderTensors == (usedTensors - producedTensors) | (producedTensors - + usedTensors), "ERROR in tilingSchedule!" + + l1Occupation = getMemoryOccupation(ctxt, tilingStep.tensorMemoryConstraints, "L1") + assert l1Occupation <= memoryHierarchy.memoryLevels['L1'].size, "L1 usage is too high" + + l2Occupation = getMemoryOccupation(ctxt, tilingStep.tensorMemoryConstraints, "L2") + assert l2Occupation <= memoryHierarchy.memoryLevels['L2'].size, "L2 usage is too high!" + + +def _findBlocks(memoryMap: Dict[str, List[List[MemoryBlock]]], name: str) -> List[MemoryBlock]: + + res = [] + + for key, patterns in memoryMap.items(): + for pattern in patterns: + for block in pattern: + if block.name == name: + res.append(block) + + return res + + +def validateStaticMemoryLayoutSolution(ctxt: NetworkContext, memoryMap: Dict[str, List[List[MemoryBlock]]]): + + # SCHEREMO: Assert that every VariableBuffer and ConstantBuffer is fully allocated somewhere + # SCHEREMO: This doesn't need to hold for depth-first tiling! 
+ for key, buf in {**ctxt.localObjects}.items(): + if not isinstance(buf, (VariableBuffer)) or isinstance(buf, TransientBuffer) or isinstance(buf, StructBuffer): + continue + + # SCHEREMO: Exception for memory arenas + if buf._users == []: + continue + + blocks = _findBlocks(memoryMap, key) + + if len(blocks) == 0: + raise Exception(f"Didn't find any allocation of {key}") + + buf = ctxt.lookup(key) + blockSize = np.prod(buf.shape) * (buf._type.referencedType.typeWidth // 8) + + blockFound = False + for block in blocks: + size = block.addrSpace[1] - block.addrSpace[0] + blockFound |= (size == blockSize) + + assert blockFound, f"Didn't find full allocation of block {key}, expected {size} got {blockSize}" + + +def validateDynamicMemoryLayoutSolution(ctxt: NetworkContext, tilingSchedule: TilingSolution, + memoryMap: Dict[str, List[List[MemoryBlock]]]): + + # SCHEREMO: Assert that tilingSchedule is implemented + for patternIdx, patternConstraints in enumerate(tilingSchedule): + for nodeConstraint in patternConstraints.nodeConstraints: + for tensorConstraint in nodeConstraint.tensorMemoryConstraints.values(): + + blocks = _findBlocks(memoryMap, tensorConstraint.tensorName) + blockLevels = [block.level for block in blocks] + + buf = ctxt.lookup(tensorConstraint.tensorName) + + for memoryConstraint in tensorConstraint.memoryConstraints.values(): + + # SCHEREMO: Don't check static allocation + if buf._memoryLevel == memoryConstraint.memoryLevel: + continue + + assert memoryConstraint.memoryLevel in blockLevels, f"No constraint for {tensorConstraint.tensorName} memoryLevel {memoryConstraint.memoryLevel}" + + patternBlocks = memoryMap[memoryConstraint.memoryLevel][patternIdx] + + _block = [block for block in patternBlocks if block.name == tensorConstraint.tensorName] + + assert len(_block) == 1, f"{tensorConstraint.tensorName} not exactly once in pattern {patternIdx}!" 
+ + block = _block[0] + otherBlocks = [oblock for oblock in patternBlocks if oblock != block] + + collisions = [] + _buffer = ctxt.lookup(block.name) + for other in otherBlocks: + _otherBuffer = ctxt.lookup(other.name) + if (hasattr(_buffer, "_alias") + and _buffer._alias == other.name) or (hasattr(_otherBuffer, "_alias") + and _otherBuffer._alias == block.name): + collisions.append(False) + continue + + collisions.append(block.collides(other)) + + assert not any(collisions), f"{block.name} collides with another block in pattern {patternIdx}" + + ctxtSize = memoryConstraint.size * memoryConstraint.multiBufferCoefficient * ( + buf._type.referencedType.typeWidth // 8) + blockSize = block.addrSpace[1] - block.addrSpace[0] + + assert ctxtSize <= blockSize, f"{tensorConstraint.tensorName}'s expected size does not match!" + + +def setupDeployer(memoryHierarchy: MemoryHierarchy, graph: gs.Graph) -> NetworkDeployer: + + inputTypes = {} + inputOffsets = {} + + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + inputs = np.load(f'./{args.dir}/inputs.npz') + tensors = graph.tensors() + + # Load as int64 and infer types later + test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files] + + platform, signProp = mapPlatform(args.platform) + + for index, num in enumerate(test_inputs): + _type, offset = inferInputType(num, signProp)[0] + inputTypes[f"input_{index}"] = _type + inputOffsets[f"input_{index}"] = offset + + deployer = mapDeployer(platform, + graph, + inputTypes, + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets, + scheduler = _mockScheduler) + + memoryLevelAnnotationPasses = [AnnotateDefaultMemoryLevel(memoryHierarchy), AnnotateIOMemoryLevel("L2")] + + # Make the deployer memory-level aware + deployer.Platform = setupMemoryPlatform(deployer.Platform, + memoryHierarchy, + defaultTargetMemoryLevel = memoryHierarchy.memoryLevels["L1"]) + # Make the deployer memory-level aware + deployer = MemoryDeployerWrapper(deployer, 
memoryLevelAnnotationPasses) + + # Make the deployer tiler aware + deployer = TilerDeployerWrapper(deployer) + + deployer.frontEnd() + #deployer.midEnd() + + return deployer + + +def validateEffectiveLoad(outerMemoryMap: Dict[str, List[List[MemoryBlock]]], + innerMemoryMap: Dict[str, List[List[MemoryBlock]]], memoryHierarchy: MemoryHierarchy): + staticLoadDict = {} + maxAddr = 0 + for level, patterns in outerMemoryMap.items(): + maxAddr = 0 + for pattern in patterns: + for block in pattern: + maxAddr = max(maxAddr, block.addrSpace[1]) + staticLoadDict[level] = maxAddr + + dynamicLoadDict = {} + maxAddr = 0 + for level, patterns in innerMemoryMap.items(): + maxAddr = 0 + for pattern in patterns: + for block in pattern: + maxAddr = max(maxAddr, block.addrSpace[1]) + dynamicLoadDict[level] = maxAddr + + totalLoadDict = {} + for level in dynamicLoadDict.keys(): + totalLoadDict[level] = staticLoadDict[level] + dynamicLoadDict[level] + + for level, load in totalLoadDict.items(): + assert memoryHierarchy.memoryLevels[ + level].size > load, f"Effective memory layout does not fit {memoryHierarchy.memoryLevels[level].size} in {level}" + + +def validateDynamicLifetimes(ctxt: NetworkContext, tilingSchedule: TilingSolution, + outerMemoryMap: Dict[str, List[List[MemoryBlock]]]): + + for patternIdx, pattern in enumerate(tilingSchedule): + for nodeConstraint in pattern.nodeConstraints: + for tensor in nodeConstraint.tensorMemoryConstraints.values(): + name = tensor.tensorName + + buf = ctxt.lookup(name) + if isinstance(buf, TransientBuffer) or ctxt.is_global(name): + continue + + blocks = _findBlocks(outerMemoryMap, name) + assert len(blocks) == 1, f"Found {name} more than once in static life time map!" + + block = blocks[0] + assert (patternIdx >= block.lifetime[0] and patternIdx + <= block.lifetime[1]), f"Tile of {name} is used after deallocation of the static buffer!" 
+ + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description = "Test Utility for the Scheduling Extension.") + parser.add_argument('--l1', metavar = 'l1', dest = 'l1', type = int, default = 64000, help = 'Set L1 size\n') + parser.add_argument('--shouldFail', action = 'store_true') + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + onnx_graph = onnx.load_model(f'./{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + # Instantiate Classes Requried for Memory Level Annotation Extension + L3_2 = MemoryLevel(name = "L3.1", neighbourNames = ["L2"], size = 1024000) + L3_1 = MemoryLevel(name = "L3.2", neighbourNames = ["L2"], size = 4000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3.1", "L3.2", "L1"], size = 512000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + + memoryHierarchy = MemoryHierarchy([L3_1, L3_2, L2, L1]) + #memoryHierarchy.setDefaultMemoryLevel("L3.1") + memoryHierarchy.setDefaultMemoryLevel("L2") + + deployer = setupDeployer(memoryHierarchy, graph) + + schedule = _filterSchedule(_mockScheduler(graph), deployer.layerBinding) + + if args.shouldFail: + with pytest.raises(Exception): + tilingSchedule = deployer.tiler.computeTilingSchedule(deployer.ctxt) + + print("Tiler test ended, failed as expected!") + else: + + _ = deployer.generateFunction() + + tilingSchedule = deployer.tiler._getTilingSolution(deployer.tiler.tilerModel, deployer.ctxt, + deployer.tiler.tilerModel._collector, + deployer.tiler.symbolicMemoryConstraints) + + ctxt = deployer.ctxt + layerBinding = deployer.layerBinding + schedule = _mockScheduler(deployer.graph) + + validateTilingTopologySolution(schedule, tilingSchedule, memoryHierarchy) + + innerMemoryMap = deployer.tiler.innerMemoryScheduler.memoryMap + outerMemoryMap = deployer.tiler.outerMemoryScheduler.memoryMap + + validateStaticMemoryLayoutSolution(ctxt, outerMemoryMap) + validateDynamicMemoryLayoutSolution(ctxt, tilingSchedule, 
innerMemoryMap) + + validateDynamicLifetimes(ctxt, tilingSchedule, outerMemoryMap) + + validateEffectiveLoad(outerMemoryMap, innerMemoryMap, memoryHierarchy) + + print("Tiler test ended, no memory violations!") diff --git a/DeeployTest/testSlice_PULP.py b/DeeployTest/testSlice_PULP.py new file mode 100644 index 0000000..dda9d13 --- /dev/null +++ b/DeeployTest/testSlice_PULP.py @@ -0,0 +1,169 @@ +# ---------------------------------------------------------------------- +# +# File: testSlice_PULP.py +# +# Last edited: 15.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import subprocess + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerate import generateTestInputsHeader, generateTestNetworkHeader, \ + generateTestNetworkImplementation, generateTestOutputsHeader +from testUtils.platformMapping import mapDeployer, setupMemoryPlatform +from testUtils.testRunner import escapeAnsi +from testUtils.typeMapping import inferInputType + +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.Targets.PULPOpen.Platform import PULPPlatform +from Deeploy.Targets.PULPOpen.Templates.AllocateTemplate import pulpL1AllocateTemplate +from Deeploy.Targets.PULPOpen.Templates.FreeTemplate import pulpL1FreeTemplate + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description = "Test Utility for the Slice Operation.") + parser.add_argument('--toolchain', + metavar = 'toolchain', + dest = 'toolchain', + type = str, + default = "LLVM", + help = 'Pick compiler toolchain') + parser.add_argument('--toolchain_install_dir', + metavar = 'toolchain_install_dir', + dest = 'toolchain_install_dir', + type = str, + default = os.environ.get('LLVM_INSTALL_DIR'), + help = 'Pick compiler install dir') + args = parser.parse_args() + _TOOLCHAIN_DIR = os.path.normpath(args.toolchain_install_dir) + + signProp = False + + onnx_graph = onnx.load_model('./Tests/testSlice/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputs = np.load('./Tests/testSlice/inputs.npz') + outputs = np.load(f'./Tests/testSlice/outputs.npz') + tensors = graph.tensors() + + # Load as int64 and infer types later + test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files] + test_outputs = [outputs[x].reshape(-1).astype(np.int64) for x in outputs.files] + + inputTypes = {} + inputOffsets = {} + + L3 = MemoryLevel(name = "L3", neighbourNames = 
["L2"], size = 1024000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 512000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 128000) + + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel("L2") + + platform = PULPPlatform() + + for index, num in enumerate(test_inputs): + _type, offset = inferInputType(num, signProp)[0] + inputTypes[f"input_{index}"] = _type + inputOffsets[f"input_{index}"] = offset + + deployer = mapDeployer(platform, graph, inputTypes, inputOffsets = inputOffsets) + # Make the platform memory-level aware + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemoryLevel = L1) + # Make the deployer memory-level aware + deployer = MemoryDeployerWrapper(deployer) + + # Go through flow + deployer.frontEnd() + deployer.parse(deployer.default_channels_first) + + deployer.ctxt.lookup('onnx::Slice_5')._memoryLevel = "L1" + deployer.ctxt.lookup('onnx::Slice_5').allocTemplate = pulpL1AllocateTemplate + deployer.ctxt.lookup('onnx::Slice_5').deallocTemplate = pulpL1FreeTemplate + + deployer.midEnd() + + deployer.codeTransform() + deployer.prepared = True + deployer.generateInferenceCode() + + # Create input and output vectors + os.makedirs('TEST_SIRACUSA/Tests/testSlice', exist_ok = True) + + testInputStr = generateTestInputsHeader(deployer, test_inputs, inputTypes, inputOffsets) + f = open('TEST_SIRACUSA/Tests/testSlice/testinputs.h', "w") + f.write(testInputStr) + f.close() + + testOutputStr = generateTestOutputsHeader(deployer, test_outputs, signProp, False) + f = open('TEST_SIRACUSA/Tests/testSlice/testoutputs.h', "w") + f.write(testOutputStr) + f.close() + + # Generate code for Network + testNetworkHeaderStr = generateTestNetworkHeader(deployer, platform) + f = open('TEST_SIRACUSA/Tests/testSlice/Network.h', "w") + f.write(testNetworkHeaderStr) + f.close() + + testNetworkImplementationStr = generateTestNetworkImplementation(deployer, 
platform) + f = open('TEST_SIRACUSA/Tests/testSlice/Network.c', "w") + f.write(testNetworkImplementationStr) + f.close() + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/Network.c') + os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/Network.h') + os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/testoutputs.h') + os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/testinputs.h') + + os.system( + f"$CMAKE -DTOOLCHAIN={args.toolchain} -DTOOLCHAIN_INSTALL_DIR={_TOOLCHAIN_DIR} -DTESTNAME=testSlice -DGENERATED_SOURCE=TEST_SIRACUSA/Tests/testSlice -Dplatform=Siracusa -B TEST_SIRACUSA/build -DNUM_CORES=1 .." + ) + process = subprocess.Popen(["$CMAKE --build TEST_SIRACUSA/build --target gvsoc_testSlice"], + stdout = subprocess.PIPE, + stderr = subprocess.STDOUT, + shell = True, + encoding = 'utf-8') + fileHandle = open('out.txt', 'a') + fileHandle.write(f"################## Testing Tests/testSlice on SIRACUSA Platform ##################\n") + + result = "" + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + result += output + fileHandle.write(f"{escapeAnsi(output)}") + + print(result.strip()) + + fileHandle.write("") + fileHandle.close() + + if not "Errors: 0 out of " in result: + raise RuntimeError(f"Found an error in Tests/testSlice") diff --git a/DeeployTest/testTilerExtension.py b/DeeployTest/testTilerExtension.py new file mode 100644 index 0000000..edf1e6d --- /dev/null +++ b/DeeployTest/testTilerExtension.py @@ -0,0 +1,242 @@ +# ---------------------------------------------------------------------- +# +# File: tilerExtensionTest.py +# +# Last edited: 09.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. 
import os
from collections import OrderedDict
from typing import List

import numpy as np
import onnx
import onnx_graphsurgeon as gs
import pytest
from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform
from testUtils.testRunner import TestGeneratorArgumentParser
from testUtils.typeMapping import inferInputType

from Deeploy.DeeployTypes import GlobalDefinition, NetworkDeployer, ONNXLayer, Schedule, TransientBuffer
from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper
from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \
    AnnotateIOMemoryLevel
from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper

# Mock of the Global Scheduler's interface.
# Returns a list of lists of nodes instead of simply a list;
# each inner list represents the pattern over which we tile.


def _mockScheduler(graph: gs.Graph) -> List[List[gs.Node]]:
    """Trivial schedule: every node of the graph becomes its own single-node pattern."""

    schedule = [[node] for node in graph.nodes]

    return schedule


def _filterSchedule(schedule: List[List[gs.Node]], layerBinding: 'OrderedDict[str, ONNXLayer]') -> List[List[gs.Node]]:
    """Drop from every pattern the nodes that have no entry in `layerBinding`.

    Patterns that lose all their nodes are kept as empty lists, preserving the
    pattern count of the input schedule.
    """

    filteredSchedule = []

    for pattern in schedule:

        filteredSchedulePattern = []
        for node in pattern:
            if node.name in layerBinding.keys():
                filteredSchedulePattern.append(node)
        filteredSchedule.append(filteredSchedulePattern)

    return filteredSchedule


def getMemoryOccupation(ctxt, tiledTensors, memoryLevel):
    """Sum the bytes occupied in `memoryLevel` by live global buffers plus tiled tensors.

    Global buffers contribute their full (untiled) size; tiled tensors contribute
    tile size * multi-buffer coefficient for every constraint placed in this level.
    TransientBuffers are counted with a type width of 1 byte — presumably they are
    raw byte scratchpads; TODO confirm against TransientBuffer's definition.
    """

    occupation = 0

    for tensor in ctxt.globalObjects.values():

        # GlobalDefinitions are code-level definitions, not memory-backed buffers.
        if isinstance(tensor, GlobalDefinition):
            continue

        # Only count buffers that actually live in this level and have at least one user.
        if tensor._memoryLevel == memoryLevel and tensor._users != []:
            occupation += np.prod(tensor.shape) * (tensor._type.referencedType.typeWidth // 8)

    for tensor in tiledTensors.values():
        for memoryConstraint in tensor.memoryConstraints.values():
            if memoryConstraint.memoryLevel == memoryLevel:

                if not isinstance(ctxt.lookup(tensor.tensorName), TransientBuffer):
                    typeWidth = (ctxt.lookup(tensor.tensorName)._type.referencedType.typeWidth // 8)
                else:
                    typeWidth = 1

                # Each tile may be multi-buffered (e.g. double buffering doubles the footprint).
                delta = memoryConstraint.multiBufferCoefficient * memoryConstraint.size * typeWidth
                occupation += delta

    return occupation


def validateSolution(schedule: Schedule, tilingSchedule: Schedule, memoryHierarchy: MemoryHierarchy):
    """Assert structural and memory-budget consistency of a tiling solution.

    For every pattern, checks that the tiler classified tensors correctly
    (border vs. intermediate) and that L1/L2 occupation stays within the
    hierarchy's limits.

    NOTE(review): this function reads the module-global `ctxt` set in the
    `__main__` section rather than taking it as a parameter — it can only be
    called after the deployer has run; consider passing `ctxt` explicitly.
    """

    assert len(schedule) == len(tilingSchedule), "ERROR: schedule and tilingSchedule don't have the same length"

    for pattern, tilingPattern in zip(schedule, tilingSchedule):
        subGraph = gs.Graph(nodes = pattern)
        # Only tensors that are actually deployed participate in the check.
        patternTensors = set([key for key, value in subGraph.tensors().items() if ctxt.lookup(key)._deploy])

        # intermediateTensors are all tensors that are used and produced by the pattern,
        # including transient buffers!
        usedTensors = set()
        producedTensors = set()
        transientTensors = set()

        for tensor in patternTensors:
            users = ctxt.lookup(tensor)._users

            for node in pattern:
                if node.name in users:
                    usedTensors.add(tensor)
                    break

        for node in pattern:
            outputTensors = {node.name for node in node.outputs}
            producedTensors |= outputTensors

        # Transient buffers have exactly one user; attribute them to the pattern
        # that owns that user.
        for tensorName, varBuffer in ctxt.localObjects.items():
            if isinstance(varBuffer, TransientBuffer):
                assert len(varBuffer._users) == 1
                if varBuffer._users[0] in patternTensors:
                    transientTensors.add(tensorName)

        for tilingStep in tilingPattern.nodeConstraints:
            # Border tensors are constrained in more than one memory level
            # (they cross the pattern boundary).
            borderTensors = {
                tensor.tensorName
                for tensor in tilingStep.tensorMemoryConstraints.values()
                if len(tensor.memoryConstraints) > 1
            }

            intermediateTensors = patternTensors - borderTensors

            # Intermediates must be exactly the tensors both used and produced
            # inside the pattern (plus transients); borders are the symmetric
            # difference of used and produced.
            assert intermediateTensors == ((usedTensors & producedTensors) |
                                           transientTensors), "ERROR in tilingSchedule!"
            assert borderTensors == (usedTensors - producedTensors) | (producedTensors -
                                                                       usedTensors), "ERROR in tilingSchedule!"

            l1Occupation = getMemoryOccupation(ctxt, tilingStep.tensorMemoryConstraints, "L1")
            assert l1Occupation <= memoryHierarchy.memoryLevels['L1'].size, "L1 usage is too high!"

            l2Occupation = getMemoryOccupation(ctxt, tilingStep.tensorMemoryConstraints, "L2")
            assert l2Occupation <= memoryHierarchy.memoryLevels['L2'].size, "L2 usage is too high!"
def setupDeployer(memoryHierarchy: MemoryHierarchy, graph: gs.Graph) -> NetworkDeployer:
    """Build a memory-level-aware, tiler-aware deployer for the test network.

    Loads the test inputs from `args.dir`, infers input types/offsets, wraps the
    platform deployer with the memory-level and tiler extensions, and runs the
    front end.

    NOTE(review): reads the module-global `args` parsed in `__main__`; only
    callable from this script.
    """

    inputTypes = {}
    inputOffsets = {}

    _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates")

    inputs = np.load(f'./{args.dir}/inputs.npz')
    tensors = graph.tensors()

    # Load as int64 and infer types later
    test_inputs = [inputs[x].reshape(-1).astype(np.int64) for x in inputs.files]

    platform, signProp = mapPlatform(args.platform)

    for index, num in enumerate(test_inputs):
        _type, offset = inferInputType(num, signProp)[0]
        inputTypes[f"input_{index}"] = _type
        inputOffsets[f"input_{index}"] = offset
        # simpleRegression inputs are deployed without an offset — TODO confirm why.
        if "simpleRegression" in args.dir:
            inputOffsets[f"input_{index}"] = 0

    deployer = mapDeployer(platform,
                           graph,
                           inputTypes,
                           deeployStateDir = _DEEPLOYSTATEDIR,
                           inputOffsets = inputOffsets,
                           scheduler = _mockScheduler)

    memoryLevelAnnotationPasses = [AnnotateIOMemoryLevel("L2"), AnnotateDefaultMemoryLevel(memoryHierarchy)]

    # Make the platform memory-level aware
    deployer.Platform = setupMemoryPlatform(deployer.Platform,
                                            memoryHierarchy,
                                            defaultTargetMemoryLevel = memoryHierarchy.memoryLevels["L1"])
    # Make the deployer memory-level aware
    deployer = MemoryDeployerWrapper(deployer, memoryLevelAnnotationPasses)

    # Make the deployer tiler aware
    deployer = TilerDeployerWrapper(deployer)

    deployer.frontEnd()

    return deployer


if __name__ == '__main__':

    parser = TestGeneratorArgumentParser(description = "Test Utility for the Tiler Extension.")

    parser.add_argument('--l1', metavar = 'l1', dest = 'l1', type = int, default = 64000, help = 'Set L1 size\n')
    parser.add_argument('--shouldFail', action = 'store_true')
    parser.set_defaults(shouldFail = False)
    args = parser.parse_args()

    onnx_graph = onnx.load_model(f'./{args.dir}/network.onnx')
    graph = gs.import_onnx(onnx_graph)

    # Instantiate classes required for the Memory Level Annotation Extension.
    # NOTE(review): the variable names and level names are swapped here —
    # L3_2 is named "L3.1" (1 MB) and L3_1 is named "L3.2" (4 kB). Behavior is
    # unaffected (lookups go by name), but the naming is confusing; verify intent.
    L3_2 = MemoryLevel(name = "L3.1", neighbourNames = ["L2"], size = 1024000)
    L3_1 = MemoryLevel(name = "L3.2", neighbourNames = ["L2"], size = 4000)
    L2 = MemoryLevel(name = "L2", neighbourNames = ["L3.1", "L3.2", "L1"], size = 512000)
    L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1)

    memoryHierarchy = MemoryHierarchy([L3_1, L3_2, L2, L1])
    memoryHierarchy.setDefaultMemoryLevel("L2")

    deployer = setupDeployer(memoryHierarchy, graph)

    schedule = _filterSchedule(_mockScheduler(graph), deployer.layerBinding)

    if args.shouldFail:
        # Tiling must be infeasible under the (too small) L1 budget.
        with pytest.raises(Exception):
            tilingSchedule = deployer.tiler.computeTilingSchedule(deployer.ctxt)

        print("Tiler test ended, failed as expected!")
    else:

        _ = deployer.generateFunction()

        tilingSchedule = deployer.tiler._getTilingSolution(deployer.tiler.tilerModel, deployer.ctxt,
                                                           deployer.tiler.tilerModel._collector,
                                                           deployer.tiler.symbolicMemoryConstraints)

        # `ctxt` is read as a module-global by validateSolution below.
        ctxt = deployer.ctxt
        layerBinding = deployer.layerBinding
        schedule = _mockScheduler(deployer.graph)

        validateSolution(schedule, tilingSchedule, memoryHierarchy)

        print("Tiler test ended, no memory violations!")
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle + +import pytest + +from Deeploy.AbstractDataTypes import PointerClass, StructClass +from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, int8_t, int16_t, int32_t +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, StructBuffer, TransientBuffer, VariableBuffer + + +def testImmediateSerialization(): + for _type in IntegerDataTypes: + val = _type(17) + assert pickle.loads(pickle.dumps(val)) == val, f"Serialized value {val} should be equal!" + assert pickle.loads(pickle.dumps(_type))(17) == _type(17), f"Serialized type {_type} should be equal to type!" + return True + + +def testStructSerialization(): + structType = {"f1": int32_t, "f2": int8_t} + s = StructClass("s", structType) + val = {"f1": 7, "f2": 8} + + assert pickle.loads(pickle.dumps(s))(val) == s(val), f"Serialized type {s} should be equal to type!" + assert pickle.loads(pickle.dumps(s(val))) == s(val), f"Serialized type {s} should be equal to type!" + globals()[s] = None + return True + + +def testImmediateTypeEquivalence(): + for _type in IntegerDataTypes: + val = _type(17) + assert _type(17) == val, f"Type {_type} should be equal if content is the same!" + return True + + +def testStructTypeEquivalence(): + structType = {"f1": int32_t, "f2": int8_t} + s = StructClass("s4", structType) + val = {"f1": 7, "f2": 8} + assert s(val) == s(val), f"Type {s} should be equal if content is same!" 
+ + return True + + +def testImmediatePromotion(): + with pytest.raises(Exception): + _ = int8_t(2**7) + _ = int16_t(2**15) + _ = int32_t(2**31) + a = int8_t(2**7 - 1) + b = int16_t(2**15 - 1) + c = int32_t(2**31 - 1) + + with pytest.raises(Exception): + _ = int8_t(b) + _ = int8_t(c) + _ = int16_t(c) + + _ = int8_t(a) + _ = int16_t(a) + _ = int32_t(a) + + _ = int16_t(b) + _ = int32_t(b) + + _ = int32_t(c) + + return True + + +def generateTestStruct() -> StructClass: + testStructType = {"f1": int32_t, "f2": int8_t} + s1 = StructClass("s2", testStructType) + structType = {"f1": int32_t, "struct": s1} + s = StructClass("s3", structType) + return s, s1 + + +def testStructPromotion(): + + s, s1 = generateTestStruct() + + _ = s({"f1": 15, "struct": s1({"f1": 8, "f2": 8})}) + + globals()["s"] = None + globals()["s1"] = None + + +def testStructKeyChecking(): + + s, _ = generateTestStruct() + with pytest.raises(Exception): + _ = s({"struct": {"f1": 2540, "f2": 17}}) + with pytest.raises(Exception): + _ = s({"f1": 2**14, "struct": {"f2": 17}}) + with pytest.raises(Exception): + _ = s({"f1": 2**14, "struct": {"f1": 2540, "f3": 18}}) + with pytest.raises(Exception): + _ = s({"f1": 2**14, "strct": {"f1": 2540, "f3": 18}}) + + _ = s({"struct": {"f1": 2540, "f2": 17}, "f1": 2**14}) + _ = s({"struct": {"f2": 17, "f1": 2540}, "f1": 2**14}) + + return True + + +def testStructRecursiveEquivalence(): + + s, _ = generateTestStruct() + + with pytest.raises(Exception): + _ = s({"f1": 2**14, "struct": {"f1": 2540, "f2": 2**8}}) + + _ = s({"f1": 2**14, "struct": {"f1": 2540, "f2": 18}}) + return True + + +def generateTestCtxt() -> NetworkContext: + testCtxt = NetworkContext(VariableBuffer, ConstantBuffer, StructBuffer, TransientBuffer) + + var = ConstantBuffer("testConstant", shape = [ + 16, + ], values = [14] * 16) + testCtxt.add(var, 'global') + + return testCtxt + + +def testPointerPromotion(): + + testCtxt = generateTestCtxt() + i8p = PointerClass(int8_t) + i16p = PointerClass(int16_t) 
+ sp = PointerClass(generateTestStruct()[0]) + testCtxt.annotateType(name = "testConstant", _type = i8p) + + with pytest.raises(Exception): + _ = i16p("testConstant", testCtxt) + with pytest.raises(Exception): + _ = sp("testConstant", testCtxt) + _ = i8p("testConstant", testCtxt) + + return True + + +def testPointerSerialization(): + + testCtxt = generateTestCtxt() + + i8p = PointerClass(int8_t) + testCtxt.annotateType(name = "testConstant", _type = i8p) + + _ = i8p("testConstant", testCtxt) + _ = i8p("testConstant", pickle.loads(pickle.dumps(testCtxt))) + _ = pickle.loads(pickle.dumps(i8p))("testConstant", testCtxt) + _ = pickle.loads(pickle.dumps(i8p))("testConstant", pickle.loads(pickle.dumps(testCtxt))) + + return True + + +def testPointerTypeEquivalence(): + + testCtxt = generateTestCtxt() + + i8p = PointerClass(int8_t) + i16p = PointerClass(int16_t) + + var = ConstantBuffer("testConstant2", shape = [ + 16, + ], values = [14] * 16) + testCtxt.add(var, 'global') + + testCtxt.annotateType(name = "testConstant", _type = i8p) + testCtxt.annotateType(name = "testConstant2", _type = i8p) + + with pytest.raises(Exception): + _ = i16p("testConstant2", testCtxt) + _ = i16p("testConstant", testCtxt) + + assert i8p("testConstant", + testCtxt) != i8p("testConstant2", + testCtxt), "Pointers testConstant and testConstant2 should not be equal!" + assert i8p("testConstant", testCtxt) == i8p("testConstant", + testCtxt), "Pointers constructed from same reference should be equal!" + + assert pickle.loads(pickle.dumps(i8p("testConstant", testCtxt))) == i8p( + "testConstant", testCtxt), "Pointers constructed from same reference should be equal!" + + assert pickle.loads(pickle.dumps(i8p("testConstant", pickle.loads(pickle.dumps(testCtxt))))) == i8p( + "testConstant", testCtxt), "Pointers constructed from same reference should be equal!" 
+ assert pickle.loads(pickle.dumps(i8p("testConstant", testCtxt))) == i8p( + "testConstant", testCtxt), "Pointers constructed from same reference should be equal!" + + return True + + +if __name__ == "__main__": + testImmediateSerialization() + testImmediatePromotion() + testImmediateTypeEquivalence() + + testStructSerialization() + testStructPromotion() + testStructTypeEquivalence() + testStructKeyChecking() + testStructRecursiveEquivalence() + + testPointerSerialization() + testPointerPromotion() + testPointerTypeEquivalence() diff --git a/DeeployTest/testUtils/ProfilingTraceParser.py b/DeeployTest/testUtils/ProfilingTraceParser.py new file mode 100644 index 0000000..3374c88 --- /dev/null +++ b/DeeployTest/testUtils/ProfilingTraceParser.py @@ -0,0 +1,70 @@ +# ---------------------------------------------------------------------- +# +# File: ProfilingTraceParser.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import re
from dataclasses import dataclass
from typing import Dict, List, Literal, get_args

# Buffering strategy tags emitted by the profiling runtime:
# "SB" = single-buffered, "DB" = double-buffered.
BufferingMode = Literal["SB", "DB"]


@dataclass
class LayerProfiling:
    """Per-layer cycle measurements aggregated from a profiling trace."""
    bufferingMode: BufferingMode  # buffering strategy reported for the layer
    ops: int  # arithmetic operation count reported for the layer
    kernelCycles: List[int]  # one entry per tile's kernel invocation
    inputDmaCycles: List[int]  # one entry per tile's input DMA transfer
    outputDmaCycles: List[int]  # one entry per tile's output DMA transfer


class ProfilingTraceParser:
    """Parses profiling traces made of lines of the form
    ``[<layer>][<SB|DB>][<ops> ops][Tile <n>] <Kernel|Input DMA|Output DMA> took <cycles> cycles``.
    """

    # FIX: accept end-of-string as a line terminator in addition to "\n" —
    # the previous pattern required a trailing newline and silently dropped
    # the final measurement of a trace that was not newline-terminated.
    lineRegex = re.compile(
        r"\[(\w+)\]\[(SB|DB)\]\[(\d+) ops\]\[Tile \d+\] (Input DMA|Output DMA|Kernel) took (\d+) cycles(?:\n|$)")

    def parse(self, trace: str) -> Dict[str, LayerProfiling]:
        """Aggregate all measurements in `trace` into one LayerProfiling per layer.

        Lines that do not match the expected format are ignored. Raises
        AssertionError on an unknown buffering mode (unreachable while the
        regex alternation matches BufferingMode, but kept as a guard).
        """
        layerProfilings: Dict[str, LayerProfiling] = {}
        for match in ProfilingTraceParser.lineRegex.finditer(trace):
            layerName, bufferingMode, ops, measurementName, cycles = match.groups()

            # First measurement of a layer creates its (empty) aggregate entry.
            if layerName not in layerProfilings:
                assert bufferingMode in get_args(BufferingMode), f"Unsupported bufferingMode {bufferingMode}"
                layerProfilings[layerName] = LayerProfiling(
                    bufferingMode = bufferingMode,  # type: ignore
                    ops = int(ops),
                    kernelCycles = [],
                    inputDmaCycles = [],
                    outputDmaCycles = [])

            if measurementName == "Kernel":
                layerProfilings[layerName].kernelCycles.append(int(cycles))
            elif measurementName == "Input DMA":
                layerProfilings[layerName].inputDmaCycles.append(int(cycles))
            elif measurementName == "Output DMA":
                layerProfilings[layerName].outputDmaCycles.append(int(cycles))
            else:
                # Unreachable while the regex alternation is exhaustive; kept defensive.
                raise RuntimeError(f"Unsupported measurement name: {measurementName}")

        return layerProfilings
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py new file mode 100644 index 0000000..e33447e --- /dev/null +++ b/DeeployTest/testUtils/codeGenerate.py @@ -0,0 +1,288 @@ +# ---------------------------------------------------------------------- +# +# File: codeGenerate.py +# +# Last edited: 23.05.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from pprint import pprint +from typing import Dict, List, Optional, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NetworkDeployer, VariableBuffer +from Deeploy.Targets.MemPool.Platform import MemPoolPlatform + +_TEXT_ALIGN = 30 + + +def _shapeBroadcast(ctxt, value, name): + if ctxt.is_global(f"{name}"): + broadcastShape = ctxt.lookup(f"{name}").shape + repeat = np.prod(broadcastShape) / np.prod(value.shape) + # Raise error if repeat is not an integer + if repeat % 1 != 0: + raise ValueError(f"Input {name} has to be broadcastable to shape {broadcastShape}!") + repeatNum = np.tile(value, int(repeat)) + broadcastNum = repeatNum.reshape(-1) + ctxt.lookup(f"{name}").shape = broadcastNum.shape + else: + broadcastNum = value + + return broadcastNum + + +def generateTestInputsHeader(deployer: NetworkDeployer, test_inputs: List, inputTypes: Dict, inputOffsets: Dict) -> str: + retStr = "" + inputNames = [deployer.ctxt.lookup(buf.name) for buf in deployer.graph.inputs] + inputTypes = {buf.name: buf._type for buf in inputNames} + + for index, num in enumerate(test_inputs): + + if f"input_{index}" not in inputTypes.keys(): + continue + + # WIESEP: Correctly handle empty arrays + if np.prod(num.shape) == 0: + continue + + test_inputs[index] -= inputOffsets[f"input_{index}"] + + broadcastNum = _shapeBroadcast(deployer.ctxt, num, f"input_{index}") + + data_type = inputTypes[f"input_{index}"] + data_width = inputTypes[f"input_{index}"].referencedType.typeWidth + + retStr += f"{data_type.referencedType.typeName} testInputVector{index}[] =" + retStr += "{" + list_str = (", ").join([str(x) for x in broadcastNum]) + + # WIESEP: Arrays have to be 4 byte alinged (at lest in banshee) + bytes = len(broadcastNum) * (data_width // 8) + if bytes % 4 != 0: + bytes = 4 * int((bytes / 4 + 1)) + padding = (bytes * 8) // data_width - len(broadcastNum) + list_str += ", " + list_str += (", ").join([str(0) for x in range(padding)]) + 
+ retStr += list_str + retStr += "};\n" + + retStr += f"void* testInputVector[{len(inputTypes)}] = " + "{" + retStr += ", ".join([ + f"testInputVector{idx}" for idx, _ in enumerate(test_inputs) + if np.prod(test_inputs[idx].shape) != 0 and f"input_{idx}" in inputTypes.keys() + ]) + retStr += "};\n" + + return retStr + + +def generateTestOutputsHeader(deployer: NetworkDeployer, + test_outputs: List, + signProp: Optional[bool] = None, + verbose: Optional[bool] = None) -> str: + + output_signed = {} + output_n_levels = {} + output_data_type = {} + + if signProp is None: + signProp = False + + if verbose is None: + verbose = False + + retStr = "" + + for index, num in enumerate(test_outputs): + output_data_type[f"output_{index}"] = deployer.ctxt.lookup(f'output_{index}')._type + + if signProp: + output_n_levels[f"output_{index}"] = deployer.ctxt.lookup(f'output_{index}').nLevels + output_signed[f"output_{index}"] = deployer.ctxt.lookup(f'output_{index}')._signed + test_outputs[index] -= int( + ((1 - output_signed[f"output_{index}"]) * (output_n_levels[f"output_{index}"] / 2))) + + data_type = output_data_type[f"output_{index}"] + data_width = data_type.referencedType.typeWidth + retStr += f"{data_type.referencedType.typeName} testOutputVector{index}[] =" + retStr += "{" + + # WIESEP: Arrays have to be 4 byte alinged (at lest in banshee) + list_str = (", ").join([str(x) for x in num]) + + bytes = len(num) * (data_width // 8) + if bytes % 4 != 0: + bytes = 4 * int((bytes / 4 + 1)) + padding = (bytes * 8) // data_width - len(num) + list_str += ", " + list_str += (", ").join([str(0) for x in range(padding)]) + + retStr += list_str + retStr += "};\n" + + retStr += f"void* testOutputVector[{len(test_outputs)}] = " + "{" + retStr += ", ".join([f"testOutputVector{idx}" for idx, _ in enumerate(test_outputs)]) + retStr += "};\n" + + if verbose: + if signProp: + print('Output N Levels:') + pprint(output_n_levels, indent = 2, width = 120) + print('Output Signed:') + 
pprint(output_signed, indent = 2, width = 120) + print('Output Data Type:') + pprint(output_data_type, indent = 2, width = 120) + + return retStr + + +def generateTestNetworkHeader(deployer: NetworkDeployer, platform: DeploymentPlatform) -> str: + + retStr = "" + + retStr += """ + #ifndef __DEEPLOY_HEADER_ + #define __DEEPLOY_HEADER_ + #include + #include + #include + """ + retStr += deployer.generateIncludeString() + retStr += """ + void RunNetwork(uint32_t core_id, uint32_t numThreads); + void InitNetwork(uint32_t core_id, uint32_t numThread); + + """ + + retStr += deployer.generateIOBufferInitializationCode() + retStr += """ + #endif + """ + + return retStr + + +def generateTestNetworkImplementation(deployer: NetworkDeployer, + platform: DeploymentPlatform, + verbose: Optional[bool] = None) -> str: + + if verbose is None: + verbose = False + + retStr = "" + + retStr += """#include + #include + """ + retStr += deployer.generateIncludeString() + retStr += """ + + #include "Network.h" + + """ + + retStr += deployer.generateBufferInitializationCode() + retStr += deployer.generateGlobalDefinitionCode() + + # WIESEP: Mempool assigns section attributes to intermediate buffers to allow . 
+ if isinstance(platform, MemPoolPlatform): + retStr += deployer.generateInferenceInitializationCode() + retStr += """ + void RunNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ + """ + else: + retStr += """ + void RunNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ + """ + retStr += deployer.generateInferenceInitializationCode() + + retStr += deployer.generateFunction(verbose) + retStr += """ + } + + void InitNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ + """ + retStr += deployer.generateEngineInitializationCode() + retStr += deployer.generateBufferAllocationCode() + retStr += """ + } + """ + + return retStr + + +def generateL3HexDump(deployer: NetworkDeployer, path: str, test_inputs: List, test_outputs: List): + + def type2TypeStr(dataType) -> Tuple[str, int]: + width = dataType.referencedType.typeWidth + signed = (dataType.referencedType.typeMin < 0) + + retStr = "" + + if signed: + retStr += "int" + else: + retStr += "uint" + + retStr += str(width) + + return retStr, width + + def dumpBuffer(buf: VariableBuffer, path: str): + + if "input" in buf.name: + idx = int(buf.name.split("_")[1]) + array = _shapeBroadcast(deployer.ctxt, test_inputs[idx], f"input_{idx}") + + elif "output" in buf.name: + _list = buf.name.split("_") + idx = int(_list[1]) + array = _shapeBroadcast(deployer.ctxt, test_outputs[idx], f"output_{idx}") + + elif isinstance(buf, ConstantBuffer): + array = buf.values + else: + raise Exception(f"Unexpected buffer {buf}!") + + typeStr, width = type2TypeStr(buf._type) + + # Word alignment + mod = (32 // width) + paddingLength = (mod - (array.size % mod)) % mod + paddedArray = np.pad(array.flatten(), (0, paddingLength), 'constant') + + paddedArray.astype(typeStr).tofile(path) + + # SCHEREMO: Dump all global const buffers as hex files + globalConstBuffers = [ + buf for key, buf in 
deployer.ctxt.globalObjects.items() if isinstance(buf, VariableBuffer) and buf._deploy + ] + l3ConstBuffer = [buf for buf in globalConstBuffers if hasattr(buf, "_memoryLevel") and buf._memoryLevel == "L3"] + + os.makedirs(path, exist_ok = True) + + for idx, buf in enumerate(l3ConstBuffer): + if hasattr(buf, "extName"): + pathName = os.path.join(path, f"{buf.extName}.hex") + dumpBuffer(buf, pathName) diff --git a/DeeployTest/testUtils/graphColoring.py b/DeeployTest/testUtils/graphColoring.py new file mode 100644 index 0000000..4437c94 --- /dev/null +++ b/DeeployTest/testUtils/graphColoring.py @@ -0,0 +1,57 @@ +# ---------------------------------------------------------------------- +# +# File: graphColoring.py +# +# Last edited: 10.10.2023. +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Luka Macan, University of Bologna +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def graph_coloring(graph: gs.Graph, colors: List[str], frequency: Union[int, List[int]], color_attr: str):
    """Assign colors round-robin to the nodes of a copy of ``graph``.

    The i-th color is written to ``frequency[i]`` consecutive non-Constant nodes
    (under the attribute ``color_attr``) before moving on to the next color,
    cycling back to the first color when all are used. A Constant node is never
    colored directly; instead, when it feeds a colored node it inherits that
    node's color.

    Parameters:
        graph: source graph — left untouched, a colored copy is returned.
        colors: color names to cycle through.
        frequency: nodes per color; a single int applies to every color.
        color_attr: attribute name under which the color is stored.

    Returns:
        The colored copy of the graph.
    """
    colored = graph.copy()

    # Normalize `frequency` to one count per color.
    perColorCount = frequency
    if isinstance(frequency, int):
        perColorCount = [frequency] * len(colors)
    elif isinstance(frequency, list):
        assert len(frequency) == len(colors), "The length of frequency and colors does not match"

    colorIdx = 0
    assignedInColor = 0
    for node in colored.nodes:
        # Constants are only colored through their consumers.
        if node.op == 'Constant':
            continue

        node.attrs[color_attr] = colors[colorIdx]

        # Propagate this node's color to Constant producers of its inputs.
        for tensor in node.inputs:
            if not tensor.inputs:
                continue
            producer = tensor.inputs[0]
            if isinstance(producer, gs.Node) and producer.op == 'Constant':
                producer.attrs[color_attr] = colors[colorIdx]

        assignedInColor += 1
        if assignedInColor == perColorCount[colorIdx]:
            assignedInColor = 0
            colorIdx = (colorIdx + 1) % len(colors)

    return colored
def generateDebugConfig(test_inputs_files, test_outputs_files, activations_files,
                        graph: gs.Graph) -> Tuple[list, list, gs.Graph]:
    """Flatten the reference test vectors for debugging a deployed network.

    Fix: removed leftover interactive-debugging code (``import IPython;
    IPython.embed()``) that dropped every caller into a shell, and corrected the
    return annotation (the function returns two lists of arrays, not dicts).

    Parameters:
        test_inputs_files: npz-style archive (``.files`` name list + item access)
            with the reference input tensors.
        test_outputs_files: npz-style archive with the reference output tensors.
        activations_files: currently unused; kept for interface compatibility.
        graph: the network graph, returned unchanged.

    Returns:
        (test_inputs, test_outputs, graph) where each tensor is flattened and
        promoted to int64 so integer comparisons are exact.
    """
    test_inputs = [test_inputs_files[x].reshape(-1).astype(np.int64) for x in test_inputs_files.files]
    test_outputs = [test_outputs_files[x].reshape(-1).astype(np.int64) for x in test_outputs_files.files]

    return test_inputs, test_outputs, graph


def graphDiff(graph: gs.Graph, other: gs.Graph) -> DiffTree:
    """Compute a structural diff between two graphs.

    Both graphs are topologically sorted first so node order is comparable;
    nodes, graph inputs and graph outputs are then diffed element-wise. An empty
    DiffTree (root is None) means the graphs are structurally identical.
    """
    graph = graph.toposort()
    other = other.toposort()
    diffs = []
    diffs.append(listDiff(graph.nodes, other.nodes, "nodes", nodeDiff))
    diffs.append(listDiff(graph.inputs, other.inputs, "inputs", tensorDiff))
    diffs.append(listDiff(graph.outputs, other.outputs, "outputs", tensorDiff))
    root = createParentDiffNode(graph, other, graph.name, diffs)
    return DiffTree(root)
# Generic diff-tree machinery used to compare two ONNX GraphSurgeon graphs:
# a DiffTree is empty when the compared objects match, otherwise it records
# every mismatch as a nested DiffTreeNode.

T = TypeVar("T")


class DiffTreeNode(Generic[T]):
    """One node of a diff tree: a pair of differing objects plus child diffs.

    A leaf node represents a concrete value mismatch (``_object`` vs ``other``);
    an inner node groups the mismatches found in its sub-structures.
    """

    # Spaces added per nesting level when rendering messages.
    _MESSAGE_INDENTATION_INCREMENT = 2

    def __init__(self, _object: T, other: T, name: str, children: List[DiffTreeNode]):
        self._object = _object
        self.other = other
        self.name = name
        self.children = children

    def messages(self, indentation: int = 0) -> List[str]:
        """Render this node and all descendants as indented text lines."""
        msg = " " * indentation + self.name
        if len(self.children) == 0:
            # Leaf: show the two differing values.
            msg += f": {self._object} vs {self.other}"

        ret = [msg]
        for child in self.children:
            ret += child.messages(indentation + self._MESSAGE_INDENTATION_INCREMENT)
        return ret

    def __repr__(self):
        return self.name


class DiffTree():
    """Container for a (possibly empty) tree of DiffTreeNodes.

    An empty tree (``root is None``) means no differences were found.
    Iterating a DiffTree yields its nodes in depth-first pre-order.
    """

    def __init__(self, root: Optional[DiffTreeNode]):
        self.root = root

    @property
    def message(self) -> str:
        # Human-readable rendering of the whole tree; empty string if no diff.
        if self.root is not None:
            return ("\n").join(self.root.messages())
        return ""

    def __iter__(self):

        def dfs(node: Optional[DiffTreeNode[T]]) -> Generator[DiffTreeNode[T], None, None]:
            if node is None:
                return
            yield node
            for child in node.children:
                yield from dfs(child)

        return dfs(self.root)


def createParentDiffNode(_object: T, other: T, name: str,
                         children: Sequence[Optional[DiffTreeNode]]) -> Optional[DiffTreeNode[T]]:
    """Return a node if children list has nodes, otherwise None"""
    # None children mean "no difference"; a parent node is only created when at
    # least one real difference remains after filtering.
    filteredChildren = [child for child in children if child is not None]
    if len(filteredChildren) > 0:
        return DiffTreeNode(_object, other, name, filteredChildren)
    return None


def _attrDiff(instance: T,
              other: T,
              attr: str,
              eqFun: Callable[[T, T], bool] = lambda x, y: x == y) -> Optional[DiffTreeNode]:
    """Leaf diff of a single attribute; ``eqFun`` allows a custom equality
    (e.g. np.array_equal for array-valued attributes)."""
    if not eqFun(getattr(instance, attr), getattr(other, attr)):
        return DiffTreeNode(getattr(instance, attr), getattr(other, attr), attr, [])
    return None


def _variableDiff(variable: gs.Variable, other: gs.Variable) -> Optional[DiffTreeNode]:
    # Variables are compared on dtype and shape only.
    diffs = []
    for attr in ["dtype", "shape"]:
        diffs.append(_attrDiff(variable, other, attr))
    return createParentDiffNode(variable, other, variable.name, diffs)


def _constantDiff(constant: gs.Constant, other: gs.Constant) -> Optional[DiffTreeNode]:
    # Constants are compared on storage location and element values.
    diffs = []
    diffs.append(_attrDiff(constant, other, "data_location"))
    diffs.append(_attrDiff(constant, other, "values", np.array_equal))
    return createParentDiffNode(constant, other, constant.name, diffs)


def tensorDiff(tensor: gs.Tensor, other: gs.Tensor) -> Optional[DiffTreeNode]:
    """Diff two tensors; mismatched kinds (Variable vs Constant) produce a
    leaf "type" diff instead of a field-by-field comparison."""
    if isinstance(tensor, gs.Variable) and isinstance(other, gs.Variable):
        return _variableDiff(tensor, other)

    if isinstance(tensor, gs.Constant) and isinstance(other, gs.Constant):
        return _constantDiff(tensor, other)

    assert isinstance(tensor, gs.Variable) or isinstance(tensor, gs.Constant)

    return DiffTreeNode(tensor, other, tensor.name, [DiffTreeNode(type(tensor), type(other), "type", [])])


def listDiff(_list: Sequence[T], other: Sequence[T], name: str,
             diffFun: Callable[[T, T], Optional[DiffTreeNode]]) -> Optional[DiffTreeNode]:
    """Diff two sequences element-wise with ``diffFun``; on a length mismatch
    only the lengths are reported (elements are not compared)."""
    if len(_list) != len(other):
        diffs = [DiffTreeNode(len(_list), len(other), "length", [])]
    else:
        diffs = [diffFun(item, item_other) for item, item_other in zip(_list, other)]
    return createParentDiffNode(_list, other, name, diffs)


def nodeDiff(node: gs.Node, other: gs.Node) -> Optional[DiffTreeNode]:
    """Diff two graph nodes: attributes (union of both key sets), inputs, outputs."""
    diffs = []
    attrs = set(list(node.attrs.keys()) + list(other.attrs.keys()))
    for attr in attrs:
        if attr not in node.attrs:
            diffs.append(DiffTreeNode(None, other.attrs[attr], attr, []))
        elif attr not in other.attrs:
            diffs.append(DiffTreeNode(node.attrs[attr], None, attr, []))
        elif node.attrs[attr] != other.attrs[attr]:
            # NOTE(review): plain != on attribute values — an array-valued attr
            # would raise on ambiguous truth value; presumably attrs are
            # scalars/lists here. TODO confirm.
            diffs.append(DiffTreeNode(node.attrs[attr], other.attrs[attr], attr, []))
    diffs.append(listDiff(node.inputs, other.inputs, "inputs", tensorDiff))
    diffs.append(listDiff(node.outputs, other.outputs, "outputs", tensorDiff))
    return createParentDiffNode(node, other, node.name, diffs)
# Known platform names, split by whether Deeploy should run sign-propagation
# for them (see mapPlatform).
_SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool"]
_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen"]
_PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS


def defaultScheduler(graph: gs.Graph):
    """Default layer schedule: the graph's nodes in their stored order."""
    return graph.nodes


def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]:
    """Instantiate the DeploymentPlatform for ``platformName``.

    Returns:
        (platform, signProp) — the platform instance and whether the platform
        uses sign-propagation.

    Raises:
        AssertionError: for a platform name not listed in ``_PLATFORMS``.
        RuntimeError: for a listed platform with no construction branch below.
    """

    assert platformName in _PLATFORMS,\
        "Platform's signprop preference is unknown! Add it in platformMapping.py."

    if platformName in _SIGNPROP_PLATFORMS:
        signProp = True
    else:
        signProp = False

    if platformName == "Apollo3" or platformName == "Apollo4" or platformName == "QEMU-ARM":
        Platform = CMSISPlatform()

    elif platformName == "MemPool":
        Platform = MemPoolPlatform()

    elif platformName == "Generic":
        Platform = GenericPlatform()

    elif platformName == "Siracusa" or platformName == "PULPOpen":
        Platform = PULPPlatform()

    elif platformName == "Siracusa_w_neureka":
        Platform = NeurekaPlatform()

    else:
        raise RuntimeError(f"Deployment platform {platformName} is not implemented")

    return Platform, signProp


def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy,
                        defaultTargetMemoryLevel: MemoryLevel) -> Union[MemoryPlatform, MemoryPlatformWrapper]:
    """Wrap ``platform`` in the matching memory-level-aware wrapper.

    PULP and Neureka platforms get their dedicated wrappers; anything else
    falls back to the generic MemoryPlatformWrapper.
    """
    if isinstance(platform, PULPPlatform):
        return MemoryPULPPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel)
    elif isinstance(platform, NeurekaPlatform):
        # Neureka may additionally own a dedicated weight memory if the
        # hierarchy defines one; otherwise None is passed through.
        weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \
            if "WeightMemory_SRAM" in memoryHierarchy.memoryLevels else None
        return MemoryNeurekaPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel, weightMemoryLevel)
    else:
        return MemoryPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel)


def mapDeployer(platform: DeploymentPlatform,
                graph: gs.Graph,
                inputTypes: Dict[str, type],
                loweringOptimizer: Optional[TopologyOptimizer] = None,
                scheduler: Optional[Callable] = None,
                name: Optional[str] = None,
                default_channels_first: Optional[bool] = None,
                deeployStateDir: Optional[str] = None,
                inputOffsets: Optional[Dict[str, int]] = None) -> NetworkDeployer:
    """Construct the NetworkDeployer matching ``platform``.

    Unspecified optional arguments are filled with per-platform defaults
    (lowering optimizer, channel ordering, state directory, network name).

    Raises:
        RuntimeError: if no deployer is implemented for ``platform``.
    """

    if scheduler is None:
        scheduler = defaultScheduler

    if deeployStateDir is None:
        deeployStateDir = "deeployStates"

    if name is None:
        name = "DeeployNetwork"

    if isinstance(platform, CMSISPlatform):

        if loweringOptimizer is None:
            loweringOptimizer = CMSISOptimizer

        if default_channels_first is None:
            default_channels_first = False

        deployer = CMSISDeployer(graph,
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
                                 deeployStateDir = deeployStateDir,
                                 inputOffsets = inputOffsets)

    elif isinstance(platform, MemPoolPlatform):

        if loweringOptimizer is None:
            loweringOptimizer = MemPoolOptimizer

        if default_channels_first is None:
            default_channels_first = True

        deployer = MemPoolDeployer(graph,
                                   platform,
                                   inputTypes,
                                   loweringOptimizer,
                                   scheduler,
                                   name = name,
                                   default_channels_first = default_channels_first,
                                   deeployStateDir = deeployStateDir,
                                   inputOffsets = inputOffsets)

    elif isinstance(platform, GenericPlatform):
        # WIESEP: CMSIS performs add-multiply-divide and we normally do multiply-add-divide
        # Because these deployer were fine-tuned with a add-multiply-divide aware deployer can emulate this
        # behavior with the EmulateCMSISRequantPass

        if loweringOptimizer is None:
            loweringOptimizer = GenericOptimizer

        if default_channels_first is None:
            default_channels_first = True

        deployer = GenericDeployer(graph,
                                   platform,
                                   inputTypes,
                                   loweringOptimizer,
                                   scheduler,
                                   name = name,
                                   default_channels_first = default_channels_first,
                                   deeployStateDir = deeployStateDir,
                                   inputOffsets = inputOffsets)

    elif isinstance(platform, (NeurekaPlatform, MemoryNeurekaPlatform, MemoryNeurekaPlatformWrapper)):

        if loweringOptimizer is None:
            loweringOptimizer = NeurekaOptimizer

        if default_channels_first is None:
            default_channels_first = False

        # NOTE(review): inputOffsets is not forwarded here — presumably because
        # Neureka is a non-signprop platform; confirm this is intentional.
        deployer = NeurekaDeployer(graph,
                                   platform,
                                   inputTypes,
                                   loweringOptimizer,
                                   scheduler,
                                   name = name,
                                   default_channels_first = default_channels_first,
                                   deeployStateDir = deeployStateDir)

    elif isinstance(platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)):

        if loweringOptimizer is None:
            loweringOptimizer = PULPOptimizer

        if default_channels_first is None:
            default_channels_first = False

        # NOTE(review): inputOffsets is not forwarded here either — same
        # non-signprop rationale presumed; confirm.
        deployer = PULPDeployer(graph,
                                platform,
                                inputTypes,
                                loweringOptimizer,
                                scheduler,
                                name = name,
                                default_channels_first = default_channels_first,
                                deeployStateDir = deeployStateDir)

    else:
        raise RuntimeError(f"Deployer for platform {platform} is not implemented")

    return deployer
# Source: https://stackoverflow.com/a/38662876
def escapeAnsi(line):
    """Strip ANSI escape sequences (colors, cursor control) from ``line``."""
    ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
    return ansi_escape.sub('', line)


def prRed(skk):
    # Print `skk` in red.
    print("\033[91m{}\033[00m".format(skk))


def prGreen(skk):
    # Print `skk` in green.
    print("\033[92m{}\033[00m".format(skk))


def prBlue(skk):
    # Print `skk` in blue.
    print("\033[94m{}\033[00m".format(skk))


def getPaths(path_test: str, gendir_name: str) -> Tuple[str, str, str]:
    """Resolve the generation directory, test directory and test name.

    Returns:
        (dir_gen, dir_test, test_name). (Annotation corrected: the function
        returns three values, not two.)
    """

    dir_test = os.path.normpath(path_test)
    dir_abs = os.path.abspath(dir_test)
    test_name = dir_abs.split(os.sep)[-1]
    # Check if path is inside in some child folder of the script location

    # Get the absolute path of the script location
    scriptPath = os.path.realpath(__file__)

    # Get absolute path path to folder of the script location parent directory
    scriptDir = os.path.dirname(os.path.dirname(scriptPath))

    # Check if the path is inside the script location
    # NOTE(review): substring containment is a fragile "is inside" test;
    # os.path.commonpath would be more robust — confirm intent.
    if scriptDir in dir_abs:
        dir_gen = os.path.join(scriptDir, gendir_name, dir_test)
        dir_gen = os.path.normpath(dir_gen)
    else:
        dir_gen = os.path.join(scriptDir, gendir_name, test_name)
        dir_gen = os.path.normpath(dir_gen)

        print(f"Path is not inside the script location. Using gendir path {dir_gen}")

    return dir_gen, dir_test, test_name


def cmake_str(arg_str):
    """Turn a cache entry string into a ``-D<entry>`` cmake flag, unescaping
    backslash escape sequences."""
    return "-D" + codecs.decode(str(arg_str), 'unicode_escape')


class _ArgumentDefaultMetavarTypeFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.MetavarTypeHelpFormatter):
    """Help formatter that shows defaults and uses the type name as metavar."""

    def __init__(self, prog: str, indent_increment: int = 2, max_help_position: int = 100, width = None) -> None:
        # Widened max_help_position keeps long option strings on one line.
        super().__init__(prog, indent_increment, max_help_position, width)


class TestGeneratorArgumentParser(argparse.ArgumentParser):
    """Argument parser shared by the network-generation scripts.

    Provides the common -t (test dir), -p (platform), -d (dump dir) and -v
    (verbosity) options; the parsed namespace is cached on ``self.args``.
    """

    def __init__(self, description = None):

        formatter = _ArgumentDefaultMetavarTypeFormatter

        if description is None:
            super().__init__(description = "Test Utility.", formatter_class = formatter)
        else:
            super().__init__(description = description, formatter_class = formatter)

        self.add_argument('-t',
                          metavar = '<dir>',
                          dest = 'dir',
                          type = str,
                          required = True,
                          help = 'Set the regression test\n')
        self.add_argument('-p',
                          metavar = '<platform>',
                          dest = 'platform',
                          type = str,
                          required = True,
                          help = 'Choose the target Platform\n')
        self.add_argument('-d',
                          metavar = '<dir>',
                          dest = 'dumpdir',
                          type = str,
                          default = './TestFiles',
                          help = 'Set the output dump folder\n')
        self.add_argument('-v', action = 'count', dest = 'verbose', default = 0, help = 'Increase verbosity level\n')

        # Cached parse result; filled by parse_args().
        self.args = None

    def parse_args(self, args = None, namespace = None) -> argparse.Namespace:
        self.args = super().parse_args(args, namespace)
        return self.args


class TestRunnerArgumentParser(argparse.ArgumentParser):
    """Argument parser for test-runner scripts.

    Besides the common options it can expose tiling-related flags
    (``tiling_arguments = True``) and can reconstruct a command-line string
    (generate_cmd_args) to forward its options to the generation script.
    """

    def __init__(self, tiling_arguments: bool, description = None):

        formatter = _ArgumentDefaultMetavarTypeFormatter

        if description is None:
            super().__init__(description = "Deeploy Code Generation Utility.", formatter_class = formatter)
        else:
            super().__init__(description = description, formatter_class = formatter)

        self.tiling_arguments = tiling_arguments

        self.add_argument('-t',
                          metavar = '<dir>',
                          dest = 'dir',
                          type = str,
                          required = True,
                          help = 'Set the regression test\n')
        self.add_argument('-v', action = 'count', dest = 'verbose', default = 0, help = 'Increase verbosity level\n')
        self.add_argument('-D',
                          dest = 'cmake',
                          action = 'extend',
                          nargs = "*",
                          type = cmake_str,
                          help = "Create or update a cmake cache entry\n")
        self.add_argument('--debug',
                          dest = 'debug',
                          action = 'store_true',
                          default = False,
                          help = 'Enable debugging mode.\n')
        self.add_argument('--skipgen',
                          dest = 'skipgen',
                          action = 'store_true',
                          default = False,
                          help = 'Skip network generation\n')
        self.add_argument('--skipsim',
                          dest = 'skipsim',
                          action = 'store_true',
                          default = False,
                          help = 'Skip network simulation\n')
        self.add_argument('--toolchain',
                          metavar = '<toolchain>',
                          dest = 'toolchain',
                          type = str,
                          default = "LLVM",
                          help = 'Pick compiler toolchain\n')
        self.add_argument('--toolchain_install_dir',
                          metavar = '<dir>',
                          dest = 'toolchain_install_dir',
                          type = str,
                          default = os.environ.get('LLVM_INSTALL_DIR'),
                          help = 'Pick compiler install dir\n')
        self.add_argument('--overwriteRecentState',
                          action = 'store_true',
                          help = 'Copy the recent state to the ./deeployStates folder\n')

        if self.tiling_arguments:
            self.add_argument('--defaultMemLevel',
                              metavar = '<level>',
                              dest = 'defaultMemLevel',
                              type = str,
                              default = "L2",
                              help = 'Set default memory level\n')

            self.add_argument('--doublebuffer', action = 'store_true', help = 'Enable double buffering\n')
            self.add_argument('--l1',
                              metavar = '<size>',
                              dest = 'l1',
                              type = int,
                              default = 64000,
                              help = 'Set L1 size\n')
            self.add_argument('--randomizedMemoryScheduler',
                              action = "store_true",
                              help = 'Enable randomized memory scheduler\n')
            self.add_argument('--profileTiling',
                              metavar = '<level>',
                              dest = 'profileTiling',
                              type = str,
                              default = None,
                              help = 'Profile tiling for a given memory level (eg. "L2")\n')

        # Cached parse result; filled lazily by parse_args().
        self.args = None

    def parse_args(self, args = None, namespace = None) -> argparse.Namespace:
        self.args = super().parse_args(args, namespace)
        return self.args

    def generate_cmd_args(self) -> str:
        """Reconstruct a CLI fragment forwarding the parsed options to the
        generation script (testMVP.py / generateNetwork.py)."""
        if self.args is None:
            self.args = super().parse_args()

        command = ""
        if self.args.verbose:
            command += " -v"
        if self.args.overwriteRecentState:
            command += " --overwriteRecentState"
        if self.args.debug:
            command += " --debug"

        if self.tiling_arguments:
            if self.args.defaultMemLevel:
                command += f" --defaultMemLevel={self.args.defaultMemLevel}"
            if self.args.doublebuffer:
                command += " --doublebuffer"
            if self.args.l1:
                command += f" --l1={self.args.l1}"
            if self.args.randomizedMemoryScheduler:
                command += " --randomizedMemoryScheduler"
            if self.args.profileTiling is not None:
                command += f" --profileTiling {self.args.profileTiling}"

        return command

    def cmake_args(self) -> str:
        """Return the collected -D cmake cache entries as one space-joined string."""
        if self.args is None:
            self.args = super().parse_args()

        cmake_args = " ".join(self.args.cmake) if self.args.cmake is not None else ""
        return cmake_args


class TestRunner():
    """Drives one regression test end-to-end: generate sources, configure
    cmake, build, and run the binary on the chosen simulator.

    The heavy lifting is delegated to shell commands (os.system / subprocess);
    each stage raises RuntimeError on a non-zero exit.
    """

    def __init__(self,
                 platform: str,
                 simulator: Literal['gvsoc', 'banshee', 'qemu', 'vsim', 'vsim.gui', 'host', 'none'],
                 tiling: bool,
                 argument_parser: TestRunnerArgumentParser,
                 gen_args: str = "",
                 cmake_args: str = ""):

        if simulator not in ['gvsoc', 'banshee', 'qemu', 'vsim', 'vsim.gui', 'host', 'none']:
            raise ValueError(
                f"Invalid emulator {simulator} (valid options are 'gvsoc', 'banshee', 'qemu', 'vsim', 'vsim.gui', 'host', 'none')!"
            )

        # The parser's tiling flags must match the runner's tiling mode, so the
        # forwarded CLI fragment stays consistent.
        if tiling is not argument_parser.tiling_arguments:
            raise ValueError("Specified argument parser without tile arguments for tiling test or vice versa!")

        self._platform = platform
        self._simulator = simulator
        self._tiling = tiling

        self._argument_parser = argument_parser
        self._args = self._argument_parser.parse_args()

        self.cmake_args = cmake_args
        self.gen_args = gen_args

        self._dir_gen_root = f'TEST_{platform.upper()}'
        # NOTE(review): if --toolchain_install_dir is omitted and
        # LLVM_INSTALL_DIR is unset this is normpath(None) and raises — confirm
        # the env var is guaranteed by the CI setup.
        self._dir_toolchain = os.path.normpath(self._args.toolchain_install_dir)
        self._dir_build = f"{self._dir_gen_root}/build"
        self._dir_gen, self._dir_test, self._name_test = getPaths(self._args.dir, self._dir_gen_root)

        print("Generation Directory: ", self._dir_gen)
        print("Test Directory      : ", self._dir_test)
        print("Test Name           : ", self._name_test)

    def run(self,):
        """Run the full pipeline; generation/simulation honor --skipgen/--skipsim."""
        prRed(f"################## Testing {self._dir_test} on {self._platform} Platform ##################")

        if self._args.skipgen is False:
            self.generate_test()

        self.configure_cmake_project()

        self.build_binary()

        if self._args.skipsim is False:
            self.run_simulation()

    def generate_test(self):
        """Invoke the network-generation script (tiled or untiled variant)."""
        if self._tiling is True:
            generation_script = "testMVP.py"
        else:
            generation_script = "generateNetwork.py"

        command = f"python {generation_script} -d {self._dir_gen} -t {self._dir_test} -p {self._platform} {self.gen_args}"
        command += self._argument_parser.generate_cmd_args()

        if self._args.verbose >= 2:
            prBlue(f"[TestRunner] Generation Command: {command}")

        err = os.system(command)
        if err != 0:
            raise RuntimeError(f"generate Network failed on {self._args.dir}")

    def configure_cmake_project(self):
        """Configure the cmake build tree for the generated sources."""
        self.cmake_args += self._argument_parser.cmake_args()

        # Banshee needs a dedicated compile-time switch in the build system.
        if self._simulator == 'banshee':
            self.cmake_args += " -D banshee_simulation=ON"
        else:
            self.cmake_args += " -D banshee_simulation=OFF"

        command = f"$CMAKE -D TOOLCHAIN={self._args.toolchain} -D TOOLCHAIN_INSTALL_DIR={self._dir_toolchain} -D GENERATED_SOURCE={self._dir_gen} -D platform={self._platform} {self.cmake_args} -B {self._dir_build} -D TESTNAME={self._name_test} .."

        if self._args.verbose >= 3:
            command = "VERBOSE=1 " + command
        if self._args.verbose >= 2:
            prBlue(f"[TestRunner] Cmake Command: {command}")

        err = os.system(command)
        if err != 0:
            raise RuntimeError(f"Configuring cMake project failed on {self._dir_test}")

    def build_binary(self):
        """Build the test binary via cmake."""
        command = f"$CMAKE --build {self._dir_build} --target {self._name_test}"

        if self._args.verbose >= 3:
            command = "VERBOSE=1 " + command
        if self._args.verbose >= 2:
            prBlue(f"[TestRunner] Building Command: {command}")

        err = os.system(command)
        if err != 0:
            raise RuntimeError(f"Building cMake project failed on {self._dir_test}")

    def run_simulation(self, out_file = 'out.txt'):
        """Execute the built binary (natively or via a simulator target) and
        append its cleaned output to ``out_file``.

        The output must contain "Errors: 0 out of " for the test to pass;
        otherwise RuntimeError is raised.
        """
        if self._simulator == 'none':
            raise RuntimeError("No simulator specified!")

        if self._simulator == 'host':
            command = f"{self._dir_build}/bin/{self._name_test}"
        else:
            command = f"$CMAKE --build {self._dir_build} --target {self._simulator}_{self._name_test}"

        if self._args.verbose >= 3:
            command = "VERBOSE=1 " + command

        # Banshee's log level is controlled through an environment variable.
        if self._simulator == 'banshee':
            if self._args.verbose == 1:
                command = "BANSHEE_LOG=warn " + command
            if self._args.verbose == 2:
                command = "BANSHEE_LOG=info " + command
            if self._args.verbose >= 3:
                command = "BANSHEE_LOG=debug " + command

        if self._args.verbose >= 2:
            prBlue(f"[TestRunner] Simulation Command: {command}")

        process = subprocess.Popen([command],
                                   stdout = subprocess.PIPE,
                                   stderr = subprocess.STDOUT,
                                   shell = True,
                                   encoding = 'utf-8')

        # NOTE(review): the handle is closed manually below; a `with` block
        # would be safer if an exception interrupts the streaming loop.
        fileHandle = open(out_file, 'a', encoding = 'utf-8')
        fileHandle.write(
            f"################## Testing {self._dir_test} on {self._platform} Platform ##################\n")

        # Stream simulator output line-by-line: echo to stdout, mirror the
        # ANSI-stripped text to the log file, and accumulate for the final check.
        result = ""
        while True:
            output = process.stdout.readline()
            if output == '' and process.poll() is not None:
                break
            if output:
                print(output.strip())
                result += output
                fileHandle.write(f"{escapeAnsi(output)}")

        fileHandle.write("")
        fileHandle.close()

        if "Errors: 0 out of " not in result:
            prRed(f"❌ Found errors in {self._dir_test}")
            raise RuntimeError(f"Found an error in {self._dir_test}")
        else:
            prGreen(f"✅ No errors found in in {self._dir_test}")
# A candidate (pointer type, value offset) pair for an input tensor.
offsetType = namedtuple("offsetType", ("type", "offset"))


def isInteger(_input: np.ndarray) -> bool:
    """Return True iff every element of `_input` is numerically an integer.

    A tolerance of 1e-3 absorbs float rounding in integerized test vectors.
    """
    if np.abs((_input.astype(int) - _input)).max() > 0.001:
        return False
    return True


def isUnsigned(_input: np.ndarray) -> bool:
    """Return True iff `_input` contains no negative values."""
    if (_input).min() < 0:
        return False
    return True


def dataWidth(n) -> int:
    """Return the smallest power-of-two bit width (minimum 8) whose byte count
    can hold the magnitude of ``n - 1``.

    E.g. dataWidth(256) == 8, dataWidth(257) == 16.
    """
    count = 0
    n = np.abs(int(n - 1))
    while (n > 0):
        count += 1
        n = n >> 8
    ret = 2**(count + 2)
    if ret < 8:
        ret = 8
    return ret


def inferInputType(_input: np.ndarray,
                   signProp: Optional[bool] = None,
                   defaultType = PointerClass(int8_t),
                   defaultOffset = 0) -> List[offsetType]:
    """Infer candidate integer pointer types (narrowest first) for `_input`.

    With ``signProp`` enabled, unsigned data is shifted into signed range by a
    per-type offset of 2**(width-1); otherwise the offset is always 0.

    Fix: the empty-input fallback now returns an ``offsetType`` namedtuple
    (previously a bare tuple), matching the declared return type. Backward
    compatible: a namedtuple is still a 2-tuple.

    Raises:
        Exception: for non-integer data or when no platform type can hold it.
    """

    # WIESEP: We cannot do type inference for empty arrays.
    if np.prod(_input.shape) == 0:
        print(f"Warning: Empty input array for type inference for {_input}!")
        return [offsetType(defaultType, defaultOffset)]

    if not isInteger(_input):
        raise Exception("Deeploy currently only handles integer types!")

    if signProp is None:
        signProp = False

    signedPlatformTypes = [_type for _type in IntegerDataTypes if _type.typeMin < 0]

    matchingTypes = []

    if signProp and isUnsigned(_input):
        # Unsigned data under sign-propagation: shift into the signed range and
        # keep every signed type the shifted data promotes to.
        for _type in sorted(signedPlatformTypes, key = lambda x: x.typeWidth):
            signPropOffset = (2**(_type.typeWidth - 1))
            if _type.checkPromotion(_input - signPropOffset):
                matchingTypes.append(offsetType(PointerClass(_type), signPropOffset))
    else:
        for _type in sorted(IntegerDataTypes, key = lambda x: x.typeWidth):
            if _type.checkPromotion(_input):
                matchingTypes.append(offsetType(PointerClass(_type), 0))

    if not matchingTypes:
        raise Exception("Could not find a matching type!")

    return matchingTypes
Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2022 deeploy + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..544a5c1 --- /dev/null +++ b/Makefile @@ -0,0 +1,315 @@ +# ---------------------------------------------------------------------- +# +# File: Makefile +# +# Created: 30.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SHELL = /usr/bin/env bash +ROOT_DIR := $(patsubst %/,%, $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) + +INSTALL_PREFIX ?= install + +DEEPLOY_INSTALL_DIR ?= ${ROOT_DIR}/${INSTALL_PREFIX} +TOOLCHAIN_DIR := ${ROOT_DIR}/toolchain + +LLVM_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/llvm +LLVM_CLANG_RT_ARM ?= ${LLVM_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/libclang_rt.builtins-armv7m.a +LLVM_CLANG_RT_RISCV_RV32IMC ?= ${LLVM_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32imc/libclang_rt.builtins-riscv32.a +LLVM_CLANG_RT_RISCV_RV32IM ?= ${LLVM_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32im/libclang_rt.builtins-riscv32.a +PICOLIBC_ARM_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/arm +PICOLIBC_RISCV_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv + +PULP_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/pulp-sdk +QEMU_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/qemu +BANSHEE_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/banshee +MEMPOOL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/mempool + +CMAKE ?= cmake + +LLVM_COMMIT_HASH ?= 99902b1 +PICOLIBC_COMMIT_HASH ?= 31ff1b3601b379e4cab63837f253f59729ce1fef +PULP_SDK_COMMIT_HASH ?= c216298881cee767afc30928e055982b9e40e568 +BANSHEE_COMMIT_HASH ?= 9644dd1d84d4899909f48f107332c59d166617f5 +MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 + +RUSTUP_CARGO ?= $$(rustup which cargo) + +all: toolchain emulators docs echo-bash + +echo-bash: + @echo "Please export the following symbols:" + @echo "PULP_SDK_HOME=${PULP_SDK_INSTALL_DIR}" + @echo "LLVM_INSTALL_DIR=${LLVM_INSTALL_DIR}" + @echo "CMAKE=$$(which cmake)" + + @echo "Please add the 
following paths to your PATH variable:" + @echo "${QEMU_INSTALL_DIR}/bin" + @echo "${BANSHEE_INSTALL_DIR}" + + @echo "For PULP to work, please source the following file:" + @echo "${PULP_SDK_INSTALL_DIR}/configs/siracusa.sh" + + @echo "" + @echo "TL/DR: add these lines to run ~/.bashrc" + @echo "export PULP_SDK_HOME=${PULP_SDK_INSTALL_DIR}" + @echo "export LLVM_INSTALL_DIR=${LLVM_INSTALL_DIR}" + @echo "export PULP_RISCV_GCC_TOOLCHAIN=/PULP_SDK_IS_A_MESS" + @echo "export MEMPOOL_HOME=${MEMPOOL_INSTALL_DIR}" + @echo "export CMAKE=$$(which cmake)" + @echo "export PATH=${QEMU_INSTALL_DIR}/bin:${BANSHEE_INSTALL_DIR}:\$$PATH" + @echo "export PATH=~/.cargo/bin:$PATH" + @echo "source ${PULP_SDK_INSTALL_DIR}/configs/siracusa.sh" + + +toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv + +emulators: pulp-sdk qemu banshee mempool + +${TOOLCHAIN_DIR}/llvm-project: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/pulp-platform/llvm-project.git \ + -b main && \ + cd ${TOOLCHAIN_DIR}/llvm-project && git checkout ${LLVM_COMMIT_HASH} && \ + git submodule update --init --recursive && \ + git apply ${TOOLCHAIN_DIR}/llvm.patch + +${LLVM_INSTALL_DIR}: ${TOOLCHAIN_DIR}/llvm-project + cd ${TOOLCHAIN_DIR}/llvm-project && \ + mkdir -p build && cd build && \ + ${CMAKE} -G Ninja \ + -DCMAKE_INSTALL_PREFIX=${LLVM_INSTALL_DIR} \ + -DLLVM_ENABLE_PROJECTS="clang;lld" \ + -DLLVM_TARGETS_TO_BUILD="ARM;RISCV;host" \ + -DLLVM_BUILD_DOCS="0" \ + -DLLVM_ENABLE_BINDINGS="0" \ + -DLLVM_ENABLE_TERMINFO="0" \ + -DLLVM_OPTIMIZED_TABLEGEN=ON \ + -DLLVM_PARALLEL_LINK_JOBS=2 \ + -DCMAKE_BUILD_TYPE=Release \ + ../llvm && \ + ${CMAKE} --build . -j && \ + ${CMAKE} --install . 
+ +llvm: ${LLVM_INSTALL_DIR} + +${LLVM_CLANG_RT_RISCV_RV32IMC}: ${TOOLCHAIN_DIR}/llvm-project + cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build-compiler-rt-riscv-rv32imc \ + && cd build-compiler-rt-riscv-rv32imc; \ + ${CMAKE} ../compiler-rt \ + -DCMAKE_C_COMPILER_WORKS=1 \ + -DCMAKE_CXX_COMPILER_WORKS=1 \ + -DCMAKE_AR=${LLVM_INSTALL_DIR}/bin/llvm-ar \ + -DCMAKE_INSTALL_PREFIX=${LLVM_INSTALL_DIR}/lib/clang/15.0.0 \ + -DCMAKE_ASM_COMPILER_TARGET="riscv32-unknown-elf" \ + -DCMAKE_C_COMPILER=${LLVM_INSTALL_DIR}/bin/clang \ + -DCMAKE_ASM_COMPILER=${LLVM_INSTALL_DIR}/bin/clang \ + -DCMAKE_C_FLAGS="-mno-relax -march=rv32imc" \ + -DCMAKE_SYSTEM_NAME=baremetal \ + -DCMAKE_HOST_SYSTEM_NAME=baremetal \ + -DCMAKE_C_COMPILER_TARGET="riscv32-unknown-elf" \ + -DCMAKE_CXX_COMPILER_TARGET="riscv32-unknown-elf" \ + -DCMAKE_SIZEOF_VOID_P=4 \ + -DCMAKE_NM=${LLVM_INSTALL_DIR}/bin/llvm-nm \ + -DCMAKE_RANLIB=${LLVM_INSTALL_DIR}/bin/llvm-ranlib \ + -DCOMPILER_RT_BUILD_BUILTINS=ON \ + -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \ + -DCOMPILER_RT_BUILD_MEMPROF=OFF \ + -DCOMPILER_RT_BUILD_PROFILE=OFF \ + -DCOMPILER_RT_BUILD_SANITIZERS=OFF \ + -DCOMPILER_RT_BUILD_XRAY=OFF \ + -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON \ + -DCOMPILER_RT_BAREMETAL_BUILD=ON \ + -DCOMPILER_RT_OS_DIR="baremetal/rv32imc" \ + -DLLVM_CONFIG_PATH=${LLVM_INSTALL_DIR}/bin/llvm-config && \ + ${CMAKE} --build . -j && \ + ${CMAKE} --install . 
+ +${LLVM_CLANG_RT_RISCV_RV32IM}: ${TOOLCHAIN_DIR}/llvm-project + cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build-compiler-rt-riscv-rv32im \ + && cd build-compiler-rt-riscv-rv32im; \ + ${CMAKE} ../compiler-rt \ + -DCMAKE_C_COMPILER_WORKS=1 \ + -DCMAKE_CXX_COMPILER_WORKS=1 \ + -DCMAKE_AR=${LLVM_INSTALL_DIR}/bin/llvm-ar \ + -DCMAKE_INSTALL_PREFIX=${LLVM_INSTALL_DIR}/lib/clang/15.0.0 \ + -DCMAKE_ASM_COMPILER_TARGET="riscv32-unknown-elf" \ + -DCMAKE_C_COMPILER=${LLVM_INSTALL_DIR}/bin/clang \ + -DCMAKE_ASM_COMPILER=${LLVM_INSTALL_DIR}/bin/clang \ + -DCMAKE_C_FLAGS="-mno-relax -march=rv32im" \ + -DCMAKE_SYSTEM_NAME=baremetal \ + -DCMAKE_HOST_SYSTEM_NAME=baremetal \ + -DCMAKE_C_COMPILER_TARGET="riscv32-unknown-elf" \ + -DCMAKE_CXX_COMPILER_TARGET="riscv32-unknown-elf" \ + -DCMAKE_SIZEOF_VOID_P=4 \ + -DCMAKE_NM=${LLVM_INSTALL_DIR}/bin/llvm-nm \ + -DCMAKE_RANLIB=${LLVM_INSTALL_DIR}/bin/llvm-ranlib \ + -DCOMPILER_RT_BUILD_BUILTINS=ON \ + -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \ + -DCOMPILER_RT_BUILD_MEMPROF=OFF \ + -DCOMPILER_RT_BUILD_PROFILE=OFF \ + -DCOMPILER_RT_BUILD_SANITIZERS=OFF \ + -DCOMPILER_RT_BUILD_XRAY=OFF \ + -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON \ + -DCOMPILER_RT_BAREMETAL_BUILD=ON \ + -DCOMPILER_RT_OS_DIR="baremetal/rv32im" \ + -DLLVM_CONFIG_PATH=${LLVM_INSTALL_DIR}/bin/llvm-config && \ + ${CMAKE} --build . -j && \ + ${CMAKE} --install . 
+ +llvm-compiler-rt-riscv: ${LLVM_CLANG_RT_RISCV_RV32IM} ${LLVM_CLANG_RT_RISCV_RV32IMC} + +${LLVM_CLANG_RT_ARM}: ${TOOLCHAIN_DIR}/llvm-project + cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build-compiler-rt-arm \ + && cd build-compiler-rt-arm; \ + ${CMAKE} ../compiler-rt \ + -DCMAKE_C_COMPILER_WORKS=1 \ + -DCMAKE_CXX_COMPILER_WORKS=1 \ + -DCMAKE_AR=${LLVM_INSTALL_DIR}/bin/llvm-ar \ + -DCMAKE_INSTALL_PREFIX=${LLVM_INSTALL_DIR}/lib/clang/15.0.0 \ + -DCMAKE_ASM_COMPILER_TARGET="armv7m-none-eabi" \ + -DCMAKE_C_COMPILER=${LLVM_INSTALL_DIR}/bin/clang \ + -DCMAKE_ASM_COMPILER=${LLVM_INSTALL_DIR}/bin/clang \ + -DCMAKE_C_FLAGS="-mcpu=cortex-m4 "\ + -DCMAKE_SYSTEM_NAME=baremetal \ + -DCMAKE_HOST_SYSTEM_NAME=baremetal \ + -DCMAKE_C_COMPILER_TARGET="armv7m-none-eabi" \ + -DCMAKE_CXX_COMPILER_TARGET="armv7m-none-eabi" \ + -DCMAKE_SIZEOF_VOID_P=4 \ + -DCMAKE_NM=${LLVM_INSTALL_DIR}/bin/llvm-nm \ + -DCMAKE_RANLIB=${LLVM_INSTALL_DIR}/bin/llvm-ranlib \ + -DCOMPILER_RT_BUILD_BUILTINS=ON \ + -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \ + -DCOMPILER_RT_BUILD_MEMPROF=OFF \ + -DCOMPILER_RT_BUILD_PROFILE=OFF \ + -DCOMPILER_RT_BUILD_SANITIZERS=OFF \ + -DCOMPILER_RT_BUILD_XRAY=OFF \ + -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON \ + -DCOMPILER_RT_BAREMETAL_BUILD=ON \ + -DCOMPILER_RT_OS_DIR="baremetal" \ + -DLLVM_CONFIG_PATH=${LLVM_INSTALL_DIR}/bin/llvm-config && \ + ${CMAKE} --build . -j && \ + ${CMAKE} --install . 
+ +llvm-compiler-rt-arm: ${LLVM_CLANG_RT_ARM} + +${TOOLCHAIN_DIR}/picolibc: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/picolibc/picolibc.git && \ + cd ${TOOLCHAIN_DIR}/picolibc && git checkout ${PICOLIBC_COMMIT_HASH} && \ + git submodule update --init --recursive + +${PICOLIBC_ARM_INSTALL_DIR}: ${TOOLCHAIN_DIR}/picolibc + cd ${TOOLCHAIN_DIR}/picolibc && mkdir -p build-arm && cd build-arm && \ + cp ${TOOLCHAIN_DIR}/meson-build-script-arm.txt ../scripts && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson setup --reconfigure -Dincludedir=include \ + -Dlibdir=lib \ + -Dspecsdir=none \ + -Dmultilib=false \ + --prefix ${PICOLIBC_ARM_INSTALL_DIR} \ + --cross-file ../scripts/meson-build-script-arm.txt && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson install + +picolibc-arm: ${PICOLIBC_ARM_INSTALL_DIR} + +${PICOLIBC_RISCV_INSTALL_DIR}: ${TOOLCHAIN_DIR}/picolibc + cd ${TOOLCHAIN_DIR}/picolibc && mkdir -p build-riscv && cd build-riscv && \ + cp ${TOOLCHAIN_DIR}/meson-build-script-riscv.txt ../scripts && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson setup --reconfigure -Dincludedir=include \ + -Dlibdir=lib \ + -Dspecsdir=none \ + -Dmultilib=false \ + --prefix ${PICOLIBC_RISCV_INSTALL_DIR} \ + --cross-file ../scripts/meson-build-script-riscv.txt && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson install + +picolibc-riscv: ${PICOLIBC_RISCV_INSTALL_DIR} + +${TOOLCHAIN_DIR}/pulp-sdk: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/Scheremo/pulp-sdk.git -b scheremo && \ + cd ${TOOLCHAIN_DIR}/pulp-sdk && git checkout ${PULP_SDK_COMMIT_HASH} && \ + git submodule update --init --recursive + +${PULP_SDK_INSTALL_DIR}: ${TOOLCHAIN_DIR}/pulp-sdk + mkdir -p ${PULP_SDK_INSTALL_DIR} + cp -r ${TOOLCHAIN_DIR}/pulp-sdk/ ${PULP_SDK_INSTALL_DIR}/../ + cd ${PULP_SDK_INSTALL_DIR} && \ + source configs/siracusa.sh && \ + make build + +pulp-sdk: ${PULP_SDK_INSTALL_DIR} + +${TOOLCHAIN_DIR}/qemu: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/qemu/qemu.git --depth 1 
-b stable-6.1 && \ + cd ${TOOLCHAIN_DIR}/qemu && \ + git submodule update --init --recursive + +${QEMU_INSTALL_DIR}: ${TOOLCHAIN_DIR}/qemu + cd ${TOOLCHAIN_DIR}/qemu/ && \ + mkdir -p build && cd build && \ + ../configure --target-list=arm-softmmu,arm-linux-user,riscv32-softmmu,riscv32-linux-user \ + --prefix=${QEMU_INSTALL_DIR} && \ + make -j && \ + make install + +qemu: ${QEMU_INSTALL_DIR} + +${TOOLCHAIN_DIR}/banshee: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/pulp-platform/banshee.git && \ + cd ${TOOLCHAIN_DIR}/banshee && git checkout ${BANSHEE_COMMIT_HASH} && \ + git submodule update --init --recursive && \ + git apply ${TOOLCHAIN_DIR}/banshee.patch + +${BANSHEE_INSTALL_DIR}: ${TOOLCHAIN_DIR}/banshee + export LLVM_SYS_150_PREFIX=${LLVM_INSTALL_DIR} && \ + cd ${TOOLCHAIN_DIR}/banshee/ && \ + ${RUSTUP_CARGO} clean && \ + ${RUSTUP_CARGO} install --path . -f + +banshee: ${BANSHEE_INSTALL_DIR} + +mempool: ${MEMPOOL_INSTALL_DIR} + +${TOOLCHAIN_DIR}/mempool: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/Xeratec/mempool.git && \ + cd ${TOOLCHAIN_DIR}/mempool && git checkout ${MEMPOOL_COMMIT_HASH} + +${MEMPOOL_INSTALL_DIR}: ${TOOLCHAIN_DIR}/mempool + mkdir -p ${MEMPOOL_INSTALL_DIR}/software && \ + cd ${TOOLCHAIN_DIR}/mempool && \ + cp -r ${TOOLCHAIN_DIR}/mempool/software/runtime ${MEMPOOL_INSTALL_DIR}/software + +.PHONY: docs clean-docs format + +format: + python scripts/run_clang_format.py -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -ir ./ scripts --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format + autoflake -i -r --remove-all-unused-imports --ignore-init-module-imports --exclude "*/third_party/**" ./ + yapf -ipr -e "third_party/" -e "install/" -e "toolchain/" . 
+ isort --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./
+
+docs:
+	make -C docs html
+clean-docs:
+	rm -rf docs/_autosummary docs/_build
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c3af707
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+# Deeploy
+
+Deeploy is a Python tool to generate low-level optimized C code for multi-cluster, heterogeneous SoCs. Its goal is to enable configurable deployment flows from a bottom-up compiler perspective, modelling target hardware in a fine-grained and modular manner.
+
+Deeploy is developed as part of the PULP project, a joint effort between ETH Zurich and the University of Bologna.
+
+## License
+
+Unless specified otherwise in the respective file headers, all code checked into this repository is made available under a permissive license. All software sources and tool scripts are licensed under Apache 2.0, except for files contained in the `scripts` directory, which are licensed under the MIT license, and files contained in the `DeeployTest/Tests` directory, which are licensed under the [Creative Commons Attribution-NoDerivatives 4.0 International](https://creativecommons.org/licenses/by-nd/4.0) license (CC BY-ND 4.0).
+
+## Getting started
+
+To install Deeploy, simply run
+```
+pip install -e .
+```
+to download and install the Python dependencies. Run
+```
+make docs
+```
+and open `docs/_build/html/index.html` for more extensive documentation & getting started guides.
diff --git a/TargetLibraries/CMSIS/CMakeLists.txt b/TargetLibraries/CMSIS/CMakeLists.txt new file mode 100644 index 0000000..2b6b75d --- /dev/null +++ b/TargetLibraries/CMSIS/CMakeLists.txt @@ -0,0 +1,24 @@ +file(GLOB_RECURSE SOURCES + "src/**" +) + +add_deeploy_library(deeploycmsis STATIC ${SOURCES}) + +target_include_directories(deeploycmsis + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/inc +) + +# SCHEREMO: CMSIS-NN +add_subdirectory(third_party/CMSIS-NN) +target_compile_options(cmsis-nn + PRIVATE + -DARM_MATH_DSP + -DARM_MATH_LOOPUNROLL + -Ofast + # SCHEREMO: Waive CMSIS-NN warnings + -Wno-sign-conversion + -Wno-conversion +) + +target_link_libraries(deeploycmsis INTERFACE cmsis-nn) diff --git a/TargetLibraries/CMSIS/inc/DeeployMath.h b/TargetLibraries/CMSIS/inc/DeeployMath.h new file mode 100644 index 0000000..7aa1b18 --- /dev/null +++ b/TargetLibraries/CMSIS/inc/DeeployMath.h @@ -0,0 +1,45 @@ +/* ===================================================================== + * Title: DeeployMath.h + * Description: + * + * $Date: 30.12.2021 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_MATH_HEADER_ +#define __DEEPLOY_MATH_HEADER_ + +#include +#include +#include +#include +#include + +#if defined(AM_PART_APOLLO4B) | defined(DAM_PART_APOLLO3) +#include "am_bsp.h" +#include "am_mcu_apollo.h" +#include "am_util.h" +#endif + +#include "DeeployBasicMath.h" + +#endif // __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/CMSIS/src/Util.c b/TargetLibraries/CMSIS/src/Util.c new file mode 100644 index 0000000..257ea95 --- /dev/null +++ b/TargetLibraries/CMSIS/src/Util.c @@ -0,0 +1,53 @@ +/* ===================================================================== + * Title: Util.c + * Description: + * + * Date: 15.03.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except pSrcA compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to pSrcA writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployMath.h" +#include +#include +#include + +// Overwrite weak function from DeeployBasicLibs +int deeploy_log(const char *__restrict fmt, ...) 
{ + va_list args; + va_start(args, fmt); + int ret; + +#if defined(AM_PART_APOLLO4B) | defined(DAM_PART_APOLLO3) + ret = am_util_stdio_vprintf(fmt, args); +#else + ret = vprintf(fmt, args); +#endif + + va_end(args); + return ret; +} + +void *deeploy_malloc(const size_t size) { return malloc(size); } + +void deeploy_free(void *const ptr) { free(ptr); } diff --git a/TargetLibraries/CMSIS/third_party/CMSIS-NN b/TargetLibraries/CMSIS/third_party/CMSIS-NN new file mode 160000 index 0000000..9d924bd --- /dev/null +++ b/TargetLibraries/CMSIS/third_party/CMSIS-NN @@ -0,0 +1 @@ +Subproject commit 9d924bdaee51ca8e0c4e86779bbb6d0c9644e555 diff --git a/TargetLibraries/Generic/CMakeLists.txt b/TargetLibraries/Generic/CMakeLists.txt new file mode 100644 index 0000000..55bcde7 --- /dev/null +++ b/TargetLibraries/Generic/CMakeLists.txt @@ -0,0 +1,10 @@ +file(GLOB_RECURSE SOURCES + "src/**" +) + +add_deeploy_library(deeploybasic STATIC ${SOURCES}) + +target_include_directories(deeploybasic + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/inc +) diff --git a/TargetLibraries/Generic/inc/DeeployBasicMath.h b/TargetLibraries/Generic/inc/DeeployBasicMath.h new file mode 100644 index 0000000..b9e1acd --- /dev/null +++ b/TargetLibraries/Generic/inc/DeeployBasicMath.h @@ -0,0 +1,73 @@ +/* ===================================================================== + * Title: DeeployBasicMath.h + * Description: + * + * Date: 14.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * - Victor Jung, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_HEADER_ +#define __DEEPLOY_BASIC_MATH_HEADER_ + +// Define default empty wrapper for single core section +#ifndef BEGIN_SINGLE_CORE +#define BEGIN_SINGLE_CORE +#endif + +#ifndef END_SINGLE_CORE +#define END_SINGLE_CORE +#endif + +#ifndef SINGLE_CORE +#define SINGLE_CORE +#endif + +#include +#include +#include +#include +#include +#include + +#include "macros.h" +#include "util.h" + +#include "kernel/Convolution.h" +#include "kernel/DWConvolution.h" +#include "kernel/Div.h" +#include "kernel/GELU.h" +#include "kernel/Gemm.h" +#include "kernel/Hardswish.h" +#include "kernel/Layernorm.h" +#include "kernel/MatMul.h" +#include "kernel/MaxPool.h" +#include "kernel/RMSNorm.h" +#include "kernel/RQDiv.h" +#include "kernel/RQGELU.h" +#include "kernel/RQHardswish.h" +#include "kernel/RequantShift.h" +#include "kernel/Softmax.h" + +#endif //__DEEPLOY_BASIC_MATH_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Convolution.h b/TargetLibraries/Generic/inc/kernel/Convolution.h new file mode 100644 index 0000000..43c4a1f --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Convolution.h @@ -0,0 +1,62 @@ +/* ===================================================================== + * Title: Convolution.h + * Description: + * + * Date: 04.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. 
+ *
+ * Authors:
+ * - Philip Wiese, ETH Zurich
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_CONVOLUTION_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_CONVOLUTION_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/* This file implements convolution.
+ *
+ * A is an M x N input matrix, B is a P x Q kernel matrix and C is an M x N
+ * output matrix
+ *
+ */
+
+/******************************************************************************/
+/* General Convolution (8bit) */
+/******************************************************************************/
+
+/*
+ * 2D Convolution ----------------------------------
+ * kernel = Conv2d_s8_s8_s32_NCHW
+ * layout = NCHW
+ * data type = 8-bit integer
+ * kernel size = generic
+ * unrolling = no
+ * simd = no
+ */
+void Conv2d_s8_s8_s32_NCHW(int8_t const *__restrict__ pSrcA, uint32_t C,
+ uint32_t H, uint32_t W,
+ int8_t const *__restrict__ pSrcB, uint32_t F,
+ uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ,
+ int32_t *__restrict__ pDstC, int32_t input_offset,
+ int32_t output_offset);
+
+#endif //__DEEPLOY_BASIC_MATH_CONVOLUTION_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/DWConvolution.h b/TargetLibraries/Generic/inc/kernel/DWConvolution.h
new file mode 100644
index 0000000..36a3fad
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/DWConvolution.h
@@ -0,0 +1,62 @@
+/* 
===================================================================== + * Title: DWConvolution.h + * Description: + * + * Date: 05.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_DWCONVOLUTION_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_DWCONVOLUTION_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* This file implements depth-wise convolution. 
+ * + * A is an M x N input matrix, B is a P x Q kernel matrix and C is and MxN + * output matrix + * + */ + +/******************************************************************************/ +/* General Depth-Wise Convolution (8bit) */ +/******************************************************************************/ + +/* + * 2D Convolution ---------------------------------- + * kernel = DWConv2d_s8_s8_s32_NCHW + * layout = NCHW + * data type = 8-bit integer + * kernel size = generic + * unrolling = no + * simd = no + */ +void DWConv2d_s8_s8_s32_NCHW(int8_t const *__restrict__ pSrcA, uint32_t C, + uint32_t H, uint32_t W, + int8_t const *__restrict__ pSrcB, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + int32_t *__restrict__ pDstC, int32_t input_offset, + int32_t output_offset); + +#endif //__DEEPLOY_BASIC_MATH_DWCONVOLUTION_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Div.h b/TargetLibraries/Generic/inc/kernel/Div.h new file mode 100644 index 0000000..672cff2 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Div.h @@ -0,0 +1,48 @@ +/* ===================================================================== + * Title: Div.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_DIV_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_DIV_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * This file implements the element-wise binary division. + */ + +/******************************************************************************/ +/* Division (32bit) */ +/******************************************************************************/ + +void Div_s32_s32(int32_t *data_in_nom, int32_t *data_in_denom, int32_t size_nom, + int32_t size_denom, int32_t nomStep, int32_t denomStep, + int32_t *data_out, int32_t Delta, int32_t eps, int32_t eta); + +#endif //__DEEPLOY_BASIC_MATH_DIV_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/GELU.h b/TargetLibraries/Generic/inc/kernel/GELU.h new file mode 100644 index 0000000..bcede0b --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/GELU.h @@ -0,0 +1,47 @@ +/* ===================================================================== + * Title: GELU.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_BASIC_MATH_GELU_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_GELU_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * + */ + +/******************************************************************************/ +/* GELU (8bit) */ +/******************************************************************************/ + +void GELU_s8_s32(int8_t *data_in, int32_t *data_out, int32_t dataSize, int8_t b, + int16_t one, int32_t input_offset); + +#endif //__DEEPLOY_BASIC_MATH_GELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Gemm.h b/TargetLibraries/Generic/inc/kernel/Gemm.h new file mode 100644 index 0000000..1f0ae32 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Gemm.h @@ -0,0 +1,66 @@ +/* ===================================================================== + * Title: Gemm.h + * Description: + * + * Date: 05.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_GEMM_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_GEMM_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * This library implements the matrix multiplication for several data widths + * in multiple different ways.
The functions all follow the following format: + * + * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix + * A' = transpose(A) if transA else A + * B' = transpose(B) if transB else B + * + * Y = alpha * A' * B' + beta * C + * + */ + +/******************************************************************************/ +/* General Matrix Multiplication (8bit) */ +/******************************************************************************/ + +/* + * Matrix multiplication ---------------------------------- + * kernel = Gemm_s8_s8_s32 + * data type = 8-bit integer + * unrolling = no + * cleanup = yes + */ +void Gemm_s8_s8_s32_s32(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, + int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, + uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t A_offset, int32_t B_offset, + int32_t C_offset, int32_t Y_offset); + +#endif //__DEEPLOY_BASIC_MATH_GEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Hardswish.h b/TargetLibraries/Generic/inc/kernel/Hardswish.h new file mode 100644 index 0000000..967d2b6 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Hardswish.h @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- +# +# File: Hardswish.h +# +# Last edited: 22.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +#include "DeeployBasicMath.h" + +/******************************************************************************/ +/* Hardswish (8bit) */ +/******************************************************************************/ + +void iHardswish_s8_s32(int8_t *input, int32_t *output, int32_t size, + int32_t one_over_six, int32_t three, int32_t six, + int32_t input_offset); \ No newline at end of file diff --git a/TargetLibraries/Generic/inc/kernel/Layernorm.h b/TargetLibraries/Generic/inc/kernel/Layernorm.h new file mode 100644 index 0000000..9539096 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Layernorm.h @@ -0,0 +1,48 @@ +/* ===================================================================== + * Title: Layernorm.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_BASIC_MATH_LAYERNORM_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_LAYERNORM_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * + */ + +/******************************************************************************/ +/* Layernorm (8bit) */ +/******************************************************************************/ + +void Layernorm_s8_s8(int8_t *data_in, int8_t *data_out, int32_t *weight, + int32_t *bias, int32_t input_offset, int32_t size, + int32_t lastDimLength, int32_t log2D); + +#endif //__DEEPLOY_BASIC_MATH_LAYERNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/MatMul.h b/TargetLibraries/Generic/inc/kernel/MatMul.h new file mode 100644 index 0000000..33b6496 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/MatMul.h @@ -0,0 +1,65 @@ +/* ===================================================================== + * Title: MatMul.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_BASIC_MATH_MATMUL_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_MATMUL_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * This library implements the matrix multiplication for several data widths + * in multiple different ways. The functions all follow the following format: + * + * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix + * C = AB + * + * Note that all the matrices dimensions must be multiples of 4; these + * kernels do not have clean-up code and remaining elements would not be + * considered, leading to wrong results + */ + +/******************************************************************************/ +/* Matrix Multiplication (8bit) */ +/******************************************************************************/ + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_s8_s8_s32 + * data type = 8-bit integer + * unrolling = no + * cleanup = yes + */ +void MatMul_s8_s8_s32(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t A_offset, int32_t B_offset, + int32_t C_offset); + +#endif //__DEEPLOY_BASIC_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/MaxPool.h b/TargetLibraries/Generic/inc/kernel/MaxPool.h new file mode 100644 index 0000000..b062ed0 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/MaxPool.h @@ -0,0 +1,60 @@ +/* ===================================================================== + * Title: MaxPool.h + * Description: + * + * Date: 04.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna.
+ * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_MAXPOOL_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_MAXPOOL_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* This file implements the MaxPool operation. + * + * A is an M x N input matrix, P x Q the kernel size and SPxSQ the kernel + * stride. + * + */ + +/******************************************************************************/ +/* General MaxPool (8bit) */ +/******************************************************************************/ + +/* + * 2D Maxpool ---------------------------------- + * kernel = MaxPool2d_s8_s8_NCHW + * layout = NCHW + * data type = 8-bit integer + * kernel size = generic + * unrolling = no + * simd = no + */ +void MaxPool2d_s8_s8_NCHW(int8_t const *__restrict__ pSrcA, uint32_t C, + uint32_t H, uint32_t W, uint32_t P, uint32_t Q, + uint32_t SP, uint32_t SQ, int8_t *__restrict__ pDstC, + int32_t input_offset, int32_t output_offset); + +#endif //__DEEPLOY_BASIC_MATH_MAXPOOL_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/RMSNorm.h b/TargetLibraries/Generic/inc/kernel/RMSNorm.h new file mode 100644 index 0000000..a960b4a --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/RMSNorm.h @@ -0,0 +1,45 @@ +/* ===================================================================== + * Title: RMSNorm.h + * Description: + * + * $Date: 20.02.2024 + * + 
 * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_RMSNORM_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_RMSNORM_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * + */ + +/******************************************************************************/ +/* RMSNorm (8bit) */ +/******************************************************************************/ + +void iRMSnorm_s8_s8(int8_t *data_in, int8_t *data_out, int32_t *weight, + int32_t input_offset, int32_t size, int32_t lastDimLength, + int32_t log2D); + +#endif //__DEEPLOY_BASIC_MATH_RMSNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/RQDiv.h b/TargetLibraries/Generic/inc/kernel/RQDiv.h new file mode 100644 index 0000000..3e79d01 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/RQDiv.h @@ -0,0 +1,50 @@ +/* ===================================================================== + * Title: RQDiv.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna.
+ * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_RQDIV_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_RQDIV_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * This file implements the requantized division. + */ + +/******************************************************************************/ +/* Division with requantization to 8bit */ +/******************************************************************************/ + +void RQDiv_s32_s8(int32_t *data_in_nom, int32_t *data_in_denom, + int32_t size_nom, int32_t size_denom, int32_t nomStep, + int32_t denomStep, int8_t *data_out, int32_t Delta, + int32_t eps, int32_t eta, int32_t requant_mul, + int32_t requant_add, int32_t requant_shift); + +#endif //__DEEPLOY_BASIC_MATH_RQDIV_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/RQGELU.h b/TargetLibraries/Generic/inc/kernel/RQGELU.h new file mode 100644 index 0000000..8965a34 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/RQGELU.h @@ -0,0 +1,48 @@ +/* ===================================================================== + * Title: RQGELU.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. 
+ * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_RQGELU_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_RQGELU_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * This file implements the requantized GELU. + */ + +/******************************************************************************/ +/* GELU with requantization to 8bit */ +/******************************************************************************/ + +void RQGELU_s8_s8(int8_t *data_in, int8_t *data_out, int32_t dataSize, int8_t b, + int16_t one, int32_t input_offset, int32_t output_offset, + int32_t *mul, int32_t *add, int32_t *shift); + +#endif //__DEEPLOY_BASIC_MATH_RQGELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/RQHardswish.h b/TargetLibraries/Generic/inc/kernel/RQHardswish.h new file mode 100644 index 0000000..6e2ea7e --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/RQHardswish.h @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- +# +# File: RQHardswish.h +# +# Last edited: 23.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +#include "DeeployBasicMath.h" + +/******************************************************************************/ +/* Requantized Hardswish (8bit) */ +/******************************************************************************/ + +void RQiHardswish_s8_s8(int8_t *input, int8_t *output, int32_t size, + int32_t one_over_six, int32_t three, int32_t six, + int32_t input_offset, int32_t output_offset, + int32_t mul, int32_t add, int32_t shift); diff --git a/TargetLibraries/Generic/inc/kernel/RequantShift.h b/TargetLibraries/Generic/inc/kernel/RequantShift.h new file mode 100644 index 0000000..cc1c1cd --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/RequantShift.h @@ -0,0 +1,135 @@ +/* ===================================================================== + * Title: RequantShift.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_REQUANTSHIFT_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_REQUANTSHIFT_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * This file implements the requantization kernel for several data widths + * in multiple different ways. + */ + +/******************************************************************************/ +/* Requantization to 8bit */ +/******************************************************************************/ + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_s8_s8_NHWC + * layout = NHWC + * input data type = 8-bit integer + * output data type = 8-bit integer + * unrolling = no + * simd = no + */ +void RequantShift_s8_s8_NHWC(int8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_s16_s8_NHWC + * layout = NHWC + * input data type = 16-bit integer + * output data type = 8-bit integer + * unrolling = no + * simd = no + */ +void RequantShift_s16_s8_NHWC(int16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_s32_s8_NHWC + * layout = NHWC + * input data 
type = 32-bit integer + * output data type = 8-bit integer + * unrolling = no + * simd = no + */ +void RequantShift_s32_s8_NHWC(int32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_s8_s8_NCHW + * layout = NCHW + * input data type = 8-bit integer + * output data type = 8-bit integer + * unrolling = no + * simd = no + */ +void RequantShift_s8_s8_NCHW(int8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_s16_s8_NCHW + * layout = NCHW + * input data type = 16-bit integer + * output data type = 8-bit integer + * unrolling = no + * simd = no + */ +void RequantShift_s16_s8_NCHW(int16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_s32_s8_NCHW + * layout = NCHW + * input data type = 32-bit integer + * output data type = 8-bit integer + * unrolling = no + * simd = no + */ +void RequantShift_s32_s8_NCHW(int32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +#endif //__DEEPLOY_BASIC_MATH_REQUANTSHIFT_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Softmax.h b/TargetLibraries/Generic/inc/kernel/Softmax.h new file mode 100644 index 0000000..ebe4874
--- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Softmax.h @@ -0,0 +1,91 @@ +/* ===================================================================== + * Title: Softmax.h + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_SOFTMAX_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_SOFTMAX_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * This file implements various softmax kernels. + */ + +/******************************************************************************/ +/* Softmax (8bit) */ +/******************************************************************************/ + +/** + * @brief Approximate softmax implementation according to the I-BERT paper. + * @see https://arxiv.org/abs/2101.01321 + * + * @param data_in + * @param data_out + * @param size + * @param lastDimLength + * @param coeffA + * @param coeffB + * @param coeffC + * @param log2 + * @param n_levels + */ +void Softmax_s8_s8(int8_t *data_in, int8_t *data_out, uint32_t size, + uint32_t lastDimLength, int32_t coeffA, int32_t coeffB, + int64_t coeffC, int32_t log2, uint32_t n_levels); + +/** + * @brief Approximate softmax implementation. 
+ * + * @param pSrcA + * @param pDstB + * @param pBufN + * @param size + * @param lastDimLength + * @param n_levels + */ +void ITAMax_s8(int8_t const *__restrict__ pSrcA, int8_t *__restrict__ pDstB, + int8_t *__restrict__ pBufN, uint32_t size, + uint32_t lastDimLength, uint32_t n_levels); + +/** + * @brief Approximate partial softmax implementation used in ITA. + * + * @param pSrcA + * @param pDstB + * @param size + * @param lastDimLength + * @param group_width + * @param n_levels + */ +void ITAPartialMax_s8(int8_t const *__restrict__ pSrcA, + int8_t *__restrict__ pDstB, uint32_t size, + uint32_t lastDimLength, uint32_t group_width, + uint32_t n_levels); + +#endif //__DEEPLOY_BASIC_MATH_SOFTMAX_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/macros.h b/TargetLibraries/Generic/inc/macros.h new file mode 100644 index 0000000..3235424 --- /dev/null +++ b/TargetLibraries/Generic/inc/macros.h @@ -0,0 +1,52 @@ +/* ===================================================================== + * Title: macros.h + * Description: + * + * Date: 29.11.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Samuel Riedel, ETH Zurich + * - Sergio Mazzola, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_BASIC_MATH_MACROS_HEADER_ +#define __DEEPLOY_BASIC_MATH_MACROS_HEADER_ + +#define MAX(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +#define MIN(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) + +#define CLAMP(x, low, high) \ + (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) + +#endif //__DEEPLOY_BASIC_MATH_MACROS_HEADER_ diff --git a/TargetLibraries/Generic/inc/util.h b/TargetLibraries/Generic/inc/util.h new file mode 100644 index 0000000..ff9f143 --- /dev/null +++ b/TargetLibraries/Generic/inc/util.h @@ -0,0 +1,92 @@ +/* ===================================================================== + * Title: util.h + * Description: + * + * Date: 06.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_BASIC_MATH_UTIL_HEADER_ +#define __DEEPLOY_BASIC_MATH_UTIL_HEADER_ + +int deeploy_log(const char *__restrict fmt, ...) 
+ __attribute__((__format__(__printf__, 1, 2))); +void *deeploy_malloc(const size_t size); +void deeploy_free(void *const ptr); + +void PrintMatrix_s8_NCHW(int8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset); + +void PrintMatrix_s8_NHWC(int8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset); + +void PrintMatrix_s16_NCHW(int16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset); + +void PrintMatrix_s16_NHWC(int16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset); + +void PrintMatrix_s32_NCHW(int32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset); + +void PrintMatrix_s32_NHWC(int32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset); + +void PrintMatrix_u8_NCHW(uint8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset); + +void PrintMatrix_u8_NHWC(uint8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset); + +void PrintMatrix_u16_NCHW(uint16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset); + +void PrintMatrix_u16_NHWC(uint16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset); + +void PrintMatrix_u32_NCHW(uint32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset); + +void PrintMatrix_u32_NHWC(uint32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset); + +void PrintArray_s8(int8_t const *__restrict__ pSrcA, uint32_t N, + int32_t offset); + +void PrintArray_s16(int16_t const *__restrict__ pSrcA, uint32_t N, + int32_t offset); + +void PrintArray_s32(int32_t const *__restrict__ pSrcA, uint32_t N, + int32_t offset); + +void PrintArray_u8(uint8_t 
const *__restrict__ pSrcA, uint32_t N, + uint32_t offset); + +void PrintArray_u16(uint16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t offset); + +void PrintArray_u32(uint32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t offset); + +#endif //__DEEPLOY_BASIC_MATH_UTIL_HEADER_ diff --git a/TargetLibraries/Generic/src/Convolution_s8.c b/TargetLibraries/Generic/src/Convolution_s8.c new file mode 100644 index 0000000..a250124 --- /dev/null +++ b/TargetLibraries/Generic/src/Convolution_s8.c @@ -0,0 +1,75 @@ +/* ===================================================================== + * Title: Convolution_s8.c + * Description: + * + * Date: 04.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployBasicMath.h" + +void Conv2d_s8_s8_s32_NCHW(int8_t const *__restrict__ pSrcA, uint32_t C, + uint32_t H, uint32_t W, + int8_t const *__restrict__ pSrcB, uint32_t F, + uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ, + int32_t *__restrict__ pDstC, int32_t input_offset, + int32_t output_offset) { + + // WIESEP: For now assume padding=0 + uint32_t H_out = (H - P) / SP + 1; + uint32_t W_out = (W - Q) / SQ + 1; + + uint32_t c = 0; // input channel loop counter + uint32_t h = 0; // input row loop counter + uint32_t w = 0; // input column loop counter + + uint32_t f = 0; // kernel filter loop counter + uint32_t p = 0; // kernel row loop counter + uint32_t q = 0; // kernel column loop counter + + int32_t sum; + for (f = 0; f < F; ++f) { + for (h = 0; h < H_out; ++h) { + for (w = 0; w < W_out; ++w) { + sum = 0; + for (c = 0; c < C; ++c) { + // printf("(%2d,%2d,%2d) ", c, h, w); + for (p = 0; p < P; ++p) { + for (q = 0; q < Q; ++q) { + sum += (pSrcA[c * H * W + (h * SP + p) * W + (w * SQ + q)] + + input_offset) * + pSrcB[f * C * P * Q + c * P * Q + p * Q + q]; + // printf("%4d*%-4d + ", pSrcA[c * H * W + (h * SP + p) * W + (w * + // SQ + q)], + // pSrcB[f * C * P * Q + c * P * Q + p * Q + q]); + } + } + // printf("\r\n"); + } + // printf("= %-6ld\r\n", sum); + pDstC[f * H_out * W_out + h * W_out + w] = sum + output_offset; + } + } + } +} diff --git a/TargetLibraries/Generic/src/DWConvolution_s8.c b/TargetLibraries/Generic/src/DWConvolution_s8.c new file mode 100644 index 0000000..1a7da6a --- /dev/null +++ b/TargetLibraries/Generic/src/DWConvolution_s8.c @@ -0,0 +1,72 @@ +/* ===================================================================== + * Title: DWConvolution_s8.c + * Description: + * + * Date: 05.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. 
+ * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" + +void DWConv2d_s8_s8_s32_NCHW(int8_t const *__restrict__ pSrcA, uint32_t C, + uint32_t H, uint32_t W, + int8_t const *__restrict__ pSrcB, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + int32_t *__restrict__ pDstC, int32_t input_offset, + int32_t output_offset) { + + // WIESEP: For now assume padding=0 + uint32_t H_out = (H - P) / SP + 1; + uint32_t W_out = (W - Q) / SQ + 1; + + uint32_t c = 0; // input channel loop counter + uint32_t h = 0; // input row loop counter + uint32_t w = 0; // input column loop counter + + uint32_t f = 0; // kernel filter loop counter + uint32_t p = 0; // kernel row loop counter + uint32_t q = 0; // kernel column loop counter + + int32_t sum; + for (c = 0; c < C; ++c) { + for (h = 0; h < H_out; ++h) { + for (w = 0; w < W_out; ++w) { + sum = 0; + // printf("(%2ld,%2ld,%2ld) ", c, h, w); + for (p = 0; p < P; ++p) { + for (q = 0; q < Q; ++q) { + sum += (pSrcA[c * H * W + (h * SP + p) * W + (w * SQ + q)] + + input_offset) * + pSrcB[f * C * P * Q + c * P * Q + p * Q + q]; + // printf("%4d*%-4d + ", pSrcA[c * H * W + (h * SP + p) * W + (w * + // SQ + q)], pSrcB[f * C * P * Q + c * P * Q + p * Q + q]); + } + } + // printf("\r\n"); + // printf("= %-6ld\r\n", sum); + pDstC[c * H_out * W_out + h * W_out + w] = sum + output_offset; + } + } + } +} diff --git 
a/TargetLibraries/Generic/src/Div_s32.c b/TargetLibraries/Generic/src/Div_s32.c new file mode 100644 index 0000000..c2b66ad --- /dev/null +++ b/TargetLibraries/Generic/src/Div_s32.c @@ -0,0 +1,58 @@ +/* ===================================================================== + * Title: Div_s32.c + * Description: + * + * $Date: 19.12.2022 + * + * ===================================================================== */ +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployBasicMath.h" + +void Div_s32_s32(int32_t *data_in_nom, int32_t *data_in_denom, int32_t size_nom, + int32_t __attribute__((unused)) size_denom, int32_t nomStep, + int32_t denomStep, int32_t *data_out, int32_t Delta, + int32_t eps, int32_t eta) { + + int32_t innerMostIter = denomStep; + int32_t secondIter = nomStep / innerMostIter; + int32_t thirdIter = size_nom / secondIter; + int64_t nom; + int32_t sgnNom = 0; + int64_t denom; + + for (int i = 0; i < thirdIter; i++) { + for (int k = 0; k < innerMostIter; k++) { + denom = data_in_denom[i * innerMostIter + k]; + denom = ((eta * denom) + eps); + for (int j = 0; j < secondIter; j++) { + nom = + data_in_nom[i * secondIter * innerMostIter + j * innerMostIter + k]; + nom = (Delta * eta * nom); + sgnNom = (nom >= 0) - (nom < 0); + data_out[i * secondIter * innerMostIter + j * innerMostIter + k] = + (int32_t)((nom + sgnNom * (denom >> 1)) / denom); + } + } + } +} diff --git a/TargetLibraries/Generic/src/GELU_s8.c b/TargetLibraries/Generic/src/GELU_s8.c new file mode 100644 index 0000000..91f4254 --- /dev/null +++ b/TargetLibraries/Generic/src/GELU_s8.c @@ -0,0 +1,51 @@ +/* ===================================================================== + * Title: GELU_s8.c + * Description: + * + * $Date: 19.12.2022 + * + * ===================================================================== */ +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" + +void GELU_s8_s32(int8_t *data_in, int32_t *data_out, int32_t dataSize, int8_t b, + int16_t one, int32_t input_offset) { + int32_t sign, x, x_abs, q; + int32_t d; + int32_t L, y; + for (int i = 0; i < dataSize; i++) { + x = data_in[i] + input_offset; + sign = (x > 0) - (x < 0); // sgn(x) + x_abs = sign * x; // abs(x) + if (x_abs > -b) { + q = -b; + } else { + q = x_abs; + } + d = q + b; + L = sign * (-(d * d) + one); + y = x * (((one + L)) >> 1); + data_out[i] = y; + } +} diff --git a/TargetLibraries/Generic/src/Gemm_s8.c b/TargetLibraries/Generic/src/Gemm_s8.c new file mode 100644 index 0000000..bd5e807 --- /dev/null +++ b/TargetLibraries/Generic/src/Gemm_s8.c @@ -0,0 +1,92 @@ +/* ===================================================================== + * Title: Gemm_s8.c + * Description: + * + * $Date: 05.01.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" + +void Gemm_s8_s8_s32_s32(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, + int32_t *__restrict__ pDstY, uint32_t M, uint32_t N, + uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t A_offset, int32_t B_offset, + int32_t C_offset, int32_t Y_offset) { + + uint32_t m = 0; // loop counter + uint32_t n = 0; // loop counter + uint32_t p = 0; // loop counter + + if (transA == 0 && transB == 0) { + for (m = 0; m < M; ++m) { + for (p = 0; p < P; p++) { + int32_t sum = 0; + for (n = 0; n < N; n++) { + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[n * P + p] + B_offset); + } + pDstY[m * P + p] = + alpha * sum + beta * (pSrcC[m * P + p] + C_offset) + Y_offset; + } + } + } else if (transA == 1 && transB == 0) { + for (uint32_t m = 0; m < M; ++m) { + for (p = 0; p < P; p++) { + int32_t sum = 0; + for (n = 0; n < N; n++) { + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[n * P + p] + B_offset); + } + pDstY[m * P + p] = + alpha * sum + beta * (pSrcC[m * P + p] + C_offset) + Y_offset; + } + } + } else if (transA == 0 && transB == 1) { + for (uint32_t m = 0; m < M; ++m) { + for (p = 0; p < P; p++) { + int32_t sum = 0; + for (n = 0; n < N; n++) { + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[p * N + n] + B_offset); + } + pDstY[m * P + p] = + alpha * sum + beta * (pSrcC[m * P + p] + C_offset) + Y_offset; + } + } + } else { + for (uint32_t m = 0; m < M; ++m) { + for (p = 0; p < P; p++) { + int32_t sum = 0; + for (n = 0; n < N; n++) { + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[p * N + n] + B_offset); + } + pDstY[m * P + p] = + alpha * sum + beta * (pSrcC[m * P + p] + C_offset) + Y_offset; + } + } + } +} diff --git a/TargetLibraries/Generic/src/Hardswish_s8.c b/TargetLibraries/Generic/src/Hardswish_s8.c 
new file mode 100644 index 0000000..fb7c1f9 --- /dev/null +++ b/TargetLibraries/Generic/src/Hardswish_s8.c @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- +# +# File: Hardswish_s8.c +# +# Last edited: 22.02.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +#include "DeeployBasicMath.h" + +void iHardswish_s8_s32(int8_t *input, int32_t *output, int32_t size, + int32_t one_over_six, int32_t three, int32_t six, + int32_t input_offset) { + + int32_t temp; + + for (int i = 0; i < size; i++) { + temp = input[i] + input_offset + three; + if (temp < 0) { + temp = 0; + } + if (temp > six) { + temp = six; + } + temp = temp * one_over_six; + temp = input[i] * temp; + output[i] = temp; + } +} \ No newline at end of file diff --git a/TargetLibraries/Generic/src/Layernorm_s8.c b/TargetLibraries/Generic/src/Layernorm_s8.c new file mode 100644 index 0000000..5a30e2d --- /dev/null +++ b/TargetLibraries/Generic/src/Layernorm_s8.c @@ -0,0 +1,104 @@ +/* ===================================================================== + * Title: Layernorm_s8.c + * Description: + * + * $Date: 19.12.2022 + * + * ===================================================================== */ +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. 
+ * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" + +// Taken from PULP-DSP - Moritz Scherer is original author +void _plp_sqrt_q32(const int32_t *__restrict__ pSrc, const uint32_t fracBits, + int32_t *__restrict__ pRes) { + + int32_t number = *pSrc; + int32_t root = 0; + + int32_t start = 0; + int32_t end = 46342; // smallest integer that is larger than sqrt(0x7FFFFFFF) + int32_t mid; + + if (number > 0) { + + while (start <= end) { + + mid = (start + end) >> 1; + + if (((mid * mid) >> fracBits) == number) { + root = mid; + break; + } + + if (((mid * mid) >> fracBits) < number) { + start = mid + 1; + root = mid; + } else { + end = mid - 1; + } + } + + *pRes = root; + + } else { + *pRes = 0; + } +} + +void Layernorm_s8_s8(int8_t *data_in, int8_t *data_out, int32_t *weight, + int32_t *bias, int32_t input_offset, int32_t size, + int32_t lastDimLength, int32_t log2D) { + + int32_t mean; + // int16_t temp[size]; + int32_t sum; + int32_t std; + int16_t temp; + + for (int i = 0; i < (size / lastDimLength); i++) { + sum = 0; + mean = 0; + for (int j = 0; j < lastDimLength; j++) { + mean += data_in[j + i * lastDimLength] + input_offset; + } + mean = mean / lastDimLength; + for (int j = 0; j < lastDimLength; j++) { + temp = (int16_t)(data_in[j + i * lastDimLength] + input_offset - mean); + sum += temp * temp; + } + sum 
= sum / lastDimLength; + sum += 1; + _plp_sqrt_q32(&sum, 0, &std); + + for (int j = 0; j < lastDimLength; j++) { + data_out[j + i * lastDimLength] = + (int8_t)((((((int64_t)data_in[j + i * lastDimLength]) + input_offset - + mean) * + weight[j]) / + (std) + + bias[j]) >> + log2D); + } + } +} \ No newline at end of file diff --git a/TargetLibraries/Generic/src/MatMul_s8.c b/TargetLibraries/Generic/src/MatMul_s8.c new file mode 100644 index 0000000..2b53baa --- /dev/null +++ b/TargetLibraries/Generic/src/MatMul_s8.c @@ -0,0 +1,129 @@ +/* ===================================================================== + * Title: MatMul_s8.c + * Description: + * + * $Date: 19.12.2022 + * + * ===================================================================== */ +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployBasicMath.h" + +void MatMul_s8_s8_s32(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t A_offset, int32_t B_offset, + int32_t C_offset) { + uint32_t i = 0; // loop counter + uint32_t j = 0; // loop counter + uint32_t k = 0; // loop counter + + for (i = 0; i < M / 2; i++) { + for (k = 0; k < P / 2; k++) { + + int32_t sum00 = C_offset; + int32_t sum01 = C_offset; + int32_t sum10 = C_offset; + int32_t sum11 = C_offset; + + for (j = 0; j < N / 2; j++) { + int32_t AVal00 = pSrcA[(i * 2) * N + j * 2] + A_offset; + int32_t AVal10 = pSrcA[(i * 2 + 1) * N + j * 2] + A_offset; + int32_t AVal01 = pSrcA[(i * 2) * N + j * 2 + 1] + A_offset; + int32_t AVal11 = pSrcA[(i * 2 + 1) * N + j * 2 + 1] + A_offset; + int32_t BVal00 = pSrcB[(j * 2) * P + (k * 2)] + B_offset; + int32_t BVal01 = pSrcB[(j * 2) * P + (k * 2 + 1)] + B_offset; + int32_t BVal10 = pSrcB[(j * 2 + 1) * P + (k * 2)] + B_offset; + int32_t BVal11 = pSrcB[(j * 2 + 1) * P + (k * 2 + 1)] + B_offset; + + sum00 = sum00 + AVal00 * BVal00; + sum00 = sum00 + AVal01 * BVal10; + sum01 = sum01 + AVal00 * BVal01; + sum01 = sum01 + AVal01 * BVal11; + sum10 = sum10 + AVal10 * BVal00; + sum10 = sum10 + AVal11 * BVal10; + sum11 = sum11 + AVal10 * BVal01; + sum11 = sum11 + AVal11 * BVal11; + } + pDstC[(i * 2) * P + (k * 2)] = sum00; + pDstC[(i * 2) * P + (k * 2 + 1)] = sum01; + pDstC[(i * 2 + 1) * P + (k * 2)] = sum10; + pDstC[(i * 2 + 1) * P + (k * 2 + 1)] = sum11; + } + } + + // clean up code + i = i * 2; + j = j * 2; + k = k * 2; + + // clean up code + // check if every index is nicely finished + if (i == M && j == N && k == P) { + return; + } else { + uint32_t iEnd = i; + uint32_t jEnd = j; + uint32_t kEnd = k; + + // clean up for j + if (jEnd != N) { + for (i = 0; i < iEnd; i++) { + for (k = 0; k < kEnd; k++) { + int32_t sum = 0; + for (j = jEnd; j < N; j++) { + sum = sum + + (pSrcA[i * N + j] + A_offset) * 
(pSrcB[j * P + k] + B_offset); + } + pDstC[i * P + k] += sum; + } + } + } + + // clean up for k + if (kEnd != P) { + for (i = 0; i < iEnd; i++) { + for (k = kEnd; k < P; k++) { + int32_t sum = C_offset; + for (j = 0; j < N; j++) { + sum = sum + + (pSrcA[i * N + j] + A_offset) * (pSrcB[j * P + k] + B_offset); + } + pDstC[i * P + k] = sum; + } + } + } + + // clean up for i + for (i = iEnd; i < M; i++) { + for (k = 0; k < P; k++) { + int32_t sum = C_offset; + for (j = 0; j < N; j++) { + sum = sum + + (pSrcA[i * N + j] + A_offset) * (pSrcB[j * P + k] + B_offset); + } + pDstC[i * P + k] = sum; + } + } + } +} diff --git a/TargetLibraries/Generic/src/MaxPool_s8.c b/TargetLibraries/Generic/src/MaxPool_s8.c new file mode 100644 index 0000000..3e482f8 --- /dev/null +++ b/TargetLibraries/Generic/src/MaxPool_s8.c @@ -0,0 +1,73 @@ +/* ===================================================================== + * Title: MaxPool_s8.c + * Description: + * + * Date: 04.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployBasicMath.h" + +void MaxPool2d_s8_s8_NCHW(int8_t const *__restrict__ pSrcA, uint32_t C, + uint32_t H, uint32_t W, uint32_t P, uint32_t Q, + uint32_t SP, uint32_t SQ, int8_t *__restrict__ pDstC, + int32_t input_offset, int32_t output_offset) { + // WIESEP: For now assume padding=0 + uint32_t H_out = (H - P) / SP + 1; + uint32_t W_out = (W - Q) / SQ + 1; + + uint32_t c = 0; // input channel loop counter + uint32_t h = 0; // input row loop counter + uint32_t w = 0; // input column loop counter + + uint32_t p = 0; // kernel row loop counter + uint32_t q = 0; // kernel column loop counter + + int32_t max; + int32_t volatile tmp; + for (c = 0; c < C; ++c) { + for (h = 0; h < H_out; ++h) { + for (w = 0; w < W_out; ++w) { + max = -128; + // printf("(%2d,%2d,%2d) ", c, h, w); + for (p = 0; p < P; ++p) { + for (q = 0; q < Q; ++q) { + tmp = (int32_t)(pSrcA[c * H * W + (h * SP + p) * W + (w * SQ + q)] + + input_offset); + if (tmp > max) { + // printf("%4d > %4d, ", tmp, max); + max = tmp; + } + // else { + // printf("%4d <= %-4d, ", tmp, max); + // } + } + } + // printf(" -> %d\r\n", max); + pDstC[c * H_out * W_out + h * W_out + w] = + (int8_t)(max + output_offset); + } + } + } +} diff --git a/TargetLibraries/Generic/src/RQDiv_s8.c b/TargetLibraries/Generic/src/RQDiv_s8.c new file mode 100644 index 0000000..2ac7524 --- /dev/null +++ b/TargetLibraries/Generic/src/RQDiv_s8.c @@ -0,0 +1,62 @@ +/* ===================================================================== + * Title: RQDiv_s8.c + * Description: + * + * $Date: 19.12.2022 + * + * ===================================================================== */ +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" + +void RQDiv_s32_s8(int32_t *data_in_nom, int32_t *data_in_denom, + int32_t size_nom, int32_t __attribute__((unused)) size_denom, + int32_t nomStep, int32_t denomStep, int8_t *data_out, + int32_t Delta, int32_t eps, int32_t eta, int32_t requant_mul, + int32_t requant_add, int32_t requant_shift) { + + int32_t innerMostIter = denomStep; + int32_t secondIter = nomStep / innerMostIter; + int32_t thirdIter = size_nom / secondIter; + int64_t nom; + int32_t sgnNom = 0; + int64_t denom; + int32_t y, intermediate; + + for (int i = 0; i < thirdIter; i++) { + for (int k = 0; k < innerMostIter; k++) { + denom = data_in_denom[i * innerMostIter + k]; + denom = ((eta * denom) + eps); + for (int j = 0; j < secondIter; j++) { + nom = + data_in_nom[i * secondIter * innerMostIter + j * innerMostIter + k]; + nom = (Delta * eta * nom); + sgnNom = (nom >= 0) - (nom < 0); + y = (int32_t)((nom + sgnNom * (denom >> 1)) / denom); + intermediate = (int32_t)(y)*requant_mul + requant_add; + intermediate = + ((intermediate + ((1 << (requant_shift - 1)))) >> requant_shift); + *data_out++ = (int8_t)CLAMP(intermediate, -128, 127); + } + } + } +} diff --git a/TargetLibraries/Generic/src/RQGELU_s8.c b/TargetLibraries/Generic/src/RQGELU_s8.c new file mode 100644 index 0000000..0e033a2 --- /dev/null +++ b/TargetLibraries/Generic/src/RQGELU_s8.c @@ -0,0 +1,58 @@ +/* ===================================================================== + * Title: RQGELU_s8.c + * Description: + * + * $Date: 19.12.2022 + * + * 
===================================================================== */ +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" + +void RQGELU_s8_s8(int8_t *data_in, int8_t *data_out, int32_t dataSize, int8_t b, + int16_t one, int32_t input_offset, int32_t output_offset, + int32_t *mul, int32_t *add, int32_t *shift) { + + int32_t sign, x, x_abs, q; + int32_t d; + int32_t L, y; + int32_t intermediate; + + for (int i = 0; i < dataSize; i++) { + x = data_in[i] + input_offset; + sign = (x > 0) - (x < 0); // sgn(x) + x_abs = sign * x; // abs(x) + if (x_abs > -b) { + q = -b; + } else { + q = x_abs; + } + d = q + b; + L = sign * (-(d * d) + one); + y = x * (((one + L)) >> 1); + + intermediate = ((int32_t)y) * (*mul) + (*add); + intermediate = + ((intermediate + ((1 << ((*shift) - 1)))) >> (*shift)) + output_offset; + data_out[i] = (int8_t)CLAMP(intermediate, -128, 127); + } +} diff --git a/TargetLibraries/Generic/src/RQHardswish.c b/TargetLibraries/Generic/src/RQHardswish.c new file mode 100644 index 0000000..6826c2c --- /dev/null +++ b/TargetLibraries/Generic/src/RQHardswish.c @@ -0,0 +1,50 @@ +/* ---------------------------------------------------------------------- +# +# File: RQHardswish.c +# +# Last edited: 23.02.2024 +# +# Copyright (C) 2024, ETH Zurich and 
University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +#include "DeeployBasicMath.h" + +void RQiHardswish_s8_s8(int8_t *input, int8_t *output, int32_t size, + int32_t one_over_six, int32_t three, int32_t six, + int32_t input_offset, int32_t output_offset, + int32_t mul, int32_t add, int32_t shift) { + + int32_t temp; + + for (int i = 0; i < size; i++) { + temp = input[i] + input_offset + three; + if (temp < 0) { + temp = 0; + } + if (temp > six) { + temp = six; + } + temp = temp * one_over_six; + temp = input[i] * temp; + temp = temp * (mul) + (add); + temp = ((temp + ((1 << ((shift)-1)))) >> (shift)) + output_offset; + output[i] = (int8_t)CLAMP(temp, -128, 127); + } +} diff --git a/TargetLibraries/Generic/src/RequantShift_s8.c b/TargetLibraries/Generic/src/RequantShift_s8.c new file mode 100644 index 0000000..ece458a --- /dev/null +++ b/TargetLibraries/Generic/src/RequantShift_s8.c @@ -0,0 +1,155 @@ +/* ===================================================================== + * Title: RequantShift_s8.c + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. 
/* =====================================================================
 * RequantShift kernels: requantize s8/s16/s32 tensors to s8,
 * NHWC (per-channel params indexed i % channels) and
 * NCHW (per-channel params indexed i / HW) layouts.
 *
 * Copyright (C) 2022 ETH Zurich and University of Bologna.
 * Authors: Moritz Scherer, Philip Wiese (ETH Zurich)
 * SPDX-License-Identifier: Apache-2.0
 * ===================================================================== */

/* Allow compiling this translation unit in isolation (unit tests); in the
 * normal build the project header is present and is included as before. */
#if defined(__has_include)
#if __has_include("DeeployBasicMath.h")
#include "DeeployBasicMath.h"
#endif
#else
#include "DeeployBasicMath.h"
#endif

/* Fallback only if the project header did not provide CLAMP. */
#ifndef CLAMP
#define CLAMP(x, lo, hi) (((x) < (lo)) ? (lo) : (((x) > (hi)) ? (hi) : (x)))
#endif

/* Rounding bias added before the arithmetic right shift by log2D:
 * 2^(log2D - 1) when `rounding` is requested, otherwise 0.
 *
 * Guarded against log2D == 0: the original expression
 * ((1 << (log2D - 1)) * rounding) shifts by -1 in that case, which is
 * undefined behavior in C regardless of the `rounding` flag. */
static inline int32_t _requantRoundBias(int32_t log2D, bool rounding) {
  return (rounding && log2D > 0) ? (1 << (log2D - 1)) : 0;
}

/* Requantize an s8 NHWC tensor to s8.
 * Per element i with channel c = i % channels:
 *   out = clamp((((x + input_offset) * mul[c] + add[c] + bias) >> log2D)
 *               + output_offset, output_min, output_max) */
void RequantShift_s8_s8_NHWC(int8_t *data_in, int32_t size, int32_t *mul,
                             int32_t *add, int8_t *data_out, int32_t log2D,
                             int32_t channels, int32_t input_offset,
                             int32_t output_offset, int8_t output_min,
                             int8_t output_max, bool rounding) {
  const int32_t bias = _requantRoundBias(log2D, rounding); // loop-invariant
  for (int i = 0; i < size; i++) {
    int32_t acc = ((int32_t)data_in[i] + input_offset) * mul[i % channels] +
                  add[i % channels];
    acc = ((acc + bias) >> log2D) + output_offset;
    data_out[i] = (int8_t)CLAMP(acc, output_min, output_max);
  }
}

/* Requantize an s16 NHWC tensor to s8 (same formula as the s8 variant). */
void RequantShift_s16_s8_NHWC(int16_t *data_in, int32_t size, int32_t *mul,
                              int32_t *add, int8_t *data_out, int32_t log2D,
                              int32_t channels, int32_t input_offset,
                              int32_t output_offset, int8_t output_min,
                              int8_t output_max, bool rounding) {
  const int32_t bias = _requantRoundBias(log2D, rounding);
  for (int i = 0; i < size; i++) {
    int32_t acc = ((int32_t)data_in[i] + input_offset) * mul[i % channels] +
                  add[i % channels];
    acc = ((acc + bias) >> log2D) + output_offset;
    data_out[i] = (int8_t)CLAMP(acc, output_min, output_max);
  }
}

/* Requantize an s32 NHWC tensor to s8 (same formula as the s8 variant). */
void RequantShift_s32_s8_NHWC(int32_t *data_in, int32_t size, int32_t *mul,
                              int32_t *add, int8_t *data_out, int32_t log2D,
                              int32_t channels, int32_t input_offset,
                              int32_t output_offset, int8_t output_min,
                              int8_t output_max, bool rounding) {
  const int32_t bias = _requantRoundBias(log2D, rounding);
  for (int i = 0; i < size; i++) {
    int32_t acc = ((int32_t)data_in[i] + input_offset) * mul[i % channels] +
                  add[i % channels];
    acc = ((acc + bias) >> log2D) + output_offset;
    data_out[i] = (int8_t)CLAMP(acc, output_min, output_max);
  }
}

/* Requantize an s8 NCHW tensor to s8.
 * In NCHW the per-channel index is i / HW (HW = H * W plane size). */
void RequantShift_s8_s8_NCHW(int8_t *data_in, int32_t size, int32_t *mul,
                             int32_t *add, int8_t *data_out, int32_t log2D,
                             int32_t HW, int32_t input_offset,
                             int32_t output_offset, int8_t output_min,
                             int8_t output_max, bool rounding) {
  const int32_t bias = _requantRoundBias(log2D, rounding);
  for (int i = 0; i < size; i++) {
    int32_t acc =
        ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW];
    acc = ((acc + bias) >> log2D) + output_offset;
    data_out[i] = (int8_t)CLAMP(acc, output_min, output_max);
  }
}

/* Requantize an s16 NCHW tensor to s8. */
void RequantShift_s16_s8_NCHW(int16_t *data_in, int32_t size, int32_t *mul,
                              int32_t *add, int8_t *data_out, int32_t log2D,
                              int32_t HW, int32_t input_offset,
                              int32_t output_offset, int8_t output_min,
                              int8_t output_max, bool rounding) {
  const int32_t bias = _requantRoundBias(log2D, rounding);
  for (int i = 0; i < size; i++) {

    int32_t acc = (int32_t)data_in[i];

#ifdef DEEPLOY_PULP_PLATFORM
    // SCHEREMO: PULP specific hack
    // SCHEREMO: Need to trigger a HW loop with at least 3 nops
#pragma nounroll
    for (int j = 0; j < 3; j++) {
      asm volatile("nop" ::);
    }
#endif

    acc = (acc + input_offset) * mul[i / HW] + add[i / HW];
    acc = ((acc + bias) >> log2D) + output_offset;
    data_out[i] = (int8_t)CLAMP(acc, output_min, output_max);
  }
}

/* Requantize an s32 NCHW tensor to s8. */
void RequantShift_s32_s8_NCHW(int32_t *data_in, int32_t size, int32_t *mul,
                              int32_t *add, int8_t *data_out, int32_t log2D,
                              int32_t HW, int32_t input_offset,
                              int32_t output_offset, int8_t output_min,
                              int8_t output_max, bool rounding) {
  const int32_t bias = _requantRoundBias(log2D, rounding);
  for (int i = 0; i < size; i++) {
    int32_t acc = (int32_t)data_in[i];

#ifdef DEEPLOY_PULP_PLATFORM
    // SCHEREMO: PULP specific hack
    // SCHEREMO: Need to trigger a HW loop with at least 3 nops
#pragma nounroll
    for (int j = 0; j < 3; j++) {
      asm volatile("nop" ::);
    }
#endif

    acc = (acc + input_offset) * mul[i / HW] + add[i / HW];
    acc = ((acc + bias) >> log2D) + output_offset;
    data_out[i] = (int8_t)CLAMP(acc, output_min, output_max);
  }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" + +#include + +/** + * @todo Remove malloc in function and make the buffer a pointer passed to the + * fuction which is allocated by the user. + */ +void Softmax_s8_s8(int8_t *data_in, int8_t *data_out, uint32_t size, + uint32_t lastDimLength, int32_t coeffA, int32_t coeffB, + int64_t coeffC, int32_t log2, uint32_t n_levels) { + + int8_t xTilde, z, p; + uint32_t y_sum; + int8_t x_max; + uint32_t *y = (uint32_t *)deeploy_malloc(sizeof(int32_t) * lastDimLength); + + for (uint32_t i = 0; i < size / lastDimLength; i++) { + y_sum = 0; + x_max = -128; + for (uint32_t j = 0; j < lastDimLength; j++) { + if (data_in[j + i * lastDimLength] > x_max) { + x_max = data_in[j + i * lastDimLength]; + } + } + for (uint32_t j = 0; j < lastDimLength; j++) { + xTilde = (int8_t)(data_in[j + i * lastDimLength] - x_max); + z = (int8_t) - (xTilde / log2); + p = (int8_t)(xTilde + z * log2); + y[j] = (uint32_t)((uint64_t)(coeffA * ((p + coeffB) * (p + coeffB)) + + coeffC) >> + (z)); + y_sum += y[j]; + } + for (uint32_t j = 0; j < lastDimLength; j++) { + data_out[j + i * lastDimLength] = + (int8_t)((y[j] * (n_levels - 1)) / (y_sum)-n_levels / 2); + } + } + deeploy_free(y); +} + +void ITAMax_s8(int8_t const *__restrict__ pSrcA, int8_t *__restrict__ pDstB, + int8_t *__restrict__ pBufN, uint32_t size, + uint32_t lastDimLength, uint32_t n_levels) { + + uint32_t i = 0; // Row Counter + uint32_t j = 0; // Column Counter + + uint8_t *shift = (uint8_t *)pBufN; + + for (i = 0; i < size / lastDimLength; ++i) { + // 1. Find maximum over row + int8_t max = -128; + for (j = 0; j < lastDimLength; ++j) { + if (pSrcA[i * lastDimLength + j] > max) { + max = pSrcA[i * lastDimLength + j]; + } + } + + // 2. 
Calculate exponential sum + uint32_t exp_sum = 0; + for (j = 0; j < lastDimLength; ++j) { + int32_t diff = max - pSrcA[i * lastDimLength + j]; + shift[j] = (uint8_t)((diff + 16) >> 5); + exp_sum += (256U >> shift[j]); + } + + uint32_t exp_sum_inv = ((n_levels - 1) * 256U) / exp_sum; + + for (j = 0; j < lastDimLength; ++j) { + pDstB[i * lastDimLength + j] = + (int8_t)((exp_sum_inv >> shift[j]) - (n_levels / 2)); + } + } +} + +void ITAPartialMax_s8(int8_t const *__restrict__ pSrcA, + int8_t *__restrict__ pDstB, uint32_t size, + uint32_t lastDimLength, uint32_t group_width, + uint32_t n_levels) { + + uint32_t i = 0; // Row Counter + uint32_t j = 0; // Column Counter + uint32_t g = 0; // Group Counter + + // Iterate over rows + for (i = 0; i < size / lastDimLength; ++i) { + + // Initialize denominator + uint32_t exp_partial_sum = 0; + + // Initialize maximum with minimal possible value + int8_t global_max = -128; + + // STAGE 1: Compute the denominator of the softmax + // Iterate over groups + for (g = 0; g < lastDimLength / group_width; ++g) { + + // Find the maximum for each row in the current column block + int8_t current_max = -128; + for (uint32_t k = 0; k < group_width; ++k) { + int8_t value = pSrcA[i * lastDimLength + g * group_width + k]; + if (value > current_max) { + current_max = value; + } + } + + // Calculate shift values (integer division with rounding) + int32_t max_shift = (current_max - global_max + 16) >> 5; + + // Update all shift values where new maximum is larger + int32_t shift_sum = (current_max > global_max) ? max_shift : 0; + global_max = (current_max > global_max) ? 
current_max : global_max; + + // Calculate exponential sum over the current part of the row + uint32_t exp_sum = 0; + for (uint32_t k = 0; k < group_width; ++k) { + int32_t diff = + global_max - pSrcA[i * lastDimLength + g * group_width + k]; + uint8_t shift = (uint8_t)((diff + 16) >> 5); + exp_sum += (256U >> shift); + } + + // Update the accumulated sum and add the accumulation over the current + // part of the row + exp_partial_sum = (exp_partial_sum >> shift_sum) + exp_sum; + + // deeploy_log("[R %d,G %d]: %6d, %6d, %6d, %6d, %6d, %6d\n", i, g, + // current_max, max_shift, shift_sum, global_max, exp_sum, + // exp_partial_sum); + } + + // STAGE 2: Calculate the softmax activation + // WIESEP: Scale Softmax to 127 + // The Softmax values are maximum 127 as sumdot modules can only do + // signed-signed operations for now. This is a temporary fix until sumdot is + // fixed. + uint32_t exp_partial_sum_inverse = + ((n_levels / 2 - 1) * 256U) / exp_partial_sum; + + for (j = 0; j < lastDimLength; ++j) { + // Find the difference between the maximum and x + int32_t diff = global_max - pSrcA[i * lastDimLength + j]; + + // Shift the values by B-log2B -> multiply by B/2**B = log2e*eps_x + // (integer division with rounding) + uint8_t shift = (uint8_t)((diff + 16) >> 5); + + // Calculate the activation value + pDstB[i * lastDimLength + j] = + (int8_t)((exp_partial_sum_inverse >> shift) - (n_levels / 2)); + + // deeploy_log("[R %d,C %d]: %6d, %6d, %6d, %6d, %6d\n", i, j, pSrcA[i * + // lastDimLength + j], diff, shift, exp_partial_sum_inverse, pDstB[i * + // lastDimLength + j]); + } + } +} \ No newline at end of file diff --git a/TargetLibraries/Generic/src/Util.c b/TargetLibraries/Generic/src/Util.c new file mode 100644 index 0000000..e576e01 --- /dev/null +++ b/TargetLibraries/Generic/src/Util.c @@ -0,0 +1,507 @@ +/* ===================================================================== + * Title: Util.c + * Description: + * + * Date: 06.12.2022 + * + * 
===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployBasicMath.h" +#include +#include +#include + +// WIESEP: Provide implementation for the generic platform as it has no +// dedicated library +#ifdef DEEPLOY_GENERIC_PLATFORM +int deeploy_log(const char *__restrict fmt, ...) 
{ + va_list args; + va_start(args, fmt); + int ret = vprintf(fmt, args); + va_end(args); + return ret; +} +void *deeploy_malloc(const size_t size) { return malloc(size); } +void deeploy_free(void *const ptr) { free(ptr); } +#else +extern int deeploy_log(const char *__restrict fmt, ...); +extern void *deeploy_malloc(const size_t size); +extern void deeploy_free(void *const ptr); +#endif + +void PrintMatrix_s8_NCHW(int8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%4d ", + (int8_t)(pSrcA[n * C * H * W + c * H * W + h * W + w] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_s8_NHWC(int8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%4d ", + (int8_t)(pSrcA[n * C * H * W + h * C * W + w * C + c] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_s16_NCHW(int16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { 
+ deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%6hd ", + (int16_t)(pSrcA[n * C * H * W + c * H * W + h * W + w] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_s16_NHWC(int16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%6hd ", + (int16_t)(pSrcA[n * C * H * W + h * C * W + w * C + c] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_s32_NCHW(int32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%11" PRId32 " ", + (int32_t)(pSrcA[n * C * H * W + c * H * W + h * W + w] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_s32_NHWC(int32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for 
(uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%11" PRId32 " ", + (int32_t)(pSrcA[n * C * H * W + h * C * W + w * C + c] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintArray_s8(int8_t const *__restrict__ pSrcA, uint32_t N, + int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + deeploy_log("%4d ", (int8_t)(pSrcA[n] + offset)); + } + deeploy_log("\r\n"); +} + +void PrintArray_s16(int16_t const *__restrict__ pSrcA, uint32_t N, + int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + deeploy_log("%6hd ", (int16_t)(pSrcA[n] + offset)); + } + deeploy_log("\r\n"); +} + +void PrintArray_s32(int32_t const *__restrict__ pSrcA, uint32_t N, + int32_t offset) { + for (uint32_t n = 0; n < N; n++) { + deeploy_log("%11" PRId32 " ", (int32_t)(pSrcA[n] + offset)); + } + deeploy_log("\r\n"); +} + +void PrintMatrix_u8_NCHW(uint8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%4u ", + (uint8_t)(pSrcA[n * C * H * W + c * H * W + h * W + w] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_u8_NHWC(uint8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + 
for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log( + "%4u ", + (uint8_t)(pSrcA[n * C * H * W + h * C * W + w * C + c] + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_u16_NCHW(uint16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log("%6hu ", + (uint16_t)(pSrcA[n * C * H * W + c * H * W + h * W + w] + + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_u16_NHWC(uint16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log("%6hu ", + (uint16_t)(pSrcA[n * C * H * W + h * C * W + w * C + c] + + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_u32_NCHW(uint32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset) { + for (uint32_t n = 0; n < N; n++) 
{ + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log("%11" PRIu32 " ", + (uint32_t)(pSrcA[n * C * H * W + c * H * W + h * W + w] + + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintMatrix_u32_NHWC(uint32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t C, uint32_t H, uint32_t W, uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + if (N > 0) + deeploy_log("[\r\n"); + + for (uint32_t c = 0; c < C; c++) { + if (N > 0) { + deeploy_log(" [\r\n "); + } else if (C > 0) { + deeploy_log("[\r\n"); + } + for (uint32_t h = 0; h < H; h++) { + for (uint32_t w = 0; w < W; w++) { + deeploy_log("%11" PRIu32 " ", + (uint32_t)(pSrcA[n * C * H * W + h * C * W + w * C + c] + + offset)); + } + + if (N > 0) { + deeploy_log("\r\n "); + } else { + deeploy_log("\r\n"); + } + } + if (C > 0) + deeploy_log("]\r\n"); + } + + if (N > 0) + deeploy_log("]\r\n"); + } +} + +void PrintArray_u8(uint8_t const *__restrict__ pSrcA, uint32_t N, + uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + deeploy_log("%4u ", (uint8_t)(pSrcA[n] + offset)); + } + deeploy_log("\r\n"); +} + +void PrintArray_u16(uint16_t const *__restrict__ pSrcA, uint32_t N, + uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + deeploy_log("%6hu ", (uint16_t)(pSrcA[n] + offset)); + } + deeploy_log("\r\n"); +} + +void PrintArray_u32(uint32_t const *__restrict__ pSrcA, uint32_t N, + uint32_t offset) { + for (uint32_t n = 0; n < N; n++) { + deeploy_log("%11" PRIu32 " ", (uint32_t)(pSrcA[n] + offset)); + } + deeploy_log("\r\n"); +} diff --git a/TargetLibraries/Generic/src/iRMSNorm_s8.c b/TargetLibraries/Generic/src/iRMSNorm_s8.c new file mode 100644 index 
/* =====================================================================
 * Title: iRMSNorm_s8.c — integer RMS normalization
 *
 * Copyright (C) 2020 ETH Zurich and University of Bologna.
 * Author: Moritz Scherer, ETH Zurich
 * SPDX-License-Identifier: Apache-2.0
 * ===================================================================== */

/* Allow compiling this translation unit in isolation (unit tests); in the
 * normal build the project header is present and is included as before. */
#if defined(__has_include)
#if __has_include("DeeployBasicMath.h")
#include "DeeployBasicMath.h"
#endif
#else
#include "DeeployBasicMath.h"
#endif

/* Fallback only if the project header did not provide CLAMP. */
#ifndef CLAMP
#define CLAMP(x, lo, hi) (((x) < (lo)) ? (lo) : (((x) > (hi)) ? (hi) : (x)))
#endif

/**
 * Integer square root of a Q-format fixed-point number via binary search.
 *
 * @param pSrc     Input value (non-positive inputs yield 0).
 * @param fracBits Number of fractional bits of the input format.
 * @param pRes     Output: floor(sqrt(*pSrc)) in the same format.
 */
void _plp_sqrt_q32(const int32_t *__restrict__ pSrc, const uint32_t fracBits,
                   int32_t *__restrict__ pRes) {

  const int32_t number = *pSrc;
  int32_t root = 0;

  int32_t start = 0;
  int32_t end = 46342; // smallest integer that is larger than sqrt(0x7FFFFFFF)
  int32_t mid;

  if (number > 0) {

    while (start <= end) {

      mid = (start + end) >> 1;

      // Square in 64 bit: mid can reach 46341+, whose square exceeds
      // INT32_MAX. The original 32-bit `mid * mid` overflowed (undefined
      // behavior) for inputs near INT32_MAX.
      const int64_t squared = ((int64_t)mid * (int64_t)mid) >> fracBits;

      if (squared == (int64_t)number) {
        root = mid;
        break;
      }

      if (squared < (int64_t)number) {
        start = mid + 1;
        root = mid; // best lower bound so far -> floor(sqrt)
      } else {
        end = mid - 1;
      }
    }

    *pRes = root;

  } else {
    *pRes = 0;
  }
}

/**
 * Integer RMS normalization over the last dimension of an s8 tensor.
 *
 * For each row: std = floor(sqrt(mean((x + input_offset)^2) + 1)), then
 * out = clamp(((x + input_offset) * weight[j] / std) >> log2D, -128, 127).
 *
 * @param data_in       Input tensor, s8.
 * @param data_out      Output tensor, s8 (may alias a distinct buffer).
 * @param weight        Per-feature integer scale, length lastDimLength.
 * @param input_offset  Added to every input element before use.
 * @param size          Total number of elements.
 * @param lastDimLength Length of the normalized (last) dimension.
 * @param log2D         Final right-shift amount.
 */
void iRMSnorm_s8_s8(int8_t *data_in, int8_t *data_out, int32_t *weight,
                    int32_t input_offset, int32_t size, int32_t lastDimLength,
                    int32_t log2D) {

  int32_t sum;
  int32_t std;
  int16_t temp;
  int32_t intermediate;

  for (int i = 0; i < (size / lastDimLength); i++) {
    // Mean of squares over the row (+1 avoids division by zero below).
    sum = 0;
    for (int j = 0; j < lastDimLength; j++) {
      temp = (int16_t)(data_in[j + i * lastDimLength] + input_offset);
      sum += temp * temp;
    }
    sum = sum / lastDimLength;
    sum += 1;
    _plp_sqrt_q32(&sum, 0, &std);

    for (int j = 0; j < lastDimLength; j++) {

      intermediate =
          ((((((int32_t)data_in[j + i * lastDimLength]) + input_offset) *
             weight[j]) /
            (std)) >>
           log2D);

      data_out[j + i * lastDimLength] = (int8_t)CLAMP(intermediate, -128, 127);
    }
  }
}
${MEMPOOL_RUNTIME_HALIDE_INCLUDE} +) +target_compile_options(deeploymempool PUBLIC ${MEMPOOL_RUNTIME_COMPILE_FLAGS}) + +target_link_libraries(deeploymempool INTERFACE mempool-runtime) +target_sources(deeploymempool INTERFACE $) diff --git a/TargetLibraries/MemPool/cmake/mempool-runtime.cmake b/TargetLibraries/MemPool/cmake/mempool-runtime.cmake new file mode 100644 index 0000000..96cebfd --- /dev/null +++ b/TargetLibraries/MemPool/cmake/mempool-runtime.cmake @@ -0,0 +1,105 @@ +set(MEMPOOL_HOME $ENV{MEMPOOL_HOME}) +set(MEMPOOL_RUNTIME_HOME ${MEMPOOL_HOME}/software/runtime) + +set(MEMPOOL_RUNTIME_C_SOURCE + ${MEMPOOL_RUNTIME_HOME}/alloc.c + ${MEMPOOL_RUNTIME_HOME}/dma.c + ${MEMPOOL_RUNTIME_HOME}/printf.c + ${MEMPOOL_RUNTIME_HOME}/serial.c + ${MEMPOOL_RUNTIME_HOME}/string.c + ${MEMPOOL_RUNTIME_HOME}/synchronization.c + ) + +set(MEMPOOL_RUNTIME_ASM_SOURCE + ${MEMPOOL_RUNTIME_HOME}/crt0.S +) + +set(MEMPOOL_RUNTIME_INCLUDE + ${MEMPOOL_RUNTIME_HOME} + ${MEMPOOL_RUNTIME_HOME}/target/${mempool_flavour} +) + +set(MEMPOOL_RUNTIME_COMPILE_FLAGS + -D__riscv__ + -D__builtin_shuffle=__builtin_pulp_shuffle2h +) + +if (${MEMPOOL_USE_OMP}) +set(MEMPOOL_RUNTIME_OMP_C_SOURCE + ${MEMPOOL_RUNTIME_HOME}/omp/barrier.c + ${MEMPOOL_RUNTIME_HOME}/omp/critical.c + ${MEMPOOL_RUNTIME_HOME}/omp/loop.c + ${MEMPOOL_RUNTIME_HOME}/omp/parallel.c + ${MEMPOOL_RUNTIME_HOME}/omp/sections.c + ${MEMPOOL_RUNTIME_HOME}/omp/single.c + ${MEMPOOL_RUNTIME_HOME}/omp/work.c +) + +set(MEMPOOL_RUNTIME_OMP_INCLUDE + ${MEMPOOL_RUNTIME_HOME}/omp/ +) +endif() + +if (${MEMPOOL_USE_HALIDE}) +set(MEMPOOL_RUNTIME_HALIDE_C_SOURCE + ${MEMPOOL_RUNTIME_HOME}/halide/halide_runtime.c +) + +set(MEMPOOL_RUNTIME_HALIDE_INCLUDE + ${MEMPOOL_RUNTIME_HOME}/halide/ +) +endif() + +get_directory_property(DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS ) + +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/arch.ld + MAIN_DEPENDENCY ${MEMPOOL_RUNTIME_HOME}/target/${mempool_flavour}/arch.ld.c + COMMAND ${CMAKE_C_COMPILER} + -P -E 
"$<$:-D$>" + ${MEMPOOL_RUNTIME_HOME}/target/${mempool_flavour}/arch.ld.c + -o ${CMAKE_BINARY_DIR}/arch.ld + COMMAND_EXPAND_LISTS + COMMENT "Generate arch.ld from arch.ld.c (${CMAKE_BINARY_DIR})" + VERBATIM) + +# Create a consumer target for the linker script. +# So previous `add_custom_command` will have an effect. +add_custom_target(linkerscript DEPENDS ${CMAKE_BINARY_DIR}/arch.ld) + +set_source_files_properties(${MEMPOOL_RUNTIME_ASM_SOURCE} PROPERTIES COMPILE_FLAGS -DLANGUAGE_ASSEMBLY) +add_library(mempool-runtime OBJECT + ${MEMPOOL_RUNTIME_C_SOURCE} + ${MEMPOOL_RUNTIME_ASM_SOURCE} + ${MEMPOOL_RUNTIME_OMP_C_SOURCE} + ${MEMPOOL_RUNTIME_HALIDE_C_SOURCE} +) + +target_include_directories(mempool-runtime SYSTEM PUBLIC + ${MEMPOOL_RUNTIME_INCLUDE} + ${MEMPOOL_RUNTIME_OMP_INCLUDE} + ${MEMPOOL_RUNTIME_HALIDE_INCLUDE} +) +target_compile_options(mempool-runtime PUBLIC ${MEMPOOL_RUNTIME_COMPILE_FLAGS}) +target_compile_options(mempool-runtime PRIVATE + -O2 + -fno-inline + -fno-common +) +target_compile_options(mempool-runtime INTERFACE + -Wno-unused-function +) + +set(MEMPOOL_LINK_OPTIONS + -Wl,--gc-sections + -L${CMAKE_BINARY_DIR} + -T${MEMPOOL_RUNTIME_HOME}/target/${mempool_flavour}/link.ld +) + +target_link_libraries(mempool-runtime PUBLIC + ${MEMPOOL_LINK_OPTIONS} +) + +# Make executable to depend on that target. +# So, the check whether to relink the executable will be performed +# after possible rebuilding the linker script. 
+add_dependencies(mempool-runtime linkerscript) \ No newline at end of file diff --git a/TargetLibraries/MemPool/inc/CycleCounter.h b/TargetLibraries/MemPool/inc/CycleCounter.h new file mode 100644 index 0000000..52e24d2 --- /dev/null +++ b/TargetLibraries/MemPool/inc/CycleCounter.h @@ -0,0 +1,51 @@ +/* ===================================================================== + * Title: CycleCounter.h + * Description: + * + * Date: 06.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_MATH_CYCLE_HEADER_ +#define __DEEPLOY_MATH_CYCLE_HEADER_ + +#include + +// Resets the internal cycle and instruction counter to zero +void ResetTimer(void); + +// Starts the internal cycle and instruction counter +void StartTimer(void); + +// Stops the internal cycle and instruction counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +uint32_t getCycles(void); + +// Returns the current number of instructions according to the internal +// instructions counter +uint32_t getInstr(void); + +#endif //__DEEPLOY_MATH_CYCLE_HEADER_ diff --git a/TargetLibraries/MemPool/inc/DeeployMath.h b/TargetLibraries/MemPool/inc/DeeployMath.h new file mode 100644 index 0000000..2ff4080 --- /dev/null +++ b/TargetLibraries/MemPool/inc/DeeployMath.h @@ -0,0 +1,68 @@ +/* ===================================================================== + * Title: DeeployMath.h + * Description: + * + * Date: 29.11.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Samuel Riedel, ETH Zurich + * - Sergio Mazzola, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_MATH_HEADER_ +#define __DEEPLOY_MATH_HEADER_ + +#include +#include +#include +#include + +#define BEGIN_SINGLE_CORE if (core_id == 0) { +#define END_SINGLE_CORE } +#define SINGLE_CORE if (core_id == 0) + +#include "CycleCounter.h" +#include "ITA.h" +#include "constants.h" +#include "macros.h" + +#include "DeeployBasicMath.h" + +#include "builtins.h" +#include "dma.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#include "kernel/Convolution.h" +#include "kernel/DWConvolution.h" +#include "kernel/Gemm.h" +#include "kernel/MHSA.h" +#include "kernel/MatMul.h" +#include "kernel/MaxPool.h" +#include "kernel/RQGemm.h" +#include "kernel/RQMatMul.h" +#include "kernel/RequantShift.h" +#include "kernel/Softmax.h" + +#endif //__DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/MemPool/inc/ITA.h b/TargetLibraries/MemPool/inc/ITA.h new file mode 100644 index 0000000..2bdda47 --- /dev/null +++ b/TargetLibraries/MemPool/inc/ITA.h @@ -0,0 +1,227 @@ +/* ===================================================================== + * Title: ITA.h + * Description: + * + * Date: 03.03.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_MATH_ITA_HEADER_ +#define __DEEPLOY_MATH_ITA_HEADER_ + +/* Includes ------------------------------------------------------------------*/ + +#include "DeeployMath.h" + +/* Exported constants --------------------------------------------------------*/ + +#define ITA0_BASE (0x40000040) +#define ITA1_BASE (0x40000070) +#define ITA2_BASE (0x400000A0) +#define ITA3_BASE (0x400000D0) + +#define ITA0_L1_BASE (0xC0000) +#define ITA1_L1_BASE (0xD0000) +#define ITA2_L1_BASE (0xE0000) +#define ITA3_L1_BASE (0xF0000) + +// ITA has 0x1000 bytes per matrix and OUT is matrix 0 +#define ITA0_OUT (ITA0_L1_BASE + 0x3000) +#define ITA1_OUT (ITA1_L1_BASE + 0x3000) +#define ITA2_OUT (ITA2_L1_BASE + 0x3000) +#define ITA3_OUT (ITA3_L1_BASE + 0x3000) + +// Default ITA configuration +#ifndef ITA_PE +#define ITA_PE 16 // Number of processing engines per ITA core +#endif + +/* Exported macros -----------------------------------------------------------*/ +#define MUL(X, Y) ((X) * (Y)) + +// clang-format off +#define SET_BIT(REG, BIT) ((REG) |= (BIT)) +#define CLEAR_BIT(REG, BIT) ((REG) &= ~(BIT)) +#define READ_BIT(REG, BIT) ((REG) & (BIT)) +#define CLEAR_REG(REG) ((REG) = (0x0)) +#define WRITE_REG(REG, VAL) ((REG) = (VAL)) +#define READ_REG(REG) ((REG)) + +#define MODIFY_REG(REG, CLEARMASK, SETMASK) WRITE_REG((REG), (((READ_REG(REG)) & (~(CLEARMASK))) | (SETMASK))) +// clang-format on + +/* Exported types ------------------------------------------------------------*/ + +typedef struct { + int8_t *wo_weight; + int8_t *wv_weight; + int8_t *wk_weight; + int8_t *q; + int8_t *k; + int8_t *wq_weight; + int32_t *wo_bias; + int32_t *wv_bias; + int32_t *wk_bias; + int32_t *wq_bias; +} ita_data_t; + +typedef struct { + uint8_t *eps_mult; + uint8_t *right_shift; + int32_t *add; +} ita_quant_t; + +/* Exported variables --------------------------------------------------------*/ + +/* Exported functions --------------------------------------------------------*/ +void 
ITA_getStruct(ita_data_t *ita_data, int8_t *base_address, uint32_t S, + uint32_t E, uint32_t P); + +void ITA_copyInput(int8_t *pDst, int8_t const *__restrict__ pSrc, uint32_t S, + uint32_t E, int8_t offset); + +void ITA_printAddresses(ita_data_t *ita_data); + +/* Peripheral Definition + * ------------------------------------------------------*/ +// Inspired by CMSIS Device Peripheral Access Layer Header Files. + +// clang-format off +typedef struct +{ + volatile uint32_t STATE; /*!< ITA STATE Register, Address offset: 0x00 */ + volatile uint32_t START_ADDR; /*!< ITA Start Address, Address offset: 0x04 */ + volatile uint32_t OUT_ADDR; /*!< ITA Out Address, Address offset: 0x08 */ + volatile uint32_t RQS_ADDR; /*!< ITA Requantization Parameter Address, Address offset: 0x0C */ + volatile uint32_t S; /*!< ITA Sequence Length Register Address offset: 0x10 */ + volatile uint32_t E; /*!< ITA Embedding Length Register Address offset: 0x14 */ + volatile uint32_t P; /*!< ITA Projection Length Register Address offset: 0x18 */ +} ITA_TypeDef; + +#define ITA_CONFIG_START_Pos (0U) +#define ITA_CONFIG_START_Msk (0x1UL << ITA_CONFIG_START_Pos) /*!< 0x00000001 */ +#define ITA_CONFIG_START ITA_CONFIG_START_Msk /*!< ITA Start Computation Flag */ + +#define ITA_CONFIG_BUSY_Pos (1U) +#define ITA_CONFIG_BUSY_Msk (0x1UL << ITA_CONFIG_BUSY_Pos) /*!< 0x00000002 */ +#define ITA_CONFIG_BUSY ITA_CONFIG_BUSY_Msk /*!< ITA Busy Flag */ + +#define ITA_CONFIG_DONE_Pos (2U) +#define ITA_CONFIG_DONE_Msk (0x1UL << ITA_CONFIG_DONE_Pos) /*!< 0x00000004 */ +#define ITA_CONFIG_DONE ITA_CONFIG_DONE_Msk /*!< ITA1 Done Flag */ + +#define ITA_CONFIG_SKIPSOFTMAX_Pos (4U) +#define ITA_CONFIG_SKIPSOFTMAX_Msk (0x1UL << ITA_CONFIG_SKIPSOFTMAX_Pos) /*!< 0x00000010 */ +#define ITA_CONFIG_SKIPSOFTMAX ITA_CONFIG_SKIPSOFTMAX_Msk /*!< ITA1 Skip Softmax Flag (unused) */ + +#define ITA_CONFIG_ITER_Pos (5U) +#define ITA_CONFIG_ITER_Msk (0x7UL << ITA_CONFIG_ITER_Pos) /*!< 0x000000e0 */ +#define ITA_CONFIG_ITER 
ITA_CONFIG_ITER_Msk /*!< ITA Iteration [2:0] Bits (Value 0 -> 1 Iteration) */ + +#define ITA0 ((ITA_TypeDef *) ITA0_BASE) +#define ITA1 ((ITA_TypeDef *) ITA1_BASE) +#define ITA2 ((ITA_TypeDef *) ITA2_BASE) +#define ITA3 ((ITA_TypeDef *) ITA3_BASE) + + +static inline void ITA_Start(ITA_TypeDef *ITAx) +{ + // An explicit softmax state is no longer used, hence we always skip it. + SET_BIT(ITAx->STATE, ITA_CONFIG_SKIPSOFTMAX); + + // Start ITA + SET_BIT(ITAx->STATE, ITA_CONFIG_START); +} + +static inline void ITA_SetStartAddress(ITA_TypeDef *ITAx, const uint32_t StartAddress) +{ + ITAx->START_ADDR = StartAddress; +} + +static inline uint32_t ITA_GetStartAddress(const ITA_TypeDef *ITAx) +{ + return ITAx->START_ADDR; +} + +static inline void ITA_SetOutAddress(ITA_TypeDef *ITAx, const uint32_t OutAddress) +{ + ITAx->OUT_ADDR = OutAddress; +} + +static inline uint32_t ITA_GetOutAddress(const ITA_TypeDef *ITAx) +{ + return ITAx->OUT_ADDR; +} + +static inline void ITA_SetRQSAddress(ITA_TypeDef *ITAx, const uint32_t RQSAddress) +{ + ITAx->RQS_ADDR = RQSAddress; +} + +static inline uint32_t ITA_GetRQSAddress(const ITA_TypeDef *ITAx) +{ + return ITAx->RQS_ADDR; +} + +static inline void ITA_SetShape(ITA_TypeDef *ITAx, uint32_t S, uint32_t E, uint32_t P) +{ + ITAx->S = S; + ITAx->E = E; + ITAx->P = P; +} + +static inline uint32_t ITA_GetShape_S(const ITA_TypeDef *ITAx) +{ + return ITAx->S; +} + +static inline uint32_t ITA_GetShape_E(const ITA_TypeDef *ITAx) +{ + return ITAx->E; +} + +static inline uint32_t ITA_GetShape_P(const ITA_TypeDef *ITAx) +{ + return ITAx->P; +} + +static inline uint32_t ITA_IsBusy(const ITA_TypeDef *ITAx) +{ + return (READ_BIT(ITAx->STATE, ITA_CONFIG_BUSY) == (ITA_CONFIG_BUSY)); +} + +static inline uint32_t ITA_IsDone(const ITA_TypeDef *ITAx) +{ + return (READ_BIT(ITAx->STATE, ITA_CONFIG_DONE) == (ITA_CONFIG_DONE)); +} + +static inline void ITA_SetIter(ITA_TypeDef *ITAx, uint32_t Counter) +{ + if (Counter > 0) { + MODIFY_REG(ITAx->STATE, ITA_CONFIG_ITER, 
Counter - 1); + } +} +// clang-format on + +#endif //__DEEPLOY_MATH_ITA_HEADER_ diff --git a/TargetLibraries/MemPool/inc/constants.h b/TargetLibraries/MemPool/inc/constants.h new file mode 100644 index 0000000..6a3d2ca --- /dev/null +++ b/TargetLibraries/MemPool/inc/constants.h @@ -0,0 +1,37 @@ +/* ===================================================================== + * Title: constants.h + * Description: + * + * Date: 29.11.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Samuel Riedel, ETH Zurich + * - Sergio Mazzola, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_MATH_CONSTANTS_HEADER_ +#define __DEEPLOY_MATH_CONSTANTS_HEADER_ + +#define sqrt2 0b1011010100000100 + +#endif //__DEEPLOY_MATH_CONSTANTS_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/Convolution.h b/TargetLibraries/MemPool/inc/kernel/Convolution.h new file mode 100644 index 0000000..63ac483 --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/Convolution.h @@ -0,0 +1,103 @@ +/* ===================================================================== + * Title: Convolution.h + * Description: + * + * Date: 02.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. 
+ *
+ * Authors:
+ * - Samuel Riedel, ETH Zurich
+ * - Philip Wiese, ETH Zurich
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEEPLOY_MATH_CONVOLUTION_KERNEL_HEADER_
+#define __DEEPLOY_MATH_CONVOLUTION_KERNEL_HEADER_
+
+#include "DeeployMath.h"
+
+/* This library implements the requantized convolution for several data widths
+ * in multiple different ways.
+ *
+ * A is an M x N input matrix, B is a P x Q kernel matrix and C is an MxN
+ * output matrix
+ *
+ * Note that all the matrices dimensions must be multiples of 4; these
+ * kernels do not have clean-up code and remaining elements would not be
+ * considered, leading to wrong results.
+ */ + +/******************************************************************************/ +/* General Convolution (8bit) */ +/******************************************************************************/ + +/* + * 2D Convolution ---------------------------------- + * kernel = Conv2d_parallel_s8_NCHW_rv32im + * layout = NCHW + * data type = 8-bit integer + * kernel size = generic + * multi-core = yes + * unrolling = no + * simd = no + */ +void Conv2d_parallel_s8_NCHW_rv32im( + int8_t const *__restrict__ pSrcA, uint32_t C, uint32_t H, uint32_t W, + int8_t const *__restrict__ pSrcB, uint32_t F, uint32_t P, uint32_t Q, + uint32_t SP, uint32_t SQ, int32_t *__restrict__ pDstC, int32_t input_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads); + +/* + * 2D Convolution ---------------------------------- + * kernel = Conv2d_3x3_unrolled_parallel_s8_NCHW_rv32im + * layout = NCHW + * data type = 8-bit integer + * kernel size = 3x3 + * multi-core = yes + * unrolling = yes + * simd = no + */ +void Conv2d_3x3_unrolled_parallel_s8_NCHW_rv32im( + int8_t const *__restrict__ pSrcA, uint32_t M, uint32_t N, + int8_t const *__restrict__ pSrcB, int32_t *__restrict__ pDstC, + uint32_t core_id, uint32_t numThreads); + +// Mapper Functions +static inline void __attribute__((always_inline)) Conv2d_parallel_s8_NCHW( + int8_t const *__restrict__ pSrcA, uint32_t C, uint32_t H, uint32_t W, + int8_t const *__restrict__ pSrcB, uint32_t F, uint32_t P, uint32_t Q, + uint32_t SP, uint32_t SQ, int32_t *__restrict__ pDstC, int32_t input_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads) { + Conv2d_parallel_s8_NCHW_rv32im(pSrcA, C, H, W, pSrcB, F, P, Q, SP, SQ, pDstC, + input_offset, output_offset, core_id, + numThreads); +} + +static inline void __attribute__((always_inline)) +Conv2d_3x3_unrolled_parallel_s8_NCHW(int8_t const *__restrict__ pSrcA, + uint32_t M, uint32_t N, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t core_id, 
uint32_t numThreads) {
+ Conv2d_3x3_unrolled_parallel_s8_NCHW_rv32im(pSrcA, M, N, pSrcB, pDstC,
+ core_id, numThreads);
+}
+
+#endif //__DEEPLOY_MATH_CONVOLUTION_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/DWConvolution.h b/TargetLibraries/MemPool/inc/kernel/DWConvolution.h new file mode 100644 index 0000000..e05ae7a --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/DWConvolution.h @@ -0,0 +1,77 @@ +/* =====================================================================
+ * Title: DWConvolution.h
+ * Description:
+ *
+ * Date: 09.01.2023
+ *
+ * ===================================================================== */
+
+/*
+ * Copyright (C) 2023 ETH Zurich and University of Bologna.
+ *
+ * Authors:
+ * - Philip Wiese, ETH Zurich
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEEPLOY_MATH_DWCONVOLUTION_KERNEL_HEADER_
+#define __DEEPLOY_MATH_DWCONVOLUTION_KERNEL_HEADER_
+
+#include "DeeployMath.h"
+
+/* This library implements the requantized convolution for several data widths
+ * in multiple different ways.
+ *
+ * A is an M x N input matrix, B is a P x Q kernel matrix and C is an MxN
+ * output matrix
+ *
+ * Note that all the matrices dimensions must be multiples of 4; these
+ * kernels do not have clean-up code and remaining elements would not be
+ * considered, leading to wrong results.
+ */ + +/******************************************************************************/ +/* General Convolution (8bit) */ +/******************************************************************************/ + +/* + * 2D Convolution ---------------------------------- + * kernel = DWConv2d_parallel_s8_NCHW_rv32im + * layout = NCHW + * data type = 8-bit integer + * kernel size = generic + * multi-core = yes + * unrolling = no + * simd = no + */ +void DWConv2d_parallel_s8_NCHW_rv32im( + int8_t const *__restrict__ pSrcA, uint32_t C, uint32_t H, uint32_t W, + int8_t const *__restrict__ pSrcB, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, int32_t *__restrict__ pDstC, int32_t input_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads); + +// Mapper Functions +static inline void __attribute__((always_inline)) DWConv2d_parallel_s8_NCHW( + int8_t const *__restrict__ pSrcA, uint32_t C, uint32_t H, uint32_t W, + int8_t const *__restrict__ pSrcB, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, int32_t *__restrict__ pDstC, int32_t input_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads) { + DWConv2d_parallel_s8_NCHW_rv32im(pSrcA, C, H, W, pSrcB, P, Q, SP, SQ, pDstC, + input_offset, output_offset, core_id, + numThreads); +} + +#endif //__DEEPLOY_MATH_DWCONVOLUTION_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/Gemm.h b/TargetLibraries/MemPool/inc/kernel/Gemm.h new file mode 100644 index 0000000..157a3d2 --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/Gemm.h @@ -0,0 +1,81 @@ +/* ===================================================================== + * Title: Gemm.h + * Description: + * + * Date: 16.05.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. 
+ * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ + +#include "DeeployMath.h" + +/* + * This library implements the matrix multiplication for several data widths + * in multiple different ways. The functions all follow the following format: + * + * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix + * A' = transpose(A) if transA else A + * B' = transpose(B) if transB else B + * + * Y = alpha * A' * B' + beta * C + * + */ + +/******************************************************************************/ +/* General Matrix Multiplication (8bit) */ +/******************************************************************************/ + +/* + * General Matrix Multiplication ---------------------------------- + * kernel = Gemm_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + * cleanup = yes + */ +void Gemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, uint32_t core_id, uint32_t numThreads); + +// Mapper Functions +static inline void 
__attribute__((always_inline)) +Gemm_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t P, int32_t alpha, + int32_t beta, int32_t transA, int32_t transB, int32_t A_offset, + int32_t B_offset, int32_t C_offset, int32_t Y_offset, + uint32_t core_id, uint32_t numThreads) { + Gemm_parallel_s8_rv32im(pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, + transA, transB, A_offset, B_offset, C_offset, + Y_offset, core_id, numThreads); +} + +#endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/MHSA.h b/TargetLibraries/MemPool/inc/kernel/MHSA.h new file mode 100644 index 0000000..a97f4ae --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/MHSA.h @@ -0,0 +1,96 @@ +/* ===================================================================== + * Title: MHSA.h + * Description: + * + * Date: 08.02.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_MATH_MHSA_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MHSA_KERNEL_HEADER_ + +/* Includes ------------------------------------------------------------------*/ + +#include "DeeployMath.h" + +/* This library implements multi-head self attention for several data widths + * in multiple different ways. The functions all follow the following format: + */ + +/******************************************************************************/ +/* Multi-Head Self Attention (8bit) */ +/******************************************************************************/ + +/* + * MHSA ---------------------------------- + * kernel = M1HSA_s8_ITA + * data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + * accelerator = ITA + * heads = 1 + */ +void M1HSA_s8_ITA(int8_t const *__restrict__ pSrcQ, + int8_t const *__restrict__ pSrcK, int8_t *__restrict__ pBuf, + uint32_t S, uint32_t E, uint32_t P, + ita_quant_t const *__restrict__ quant_param, + int8_t *__restrict__ pDst, int8_t Q_offset, int8_t K_offset, + int8_t output_offset, uint32_t core_id, uint32_t numThreads); + +/* + * MHSA ---------------------------------- + * kernel = M2HSA_s8_ITA + * data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + * accelerator = ITA + * heads = 2 + */ +void M2HSA_s8_ITA(int8_t const *__restrict__ pSrcQ, + int8_t const *__restrict__ pSrcK, int8_t **__restrict__ pBuf, + uint32_t S, uint32_t E, uint32_t P, + ita_quant_t const **__restrict__ quant_params, + int8_t *__restrict__ pDst, int8_t Q_offset, int8_t K_offset, + int8_t output_offset, uint32_t core_id, uint32_t numThreads); + +/* + * MHSA ---------------------------------- + * kernel = M4HSA_s8_ITA + * data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + * accelerator = ITA + * heads = 4 + */ +void M4HSA_s8_ITA(int8_t const *__restrict__ pSrcQ, + int8_t const *__restrict__ pSrcK, int8_t **__restrict__ pBuf, + uint32_t S, uint32_t E, uint32_t P, + 
ita_quant_t const **__restrict__ quant_params, + int8_t *__restrict__ pDst, int8_t Q_offset, int8_t K_offset, + int8_t output_offset, uint32_t core_id, uint32_t numThreads); + +#endif //__DEEPLOY_MATH_MHSA_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/MatMul.h b/TargetLibraries/MemPool/inc/kernel/MatMul.h new file mode 100644 index 0000000..ad61baf --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/MatMul.h @@ -0,0 +1,306 @@ +/* ===================================================================== + * Title: MatMul.h + * Description: + * + * Date: 29.11.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Samuel Riedel, ETH Zurich + * - Sergio Mazzola, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ + +#include "DeeployMath.h" + +/* This library implements the matrix multiplication for several data widths + * in multiple different ways. 
The functions all follow the following format: + * + * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix + * C = AB + * + * Note that all the matrices dimensions must be multiples of 4; these + * kernels do not have clean-up code and remaining elements would not be + * considered, leading to wrong results + */ + +/******************************************************************************/ +/* Matrix Multiplication (8bit) */ +/******************************************************************************/ + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + * cleanup = yes + */ +void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, + int32_t B_offset, int32_t output_offset, + uint32_t core_id, uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x2_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + */ +void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, uint32_t P, + uint32_t core_id, + uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x2_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + */ +void MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t 
B_offset, int32_t output_offset, uint32_t core_id, + uint32_t numThreads); + +#ifdef __XPULPIMG +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x4_parallel_s8_xpulpv2 + * data type = 8-bit integer + * multi-core = yes + * unrolling = 8 elements of C per iteration (2x4 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * + * Original plp_mat_mult_s8p_xpulpv2 from pulp-dsp + */ +void MatMul_unrolled_2x4_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t B_offset, int32_t output_offset, uint32_t core_id, + uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2 + * data type = 8-bit integer + * multi-core = yes + * unrolling = 8 elements of C per iteration (2x4 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * other = using pointer incrementing instead of array + * indexing and loads/stores explicitly written + * in asm, for optimal register utilization + * + * Inspired from plp_mat_mult_s8p_xpulpv2 from pulp-dsp + */ +void MatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + uint32_t core_id, uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2 + * data type = 8-bit integer + * multi-core = yes + * unrolling = 8 elements of C per iteration (2x4 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * other = using pointer incrementing instead of array + * indexing and loads/stores explicitly written + * in asm, for optimal register utilization + * + * Inspired from plp_mat_mult_s8p_xpulpv2 from pulp-dsp + */ +void 
MatMul_offset_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t B_offset, int32_t output_offset, uint32_t core_id, + uint32_t numThreads); + +#endif + +// Mapper Functions +static inline void __attribute__((always_inline)) +MatMul_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t A_offset, int32_t B_offset, + int32_t output_offset, uint32_t core_id, + uint32_t numThreads) { + MatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, + output_offset, core_id, numThreads); +} + +static inline void __attribute__((always_inline)) +MatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, uint32_t core_id, + uint32_t numThreads) { +#ifdef __XPULPIMG + MatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2(pSrcA, pSrcB, pDstC, M, N, + P, core_id, numThreads); +#else + MatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, core_id, + numThreads); +#endif +} + +static inline void __attribute__((always_inline)) +MatMul_offset_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t A_offset, + int32_t B_offset, int32_t output_offset, + uint32_t core_id, uint32_t numThreads) { +#ifdef __XPULPIMG + MatMul_offset_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset, core_id, + numThreads); +#else + MatMul_offset_unrolled_2x2_parallel_s8_rv32im( + pSrcA, pSrcB, pDstC, M, N, P, A_offset, B_offset, output_offset, core_id, + numThreads); +#endif +} + 
+/******************************************************************************/ +/* Matrix Multiplication (16bit) */ +/******************************************************************************/ + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x2_parallel_s16_rv32im + * data type = 16-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + */ +void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA, + int16_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, uint32_t P, + uint32_t core_id, + uint32_t numThreads); + +#ifdef __XPULPIMG +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_4x2_parallel_s16_xpulpv2 + * data type = 16-bit integer + * multi-core = yes + * unrolling = 8 elements of C per iteration (4x2 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * + * Original plp_mat_mult_s16p_xpulpv2 from pulp-dsp + */ +void MatMul_unrolled_4x2_parallel_s16_xpulpv2(int16_t const *__restrict__ pSrcA, + int16_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P, uint32_t core_id, + uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_4x2_pincr_asm_parallel_s16_xpulpv2 + * data type = 16-bit integer + * multi-core = yes + * unrolling = 8 elements of C per iteration (4x2 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * other = using pointer incrementing instead of array + * indexing and loads/stores explicitly written + * in asm, for optimal register utilization + * + * Inspired from plp_mat_mult_s16p_xpulpv2 from pulp-dsp + */ +void MatMul_unrolled_4x2_pincr_asm_parallel_s16_xpulpv2( + int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, 
uint32_t P, + uint32_t core_id, uint32_t numThreads); + +#endif //__XPULPIMG + +/******************************************************************************/ +/* Matrix Multiplication (32bit) */ +/******************************************************************************/ + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x2_parallel_s32_xpulpv2 + * data type = 32-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + * other = loads/stores explicitly written in asm + * for optimal register utilization + */ +void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, + int32_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, uint32_t P, + uint32_t core_id, + uint32_t numThreads); + +#ifdef __XPULPIMG + +/* + * Matrix multiplication ---------------------------------- + * kernel = MatMul_unrolled_2x2_parallel_s32_xpulpv2 + * data type = 32-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + * other = loads/stores explicitly written in asm + * for optimal register utilization + */ +void MatMul_unrolled_2x2_parallel_s32_xpulpv2(int32_t const *__restrict__ pSrcA, + int32_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, + uint32_t M, uint32_t N, + uint32_t P, uint32_t core_id, + uint32_t numThreads); +#endif //__XPULPIMG + +#endif //__DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/MaxPool.h b/TargetLibraries/MemPool/inc/kernel/MaxPool.h new file mode 100644 index 0000000..d8b02b5 --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/MaxPool.h @@ -0,0 +1,79 @@ +/* ===================================================================== + * Title: MaxPool.h + * Description: + * + * Date: 13.12.2022 + * + * ===================================================================== */ + 
+/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ + +#include "DeeployMath.h" + +/* This file implements the MaxPool operation. + * + * A is an M x N input matrix, P x Q the kernel size and SPxSQ the kernel + * stride. + * + * Note that all the matrices dimensions must be multiples of 4; these + * kernels do not have clean-up code and remaining elements would not be + * considered, leading to wrong results. 
+ */ + +/******************************************************************************/ +/* General MaxPool (8bit) */ +/******************************************************************************/ + +/* + * 2D Maxpool ---------------------------------- + * kernel = MaxPool2d_parallel_s8_NCHW_rv32im + * layout = NCHW + * data type = 8-bit integer + * kernel size = generic + * multi-core = yes + * unrolling = no + * simd = no + */ +void MaxPool2d_parallel_s8_NCHW_rv32im(int8_t const *__restrict__ pSrcA, + uint32_t C, uint32_t H, uint32_t W, + uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, int8_t *__restrict__ pDstC, + int32_t input_offset, + int32_t output_offset, uint32_t core_id, + uint32_t numThreads); + +// Mapper Functions +static inline void __attribute__((always_inline)) +MaxPool2d_parallel_s8_NCHW(int8_t const *__restrict__ pSrcA, uint32_t C, + uint32_t H, uint32_t W, uint32_t P, uint32_t Q, + uint32_t SP, uint32_t SQ, int8_t *__restrict__ pDstC, + int32_t input_offset, int32_t output_offset, + uint32_t core_id, uint32_t numThreads) { + MaxPool2d_parallel_s8_NCHW_rv32im(pSrcA, C, H, W, P, Q, SP, SQ, pDstC, + input_offset, output_offset, core_id, + numThreads); +} + +#endif //__DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/RQGemm.h b/TargetLibraries/MemPool/inc/kernel/RQGemm.h new file mode 100644 index 0000000..a548122 --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/RQGemm.h @@ -0,0 +1,149 @@ +/* ===================================================================== + * Title: RQGemm.h + * Description: + * + * Date: 16.05.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_ + +#include "DeeployMath.h" + +/* + * This library implements the matrix multiplication for several data widths + * in multiple different ways. The functions all follow the following format: + * + * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix + * A' = transpose(A) if transA else A + * B' = transpose(B) if transB else B + * + * Y = alpha * A' * B' + beta * C + * + */ + +/******************************************************************************/ +/* General Requantized Matrix Multiplication (8bit) */ +/******************************************************************************/ + +/* + * General Requantized Matrix Multiplication ---------------------------------- + * kernel = RQGemm_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + * cleanup = yes + */ +void RQGemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max, uint32_t core_id, + uint32_t numThreads); + +/* + * General Requantized Matrix multiplication ---------------------------------- + * kernel = 
RQGemm_offset_unrolled_2x2_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + */ +void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, uint32_t core_id, uint32_t numThreads); + +#ifdef __XPULPIMG +/* + * General Requantized Matrix multiplication ---------------------------------- + * kernel = RQGemm_offset_unrolled_4x4_pincr_asm_parallel_s8_xpulpv2 + * data type = 8-bit integer + * multi-core = yes + * unrolling = tranA=0, trabsB=0: 8 elements of C per iteration (2x4 chunks) + * tranA=1, trabsB=0: 16 elements of C per iteration (4x4 chunks) + * tranA=0, trabsB=1: 8 elements of C per iteration (2x4 chunks) + * tranA=1, trabsB=1: 16 elements of C per iteration (4x4 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * other = using pointer incrementing instead of array + * indexing and loads/stores explicitly written + * in asm, for optimal register utilization + */ +void RQGemm_offset_unrolled_4x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, uint32_t core_id, uint32_t numThreads); + +#endif //__XPULPIMG + +// Mapper Functions +static inline void __attribute__((always_inline)) RQGemm_parallel_s8( + 
int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max, uint32_t core_id, + uint32_t numThreads) { + RQGemm_parallel_s8_rv32im( + pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, + add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset, + Y_offset, output_min, output_max, core_id, numThreads); +} + +// Mapper Functions +static inline void __attribute__((always_inline)) +RQGemm_offset_unrolled_2x2_parallel_s8( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, uint32_t core_id, uint32_t numThreads) { +#ifdef __XPULPIMG + RQGemm_offset_unrolled_4x4_pincr_asm_parallel_s8_xpulpv2( + pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, + add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset, + Y_offset, core_id, numThreads); +#else + RQGemm_offset_unrolled_2x2_parallel_s8_rv32im( + pSrcA, pSrcB, pSrcC, pDstY, M, N, P, alpha, beta, transA, transB, mul, + add, log2D, rounding, per_row_quant, A_offset, B_offset, C_offset, + Y_offset, core_id, numThreads); +#endif +} + +#endif //__DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/RQMatMul.h b/TargetLibraries/MemPool/inc/kernel/RQMatMul.h new file mode 100644 index 0000000..3e4fd96 --- /dev/null +++ 
+/* This library implements the requantized matrix multiplication for several
+ * data widths in multiple different ways.
The functions all follow the + * following format: + * + * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix + * C = AB + * + * Note that all the matrices dimensions must be multiples of 4; these + * kernels do not have clean-up code and remaining elements would not be + * considered, leading to wrong results + */ + +/******************************************************************************/ +/* Requantized Matrix Multiplication (8bit) */ +/******************************************************************************/ + +/* + * Matrix multiplication ---------------------------------- + * kernel = RQMatMul_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + * cleanup = yes + */ +void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, + int32_t B_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + uint32_t core_id, uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = RQMatMul_unrolled_2x2_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + */ +void RQMatMul_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, uint32_t core_id, uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = RQMatMul_unrolled_2x2_parallel_s8_rv32im + * data type = 8-bit integer + * multi-core = yes + * unrolling = 4 elements of C per iteration (2x2 chunks) + * simd = no + * cleanup = no + */ 
+void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads); + +#ifdef __XPULPIMG +/* + * Matrix multiplication ---------------------------------- + * kernel = RQMatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2 + * data type = 8-bit integer + * multi-core = yes + * unrolling = 8 elements of C per iteration (2x4 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * other = using pointer incrementing instead of array + * indexing and loads/stores explicitly written + * in asm, for optimal register utilization + * + * Inspired from plp_mat_mult_s8p_xpulpv2 from pulp-dsp + */ +void RQMatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, uint32_t core_id, uint32_t numThreads); + +/* + * Matrix multiplication ---------------------------------- + * kernel = RQMatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2 + * data type = 8-bit integer + * multi-core = yes + * unrolling = 8 elements of C per iteration (2x4 chunks) + * simd = yes, Xpulpv2 intrinsics + * cleanup = no + * other = using pointer incrementing instead of array + * indexing and loads/stores explicitly written + * in asm, for optimal register utilization + * + * Inspired from plp_mat_mult_s8p_xpulpv2 from pulp-dsp + */ +void RQMatMul_offset_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool 
per_row_quant, int32_t A_offset, int32_t B_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads); + +#endif + +// Mapper Functions +static inline void __attribute__((always_inline)) +RQMatMul_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, + uint32_t P, int32_t *mul, int32_t *add, int32_t log2D, + bool rounding, bool per_row_quant, int32_t A_offset, + int32_t B_offset, int32_t output_offset, int8_t output_min, + int8_t output_max, uint32_t core_id, uint32_t numThreads) { + RQMatMul_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, + rounding, per_row_quant, A_offset, B_offset, + output_offset, output_min, output_max, core_id, + numThreads); +} + +static inline void __attribute__((always_inline)) +RQMatMul_unrolled_2x2_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, uint32_t core_id, + uint32_t numThreads) { +#ifdef __XPULPIMG + RQMatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant, + core_id, numThreads); +#else + RQMatMul_unrolled_2x2_parallel_s8_rv32im(pSrcA, pSrcB, pDstC, M, N, P, mul, + add, log2D, rounding, per_row_quant, + core_id, numThreads); +#endif +} + +static inline void __attribute__((always_inline)) +RQMatMul_offset_unrolled_2x2_parallel_s8( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads) { +#ifdef __XPULPIMG + RQMatMul_offset_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + pSrcA, pSrcB, pDstC, M, N, P, mul, add, 
log2D, rounding, per_row_quant, + A_offset, B_offset, output_offset, core_id, numThreads); +#else + RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im( + pSrcA, pSrcB, pDstC, M, N, P, mul, add, log2D, rounding, per_row_quant, + A_offset, B_offset, output_offset, core_id, numThreads); +#endif +} + +#endif //__DEEPLOY_MATH_RQMATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/RequantShift.h b/TargetLibraries/MemPool/inc/kernel/RequantShift.h new file mode 100644 index 0000000..b995e31 --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/RequantShift.h @@ -0,0 +1,205 @@ +/* ===================================================================== + * Title: RequantShift.h + * Description: + * + * Date: 24.04.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_MATH_REQUANTSHIFT_KERNEL_HEADER_ +#define __DEEPLOY_MATH_REQUANTSHIFT_KERNEL_HEADER_ + +#include "DeeployMath.h" + +/* + * This file implements the requantization kernel for several data widths + * in multiple different ways. 
+ */ + +/******************************************************************************/ +/* Requantization to 8bit */ +/******************************************************************************/ + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_parallel_s8_s8_NHWC + * layout = NHWC + * input data type = 8-bit integer + * output data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + */ +void RequantShift_parallel_s8_s8_NHWC(int8_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_parallel_s16_s8_NHWC + * layout = NHWC + * input data type = 16-bit integer + * output data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + */ +void RequantShift_parallel_s16_s8_NHWC(int16_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_parallel_s32_s8_NHWC + * layout = NHWC + * input data type = 32-bit integer + * output data type = 8-bit integer + * multi-core = yes + * unrolling = no + * simd = no + */ +void RequantShift_parallel_s32_s8_NHWC(int32_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads); + +/* + * Re-quantization and Shift ---------------------------------- + * kernel = 
RequantShift_parallel_s8_s8_NCHW
+ * layout            = NCHW
+ * input data type   = 8-bit integer
*data_in, uint32_t size, int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, uint32_t HW, int32_t input_offset, + int32_t output_offset, bool rounding, uint32_t core_id, + uint32_t numThreads); + +#ifdef __XPULPIMG +/* + * Re-quantization and Shift ---------------------------------- + * kernel = RequantShift_unrolled_1x2_parallel_s32_s8_NCHW_xpulpv2 + * layout = NCHW + * input data type = 32-bit integer + * output data type = 8-bit integer + * multi-core = yes + * unrolling = yes (4 elements per iteration) + * simd = no + */ +void RequantShift_unrolled_1x4_parallel_s32_s8_NCHW_xpulpv2( + int32_t *data_in, uint32_t size, int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, uint32_t HW, int32_t input_offset, + int32_t output_offset, bool rounding, uint32_t core_id, + uint32_t numThreads); + +#endif //__XPULPIMG + +// Mapper Functions +static inline void __attribute__((always_inline)) +RequantShift_unrolled_1x4_parallel_s32_s8_NCHW( + int32_t *data_in, uint32_t size, int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, uint32_t HW, int32_t input_offset, + int32_t output_offset, bool rounding, uint32_t core_id, + uint32_t numThreads) { +#ifdef __XPULPIMG + RequantShift_unrolled_1x4_parallel_s32_s8_NCHW_xpulpv2( + data_in, size, mul, add, data_out, log2D, HW, input_offset, output_offset, + rounding, core_id, numThreads); +#else + RequantShift_unrolled_1x4_parallel_s32_s8_NCHW_rv32im( + data_in, size, mul, add, data_out, log2D, HW, input_offset, output_offset, + rounding, core_id, numThreads); +#endif +} + +#endif //__DEEPLOY_MATH_REQUANTSHIFT_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/Softmax.h b/TargetLibraries/MemPool/inc/kernel/Softmax.h new file mode 100644 index 0000000..4a21a4e --- /dev/null +++ b/TargetLibraries/MemPool/inc/kernel/Softmax.h @@ -0,0 +1,58 @@ +/* ===================================================================== + * Title: Softmax.h + * Description: + * + * Date: 25.04.2023 + * + * 
===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ +#define __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ + +#include "DeeployMath.h" + +/* + * This file implements various softmax kernels. + */ + +/******************************************************************************/ +/* Softmax (8bit) */ +/******************************************************************************/ + +/** + * @brief Approximate softmax implementation used in ITA. 
+ * + * @param pSrcA + * @param pDstB + * @param pBufN + * @param size + * @param lastDimLength + */ +void ITAMax_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t *__restrict__ pDstB, int8_t *__restrict__ pBufN, + uint32_t size, uint32_t lastDimLength, + uint32_t n_levels, uint32_t core_id, + uint32_t numThreads); + +#endif //__DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/macros.h b/TargetLibraries/MemPool/inc/macros.h new file mode 100644 index 0000000..2d36e23 --- /dev/null +++ b/TargetLibraries/MemPool/inc/macros.h @@ -0,0 +1,35 @@ +/* ===================================================================== + * Title: macros.h + * Description: + * + * Date: 29.11.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Samuel Riedel, ETH Zurich + * - Sergio Mazzola, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
/**
 * General 2D convolution (NCHW layout, no padding), 8-bit activations and
 * weights, 32-bit accumulation.
 *
 * pSrcA: input feature map, C x H x W.
 * pSrcB: filter bank, F x C x P x Q (F output channels).
 * pDstC: output feature map, F x H_out x W_out.
 * SP/SQ: vertical/horizontal strides.
 * input_offset is added to every activation before the multiply;
 * output_offset is added to every accumulated result.
 *
 * Work is parallelized along output columns: each of the numThreads cores
 * processes a contiguous chunk of W_out columns, with the remainder columns
 * distributed one each to the first (W_out % numThreads) cores. Cores with
 * core_id >= W_out have no work and return immediately.
 */
void Conv2d_parallel_s8_NCHW_rv32im(
    int8_t const *__restrict__ pSrcA, uint32_t C, uint32_t H, uint32_t W,
    int8_t const *__restrict__ pSrcB, uint32_t F, uint32_t P, uint32_t Q,
    uint32_t SP, uint32_t SQ, int32_t *__restrict__ pDstC,
    int32_t input_offset, int32_t output_offset, uint32_t core_id,
    uint32_t numThreads) {

  // Output spatial dimensions (padding is assumed to be zero).
  uint32_t H_out = (H - P) / SP + 1;
  uint32_t W_out = (W - Q) / SQ + 1;

  uint32_t div = W_out / numThreads;
  uint32_t rem = W_out % numThreads;

  if (core_id >= W_out)
    return;

  // Base chunk plus one extra column for each of the first `rem` cores.
  uint32_t start = div * core_id + (core_id < rem ? core_id : rem);
  uint32_t end = div * (core_id + 1) + (core_id < rem ? core_id + 1 : rem);

  for (uint32_t f = 0; f < F; ++f) {
    for (uint32_t h = 0; h < H_out; ++h) {
      for (uint32_t w = start; w < end; ++w) {
        int32_t sum = 0;
        for (uint32_t c = 0; c < C; ++c) {
          for (uint32_t p = 0; p < P; ++p) {
            for (uint32_t q = 0; q < Q; ++q) {
              sum += (pSrcA[c * H * W + (h * SP + p) * W + (w * SQ + q)] +
                      input_offset) *
                     pSrcB[f * C * P * Q + c * P * Q + p * Q + q];
            }
          }
        }
        pDstC[f * H_out * W_out + h * W_out + w] = sum + output_offset;
      }
    }
  }
}

/**
 * 3x3 convolution, single input channel, stride 1, no padding, with the
 * inner 3x3 multiply-accumulate fully unrolled.
 *
 * pSrcA: input, M x N.  pSrcB: 3x3 kernel (row-major, 9 elements).
 * pDstC: output, (M-2) x (N-2).
 *
 * Work is parallelized along output rows; row indices run from 1 (the first
 * valid window center). Cores with no assigned rows end up with an empty
 * range (start == end) and simply fall through.
 */
void Conv2d_3x3_unrolled_parallel_s8_NCHW_rv32im(
    int8_t const *__restrict__ pSrcA, uint32_t M, uint32_t N,
    int8_t const *__restrict__ pSrcB, int32_t *__restrict__ pDstC,
    uint32_t core_id, uint32_t numThreads) {
  uint32_t start = 0;
  uint32_t end = 0;

  // Output size for a 3x3 kernel without padding.
  uint32_t M_out = M - 2;
  uint32_t N_out = N - 2;
  uint32_t div = M_out / numThreads;
  uint32_t rem = M_out % numThreads;

  if (core_id < M_out) {
    start = div * core_id + 1;
    end = div * (core_id + 1) + 1;
  }

  // Distribute the remainder rows over the first `rem` cores.
  start += core_id < rem ? core_id : rem;
  end += core_id < rem ? core_id + 1 : rem;

  for (uint32_t i = start; i < end; ++i) {
    for (uint32_t j = 1; j < N - 1; ++j) {
      int32_t sum = 0;
      sum += pSrcA[(i - 1) * N + (j - 1)] * (int32_t)pSrcB[0];
      sum += pSrcA[(i - 1) * N + (j + 0)] * (int32_t)pSrcB[1];
      sum += pSrcA[(i - 1) * N + (j + 1)] * (int32_t)pSrcB[2];
      sum += pSrcA[(i + 0) * N + (j - 1)] * (int32_t)pSrcB[3];
      sum += pSrcA[(i + 0) * N + (j + 0)] * (int32_t)pSrcB[4];
      sum += pSrcA[(i + 0) * N + (j + 1)] * (int32_t)pSrcB[5];
      sum += pSrcA[(i + 1) * N + (j - 1)] * (int32_t)pSrcB[6];
      sum += pSrcA[(i + 1) * N + (j + 0)] * (int32_t)pSrcB[7];
      sum += pSrcA[(i + 1) * N + (j + 1)] * (int32_t)pSrcB[8];
      pDstC[(i - 1) * N_out + (j - 1)] = sum;
    }
  }
}
+ */ + +#include "DeeployMath.h" + +static uint32_t timer_init[NUM_CORES] __attribute__((section(".l1"))); +static uint32_t timer_end[NUM_CORES] __attribute__((section(".l1"))); +static uint32_t instr_init[NUM_CORES] __attribute__((section(".l1"))); +static uint32_t instr_end[NUM_CORES] __attribute__((section(".l1"))); + +void ResetTimer(void) { + uint32_t const core_id = mempool_get_core_id(); + timer_init[core_id] = read_csr(mcycle); + instr_init[core_id] = read_csr(minstret); +} + +void StartTimer(void) { + uint32_t const core_id = mempool_get_core_id(); + timer_init[core_id] = read_csr(mcycle); + instr_init[core_id] = read_csr(minstret); +} + +void StopTimer(void) { + uint32_t const core_id = mempool_get_core_id(); + timer_end[core_id] = read_csr(mcycle); + instr_end[core_id] = read_csr(minstret); +} + +uint32_t getCycles(void) { + uint32_t const core_id = mempool_get_core_id(); + return timer_end[core_id] - timer_init[core_id]; +} + +uint32_t getInstr(void) { + uint32_t const core_id = mempool_get_core_id(); + return instr_end[core_id] - instr_init[core_id]; +} \ No newline at end of file diff --git a/TargetLibraries/MemPool/src/DWConvolution_s8.c b/TargetLibraries/MemPool/src/DWConvolution_s8.c new file mode 100644 index 0000000..00b5070 --- /dev/null +++ b/TargetLibraries/MemPool/src/DWConvolution_s8.c @@ -0,0 +1,94 @@ +/* ===================================================================== + * Title: DWConvolution_s8.c + * Description: + * + * Date: 09.01.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except pSrcA compliance with the License. 
/**
 * Depthwise 2D convolution (NCHW layout, no padding), 8-bit activations and
 * weights, 32-bit accumulation.
 *
 * pSrcA: input feature map, C x H x W.
 * pSrcB: one P x Q filter per channel, C x P x Q; channel c of the input is
 *        convolved only with filter c (depthwise).
 * pDstC: output feature map, C x H_out x W_out.
 * SP/SQ: vertical/horizontal strides.
 * input_offset is added to every activation before the multiply;
 * output_offset is added to every accumulated result.
 *
 * Work is parallelized along output columns: each of the numThreads cores
 * processes a contiguous chunk of W_out columns, with the remainder columns
 * distributed one each to the first (W_out % numThreads) cores. Cores with
 * core_id >= W_out have no work and return immediately.
 */
void DWConv2d_parallel_s8_NCHW_rv32im(
    int8_t const *__restrict__ pSrcA, uint32_t C, uint32_t H, uint32_t W,
    int8_t const *__restrict__ pSrcB, uint32_t P, uint32_t Q, uint32_t SP,
    uint32_t SQ, int32_t *__restrict__ pDstC, int32_t input_offset,
    int32_t output_offset, uint32_t core_id, uint32_t numThreads) {

  // Output spatial dimensions (padding is assumed to be zero).
  uint32_t H_out = (H - P) / SP + 1;
  uint32_t W_out = (W - Q) / SQ + 1;

  uint32_t div = W_out / numThreads;
  uint32_t rem = W_out % numThreads;

  if (core_id >= W_out)
    return;

  // Base chunk plus one extra column for each of the first `rem` cores.
  uint32_t start = div * core_id + (core_id < rem ? core_id : rem);
  uint32_t end = div * (core_id + 1) + (core_id < rem ? core_id + 1 : rem);

  for (uint32_t c = 0; c < C; ++c) {
    for (uint32_t h = 0; h < H_out; ++h) {
      for (uint32_t w = start; w < end; ++w) {
        int32_t sum = 0;
        for (uint32_t p = 0; p < P; ++p) {
          for (uint32_t q = 0; q < Q; ++q) {
            // Depthwise weight index: no filter-bank dimension (the original
            // code carried an always-zero `f` counter here; removed).
            sum += (pSrcA[c * H * W + (h * SP + p) * W + (w * SQ + q)] +
                    input_offset) *
                   pSrcB[c * P * Q + p * Q + q];
          }
        }
        pDstC[c * H_out * W_out + h * W_out + w] = sum + output_offset;
      }
    }
  }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployMath.h" +void Gemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, uint32_t core_id, uint32_t numThreads) { + + // Parallelize by assigning each core one row + uint32_t const c = 1; // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (core_id % c); + uint32_t const c_end = (P / c) * ((core_id % c) + 1); + + const int32_t bias = beta * C_offset + Y_offset; + + if (transA == 0 && transB == 0) { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + for (uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[n * P + p] + B_offset); + } + pDstY[m * P + p] = alpha * sum + beta * pSrcC[m * P + p] + bias; + } + } + } else if (transA == 1 && transB == 0) { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + for (uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[n * P + p] + B_offset); + } + pDstY[m * P + p] = alpha * sum + beta * pSrcC[m * P + p] + bias; + } + } + } else if (transA == 0 && transB == 1) { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + for (uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[p * N + n] + B_offset); + } + pDstY[m * P + p] = alpha * sum + beta * pSrcC[m * P + p] + bias; + } + } + } else { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + for 
(uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[p * N + n] + B_offset); + } + pDstY[m * P + p] = alpha * sum + beta * pSrcC[m * P + p] + bias; + } + } + } +} diff --git a/TargetLibraries/MemPool/src/ITA.c b/TargetLibraries/MemPool/src/ITA.c new file mode 100644 index 0000000..ee1a4cb --- /dev/null +++ b/TargetLibraries/MemPool/src/ITA.c @@ -0,0 +1,87 @@ +/* ===================================================================== + * Title: ITA.c + * Description: + * + * Date: 5.12.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployMath.h" + +void ITA_getStruct(ita_data_t *ita_data, int8_t *base_address, uint32_t S, + uint32_t E, uint32_t P) { + ita_data->wo_weight = base_address; + ita_data->wv_weight = ita_data->wo_weight + E * P; + ita_data->wk_weight = ita_data->wv_weight + E * P; + ita_data->q = ita_data->wk_weight + E * P; + ita_data->k = ita_data->q + S * E; + ita_data->wq_weight = ita_data->k + S * E; + ita_data->wo_bias = (int32_t *)ita_data->wq_weight + E * P; // 32 bit values + ita_data->wv_bias = ita_data->wo_bias + 1 * E; // 32 bit values + ita_data->wk_bias = ita_data->wv_bias + 1 * P; // 32 bit values + ita_data->wq_bias = ita_data->wk_bias + 1 * P; // 32 bit values +} + +// The tensors have to be stored in a split layout equivalent to +// np.reshape(np.concatenate(np.split(self.Q, self.split, axis = 1)), +// (self.S_ITA, self.E_ITA)) +void ITA_copyInput(int8_t *pDst, int8_t const *__restrict__ pSrc, uint32_t S, + uint32_t E, int8_t offset) { + uint32_t i = 0; + uint32_t j = 0; + uint32_t k = 0; + + for (i = 0; i < E / ITA_PE; ++i) { + for (j = 0; j < S; ++j) { + if (offset != 0) { + for (k = 0; k < ITA_PE; ++k) { + pDst[i * S * ITA_PE + j * ITA_PE + k] = + (int8_t)(pSrc[i * ITA_PE + j * E + k] + offset); + } + } else { +#if USE_DMA + dma_memcpy_blocking((void *)&pDst[i * S * ITA_PE + j * ITA_PE], + (void *)&pSrc[i * ITA_PE + j * E], ITA_PE); +#else + memcpy((void *)&data[i * S * ITA_PE + j * ITA_PE], + (void *)&pSrc[i * ITA_PE + j * E], ITA_PE); +#endif + } + } + } +} + +void ITA_printAddresses(ita_data_t *ita_data) { + deeploy_log("ITA addresses:\n"); + deeploy_log("wo_weight: %p\n", ita_data->wo_weight); + deeploy_log("wv_weight: %p\n", ita_data->wv_weight); + deeploy_log("wk_weight: %p\n", ita_data->wk_weight); + deeploy_log("q: %p\n", ita_data->q); + deeploy_log("k: %p\n", ita_data->k); + deeploy_log("wq_weight: %p\n", ita_data->wq_weight); + deeploy_log("wo_bias: %p\n", ita_data->wo_bias); + deeploy_log("wv_bias: %p\n", ita_data->wv_bias); + 
deeploy_log("wk_bias: %p\n", ita_data->wk_bias); + deeploy_log("wq_bias: %p\n", ita_data->wq_bias); +} \ No newline at end of file diff --git a/TargetLibraries/MemPool/src/MHSA_s8.c b/TargetLibraries/MemPool/src/MHSA_s8.c new file mode 100644 index 0000000..9b3570e --- /dev/null +++ b/TargetLibraries/MemPool/src/MHSA_s8.c @@ -0,0 +1,267 @@ +/* ===================================================================== + * Title: M4HSA_s8.c + * Description: + * + * Date: 08.02.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

#include "DeeployMath.h"

// Instantiate the dump_timer_cycle() / dump_timer_instr() helpers used below
// to emit cycle and instruction counts of the accelerator run.
dump(timer_cycle, 0);
dump(timer_instr, 1);

// Single-head self-attention offloaded to ITA core 0.
// pSrcQ/pSrcK: S x E int8 inputs; pBuf: ITA workspace laid out per
// ITA_getStruct; quant_param: requantization parameters handed to the
// accelerator; pDst: int8 output. Q_offset/K_offset are applied while
// staging the inputs, output_offset is added to the result afterwards.
// All cores must enter this function: they synchronize on mempool_barrier,
// but only core 0 (and, without DMA, core 1 for the K copy) does work.
void M1HSA_s8_ITA(int8_t const *__restrict__ pSrcQ,
                  int8_t const *__restrict__ pSrcK, int8_t *__restrict__ pBuf,
                  uint32_t S, uint32_t E, uint32_t P,
                  ita_quant_t const *__restrict__ quant_param,
                  int8_t *__restrict__ pDst, int8_t Q_offset, int8_t K_offset,
                  int8_t output_offset, uint32_t core_id,
                  __attribute__((unused)) uint32_t numThreads) {

  ita_data_t ita_data;
  uint32_t i = 0;
  uint32_t j = 0;

  ITA_getStruct(&ita_data, pBuf, S, E, P);
  // if (core_id == 0) ITA_printAddresses(&ita_data);

  // Copy the keys (Q) and queries/values (K) data to the L2 buffer
  if (core_id == 0)
    ITA_copyInput(ita_data.q, pSrcQ, S, E, Q_offset);
#if USE_DMA
  if (core_id == 0) {
#else
  // Bitwise | of two 0/1 comparison results — behaves like || here.
  if ((core_id == 1) | (numThreads == 1)) {
#endif
    ITA_copyInput(ita_data.k, pSrcK, S, E, K_offset);
  }

  // Inputs must be fully staged before the accelerator is started.
  mempool_barrier(numThreads);

  mempool_stop_benchmark();
  mempool_start_benchmark();

  mempool_timer_t instr_init = 0, instr_end = 0;
  mempool_timer_t timer_init = 0, timer_end = 0;

  if (core_id == 0) {
    // Configure ITA core 0
    ITA_SetShape(ITA0, S, E, P);
    ITA_SetStartAddress(ITA0, (uint32_t)pBuf);
    ITA_SetOutAddress(ITA0, (uint32_t)pDst);
    ITA_SetRQSAddress(ITA0, (uint32_t)quant_param);
    ITA_SetIter(ITA0, 1);

    // Run one iteration
    instr_init = read_csr(minstret);
    timer_init = read_csr(mcycle);

    ITA_Start(ITA0);

    // Busy-wait (with short backoff) until the accelerator signals done.
    while (!ITA_IsDone(ITA0)) {
      mempool_wait(16);
    }
  }

  mempool_barrier(numThreads);

  mempool_stop_benchmark();
  mempool_start_benchmark();
  if (core_id == 0) {
    timer_end = read_csr(mcycle);
    instr_end = read_csr(minstret);
    dump_timer_cycle(timer_end - timer_init);
    // The -2 presumably discounts the two read_csr instructions — confirm.
    dump_timer_instr(instr_end - instr_init - 2);

    // Add offset to output matrix
    if (output_offset != 0) {
      for (i = 0; i < S; ++i) {
        for (j = 0; j < E; ++j) {
          pDst[i * E + j] += output_offset;
        }
      }
    }
  }
}

// Two-head variant: drives ITA0 and ITA1 in parallel (one per core id
// 0/1). Each head reads its workspace from pBuf[core_id], uses
// quant_params[core_id], and writes its S x E result to
// pDst + core_id * S * E.
void M2HSA_s8_ITA(int8_t const *__restrict__ pSrcQ,
                  int8_t const *__restrict__ pSrcK, int8_t **__restrict__ pBuf,
                  uint32_t S, uint32_t E, uint32_t P,
                  ita_quant_t const **__restrict__ quant_params,
                  int8_t *__restrict__ pDst, int8_t Q_offset, int8_t K_offset,
                  int8_t output_offset, uint32_t core_id,
                  __attribute__((unused)) uint32_t numThreads) {

  ita_data_t ita_data;
  uint32_t i = 0;
  uint32_t j = 0;
  ITA_TypeDef *ita_inst[] = {ITA0, ITA1};
  // Number of heads = number of ITA instances in ita_inst.
  uint8_t ita_h = sizeof(ita_inst) / sizeof(ITA_TypeDef *);

  ITA_getStruct(&ita_data, pBuf[0], S, E, P);
  // if (core_id == 0) ITA_printAddresses(&ita_data);

  // Copy the keys (Q) and queries/values (K) data to the L2 buffer
  if (core_id == 0)
    ITA_copyInput(ita_data.q, pSrcQ, S, E, Q_offset);
#if USE_DMA
  if (core_id == 0) {
#else
  if ((core_id == 1) | (numThreads == 1)) {
#endif
    ITA_copyInput(ita_data.k, pSrcK, S, E, K_offset);
  }

  // WIESP: All ITA cores fetch the Q and K vector always from the address
  // specified to core 0, hence we must make sure that this is valid
  if (core_id == 0)
    ITA_SetStartAddress(ita_inst[core_id], (uint32_t)pBuf[core_id]);

  mempool_barrier(numThreads);

  mempool_stop_benchmark();
  mempool_start_benchmark();

  mempool_timer_t instr_init = 0, instr_end = 0;
  mempool_timer_t timer_init = 0, timer_end = 0;

  if (core_id < ita_h) {
    // Configure ITA cores
    ITA_SetShape(ita_inst[core_id], S, E, P);
    ITA_SetStartAddress(ita_inst[core_id], (uint32_t)pBuf[core_id]);
    ITA_SetOutAddress(ita_inst[core_id], (uint32_t)(pDst + core_id * S * E));
    ITA_SetRQSAddress(ita_inst[core_id], (uint32_t)quant_params[core_id]);
    ITA_SetIter(ita_inst[core_id], 1);

    // Run one iteration
    instr_init = read_csr(minstret);
    timer_init = read_csr(mcycle);

    ITA_Start(ita_inst[core_id]);

    while (!ITA_IsDone(ita_inst[core_id])) {
      mempool_wait(16);
    }
  }
  mempool_barrier(numThreads);

  mempool_stop_benchmark();
  mempool_start_benchmark();
  if (core_id < ita_h) {
    timer_end = read_csr(mcycle);
    instr_end = read_csr(minstret);
    dump_timer_cycle(timer_end - timer_init);
    dump_timer_instr(instr_end - instr_init - 2);

    // Add offset to output matrix
    if (output_offset != 0) {
      for (i = 0; i < S; ++i) {
        for (j = 0; j < E; ++j) {
          pDst[core_id * S * E + i * E + j] += output_offset;
        }
      }
    }
  }
}

// Four-head variant: identical structure to M2HSA_s8_ITA but drives
// ITA0..ITA3, one head per core id 0..3.
void M4HSA_s8_ITA(int8_t const *__restrict__ pSrcQ,
                  int8_t const *__restrict__ pSrcK, int8_t **__restrict__ pBuf,
                  uint32_t S, uint32_t E, uint32_t P,
                  ita_quant_t const **__restrict__ quant_params,
                  int8_t *__restrict__ pDst, int8_t Q_offset, int8_t K_offset,
                  int8_t output_offset, uint32_t core_id,
                  __attribute__((unused)) uint32_t numThreads) {

  ita_data_t ita_data;
  uint32_t i = 0;
  uint32_t j = 0;
  ITA_TypeDef *ita_inst[] = {ITA0, ITA1, ITA2, ITA3};
  uint8_t ita_h = sizeof(ita_inst) / sizeof(ITA_TypeDef *);

  ITA_getStruct(&ita_data, pBuf[0], S, E, P);
  // if (core_id == 0) ITA_printAddresses(&ita_data);

  // Copy the keys (Q) and queries/values (K) data to the L2 buffer
  if (core_id == 0)
    ITA_copyInput(ita_data.q, pSrcQ, S, E, Q_offset);
#if USE_DMA
  if (core_id == 0) {
#else
  if ((core_id == 1) | (numThreads == 1)) {
#endif
    ITA_copyInput(ita_data.k, pSrcK, S, E, K_offset);
  }

  // WIESP: All ITA cores fetch the Q and K vector always from the address
  // specified to core 0, hence we must make sure that this is valid
  if (core_id == 0)
    ITA_SetStartAddress(ita_inst[core_id], (uint32_t)pBuf[core_id]);

  mempool_barrier(numThreads);

  mempool_stop_benchmark();
  mempool_start_benchmark();

  mempool_timer_t instr_init = 0, instr_end = 0;
  mempool_timer_t timer_init = 0, timer_end = 0;

  if (core_id < ita_h) {
    // Configure ITA cores
    ITA_SetShape(ita_inst[core_id], S, E, P);
    ITA_SetStartAddress(ita_inst[core_id], (uint32_t)pBuf[core_id]);
    ITA_SetOutAddress(ita_inst[core_id], (uint32_t)(pDst + core_id * S * E));
    ITA_SetRQSAddress(ita_inst[core_id], (uint32_t)quant_params[core_id]);
    ITA_SetIter(ita_inst[core_id], 1);

    // Run one iteration
    instr_init = read_csr(minstret);
    timer_init = read_csr(mcycle);

    ITA_Start(ita_inst[core_id]);

    while (!ITA_IsDone(ita_inst[core_id])) {
      mempool_wait(16);
    }
  }
  mempool_barrier(numThreads);

  mempool_stop_benchmark();
  mempool_start_benchmark();
  if (core_id < ita_h) {
    timer_end = read_csr(mcycle);
    instr_end = read_csr(minstret);
    dump_timer_cycle(timer_end - timer_init);
    dump_timer_instr(instr_end - instr_init - 2);

    // Add offset to output matrix
    if (output_offset != 0) {
      for (i = 0; i < S; ++i) {
        for (j = 0; j < E; ++j) {
          pDst[core_id * S * E + i * E + j] += output_offset;
        }
      }
    }
  }
}

/* =====================================================================
 * Title: MatMul_s16.c
 * Description:
 *
 * Date: 29.11.2022
 *
 * ===================================================================== */

/*
 * Copyright (C) 2022 ETH Zurich and University of Bologna.
 *
 * Authors:
 * - Samuel Riedel, ETH Zurich
 * - Sergio Mazzola, ETH Zurich
 * - Philip Wiese, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DeeployMath.h"

// int16 matrix multiply C = A * B (A: M x N, B: N x P, C: M x P, int32
// accumulation), unrolled 2x2. Row pairs are distributed over the cores.
// NOTE(review): rows i+1, columns j+1 and depth k+1 are accessed
// unconditionally, so M, N and P must all be even — confirm callers
// guarantee this.
void MatMul_unrolled_2x2_parallel_s16_rv32im(int16_t const *__restrict__ pSrcA,
                                             int16_t const *__restrict__ pSrcB,
                                             int32_t *__restrict__ pDstC,
                                             uint32_t M, uint32_t N, uint32_t P,
                                             uint32_t core_id,
                                             uint32_t numThreads) {
  // Parallelize by assigning each core one row
  uint32_t const c = 1; // How many columns to split the matrix into
  uint32_t const c_start = (P / c) * (core_id % c);
  uint32_t const c_end = (P / c) * ((core_id % c) + 1);
  for (uint32_t i = 2 * (core_id / c); i < M; i += 2 * (numThreads / c)) {
    for (uint32_t j = c_start; j < c_end; j += 2) {
      int32_t c00 = 0;
      int32_t c01 = 0;
      int32_t c10 = 0;
      int32_t c11 = 0;
      for (uint32_t k = 0; k < N; k += 2) {
        // Explicitly load the values first to help with scheduling
        int16_t val_a00 = pSrcA[(i + 0) * N + k + 0];
        int16_t val_a01 = pSrcA[(i + 0) * N + k + 1];
        int16_t val_a10 = pSrcA[(i + 1) * N + k + 0];
        int16_t val_a11 = pSrcA[(i + 1) * N + k + 1];
        int16_t val_b00 = pSrcB[(k + 0) * P + j + 0];
        int16_t val_b01 = pSrcB[(k + 0) * P + j + 1];
        int16_t val_b10 = pSrcB[(k + 1) * P + j + 0];
        int16_t val_b11 = pSrcB[(k + 1) * P + j + 1];
        c00 += val_a00 * val_b00;
        c00 += val_a01 * val_b10;
        c01 += val_a00 * val_b01;
        c01 += val_a01 * val_b11;
        c10 += val_a10 * val_b00;
        c10 += val_a11 * val_b10;
        c11 += val_a10 * val_b01;
        c11 += val_a11 * val_b11;
      }
      pDstC[(i + 0) * P + j + 0] = c00;
      pDstC[(i + 0) * P + j + 1] = c01;
      pDstC[(i + 1) * P + j + 0] = c10;
      pDstC[(i + 1) * P + j + 1] = c11;
    }
  }
}

#ifdef __XPULPIMG

// 4x2-unrolled int16 matmul using Xpulpv2 SIMD (__SUMDOTP2 on v2s pairs).
// Each core processes column pairs k with stride numThreads. The loops use
// floor division (M / 4, N / 2, P / 2), so remainder rows/columns/depth are
// not processed — dimensions are expected to be multiples of 4/2/2.
void MatMul_unrolled_4x2_parallel_s16_xpulpv2(int16_t const *__restrict__ pSrcA,
                                              int16_t const *__restrict__ pSrcB,
                                              int32_t *__restrict__ pDstC,
                                              uint32_t M, uint32_t N,
                                              uint32_t P, uint32_t core_id,
                                              uint32_t numThreads) {
  uint32_t i = 0; // loop counter for M
  uint32_t j = 0; // loop counter for N
  uint32_t k = 0; // loop counter for P

  for (k = core_id; k < P / 2; k += numThreads) {
    for (i = 0; i < M / 4; i++) {
      int32_t sum00 = 0;
      int32_t sum01 = 0;
      int32_t sum10 = 0;
      int32_t sum11 = 0;
      int32_t sum20 = 0;
      int32_t sum21 = 0;
      int32_t sum30 = 0;
      int32_t sum31 = 0;

      for (j = 0; j < N / 2; j++) {
        // Four consecutive rows of A, two columns at a time.
        v2s aVec0 = *((v2s *)&(pSrcA[(i * 4) * N + (j * 2)]));
        v2s aVec1 = *((v2s *)&(pSrcA[(i * 4 + 1) * N + (j * 2)]));
        v2s aVec2 = *((v2s *)&(pSrcA[(i * 4 + 2) * N + (j * 2)]));
        v2s aVec3 = *((v2s *)&(pSrcA[(i * 4 + 3) * N + (j * 2)]));

        v2s bTemp0 = *((v2s *)&(pSrcB[(j * 2) * P + (k * 2)]));
        v2s bTemp1 = *((v2s *)&(pSrcB[(j * 2 + 1) * P + (k * 2)]));

        // Shuffle the row-major B pairs into per-column vectors.
        v2s bVec0 = __builtin_shuffle(bTemp0, bTemp1, (v2s){0, 2});
        v2s bVec1 = __builtin_shuffle(bTemp0, bTemp1, (v2s){1, 3});

        sum00 = __SUMDOTP2(aVec0, bVec0, sum00);
        sum01 = __SUMDOTP2(aVec0, bVec1, sum01);
        sum10 = __SUMDOTP2(aVec1, bVec0, sum10);
        sum11 = __SUMDOTP2(aVec1, bVec1, sum11);
        sum20 = __SUMDOTP2(aVec2, bVec0, sum20);
        sum21 = __SUMDOTP2(aVec2, bVec1, sum21);
        sum30 = __SUMDOTP2(aVec3, bVec0, sum30);
        sum31 = __SUMDOTP2(aVec3, bVec1, sum31);
      }

      pDstC[(i * 4) * P + (k * 2)] = sum00;
      pDstC[(i * 4) * P + (k * 2 + 1)] = sum01;
      pDstC[(i * 4 + 1) * P + (k * 2)] = sum10;
      pDstC[(i * 4 + 1) * P + (k * 2 + 1)] = sum11;
      pDstC[(i * 4 + 2) * P + (k * 2)] = sum20;
      pDstC[(i * 4 + 2) * P + (k * 2 + 1)] = sum21;
      pDstC[(i * 4 + 3) * P + (k * 2)] = sum30;
      pDstC[(i * 4 + 3) * P + (k * 2 + 1)] = sum31;
    }
  }
}

// Same 4x2 kernel, but the index arithmetic is replaced by hand-scheduled
// Xpulpv2 post-increment loads/stores (p.lw/p.sw with "!" address update)
// in inline asm. Pointer walk order is critical; see the commented C
// equivalents next to each asm block.
void MatMul_unrolled_4x2_pincr_asm_parallel_s16_xpulpv2(
    int16_t const *__restrict__ pSrcA, int16_t const *__restrict__ pSrcB,
    int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
    uint32_t core_id, uint32_t numThreads) {
  // Loop counter for P
  uint32_t k = 0;
  // Increment for A matrix = 1 row forward
  uint32_t const A_incr = N * sizeof(int16_t);
  // Decrement for A matrix = 3 rows backward and 2 words forward
  int32_t const A_decr =
      -(int)(N * 3 * sizeof(int16_t)) + 2 * (int)sizeof(int16_t);
  // Increment for B matrix = 1 row forward
  uint32_t const B_incr = P * sizeof(int16_t); // bytes in 1 row
  // Increment for C matrix = 1 row forward and 1 word backward
  uint32_t const C_incr = (P * sizeof(int32_t)) - sizeof(int32_t);

  for (k = core_id; k < P / 2; k += numThreads) {
    const int16_t *idx_a = &pSrcA[0];     // start_a
    int32_t *idx_c = &pDstC[k * 2];       // start_c
    int32_t const *end_c = &pDstC[P * M]; // actually (P * M) + (k * 2)

    while (idx_c < end_c) {
      int32_t sum00 = 0;
      int32_t sum01 = 0;
      int32_t sum10 = 0;
      int32_t sum11 = 0;
      int32_t sum20 = 0;
      int32_t sum21 = 0;
      int32_t sum30 = 0;
      int32_t sum31 = 0;

      int16_t const *end_a = idx_a + N;
      const int16_t *idx_b = &pSrcB[k * 2]; // start_b

      while (idx_a < end_a) {
        v2s aVec0, aVec1, aVec2, aVec3;
        v2s bTemp0, bTemp1;

        __asm__ volatile(
            "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t"
            "p.lw %[a1], %[a_incr](%[addr_a]!) \n\t"
            "p.lw %[a2], %[a_incr](%[addr_a]!) \n\t"
            "p.lw %[a3], %[a_decr](%[addr_a]!) \n\t"
            "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t"
            "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t"
            : [a0] "=&r"(aVec0), [a1] "=&r"(aVec1), [a2] "=&r"(aVec2),
              [a3] "=&r"(aVec3), [t0] "=&r"(bTemp0), [t1] "=&r"(bTemp1),
              [addr_a] "+&r"(idx_a), [addr_b] "+&r"(idx_b)
            : [a_incr] "r"(A_incr), [a_decr] "r"(A_decr), [b_incr] "r"(B_incr)
            : "memory");
        /* The asm code above implements the following commented C code */
        // v2s aVec0 = *((v2s *)&(pSrcA[(i * 4) * N + (j * 2)]));
        // v2s aVec1 = *((v2s *)&(pSrcA[(i * 4 + 1) * N + (j * 2)]));
        // v2s aVec2 = *((v2s *)&(pSrcA[(i * 4 + 2) * N + (j * 2)]));
        // v2s aVec3 = *((v2s *)&(pSrcA[(i * 4 + 3) * N + (j * 2)]));
        // v2s bTemp0 = *((v2s *)&(pSrcB[(j * 2) * P + (k * 2)]));
        // v2s bTemp1 = *((v2s *)&(pSrcB[(j * 2 + 1) * P + (k * 2)]));

        v2s bVec0 = __builtin_shuffle(bTemp0, bTemp1, (v2s){0, 2});
        v2s bVec1 = __builtin_shuffle(bTemp0, bTemp1, (v2s){1, 3});

        sum00 = __SUMDOTP2(aVec0, bVec0, sum00);
        sum01 = __SUMDOTP2(aVec0, bVec1, sum01);
        sum10 = __SUMDOTP2(aVec1, bVec0, sum10);
        sum11 = __SUMDOTP2(aVec1, bVec1, sum11);
        sum20 = __SUMDOTP2(aVec2, bVec0, sum20);
        sum21 = __SUMDOTP2(aVec2, bVec1, sum21);
        sum30 = __SUMDOTP2(aVec3, bVec0, sum30);
        sum31 = __SUMDOTP2(aVec3, bVec1, sum31);
      }

      __asm__ volatile(
          "p.sw %[s00], 4(%[addr_c]!) \n\t"
          "p.sw %[s01], %[c_incr](%[addr_c]!) \n\t"
          "p.sw %[s10], 4(%[addr_c]!) \n\t"
          "p.sw %[s11], %[c_incr](%[addr_c]!) \n\t"
          "p.sw %[s20], 4(%[addr_c]!) \n\t"
          "p.sw %[s21], %[c_incr](%[addr_c]!) \n\t"
          "p.sw %[s30], 4(%[addr_c]!) \n\t"
          "p.sw %[s31], %[c_incr](%[addr_c]!) \n\t"
          : [addr_c] "+&r"(idx_c)
          : [s00] "r"(sum00), [s01] "r"(sum01), [s10] "r"(sum10),
            [s11] "r"(sum11), [s20] "r"(sum20), [s21] "r"(sum21),
            [s30] "r"(sum30), [s31] "r"(sum31), [c_incr] "r"(C_incr)
          : "memory");
      /* The asm code above implements the following commented C code */
      // pDstC[(i * 4) * P + (k * 2)] = sum00;
      // pDstC[(i * 4) * P + (k * 2 + 1)] = sum01;
      // pDstC[(i * 4 + 1) * P + (k * 2)] = sum10;
      // pDstC[(i * 4 + 1) * P + (k * 2 + 1)] = sum11;
      // pDstC[(i * 4 + 2) * P + (k * 2)] = sum20;
      // pDstC[(i * 4 + 2) * P + (k * 2 + 1)] = sum21;
      // pDstC[(i * 4 + 3) * P + (k * 2)] = sum30;
      // pDstC[(i * 4 + 3) * P + (k * 2 + 1)] = sum31;

      // Advance A to the next block of four rows (the asm left idx_a one
      // row past the first of the current four).
      idx_a += N * 3;
    }
  }
}

#endif //__XPULPIMG

/* =====================================================================
 * Title: MatMul_s32.c
 * Description:
 *
 * Date: 29.11.2022
 *
 * ===================================================================== */

/*
 * Copyright (C) 2022 ETH Zurich and University of Bologna.
 *
 * Authors:
 * - Samuel Riedel, ETH Zurich
 * - Sergio Mazzola, ETH Zurich
 * - Philip Wiese, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DeeployMath.h"

// int32 matrix multiply C = A * B (A: M x N, B: N x P), unrolled 2x2, row
// pairs distributed over the cores.
// NOTE(review): rows i+1, columns j+1 and depth k+1 are accessed
// unconditionally, so M, N and P must all be even — confirm callers
// guarantee this.
void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA,
                                             int32_t const *__restrict__ pSrcB,
                                             int32_t *__restrict__ pDstC,
                                             uint32_t M, uint32_t N, uint32_t P,
                                             uint32_t core_id,
                                             uint32_t numThreads) {
  // Parallelize by assigning each core one row
  uint32_t const c = 1; // How many columns to split the matrix into
  uint32_t const c_start = (P / c) * (core_id % c);
  uint32_t const c_end = (P / c) * ((core_id % c) + 1);
  for (uint32_t i = 2 * (core_id / c); i < M; i += 2 * (numThreads / c)) {
    for (uint32_t j = c_start; j < c_end; j += 2) {
      int32_t c00 = 0;
      int32_t c01 = 0;
      int32_t c10 = 0;
      int32_t c11 = 0;
      for (uint32_t k = 0; k < N; k += 2) {
        // Explicitly load the values first to help with scheduling
        int32_t val_a00 = pSrcA[(i + 0) * N + k + 0];
        int32_t val_a01 = pSrcA[(i + 0) * N + k + 1];
        int32_t val_a10 = pSrcA[(i + 1) * N + k + 0];
        int32_t val_a11 = pSrcA[(i + 1) * N + k + 1];
        int32_t val_b00 = pSrcB[(k + 0) * P + j + 0];
        int32_t val_b01 = pSrcB[(k + 0) * P + j + 1];
        int32_t val_b10 = pSrcB[(k + 1) * P + j + 0];
        int32_t val_b11 = pSrcB[(k + 1) * P + j + 1];
        c00 += val_a00 * val_b00;
        c00 += val_a01 * val_b10;
        c01 += val_a00 * val_b01;
        c01 += val_a01 * val_b11;
        c10 += val_a10 * val_b00;
        c10 += val_a11 * val_b10;
        c11 += val_a10 * val_b01;
        c11 += val_a11 * val_b11;
      }
      pDstC[(i + 0) * P + j + 0] = c00;
      pDstC[(i + 0) * P + j + 1] = c01;
      pDstC[(i + 1) * P + j + 0] = c10;
      pDstC[(i + 1) * P + j + 1] = c11;
    }
  }
}

#ifdef __XPULPIMG

// Same 2x2 kernel with the scalar loads/stores replaced by Xpulpv2
// post-increment p.lw/p.sw inline asm; c = 8 additionally splits the
// columns over groups of cores. See the commented C equivalents next to
// each asm block for the access pattern.
void MatMul_unrolled_2x2_parallel_s32_xpulpv2(int32_t const *__restrict__ pSrcA,
                                              int32_t const *__restrict__ pSrcB,
                                              int32_t *__restrict__ pDstC,
                                              uint32_t M, uint32_t N,
                                              uint32_t P, uint32_t core_id,
                                              uint32_t numThreads) {
  // Parallelize by assigning each core one row
  uint32_t const c = 8; // How many columns to split the matrix into
  uint32_t const c_start = (P / c) * (core_id % c);
  uint32_t const c_end = (P / c) * ((core_id % c) + 1);

  // Byte increments: one row forward minus one word (the first word of a
  // pair is fetched with a plain +4 post-increment).
  uint32_t const A_incr = (N * sizeof(int32_t)) - sizeof(int32_t);
  uint32_t const B_incr = (P * sizeof(int32_t)) - sizeof(int32_t);

  for (uint32_t i = 2 * (core_id / c); i < M; i += 2 * (numThreads / c)) {
    for (uint32_t j = c_start; j < c_end; j += 2) {
      int32_t c00 = 0;
      int32_t c01 = 0;
      int32_t c10 = 0;
      int32_t c11 = 0;

      for (uint32_t k = 0; k < N; k += 2) {
        const int32_t *idx_a = &pSrcA[i * N + k];
        const int32_t *idx_b = &pSrcB[k * P + j];
        int32_t val_a00, val_a01, val_a10, val_a11, val_b00, val_b01, val_b10,
            val_b11;
        __asm__ volatile(
            "p.lw %[a00], 4(%[addr_a]!) \n\t"
            "p.lw %[a01], %[a_incr](%[addr_a]!) \n\t"
            "p.lw %[a10], 4(%[addr_a]!) \n\t"
            "p.lw %[a11], 0(%[addr_a]) \n\t"
            "p.lw %[b00], 4(%[addr_b]!) \n\t"
            "p.lw %[b01], %[b_incr](%[addr_b]!) \n\t"
            "p.lw %[b10], 4(%[addr_b]!) \n\t"
            "p.lw %[b11], 0(%[addr_b]) \n\t"
            : [a00] "=&r"(val_a00), [a01] "=&r"(val_a01), [a10] "=&r"(val_a10),
              [a11] "=&r"(val_a11), [b00] "=&r"(val_b00), [b01] "=&r"(val_b01),
              [b10] "=&r"(val_b10), [b11] "=&r"(val_b11), [addr_a] "+&r"(idx_a),
              [addr_b] "+&r"(idx_b)
            : [a_incr] "r"(A_incr), [b_incr] "r"(B_incr)
            : "memory");
        /* The asm code above implements the following commented C code */
        // int32_t val_a00 =pSrcA[(i + 0) * N + k + 0];
        // int32_t val_a01 =pSrcA[(i + 0) * N + k + 1];
        // int32_t val_a10 =pSrcA[(i + 1) * N + k + 0];
        // int32_t val_a11 =pSrcA[(i + 1) * N + k + 1];
        // int32_t val_b00 =pSrcB[(k + 0) * P + j + 0];
        // int32_t val_b01 =pSrcB[(k + 0) * P + j + 1];
        // int32_t val_b10 =pSrcB[(k + 1) * P + j + 0];
        // int32_t val_b11 =pSrcB[(k + 1) * P + j + 1];
        c00 += val_a00 * val_b00;
        c00 += val_a01 * val_b10;
        c01 += val_a00 * val_b01;
        c01 += val_a01 * val_b11;
        c10 += val_a10 * val_b00;
        c10 += val_a11 * val_b10;
        c11 += val_a10 * val_b01;
        c11 += val_a11 * val_b11;
      }
      int32_t *idx_c = &pDstC[i * P + j];
      __asm__ volatile("p.sw %[s00], 4(%[addr_c]!) \n\t"
                       "p.sw %[s01], %[c_incr](%[addr_c]!) \n\t"
                       "p.sw %[s10], 4(%[addr_c]!) \n\t"
                       "p.sw %[s11], 0(%[addr_c]) \n\t"
                       : [addr_c] "+&r"(idx_c)
                       : [s00] "r"(c00), [s01] "r"(c01), [s10] "r"(c10),
                         [s11] "r"(c11), [c_incr] "r"(B_incr)
                       : "memory");
      /* The asm code above implements the following commented C code */
      // pDstC[(i + 0) * P + j + 0] = c00;
      // pDstC[(i + 0) * P + j + 1] = c01;
      // pDstC[(i + 1) * P + j + 0] = c10;
      // pDstC[(i + 1) * P + j + 1] = c11;
    }
  }
}
#endif //__XPULPIMG

/* =====================================================================
 * Title: MatMul_s8.c
 * Description:
 *
 * Date: 29.11.2022
 *
 * ===================================================================== */

/*
 * Copyright (C) 2022 ETH Zurich and University of Bologna.
 *
 * Authors:
 * - Samuel Riedel, ETH Zurich
 * - Sergio Mazzola, ETH Zurich
 * - Philip Wiese, ETH Zurich
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DeeployMath.h"

// int8 matrix multiply with offsets: C = (A + A_offset) * (B + B_offset)
// + output_offset, int32 accumulation. A: M x N, B: N x P, C: M x P.
// Output rows are distributed round-robin over the cores.
void MatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
                               int8_t const *__restrict__ pSrcB,
                               int32_t *__restrict__ pDstC, uint32_t M,
                               uint32_t N, uint32_t P, int32_t A_offset,
                               int32_t B_offset, int32_t output_offset,
                               uint32_t core_id, uint32_t numThreads) {
  // Parallelize by assigning each core one row
  uint32_t const c = 1; // How many columns to split the matrix into
  uint32_t const c_start = (P / c) * (core_id % c);
  uint32_t const c_end = (P / c) * ((core_id % c) + 1);

  for (uint32_t i = core_id / c; i < M; i += numThreads / c) {
    for (uint32_t j = c_start; j < c_end; ++j) {
      int32_t sum = 0;
      for (uint32_t k = 0; k < N; ++k) {
        sum += (int32_t)(pSrcA[i * N + k] + A_offset) *
               (pSrcB[k * P + j] + B_offset);
      }
      pDstC[i * P + j] = sum + output_offset;
    }
  }
}

// 2x2-unrolled int8 matmul without offsets.
// NOTE(review): rows i+1, columns j+1 and depth k+1 are accessed
// unconditionally, so M, N and P must all be even — confirm callers
// guarantee this.
void MatMul_unrolled_2x2_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA,
                                            int8_t const *__restrict__ pSrcB,
                                            int32_t *__restrict__ pDstC,
                                            uint32_t M, uint32_t N, uint32_t P,
                                            uint32_t core_id,
                                            uint32_t numThreads) {
  // Parallelize by assigning each core one row
  uint32_t const c = 1; // How many columns to split the matrix into
  uint32_t const c_start = (P / c) * (core_id % c);
  uint32_t const c_end = (P / c) * ((core_id % c) + 1);
  for (uint32_t i = 2 * (core_id / c); i < M; i += 2 * (numThreads / c)) {
    for (uint32_t j = c_start; j < c_end; j += 2) {
      int32_t c00 = 0;
      int32_t c01 = 0;
      int32_t c10 = 0;
      int32_t c11 = 0;
      for (uint32_t k = 0; k < N; k += 2) {
        // Explicitly load the values first to help with scheduling
        int8_t val_a00 = (int8_t)(pSrcA[(i + 0) * N + k + 0]);
        int8_t val_a01 = (int8_t)(pSrcA[(i + 0) * N + k + 1]);
        int8_t val_a10 = (int8_t)(pSrcA[(i + 1) * N + k + 0]);
        int8_t val_a11 = (int8_t)(pSrcA[(i + 1) * N + k + 1]);
        int8_t val_b00 = (int8_t)(pSrcB[(k + 0) * P + j + 0]);
        int8_t val_b01 = (int8_t)(pSrcB[(k + 0) * P + j + 1]);
        int8_t val_b10 = (int8_t)(pSrcB[(k + 1) * P + j + 0]);
        int8_t val_b11 = (int8_t)(pSrcB[(k + 1) * P + j + 1]);
        c00 += val_a00 * val_b00;
        c00 += val_a01 * val_b10;
        c01 += val_a00 * val_b01;
        c01 += val_a01 * val_b11;
        c10 += val_a10 * val_b00;
        c10 += val_a11 * val_b10;
        c11 += val_a10 * val_b01;
        c11 += val_a11 * val_b11;
      }
      pDstC[(i + 0) * P + j + 0] = c00;
      pDstC[(i + 0) * P + j + 1] = c01;
      pDstC[(i + 1) * P + j + 0] = c10;
      pDstC[(i + 1) * P + j + 1] = c11;
    }
  }
}

// 2x2-unrolled int8 matmul with input/output offsets (even M/N/P assumed,
// as above).
// NOTE(review): (value + offset) is truncated back to int8 before the
// multiply, so offsets that push a value outside [-128, 127] wrap —
// presumably guaranteed not to happen by the quantization scheme; confirm.
void MatMul_offset_unrolled_2x2_parallel_s8_rv32im(
    int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
    int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
    int32_t A_offset, int32_t B_offset, int32_t output_offset, uint32_t core_id,
    uint32_t numThreads) {
  // Parallelize by assigning each core one row
  uint32_t const c = 1; // How many columns to split the matrix into
  uint32_t const c_start = (P / c) * (core_id % c);
  uint32_t const c_end = (P / c) * ((core_id % c) + 1);
  for (uint32_t i = 2 * (core_id / c); i < M; i += 2 * (numThreads / c)) {
    for (uint32_t j = c_start; j < c_end; j += 2) {
      int32_t c00 = 0;
      int32_t c01 = 0;
      int32_t c10 = 0;
      int32_t c11 = 0;
      for (uint32_t k = 0; k < N; k += 2) {
        // Explicitly load the values first to help with scheduling
        int8_t val_a00 = (int8_t)(pSrcA[(i + 0) * N + k + 0] + A_offset);
        int8_t val_a01 = (int8_t)(pSrcA[(i + 0) * N + k + 1] + A_offset);
        int8_t val_a10 = (int8_t)(pSrcA[(i + 1) * N + k + 0] + A_offset);
        int8_t val_a11 = (int8_t)(pSrcA[(i + 1) * N + k + 1] + A_offset);
        int8_t val_b00 = (int8_t)(pSrcB[(k + 0) * P + j + 0] + B_offset);
        int8_t val_b01 = (int8_t)(pSrcB[(k + 0) * P + j + 1] + B_offset);
        int8_t val_b10 = (int8_t)(pSrcB[(k + 1) * P + j + 0] + B_offset);
        int8_t val_b11 = (int8_t)(pSrcB[(k + 1) * P + j + 1] + B_offset);
        c00 += val_a00 * val_b00;
        c00 += val_a01 * val_b10;
        c01 += val_a00 * val_b01;
        c01 += val_a01 * val_b11;
        c10 += val_a10 * val_b00;
        c10 += val_a11 * val_b10;
        c11 += val_a10 * val_b01;
        c11 += val_a11 * val_b11;
      }
      pDstC[(i + 0) * P + j + 0] = c00 + output_offset;
      pDstC[(i + 0) * P + j + 1] = c01 + output_offset;
      pDstC[(i + 1) * P + j + 0] = c10 + output_offset;
      pDstC[(i + 1) * P + j + 1] = c11 + output_offset;
    }
  }
}

#ifdef __XPULPIMG

// Serial 2x4-unrolled int8 matmul using Xpulpv2 SIMD (__SUMDOTP4 on v4s).
// Offsets are applied vector-wise with __ADD4; the accumulators are seeded
// with output_offset instead of adding it at the end. The shuffle cascade
// transposes a 4x4 tile of B into per-column vectors. Floor divisions
// (M / 2, P / 4, N / 4) mean remainder rows/columns/depth are skipped.
void MatMul_unrolled_2x4_s8_xpulpv2(int8_t const *__restrict__ pSrcA,
                                    int8_t const *__restrict__ pSrcB,
                                    int32_t *__restrict__ pDstC, uint32_t M,
                                    uint32_t N, uint32_t P, int32_t A_offset,
                                    int32_t B_offset, int32_t output_offset) {
  static v4s mask0 = {0, 1, 4, 5};
  static v4s mask1 = {2, 3, 6, 7};
  static v4s mask2 = {0, 2, 4, 6};
  static v4s mask3 = {1, 3, 5, 7};

  uint32_t i = 0; // loop counter for M
  uint32_t j = 0; // loop counter for N
  uint32_t k = 0; // loop counter for P

  v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset,
                    (int8_t)A_offset};
  v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset,
                    (int8_t)B_offset};

  for (i = 0; i < M / 2; i++) {
    for (k = 0; k < P / 4; k++) {
      int32_t sum00 = output_offset;
      int32_t sum01 = output_offset;
      int32_t sum02 = output_offset;
      int32_t sum03 = output_offset;
      int32_t sum10 = output_offset;
      int32_t sum11 = output_offset;
      int32_t sum12 = output_offset;
      int32_t sum13 = output_offset;

      for (j = 0; j < N / 4; j++) {
        v4s aVec0 = *((v4s *)&(pSrcA[(i * 2) * N + (j * 4)]));
        v4s aVec1 = *((v4s *)&(pSrcA[(i * 2 + 1) * N + (j * 4)]));

        aVec0 = __ADD4(aVec0, aVecOffset);
        aVec1 = __ADD4(aVec1, aVecOffset);

        v4s temp0 = *((v4s *)&(pSrcB[(j * 4) * P + (k * 4)]));
        v4s temp1 = *((v4s *)&(pSrcB[(j * 4 + 1) * P + (k * 4)]));
        v4s temp2 = *((v4s *)&(pSrcB[(j * 4 + 2) * P + (k * 4)]));
        v4s temp3 = *((v4s *)&(pSrcB[(j * 4 + 3) * P + (k * 4)]));

        v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5
        v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13
        v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7
        v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 10,11,14,15

        v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12
        v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13
        v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14
        v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15

        bVec0 = __ADD4(bVec0, bVecOffset);
        bVec1 = __ADD4(bVec1, bVecOffset);
        bVec2 = __ADD4(bVec2, bVecOffset);
        bVec3 = __ADD4(bVec3, bVecOffset);

        sum00 = __SUMDOTP4(aVec0, bVec0, sum00);
        sum01 = __SUMDOTP4(aVec0, bVec1, sum01);
        sum02 = __SUMDOTP4(aVec0, bVec2, sum02);
        sum03 = __SUMDOTP4(aVec0, bVec3, sum03);
        sum10 = __SUMDOTP4(aVec1, bVec0, sum10);
        sum11 = __SUMDOTP4(aVec1, bVec1, sum11);
        sum12 = __SUMDOTP4(aVec1, bVec2, sum12);
        sum13 = __SUMDOTP4(aVec1, bVec3, sum13);
      }

      pDstC[(i * 2) * P + (k * 4)] = sum00;
      pDstC[(i * 2) * P + (k * 4 + 1)] = sum01;
      pDstC[(i * 2) * P + (k * 4 + 2)] = sum02;
      pDstC[(i * 2) * P + (k * 4 + 3)] = sum03;
      pDstC[(i * 2 + 1) * P + (k * 4)] = sum10;
      pDstC[(i * 2 + 1) * P + (k * 4 + 1)] = sum11;
      pDstC[(i * 2 + 1) * P + (k * 4 + 2)] = sum12;
      pDstC[(i * 2 + 1) * P + (k * 4 + 3)] = sum13;
    }
  }
}

// Parallel variant of the 2x4 SIMD kernel: column groups k are distributed
// over the cores with stride numThreads. (Definition continues past this
// chunk.)
void MatMul_unrolled_2x4_parallel_s8_xpulpv2(
    int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB,
    int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
    int32_t A_offset, int32_t B_offset, int32_t output_offset, uint32_t core_id,
    uint32_t numThreads) {
  static v4s mask0 = {0, 1, 4, 5};
  static v4s mask1 = {2, 3, 6, 7};
  static v4s mask2 = {0, 2, 4, 6};
  static v4s mask3 = {1, 3, 5, 7};

  uint32_t i = 0; // loop counter for M
  uint32_t j = 0; // loop counter for N
  uint32_t k = 0; // loop counter for P

  v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset,
                    (int8_t)A_offset};
  v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset,
                    (int8_t)B_offset};

  for (k = core_id; k < P / 4; k += numThreads) {
    for (i
= 0; i < M / 2; i++) { + int32_t sum00 = output_offset; + int32_t sum01 = output_offset; + int32_t sum02 = output_offset; + int32_t sum03 = output_offset; + int32_t sum10 = output_offset; + int32_t sum11 = output_offset; + int32_t sum12 = output_offset; + int32_t sum13 = output_offset; + + for (j = 0; j < N / 4; j++) { + v4s aVec0 = *((v4s *)&(pSrcA[(i * 2) * N + (j * 4)])); + v4s aVec1 = *((v4s *)&(pSrcA[(i * 2 + 1) * N + (j * 4)])); + + aVec0 = __ADD4(aVec0, aVecOffset); + aVec1 = __ADD4(aVec1, aVecOffset); + + v4s temp0 = *((v4s *)&(pSrcB[(j * 4) * P + (k * 4)])); + v4s temp1 = *((v4s *)&(pSrcB[(j * 4 + 1) * P + (k * 4)])); + v4s temp2 = *((v4s *)&(pSrcB[(j * 4 + 2) * P + (k * 4)])); + v4s temp3 = *((v4s *)&(pSrcB[(j * 4 + 3) * P + (k * 4)])); + + v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5 + v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13 + v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7 + v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 3,7,11,15 + + v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12 + v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13 + v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14 + v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15 + + bVec0 = __ADD4(bVec0, bVecOffset); + bVec1 = __ADD4(bVec1, bVecOffset); + bVec2 = __ADD4(bVec2, bVecOffset); + bVec3 = __ADD4(bVec3, bVecOffset); + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + } + + pDstC[(i * 2) * P + (k * 4)] = sum00 + output_offset; + pDstC[(i * 2) * P + (k * 4 + 1)] = sum01 + output_offset; + pDstC[(i * 2) * P + (k * 4 + 2)] = sum02 + output_offset; + pDstC[(i * 2) * P + (k * 4 + 
3)] = sum03 + output_offset; + pDstC[(i * 2 + 1) * P + (k * 4)] = sum10 + output_offset; + pDstC[(i * 2 + 1) * P + (k * 4 + 1)] = sum11 + output_offset; + pDstC[(i * 2 + 1) * P + (k * 4 + 2)] = sum12 + output_offset; + pDstC[(i * 2 + 1) * P + (k * 4 + 3)] = sum13 + output_offset; + } + } +} + +void MatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + uint32_t core_id, uint32_t numThreads) { + // Masks for shuffles + static v4s mask0 = {0, 1, 4, 5}; + static v4s mask1 = {2, 3, 6, 7}; + static v4s mask2 = {0, 2, 4, 6}; + static v4s mask3 = {1, 3, 5, 7}; + + // Loop counter for P + uint32_t k = 0; + // Row decrement for A matrix + int32_t const N_decr = -(int)N + 4; + // Row increment for C matrix + uint32_t const P_incr = (P * 4) - 12; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + int32_t *idx_c = &pDstC[k * 4]; // start_c + int32_t const *end_c = &pDstC[P * M]; // actually (P * M) + (k * 4) + while (idx_c < end_c) { + int32_t sum00 = 0; + int32_t sum01 = 0; + int32_t sum02 = 0; + int32_t sum03 = 0; + int32_t sum10 = 0; + int32_t sum11 = 0; + int32_t sum12 = 0; + int32_t sum13 = 0; + + int8_t const *end_a = idx_a + N; + const int8_t *idx_b = &pSrcB[k * 4]; // start_b + while (idx_a < end_a) { + v4s aVec0, aVec1; + + v4s temp0, temp1, temp2, temp3; + + __asm__ volatile( + "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[a1], %[a_decr](%[addr_a]!) \n\t" + "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t3], %[b_incr](%[addr_b]!) 
\n\t" + : [a0] "=&r"(aVec0), [a1] "=&r"(aVec1), [t0] "=&r"(temp0), + [t1] "=&r"(temp1), [t2] "=&r"(temp2), [t3] "=&r"(temp3), + [addr_a] "+&r"(idx_a), [addr_b] "+&r"(idx_b) + : [a_incr] "r"(N), [a_decr] "r"(N_decr), [b_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aVec0 = *((v4s *)idx_a); idx_a += N; + // go to previous row, one column forward + // v4s aVec1 = *((v4s *)idx_a); idx_a -= N - 4; + // v4s temp0 = *((v4s *)idx_b); idx_b += P; + // v4s temp1 = *((v4s *)idx_b); idx_b += P; + // v4s temp2 = *((v4s *)idx_b); idx_b += P; + // v4s temp3 = *((v4s *)idx_b); idx_b += P; + + // Shuffles to transpose at runtime the chunk extracted from B before + // multiplying with A chunk temp0-3 variables needed because shuffles + // use rD as source, but also modify it, thus we need a copy of their + // content to use it twice in their original form + v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5 + v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13 + v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7 + v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 3,7,11,15 + + v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12 + v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13 + v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14 + v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15 + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + } + + __asm__ volatile( + "p.sw %[s00], 4(%[addr_c]!) \n\t" + "p.sw %[s01], 4(%[addr_c]!) \n\t" + "p.sw %[s02], 4(%[addr_c]!) \n\t" + "p.sw %[s03], %[c_incr](%[addr_c]!) 
\n\t" + "p.sw %[s10], 4(%[addr_c]!) \n\t" + "p.sw %[s11], 4(%[addr_c]!) \n\t" + "p.sw %[s12], 4(%[addr_c]!) \n\t" + "p.sw %[s13], %[c_incr](%[addr_c]!) \n\t" + : [addr_c] "+&r"(idx_c) + : [s00] "r"(sum00), [s01] "r"(sum01), [s02] "r"(sum02), + [s03] "r"(sum03), [s10] "r"(sum10), [s11] "r"(sum11), + [s12] "r"(sum12), [s13] "r"(sum13), [c_incr] "r"(P_incr) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_c++) = sum00; + // *(idx_c++) = sum01; + // *(idx_c++) = sum02; + // *(idx_c) = sum03; idx_c += P - 3; + // *(idx_c++) = sum10; + // *(idx_c++) = sum11; + // *(idx_c++) = sum12; + // *(idx_c) = sum13; idx_c += P - 3; + + idx_a += N; // adjust A matrix pointer + } + } +} + +void MatMul_offset_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t A_offset, int32_t B_offset, int32_t output_offset, uint32_t core_id, + uint32_t numThreads) { + // Masks for shuffles + static v4s mask0 = {0, 1, 4, 5}; + static v4s mask1 = {2, 3, 6, 7}; + static v4s mask2 = {0, 2, 4, 6}; + static v4s mask3 = {1, 3, 5, 7}; + + // Loop counter for P + uint32_t k = 0; + // Row decrement for A matrix + int32_t const N_decr = -(int)N + 4; + // Row increment for C matrix + uint32_t const P_incr = (P * 4) - 12; + + v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset, + (int8_t)A_offset}; + v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset, + (int8_t)B_offset}; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + int32_t *idx_c = &pDstC[k * 4]; // start_c + int32_t const *end_c = &pDstC[P * M]; // actually (P * M) + (k * 4) + while (idx_c < end_c) { + int32_t sum00 = output_offset; + int32_t sum01 = output_offset; + int32_t sum02 = output_offset; + int32_t sum03 = output_offset; + int32_t sum10 = output_offset; + int32_t sum11 = 
output_offset; + int32_t sum12 = output_offset; + int32_t sum13 = output_offset; + + int8_t const *end_a = idx_a + N; + const int8_t *idx_b = &pSrcB[k * 4]; // start_b + while (idx_a < end_a) { + v4s aVec0, aVec1; + + v4s temp0, temp1, temp2, temp3; + + __asm__ volatile( + "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[a1], %[a_decr](%[addr_a]!) \n\t" + "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t3], %[b_incr](%[addr_b]!) \n\t" + : [a0] "=&r"(aVec0), [a1] "=&r"(aVec1), [t0] "=&r"(temp0), + [t1] "=&r"(temp1), [t2] "=&r"(temp2), [t3] "=&r"(temp3), + [addr_a] "+&r"(idx_a), [addr_b] "+&r"(idx_b) + : [a_incr] "r"(N), [a_decr] "r"(N_decr), [b_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aVec0 = *((v4s *)idx_a); idx_a += N; + // go to previous row, one column forward + // v4s aVec1 = *((v4s *)idx_a); idx_a -= N - 4; + // v4s temp0 = *((v4s *)idx_b); idx_b += P; + // v4s temp1 = *((v4s *)idx_b); idx_b += P; + // v4s temp2 = *((v4s *)idx_b); idx_b += P; + // v4s temp3 = *((v4s *)idx_b); idx_b += P; + aVec0 = __ADD4(aVec0, aVecOffset); + aVec1 = __ADD4(aVec1, aVecOffset); + + // Shuffles to transpose at runtime the chunk extracted from B before + // multiplying with A chunk temp0-3 variables needed because shuffles + // use rD as source, but also modify it, thus we need a copy of their + // content to use it twice in their original form + v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5 + v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13 + v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7 + v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 3,7,11,15 + + v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12 + v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13 + v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 
2,6,10,14 + v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15 + + bVec0 = __ADD4(bVec0, bVecOffset); + bVec1 = __ADD4(bVec1, bVecOffset); + bVec2 = __ADD4(bVec2, bVecOffset); + bVec3 = __ADD4(bVec3, bVecOffset); + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + } + + __asm__ volatile( + "p.sw %[s00], 4(%[addr_c]!) \n\t" + "p.sw %[s01], 4(%[addr_c]!) \n\t" + "p.sw %[s02], 4(%[addr_c]!) \n\t" + "p.sw %[s03], %[c_incr](%[addr_c]!) \n\t" + "p.sw %[s10], 4(%[addr_c]!) \n\t" + "p.sw %[s11], 4(%[addr_c]!) \n\t" + "p.sw %[s12], 4(%[addr_c]!) \n\t" + "p.sw %[s13], %[c_incr](%[addr_c]!) \n\t" + : [addr_c] "+&r"(idx_c) + : [s00] "r"(sum00), [s01] "r"(sum01), [s02] "r"(sum02), + [s03] "r"(sum03), [s10] "r"(sum10), [s11] "r"(sum11), + [s12] "r"(sum12), [s13] "r"(sum13), [c_incr] "r"(P_incr) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_c++) = sum00; + // *(idx_c++) = sum01; + // *(idx_c++) = sum02; + // *(idx_c) = sum03; idx_c += P - 3; + // *(idx_c++) = sum10; + // *(idx_c++) = sum11; + // *(idx_c++) = sum12; + // *(idx_c) = sum13; idx_c += P - 3; + + idx_a += N; // adjust A matrix pointer + } + } +} +#endif //__XPULPIMG diff --git a/TargetLibraries/MemPool/src/MaxPool_s8.c b/TargetLibraries/MemPool/src/MaxPool_s8.c new file mode 100644 index 0000000..94e3f61 --- /dev/null +++ b/TargetLibraries/MemPool/src/MaxPool_s8.c @@ -0,0 +1,96 @@ +/* ===================================================================== + * Title: MaxPool_s8.c + * Description: + * + * Date: 13.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. 
+ * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except pSrcA compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to pSrcA writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployMath.h" + +void MaxPool2d_parallel_s8_NCHW_rv32im(int8_t const *__restrict__ pSrcA, + uint32_t C, uint32_t H, uint32_t W, + uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, int8_t *__restrict__ pDstC, + int32_t input_offset, + int32_t output_offset, uint32_t core_id, + uint32_t numThreads) { + // Parallelize along W output columns + uint32_t start = 0; + uint32_t end = 0; + + // WIESEP: For now assume padding=0 + uint32_t H_out = (H - P) / SP + 1; + uint32_t W_out = (W - Q) / SQ + 1; + uint32_t div = W_out / numThreads; + uint32_t rem = W_out % numThreads; + + if (core_id < W_out) { + start = div * core_id; + end = div * (core_id + 1); + } else { + return; + } + + // printf("H_out: %3ld, W_out: %3ld ", H_out, W_out); + // printf("DIV : %3ld, REM : %3ld ", div, rem); + // printf("start: %3ld, end : %3ld\r\n", start, end); + + start += core_id < rem ? core_id : rem; + end += core_id < rem ? 
core_id + 1 : rem; + + uint32_t c = 0; // input channel loop counter + uint32_t h = 0; // input row loop counter + uint32_t w = 0; // input column loop counter + + uint32_t p = 0; // kernel row loop counter + uint32_t q = 0; // kernel column loop counter + + int32_t max; + int32_t volatile tmp; + for (c = 0; c < C; ++c) { + for (h = 0; h < H_out; ++h) { + for (w = start; w < end; ++w) { + max = -128; + // printf("(%2d,%2d,%2d) ", c, h, w); + for (p = 0; p < P; ++p) { + for (q = 0; q < Q; ++q) { + tmp = (int32_t)(pSrcA[c * H * W + (h * SP + p) * W + (w * SQ + q)] + + input_offset); + if (tmp > max) { + // printf("%4d > %4d, ", tmp, max); + max = tmp; + } + // else { + // printf("%4d <= %-4d, ", tmp, max); + // } + } + } + // printf(" -> %d\r\n", max); + pDstC[c * H_out * W_out + h * W_out + w] = + (int8_t)(max + output_offset); + } + } + } +} diff --git a/TargetLibraries/MemPool/src/RQGemm_s8.c b/TargetLibraries/MemPool/src/RQGemm_s8.c new file mode 100644 index 0000000..f983ef5 --- /dev/null +++ b/TargetLibraries/MemPool/src/RQGemm_s8.c @@ -0,0 +1,1251 @@ +/* ===================================================================== + * Title: RQGemm_s8.c + * Description: + * + * Date: 16.05.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployMath.h" +void RQGemm_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, int8_t output_min, int8_t output_max, uint32_t core_id, + uint32_t numThreads) { + + // Parallelize by assigning each core one row + uint32_t const c = 1; // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (core_id % c); + uint32_t const c_end = (P / c) * ((core_id % c) + 1); + + const int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + const int32_t bias = beta * C_offset; + + int32_t _add = add[0]; + int32_t _mul = mul[0]; + + if (transA == 0 && transB == 0) { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + if (per_row_quant) { + _mul = mul[m]; + _add = add[m]; + } + for (uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[n * P + p] + B_offset); + } + // Requantize value + sum = alpha * sum + beta * pSrcC[m * P + p] + bias; + sum = sum * _mul + rqs_bias + _add; + sum = (sum >> log2D) + Y_offset; + pDstY[m * P + p] = (int8_t)CLAMP(sum, output_min, output_max); + } + } + } else if (transA == 1 && transB == 0) { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + if (per_row_quant) { + _mul = mul[m]; + _add = add[m]; + } + for (uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[n * P + p] + B_offset); + } + // Requantize value + sum = alpha * sum + 
beta * pSrcC[m * P + p] + bias; + sum = sum * _mul + rqs_bias + _add; + sum = (sum >> log2D) + Y_offset; + pDstY[m * P + p] = (int8_t)CLAMP(sum, output_min, output_max); + } + } + } else if (transA == 0 && transB == 1) { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + if (per_row_quant) { + _mul = mul[m]; + _add = add[m]; + } + for (uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[m * N + n] + A_offset) * + (pSrcB[p * N + n] + B_offset); + } + // Requantize value + sum = alpha * sum + beta * pSrcC[m * P + p] + bias; + sum = sum * _mul + rqs_bias + _add; + sum = (sum >> log2D) + Y_offset; + pDstY[m * P + p] = (int8_t)CLAMP(sum, output_min, output_max); + } + } + } else { + for (uint32_t m = core_id / c; m < M; m += numThreads / c) { + if (per_row_quant) { + _mul = mul[m]; + _add = add[m]; + } + for (uint32_t p = c_start; p < c_end; ++p) { + int32_t sum = 0; + for (uint32_t n = 0; n < N; ++n) { + sum += (int32_t)(pSrcA[n * M + m] + A_offset) * + (pSrcB[p * N + n] + B_offset); + } + // Requantize value + sum = alpha * sum + beta * pSrcC[m * P + p] + bias; + sum = sum * _mul + rqs_bias + _add; + sum = (sum >> log2D) + Y_offset; + pDstY[m * P + p] = (int8_t)CLAMP(sum, output_min, output_max); + } + } + } +} + +void RQGemm_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, uint32_t core_id, uint32_t numThreads) { + + // Parallelize by assigning each core one row + uint32_t const c = 1; // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (core_id % c); + uint32_t const c_end = (P / 
c) * ((core_id % c) + 1); + + const int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + const int32_t bias = beta * C_offset; + + int32_t _add0 = add[0]; + int32_t _add1 = add[0]; + int32_t _mul0 = mul[0]; + int32_t _mul1 = mul[0]; + + if (transA == 0 && transB == 0) { + for (uint32_t m = 2 * (core_id / c); m < M; m += 2 * (numThreads / c)) { + if (per_row_quant) { + _mul0 = mul[m + 0]; + _mul1 = mul[m + 1]; + _add0 = add[m + 0]; + _add1 = add[m + 1]; + } + for (uint32_t p = c_start; p < c_end; p += 2) { + int32_t c00 = 0; + int32_t c01 = 0; + int32_t c10 = 0; + int32_t c11 = 0; + for (uint32_t n = 0; n < N; n += 2) { + // Explicitly load the values first to help with scheduling + int8_t val_a00 = (int8_t)(pSrcA[(m + 0) * N + n + 0] + A_offset); + int8_t val_a01 = (int8_t)(pSrcA[(m + 0) * N + n + 1] + A_offset); + int8_t val_a10 = (int8_t)(pSrcA[(m + 1) * N + n + 0] + A_offset); + int8_t val_a11 = (int8_t)(pSrcA[(m + 1) * N + n + 1] + A_offset); + int8_t val_b00 = (int8_t)(pSrcB[(n + 0) * P + p + 0] + B_offset); + int8_t val_b01 = (int8_t)(pSrcB[(n + 0) * P + p + 1] + B_offset); + int8_t val_b10 = (int8_t)(pSrcB[(n + 1) * P + p + 0] + B_offset); + int8_t val_b11 = (int8_t)(pSrcB[(n + 1) * P + p + 1] + B_offset); + c00 += val_a00 * val_b00; + c00 += val_a01 * val_b10; + c01 += val_a00 * val_b01; + c01 += val_a01 * val_b11; + c10 += val_a10 * val_b00; + c10 += val_a11 * val_b10; + c11 += val_a10 * val_b01; + c11 += val_a11 * val_b11; + } + + c00 = c00 * alpha + beta * pSrcC[(m + 0) * P + p + 0] + bias; + c01 = c01 * alpha + beta * pSrcC[(m + 0) * P + p + 1] + bias; + c10 = c10 * alpha + beta * pSrcC[(m + 1) * P + p + 0] + bias; + c11 = c11 * alpha + beta * pSrcC[(m + 1) * P + p + 1] + bias; + + c00 = c00 * _mul0 + rqs_bias + _add0; + c01 = c01 * _mul0 + rqs_bias + _add0; + c10 = c10 * _mul1 + rqs_bias + _add1; + c11 = c11 * _mul1 + rqs_bias + _add1; + + c00 = (c00 >> log2D) + Y_offset; + c01 = (c01 >> log2D) + Y_offset; + c10 = (c10 >> log2D) + Y_offset; + c11 = 
(c11 >> log2D) + Y_offset; + + pDstY[(m + 0) * P + p + 0] = (int8_t)CLAMP(c00, -128, 127); + pDstY[(m + 0) * P + p + 1] = (int8_t)CLAMP(c01, -128, 127); + pDstY[(m + 1) * P + p + 0] = (int8_t)CLAMP(c10, -128, 127); + pDstY[(m + 1) * P + p + 1] = (int8_t)CLAMP(c11, -128, 127); + } + } + } else if (transA == 1 && transB == 0) { + for (uint32_t m = 2 * (core_id / c); m < M; m += 2 * (numThreads / c)) { + if (per_row_quant) { + _mul0 = mul[m + 0]; + _mul1 = mul[m + 1]; + _add0 = add[m + 0]; + _add1 = add[m + 1]; + } + for (uint32_t p = c_start; p < c_end; p += 2) { + int32_t c00 = 0; + int32_t c01 = 0; + int32_t c10 = 0; + int32_t c11 = 0; + for (uint32_t n = 0; n < N; n += 2) { + // Explicitly load the values first to help with scheduling + int8_t val_a00 = (int8_t)(pSrcA[(n + 0) * M + m + 0] + A_offset); + int8_t val_a01 = (int8_t)(pSrcA[(n + 1) * M + m + 0] + A_offset); + int8_t val_a10 = (int8_t)(pSrcA[(n + 0) * M + m + 1] + A_offset); + int8_t val_a11 = (int8_t)(pSrcA[(n + 1) * M + m + 1] + A_offset); + int8_t val_b00 = (int8_t)(pSrcB[(n + 0) * P + p + 0] + B_offset); + int8_t val_b01 = (int8_t)(pSrcB[(n + 0) * P + p + 1] + B_offset); + int8_t val_b10 = (int8_t)(pSrcB[(n + 1) * P + p + 0] + B_offset); + int8_t val_b11 = (int8_t)(pSrcB[(n + 1) * P + p + 1] + B_offset); + c00 += val_a00 * val_b00; + c00 += val_a01 * val_b10; + c01 += val_a00 * val_b01; + c01 += val_a01 * val_b11; + c10 += val_a10 * val_b00; + c10 += val_a11 * val_b10; + c11 += val_a10 * val_b01; + c11 += val_a11 * val_b11; + } + + c00 = c00 * alpha + beta * pSrcC[(m + 0) * P + p + 0] + bias; + c01 = c01 * alpha + beta * pSrcC[(m + 0) * P + p + 1] + bias; + c10 = c10 * alpha + beta * pSrcC[(m + 1) * P + p + 0] + bias; + c11 = c11 * alpha + beta * pSrcC[(m + 1) * P + p + 1] + bias; + + c00 = c00 * _mul0 + rqs_bias + _add0; + c01 = c01 * _mul0 + rqs_bias + _add0; + c10 = c10 * _mul1 + rqs_bias + _add1; + c11 = c11 * _mul1 + rqs_bias + _add1; + + c00 = (c00 >> log2D) + Y_offset; + c01 = (c01 >> log2D) + 
Y_offset; + c10 = (c10 >> log2D) + Y_offset; + c11 = (c11 >> log2D) + Y_offset; + + pDstY[(m + 0) * P + p + 0] = (int8_t)CLAMP(c00, -128, 127); + pDstY[(m + 0) * P + p + 1] = (int8_t)CLAMP(c01, -128, 127); + pDstY[(m + 1) * P + p + 0] = (int8_t)CLAMP(c10, -128, 127); + pDstY[(m + 1) * P + p + 1] = (int8_t)CLAMP(c11, -128, 127); + } + } + } else if (transA == 0 && transB == 1) { + for (uint32_t m = 2 * (core_id / c); m < M; m += 2 * (numThreads / c)) { + if (per_row_quant) { + _mul0 = mul[m + 0]; + _mul1 = mul[m + 1]; + _add0 = add[m + 0]; + _add1 = add[m + 1]; + } + for (uint32_t p = c_start; p < c_end; p += 2) { + int32_t c00 = 0; + int32_t c01 = 0; + int32_t c10 = 0; + int32_t c11 = 0; + for (uint32_t n = 0; n < N; n += 2) { + // Explicitly load the values first to help with scheduling + int8_t val_a00 = (int8_t)(pSrcA[(m + 0) * N + n + 0] + A_offset); + int8_t val_a01 = (int8_t)(pSrcA[(m + 0) * N + n + 1] + A_offset); + int8_t val_a10 = (int8_t)(pSrcA[(m + 1) * N + n + 0] + A_offset); + int8_t val_a11 = (int8_t)(pSrcA[(m + 1) * N + n + 1] + A_offset); + int8_t val_b00 = (int8_t)(pSrcB[(p + 0) * N + n + 0] + B_offset); + int8_t val_b01 = (int8_t)(pSrcB[(p + 1) * N + n + 0] + B_offset); + int8_t val_b10 = (int8_t)(pSrcB[(p + 0) * N + n + 1] + B_offset); + int8_t val_b11 = (int8_t)(pSrcB[(p + 1) * N + n + 1] + B_offset); + c00 += val_a00 * val_b00; + c00 += val_a01 * val_b10; + c01 += val_a00 * val_b01; + c01 += val_a01 * val_b11; + c10 += val_a10 * val_b00; + c10 += val_a11 * val_b10; + c11 += val_a10 * val_b01; + c11 += val_a11 * val_b11; + } + + c00 = c00 * alpha + beta * pSrcC[(m + 0) * P + p + 0] + bias; + c01 = c01 * alpha + beta * pSrcC[(m + 0) * P + p + 1] + bias; + c10 = c10 * alpha + beta * pSrcC[(m + 1) * P + p + 0] + bias; + c11 = c11 * alpha + beta * pSrcC[(m + 1) * P + p + 1] + bias; + + c00 = c00 * _mul0 + rqs_bias + _add0; + c01 = c01 * _mul0 + rqs_bias + _add0; + c10 = c10 * _mul1 + rqs_bias + _add1; + c11 = c11 * _mul1 + rqs_bias + _add1; + + c00 
= (c00 >> log2D) + Y_offset; + c01 = (c01 >> log2D) + Y_offset; + c10 = (c10 >> log2D) + Y_offset; + c11 = (c11 >> log2D) + Y_offset; + + pDstY[(m + 0) * P + p + 0] = (int8_t)CLAMP(c00, -128, 127); + pDstY[(m + 0) * P + p + 1] = (int8_t)CLAMP(c01, -128, 127); + pDstY[(m + 1) * P + p + 0] = (int8_t)CLAMP(c10, -128, 127); + pDstY[(m + 1) * P + p + 1] = (int8_t)CLAMP(c11, -128, 127); + } + } + } else if (transA == 1 && transB == 1) { + for (uint32_t m = 2 * (core_id / c); m < M; m += 2 * (numThreads / c)) { + if (per_row_quant) { + _mul0 = mul[m + 0]; + _mul1 = mul[m + 1]; + _add0 = add[m + 0]; + _add1 = add[m + 1]; + } + for (uint32_t p = c_start; p < c_end; p += 2) { + int32_t c00 = 0; + int32_t c01 = 0; + int32_t c10 = 0; + int32_t c11 = 0; + for (uint32_t n = 0; n < N; n += 2) { + // Explicitly load the values first to help with scheduling + int8_t val_a00 = (int8_t)(pSrcA[(n + 0) * M + m + 0] + A_offset); + int8_t val_a01 = (int8_t)(pSrcA[(n + 1) * M + m + 0] + A_offset); + int8_t val_a10 = (int8_t)(pSrcA[(n + 0) * M + m + 1] + A_offset); + int8_t val_a11 = (int8_t)(pSrcA[(n + 1) * M + m + 1] + A_offset); + int8_t val_b00 = (int8_t)(pSrcB[(p + 0) * N + n + 0] + B_offset); + int8_t val_b01 = (int8_t)(pSrcB[(p + 1) * N + n + 0] + B_offset); + int8_t val_b10 = (int8_t)(pSrcB[(p + 0) * N + n + 1] + B_offset); + int8_t val_b11 = (int8_t)(pSrcB[(p + 1) * N + n + 1] + B_offset); + c00 += val_a00 * val_b00; + c00 += val_a01 * val_b10; + c01 += val_a00 * val_b01; + c01 += val_a01 * val_b11; + c10 += val_a10 * val_b00; + c10 += val_a11 * val_b10; + c11 += val_a10 * val_b01; + c11 += val_a11 * val_b11; + } + + c00 = c00 * alpha + beta * pSrcC[(m + 0) * P + p + 0] + bias; + c01 = c01 * alpha + beta * pSrcC[(m + 0) * P + p + 1] + bias; + c10 = c10 * alpha + beta * pSrcC[(m + 1) * P + p + 0] + bias; + c11 = c11 * alpha + beta * pSrcC[(m + 1) * P + p + 1] + bias; + + c00 = c00 * _mul0 + rqs_bias + _add0; + c01 = c01 * _mul0 + rqs_bias + _add0; + c10 = c10 * _mul1 + rqs_bias + 
_add1; + c11 = c11 * _mul1 + rqs_bias + _add1; + + c00 = (c00 >> log2D) + Y_offset; + c01 = (c01 >> log2D) + Y_offset; + c10 = (c10 >> log2D) + Y_offset; + c11 = (c11 >> log2D) + Y_offset; + + pDstY[(m + 0) * P + p + 0] = (int8_t)CLAMP(c00, -128, 127); + pDstY[(m + 0) * P + p + 1] = (int8_t)CLAMP(c01, -128, 127); + pDstY[(m + 1) * P + p + 0] = (int8_t)CLAMP(c10, -128, 127); + pDstY[(m + 1) * P + p + 1] = (int8_t)CLAMP(c11, -128, 127); + } + } + } +} + +#ifdef __XPULPIMG +void RQGemm_offset_unrolled_4x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int32_t const *__restrict__ pSrcC, int8_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t P, int32_t alpha, int32_t beta, int32_t transA, + int32_t transB, int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, int32_t C_offset, + int32_t Y_offset, uint32_t core_id, uint32_t numThreads) { + + const int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + const int32_t bias = beta * C_offset; + + const int32_t *idx_add = &add[0]; + const int32_t *idx_mul = &mul[0]; + // Loop counter for P + uint32_t k = 0; + if (transA == 0 && transB == 0) { + // Masks for shuffles + static v4s mask0 = {0, 1, 4, 5}; + static v4s mask1 = {2, 3, 6, 7}; + static v4s mask2 = {0, 2, 4, 6}; + static v4s mask3 = {1, 3, 5, 7}; + + // Row decrement for A matrix + int32_t const N_decr = -(int)N + 4; + // Row increment for C matrix + uint32_t const P_incr = (P * 4) - 12; + + v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset, + (int8_t)A_offset}; + v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset, + (int8_t)B_offset}; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + const int32_t *idx_c = &pSrcC[k * 4]; // start_c + int8_t *idx_y = &pDstY[k * 4]; // start_y + int8_t const *end_y = &pDstY[P * M]; // actually (P * M) + (k * 4) + while 
(idx_y < end_y) { + int32_t sum00 = 0; + int32_t sum01 = 0; + int32_t sum02 = 0; + int32_t sum03 = 0; + int32_t sum10 = 0; + int32_t sum11 = 0; + int32_t sum12 = 0; + int32_t sum13 = 0; + + v4s sum0, sum1; + + int8_t const *end_a = idx_a + N; + const int8_t *idx_b = &pSrcB[k * 4]; // start_b + while (idx_a < end_a) { + v4s aVec0, aVec1; + + v4s temp0, temp1, temp2, temp3; + + __asm__ volatile( + "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[a1], %[a_decr](%[addr_a]!) \n\t" + "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t3], %[b_incr](%[addr_b]!) \n\t" + : [a0] "=&r"(aVec0), [a1] "=&r"(aVec1), [t0] "=&r"(temp0), + [t1] "=&r"(temp1), [t2] "=&r"(temp2), [t3] "=&r"(temp3), + [addr_a] "+&r"(idx_a), [addr_b] "+&r"(idx_b) + : [a_incr] "r"(N), [a_decr] "r"(N_decr), [b_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aVec0 = *((v4s *)idx_a); idx_a += N; + // go to previous row, one column forward + // v4s aVec1 = *((v4s *)idx_a); idx_a -= N - 4; + // v4s temp0 = *((v4s *)idx_b); idx_b += P; + // v4s temp1 = *((v4s *)idx_b); idx_b += P; + // v4s temp2 = *((v4s *)idx_b); idx_b += P; + // v4s temp3 = *((v4s *)idx_b); idx_b += P; + aVec0 = __ADD4(aVec0, aVecOffset); + aVec1 = __ADD4(aVec1, aVecOffset); + + // Shuffles to transpose at runtime the chunk extracted from B before + // multiplying with A chunk temp0-3 variables needed because shuffles + // use rD as source, but also modify it, thus we need a copy of their + // content to use it twice in their original form + v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5 + v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13 + v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7 + v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 3,7,11,15 + + v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12 + 
v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13 + v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14 + v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15 + + bVec0 = __ADD4(bVec0, bVecOffset); + bVec1 = __ADD4(bVec1, bVecOffset); + bVec2 = __ADD4(bVec2, bVecOffset); + bVec3 = __ADD4(bVec3, bVecOffset); + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + } + int32_t bias00, bias01, bias02, bias03; + int32_t bias10, bias11, bias12, bias13; + + __asm__ volatile( + "p.lw %[b00], 4(%[addr_c]!) \n\t" + "p.lw %[b01], 4(%[addr_c]!) \n\t" + "p.lw %[b02], 4(%[addr_c]!) \n\t" + "p.lw %[b03], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b10], 4(%[addr_c]!) \n\t" + "p.lw %[b11], 4(%[addr_c]!) \n\t" + "p.lw %[b12], 4(%[addr_c]!) \n\t" + "p.lw %[b13], %[c_incr](%[addr_c]!) \n\t" + : [b00] "=&r"(bias00), [b01] "=&r"(bias01), [b02] "=&r"(bias02), + [b03] "=&r"(bias03), [b10] "=&r"(bias10), [b11] "=&r"(bias11), + [b12] "=&r"(bias12), [b13] "=&r"(bias13), [addr_c] "+&r"(idx_c) + : [c_incr] "r"(P_incr) + : "memory"); + + sum00 = alpha * sum00 + beta * bias00 + bias; + sum01 = alpha * sum01 + beta * bias01 + bias; + sum02 = alpha * sum02 + beta * bias02 + bias; + sum03 = alpha * sum03 + beta * bias03 + bias; + sum10 = alpha * sum10 + beta * bias10 + bias; + sum11 = alpha * sum11 + beta * bias11 + bias; + sum12 = alpha * sum12 + beta * bias12 + bias; + sum13 = alpha * sum13 + beta * bias13 + bias; + + int32_t _add0, _add1; + int32_t _mul0, _mul1; + if (per_row_quant) { + __asm__ volatile( + "p.lw %[add0], 4(%[addr_add]!) \n\t" + "p.lw %[add1], 4(%[addr_add]!) \n\t" + "p.lw %[mul0], 4(%[addr_mul]!) \n\t" + "p.lw %[mul1], 4(%[addr_mul]!) 
\n\t" + : [add0] "=&r"(_add0), [mul0] "=&r"(_mul0), [add1] "=&r"(_add1), + [mul1] "=&r"(_mul1), [addr_add] "+&r"(idx_add), + [addr_mul] "+&r"(idx_mul)::"memory"); + } else { + _add0 = add[0]; + _add1 = add[0]; + _mul0 = mul[0]; + _mul1 = mul[0]; + } + + sum00 = sum00 * _mul0 + rqs_bias + _add0; + sum01 = sum01 * _mul0 + rqs_bias + _add0; + sum02 = sum02 * _mul0 + rqs_bias + _add0; + sum03 = sum03 * _mul0 + rqs_bias + _add0; + sum10 = sum10 * _mul1 + rqs_bias + _add1; + sum11 = sum11 * _mul1 + rqs_bias + _add1; + sum12 = sum12 * _mul1 + rqs_bias + _add1; + sum13 = sum13 * _mul1 + rqs_bias + _add1; + + sum00 = (sum00 >> log2D) + Y_offset; + sum01 = (sum01 >> log2D) + Y_offset; + sum02 = (sum02 >> log2D) + Y_offset; + sum03 = (sum03 >> log2D) + Y_offset; + sum10 = (sum10 >> log2D) + Y_offset; + sum11 = (sum11 >> log2D) + Y_offset; + sum12 = (sum12 >> log2D) + Y_offset; + sum13 = (sum13 >> log2D) + Y_offset; + + sum0[0] = (int8_t)__CLIP(sum00, 7); + sum0[1] = (int8_t)__CLIP(sum01, 7); + sum0[2] = (int8_t)__CLIP(sum02, 7); + sum0[3] = (int8_t)__CLIP(sum03, 7); + sum1[0] = (int8_t)__CLIP(sum10, 7); + sum1[1] = (int8_t)__CLIP(sum11, 7); + sum1[2] = (int8_t)__CLIP(sum12, 7); + sum1[3] = (int8_t)__CLIP(sum13, 7); + + __asm__ volatile("p.sw %[s0], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s1], %[y_incr](%[addr_y]!) 
\n\t" + : [addr_y] "+&r"(idx_y) + : [s0] "r"(sum0), [s1] "r"(sum1), [y_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_y) = sum0; idx_y += P; + // *(idx_y) = sum1; idx_y += P; + + idx_a += N; // adjust A matrix pointer + } + } + } else if (transA == 1 && transB == 0) { + // Masks for shuffles + static v4s mask0 = {0, 1, 4, 5}; + static v4s mask1 = {2, 3, 6, 7}; + static v4s mask2 = {0, 2, 4, 6}; + static v4s mask3 = {1, 3, 5, 7}; + + // Row increment for C matrix + uint32_t const P_incr = (P * 4) - 12; + + v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset, + (int8_t)A_offset}; + v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset, + (int8_t)B_offset}; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + const int32_t *idx_c = &pSrcC[k * 4]; // start_c + int8_t *idx_y = &pDstY[k * 4]; // start_y + int8_t const *end_y = &pDstY[P * M]; // actually (P * M) + (k * 4) + while (idx_y < end_y) { + int32_t sum00 = 0; + int32_t sum01 = 0; + int32_t sum02 = 0; + int32_t sum03 = 0; + int32_t sum10 = 0; + int32_t sum11 = 0; + int32_t sum12 = 0; + int32_t sum13 = 0; + int32_t sum20 = 0; + int32_t sum21 = 0; + int32_t sum22 = 0; + int32_t sum23 = 0; + int32_t sum30 = 0; + int32_t sum31 = 0; + int32_t sum32 = 0; + int32_t sum33 = 0; + + v4s sum0, sum1, sum2, sum3; + + int8_t const *end_a = idx_a + N * M; + const int8_t *idx_b = &pSrcB[k * 4]; // start_b + while (idx_a < end_a) { + v4s bTemp0, bTemp1, bTemp2, bTemp3; + v4s aTemp0, aTemp1, aTemp2, aTemp3; + + __asm__ volatile( + "p.lw %[at0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[at1], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[at2], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[at3], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[bt0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[bt1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[bt2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[bt3], %[b_incr](%[addr_b]!) 
\n\t" + : [at0] "=&r"(aTemp0), [at1] "=&r"(aTemp1), [at2] "=&r"(aTemp2), + [at3] "=&r"(aTemp3), [bt0] "=&r"(bTemp0), [bt1] "=&r"(bTemp1), + [bt2] "=&r"(bTemp2), [bt3] "=&r"(bTemp3), [addr_a] "+&r"(idx_a), + [addr_b] "+&r"(idx_b) + : [a_incr] "r"(M), [b_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aTemp0 = *((v4s *)idx_a); idx_a += M; + // v4s aTemp1 = *((v4s *)idx_a); idx_a += M; + // v4s aTemp2 = *((v4s *)idx_a); idx_a += M; + // v4s aTemp3 = *((v4s *)idx_a); idx_a += M; + + // v4s bTemp0 = *((v4s *)idx_b); idx_b += P; + // v4s bTemp1 = *((v4s *)idx_b); idx_b += P; + // v4s bTemp2 = *((v4s *)idx_b); idx_b += P; + // v4s bTemp3 = *((v4s *)idx_b); idx_b += P; + + // Shuffles to transpose at runtime the chunk extracted from B before + // multiplying with A + v4s bTemp4 = __builtin_shuffle(bTemp0, bTemp1, mask0); // 0,1,4,5 + v4s bTemp5 = __builtin_shuffle(bTemp2, bTemp3, mask0); // 8,9,12,13 + v4s bTemp6 = __builtin_shuffle(bTemp0, bTemp1, mask1); // 2,3,6,7 + v4s bTemp7 = __builtin_shuffle(bTemp2, bTemp3, mask1); // 3,7,11,15 + + v4s bVec0 = __builtin_shuffle(bTemp4, bTemp5, mask2); // 0,4,8,12 + v4s bVec1 = __builtin_shuffle(bTemp4, bTemp5, mask3); // 1,5,9,13 + v4s bVec2 = __builtin_shuffle(bTemp6, bTemp7, mask2); // 2,6,10,14 + v4s bVec3 = __builtin_shuffle(bTemp6, bTemp7, mask3); // 3,7,11,15 + + // Shuffles to transpose at runtime the chunk extracted from A before + // multiplying with B + v4s aTemp4 = __builtin_shuffle(aTemp0, aTemp1, mask0); // 0,1,4,5 + v4s aTemp5 = __builtin_shuffle(aTemp2, aTemp3, mask0); // 8,9,12,13 + v4s aTemp6 = __builtin_shuffle(aTemp0, aTemp1, mask1); // 2,3,6,7 + v4s aTemp7 = __builtin_shuffle(aTemp2, aTemp3, mask1); // 3,7,11,15 + + v4s aVec0 = __builtin_shuffle(aTemp4, aTemp5, mask2); // 0,4,8,12 + v4s aVec1 = __builtin_shuffle(aTemp4, aTemp5, mask3); // 1,5,9,13 + v4s aVec2 = __builtin_shuffle(aTemp6, aTemp7, mask2); // 2,6,10,14 + v4s 
aVec3 = __builtin_shuffle(aTemp6, aTemp7, mask3); // 3,7,11,15 + + aVec0 = __ADD4(aVec0, aVecOffset); + aVec1 = __ADD4(aVec1, aVecOffset); + aVec2 = __ADD4(aVec2, aVecOffset); + aVec3 = __ADD4(aVec3, aVecOffset); + + bVec0 = __ADD4(bVec0, bVecOffset); + bVec1 = __ADD4(bVec1, bVecOffset); + bVec2 = __ADD4(bVec2, bVecOffset); + bVec3 = __ADD4(bVec3, bVecOffset); + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + sum20 = __SUMDOTP4(aVec2, bVec0, sum20); + sum21 = __SUMDOTP4(aVec2, bVec1, sum21); + sum22 = __SUMDOTP4(aVec2, bVec2, sum22); + sum23 = __SUMDOTP4(aVec2, bVec3, sum23); + sum30 = __SUMDOTP4(aVec3, bVec0, sum30); + sum31 = __SUMDOTP4(aVec3, bVec1, sum31); + sum32 = __SUMDOTP4(aVec3, bVec2, sum32); + sum33 = __SUMDOTP4(aVec3, bVec3, sum33); + } + int32_t bias00, bias01, bias02, bias03; + int32_t bias10, bias11, bias12, bias13; + int32_t bias20, bias21, bias22, bias23; + int32_t bias30, bias31, bias32, bias33; + + __asm__ volatile( + "p.lw %[b00], 4(%[addr_c]!) \n\t" + "p.lw %[b01], 4(%[addr_c]!) \n\t" + "p.lw %[b02], 4(%[addr_c]!) \n\t" + "p.lw %[b03], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b10], 4(%[addr_c]!) \n\t" + "p.lw %[b11], 4(%[addr_c]!) \n\t" + "p.lw %[b12], 4(%[addr_c]!) \n\t" + "p.lw %[b13], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b20], 4(%[addr_c]!) \n\t" + "p.lw %[b21], 4(%[addr_c]!) \n\t" + "p.lw %[b22], 4(%[addr_c]!) \n\t" + "p.lw %[b23], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b30], 4(%[addr_c]!) \n\t" + "p.lw %[b31], 4(%[addr_c]!) \n\t" + "p.lw %[b32], 4(%[addr_c]!) \n\t" + "p.lw %[b33], %[c_incr](%[addr_c]!) 
\n\t" + : [b00] "=&r"(bias00), [b01] "=&r"(bias01), [b02] "=&r"(bias02), + [b03] "=&r"(bias03), [b10] "=&r"(bias10), [b11] "=&r"(bias11), + [b12] "=&r"(bias12), [b13] "=&r"(bias13), [b20] "=&r"(bias20), + [b21] "=&r"(bias21), [b22] "=&r"(bias22), [b23] "=&r"(bias23), + [b30] "=&r"(bias30), [b31] "=&r"(bias31), [b32] "=&r"(bias32), + [b33] "=&r"(bias33), [addr_c] "+&r"(idx_c) + : [c_incr] "r"(P_incr) + : "memory"); + + sum00 = alpha * sum00 + beta * bias00 + bias; + sum01 = alpha * sum01 + beta * bias01 + bias; + sum02 = alpha * sum02 + beta * bias02 + bias; + sum03 = alpha * sum03 + beta * bias03 + bias; + sum10 = alpha * sum10 + beta * bias10 + bias; + sum11 = alpha * sum11 + beta * bias11 + bias; + sum12 = alpha * sum12 + beta * bias12 + bias; + sum13 = alpha * sum13 + beta * bias13 + bias; + sum20 = alpha * sum20 + beta * bias20 + bias; + sum21 = alpha * sum21 + beta * bias21 + bias; + sum22 = alpha * sum22 + beta * bias22 + bias; + sum23 = alpha * sum23 + beta * bias23 + bias; + sum30 = alpha * sum30 + beta * bias30 + bias; + sum31 = alpha * sum31 + beta * bias31 + bias; + sum32 = alpha * sum32 + beta * bias32 + bias; + sum33 = alpha * sum33 + beta * bias33 + bias; + + int32_t _add0, _add1, _add2, _add3; + int32_t _mul0, _mul1, _mul2, _mul3; + if (per_row_quant) { + __asm__ volatile( + "p.lw %[add0], 4(%[addr_add]!) \n\t" + "p.lw %[add1], 4(%[addr_add]!) \n\t" + "p.lw %[add2], 4(%[addr_add]!) \n\t" + "p.lw %[add3], 4(%[addr_add]!) \n\t" + "p.lw %[mul0], 4(%[addr_mul]!) \n\t" + "p.lw %[mul1], 4(%[addr_mul]!) \n\t" + "p.lw %[mul2], 4(%[addr_mul]!) \n\t" + "p.lw %[mul3], 4(%[addr_mul]!) 
\n\t" + : [add0] "=&r"(_add0), [mul0] "=&r"(_mul0), [add1] "=&r"(_add1), + [mul1] "=&r"(_mul1), [add2] "=&r"(_add2), [mul2] "=&r"(_mul2), + [add3] "=&r"(_add3), [mul3] "=&r"(_mul3), + [addr_add] "+&r"(idx_add), [addr_mul] "+&r"(idx_mul)::"memory"); + } else { + _add0 = add[0]; + _add1 = add[0]; + _add2 = add[0]; + _add3 = add[0]; + _mul0 = mul[0]; + _mul1 = mul[0]; + _mul2 = mul[0]; + _mul3 = mul[0]; + } + + sum00 = sum00 * _mul0 + rqs_bias + _add0; + sum01 = sum01 * _mul0 + rqs_bias + _add0; + sum02 = sum02 * _mul0 + rqs_bias + _add0; + sum03 = sum03 * _mul0 + rqs_bias + _add0; + sum10 = sum10 * _mul1 + rqs_bias + _add1; + sum11 = sum11 * _mul1 + rqs_bias + _add1; + sum12 = sum12 * _mul1 + rqs_bias + _add1; + sum13 = sum13 * _mul1 + rqs_bias + _add1; + sum20 = sum20 * _mul2 + rqs_bias + _add2; + sum21 = sum21 * _mul2 + rqs_bias + _add2; + sum22 = sum22 * _mul2 + rqs_bias + _add2; + sum23 = sum23 * _mul2 + rqs_bias + _add2; + sum30 = sum30 * _mul3 + rqs_bias + _add3; + sum31 = sum31 * _mul3 + rqs_bias + _add3; + sum32 = sum32 * _mul3 + rqs_bias + _add3; + sum33 = sum33 * _mul3 + rqs_bias + _add3; + + sum00 = (sum00 >> log2D) + Y_offset; + sum01 = (sum01 >> log2D) + Y_offset; + sum02 = (sum02 >> log2D) + Y_offset; + sum03 = (sum03 >> log2D) + Y_offset; + sum10 = (sum10 >> log2D) + Y_offset; + sum11 = (sum11 >> log2D) + Y_offset; + sum12 = (sum12 >> log2D) + Y_offset; + sum13 = (sum13 >> log2D) + Y_offset; + sum20 = (sum20 >> log2D) + Y_offset; + sum21 = (sum21 >> log2D) + Y_offset; + sum22 = (sum22 >> log2D) + Y_offset; + sum23 = (sum23 >> log2D) + Y_offset; + sum30 = (sum30 >> log2D) + Y_offset; + sum31 = (sum31 >> log2D) + Y_offset; + sum32 = (sum32 >> log2D) + Y_offset; + sum33 = (sum33 >> log2D) + Y_offset; + + sum0[0] = (int8_t)__CLIP(sum00, 7); + sum0[1] = (int8_t)__CLIP(sum01, 7); + sum0[2] = (int8_t)__CLIP(sum02, 7); + sum0[3] = (int8_t)__CLIP(sum03, 7); + sum1[0] = (int8_t)__CLIP(sum10, 7); + sum1[1] = (int8_t)__CLIP(sum11, 7); + sum1[2] = 
(int8_t)__CLIP(sum12, 7); + sum1[3] = (int8_t)__CLIP(sum13, 7); + sum2[0] = (int8_t)__CLIP(sum20, 7); + sum2[1] = (int8_t)__CLIP(sum21, 7); + sum2[2] = (int8_t)__CLIP(sum22, 7); + sum2[3] = (int8_t)__CLIP(sum23, 7); + sum3[0] = (int8_t)__CLIP(sum30, 7); + sum3[1] = (int8_t)__CLIP(sum31, 7); + sum3[2] = (int8_t)__CLIP(sum32, 7); + sum3[3] = (int8_t)__CLIP(sum33, 7); + + __asm__ volatile("p.sw %[s0], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s1], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s2], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s3], %[y_incr](%[addr_y]!) \n\t" + : [addr_y] "+&r"(idx_y) + : [s0] "r"(sum0), [s1] "r"(sum1), [s2] "r"(sum2), + [s3] "r"(sum3), [y_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_y) = sum0; idx_y += P; + // *(idx_y) = sum1; idx_y += P; + // *(idx_y) = sum2; idx_y += P; + // *(idx_y) = sum3; idx_y += P; + + idx_a -= N * M - 4; // adjust A matrix pointer + } + } + } else if (transA == 0 && transB == 1) { + // Row decrement for A matrix + int32_t const N_decr = -(int)N + 4; + int32_t const B_decr = -(int)N * 3 + 4; + // Row increment for C matrix + uint32_t const P_incr = (P * 4) - 12; + + v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset, + (int8_t)A_offset}; + v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset, + (int8_t)B_offset}; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + const int32_t *idx_c = &pSrcC[k * 4]; // start_c + int8_t *idx_y = &pDstY[k * 4]; // start_y + int8_t const *end_y = &pDstY[P * M]; // actually (P * M) + (k * 4) + while (idx_y < end_y) { + int32_t sum00 = 0; + int32_t sum01 = 0; + int32_t sum02 = 0; + int32_t sum03 = 0; + int32_t sum10 = 0; + int32_t sum11 = 0; + int32_t sum12 = 0; + int32_t sum13 = 0; + + v4s sum0, sum1; + + int8_t const *end_a = idx_a + N; + const int8_t *idx_b = &pSrcB[k * 4 * N]; // start_b + while (idx_a < end_a) { + v4s aVec0, aVec1; + + v4s bVec0, 
bVec1, bVec2, bVec3; + + __asm__ volatile( + "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[a1], %[a_decr](%[addr_a]!) \n\t" + "p.lw %[b0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[b1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[b2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[b3], %[b_decr](%[addr_b]!) \n\t" + : [a0] "=&r"(aVec0), [a1] "=&r"(aVec1), [b0] "=&r"(bVec0), + [b1] "=&r"(bVec1), [b2] "=&r"(bVec2), [b3] "=&r"(bVec3), + [addr_a] "+&r"(idx_a), [addr_b] "+&r"(idx_b) + : [a_incr] "r"(N), [a_decr] "r"(N_decr), [b_incr] "r"(N), + [b_decr] "r"(B_decr) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aVec0 = *((v4s *)idx_a); idx_a += N; + // go to previous row, one column forward + // v4s aVec1 = *((v4s *)idx_a); idx_a -= N - 4; + // v4s bVec0 = *((v4s *)idx_b); idx_b += N; + // v4s bVec1 = *((v4s *)idx_b); idx_b += N; + // v4s bVec2 = *((v4s *)idx_b); idx_b += N; + // v4s bVec3 = *((v4s *)idx_b); idx_b -= 3*N - 4; + aVec0 = __ADD4(aVec0, aVecOffset); + aVec1 = __ADD4(aVec1, aVecOffset); + + bVec0 = __ADD4(bVec0, bVecOffset); + bVec1 = __ADD4(bVec1, bVecOffset); + bVec2 = __ADD4(bVec2, bVecOffset); + bVec3 = __ADD4(bVec3, bVecOffset); + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + } + int32_t bias00, bias01, bias02, bias03; + int32_t bias10, bias11, bias12, bias13; + + __asm__ volatile( + "p.lw %[b00], 4(%[addr_c]!) \n\t" + "p.lw %[b01], 4(%[addr_c]!) \n\t" + "p.lw %[b02], 4(%[addr_c]!) \n\t" + "p.lw %[b03], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b10], 4(%[addr_c]!) \n\t" + "p.lw %[b11], 4(%[addr_c]!) \n\t" + "p.lw %[b12], 4(%[addr_c]!) \n\t" + "p.lw %[b13], %[c_incr](%[addr_c]!) 
\n\t" + : [b00] "=&r"(bias00), [b01] "=&r"(bias01), [b02] "=&r"(bias02), + [b03] "=&r"(bias03), [b10] "=&r"(bias10), [b11] "=&r"(bias11), + [b12] "=&r"(bias12), [b13] "=&r"(bias13), [addr_c] "+&r"(idx_c) + : [c_incr] "r"(P_incr) + : "memory"); + + sum00 = alpha * sum00 + beta * bias00 + bias; + sum01 = alpha * sum01 + beta * bias01 + bias; + sum02 = alpha * sum02 + beta * bias02 + bias; + sum03 = alpha * sum03 + beta * bias03 + bias; + sum10 = alpha * sum10 + beta * bias10 + bias; + sum11 = alpha * sum11 + beta * bias11 + bias; + sum12 = alpha * sum12 + beta * bias12 + bias; + sum13 = alpha * sum13 + beta * bias13 + bias; + + int32_t _add0, _add1; + int32_t _mul0, _mul1; + if (per_row_quant) { + __asm__ volatile( + "p.lw %[add0], 4(%[addr_add]!) \n\t" + "p.lw %[add1], 4(%[addr_add]!) \n\t" + "p.lw %[mul0], 4(%[addr_mul]!) \n\t" + "p.lw %[mul1], 4(%[addr_mul]!) \n\t" + : [add0] "=&r"(_add0), [mul0] "=&r"(_mul0), [add1] "=&r"(_add1), + [mul1] "=&r"(_mul1), [addr_add] "+&r"(idx_add), + [addr_mul] "+&r"(idx_mul)::"memory"); + } else { + _add0 = add[0]; + _add1 = add[0]; + _mul0 = mul[0]; + _mul1 = mul[0]; + } + + sum00 = sum00 * _mul0 + rqs_bias + _add0; + sum01 = sum01 * _mul0 + rqs_bias + _add0; + sum02 = sum02 * _mul0 + rqs_bias + _add0; + sum03 = sum03 * _mul0 + rqs_bias + _add0; + sum10 = sum10 * _mul1 + rqs_bias + _add1; + sum11 = sum11 * _mul1 + rqs_bias + _add1; + sum12 = sum12 * _mul1 + rqs_bias + _add1; + sum13 = sum13 * _mul1 + rqs_bias + _add1; + + sum00 = (sum00 >> log2D) + Y_offset; + sum01 = (sum01 >> log2D) + Y_offset; + sum02 = (sum02 >> log2D) + Y_offset; + sum03 = (sum03 >> log2D) + Y_offset; + sum10 = (sum10 >> log2D) + Y_offset; + sum11 = (sum11 >> log2D) + Y_offset; + sum12 = (sum12 >> log2D) + Y_offset; + sum13 = (sum13 >> log2D) + Y_offset; + + sum0[0] = (int8_t)__CLIP(sum00, 7); + sum0[1] = (int8_t)__CLIP(sum01, 7); + sum0[2] = (int8_t)__CLIP(sum02, 7); + sum0[3] = (int8_t)__CLIP(sum03, 7); + sum1[0] = (int8_t)__CLIP(sum10, 7); + sum1[1] = 
(int8_t)__CLIP(sum11, 7); + sum1[2] = (int8_t)__CLIP(sum12, 7); + sum1[3] = (int8_t)__CLIP(sum13, 7); + + __asm__ volatile("p.sw %[s0], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s1], %[y_incr](%[addr_y]!) \n\t" + : [addr_y] "+&r"(idx_y) + : [s0] "r"(sum0), [s1] "r"(sum1), [y_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_y) = sum0; idx_y += P; + // *(idx_y) = sum1; idx_y += P; + + idx_a += N; // adjust A matrix pointer + } + } + } else if (transA == 1 && transB == 1) { + // Masks for shuffles + static v4s mask0 = {0, 1, 4, 5}; + static v4s mask1 = {2, 3, 6, 7}; + static v4s mask2 = {0, 2, 4, 6}; + static v4s mask3 = {1, 3, 5, 7}; + + // Row decrement for A matrix + int32_t const B_decr = -(int)N * 3 + 4; + // Row increment for C matrix + uint32_t const P_incr = (P * 4) - 12; + + v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset, + (int8_t)A_offset}; + v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset, + (int8_t)B_offset}; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + const int32_t *idx_c = &pSrcC[k * 4]; // start_c + int8_t *idx_y = &pDstY[k * 4]; // start_y + int8_t const *end_y = &pDstY[P * M]; // actually (P * M) + (k * 4) + while (idx_y < end_y) { + int32_t sum00 = 0; + int32_t sum01 = 0; + int32_t sum02 = 0; + int32_t sum03 = 0; + int32_t sum10 = 0; + int32_t sum11 = 0; + int32_t sum12 = 0; + int32_t sum13 = 0; + int32_t sum20 = 0; + int32_t sum21 = 0; + int32_t sum22 = 0; + int32_t sum23 = 0; + int32_t sum30 = 0; + int32_t sum31 = 0; + int32_t sum32 = 0; + int32_t sum33 = 0; + + v4s sum0, sum1, sum2, sum3; + + int8_t const *end_a = idx_a + N * M; + const int8_t *idx_b = &pSrcB[k * 4 * N]; // start_b + while (idx_a < end_a) { + + v4s bVec0, bVec1, bVec2, bVec3; + v4s temp0, temp1, temp2, temp3; + + __asm__ volatile( + "p.lw %[at0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[at1], %[a_incr](%[addr_a]!) 
\n\t" + "p.lw %[at2], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[at3], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[bt0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[bt1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[bt2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[bt3], %[b_decr](%[addr_b]!) \n\t" + : [at0] "=&r"(temp0), [at1] "=&r"(temp1), [at2] "=&r"(temp2), + [at3] "=&r"(temp3), [bt0] "=&r"(bVec0), [bt1] "=&r"(bVec1), + [bt2] "=&r"(bVec2), [bt3] "=&r"(bVec3), [addr_a] "+&r"(idx_a), + [addr_b] "+&r"(idx_b) + : [a_incr] "r"(M), [b_incr] "r"(N), [b_decr] "r"(B_decr) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aVec0 = *((v4s *)idx_a); idx_a += M; + // v4s aVec1 = *((v4s *)idx_a); idx_a += M; + // v4s aVec2 = *((v4s *)idx_a); idx_a += M; + // v4s aVec3 = *((v4s *)idx_a); idx_a += M; + // v4s bVec0 = *((v4s *)idx_b); idx_b += P; + // v4s bVec1 = *((v4s *)idx_b); idx_b += P; + // v4s bVec2 = *((v4s *)idx_b); idx_b += P; + // v4s bVec3 = *((v4s *)idx_b); idx_b += P; + + bVec0 = __ADD4(bVec0, bVecOffset); + bVec1 = __ADD4(bVec1, bVecOffset); + bVec2 = __ADD4(bVec2, bVecOffset); + bVec3 = __ADD4(bVec3, bVecOffset); + + // Shuffles to transpose at runtime the chunk extracted from A before + // multiplying with B chunk temp0-3 variables needed because shuffles + // use rD as source, but also modify it, thus we need a copy of their + // content to use it twice in their original form + v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5 + v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13 + v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7 + v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 3,7,11,15 + + v4s aVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12 + v4s aVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13 + v4s aVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14 + v4s aVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15 + + aVec0 = 
__ADD4(aVec0, aVecOffset); + aVec1 = __ADD4(aVec1, aVecOffset); + aVec2 = __ADD4(aVec2, aVecOffset); + aVec3 = __ADD4(aVec3, aVecOffset); + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + sum20 = __SUMDOTP4(aVec2, bVec0, sum20); + sum21 = __SUMDOTP4(aVec2, bVec1, sum21); + sum22 = __SUMDOTP4(aVec2, bVec2, sum22); + sum23 = __SUMDOTP4(aVec2, bVec3, sum23); + sum30 = __SUMDOTP4(aVec3, bVec0, sum30); + sum31 = __SUMDOTP4(aVec3, bVec1, sum31); + sum32 = __SUMDOTP4(aVec3, bVec2, sum32); + sum33 = __SUMDOTP4(aVec3, bVec3, sum33); + } + int32_t bias00, bias01, bias02, bias03; + int32_t bias10, bias11, bias12, bias13; + int32_t bias20, bias21, bias22, bias23; + int32_t bias30, bias31, bias32, bias33; + + __asm__ volatile( + "p.lw %[b00], 4(%[addr_c]!) \n\t" + "p.lw %[b01], 4(%[addr_c]!) \n\t" + "p.lw %[b02], 4(%[addr_c]!) \n\t" + "p.lw %[b03], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b10], 4(%[addr_c]!) \n\t" + "p.lw %[b11], 4(%[addr_c]!) \n\t" + "p.lw %[b12], 4(%[addr_c]!) \n\t" + "p.lw %[b13], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b20], 4(%[addr_c]!) \n\t" + "p.lw %[b21], 4(%[addr_c]!) \n\t" + "p.lw %[b22], 4(%[addr_c]!) \n\t" + "p.lw %[b23], %[c_incr](%[addr_c]!) \n\t" + "p.lw %[b30], 4(%[addr_c]!) \n\t" + "p.lw %[b31], 4(%[addr_c]!) \n\t" + "p.lw %[b32], 4(%[addr_c]!) \n\t" + "p.lw %[b33], %[c_incr](%[addr_c]!) 
\n\t" + : [b00] "=&r"(bias00), [b01] "=&r"(bias01), [b02] "=&r"(bias02), + [b03] "=&r"(bias03), [b10] "=&r"(bias10), [b11] "=&r"(bias11), + [b12] "=&r"(bias12), [b13] "=&r"(bias13), [b20] "=&r"(bias20), + [b21] "=&r"(bias21), [b22] "=&r"(bias22), [b23] "=&r"(bias23), + [b30] "=&r"(bias30), [b31] "=&r"(bias31), [b32] "=&r"(bias32), + [b33] "=&r"(bias33), [addr_c] "+&r"(idx_c) + : [c_incr] "r"(P_incr) + : "memory"); + + sum00 = alpha * sum00 + beta * bias00 + bias; + sum01 = alpha * sum01 + beta * bias01 + bias; + sum02 = alpha * sum02 + beta * bias02 + bias; + sum03 = alpha * sum03 + beta * bias03 + bias; + sum10 = alpha * sum10 + beta * bias10 + bias; + sum11 = alpha * sum11 + beta * bias11 + bias; + sum12 = alpha * sum12 + beta * bias12 + bias; + sum13 = alpha * sum13 + beta * bias13 + bias; + sum20 = alpha * sum20 + beta * bias20 + bias; + sum21 = alpha * sum21 + beta * bias21 + bias; + sum22 = alpha * sum22 + beta * bias22 + bias; + sum23 = alpha * sum23 + beta * bias23 + bias; + sum30 = alpha * sum30 + beta * bias30 + bias; + sum31 = alpha * sum31 + beta * bias31 + bias; + sum32 = alpha * sum32 + beta * bias32 + bias; + sum33 = alpha * sum33 + beta * bias33 + bias; + + int32_t _add0, _add1, _add2, _add3; + int32_t _mul0, _mul1, _mul2, _mul3; + if (per_row_quant) { + __asm__ volatile( + "p.lw %[add0], 4(%[addr_add]!) \n\t" + "p.lw %[add1], 4(%[addr_add]!) \n\t" + "p.lw %[add2], 4(%[addr_add]!) \n\t" + "p.lw %[add3], 4(%[addr_add]!) \n\t" + "p.lw %[mul0], 4(%[addr_mul]!) \n\t" + "p.lw %[mul1], 4(%[addr_mul]!) \n\t" + "p.lw %[mul2], 4(%[addr_mul]!) \n\t" + "p.lw %[mul3], 4(%[addr_mul]!) 
\n\t" + : [add0] "=&r"(_add0), [mul0] "=&r"(_mul0), [add1] "=&r"(_add1), + [mul1] "=&r"(_mul1), [add2] "=&r"(_add2), [mul2] "=&r"(_mul2), + [add3] "=&r"(_add3), [mul3] "=&r"(_mul3), + [addr_add] "+&r"(idx_add), [addr_mul] "+&r"(idx_mul)::"memory"); + } else { + _add0 = add[0]; + _add1 = add[0]; + _add2 = add[0]; + _add3 = add[0]; + _mul0 = mul[0]; + _mul1 = mul[0]; + _mul2 = mul[0]; + _mul3 = mul[0]; + } + + sum00 = sum00 * _mul0 + rqs_bias + _add0; + sum01 = sum01 * _mul0 + rqs_bias + _add0; + sum02 = sum02 * _mul0 + rqs_bias + _add0; + sum03 = sum03 * _mul0 + rqs_bias + _add0; + sum10 = sum10 * _mul1 + rqs_bias + _add1; + sum11 = sum11 * _mul1 + rqs_bias + _add1; + sum12 = sum12 * _mul1 + rqs_bias + _add1; + sum13 = sum13 * _mul1 + rqs_bias + _add1; + sum20 = sum20 * _mul2 + rqs_bias + _add2; + sum21 = sum21 * _mul2 + rqs_bias + _add2; + sum22 = sum22 * _mul2 + rqs_bias + _add2; + sum23 = sum23 * _mul2 + rqs_bias + _add2; + sum30 = sum30 * _mul3 + rqs_bias + _add3; + sum31 = sum31 * _mul3 + rqs_bias + _add3; + sum32 = sum32 * _mul3 + rqs_bias + _add3; + sum33 = sum33 * _mul3 + rqs_bias + _add3; + + sum00 = (sum00 >> log2D) + Y_offset; + sum01 = (sum01 >> log2D) + Y_offset; + sum02 = (sum02 >> log2D) + Y_offset; + sum03 = (sum03 >> log2D) + Y_offset; + sum10 = (sum10 >> log2D) + Y_offset; + sum11 = (sum11 >> log2D) + Y_offset; + sum12 = (sum12 >> log2D) + Y_offset; + sum13 = (sum13 >> log2D) + Y_offset; + sum20 = (sum20 >> log2D) + Y_offset; + sum21 = (sum21 >> log2D) + Y_offset; + sum22 = (sum22 >> log2D) + Y_offset; + sum23 = (sum23 >> log2D) + Y_offset; + sum30 = (sum30 >> log2D) + Y_offset; + sum31 = (sum31 >> log2D) + Y_offset; + sum32 = (sum32 >> log2D) + Y_offset; + sum33 = (sum33 >> log2D) + Y_offset; + + sum0[0] = (int8_t)__CLIP(sum00, 7); + sum0[1] = (int8_t)__CLIP(sum01, 7); + sum0[2] = (int8_t)__CLIP(sum02, 7); + sum0[3] = (int8_t)__CLIP(sum03, 7); + sum1[0] = (int8_t)__CLIP(sum10, 7); + sum1[1] = (int8_t)__CLIP(sum11, 7); + sum1[2] = 
(int8_t)__CLIP(sum12, 7); + sum1[3] = (int8_t)__CLIP(sum13, 7); + sum2[0] = (int8_t)__CLIP(sum20, 7); + sum2[1] = (int8_t)__CLIP(sum21, 7); + sum2[2] = (int8_t)__CLIP(sum22, 7); + sum2[3] = (int8_t)__CLIP(sum23, 7); + sum3[0] = (int8_t)__CLIP(sum30, 7); + sum3[1] = (int8_t)__CLIP(sum31, 7); + sum3[2] = (int8_t)__CLIP(sum32, 7); + sum3[3] = (int8_t)__CLIP(sum33, 7); + + __asm__ volatile("p.sw %[s0], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s1], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s2], %[y_incr](%[addr_y]!) \n\t" + "p.sw %[s3], %[y_incr](%[addr_y]!) \n\t" + : [addr_y] "+&r"(idx_y) + : [s0] "r"(sum0), [s1] "r"(sum1), [s2] "r"(sum2), + [s3] "r"(sum3), [y_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_y) = sum0; idx_y += P; + // *(idx_y) = sum1; idx_y += P; + // *(idx_y) = sum2; idx_y += P; + // *(idx_y) = sum3; idx_y += P; + + idx_a -= N * M - 4; // adjust A matrix pointer + } + } + } +} + +#endif //__XPULPIMG diff --git a/TargetLibraries/MemPool/src/RQMatMul_s8.c b/TargetLibraries/MemPool/src/RQMatMul_s8.c new file mode 100644 index 0000000..8a499c2 --- /dev/null +++ b/TargetLibraries/MemPool/src/RQMatMul_s8.c @@ -0,0 +1,517 @@ +/* ===================================================================== + * Title: RQMatMul_s8.c + * Description: + * + * Date: 24.04.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployMath.h" +void RQMatMul_parallel_s8_rv32im(int8_t const *__restrict__ pSrcA, + int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, + uint32_t N, uint32_t P, int32_t *mul, + int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, + int32_t B_offset, int32_t output_offset, + int8_t output_min, int8_t output_max, + uint32_t core_id, uint32_t numThreads) { + // Parallelize by assigning each core one row + uint32_t const c = 1; // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (core_id % c); + uint32_t const c_end = (P / c) * ((core_id % c) + 1); + + const int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + + int32_t _add = add[0]; + int32_t _mul = mul[0]; + + for (uint32_t i = core_id / c; i < M; i += numThreads / c) { + if (per_row_quant) { + _mul = mul[i]; + _add = add[i]; + } + for (uint32_t j = c_start; j < c_end; ++j) { + int32_t sum = 0; + for (uint32_t k = 0; k < N; ++k) { + sum += (int32_t)(pSrcA[i * N + k] + A_offset) * + (pSrcB[k * P + j] + B_offset); + } + // Requantize value + sum = sum * _mul + rqs_bias + _add; + sum = (sum >> log2D) + output_offset; + pDstC[i * P + j] = (int8_t)CLAMP(sum, output_min, output_max); + } + } +} + +void RQMatMul_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, uint32_t core_id, uint32_t numThreads) { + + const 
int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + + int32_t _add0 = add[0]; + int32_t _add1 = add[0]; + int32_t _mul0 = mul[0]; + int32_t _mul1 = mul[0]; + + // Parallelize by assigning each core one row + uint32_t const c = 1; // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (core_id % c); + uint32_t const c_end = (P / c) * ((core_id % c) + 1); + for (uint32_t i = 2 * (core_id / c); i < M; i += 2 * (numThreads / c)) { + if (per_row_quant) { + _mul0 = mul[i]; + _mul1 = mul[i + 1]; + _add0 = add[i]; + _add1 = add[i + 1]; + } + for (uint32_t j = c_start; j < c_end; j += 2) { + int32_t c00 = 0; + int32_t c01 = 0; + int32_t c10 = 0; + int32_t c11 = 0; + for (uint32_t k = 0; k < N; k += 2) { + // Explicitly load the values first to help with scheduling + int8_t val_a00 = (int8_t)(pSrcA[(i + 0) * N + k + 0]); + int8_t val_a01 = (int8_t)(pSrcA[(i + 0) * N + k + 1]); + int8_t val_a10 = (int8_t)(pSrcA[(i + 1) * N + k + 0]); + int8_t val_a11 = (int8_t)(pSrcA[(i + 1) * N + k + 1]); + int8_t val_b00 = (int8_t)(pSrcB[(k + 0) * P + j + 0]); + int8_t val_b01 = (int8_t)(pSrcB[(k + 0) * P + j + 1]); + int8_t val_b10 = (int8_t)(pSrcB[(k + 1) * P + j + 0]); + int8_t val_b11 = (int8_t)(pSrcB[(k + 1) * P + j + 1]); + c00 += val_a00 * val_b00; + c00 += val_a01 * val_b10; + c01 += val_a00 * val_b01; + c01 += val_a01 * val_b11; + c10 += val_a10 * val_b00; + c10 += val_a11 * val_b10; + c11 += val_a10 * val_b01; + c11 += val_a11 * val_b11; + } + + c00 = c00 * _mul0 + rqs_bias + _add0; + c01 = c01 * _mul0 + rqs_bias + _add0; + c10 = c10 * _mul1 + rqs_bias + _add1; + c11 = c11 * _mul1 + rqs_bias + _add1; + + c00 = (c00 >> log2D); + c01 = (c01 >> log2D); + c10 = (c10 >> log2D); + c11 = (c11 >> log2D); + + pDstC[(i + 0) * P + j + 0] = (int8_t)CLAMP(c00, -128, 127); + pDstC[(i + 0) * P + j + 1] = (int8_t)CLAMP(c01, -128, 127); + pDstC[(i + 1) * P + j + 0] = (int8_t)CLAMP(c10, -128, 127); + pDstC[(i + 1) * P + j + 1] = (int8_t)CLAMP(c11, -128, 127); + } + } +} 
+ +void RQMatMul_offset_unrolled_2x2_parallel_s8_rv32im( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads) { + const int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + + int32_t _add0 = add[0]; + int32_t _add1 = add[0]; + int32_t _mul0 = mul[0]; + int32_t _mul1 = mul[0]; + + // Parallelize by assigning each core one row + uint32_t const c = 1; // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (core_id % c); + uint32_t const c_end = (P / c) * ((core_id % c) + 1); + for (uint32_t i = 2 * (core_id / c); i < M; i += 2 * (numThreads / c)) { + if (per_row_quant) { + _mul0 = mul[i]; + _mul1 = mul[i + 1]; + _add0 = add[i]; + _add1 = add[i + 1]; + } + for (uint32_t j = c_start; j < c_end; j += 2) { + int32_t c00 = 0; + int32_t c01 = 0; + int32_t c10 = 0; + int32_t c11 = 0; + for (uint32_t k = 0; k < N; k += 2) { + // Explicitly load the values first to help with scheduling + int32_t val_a00 = pSrcA[(i + 0) * N + k + 0] + A_offset; + int32_t val_a01 = pSrcA[(i + 0) * N + k + 1] + A_offset; + int32_t val_a10 = pSrcA[(i + 1) * N + k + 0] + A_offset; + int32_t val_a11 = pSrcA[(i + 1) * N + k + 1] + A_offset; + int32_t val_b00 = pSrcB[(k + 0) * P + j + 0] + B_offset; + int32_t val_b01 = pSrcB[(k + 0) * P + j + 1] + B_offset; + int32_t val_b10 = pSrcB[(k + 1) * P + j + 0] + B_offset; + int32_t val_b11 = pSrcB[(k + 1) * P + j + 1] + B_offset; + c00 += val_a00 * val_b00; + c00 += val_a01 * val_b10; + c01 += val_a00 * val_b01; + c01 += val_a01 * val_b11; + c10 += val_a10 * val_b00; + c10 += val_a11 * val_b10; + c11 += val_a10 * val_b01; + c11 += val_a11 * val_b11; + } + + c00 = c00 * _mul0 + rqs_bias + _add0; + c01 = c01 * _mul0 + rqs_bias + _add0; + c10 = c10 * _mul1 + rqs_bias + 
_add1; + c11 = c11 * _mul1 + rqs_bias + _add1; + + c00 = (c00 >> log2D) + output_offset; + c01 = (c01 >> log2D) + output_offset; + c10 = (c10 >> log2D) + output_offset; + c11 = (c11 >> log2D) + output_offset; + + pDstC[(i + 0) * P + j + 0] = (int8_t)CLAMP(c00, -128, 127); + pDstC[(i + 0) * P + j + 1] = (int8_t)CLAMP(c01, -128, 127); + pDstC[(i + 1) * P + j + 0] = (int8_t)CLAMP(c10, -128, 127); + pDstC[(i + 1) * P + j + 1] = (int8_t)CLAMP(c11, -128, 127); + } + } +} + +#ifdef __XPULPIMG +void RQMatMul_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, uint32_t core_id, uint32_t numThreads) { + // Masks for shuffles + static v4s mask0 = {0, 1, 4, 5}; + static v4s mask1 = {2, 3, 6, 7}; + static v4s mask2 = {0, 2, 4, 6}; + static v4s mask3 = {1, 3, 5, 7}; + + const int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + + // Loop counter for P + uint32_t k = 0; + // Row decrement for A matrix + int32_t const N_decr = -(int)N + 4; + // Row increment for C matrix + uint32_t const P_incr = P; + + const int32_t *idx_add = &add[0]; + const int32_t *idx_mul = &mul[0]; + + int32_t _add0 = add[0]; + int32_t _add1 = add[0]; + int32_t _mul0 = mul[0]; + int32_t _mul1 = mul[0]; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + int8_t *idx_c = &pDstC[k * 4]; // start_c + int8_t const *end_c = &pDstC[P * M]; // actually (P * M) + (k * 4) + while (idx_c < end_c) { + int32_t sum00 = 0; + int32_t sum01 = 0; + int32_t sum02 = 0; + int32_t sum03 = 0; + int32_t sum10 = 0; + int32_t sum11 = 0; + int32_t sum12 = 0; + int32_t sum13 = 0; + + v4s sum0, sum1; + + int8_t const *end_a = idx_a + N; + const int8_t *idx_b = &pSrcB[k * 4]; // start_b + while (idx_a < end_a) { + v4s aVec0, aVec1; + + v4s temp0, temp1, temp2, temp3; + + __asm__ 
volatile( + "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[a1], %[a_decr](%[addr_a]!) \n\t" + "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t3], %[b_incr](%[addr_b]!) \n\t" + : [a0] "=&r"(aVec0), [a1] "=&r"(aVec1), [t0] "=&r"(temp0), + [t1] "=&r"(temp1), [t2] "=&r"(temp2), [t3] "=&r"(temp3), + [addr_a] "+&r"(idx_a), [addr_b] "+&r"(idx_b) + : [a_incr] "r"(N), [a_decr] "r"(N_decr), [b_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aVec0 = *((v4s *)idx_a); idx_a += N; + // go to previous row, one column forward + // v4s aVec1 = *((v4s *)idx_a); idx_a -= N - 4; + // v4s temp0 = *((v4s *)idx_b); idx_b += P; + // v4s temp1 = *((v4s *)idx_b); idx_b += P; + // v4s temp2 = *((v4s *)idx_b); idx_b += P; + // v4s temp3 = *((v4s *)idx_b); idx_b += P; + + // Shuffles to transpose at runtime the chunk extracted from B before + // multiplying with A chunk temp0-3 variables needed because shuffles + // use rD as source, but also modify it, thus we need a copy of their + // content to use it twice in their original form + v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5 + v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13 + v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7 + v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 3,7,11,15 + + v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12 + v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13 + v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14 + v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15 + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + 
sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + } + + if (per_row_quant) { + __asm__ volatile( + "p.lw %[add0], 4(%[addr_add]!) \n\t" + "p.lw %[add1], 4(%[addr_add]!) \n\t" + "p.lw %[mul0], 4(%[addr_mul]!) \n\t" + "p.lw %[mul1], 4(%[addr_mul]!) \n\t" + : [add0] "=&r"(_add0), [mul0] "=&r"(_mul0), [add1] "=&r"(_add1), + [mul1] "=&r"(_mul1), [addr_add] "+&r"(idx_add), + [addr_mul] "+&r"(idx_mul)::"memory"); + } + + sum00 = sum00 * _mul0 + rqs_bias + _add0; + sum01 = sum01 * _mul0 + rqs_bias + _add0; + sum02 = sum02 * _mul0 + rqs_bias + _add0; + sum03 = sum03 * _mul0 + rqs_bias + _add0; + sum10 = sum10 * _mul1 + rqs_bias + _add1; + sum11 = sum11 * _mul1 + rqs_bias + _add1; + sum12 = sum12 * _mul1 + rqs_bias + _add1; + sum13 = sum13 * _mul1 + rqs_bias + _add1; + + sum00 = (sum00 >> log2D); + sum01 = (sum01 >> log2D); + sum02 = (sum02 >> log2D); + sum03 = (sum03 >> log2D); + sum10 = (sum10 >> log2D); + sum11 = (sum11 >> log2D); + sum12 = (sum12 >> log2D); + sum13 = (sum13 >> log2D); + + sum0[0] = (int8_t)__CLIP(sum00, 7); + sum0[1] = (int8_t)__CLIP(sum01, 7); + sum0[2] = (int8_t)__CLIP(sum02, 7); + sum0[3] = (int8_t)__CLIP(sum03, 7); + sum1[0] = (int8_t)__CLIP(sum10, 7); + sum1[1] = (int8_t)__CLIP(sum11, 7); + sum1[2] = (int8_t)__CLIP(sum12, 7); + sum1[3] = (int8_t)__CLIP(sum13, 7); + + __asm__ volatile("p.sw %[s0], %[c_incr](%[addr_c]!) \n\t" + "p.sw %[s1], %[c_incr](%[addr_c]!) 
\n\t" + : [addr_c] "+&r"(idx_c) + : [s0] "r"(sum0), [s1] "r"(sum1), [c_incr] "r"(P_incr) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_c) = sum0; idx_c += P; + // *(idx_c) = sum1; idx_c += P; + + idx_a += N; // adjust A matrix pointer + } + } +} + +void RQMatMul_offset_unrolled_2x4_pincr_asm_parallel_s8_xpulpv2( + int8_t const *__restrict__ pSrcA, int8_t const *__restrict__ pSrcB, + int8_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P, + int32_t *mul, int32_t *add, int32_t log2D, bool rounding, + bool per_row_quant, int32_t A_offset, int32_t B_offset, + int32_t output_offset, uint32_t core_id, uint32_t numThreads) { + // Masks for shuffles + static v4s mask0 = {0, 1, 4, 5}; + static v4s mask1 = {2, 3, 6, 7}; + static v4s mask2 = {0, 2, 4, 6}; + static v4s mask3 = {1, 3, 5, 7}; + + const int32_t rqs_bias = ((1 << (log2D - 1))) * rounding; + + // Loop counter for P + uint32_t k = 0; + // Row decrement for A matrix + int32_t const N_decr = -(int)N + 4; + // Row increment for C matrix + uint32_t const P_incr = P; + + const int32_t *idx_add = &add[0]; + const int32_t *idx_mul = &mul[0]; + + int32_t _add0 = add[0]; + int32_t _add1 = add[0]; + int32_t _mul0 = mul[0]; + int32_t _mul1 = mul[0]; + + v4s aVecOffset = {(int8_t)A_offset, (int8_t)A_offset, (int8_t)A_offset, + (int8_t)A_offset}; + v4s bVecOffset = {(int8_t)B_offset, (int8_t)B_offset, (int8_t)B_offset, + (int8_t)B_offset}; + + for (k = core_id; k < P / 4; k += numThreads) { + const int8_t *idx_a = &pSrcA[0]; // start_a + int8_t *idx_c = &pDstC[k * 4]; // start_c + int8_t const *end_c = &pDstC[P * M]; // actually (P * M) + (k * 4) + while (idx_c < end_c) { + int32_t sum00 = 0; + int32_t sum01 = 0; + int32_t sum02 = 0; + int32_t sum03 = 0; + int32_t sum10 = 0; + int32_t sum11 = 0; + int32_t sum12 = 0; + int32_t sum13 = 0; + + v4s sum0, sum1; + + int8_t const *end_a = idx_a + N; + const int8_t *idx_b = &pSrcB[k * 4]; // start_b + while (idx_a < end_a) { + v4s 
aVec0, aVec1; + + v4s temp0, temp1, temp2, temp3; + + __asm__ volatile( + "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t" + "p.lw %[a1], %[a_decr](%[addr_a]!) \n\t" + "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t2], %[b_incr](%[addr_b]!) \n\t" + "p.lw %[t3], %[b_incr](%[addr_b]!) \n\t" + : [a0] "=&r"(aVec0), [a1] "=&r"(aVec1), [t0] "=&r"(temp0), + [t1] "=&r"(temp1), [t2] "=&r"(temp2), [t3] "=&r"(temp3), + [addr_a] "+&r"(idx_a), [addr_b] "+&r"(idx_b) + : [a_incr] "r"(N), [a_decr] "r"(N_decr), [b_incr] "r"(P) + : "memory"); + /* The asm code above implements the following commented C code */ + // go to next row, same column + // v4s aVec0 = *((v4s *)idx_a); idx_a += N; + // go to previous row, one column forward + // v4s aVec1 = *((v4s *)idx_a); idx_a -= N - 4; + // v4s temp0 = *((v4s *)idx_b); idx_b += P; + // v4s temp1 = *((v4s *)idx_b); idx_b += P; + // v4s temp2 = *((v4s *)idx_b); idx_b += P; + // v4s temp3 = *((v4s *)idx_b); idx_b += P; + + // WIESEP: This might lead to problems as the result of two int8 numbers + // can be larger than 8 bit! 
+ aVec0 = __ADD4(aVec0, aVecOffset); + aVec1 = __ADD4(aVec1, aVecOffset); + + // Shuffles to transpose at runtime the chunk extracted from B before + // multiplying with A chunk temp0-3 variables needed because shuffles + // use rD as source, but also modify it, thus we need a copy of their + // content to use it twice in their original form + v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5 + v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13 + v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7 + v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 3,7,11,15 + + v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12 + v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13 + v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14 + v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15 + + bVec0 = __ADD4(bVec0, bVecOffset); + bVec1 = __ADD4(bVec1, bVecOffset); + bVec2 = __ADD4(bVec2, bVecOffset); + bVec3 = __ADD4(bVec3, bVecOffset); + + sum00 = __SUMDOTP4(aVec0, bVec0, sum00); + sum01 = __SUMDOTP4(aVec0, bVec1, sum01); + sum02 = __SUMDOTP4(aVec0, bVec2, sum02); + sum03 = __SUMDOTP4(aVec0, bVec3, sum03); + sum10 = __SUMDOTP4(aVec1, bVec0, sum10); + sum11 = __SUMDOTP4(aVec1, bVec1, sum11); + sum12 = __SUMDOTP4(aVec1, bVec2, sum12); + sum13 = __SUMDOTP4(aVec1, bVec3, sum13); + } + + if (per_row_quant) { + __asm__ volatile( + "p.lw %[add0], 4(%[addr_add]!) \n\t" + "p.lw %[add1], 4(%[addr_add]!) \n\t" + "p.lw %[mul0], 4(%[addr_mul]!) \n\t" + "p.lw %[mul1], 4(%[addr_mul]!) 
\n\t" + : [add0] "=&r"(_add0), [mul0] "=&r"(_mul0), [add1] "=&r"(_add1), + [mul1] "=&r"(_mul1), [addr_add] "+&r"(idx_add), + [addr_mul] "+&r"(idx_mul)::"memory"); + } + + sum00 = sum00 * _mul0 + rqs_bias + _add0; + sum01 = sum01 * _mul0 + rqs_bias + _add0; + sum02 = sum02 * _mul0 + rqs_bias + _add0; + sum03 = sum03 * _mul0 + rqs_bias + _add0; + sum10 = sum10 * _mul1 + rqs_bias + _add1; + sum11 = sum11 * _mul1 + rqs_bias + _add1; + sum12 = sum12 * _mul1 + rqs_bias + _add1; + sum13 = sum13 * _mul1 + rqs_bias + _add1; + + sum00 = (sum00 >> log2D) + output_offset; + sum01 = (sum01 >> log2D) + output_offset; + sum02 = (sum02 >> log2D) + output_offset; + sum03 = (sum03 >> log2D) + output_offset; + sum10 = (sum10 >> log2D) + output_offset; + sum11 = (sum11 >> log2D) + output_offset; + sum12 = (sum12 >> log2D) + output_offset; + sum13 = (sum13 >> log2D) + output_offset; + + sum0[0] = (int8_t)__CLIP(sum00, 7); + sum0[1] = (int8_t)__CLIP(sum01, 7); + sum0[2] = (int8_t)__CLIP(sum02, 7); + sum0[3] = (int8_t)__CLIP(sum03, 7); + sum1[0] = (int8_t)__CLIP(sum10, 7); + sum1[1] = (int8_t)__CLIP(sum11, 7); + sum1[2] = (int8_t)__CLIP(sum12, 7); + sum1[3] = (int8_t)__CLIP(sum13, 7); + + __asm__ volatile("p.sw %[s0], %[c_incr](%[addr_c]!) \n\t" + "p.sw %[s1], %[c_incr](%[addr_c]!) 
\n\t" + : [addr_c] "+&r"(idx_c) + : [s0] "r"(sum0), [s1] "r"(sum1), [c_incr] "r"(P_incr) + : "memory"); + /* The asm code above implements the following commented C code */ + // *(idx_c) = sum0; idx_c += P; + // *(idx_c) = sum1; idx_c += P; + + idx_a += N; // adjust A matrix pointer + } + } +} +#endif //__XPULPIMG diff --git a/TargetLibraries/MemPool/src/RequantShift_s8.c b/TargetLibraries/MemPool/src/RequantShift_s8.c new file mode 100644 index 0000000..c719423 --- /dev/null +++ b/TargetLibraries/MemPool/src/RequantShift_s8.c @@ -0,0 +1,216 @@ +/* ===================================================================== + * Title: RequantShift_s8.c + * Description: + * + * Date: 24.04.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2023 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployMath.h" + +void RequantShift_parallel_s8_s8_NHWC(int8_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads) { + int32_t intermediate; + int8_t out; + for (uint32_t i = core_id; i < size; i += numThreads) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_parallel_s16_s8_NHWC(int16_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads) { + int32_t intermediate; + int8_t out; + for (uint32_t i = core_id; i < size; i += numThreads) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_parallel_s32_s8_NHWC(int32_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads) { + int32_t intermediate; + int8_t out; + for (uint32_t i = core_id; i < size; i += numThreads) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out 
= (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_parallel_s8_s8_NCHW(int8_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads) { + int32_t intermediate; + int8_t out; + for (uint32_t i = core_id; i < size; i += numThreads) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_parallel_s16_s8_NCHW(int16_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads) { + int32_t intermediate; + int8_t out; + for (uint32_t i = core_id; i < size; i += numThreads) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_parallel_s32_s8_NCHW(int32_t *data_in, uint32_t size, + int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, + uint32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding, + uint32_t core_id, uint32_t numThreads) { + int32_t intermediate; + int8_t out; + for (uint32_t i = core_id; i < size; i += numThreads) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = 
(int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_unrolled_1x4_parallel_s32_s8_NCHW_rv32im( + int32_t *data_in, uint32_t size, int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, uint32_t HW, int32_t input_offset, + int32_t output_offset, bool rounding, uint32_t core_id, + uint32_t numThreads) { + + const int32_t round_bias = ((1 << (log2D - 1))) * rounding; + + for (uint32_t i = core_id; i < size / 4; i += numThreads) { + int32_t shifted0, shifted1, shifted2, shifted3; + + shifted0 = (((data_in[i * 4 + 0] + input_offset) * mul[(i * 4 + 0) / HW] + + add[(i * 4 + 0) / HW] + round_bias) >> + log2D) + + output_offset; + shifted1 = (((data_in[i * 4 + 1] + input_offset) * mul[(i * 4 + 1) / HW] + + add[(i * 4 + 1) / HW] + round_bias) >> + log2D) + + output_offset; + shifted2 = (((data_in[i * 4 + 2] + input_offset) * mul[(i * 4 + 2) / HW] + + add[(i * 4 + 2) / HW] + round_bias) >> + log2D) + + output_offset; + shifted3 = (((data_in[i * 4 + 3] + input_offset) * mul[(i * 4 + 3) / HW] + + add[(i * 4 + 3) / HW] + round_bias) >> + log2D) + + output_offset; + + data_out[i * 4 + 0] = (int8_t)(CLAMP(shifted0, -128, 127)); + data_out[i * 4 + 1] = (int8_t)(CLAMP(shifted1, -128, 127)); + data_out[i * 4 + 2] = (int8_t)(CLAMP(shifted2, -128, 127)); + data_out[i * 4 + 3] = (int8_t)(CLAMP(shifted3, -128, 127)); + } +} + +#ifdef __XPULPIMG +void RequantShift_unrolled_1x4_parallel_s32_s8_NCHW_xpulpv2( + int32_t *data_in, uint32_t size, int32_t *mul, int32_t *add, + int8_t *data_out, int32_t log2D, uint32_t HW, int32_t input_offset, + int32_t output_offset, bool rounding, uint32_t core_id, + uint32_t numThreads) { + + const int32_t round_bias = ((1 << (log2D - 1))) * rounding; + + for (uint32_t i = core_id; i < size / 4; i += numThreads) { + int32_t shifted0, shifted1, shifted2, shifted3; + + shifted0 = (((data_in[i * 4 + 0] + input_offset) * mul[(i * 4 + 0) / HW] + + add[(i * 4 + 0) / HW] + round_bias) >> + log2D) + + 
output_offset; + shifted1 = (((data_in[i * 4 + 1] + input_offset) * mul[(i * 4 + 1) / HW] + + add[(i * 4 + 1) / HW] + round_bias) >> + log2D) + + output_offset; + shifted2 = (((data_in[i * 4 + 2] + input_offset) * mul[(i * 4 + 2) / HW] + + add[(i * 4 + 2) / HW] + round_bias) >> + log2D) + + output_offset; + shifted3 = (((data_in[i * 4 + 3] + input_offset) * mul[(i * 4 + 3) / HW] + + add[(i * 4 + 3) / HW] + round_bias) >> + log2D) + + output_offset; + + data_out[i * 4 + 0] = (int8_t)(__CLIP(shifted0, 7)); + data_out[i * 4 + 1] = (int8_t)(__CLIP(shifted1, 7)); + data_out[i * 4 + 2] = (int8_t)(__CLIP(shifted2, 7)); + data_out[i * 4 + 3] = (int8_t)(__CLIP(shifted3, 7)); + } +} +#endif //__XPULPIMG \ No newline at end of file diff --git a/TargetLibraries/MemPool/src/Softmax_s8.c b/TargetLibraries/MemPool/src/Softmax_s8.c new file mode 100644 index 0000000..cefe3ec --- /dev/null +++ b/TargetLibraries/MemPool/src/Softmax_s8.c @@ -0,0 +1,66 @@ +/* ===================================================================== + * Title: Softmax_s8.c + * Description: + * + * $Date: 25.04.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployMath.h" + +void ITAMax_parallel_s8(int8_t const *__restrict__ pSrcA, + int8_t *__restrict__ pDstB, int8_t *__restrict__ pBufN, + uint32_t size, uint32_t lastDimLength, + uint32_t n_levels, uint32_t core_id, + uint32_t numThreads) { + + uint32_t i = 0; // Row Counter + uint32_t j = 0; // Column Counter + + uint8_t *shift = (uint8_t *)pBufN; + + for (i = core_id; i < size / lastDimLength; i += numThreads) { + // 1. Find maximum over row + int8_t max = -128; + for (j = 0; j < lastDimLength; ++j) { + if (pSrcA[i * lastDimLength + j] > max) { + max = pSrcA[i * lastDimLength + j]; + } + } + + // 2. Calculate exponential sum + uint32_t exp_sum = 0; + for (j = 0; j < lastDimLength; ++j) { + int32_t diff = max - pSrcA[i * lastDimLength + j]; + shift[j + lastDimLength * core_id] = (uint8_t)((diff + 16) >> 5); + exp_sum += (256U >> shift[j + lastDimLength * core_id]); + } + + uint32_t exp_sum_inv = ((n_levels - 1) * 256U) / exp_sum; + + for (j = 0; j < lastDimLength; ++j) { + pDstB[i * lastDimLength + j] = + (int8_t)((exp_sum_inv >> shift[j + lastDimLength * core_id]) - + (n_levels / 2)); + } + } +} diff --git a/TargetLibraries/MemPool/src/Util.c b/TargetLibraries/MemPool/src/Util.c new file mode 100644 index 0000000..315469c --- /dev/null +++ b/TargetLibraries/MemPool/src/Util.c @@ -0,0 +1,46 @@ +/* ===================================================================== + * Title: Util.c + * Description: + * + * Date: 15.03.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except pSrcA compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to pSrcA writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployMath.h" +#include +#include +#include + +// Overwrite weak function from DeeployBasicLibs +int deeploy_log(const char *__restrict fmt, ...) { + va_list args; + va_start(args, fmt); + int ret = vprintf_(fmt, args); + va_end(args); + return ret; +} + +void *deeploy_malloc(const size_t size) { return simple_malloc(size); } + +void deeploy_free(void *const ptr) { simple_free(ptr); } diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt new file mode 100644 index 0000000..d951c6b --- /dev/null +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -0,0 +1,39 @@ +file(GLOB_RECURSE SOURCES + "src/**" +) + +if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") + include(cmake/pulp-sdk-siracusa.cmake) +elseif(platform STREQUAL "PULPOpen") + include(cmake/pulp-sdk-pulp-open.cmake) +endif() + +add_deeploy_library(deeploypulp STATIC ${SOURCES}) +target_include_directories(deeploypulp + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/inc +) + +target_compile_options(deeploypulp PRIVATE + -Wno-implicit-function-declaration + -Wno-implicit-int-conversion + -Wno-sign-conversion + -Wno-sign-compare +) + +target_include_directories(deeploypulp PUBLIC ${PULP_SDK_INCLUDES}) +target_compile_options(deeploypulp PUBLIC ${PULP_SDK_COMPILE_FLAGS}) + +add_subdirectory(third_party/pulp-nn-mixed) +target_include_directories(pulp-nn-mixed PUBLIC ${PULP_SDK_INCLUDES}) +target_compile_options(pulp-nn-mixed PUBLIC ${PULP_SDK_COMPILE_FLAGS}) + +set(USE_NEUREKA ON) +add_subdirectory(third_party/pulp-nnx) 
+target_include_directories(pulp-nnx PUBLIC ${PULP_SDK_INCLUDES}) +target_compile_options(pulp-nnx PUBLIC ${PULP_SDK_COMPILE_FLAGS}) + +target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed) +target_link_libraries(deeploypulp INTERFACE pulp-nnx) +target_link_libraries(deeploypulp INTERFACE pulp-sdk) +target_sources(deeploypulp INTERFACE $) diff --git a/TargetLibraries/PULPOpen/cmake/pulp-sdk-base.cmake b/TargetLibraries/PULPOpen/cmake/pulp-sdk-base.cmake new file mode 100644 index 0000000..8c7109d --- /dev/null +++ b/TargetLibraries/PULPOpen/cmake/pulp-sdk-base.cmake @@ -0,0 +1,72 @@ +set(PULP_SDK_HOME $ENV{PULP_SDK_HOME}) + +set(PULP_SDK_BASE_C_SOURCE + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/ram/ram.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/ram/alloc_extern.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/fs/read_fs/read_fs.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/fs/host_fs/host_fs.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/fs/fs.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/flash/hyperflash/hyperflash.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/flash/flash.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/partition/partition.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/partition/flash_partition.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/crc/md5.c + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/bsp/siracusa.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/init.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/kernel.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/device.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/task.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/alloc.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/alloc_pool.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/irq.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/soc_event.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/log.c + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/time.c + 
${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/hyperbus/hyperbus-v3.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/uart/uart-v1.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/udma/udma-v3.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/cluster/cluster.c + ${PULP_SDK_HOME}/rtos/pulpos/common/lib/libc/minimal/io.c + ${PULP_SDK_HOME}/rtos/pulpos/common/lib/libc/minimal/fprintf.c + ${PULP_SDK_HOME}/rtos/pulpos/common/lib/libc/minimal/prf.c + ${PULP_SDK_HOME}/rtos/pulpos/common/lib/libc/minimal/sprintf.c + ${PULP_SDK_HOME}/rtos/pulpos/common/lib/libc/minimal/semihost.c +) + +set(PULP_SDK_BASE_ASM_SOURCE + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/crt0.S + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/irq_asm.S + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/task_asm.S + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/time_asm.S + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/soc_event_v2_itc.S + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/cluster/pe-eu-v3.S +) + +set(PULP_SDK_BASE_INCLUDE + ${PULP_SDK_HOME}/rtos/pulpos/common/lib/libc/minimal/include + ${PULP_SDK_HOME}/rtos/pulpos/common/include + ${PULP_SDK_HOME}/rtos/pulpos/common/kernel + ${PULP_SDK_HOME}/rtos/pulpos/pulp_archi/include + ${PULP_SDK_HOME}/rtos/pulpos/pulp_hal/include + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_api/include + ${PULP_SDK_HOME}/rtos/pulpos/pulp/include + ${PULP_SDK_HOME}/rtos/pmsis/pmsis_bsp/include +) + +set(PULP_SDK_BASE_COMPILE_FLAGS + -D__riscv__ + -D__CONFIG_UDMA__ + -D__PULPOS2__ + -D__PLATFORM__=ARCHI_PLATFORM_GVSOC + -DARCHI_CLUSTER_NB_PE=8 + -DPOS_CONFIG_IO_UART=0 + -DPOS_CONFIG_IO_UART_BAUDRATE=115200 + -DPOS_CONFIG_IO_UART_ITF=0 + -D__TRACE_LEVEL__=3 + -DPI_LOG_LOCAL_LEVEL=2 +) + +set_source_files_properties(${PULP_SDK_BASE_ASM_SOURCE} PROPERTIES COMPILE_FLAGS -DLANGUAGE_ASSEMBLY) +add_library(pulp-sdk-base OBJECT ${PULP_SDK_BASE_C_SOURCE} ${PULP_SDK_BASE_ASM_SOURCE}) diff --git a/TargetLibraries/PULPOpen/cmake/pulp-sdk-pulp-open.cmake b/TargetLibraries/PULPOpen/cmake/pulp-sdk-pulp-open.cmake new file mode 
100644
index 0000000..3027ddb
--- /dev/null
+++ b/TargetLibraries/PULPOpen/cmake/pulp-sdk-pulp-open.cmake
@@ -0,0 +1,57 @@
+include(cmake/pulp-sdk-base.cmake)
+
+set(PULP_SDK_HOME $ENV{PULP_SDK_HOME})
+
+set(PULP_OPEN_COMPILE_FLAGS
+  -DCONFIG_PULP
+  -DCONFIG_BOARD_VERSION_PULP
+  -DCONFIG_PROFILE_PULP
+  -DUSE_HYPERFLASH
+  -DUSE_HYPERRAM
+  -DPULP_CHIP_STR=pulp
+)
+
+set(PULP_OPEN_INCLUDES
+  ${PULP_SDK_HOME}/rtos/pulpos/pulp/include/pos/chips/pulp
+)
+
+set(PULP_SDK_PULP_OPEN_C_SOURCE
+  ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/fll-v1.c
+  ${PULP_SDK_HOME}/rtos/pulpos/common/kernel/freq-domains.c
+  ${PULP_SDK_HOME}/rtos/pulpos/pulp/kernel/chips/pulp/soc.c
+)
+
+set_source_files_properties(${PULP_SDK_PULP_OPEN_ASM_SOURCE} PROPERTIES COMPILE_FLAGS -DLANGUAGE_ASSEMBLY)
+add_library(pulp-sdk OBJECT ${PULP_SDK_BASE_C_SOURCE} ${PULP_SDK_BASE_ASM_SOURCE} ${PULP_SDK_PULP_OPEN_C_SOURCE} ${PULP_SDK_PULP_OPEN_ASM_SOURCE})
+
+set(PULP_SDK_COMPILE_FLAGS ${PULP_OPEN_COMPILE_FLAGS} ${PULP_SDK_BASE_COMPILE_FLAGS})
+set(PULP_SDK_INCLUDES ${PULP_OPEN_INCLUDES} ${PULP_SDK_BASE_INCLUDE})
+
+target_include_directories(pulp-sdk SYSTEM PUBLIC ${PULP_SDK_INCLUDES} ${PULP_OPEN_INCLUDES})
+target_compile_options(pulp-sdk PUBLIC ${PULP_SDK_COMPILE_FLAGS})
+# Third-party SDK sources: silence warnings we cannot fix upstream.
+target_compile_options(pulp-sdk PRIVATE
+  -O2
+  -Wno-sign-conversion
+  -Wno-unused-function
+  -Wno-unused-parameter
+  -Wno-conversion
+  -Wno-unused-variable
+  -Wno-sign-compare
+  -Wno-return-type
+  -fno-inline-functions
+)
+target_compile_options(pulp-sdk INTERFACE
+  -Wno-unused-function
+)
+
+
+set(PULP_OPEN_LINK_OPTIONS
+  -Wl,--gc-sections
+  -L${PULP_SDK_HOME}/rtos/pulpos/pulp/kernel
+  -Tchips/pulp/link.ld
+)
+
+target_link_libraries(pulp-sdk PUBLIC
+  ${PULP_OPEN_LINK_OPTIONS}
+)
diff --git a/TargetLibraries/PULPOpen/cmake/pulp-sdk-siracusa.cmake b/TargetLibraries/PULPOpen/cmake/pulp-sdk-siracusa.cmake
new file mode 100644
index 0000000..04461ae
--- /dev/null
+++
b/TargetLibraries/PULPOpen/cmake/pulp-sdk-siracusa.cmake @@ -0,0 +1,65 @@ +include(cmake/pulp-sdk-base.cmake) + +set(PULP_SDK_HOME $ENV{PULP_SDK_HOME}) + +set(SIRACUSA_COMPILE_FLAGS + -include ${PULP_SDK_HOME}/rtos/pulpos/pulp/include/pos/chips/siracusa/config.h + -DCONFIG_SIRACUSA + -DCONFIG_BOARD_VERSION_SIRACUSA + -DCONFIG_PROFILE_SIRACUSA + -DSKIP_PLL_INIT + -DUSE_HYPERFLASH + -DUSE_HYPERRAM + -DPULP_CHIP_STR=siracusa +) + +set(SIRACUSA_INCLUDES + ${PULP_SDK_HOME}/rtos/pulpos/pulp/include/pos/chips/siracusa + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/i3c/include + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/siracusa_padmux/include +) + +set(PULP_SDK_SIRACUSA_C_SOURCE + ${PULP_SDK_HOME}/rtos/pulpos/pulp/kernel/chips/siracusa/pll.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/kernel/chips/siracusa/soc.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/i3c/src/cdn_print.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/i3c/src/command_list.c + #${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/i3c/src/i3c.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/i3c/src/i3c_obj_if.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/i3c/src/cps_impl.c + ${PULP_SDK_HOME}/rtos/pulpos/pulp/drivers/siracusa_padmux/src/siracusa_padctrl.c +) + +set_source_files_properties(${PULP_SDK_SIRACUSA_ASM_SOURCE} PROPERTIES COMPILE_FLAGS -DLANGUAGE_ASSEMBLY) +add_library(pulp-sdk OBJECT ${PULP_SDK_BASE_C_SOURCE} ${PULP_SDK_BASE_ASM_SOURCE} ${PULP_SDK_SIRACUSA_C_SOURCE} ${PULP_SDK_SIRACUSA_ASM_SOURCE}) + +set(PULP_SDK_COMPILE_FLAGS ${SIRACUSA_COMPILE_FLAGS} ${PULP_SDK_BASE_COMPILE_FLAGS}) +set(PULP_SDK_INCLUDES ${SIRACUSA_INCLUDES} ${PULP_SDK_BASE_INCLUDE}) + +target_include_directories(pulp-sdk SYSTEM PUBLIC ${PULP_SDK_INCLUDES}) +target_compile_options(pulp-sdk PUBLIC ${PULP_SDK_COMPILE_FLAGS}) +target_compile_options(pulp-sdk PRIVATE + -Wno-sign-conversion + -Wno-unused-function + -Wno-unused-parameter + -Wno-conversion + -Wno-sign-conversion + -Wno-unused-variable + -Wno-sign-compare + -Wno-return-type + 
-fno-inline-functions +) +target_compile_options(pulp-sdk INTERFACE + -Wno-unused-function +) + + +set(SIRACUSA_LINK_OPTIONS + -Wl,--gc-sections + -L${PULP_SDK_HOME}/rtos/pulpos/pulp/kernel + -Tchips/siracusa/link.ld +) + +target_link_libraries(pulp-sdk PUBLIC + ${SIRACUSA_LINK_OPTIONS} +) diff --git a/TargetLibraries/PULPOpen/inc/DeeployMath.h b/TargetLibraries/PULPOpen/inc/DeeployMath.h new file mode 100644 index 0000000..7aa1b18 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/DeeployMath.h @@ -0,0 +1,45 @@ +/* ===================================================================== + * Title: DeeployMath.h + * Description: + * + * $Date: 30.12.2021 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_MATH_HEADER_ +#define __DEEPLOY_MATH_HEADER_ + +#include +#include +#include +#include +#include + +#if defined(AM_PART_APOLLO4B) | defined(DAM_PART_APOLLO3) +#include "am_bsp.h" +#include "am_mcu_apollo.h" +#include "am_util.h" +#endif + +#include "DeeployBasicMath.h" + +#endif // __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h new file mode 100644 index 0000000..f8c95b7 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -0,0 +1,48 @@ +/* ===================================================================== + * Title: DeeployMath.h + * Description: + * + * $Date: 30.12.2021 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEEPLOY_MATH_HEADER_ +#define __DEEPLOY_MATH_HEADER_ + +#include +#include +#include +#include +#include + +#include "DeeployBasicMath.h" + +#include "pmsis.h" + +#include "kernel/RQiHardswish.h" +#include "kernel/RequantShift.h" +#include "kernel/UniformRequantShift.h" +#include "kernel/gemv.h" +#include "kernel/iRMSnorm.h" +#include "kernel/iSoftmax.h" + +#endif // __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/dory_dma.h b/TargetLibraries/PULPOpen/inc/dory_dma.h new file mode 100644 index 0000000..330eec2 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/dory_dma.h @@ -0,0 +1,53 @@ +/* + * dory.h + * Alessio Burrello + * + * Copyright (C) 2019-2020 University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _DORY_DMA_H +#define _DORY_DMA_H + +typedef struct { + void *ext; + void *loc; + unsigned short hwc_to_chw; + unsigned short stride_2d; + unsigned short number_of_2d_copies; + unsigned short stride_1d; + unsigned short number_of_1d_copies; + unsigned short length_1d_copy; + unsigned int mchan_cmd; + int dir; // 0 l1->l2, 1 l2->l1 + int tid; +} DMA_copy; + +void dory_dma_memcpy_hwc_to_chw(DMA_copy *copy); + +void dory_dma_memcpy_1d_async(DMA_copy *copy); + +void dory_dma_memcpy_2d_async(DMA_copy *copy); + +void dory_dma_memcpy_3d_async(DMA_copy *copy); + +void dory_dma_memcpy_async(DMA_copy *copy); + +void dory_dma_free(DMA_copy *copy); + +void dory_dma_barrier(DMA_copy *copy); + +int dory_dma_allocate(); +#endif diff --git a/TargetLibraries/PULPOpen/inc/dory_mem.h b/TargetLibraries/PULPOpen/inc/dory_mem.h new file mode 100644 index 0000000..f577458 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/dory_mem.h @@ -0,0 +1,48 @@ +/* ===================================================================== + * Title: dory_mem.h + * Description: + * + * $Date: 12.12.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __MEM_H__ +#define __MEM_H__ + +#include + +extern struct pi_device ram; + +void open_fs(); +void mem_init(); +struct pi_device *get_ram_ptr(); +void *ram_malloc(size_t size); +void ram_free(void *ptr, size_t size); +void ram_read(void *dest, void *src, size_t size); +void ram_write(void *dest, void *src, size_t size); +void *cl_ram_malloc(size_t size); +void cl_ram_free(void *ptr, size_t size); +void cl_ram_read(void *dest, void *src, size_t size); +void cl_ram_write(void *dest, void *src, size_t size); +size_t load_file_to_ram(const void *dest, const char *filename); + +#endif // __MEM_H__ diff --git a/TargetLibraries/PULPOpen/inc/kernel/RQiHardswish.h b/TargetLibraries/PULPOpen/inc/kernel/RQiHardswish.h new file mode 100644 index 0000000..bd52849 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/RQiHardswish.h @@ -0,0 +1,32 @@ +/* ===================================================================== + * Title: RQiHardswish.h + * Description: + * + * $Date: 15.03.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployPULPMath.h" + +void RQiHardswish_s8_s8_plp(int8_t *input, int8_t *output, int32_t size, + int32_t one_over_six, int32_t three, int32_t six, + int32_t mul, int32_t add, int32_t shift); diff --git a/TargetLibraries/PULPOpen/inc/kernel/RequantShift.h b/TargetLibraries/PULPOpen/inc/kernel/RequantShift.h new file mode 100644 index 0000000..54c3862 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/RequantShift.h @@ -0,0 +1,139 @@ +/* ===================================================================== + * Title: RequantShift_s8.c + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployPULPMath.h" + +void RequantShift_u8_s8_NHWC(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void RequantShift_u16_s8_NHWC(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void RequantShift_u32_s8_NHWC(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void RequantShift_u8_s8_NCHW(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void RequantShift_u16_s8_NCHW(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void RequantShift_u32_s8_NCHW(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void RequantShift_u8_u8_NHWC(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_u16_u8_NHWC(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + 
uint8_t output_max, bool rounding); + +void RequantShift_u32_u8_NHWC(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_u8_u8_NCHW(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_u16_u8_NCHW(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_u32_u8_NCHW(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_s8_u8_NHWC(int8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_s16_u8_NHWC(int16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_s32_u8_NHWC(int32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_s8_u8_NCHW(int8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t 
output_min, + uint8_t output_max, bool rounding); + +void RequantShift_s16_u8_NCHW(int16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); + +void RequantShift_s32_u8_NCHW(int32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding); diff --git a/TargetLibraries/PULPOpen/inc/kernel/UniformRequantShift.h b/TargetLibraries/PULPOpen/inc/kernel/UniformRequantShift.h new file mode 100644 index 0000000..0cbd5c2 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/UniformRequantShift.h @@ -0,0 +1,51 @@ +/* ---------------------------------------------------------------------- +# +# File: UniformRequantShift.h +# +# Last edited: 12.03.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+*/ + +#include "DeeployPULPMath.h" + +void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); + +void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding); \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/gemv.h b/TargetLibraries/PULPOpen/inc/kernel/gemv.h new file mode 100644 index 0000000..214f830 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/gemv.h @@ -0,0 +1,34 @@ +/* ===================================================================== + * Title: vec2mat.h + * Description: + * + * $Date: 15.03.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdint.h" + +void gemv_s8_s8_plp(int8_t *pIn, int8_t *pBias, int8_t *pOut, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, uint16_t out_mult, + uint16_t out_shift, uint16_t dim_vec, + uint16_t num_o_neurons, uint8_t flag_relu, + uint8_t flag_batch_norm); diff --git a/TargetLibraries/PULPOpen/inc/kernel/iRMSnorm.h b/TargetLibraries/PULPOpen/inc/kernel/iRMSnorm.h new file mode 100644 index 0000000..fa1c5e4 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/iRMSnorm.h @@ -0,0 +1,31 @@ +/* ===================================================================== + * Title: iRMSnorm.h + * Description: + * + * $Date: 14.03.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployPULPMath.h" + +void iRMSnorm_s8_s8_plp(int8_t *data_in, int8_t *data_out, int32_t *weight, + int32_t size, int32_t lastDimLength, int32_t log2D); diff --git a/TargetLibraries/PULPOpen/inc/kernel/iSoftmax.h b/TargetLibraries/PULPOpen/inc/kernel/iSoftmax.h new file mode 100644 index 0000000..52220bc --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/iSoftmax.h @@ -0,0 +1,37 @@ +/* ===================================================================== + * Title: iSoftmax.h + * Description: + * + * $Date: 13.11.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployPULPMath.h" + +void PULPSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2); +void PULPSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2); diff --git a/TargetLibraries/PULPOpen/inc/mchan.h b/TargetLibraries/PULPOpen/inc/mchan.h new file mode 100644 index 0000000..cd7c2ee --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/mchan.h @@ -0,0 +1,161 @@ +/* ===================================================================== + * Title: mchan.h + * Description: + * + * $Date: 26.07.2024 + * + * ===================================================================== */ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + + * Adopted from PULP-SDK (https://github.com/pulp-platform/pulp-sdk), released + under Apache 2.0 + + */ + +#ifndef _MCHAN_H +#define _MCHAN_H + +// Requires to have MCHAN_BASE_ADDR, MCHAN_EVENT defined outside of header +#ifndef MCHAN_BASE_ADDR +#error "[mchan.h] MCHAN_BASE_ADDR not defined!" +#endif + +#if !defined(MCHAN_EVENT) && !defined(MCHAN_POLLED) +#error "[mchan.h] Nor MCHAN_EVENT nor MCHAN_POLLED defined!" +#endif + +#if defined(MCHAN_EVENT) && !defined(MCHAN_EVENT_BIT) +#error \ + "[mchan.h] MCHAN_EVENT_BIT should be defined when using events as signalization!" 
+#endif + +#include "pmsis.h" + +#define MCHAN_CMD_OFFSET 0 +#define MCHAN_STATUS_OFFSET 4 + +#define MCHAN_CMD_ADDR (MCHAN_BASE_ADDR + MCHAN_CMD_OFFSET) +#define MCHAN_STATUS_ADDR (MCHAN_BASE_ADDR + MCHAN_STATUS_OFFSET) + +#define READ_REG(addr) (*(volatile int *)(addr)) +#define WRITE_REG(addr, value) \ + do { \ + *(volatile int *)(addr) = (int)value; \ + } while (0) + +#define MCHAN_READ_CMD() READ_REG(MCHAN_CMD_ADDR) +#define MCHAN_WRITE_CMD(value) WRITE_REG(MCHAN_CMD_ADDR, value) + +#define MCHAN_READ_STATUS() READ_REG(MCHAN_STATUS_ADDR) +#define MCHAN_WRITE_STATUS(value) WRITE_REG(MCHAN_STATUS_ADDR, value) + +// MCHAN version 7 has 1 more bit for the transfer length, so all the flag +// offsets are shifted by 1. Also, LOC (TCDM) striding is not supported in v6. +#if MCHAN_VERSION == 7 +#define MCHAN_TRANSFER_LEN_SIZE (17) +#else +#define MCHAN_TRANSFER_LEN_SIZE (16) +#endif + +#define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_INCREMENTAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 1)) +#define MCHAN_CMD_FLAG_2D_TRANSFER_EXTERNAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 2)) +#define MCHAN_CMD_FLAG_EVENT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 3)) +#define MCHAN_CMD_FLAG_INTERRUPT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 4)) +#define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5)) +#if MCHAN_VERSION == 7 +#define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL \ + (1 << (MCHAN_TRANSFER_LEN_SIZE + 6)) // can only be used with MCHAN v7 +#endif +#define MCHAN_CMD_SHIFT_DIRECTION MCHAN_TRANSFER_LEN_SIZE + +#define MCHAN_CMD(len, dir, inc, loc_2d, ext_2d, int_en, event_en, broadcast) \ + (len | dir | inc | loc_2d | ext_2d | broadcast | int_en | event_en) + +typedef enum { + MCHAN_DMA_TRANSFER_DIRECTION_EXT2LOC = MCHAN_CMD_FLAG_DIRECTION_EXT2LOC, + MCHAN_DMA_TRANSFER_DIRECTION_LOC2EXT = MCHAN_CMD_FLAG_DIRECTION_LOC2EXT +} 
mchan_dma_transfer_direction_e; + +typedef struct { + int cmd; + int size; + + void *loc; + int loc_size_1d; + int loc_stride_1d; + + void *ext; + int ext_size_1d; + int ext_stride_1d; +} mchan_transfer_t; + +static int mchan_transfer_get_id() { return MCHAN_READ_CMD(); } + +static void mchan_transfer_push_1d(mchan_transfer_t trans) { + MCHAN_WRITE_CMD(trans.cmd); + MCHAN_WRITE_CMD(trans.loc); + MCHAN_WRITE_CMD(trans.ext); +} + +static void mchan_transfer_push_2d(mchan_transfer_t trans) { + MCHAN_WRITE_CMD(trans.cmd); + MCHAN_WRITE_CMD(trans.loc); + MCHAN_WRITE_CMD(trans.ext); +// MCHAN version 7 takes 2D "count" (length of 1D transfers) and stride in 2 +// steps, v7 takes it in 1 step with the stride shifted to the upper 16 bits. +#if MCHAN_VERSION == 7 + MCHAN_WRITE_CMD(trans.ext_size_1d); + MCHAN_WRITE_CMD(trans.ext_stride_1d); +#else + MCHAN_WRITE_CMD(trans.ext_size_1d | (trans.ext_stride_1d << 16)); +#endif +} + +static void mchan_transfer_push(mchan_transfer_t trans) { + MCHAN_WRITE_CMD(trans.cmd); + MCHAN_WRITE_CMD(trans.loc); + MCHAN_WRITE_CMD(trans.ext); + + if (trans.ext_size_1d < trans.size) { + MCHAN_WRITE_CMD(trans.ext_size_1d); + MCHAN_WRITE_CMD(trans.ext_stride_1d); + } + + if (trans.loc_size_1d < trans.size) { + MCHAN_WRITE_CMD(trans.loc_size_1d); + MCHAN_WRITE_CMD(trans.loc_stride_1d); + } +} + +static void mchan_transfer_free(int tid) { MCHAN_WRITE_STATUS(1 << tid); } + +static int mchan_transfer_busy(int tid) { + return MCHAN_READ_STATUS() & (1 << tid); +} + +static void mchan_transfer_wait(int tid) { +#if defined(MCHAN_EVENT) + while (mchan_transfer_busy(tid)) + eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT); +#elif defined(MCHAN_POLLED) + while (mchan_transfer_busy(tid)) + ; +#endif +} + +#endif diff --git a/TargetLibraries/PULPOpen/src/RQiHardswish.c b/TargetLibraries/PULPOpen/src/RQiHardswish.c new file mode 100644 index 0000000..8689383 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/RQiHardswish.c @@ -0,0 +1,58 @@ +/* 
===================================================================== + * Title: RQiHardswish.c + * Description: + * + * $Date: 15.03.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployPULPMath.h" + +void RQiHardswish_s8_s8_plp(int8_t *input, int8_t *output, int32_t size, + int32_t one_over_six, int32_t three, int32_t six, + int32_t mul, int32_t add, int32_t shift) { + + int32_t temp; + int32_t rnd; + + rnd = (1 << (shift - 1)); + + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); + int16_t chunk_start = MIN(chunk * core_id, size); + int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); + +#pragma unroll 2 + for (int i = chunk_start; i < chunk_stop; i++) { + temp = input[i] + three; + temp = CLAMP(temp, 0, six); + + temp = temp * one_over_six; + temp = input[i] * temp; + temp = temp * (mul) + (add + rnd); + + temp = temp >> shift; + + output[i] = (int8_t)CLAMP(temp, -128, 127); + } +} diff --git a/TargetLibraries/PULPOpen/src/RequantShift.c b/TargetLibraries/PULPOpen/src/RequantShift.c new file mode 100644 index 0000000..9343be2 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/RequantShift.c @@ -0,0 +1,339 @@ +/* 
===================================================================== + * Title: RequantShift_s8.c + * Description: + * + * Date: 19.12.2022 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Moritz Scherer, ETH Zurich + * - Philip Wiese, ETH Zurich + * - Victor Jung, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void RequantShift_u8_s8_NHWC(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + int32_t intermediate; + int8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u16_s8_NHWC(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + int32_t intermediate; + int8_t out; + for (int i = 0; i < size; i++) { + 
intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u32_s8_NHWC(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + int32_t intermediate; + int8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u8_s8_NCHW(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + int32_t intermediate; + int8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u16_s8_NCHW(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + int32_t intermediate; + int8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = 
(int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u32_s8_NCHW(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + int32_t intermediate; + int8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u8_u8_NHWC(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u16_u8_NHWC(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP((uint32_t)intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u32_u8_NHWC(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t 
*add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP((uint32_t)intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u8_u8_NCHW(uint8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP((uint32_t)intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u16_u8_NCHW(uint16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP((uint32_t)intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_u32_u8_NCHW(uint32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + 
int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP((uint32_t)intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_s8_u8_NHWC(int8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_s16_u8_NHWC(int16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_s32_u8_NHWC(int32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t channels, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = ((int32_t)data_in[i] + input_offset) * mul[i % channels] + + add[i % 
channels]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_s8_u8_NCHW(int8_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_s16_u8_NCHW(int16_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} + +void RequantShift_s32_u8_NCHW(int32_t *data_in, int32_t size, int32_t *mul, + int32_t *add, uint8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, uint8_t output_min, + uint8_t output_max, bool rounding) { + int32_t intermediate; + uint8_t out; + for (int i = 0; i < size; i++) { + intermediate = + ((int32_t)data_in[i] + input_offset) * mul[i / HW] + add[i / HW]; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (uint8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + } +} diff --git 
a/TargetLibraries/PULPOpen/src/UniformRequantShift.c b/TargetLibraries/PULPOpen/src/UniformRequantShift.c new file mode 100644 index 0000000..5507d0e --- /dev/null +++ b/TargetLibraries/PULPOpen/src/UniformRequantShift.c @@ -0,0 +1,288 @@ +/* ---------------------------------------------------------------------- +# +# File: UniformRequantShift.c +# +# Last edited: 12.03.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+*/ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); + int16_t chunk_start = MIN(chunk * core_id, size); + int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); + + // JUNGVI: Compiler magic, don't remove the volatile keyword below + int32_t volatile halfChunkSize = chunk >> 1; + int32_t intermediate; + int8_t out; + int8_t reg_data_in_A; + int8_t reg_data_in_B; + + // Load step 0 + reg_data_in_A = data_in[chunk_start]; + + for (int i = chunk_start; i < chunk_start + halfChunkSize; i++) { + + // Load step halfChunkSize + i + reg_data_in_B = data_in[halfChunkSize + i]; + + // Compute i + intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + + // Load step i + 1 + reg_data_in_A = data_in[i + 1]; + + // Compute step halfChunkSize + i + intermediate = (reg_data_in_B + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[halfChunkSize + i] = out; + } + + // Leftover computation + if ((chunk_stop - chunk_start) % 2) { + + reg_data_in_B = data_in[chunk_stop - 1]; + reg_data_in_A = data_in[chunk_stop]; + + intermediate = (reg_data_in_B + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop - 1] = out; + + 
intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop] = out; + } +} + +void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); + int16_t chunk_start = MIN(chunk * core_id, size); + int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); + + // JUNGVI: Compiler magic, don't remove the volatile keyword below + int32_t volatile halfChunkSize = chunk >> 1; + int32_t intermediate; + int8_t out; + uint8_t reg_data_in_A; + uint8_t reg_data_in_B; + + // Load step 0 + reg_data_in_A = data_in[chunk_start]; + + for (int i = chunk_start; i < chunk_start + halfChunkSize; i++) { + + // Load step halfChunkSize + i + reg_data_in_B = data_in[halfChunkSize + i]; + + // Compute i + intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + + // Load step i + 1 + reg_data_in_A = data_in[i + 1]; + + // Compute step halfChunkSize + i + intermediate = (reg_data_in_B + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[halfChunkSize + i] = out; + } + + // Leftover computation + if ((chunk_stop - chunk_start) % 2) { + + reg_data_in_B = data_in[chunk_stop - 1]; + reg_data_in_A = data_in[chunk_stop]; + + intermediate = (reg_data_in_B + input_offset) * mul + add; + 
intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop - 1] = out; + + intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop] = out; + } +} + +void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); + int16_t chunk_start = MIN(chunk * core_id, size); + int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); + + // JUNGVI: Compiler magic, don't remove the volatile keyword below + int32_t volatile halfChunkSize = chunk >> 1; + int32_t intermediate; + int8_t out; + int16_t reg_data_in_A; + int16_t reg_data_in_B; + + // Load step 0 + reg_data_in_A = data_in[chunk_start]; + + for (int i = chunk_start; i < chunk_start + halfChunkSize; i++) { + + // Load step halfChunkSize + i + reg_data_in_B = data_in[halfChunkSize + i]; + + // Compute i + intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + + // Load step i + 1 + reg_data_in_A = data_in[i + 1]; + + // Compute step halfChunkSize + i + intermediate = (reg_data_in_B + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[halfChunkSize + i] = out; + } + + // Leftover 
computation + if ((chunk_stop - chunk_start) % 2) { + + reg_data_in_B = data_in[chunk_stop - 1]; + reg_data_in_A = data_in[chunk_stop]; + + intermediate = (reg_data_in_B + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop - 1] = out; + + intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop] = out; + } +} + +void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, + int32_t add, int8_t *data_out, int32_t log2D, + int32_t HW, int32_t input_offset, + int32_t output_offset, int8_t output_min, + int8_t output_max, bool rounding) { + + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); + int16_t chunk_start = MIN(chunk * core_id, size); + int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); + + // JUNGVI: Compiler magic, don't remove the volatile keyword below + int32_t volatile halfChunkSize = chunk >> 1; + int32_t intermediate; + int8_t out; + int32_t reg_data_in_A; + int32_t reg_data_in_B; + + // Load step 0 + reg_data_in_A = data_in[chunk_start]; + + for (int i = chunk_start; i < chunk_start + halfChunkSize; i++) { + + // Load step halfChunkSize + i + reg_data_in_B = data_in[halfChunkSize + i]; + + // Compute i + intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[i] = out; + + // Load step i + 1 + reg_data_in_A = data_in[i + 1]; + + // Compute step halfChunkSize + i + intermediate = (reg_data_in_B + input_offset) * mul + add; + intermediate = 
((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[halfChunkSize + i] = out; + } + + // Leftover computation + if ((chunk_stop - chunk_start) % 2) { + + reg_data_in_B = data_in[chunk_stop - 1]; + reg_data_in_A = data_in[chunk_stop]; + + intermediate = (reg_data_in_B + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop - 1] = out; + + intermediate = (reg_data_in_A + input_offset) * mul + add; + intermediate = ((intermediate + ((1 << (log2D - 1))) * rounding) >> log2D) + + output_offset; + out = (int8_t)CLAMP(intermediate, output_min, output_max); + data_out[chunk_stop] = out; + } +} \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Util.c b/TargetLibraries/PULPOpen/src/Util.c new file mode 100644 index 0000000..257ea95 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/Util.c @@ -0,0 +1,53 @@ +/* ===================================================================== + * Title: Util.c + * Description: + * + * Date: 15.03.2023 + * + * ===================================================================== */ + +/* + * Copyright (C) 2022 ETH Zurich and University of Bologna. + * + * Authors: + * - Philip Wiese, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployMath.h" +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +// Overwrite weak function from DeeployBasicLibs +int deeploy_log(const char *__restrict fmt, ...) { + va_list args; + va_start(args, fmt); + int ret; + +#if defined(AM_PART_APOLLO4B) | defined(DAM_PART_APOLLO3) + ret = am_util_stdio_vprintf(fmt, args); +#else + ret = vprintf(fmt, args); +#endif + + va_end(args); + return ret; +} + +void *deeploy_malloc(const size_t size) { return malloc(size); } + +void deeploy_free(void *const ptr) { free(ptr); } diff --git a/TargetLibraries/PULPOpen/src/dory_dma.c b/TargetLibraries/PULPOpen/src/dory_dma.c new file mode 100644 index 0000000..0aa31dc --- /dev/null +++ b/TargetLibraries/PULPOpen/src/dory_dma.c @@ -0,0 +1,228 @@ +/* + * dory_dma.c + * Alessio Burrello + * + * Copyright (C) 2019-2020 University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dory_dma.h" + +#include "pmsis.h" + +#ifndef MCHAN_BASE_ADDR +// FIXME: For GAP9, this must point to ARCHI_MCHAN_EXT_ADDR!!! +// In PULP-SDK for Kraken, this is fixed. +// GAP8 hardware to be tested... 
+#define MCHAN_BASE_ADDR (ARCHI_MCHAN_DEMUX_ADDR) // CLUSTER_MCHAN_ADDR +#endif +#define MCHAN_EVENT +// #define MCHAN_POLLED +#ifdef MCHAN_EVENT +#define MCHAN_EVENT_BIT (ARCHI_CL_EVT_DMA0) // 8 +#endif +#include "mchan.h" + +#if defined(MCHAN_POLLED) +#define MCHAN_FLAGS (MCHAN_CMD_FLAG_INCREMENTAL) +#elif defined(MCHAN_EVENT) +#define MCHAN_FLAGS (MCHAN_CMD_FLAG_EVENT_ENABLE | MCHAN_CMD_FLAG_INCREMENTAL) +#elif defined(MCHAN_INTERRUPT) +#define MCHAN_FLAGS \ + (MCHAN_CMD_FLAG_INTERRUPT_ENABLE | MCHAN_CMD_FLAG_INCREMENTAL) +#endif + +#define MCHAN_FLAGS_1D (MCHAN_FLAGS) +#define MCHAN_FLAGS_2D (MCHAN_FLAGS | MCHAN_CMD_FLAG_2D_TRANSFER_EXTERNAL) + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +void dory_dma_memcpy_hwc_to_chw(DMA_copy *copy) { + int core_id = pi_core_id(); + int Log2Core = log2(NUM_CORES); + int number_of_copies_per_core = + (copy->length_1d_copy >> Log2Core) + + ((copy->length_1d_copy & (NUM_CORES - 1)) != 0); + int start_pixel, stop_pixel; // "pixel" is a misnomer; the CHANNELS are + // divided between the cores + // this function assumes that a DW tile is always as wide as the complete + // feature map (this is enforced by DORY's tiler) + start_pixel = MIN(number_of_copies_per_core * core_id, copy->length_1d_copy); + stop_pixel = + MIN(start_pixel + number_of_copies_per_core, copy->length_1d_copy); + void *ext = copy->ext + start_pixel; + void *loc = copy->loc + copy->number_of_1d_copies * + copy->number_of_2d_copies * start_pixel; + const int size_2d = copy->number_of_1d_copies * copy->number_of_2d_copies; + + for (int i = start_pixel; i < stop_pixel; i++) { + mchan_transfer_t trans = {.cmd = size_2d | + copy->dir << MCHAN_CMD_SHIFT_DIRECTION | + MCHAN_FLAGS_2D, + .size = size_2d, + .ext = ext, + .loc = loc, + .ext_size_1d = 1, // one byte at a time... 
+ .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); +#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board + dory_dma_barrier(copy); +#endif + ext += 1; // next channel + loc += copy->number_of_1d_copies * copy->number_of_2d_copies; + } +} + +void dory_dma_memcpy_1d_async(DMA_copy *copy) { + if (pi_core_id() == 0) { + mchan_transfer_t trans = { + .cmd = copy->length_1d_copy * copy->number_of_1d_copies * + copy->number_of_2d_copies | + (copy->dir << MCHAN_CMD_SHIFT_DIRECTION) | MCHAN_FLAGS_1D, + .size = copy->length_1d_copy * copy->number_of_1d_copies * + copy->number_of_2d_copies, + .ext = copy->ext, + .loc = copy->loc}; + mchan_transfer_push_1d(trans); + } +} + +void dory_dma_memcpy_2d_async(DMA_copy *copy) { + if (pi_core_id() == 0) { + const int size_2d = copy->number_of_1d_copies * copy->length_1d_copy * + copy->number_of_2d_copies; + const int stride = + (copy->number_of_2d_copies == 1) ? copy->stride_1d : copy->stride_2d; + const int size_1d = (copy->number_of_2d_copies == 1) + ? 
copy->length_1d_copy + : copy->length_1d_copy * copy->number_of_1d_copies; + + mchan_transfer_t trans = {.cmd = size_2d | + copy->dir << MCHAN_CMD_SHIFT_DIRECTION | + MCHAN_FLAGS_2D, + .size = size_2d, + .ext = copy->ext, + .loc = copy->loc, + .ext_size_1d = size_1d, + .ext_stride_1d = stride}; + mchan_transfer_push_2d(trans); + } +} + +void dory_dma_memcpy_3d_async(DMA_copy *copy) { + int core_id = pi_core_id(); + if (core_id == 0) { + int Log2Core = log2(1); + int number_of_2d_copies_per_core = (copy->number_of_2d_copies >> Log2Core) + + ((copy->number_of_2d_copies & (0)) != 0); + int start_pixel, stop_pixel; + start_pixel = + MIN(number_of_2d_copies_per_core * core_id, copy->number_of_2d_copies); + stop_pixel = MIN(start_pixel + number_of_2d_copies_per_core, + copy->number_of_2d_copies); + void *ext = copy->ext + copy->stride_2d * start_pixel; + void *loc = copy->loc + + copy->length_1d_copy * copy->number_of_1d_copies * start_pixel; + const int size_2d = copy->number_of_1d_copies * copy->length_1d_copy; + + for (int i = start_pixel; i < stop_pixel; i++) { + mchan_transfer_t trans = {.cmd = size_2d | + copy->dir << MCHAN_CMD_SHIFT_DIRECTION | + MCHAN_FLAGS_2D, + .size = size_2d, + .ext = ext, + .loc = loc, + .ext_size_1d = copy->length_1d_copy, + .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); +#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board + // dory_dma_barrier(copy); +#endif + loc += size_2d; + ext += copy->stride_2d; + } + } +} + +void dory_dma_memcpy_async(DMA_copy *copy) { + if (copy->hwc_to_chw == 1) { + dory_dma_memcpy_hwc_to_chw(copy); + } else if ((copy->number_of_2d_copies == 1 && + copy->number_of_1d_copies == 1) || + (copy->stride_1d == copy->length_1d_copy && + copy->number_of_1d_copies * copy->length_1d_copy == + copy->stride_2d) || + (copy->number_of_2d_copies == 1 && + copy->length_1d_copy == copy->stride_1d)) { + dory_dma_memcpy_1d_async(copy); + } else if ((copy->number_of_2d_copies == 1) || + 
(copy->length_1d_copy == copy->stride_1d)) { // wrong! + dory_dma_memcpy_2d_async(copy); + } else { + dory_dma_memcpy_3d_async(copy); + } +} + +void dory_dma_memcpy_1d_mindims_async(DMA_copy *copy) { + mchan_transfer_t trans = { + .cmd = copy->mchan_cmd, .ext = copy->ext, .loc = copy->loc}; + mchan_transfer_push_1d(trans); +} + +void dory_dma_memcpy_2d_mindims_async(DMA_copy *copy) { + mchan_transfer_t trans = {.cmd = copy->mchan_cmd, + .ext = copy->ext, + .loc = copy->loc, + .ext_size_1d = copy->length_1d_copy, + .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); +} + +void dory_dma_memcpy_3d_mindims_async(DMA_copy *copy) { + void *ext = copy->ext; + void *loc = copy->loc; + const int length_2d_copy = + copy->mchan_cmd & ((1 << MCHAN_TRANSFER_LEN_SIZE) - 1); + + for (int i = 0; i < copy->number_of_2d_copies; i++) { + mchan_transfer_t trans = {.cmd = copy->mchan_cmd, + .ext = ext, + .loc = loc, + .ext_size_1d = copy->length_1d_copy, + .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); + loc += length_2d_copy; + ext += copy->stride_2d; +#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board + // dory_dma_barrier(copy); +#endif + } +} + +void dory_dma_memcpy_mindims_async(DMA_copy *copy) { + if (copy->number_of_2d_copies == 1 && copy->number_of_1d_copies == 1) { + dory_dma_memcpy_1d_mindims_async(copy); + } else if (copy->number_of_2d_copies == 1) { + dory_dma_memcpy_2d_mindims_async(copy); + } else { + dory_dma_memcpy_3d_mindims_async(copy); + } +} + +void dory_dma_free(DMA_copy *copy) { mchan_transfer_free(copy->tid); } + +void dory_dma_barrier(DMA_copy *copy) { mchan_transfer_wait(copy->tid); } + +int dory_dma_allocate() { return mchan_transfer_get_id(); } diff --git a/TargetLibraries/PULPOpen/src/dory_mem.c b/TargetLibraries/PULPOpen/src/dory_mem.c new file mode 100644 index 0000000..fb3f3bf --- /dev/null +++ b/TargetLibraries/PULPOpen/src/dory_mem.c @@ -0,0 +1,170 @@ +/* 
===================================================================== + * Title: dory_mem.c + * Description: + * + * $Date: 12.12.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dory_mem.h" +#include "bsp/bsp.h" +#include "bsp/flash.h" +#include "bsp/fs.h" +#include "bsp/fs/readfs.h" +#include "bsp/ram.h" +#include "pmsis.h" + +#ifdef USE_HYPERFLASH +#include "bsp/flash/hyperflash.h" +typedef struct pi_hyperflash_conf flash_conf_t; +#define flash_conf_init(conf) pi_hyperflash_conf_init(conf) +#elif defined USE_SPIFLASH +#include "bsp/flash/spiflash.h" +typedef struct pi_spiflash_conf flash_conf_t; +#define flash_conf_init(conf) pi_spiflash_conf_init(conf) +#elif defined USE_MRAM +typedef struct pi_mram_conf flash_conf_t; +#define flash_conf_init(conf) pi_mram_conf_init(conf) +#else +typedef struct pi_default_flash_conf flash_conf_t; +#define flash_conf_init(conf) pi_default_flash_conf_init(conf) +#endif + +#ifdef USE_HYPERRAM +#include "bsp/ram/hyperram.h" +typedef struct pi_hyperram_conf ram_conf_t; +#define ram_conf_init(conf) pi_hyperram_conf_init(conf) +#else +typedef struct pi_default_ram_conf ram_conf_t; +#define ram_conf_init(conf) pi_default_ram_conf_init(conf) +#endif + +#define BUFFER_SIZE 128 +static 
uint8_t buffer[BUFFER_SIZE]; + +static struct pi_device flash; +static flash_conf_t flash_conf; + +static struct pi_device fs; +static struct pi_readfs_conf fs_conf; + +struct pi_device ram; +static ram_conf_t ram_conf; + +void open_fs() { + // SCHEREMO: Fix FS + // Open filesystem on flash. + pi_readfs_conf_init(&fs_conf); + fs_conf.fs.flash = &flash; + pi_open_from_conf(&fs, &fs_conf); + if (pi_fs_mount(&fs)) { + printf("ERROR: Cannot mount filesystem! Exiting...\n"); + pmsis_exit(-2); + } +} + +void mem_init() { + flash_conf_init(&flash_conf); + pi_open_from_conf(&flash, &flash_conf); + if (pi_flash_open(&flash)) { + printf("ERROR: Cannot open flash! Exiting...\n"); + pmsis_exit(-1); + } + + ram_conf_init(&ram_conf); + pi_open_from_conf(&ram, &ram_conf); + if (pi_ram_open(&ram)) { + printf("ERROR: Cannot open ram! Exiting...\n"); + pmsis_exit(-3); + } +} + +struct pi_device *get_ram_ptr() { return &ram; } + +void *ram_malloc(size_t size) { + void *ptr = NULL; + pi_ram_alloc(&ram, &ptr, size); + return ptr; +} + +void ram_free(void *ptr, size_t size) { pi_ram_free(&ram, ptr, size); } + +void ram_read(void *dest, void *src, const size_t size) { + pi_ram_read(&ram, src, dest, size); +} + +void ram_write(void *dest, void *src, const size_t size) { + pi_ram_write(&ram, dest, src, size); +} + +void *cl_ram_malloc(size_t size) { + int addr; + pi_cl_ram_req_t req; + pi_cl_ram_alloc(&ram, size, &req); + pi_cl_ram_alloc_wait(&req, &addr); + return (void *)addr; +} + +void cl_ram_free(void *ptr, size_t size) { + pi_cl_ram_req_t req; + pi_cl_ram_free(&ram, ptr, size, &req); + pi_cl_ram_free_wait(&req); +} + +void cl_ram_read(void *dest, void *src, const size_t size) { + pi_cl_ram_req_t req; + pi_cl_ram_read(&ram, src, dest, size, &req); + pi_cl_ram_read_wait(&req); +} + +void cl_ram_write(void *dest, void *src, const size_t size) { + pi_cl_ram_req_t req; + pi_cl_ram_write(&ram, dest, src, size, &req); + pi_cl_ram_write_wait(&req); +} + +size_t load_file_to_ram(const void 
*dest, const char *filename) { + pi_fs_file_t *fd = pi_fs_open(&fs, filename, 0); + if (fd == NULL) { + printf("ERROR: Cannot open file %s! Exiting...", filename); + pmsis_exit(-4); + } + + size_t size = fd->size; + size_t load_size = 0; + size_t remaining_size = size; + + size_t offset = 0; + do { + + remaining_size = size - offset; + load_size = BUFFER_SIZE < remaining_size ? BUFFER_SIZE : remaining_size; + + pi_cl_fs_req_t req; + pi_cl_fs_read(fd, buffer, load_size, &req); + pi_cl_fs_wait(&req); + cl_ram_write(dest + offset, buffer, load_size); + offset += load_size; + } while (offset < size); + + return offset; +} diff --git a/TargetLibraries/PULPOpen/src/gemv.c b/TargetLibraries/PULPOpen/src/gemv.c new file mode 100644 index 0000000..c774224 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/gemv.c @@ -0,0 +1,155 @@ +/* ===================================================================== + * Title: vec2mat.c + * Description: + * + * $Date: 15.03.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pmsis.h" +#include "pulp_nn_kernels.h" +#include "pulp_nn_utils.h" + +#include "DeeployPULPMath.h" + +void gemv_s8_s8_plp(int8_t *pIn, int8_t *pBias, int8_t *pOut, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, uint16_t out_mult, + uint16_t out_shift, uint16_t dim_vec, + uint16_t num_o_neurons, uint8_t flag_relu, + uint8_t flag_batch_norm) { + + uint16_t dim_vec_in = dim_vec; + uint16_t dim_vec_wt = dim_vec; + + int start = 0; + int stop = num_o_neurons; + + v4s vecA; + v4s vecB; + v4s vecB2; + + int8_t *pOutBuffer = (int8_t *)pOut + start; + int lft_neurons = num_o_neurons & 0x01; + int stop_even = stop - lft_neurons; + + int i; + int32_t *k1 = pKappa + start; + int32_t *lambda1 = pLambda + start; + + for (i = start; i < stop_even; i += 2) { + int sum = 0; + int sum2 = 0; + if (pBias != NULL) { + sum = *(int32_t *)(pBias + 4 * i); + sum2 = *(int32_t *)(pBias + 4 * i + 4); + } + + int8_t *pA = pIn; + int8_t *pB = pWeight + (i * dim_vec_wt); + int8_t *pB2 = pB + dim_vec_wt; + + for (int j = 0; j < (dim_vec >> 2); j++) { + vecA = *((v4s *)pA); + vecB = *((v4s *)pB); + vecB2 = *((v4s *)pB2); + sum = SumDotps4(vecA, vecB, sum); + sum2 = SumDotps4(vecA, vecB2, sum2); + pA += 4; + pB += 4; + pB2 += 4; + } + uint16_t col_cnt = dim_vec & 0x3; + while (col_cnt) { + int8_t inA = *pA; + pA++; + int8_t inB = *pB; + pB++; + int8_t inB2 = *pB2; + pB2++; + sum += inA * inB; + sum2 += inA * inB2; + col_cnt--; + } + if (flag_batch_norm && flag_relu) { + *pOutBuffer = pulp_nn_bn_quant_i8(sum, *k1, *lambda1, out_shift); + pOutBuffer++; + *pOutBuffer = + pulp_nn_bn_quant_i8(sum2, *(k1 + 1), *(lambda1 + 1), out_shift); + pOutBuffer++; + k1 += 2; + lambda1 += 2; + } else { + if (flag_relu == 1) { + *pOutBuffer = pulp_nn_quant_i8(sum, out_mult, out_shift); + pOutBuffer++; + *pOutBuffer = pulp_nn_quant_i8(sum2, out_mult, out_shift); + pOutBuffer++; + } else { + *pOutBuffer = (int8_t)clips8(sum >> out_shift); + pOutBuffer++; + *pOutBuffer = (int8_t)clips8(sum2 >> 
out_shift); + pOutBuffer++; + } + } + } + if (lft_neurons && (stop - start) > 0) { + int sum = 0; + if (pBias != NULL) { + sum = *(int32_t *)(pBias + 4 * i); + } + + int8_t *pA = pIn; + int8_t *pB = pWeight + (i * dim_vec_wt); + + for (int j = 0; j < (dim_vec >> 2); j++) { + vecA = *((v4s *)pA); + vecB = *((v4s *)pB); + sum = SumDotps4(vecA, vecB, sum); + pA += 4; + pB += 4; + } + uint16_t col_cnt = dim_vec & 0x3; + while (col_cnt) { + int8_t inA = *pA; + pA++; + int8_t inB = *pB; + pB++; + sum += inA * inB; + col_cnt--; + } + if (flag_batch_norm && flag_relu) { + *pOutBuffer = pulp_nn_bn_quant_i8(sum, *pKappa, *pLambda, out_shift); + pOutBuffer++; + pKappa++; + pLambda++; + } else { + if (flag_relu == 1) { + *pOutBuffer = pulp_nn_quant_i8(sum, out_mult, out_shift); + pOutBuffer++; + } else { + *pOutBuffer = (int8_t)clips8(sum >> out_shift); + pOutBuffer++; + } + } + } + pi_cl_team_barrier(0); +} diff --git a/TargetLibraries/PULPOpen/src/iGELU.c b/TargetLibraries/PULPOpen/src/iGELU.c new file mode 100644 index 0000000..f6be595 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/iGELU.c @@ -0,0 +1,33 @@ +/* ===================================================================== + * Title: iGELU.c + * Description: + * + * $Date: 13.11.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployPULPMath.h" + +void PULPiGELU_s8_s8(int8_t *data_in, int8_t *data_out, int32_t dataSize, + int8_t b, int16_t one, int32_t input_offset, + int32_t output_offset, int32_t *mul, int32_t *add, + int32_t *shift) {} diff --git a/TargetLibraries/PULPOpen/src/iRMSnorm.c b/TargetLibraries/PULPOpen/src/iRMSnorm.c new file mode 100644 index 0000000..78f882e --- /dev/null +++ b/TargetLibraries/PULPOpen/src/iRMSnorm.c @@ -0,0 +1,97 @@ +/* ===================================================================== + * Title: iRMSnorm.c + * Description: + * + * $Date: 14.03.2024 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" +#include + +inline int16_t _plp_sqrt_q16(int16_t pSrc) { + + int16_t number = pSrc; + int16_t root = 0; + + int16_t start = 0; + int16_t end = 255; // smallest integer that is larger than sqrt(0x7FFF) + int16_t mid; + + while (start <= end) { + + mid = (start + end) >> 1; + + if (((mid * mid)) == number) { + root = mid; + break; + } + + if (((mid * mid)) < number) { + start = mid + 1; + root = mid; + } else { + end = mid - 1; + } + } + + return root; +} + +void iRMSnorm_s8_s8_plp(int8_t *data_in, int8_t *data_out, int32_t *weight, + int32_t size, int32_t lastDimLength, int32_t log2D) { + + int32_t sum; + int32_t std; + int16_t temp, temp1; + int32_t intermediate; + + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + int16_t chunk = + (lastDimLength >> log2Core) + ((lastDimLength & (NUM_CORES - 1)) != 0); + int16_t chunk_start = MIN(chunk * core_id, lastDimLength); + int16_t chunk_stop = MIN(chunk_start + chunk, lastDimLength + 1); + + for (int i = 0; i < (size / lastDimLength); i++) { + sum = 0; + +#pragma unroll 8 + for (int j = 0; j < lastDimLength; j++) { + temp = (data_in[j + i * lastDimLength]); + sum += temp * temp; + } + + sum = sum / lastDimLength; + sum += 1; + std = _plp_sqrt_q16((int16_t)sum); + + for (int j = chunk_start; j < chunk_stop; j++) { + + intermediate = + (((data_in[j + i * lastDimLength] * weight[j]) / (std)) >> log2D); + + data_out[j + i * lastDimLength] = CLAMP(intermediate, -128, 127); + } + } +} diff --git a/TargetLibraries/PULPOpen/src/iSoftmax.c b/TargetLibraries/PULPOpen/src/iSoftmax.c new file mode 100644 index 0000000..de809bd --- /dev/null +++ b/TargetLibraries/PULPOpen/src/iSoftmax.c @@ -0,0 +1,124 @@ +/* ===================================================================== + * Title: iSoftmax.c + * Description: + * + * $Date: 13.11.2023 + * + * ===================================================================== */ +/* + * Copyright (C) 2020 ETH Zurich 
and University of Bologna. + * + * Author: Moritz Scherer, ETH Zurich + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void PULPSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2) { + uint8_t z; + int16_t xTilde, p; + uint32_t y_sum; + uint8_t x_max; + + uint32_t intermediateResult; + uint32_t chunk, offset; + + if (pi_core_id() < (NUM_CORES - 1)) { + chunk = (size / lastDimLength) / NUM_CORES; + offset = chunk * lastDimLength * pi_core_id(); + lastDimBuffer += lastDimLength * pi_core_id(); + } else { + uint32_t prevChunk = (size / lastDimLength) / NUM_CORES; + chunk = (size / lastDimLength) - prevChunk * (NUM_CORES - 1); + offset = size - (chunk * lastDimLength); + lastDimBuffer += lastDimLength * pi_core_id(); + } + + for (uint32_t i = offset; i < offset + (chunk * lastDimLength); + i += lastDimLength) { + y_sum = 0; + x_max = 0; + for (uint32_t j = 0; j < lastDimLength; j++) { + if (data_in[j + i] > x_max) { + x_max = data_in[j + i]; + } + } + for (uint32_t j = 0; j < lastDimLength; j++) { + xTilde = ((data_in[j + i]) - x_max); + z = (uint8_t)(-(xTilde / log2)); + z = CLAMP(z, 0, 31); + p = (xTilde + z * log2); + intermediateResult = (uint32_t)(((p + coeffB) * (p + coeffB)) + coeffC); + lastDimBuffer[j] = (uint32_t)(intermediateResult >> (z)); + y_sum 
+= lastDimBuffer[j]; + } + for (uint32_t j = 0; j < lastDimLength; j++) { + data_out[j + i] = (uint8_t)((lastDimBuffer[j] * 255) / (y_sum)); + } + } +} + +void PULPSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, + uint32_t *lastDimBuffer, uint32_t size, + uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, + int32_t log2) { + uint8_t z; + int16_t xTilde, p; + uint32_t y_sum; + int8_t x_max; + + uint32_t intermediateResult; + uint32_t chunk, offset; + + if (pi_core_id() < (NUM_CORES - 1)) { + chunk = (size / lastDimLength) / NUM_CORES; + offset = chunk * lastDimLength * pi_core_id(); + lastDimBuffer += lastDimLength * pi_core_id(); + } else { + uint32_t prevChunk = (size / lastDimLength) / NUM_CORES; + chunk = (size / lastDimLength) - prevChunk * (NUM_CORES - 1); + offset = size - (chunk * lastDimLength); + lastDimBuffer += lastDimLength * pi_core_id(); + } + + for (uint32_t i = offset; i < offset + (chunk * lastDimLength); + i += lastDimLength) { + + y_sum = 0; + x_max = -128; + for (uint32_t j = 0; j < lastDimLength; j++) { + if (data_in[j + i] > x_max) { + x_max = data_in[j + i]; + } + } + for (uint32_t j = 0; j < lastDimLength; j++) { + xTilde = ((data_in[j + i]) - x_max); + z = (uint8_t)(-(xTilde / log2)); + z = CLAMP(z, 0, 31); + p = (xTilde + z * log2); + intermediateResult = (((p + coeffB) * (p + coeffB)) + coeffC); + lastDimBuffer[j] = (intermediateResult >> (z)); + y_sum += lastDimBuffer[j]; + } + for (uint32_t j = 0; j < lastDimLength; j++) { + data_out[j + i] = (uint8_t)((lastDimBuffer[j] * 255) / (y_sum)); + } + } +} diff --git a/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed b/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed new file mode 160000 index 0000000..b69ec23 --- /dev/null +++ b/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed @@ -0,0 +1 @@ +Subproject commit b69ec23ec81595ebbec694f4a28d84022858af83 diff --git a/TargetLibraries/PULPOpen/third_party/pulp-nnx b/TargetLibraries/PULPOpen/third_party/pulp-nnx new file mode 160000 
index 0000000..234971f --- /dev/null +++ b/TargetLibraries/PULPOpen/third_party/pulp-nnx @@ -0,0 +1 @@ +Subproject commit 234971fca4a0eba5e8b703e9ccb62b7764dac7fa diff --git a/cmake/Util.cmake b/cmake/Util.cmake new file mode 100644 index 0000000..a46904c --- /dev/null +++ b/cmake/Util.cmake @@ -0,0 +1,27 @@ +macro(add_deeploy_library name) + add_library(${ARGV}) + add_custom_command( + TARGET ${name} + POST_BUILD + COMMAND ${CMAKE_OBJDUMP} -dhS $ > $.s) +endmacro() + +macro(add_deeploy_executable name) + add_executable(${ARGV}) + add_custom_command( + TARGET ${name} + POST_BUILD + COMMAND ${CMAKE_OBJDUMP} -dhS $ > $.s) +endmacro() + +macro(link_compile_dump name) + add_custom_command( + TARGET ${name} + POST_BUILD + COMMAND ln -sf ${CMAKE_BINARY_DIR}/compile_commands.json ${CMAKE_SOURCE_DIR}/compile_commands.json) +endmacro() + +function(math_shell expr output) + execute_process(COMMAND awk "BEGIN {printf ${expr}}" OUTPUT_VARIABLE __output) + set(${output} ${__output} PARENT_SCOPE) +endfunction() diff --git a/cmake/cmsis/cmsis.cmake b/cmake/cmsis/cmsis.cmake new file mode 100644 index 0000000..0819815 --- /dev/null +++ b/cmake/cmsis/cmsis.cmake @@ -0,0 +1,5 @@ +add_compile_definitions( + DEEPLOY_CMSIS_PLATFORM +) + +set(DEEPLOY_ARCH CMSIS) diff --git a/cmake/cmsis/qemu.cmake b/cmake/cmsis/qemu.cmake new file mode 100644 index 0000000..d8b8e04 --- /dev/null +++ b/cmake/cmsis/qemu.cmake @@ -0,0 +1,38 @@ +set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU") +set(CPU cortex-m4) +set(FPU fpv4-sp-d16) +set(FABI soft) + +add_compile_options( + -mcpu=${CPU} + -mfpu=${FPU} + -mfloat-abi=${FABI} +) + +add_link_options( + -mcpu=${CPU} + -mfpu=${FPU} + -mfloat-abi=${FABI} +) + +macro(add_binary_dump name) + add_custom_target(bin_${name} + DEPENDS ${name} + COMMAND ${CMAKE_OBJCOPY} -Obinary ${CMAKE_BINARY_DIR}/bin/${name} ${CMAKE_BINARY_DIR}/bin/${name}.bin + COMMENT "Dumping raw binary" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() + +macro(add_qemu_emulation name) 
+ add_custom_target(qemu_${name} + DEPENDS bin_${name} + COMMAND qemu-system-arm -machine mps2-an386 -cpu cortex-m4 -monitor null -semihosting --semihosting-config enable=on,target=native -kernel ${CMAKE_BINARY_DIR}/bin/${name}.bin -serial stdio -nographic + COMMENT "Simulating deeploytest with QEMU" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() diff --git a/cmake/cmsis/toolchain_gcc.cmake b/cmake/cmsis/toolchain_gcc.cmake new file mode 100644 index 0000000..c99245b --- /dev/null +++ b/cmake/cmsis/toolchain_gcc.cmake @@ -0,0 +1,53 @@ +set(TOOLCHAIN_PREFIX arm-none-eabi) + +set(CMAKE_SYSTEM_NAME Generic) + +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++) +set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +set(CMAKE_OBJCOPY ${TOOLCHAIN_PREFIX}-objcopy) +set(CMAKE_OBJDUMP ${TOOLCHAIN_PREFIX}-objdump) +set(CMAKE_AR ${TOOLCHAIN_PREFIX}-ar) +set(SIZE ${TOOLCHAIN_PREFIX}-size) + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") + +add_compile_options( + -mthumb + -ffunction-sections + -fdata-sections + -fomit-frame-pointer + -MMD + -MP + -std=c99 + -Wall + -g + -O2 + -I${TOOLCHAIN_INSTALL_DIR}/picolibc/arm/include +) + +add_link_options( + -mthumb + -nostartfiles + -static + -MMD + -MP + -Wl,--print-memory-usage + -L${TOOLCHAIN_INSTALL_DIR}/picolibc/arm/lib + -Tpicolibc.ld + -Wl,--defsym=__flash=0x00000000 + -Wl,--defsym=__flash_size=0x400000 + -Wl,--defsym=__ram=0x20000000 + -Wl,--defsym=__ram_size=0x400000 + -Wl,--defsym=__stack_size=0x4000 +) + +link_libraries( + -lm + -lc + -lcrt0-semihost + -lsemihost +) + +add_compile_definitions(__LINK_LD) +add_compile_definitions(__TOOLCHAIN_GCC__) diff --git a/cmake/cmsis/toolchain_llvm.cmake b/cmake/cmsis/toolchain_llvm.cmake new file mode 100644 index 0000000..9a5575b --- /dev/null +++ b/cmake/cmsis/toolchain_llvm.cmake @@ -0,0 +1,60 @@ +set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) + +set(CMAKE_SYSTEM_NAME Generic) + +set(LLVM_TAG llvm) + +set(CMAKE_C_COMPILER 
${TOOLCHAIN_PREFIX}/clang) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}/clang++) +set(CMAKE_ASM_COMPILER ${TOOLCHAIN_PREFIX}/clang) +set(CMAKE_OBJCOPY ${TOOLCHAIN_PREFIX}/${LLVM_TAG}-objcopy) +set(CMAKE_OBJDUMP ${TOOLCHAIN_PREFIX}/${LLVM_TAG}-objdump) + +set(ISA cortex-m4) +set(PE 8) +set(FC 1) + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") + +add_compile_options( + -target armv7m-none-eabi + -mcpu=${ISA} + -ffunction-sections + -fdata-sections + -fomit-frame-pointer + -fno-exceptions + -fno-rtti + -mno-relax + -O2 + -g3 + -DNUM_CORES=${NUM_CORES} + -MMD + -MP + -I${TOOLCHAIN_INSTALL_DIR}/picolibc/arm/include +) + +add_link_options( + -target armv7m-none-eabi + -MMD + -MP + -mcpu=${ISA} + -L${TOOLCHAIN_INSTALL_DIR}/picolibc/arm/lib + -Tpicolibc.ld + -v + -Wl,--defsym=__flash=0x00000000 + -Wl,--defsym=__flash_size=0x400000 + -Wl,--defsym=__ram=0x20000000 + -Wl,--defsym=__ram_size=0x400000 + -Wl,--defsym=__stack_size=0x4000 + #-z norelro +) + +link_libraries( + -lm + -lc + -lcrt0-semihost + -lsemihost +) + +add_compile_definitions(__LINK_LD) +add_compile_definitions(__TOOLCHAIN_LLVM__) diff --git a/cmake/common.cmake b/cmake/common.cmake new file mode 100644 index 0000000..07289e6 --- /dev/null +++ b/cmake/common.cmake @@ -0,0 +1,40 @@ +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +set(CMAKE_EXPORT_COMPILE_COMMANDS TRUE) + +set(use_dma 1 CACHE STRING "Enable DMA trasfers") + +add_compile_definitions( + USE_DMA=${use_dma} +) + +add_library(deeploylib INTERFACE) + +add_compile_options( + -std=gnu99 + + -ffast-math + -fdiagnostics-color=always + + -Wunused-variable + -Wconversion + -Wall + -Wextra + + -O2 + -g + -ffunction-sections + -fdata-sections +) + +add_link_options( + -std=gnu99 + + -ffast-math + -fdiagnostics-color=always + + -Wunused-variable + -Wconversion + -Wall + -Wextra + -Wl,--gc-sections +) diff --git a/cmake/generic/generic.cmake b/cmake/generic/generic.cmake new file mode 100644 index 0000000..bb81e1e --- /dev/null +++ b/cmake/generic/generic.cmake @@ -0,0 
+1,3 @@ +add_compile_definitions( + DEEPLOY_GENERIC_PLATFORM +) diff --git a/cmake/generic/toolchain_llvm.cmake b/cmake/generic/toolchain_llvm.cmake new file mode 100644 index 0000000..b3954ca --- /dev/null +++ b/cmake/generic/toolchain_llvm.cmake @@ -0,0 +1,15 @@ +set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) + +set(CMAKE_SYSTEM_NAME Generic) + +set(LLVM_TAG llvm) + +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}/clang) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}/clang++) +set(CMAKE_ASM_COMPILER ${TOOLCHAIN_PREFIX}/clang) +set(CMAKE_OBJCOPY ${TOOLCHAIN_PREFIX}/${LLVM_TAG}-objcopy) +set(CMAKE_OBJDUMP ${TOOLCHAIN_PREFIX}/${LLVM_TAG}-objdump) + +add_link_options( + -fuse-ld=lld +) diff --git a/cmake/mempool/mempool.cmake b/cmake/mempool/mempool.cmake new file mode 100644 index 0000000..5638a34 --- /dev/null +++ b/cmake/mempool/mempool.cmake @@ -0,0 +1,111 @@ +############################# +## Address configuration ## +############################# + +# Boot address (in dec) +set(boot_addr 2684354560 CACHE STRING "Boot address (in dec)") # A0000000 + +# L2 memory configuration (in hex) +set(l2_base 2147483648 CACHE STRING "L2 Memory Base (in dec)") # 80000000 +set(l2_size 4194304 CACHE STRING "L2 Memory Size (in dec)") # 400000 +set(l2_banks 4 CACHE STRING "Number of L2 banks") + +# Size of sequential memory per core (in bytes) +# (must be a power of two) +set(seq_mem_size 1024 CACHE STRING "Size of sequential memory per core (in bytes, must be a power of two)") + +# Size of stack in sequential memory per core (in bytes) +set(stack_size 1024 CACHE STRING "Size of stack in sequential memory per core (in bytes)") + +######################### +## AXI configuration ## +######################### +# AXI bus data width (in bits) +set(axi_data_width 512 CACHE STRING "AXI bus data width (in bits)") + +# Read-only cache line width in AXI interconnect (in bits) +set(ro_line_width 512 CACHE STRING "Read-only cache line width in AXI interconnect (in bits)") + +# Number of DMA 
backends in each group +set(dmas_per_group 4 CACHE STRING "Number of DMA backends in each group") + +############################# +## Xqueues configuration ## +############################# + +# XQueue extension's queue size in each memory bank (in words) +set(xqueue_size 0 CACHE STRING "XQueue extension's queue size in each memory bank (in words)") + +################################ +## Optional functionalities ## +################################ + +# Enable the XpulpIMG extension +set(xpulpimg 0 CACHE STRING "Enable the XpulpIMG extension") + +################## +## Simulation ## +################## + +set(BANSHEE_CONFIG ${CMAKE_CURRENT_LIST_DIR}/mempool.yaml CACHE INTERNAL "source_list") + +############### +## MemPool ## +############### + +# Number of cores +set(num_cores 256 CACHE STRING "Number of cores") + +set(num_eff_cores 256 CACHE STRING "Number of effective cores") + +# Number of groups +set(num_groups 4 CACHE STRING "Number of groups") + +# Number of cores per MemPool tile +set(num_cores_per_tile 4 CACHE STRING "Number of cores per MemPool tile") + +# L1 scratchpad banking factor +set(banking_factor 4 CACHE STRING "L1 scratchpad banking factor") + +# Radix for hierarchical AXI interconnect +set(axi_hier_radix 20 CACHE STRING "Radix for hierarchical AXI interconnect") + +# Number of AXI masters per group +set(axi_masters_per_group 1 CACHE STRING "Number of AXI masters per group") + +math_shell("${num_cores} / ${num_groups}" num_cores_per_group) +math_shell("(${num_cores} / ${num_groups}) / ${num_cores_per_tile}" num_tiles_per_group) +math_shell("log(${num_cores_per_tile}) / log(2)" log2_num_cores_per_tile) +math_shell("log(${seq_mem_size}) / log(2)" log2_seq_mem_size) +math_shell("log(${stack_size}) / log(2)" log2_stack_size) + +add_compile_definitions( + DEEPLOY_MEMPOOL_PLATFORM +) + +add_compile_definitions( + + PRINTF_DISABLE_SUPPORT_FLOAT + PRINTF_DISABLE_SUPPORT_LONG_LONG + PRINTF_DISABLE_SUPPORT_PTRDIFF_T + + NUM_CORES=${num_cores} + 
NUM_EFF_CORES=${num_eff_cores} + + NUM_THREADS=${num_threads} + NUM_GROUPS=${num_groups} + NUM_CORES_PER_TILE=${num_cores_per_tile} + LOG2_NUM_CORES_PER_TILE=${log2_num_cores_per_tile} + BANKING_FACTOR=${banking_factor} + NUM_CORES_PER_GROUP=${num_cores_per_group} + NUM_TILES_PER_GROUP=${num_tiles_per_group} + + BOOT_ADDR=${boot_addr} + L2_BASE=${l2_base} + L2_SIZE=${l2_size} + LOG2_SEQ_MEM_SIZE=${log2_seq_mem_size} + SEQ_MEM_SIZE=${seq_mem_size} + STACK_SIZE=${stack_size} + LOG2_STACK_SIZE=${log2_stack_size} + XQUEUE_SIZE=${xqueue_size} +) diff --git a/cmake/mempool/mempool.yaml b/cmake/mempool/mempool.yaml new file mode 100644 index 0000000..b9e776d --- /dev/null +++ b/cmake/mempool/mempool.yaml @@ -0,0 +1,53 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +--- +address: + scratch_reg: 0x40000000 + wakeup_reg: 0x40000004 + tcdm_start: 0x40000008 + tcdm_end: 0x4000000C + nr_cores: 0x40000010 + uart: 0xC0000000 + # Not supported in MemPool + barrier_reg: + start: 0x50000000 + offset: 0x100000 + cluster_base_hartid: 0x50000001 + cluster_num: 0x50000002 + cluster_id: 0x50000003 + cl_clint: 0x40000060 + clint: 0xFFFF0000 +memory: + tcdm: + start: 0x0 + size: 0x100000 + offset: 0x100000 + latency: 5 + dram: + start: 0x80000000 + size: 0x01000000 + offset: 0x0 + latency: 10 + periphs: + start: 0x40000000 + size: 0x20000 + offset: 0x0 + latency: 5 + callbacks: + - name: zero-memory + size: 0x10000 + - name: mempool-dma + size: 0x1C +inst_latency: + mul: 3 + mulh: 3 + mulhsu: 3 + mulhu: 3 + div: 3 + divu: 3 + rem: 3 + remu: 3 +ssr: + num_dm: 3 diff --git a/cmake/mempool/mempool_ita.cmake b/cmake/mempool/mempool_ita.cmake new file mode 100644 index 0000000..bc3631c --- /dev/null +++ b/cmake/mempool/mempool_ita.cmake @@ -0,0 +1,121 @@ +############################# +## Address configuration ## +############################# + +# Boot address (in 
dec) +set(boot_addr 2684354560 CACHE STRING "Boot address (in dec)") # A0000000 + +# L2 memory configuration (in hex) +set(l2_base 2147483648 CACHE STRING "L2 Memory Base (in dec)") # 80000000 +set(l2_size 4194304 CACHE STRING "L2 Memory Size (in dec)") # 400000 +set(l2_banks 16 CACHE STRING "Number of L2 banks") + +# Size of sequential memory per core (in bytes) +# (must be a power of two) +set(seq_mem_size 1024 CACHE STRING "Size of sequential memory per core (in bytes, must be a power of two)") + +# Size of stack in sequential memory per core (in bytes) +set(stack_size 1024 CACHE STRING "Size of stack in sequential memory per core (in bytes)") + +######################### +## AXI configuration ## +######################### +# AXI bus data width (in bits) +set(axi_data_width 128 CACHE STRING "AXI bus data width (in bits)") + +# Read-only cache line width in AXI interconnect (in bits) +set(ro_line_width 512 CACHE STRING "Read-only cache line width in AXI interconnect (in bits)") + +# Number of DMA backends in each group +set(dmas_per_group 3 CACHE STRING "Number of DMA backends in each group") + +############################# +## Xqueues configuration ## +############################# + +# XQueue extension's queue size in each memory bank (in words) +set(xqueue_size 0 CACHE STRING "XQueue extension's queue size in each memory bank (in words)") + +################################ +## Optional functionalities ## +################################ + +# Enable the XpulpIMG extension +set(xpulpimg 0 CACHE STRING "Enable the XpulpIMG extension") + +################## +## Simulation ## +################## + +set(BANSHEE_CONFIG ${CMAKE_CURRENT_LIST_DIR}/mempool_ita.yaml CACHE INTERNAL "source_list") + +############### +## MemPool ## +############### + +# Number of cores +set(num_cores 256 CACHE STRING "Number of cores") + +set(num_eff_cores 192 CACHE STRING "Number of effective cores") + +# Number of groups +set(num_groups 4 CACHE STRING "Number of groups") + +# Number of 
cores per MemPool tile +set(num_accel_tiles_per_group 4 CACHE STRING "Number of accelerator tiles per group") + +# Number of cores per MemPool tile +set(num_cores_per_tile 4 CACHE STRING "Number of cores per MemPool tile") + +# L1 scratchpad banking factor +set(banking_factor 4 CACHE STRING "L1 scratchpad banking factor") + +# Radix for hierarchical AXI interconnect +set(axi_hier_radix 20 CACHE STRING "Radix for hierarchical AXI interconnect") + +# Number of AXI masters per group +set(axi_masters_per_group 1 CACHE STRING "Number of AXI masters per group") + +############### +## ITA ## +############### +set(ita_pe 16 CACHE STRING "ITA number of processing engines per ITA core") + +math_shell("${num_cores} / ${num_groups}" num_cores_per_group) +math_shell("(${num_cores} / ${num_groups}) / ${num_cores_per_tile}" num_tiles_per_group) +math_shell("log(${num_cores_per_tile}) / log(2)" log2_num_cores_per_tile) +math_shell("log(${seq_mem_size}) / log(2)" log2_seq_mem_size) +math_shell("log(${stack_size}) / log(2)" log2_stack_size) + +add_compile_definitions( + DEEPLOY_MEMPOOL_PLATFORM +) + +add_compile_definitions( + + PRINTF_DISABLE_SUPPORT_FLOAT + PRINTF_DISABLE_SUPPORT_LONG_LONG + PRINTF_DISABLE_SUPPORT_PTRDIFF_T + + NUM_CORES=${num_cores} + NUM_EFF_CORES=${num_eff_cores} + + ITA_PE=${ita_pe} + + NUM_THREADS=${num_threads} + NUM_GROUPS=${num_groups} + NUM_CORES_PER_TILE=${num_cores_per_tile} + LOG2_NUM_CORES_PER_TILE=${log2_num_cores_per_tile} + BANKING_FACTOR=${banking_factor} + NUM_CORES_PER_GROUP=${num_cores_per_group} + NUM_TILES_PER_GROUP=${num_tiles_per_group} + + BOOT_ADDR=${boot_addr} + L2_BASE=${l2_base} + L2_SIZE=${l2_size} + LOG2_SEQ_MEM_SIZE=${log2_seq_mem_size} + SEQ_MEM_SIZE=${seq_mem_size} + STACK_SIZE=${stack_size} + LOG2_STACK_SIZE=${log2_stack_size} + XQUEUE_SIZE=${xqueue_size} +) diff --git a/cmake/mempool/mempool_ita.yaml b/cmake/mempool/mempool_ita.yaml new file mode 100644 index 0000000..3c26e30 --- /dev/null +++ b/cmake/mempool/mempool_ita.yaml @@ 
-0,0 +1,57 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +--- +address: + scratch_reg: 0x40000000 + wakeup_reg: 0x40000004 + tcdm_start: 0x40000008 + tcdm_end: 0x4000000C + nr_cores: 0x40000010 + uart: 0xC0000000 + # Not supported in MemPool + barrier_reg: + start: 0x50000000 + offset: 0x100000 + cluster_base_hartid: 0x50000001 + cluster_num: 0x50000002 + cluster_id: 0x50000003 + cl_clint: 0x40000060 + clint: 0xFFFF0000 +memory: + tcdm: + start: 0x0 + size: 0x100000 + offset: 0x100000 + latency: 5 + dram: + start: 0x80000000 + size: 0x01000000 + offset: 0x0 + latency: 10 + periphs: + start: 0x40000000 + size: 0x20000 + offset: 0x0 + latency: 5 + callbacks: + - name: zero-memory + size: 0x40 + - name: mempool-ita + size: 0xC0 + - name: zero-memory + size: 0xFF00 + - name: mempool-dma + size: 0x1C +inst_latency: + mul: 3 + mulh: 3 + mulhsu: 3 + mulhu: 3 + div: 3 + divu: 3 + rem: 3 + remu: 3 +ssr: + num_dm: 3 diff --git a/cmake/mempool/minpool.cmake b/cmake/mempool/minpool.cmake new file mode 100644 index 0000000..3c68850 --- /dev/null +++ b/cmake/mempool/minpool.cmake @@ -0,0 +1,111 @@ +############################# +## Address configuration ## +############################# + +# Boot address (in dec) +set(boot_addr 2684354560 CACHE STRING "Boot address (in dec)") # A0000000 + +# L2 memory configuration (in hex) +set(l2_base 2147483648 CACHE STRING "L2 Memory Base (in dec)") # 80000000 +set(l2_size 4194304 CACHE STRING "L2 Memory Size (in dec)") # 400000 +set(l2_banks 4 CACHE STRING "Number of L2 banks") + +# Size of sequential memory per core (in bytes) +# (must be a power of two) +set(seq_mem_size 1024 CACHE STRING "Size of sequential memory per core (in bytes, must be a power of two)") + +# Size of stack in sequential memory per core (in bytes) +set(stack_size 1024 CACHE STRING "Size of stack in sequential memory per core (in bytes)") + 
+######################### +## AXI configuration ## +######################### +# AXI bus data width (in bits) +set(axi_data_width 256 CACHE STRING "AXI bus data width (in bits)") + +# Read-only cache line width in AXI interconnect (in bits) +set(ro_line_width 256 CACHE STRING "Read-only cache line width in AXI interconnect (in bits)") + +# Number of DMA backends in each group +set(dmas_per_group 1 CACHE STRING "Number of DMA backends in each group") + +############################# +## Xqueues configuration ## +############################# + +# XQueue extension's queue size in each memory bank (in words) +set(xqueue_size 0 CACHE STRING "XQueue extension's queue size in each memory bank (in words)") + +################################ +## Optional functionalities ## +################################ + +# Enable the XpulpIMG extension +set(xpulpimg 0 CACHE STRING "Enable the XpulpIMG extension") + +################## +## Simulation ## +################## + +set(BANSHEE_CONFIG ${CMAKE_CURRENT_LIST_DIR}/minpool.yaml CACHE INTERNAL "source_list") + +############### +## MinPool ## +############### + +# Number of cores +set(num_cores 16 CACHE STRING "Number of cores") + +set(num_eff_cores 16 CACHE STRING "Number of effective cores") + +# Number of groups +set(num_groups 4 CACHE STRING "Number of groups") + +# Number of cores per MemPool tile +set(num_cores_per_tile 4 CACHE STRING "Number of cores per MemPool tile") + +# L1 scratchpad banking factor +set(banking_factor 4 CACHE STRING "L1 scratchpad banking factor") + +# Radix for hierarchical AXI interconnect +set(axi_hier_radix 2 CACHE STRING "Radix for hierarchical AXI interconnect") + +# Number of AXI masters per group +set(axi_masters_per_group 1 CACHE STRING "Number of AXI masters per group") + +math_shell("${num_cores} / ${num_groups}" num_cores_per_group) +math_shell("(${num_cores} / ${num_groups}) / ${num_cores_per_tile}" num_tiles_per_group) +math_shell("log(${num_cores_per_tile}) / log(2)" 
log2_num_cores_per_tile) +math_shell("log(${seq_mem_size}) / log(2)" log2_seq_mem_size) +math_shell("log(${stack_size}) / log(2)" log2_stack_size) + +add_compile_definitions( + DEEPLOY_MEMPOOL_PLATFORM +) + +add_compile_definitions( + + PRINTF_DISABLE_SUPPORT_FLOAT + PRINTF_DISABLE_SUPPORT_LONG_LONG + PRINTF_DISABLE_SUPPORT_PTRDIFF_T + + NUM_CORES=${num_cores} + NUM_EFF_CORES=${num_eff_cores} + + NUM_THREADS=${num_threads} + NUM_GROUPS=${num_groups} + NUM_CORES_PER_TILE=${num_cores_per_tile} + LOG2_NUM_CORES_PER_TILE=${log2_num_cores_per_tile} + BANKING_FACTOR=${banking_factor} + NUM_CORES_PER_GROUP=${num_cores_per_group} + NUM_TILES_PER_GROUP=${num_tiles_per_group} + + BOOT_ADDR=${boot_addr} + L2_BASE=${l2_base} + L2_SIZE=${l2_size} + LOG2_SEQ_MEM_SIZE=${log2_seq_mem_size} + SEQ_MEM_SIZE=${seq_mem_size} + STACK_SIZE=${stack_size} + LOG2_STACK_SIZE=${log2_stack_size} + XQUEUE_SIZE=${xqueue_size} +) diff --git a/cmake/mempool/minpool.yaml b/cmake/mempool/minpool.yaml new file mode 100644 index 0000000..c80e996 --- /dev/null +++ b/cmake/mempool/minpool.yaml @@ -0,0 +1,53 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +--- +address: + scratch_reg: 0x40000000 + wakeup_reg: 0x40000004 + tcdm_start: 0x40000008 + tcdm_end: 0x4000000C + nr_cores: 0x40000010 + uart: 0xC0000000 + # Not supported in MemPool + barrier_reg: + start: 0x50000000 + offset: 0x004000 + cluster_base_hartid: 0x50000001 + cluster_num: 0x50000002 + cluster_id: 0x50000003 + cl_clint: 0x40000060 + clint: 0xFFFF0000 +memory: + tcdm: + start: 0x0 + size: 0x004000 + offset: 0x004000 + latency: 5 + dram: + start: 0x80000000 + size: 0x01000000 + offset: 0x0 + latency: 10 + periphs: + start: 0x40000000 + size: 0x20000 + offset: 0x0 + latency: 5 + callbacks: + - name: zero-memory + size: 0x10000 + - name: mempool-dma + size: 0x1C +inst_latency: + mul: 3 + mulh: 3 + mulhsu: 3 + mulhu: 3 + div: 3 + divu: 3 + rem: 3 + remu: 3 +ssr: + num_dm: 3 diff --git a/cmake/mempool/toolchain_gcc.cmake b/cmake/mempool/toolchain_gcc.cmake new file mode 100644 index 0000000..fa0636f --- /dev/null +++ b/cmake/mempool/toolchain_gcc.cmake @@ -0,0 +1,75 @@ +set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin/riscv32-unknown-elf) + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_OBJCOPY ${TOOLCHAIN_PREFIX}-objcopy) +set(CMAKE_OBJDUMP ${TOOLCHAIN_PREFIX}-objdump) +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc) +set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++) +set(CMAKE_AR ${TOOLCHAIN_PREFIX}-ar) +set(SIZE ${TOOLCHAIN_PREFIX}-size) + + +if(xpulpimg) + add_compile_options( + -march=rv32imaXpulpimg + -Wa,-march=rv32imaXpulpimg + ) + add_link_options( + -march=rv32imaXpulpimg + -Wa,-march=rv32imaXpulpimg + ) +else() + add_compile_options( + -march=rv32ima + -Wa,-march=rv32ima + ) + add_link_options( + -march=rv32ima + -Wa,-march=rv32ima + ) +endif() + +add_compile_options( + -mabi=ilp32 + -mcmodel=medany + -mtune=mempool + + # -falign-loops=32 + # -falign-jumps=32 + # Turn of optimization that lead to known problems + -fno-tree-loop-distribute-patterns + -fno-builtin-memcpy 
+ -fno-builtin-memset + + -fno-builtin-printf + -fno-common + + -static +) + +add_link_options( + -mabi=ilp32 + -mcmodel=medany + -mtune=mempool + + # Turn of optimization that lead to known problems + -fno-tree-loop-distribute-patterns + -fno-builtin-memcpy + -fno-builtin-memset + + -fno-builtin-printf + -fno-common + + -static + -nostartfiles +) + +link_libraries( + -lm + -lgcc +) + + +add_compile_definitions(__LINK_LD) +add_compile_definitions(__TOOLCHAIN_GCC__) diff --git a/cmake/mempool/toolchain_llvm.cmake b/cmake/mempool/toolchain_llvm.cmake new file mode 100644 index 0000000..bbca8fe --- /dev/null +++ b/cmake/mempool/toolchain_llvm.cmake @@ -0,0 +1,83 @@ +set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_OBJCOPY ${TOOLCHAIN_PREFIX}/llvm-objcopy) +set(CMAKE_OBJDUMP ${TOOLCHAIN_PREFIX}/llvm-objdump) +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}/clang) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}/clang++) +set(CMAKE_AR ${TOOLCHAIN_PREFIX}/llvm-ar) +set(SIZE ${TOOLCHAIN_PREFIX}/llvm-size) + +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) + +if(xpulpimg) + message(FATAL_ERROR "Xpulpimg extension is not supported for this compiler!") +else() + add_compile_options( + -march=rv32ima + ) + add_link_options( + -march=rv32ima + ) +endif() + +add_compile_options( + --target=riscv32-unknown-elf + --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv + + -mabi=ilp32 + -mcmodel=medany + -mcpu=mempool-rv32 + -mllvm + -misched-topdown + + -std=gnu99 + + -fno-builtin-memcpy + -fno-builtin-memset + + -ffast-math + -fno-builtin-printf + -fno-common + -fdiagnostics-color=always + + -Wunused-variable + -Wconversion + -Wall + -Wextra + + -static +) + +add_link_options( + --target=riscv32-unknown-elf + --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv + + -mabi=ilp32 + -mcmodel=medany + -mcpu=mempool-rv32 + -std=gnu99 + + -fno-builtin-memcpy + -fno-builtin-memset + + -ffast-math + -fno-builtin-printf + -fno-common + 
-fdiagnostics-color=always + + -Wunused-variable + -Wconversion + -Wall + -Wextra + + -static + -L${TOOLCHAIN_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32im/ +) + +link_libraries( + -lm +) + +add_compile_definitions(__LINK_LD) +add_compile_definitions(__TOOLCHAIN_LLVM__) \ No newline at end of file diff --git a/cmake/pulp/pulp-open/pulp-open.cmake b/cmake/pulp/pulp-open/pulp-open.cmake new file mode 100644 index 0000000..3d08164 --- /dev/null +++ b/cmake/pulp/pulp-open/pulp-open.cmake @@ -0,0 +1,3 @@ +set(PULPNNVERSION XPULPV2) +set(PULPNNBITWIDTH 32) +set(CMAKE_VERBOSE_MAKEFILE ON) diff --git a/cmake/pulp/pulp.cmake b/cmake/pulp/pulp.cmake new file mode 100644 index 0000000..10b04a1 --- /dev/null +++ b/cmake/pulp/pulp.cmake @@ -0,0 +1,18 @@ +add_compile_definitions( + DEEPLOY_PULP_PLATFORM +) + +set(DEEPLOY_ARCH PULP) + +macro(add_gvsoc_emulation name) + add_custom_target(gvsoc_${name} + DEPENDS ${name} + COMMAND gapy --target=siracusa --platform=gvsoc --work-dir=${CMAKE_BINARY_DIR}/bin --config-opt=cluster/nb_pe=8 ${GVSOCHEXINCLUDE} --config-opt=**/runner/verbose=true -v run --image --binary=${CMAKE_BINARY_DIR}/bin/${name} > /dev/null + COMMAND gapy --target=siracusa --platform=gvsoc --work-dir=${CMAKE_BINARY_DIR}/bin --config-opt=cluster/nb_pe=8 ${GVSOCHEXINCLUDE} --config-opt=**/runner/verbose=true -v run --flash --binary=${CMAKE_BINARY_DIR}/bin/${name} > /dev/null + COMMAND gapy --target=siracusa --platform=gvsoc --work-dir=${CMAKE_BINARY_DIR}/bin --config-opt=cluster/nb_pe=8 ${GVSOCHEXINCLUDE} --config-opt=**/runner/verbose=true -v run --exec-prepare --exec --binary=${CMAKE_BINARY_DIR}/bin/${name} + COMMENT "Simulating deeploytest with GVSOC" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() diff --git a/cmake/pulp/siracusa/siracusa.cmake b/cmake/pulp/siracusa/siracusa.cmake new file mode 100644 index 0000000..3d08164 --- /dev/null +++ b/cmake/pulp/siracusa/siracusa.cmake @@ -0,0 +1,3 @@ +set(PULPNNVERSION XPULPV2) +set(PULPNNBITWIDTH 32) 
+set(CMAKE_VERBOSE_MAKEFILE ON) diff --git a/cmake/pulp/toolchain_gcc.cmake b/cmake/pulp/toolchain_gcc.cmake new file mode 100644 index 0000000..a5681a3 --- /dev/null +++ b/cmake/pulp/toolchain_gcc.cmake @@ -0,0 +1,47 @@ +set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin/riscv32-unknown-elf) + +set(CMAKE_SYSTEM_NAME Generic) + +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++) +set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +set(CMAKE_OBJCOPY ${TOOLCHAIN_PREFIX}-objcopy) +set(CMAKE_OBJDUMP ${TOOLCHAIN_PREFIX}-objdump) +set(CMAKE_AR ${TOOLCHAIN_PREFIX}-ar) +set(SIZE ${TOOLCHAIN_PREFIX}-size) + +set(ISA rv32imc_zfinx_xpulpv3) +set(PE 8) +set(FC 1) + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") + +add_compile_options( + -march=${ISA} + -ffunction-sections + -fdata-sections + -fomit-frame-pointer + -fno-jump-tables + -fno-tree-loop-distribute-patterns + -O3 + -DNUM_CORES=${NUM_CORES} + -MMD + -MP +) + +add_link_options( + -MMD + -MP + -march=${ISA} + -nostartfiles + -nostdlib + -Wl,--print-memory-usage +) + +link_libraries( + -lm + -lgcc +) + +add_compile_definitions(__LINK_LD) +add_compile_definitions(__TOOLCHAIN_GCC__) diff --git a/cmake/pulp/toolchain_llvm.cmake b/cmake/pulp/toolchain_llvm.cmake new file mode 100644 index 0000000..dcf8754 --- /dev/null +++ b/cmake/pulp/toolchain_llvm.cmake @@ -0,0 +1,53 @@ +set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) + +set(CMAKE_SYSTEM_NAME Generic) + +set(LLVM_TAG llvm) + +set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}/clang) +set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}/clang++) +set(CMAKE_ASM_COMPILER ${TOOLCHAIN_PREFIX}/clang) +set(CMAKE_OBJCOPY ${TOOLCHAIN_PREFIX}/${LLVM_TAG}-objcopy) +set(CMAKE_OBJDUMP ${TOOLCHAIN_PREFIX}/${LLVM_TAG}-objdump) + +set(ISA rv32imc_zfinx_xpulpv2) +set(PE 8) +set(FC 1) + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") + +add_compile_options( + -target riscv32-unknown-elf + -march=${ISA} + -ffunction-sections + -fdata-sections + -fomit-frame-pointer + -mno-relax + -O3 + 
-DNUM_CORES=${NUM_CORES} + -MMD + -MP + --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv + -fno-builtin-memcpy + -fno-builtin-memset +) + +add_link_options( + -target riscv32-unknown-elf + -MMD + -MP + -nostartfiles + -march=${ISA} + --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv + -L${TOOLCHAIN_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32imc/ + -z norelro + -fno-builtin-memcpy + -fno-builtin-memset +) + +link_libraries( + -lm +) + +add_compile_definitions(__LINK_LD) +add_compile_definitions(__TOOLCHAIN_LLVM__) diff --git a/cmake/simulation.cmake b/cmake/simulation.cmake new file mode 100644 index 0000000..c4a0bc4 --- /dev/null +++ b/cmake/simulation.cmake @@ -0,0 +1,50 @@ +######################### +## Simulation Config ## +######################### + +set(QUESTA questa-2022.3 CACHE STRING "QuestaSim version for RTL simulation") +set(VERILATOR verilator-4.110 CACHE STRING "Verilator version for RTL simulation") +set(VCS vcs-2020.12 CACHE STRING "VCS version for RTL simulations" ) + +set(num_threads 1 CACHE STRING "Number of active cores") + +set(banshee_stack_size 16777216 CACHE STRING "Stack size of banshee threads") + +OPTION(banshee_simulation "Optimize binary for banshee simulation" OFF) +if(banshee_simulation) + add_compile_definitions(BANSHEE_SIMULATION) +endif() + +######################### +## Utility Functions ## +######################### + +macro(print_simulation_config) + message(STATUS "============================= Simulation Configuration ============================") + message(STATUS "[Simulator] QuestaSim = " ${QUESTA}) + message(STATUS "[Simulator] Verilator = " ${VERILATOR}) + message(STATUS "[Simulator] VCS = " ${VCS}) + message(STATUS "[Simulator] banshee_simulation = " ${banshee_simulation}) + message(STATUS "[Simulator] banshee_configuration = " ${BANSHEE_CONFIG}) + message(STATUS "[Simulator] banshee_stack_size = " ${banshee_stack_size}) + message(STATUS "[Simulator] num_threads = " ${num_threads}) + message(STATUS 
"================================================================================") + message(STATUS "") +endmacro() + +macro(add_banshee_simulation name) + add_custom_target(banshee_${name} + DEPENDS ${name} + COMMAND RUST_MIN_STACK=${banshee_stack_size} banshee + --num-cores=${num_threads} + --num-clusters=1 + --latency + --configuration + ${BANSHEE_CONFIG} + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name} || true + COMMENT "Simulating deeploytest with banshe" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..22e9a64 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,21 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Moritz Scherer + +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_templates/custom-class-template.rst b/docs/_templates/custom-class-template.rst new file mode 100644 index 0000000..aa31186 --- /dev/null +++ b/docs/_templates/custom-class-template.rst @@ -0,0 +1,31 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :show-inheritance: + :inherited-members: + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + .. automethod:: __init__ + + .. autosummary:: + {% for item in methods %} + ~{{ name }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. 
rubric:: {{ _('Attributes') }} + + .. autosummary:: + {% for item in attributes %} + ~{{ name }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/docs/_templates/custom-module-template.rst b/docs/_templates/custom-module-template.rst new file mode 100644 index 0000000..6adfe40 --- /dev/null +++ b/docs/_templates/custom-module-template.rst @@ -0,0 +1,66 @@ +{{ fullname | escape | underline}} + +.. automodule:: {{ fullname }} + + {% block attributes %} + {% if attributes %} + .. rubric:: Module Attributes + + .. autosummary:: + :toctree: + {% for item in attributes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block functions %} + {% if functions %} + .. rubric:: {{ _('Functions') }} + + .. autosummary:: + :toctree: + {% for item in functions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block classes %} + {% if classes %} + .. rubric:: {{ _('Classes') }} + + .. autosummary:: + :toctree: + :template: custom-class-template.rst + {% for item in classes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block exceptions %} + {% if exceptions %} + .. rubric:: {{ _('Exceptions') }} + + .. autosummary:: + :toctree: + {% for item in exceptions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + +{% block modules %} +{% if modules %} +.. rubric:: Modules + +.. autosummary:: + :toctree: + :template: custom-module-template.rst + :recursive: +{% for item in modules %} + {{ item }} +{%- endfor %} +{% endif %} +{% endblock %} diff --git a/docs/apidocs.rst b/docs/apidocs.rst new file mode 100644 index 0000000..a7da36f --- /dev/null +++ b/docs/apidocs.rst @@ -0,0 +1,9 @@ +API Reference +************* + +.. 
autosummary:: + :toctree: _autosummary + :template: custom-module-template.rst + :recursive: + + Deeploy diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..4e2537c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,58 @@ +# ---------------------------------------------------------------------- +# +# File: conf.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys + +sys.path.insert(0, os.path.abspath('../')) + +project = 'Deeploy' +copyright = '2024, Moritz Scherer, Philip Wiese, Luka Macan, Victor Jung' +author = 'Moritz Scherer, Philip Wiese, Luka Macan, Victor Jung' +release = '2024' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + 'myst_parser', + 'sphinx.ext.napoleon', + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.autosummary', +] +autosummary_generate = True +napoleon_use_ivar = True +add_module_names = True +autodoc_member_order = "bysource" + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', "*flycheck_*"] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..e015560 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,10 @@ +Deeploy Documentation +===================== + +.. toctree:: + :maxdepth: 3 + :caption: Contents: + + install + structure + apidocs diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 0000000..677edff --- /dev/null +++ b/docs/install.md @@ -0,0 +1,90 @@ +# Quickstart + +Even though Deeploy is a pure Python library, it uses system dependencies, including a [LLVM](https://llvm.org/) cross-compiler, to test its code generation. Deeploy's testing framework further uses [picolibc](https://github.com/picolibc/picolibc) for embedded `libc` implementations and [CMake](https://cmake.org/) for its testing build flow. 
+ +Deeploy's embedded platform targets support software emulators, in the case of [ARM Cortex-M](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m4) we use [QEMU](https://www.qemu.org/), for [MemPool](https://github.com/pulp-platform/mempool) and the [Snitch Cluster](https://github.com/pulp-platform/snitch_cluster) we use [Banshee](https://github.com/pulp-platform/banshee). For the PULP-Open, N-EUREKA, and Siracusa targets, we use GVSoC within the [PULP-SDK](https://github.com/pulp-platform/pulp-sdk). + +To install these various dependencies, we provide instructions below, and a `Makefile` setup. + +## Library Installation + +From a newly setup Ubuntu 20.04 installation, you may run the following sequence to install the necessary dependencies. +For ARM64 machines, as of August 2024, `gcc-multilib` is only supported on Ubuntu 20.04. For x86_64, `gcc-multilib` should be available on most distributions. + +### Installing system dependencies + +``` +sudo apt install git git-lfs cmake build-essential ccache ninja-build pkg-config libglib2.0-dev libpixman-1-dev cargo python3 python-is-python3 curl protobuf-compiler libftdi-dev libftdi1 doxygen libsdl2-dev scons gtkwave libsndfile1-dev rsync autoconf automake texinfo libtool libsdl2-ttf-dev +``` + +In case you work on an x86_64 machine, please also install `gcc-multilib`: +``` +sudo apt install gcc-multilib +``` + +In case you work on an ARM64 machine, please install `gcc-multilib-arm-linux-gnueabi`: +``` +sudo apt install gcc-multilib-arm-linux-gnueabi +export C_INCLUDE_PATH=/usr/include:/usr/include/aarch64-linux-gnu:$C_INCLUDE_PATH +``` + +Other ISA/OS combinations might work, but your mileage may vary. + +### Bootstrapping pip + +``` +curl "https://bootstrap.pypa.io/get-pip.py" -o "get-pip.py" +python get-pip.py +rm get-pip.py +export PATH=~/.local/bin:$PATH +``` + +### Installing Deeploy + +``` +pip install -e . 
+``` + +## Testing Framework Installation + +Please make sure to use a Rust version that is compatible with LLVM 15, like 1.63.0: + +``` +sudo snap install rustup --classic +rustup install 1.63.0 +rustup default 1.63.0 +``` + +The Makefile expects the environment variable `CMAKE` to be defined. In case you have no strong preferences, you may run + +``` +export CMAKE=$(which cmake) +``` + +to achieve this. + +Finally, you should be able to run + +``` +make all +``` + +to build all Deeploy dependencies. Make sure to run + +``` +make echo-bash +``` + +to get instructions for setting up your environment. + +## Getting Started + +To get started with Deeploy, you can run any of the regression tests in `DeeployTest`. +For example, you can run + +``` +cd DeeployTest +python testRunner_generic.py -t Tests/simpleRegression +``` + +to run the `simpleRegression` test on your workstation. Various other tests are available and compatibility between tests and platforms is tested in the `.gitlab-ci.yml` file. diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/structure.md b/docs/structure.md new file mode 100644 index 0000000..24381f2 --- /dev/null +++ b/docs/structure.md @@ -0,0 +1,87 @@ +# Library Structure + +This repository contains the following folders: + +``` +deeploy +├── cmake +├── Deeploy +├── TargetLibraries +├── DeeployTest +├── docs +├── install +├── scripts +└── toolchain +``` + +The core abstractions and framework of Deeploy is contained in `Deeploy`. The folder `TargetLibraries` contains C microkernels for these platforms. `DeeployTest` contains the testing framework for Deeploy. The folders `install` and `toolchain` are used for local installations of the required compilation toolchain and its dependencies. `scripts` contains some helper scripts, mostly for code formatting. The `cmake` folder contains CMake configuration files which are used by the testing infrastructure to configure compiler flags and simulator targets. + +## Deeploy + +The Deeploy folder mainly contains the `DeeployTypes.py` and `AbstractDataTypes.py` files, which, in turn, contain the core abstractions of Deeploy. 
The remainder of the folder structure contains the `Target` folder and several extensions to `Deeploy`'s core flow, and appears as follows: + +``` +deeploy +├── Deeploy + ├── DeeployTypes.py + ├── AbstractDataTypes.py + ├── CommonExtensions + ├── EngineExtension + ├── FutureExtension + ├── MemoryLevelExtension + ├── Targets + └── TilingExtension +``` + +### Targets + +The `Targets` folder contains the Deeploy models and code generation infrastructure for a specific platform; currently, Deeploy supports the following targets: + +``` +deeploy +├── Deeploy + ├── Targets + ├── CortexM + ├── Generic + ├── MemPool + ├── Neureka + └── PULPOpen +``` + +Each of these `Target` folders is internally structured as follows: + +``` +deeploy +├── Deeploy + ├── Targets + ├── PULPOpen + ├── Bindings.py + ├── DataTypes.py + ├── Deployer.py + ├── Layers.py + ├── Parsers.py + ├── Platform.py + ├── TypeCheckers.py + ├── Tiler.py + ├── TileConstraints + ├── CodeTransformationPasses + ├── TopologyOptimizationPasses + └── Templates +``` + +Where, by convention, files ending with `.py` are implementations of either classes in `DeeployTypes.py`, `AbstractDataTypes.py`, or one of the extensions. For new platform contributions, please follow this general folder structure. + +### Extensions + +Each folder named `-Extension` contains widely reusable abstractions; they are internally structured like Targets, using names like `Bindings.py`, `DataTypes.py`, `Deployer.py`, `Layers.py`, `Parsers.py`, `Platform.py` and `TypeCheckers.py` for extensions concerning the appropriate base Deeploy abstraction. They may further add new filenames according to the need of the extension. For example, the `MemoryLevelExtension` is structured like this: + +``` +deeploy +├── Deeploy + ├── MemoryLevelExtension + ├── MemoryLevels.py + ├── NetworkDeployers + └── OptimizationPasses +``` + +When adding new extensions, please try to structure them similiarly to the structure used for `Targets` and existing `Extension`s. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..13a80bf --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = [ +"setuptools>=42", +"wheel", +] +build-backend = "setuptools.build_meta" +[project] +name = "Deeploy" +description = "Deeploy - Bring your networks to your platforms" +version = '0.1.0' +readme = "README.md" +requires-python = ">=3.8" +dependencies = [ +'protobuf==4.23.3', +'numpy<2.0.0', +'onnx==1.14.0', +'onnxruntime', +'mako', +'IPython', +'argparse', +'argcomplete', +'pyelftools', +'pylink-square', +'pyserial', +'clang-format', +'toml', +'dill', +'pytest', +'yapf==0.33.0', +'isort==5.12.0', +'autoflake==2.3.0', +"ortools", +"onnx-graphsurgeon", +"sphinx>=7.0.0", +"sphinx-rtd-theme>=1.3.0", +"myst_parser", +"meson==1.3.1", # Avoid QEMU installation issues, see here: https://gitlab.com/qemu-project/qemu/-/issues/1853 +"ninja" +] +[tool.setuptools] +packages = ['Deeploy'] diff --git a/scripts/run_clang_format.py b/scripts/run_clang_format.py new file mode 100755 index 0000000..75dec86 --- /dev/null +++ b/scripts/run_clang_format.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python + +# MIT License + +# Copyright (c) 2017 Guillaume Papin + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Taken from: https://github.com/Sarcasm/run-clang-format +# SPDX-License-Identifier: MIT +"""A wrapper script around clang-format, suitable for linting multiple files +and to use for continuous integration. + +This is an alternative API for the clang-format command line. +It runs over multiple files and directories in parallel. +A diff output is produced and a sensible exit code is returned. + +""" + +from __future__ import print_function, unicode_literals + +import argparse +import codecs +import difflib +import errno +import fnmatch +import io +import multiprocessing +import os +import signal +import subprocess +import sys +import traceback +from functools import partial + +try: + from subprocess import DEVNULL # py3k +except ImportError: + DEVNULL = open(os.devnull, "wb") + +DEFAULT_EXTENSIONS = 'c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx' +DEFAULT_CLANG_FORMAT_IGNORE = '.clang-format-ignore' + + +class ExitStatus: + SUCCESS = 0 + DIFF = 1 + TROUBLE = 2 + + +def excludes_from_file(ignore_file): + excludes = [] + try: + with io.open(ignore_file, 'r', encoding = 'utf-8') as f: + for line in f: + if line.startswith('#'): + # ignore comments + continue + pattern = line.rstrip() + if not pattern: + # allow empty lines + continue + excludes.append(pattern) + except EnvironmentError as e: + if e.errno != errno.ENOENT: + raise + return excludes + + +def list_files(files, recursive = False, extensions = None, exclude = None): + if extensions is None: + extensions = [] + if exclude is None: + 
exclude = [] + + out = [] + for file in files: + if recursive and os.path.isdir(file): + for dirpath, dnames, fnames in os.walk(file): + fpaths = [os.path.join(dirpath, fname) for fname in fnames] + for pattern in exclude: + # os.walk() supports trimming down the dnames list + # by modifying it in-place, + # to avoid unnecessary directory listings. + dnames[:] = [x for x in dnames if not fnmatch.fnmatch(os.path.join(dirpath, x), pattern)] + fpaths = [x for x in fpaths if not fnmatch.fnmatch(x, pattern)] + for f in fpaths: + ext = os.path.splitext(f)[1][1:] + if ext in extensions: + out.append(f) + else: + out.append(file) + return out + + +def make_diff(file, original, reformatted): + return list( + difflib.unified_diff(original, + reformatted, + fromfile = '{}\t(original)'.format(file), + tofile = '{}\t(reformatted)'.format(file), + n = 3)) + + +class DiffError(Exception): + + def __init__(self, message, errs = None): + super(DiffError, self).__init__(message) + self.errs = errs or [] + + +class UnexpectedError(Exception): + + def __init__(self, message, exc = None): + super(UnexpectedError, self).__init__(message) + self.formatted_traceback = traceback.format_exc() + self.exc = exc + + +def run_clang_format_diff_wrapper(args, file): + try: + ret = run_clang_format_diff(args, file) + return ret + except DiffError: + raise + except Exception as e: + raise UnexpectedError('{}: {}: {}'.format(file, e.__class__.__name__, e), e) + + +def run_clang_format_diff(args, file): + try: + with io.open(file, 'r', encoding = 'utf-8') as f: + original = f.readlines() + except IOError as exc: + raise DiffError(str(exc)) + + if args.in_place: + invocation = [args.clang_format_executable, '-i', file] + else: + invocation = [args.clang_format_executable, file] + + if args.style: + invocation.extend(['-style', args.style]) + + if args.dry_run: + print(" ".join(invocation)) + return [], [] + + # Use of utf-8 to decode the process output. 
+ # + # Hopefully, this is the correct thing to do. + # + # It's done due to the following assumptions (which may be incorrect): + # - clang-format will returns the bytes read from the files as-is, + # without conversion, and it is already assumed that the files use utf-8. + # - if the diagnostics were internationalized, they would use utf-8: + # > Adding Translations to Clang + # > + # > Not possible yet! + # > Diagnostic strings should be written in UTF-8, + # > the client can translate to the relevant code page if needed. + # > Each translation completely replaces the format string + # > for the diagnostic. + # > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation + # + # It's not pretty, due to Python 2 & 3 compatibility. + encoding_py3 = {} + if sys.version_info[0] >= 3: + encoding_py3['encoding'] = 'utf-8' + + try: + proc = subprocess.Popen(invocation, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE, + universal_newlines = True, + **encoding_py3) + except OSError as exc: + raise DiffError("Command '{}' failed to start: {}".format(subprocess.list2cmdline(invocation), exc)) + proc_stdout = proc.stdout + proc_stderr = proc.stderr + if sys.version_info[0] < 3: + # make the pipes compatible with Python 3, + # reading lines should output unicode + encoding = 'utf-8' + proc_stdout = codecs.getreader(encoding)(proc_stdout) + proc_stderr = codecs.getreader(encoding)(proc_stderr) + # hopefully the stderr pipe won't get full and block the process + outs = list(proc_stdout.readlines()) + errs = list(proc_stderr.readlines()) + proc.wait() + if proc.returncode: + raise DiffError( + "Command '{}' returned non-zero exit status {}".format(subprocess.list2cmdline(invocation), + proc.returncode), + errs, + ) + if args.in_place: + return [], errs + return make_diff(file, original, outs), errs + + +def bold_red(s): + return '\x1b[1m\x1b[31m' + s + '\x1b[0m' + + +def colorize(diff_lines): + + def bold(s): + return '\x1b[1m' + s + '\x1b[0m' + + def 
cyan(s): + return '\x1b[36m' + s + '\x1b[0m' + + def green(s): + return '\x1b[32m' + s + '\x1b[0m' + + def red(s): + return '\x1b[31m' + s + '\x1b[0m' + + for line in diff_lines: + if line[:4] in ['--- ', '+++ ']: + yield bold(line) + elif line.startswith('@@ '): + yield cyan(line) + elif line.startswith('+'): + yield green(line) + elif line.startswith('-'): + yield red(line) + else: + yield line + + +def print_diff(diff_lines, use_color): + if use_color: + diff_lines = colorize(diff_lines) + if sys.version_info[0] < 3: + sys.stdout.writelines((l.encode('utf-8') for l in diff_lines)) + else: + sys.stdout.writelines(diff_lines) + + +def print_trouble(prog, message, use_colors): + error_text = 'error:' + if use_colors: + error_text = bold_red(error_text) + print("{}: {} {}".format(prog, error_text, message), file = sys.stderr) + + +def main(): + parser = argparse.ArgumentParser(description = __doc__) + parser.add_argument('--clang-format-executable', + metavar = 'EXECUTABLE', + help = 'path to the clang-format executable', + default = 'clang-format') + parser.add_argument('--extensions', + help = 'comma separated list of file extensions (default: {})'.format(DEFAULT_EXTENSIONS), + default = DEFAULT_EXTENSIONS) + parser.add_argument('-r', '--recursive', action = 'store_true', help = 'run recursively over directories') + parser.add_argument('-d', '--dry-run', action = 'store_true', help = 'just print the list of files') + parser.add_argument('-i', '--in-place', action = 'store_true', help = 'format file instead of printing differences') + parser.add_argument('files', metavar = 'file', nargs = '+') + parser.add_argument('-q', '--quiet', action = 'store_true', help = "disable output, useful for the exit code") + parser.add_argument('-j', + metavar = 'N', + type = int, + default = 0, + help = 'run N clang-format jobs in parallel' + ' (default number of cpus + 1)') + parser.add_argument('--color', + default = 'auto', + choices = ['auto', 'always', 'never'], + help = 'show 
colored diff (default: auto)') + parser.add_argument('-e', + '--exclude', + metavar = 'PATTERN', + action = 'append', + default = [], + help = 'exclude paths matching the given glob-like pattern(s)' + ' from recursive search') + parser.add_argument('--style', help = 'formatting style to apply (LLVM, Google, Chromium, Mozilla, WebKit)') + + args = parser.parse_args() + + # use default signal handling, like diff return SIGINT value on ^C + # https://bugs.python.org/issue14229#msg156446 + signal.signal(signal.SIGINT, signal.SIG_DFL) + try: + signal.SIGPIPE + except AttributeError: + # compatibility, SIGPIPE does not exist on Windows + pass + else: + signal.signal(signal.SIGPIPE, signal.SIG_DFL) + + colored_stdout = False + colored_stderr = False + if args.color == 'always': + colored_stdout = True + colored_stderr = True + elif args.color == 'auto': + colored_stdout = sys.stdout.isatty() + colored_stderr = sys.stderr.isatty() + + version_invocation = [args.clang_format_executable, str("--version")] + try: + subprocess.check_call(version_invocation, stdout = DEVNULL) + except subprocess.CalledProcessError as e: + print_trouble(parser.prog, str(e), use_colors = colored_stderr) + return ExitStatus.TROUBLE + except OSError as e: + print_trouble( + parser.prog, + "Command '{}' failed to start: {}".format(subprocess.list2cmdline(version_invocation), e), + use_colors = colored_stderr, + ) + return ExitStatus.TROUBLE + + retcode = ExitStatus.SUCCESS + + excludes = excludes_from_file(DEFAULT_CLANG_FORMAT_IGNORE) + excludes.extend(args.exclude) + + files = list_files(args.files, + recursive = args.recursive, + exclude = excludes, + extensions = args.extensions.split(',')) + + if not files: + return + + njobs = args.j + if njobs == 0: + njobs = multiprocessing.cpu_count() + 1 + njobs = min(len(files), njobs) + + if njobs == 1: + # execute directly instead of in a pool, + # less overhead, simpler stacktraces + it = (run_clang_format_diff_wrapper(args, file) for file in files) + 
pool = None + else: + pool = multiprocessing.Pool(njobs) + it = pool.imap_unordered(partial(run_clang_format_diff_wrapper, args), files) + pool.close() + while True: + try: + outs, errs = next(it) + except StopIteration: + break + except DiffError as e: + print_trouble(parser.prog, str(e), use_colors = colored_stderr) + retcode = ExitStatus.TROUBLE + sys.stderr.writelines(e.errs) + except UnexpectedError as e: + print_trouble(parser.prog, str(e), use_colors = colored_stderr) + sys.stderr.write(e.formatted_traceback) + retcode = ExitStatus.TROUBLE + # stop at the first unexpected error, + # something could be very wrong, + # don't process all files unnecessarily + if pool: + pool.terminate() + break + else: + sys.stderr.writelines(errs) + if outs == []: + continue + if not args.quiet: + print_diff(outs, use_color = colored_stdout) + if retcode == ExitStatus.SUCCESS: + retcode = ExitStatus.DIFF + if pool: + pool.join() + return retcode + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..bd6d90c --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +# ---------------------------------------------------------------------- +# +# File: setup.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import setuptools + +setuptools.setup() diff --git a/toolchain/banshee.patch b/toolchain/banshee.patch new file mode 100644 index 0000000..9ea87c9 --- /dev/null +++ b/toolchain/banshee.patch @@ -0,0 +1,57 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Moritz Scherer +diff --git a/Cargo.toml b/Cargo.toml +index d406357..7bd0f91 100644 +--- a/Cargo.toml ++++ b/Cargo.toml +@@ -21,7 +21,7 @@ csv = "1.0.0-beta.2" + elf = "0.0.10" + flexfloat = { path = "flexfloat" } + itertools = "0.9" +-llvm-sys = "120" ++llvm-sys = "150" + log = { version = "0.4", features = ["release_max_level_info"] } + pest = "2.1.3" + pest_derive = "2.1.0" +diff --git a/build/runtime.rs b/build/runtime.rs +index 04f80b8..c03f248 100644 +--- a/build/runtime.rs ++++ b/build/runtime.rs +@@ -22,8 +22,7 @@ pub fn build() { + "--crate-type=staticlib", + "-Copt-level=3", + "-Cdebuginfo=0", +- "-Cpanic=abort", +- "-Cllvm-args=-opaque-pointers=0", ++ "-Cpanic=abort" + ]) + .status() + .unwrap(); +diff --git a/src/engine.rs b/src/engine.rs +index 216996b..e5abe38 100644 +--- a/src/engine.rs ++++ b/src/engine.rs +@@ -281,7 +281,6 @@ impl Engine { + + LLVMPassManagerBuilderPopulateFunctionPassManager(builder, func_passes); + LLVMAddAnalysisPasses(tm, module_passes); +- LLVMPassManagerBuilderPopulateLTOPassManager(builder, module_passes, 0, 1); + LLVMPassManagerBuilderPopulateModulePassManager(builder, module_passes); + + // Create and run the function pass manager. 
+diff --git a/src/tran.rs b/src/tran.rs +index 866b9d9..83ea9ff 100644 +--- a/src/tran.rs ++++ b/src/tran.rs +@@ -20,7 +20,7 @@ use std::{ + }; + extern crate flexfloat; + +-static NONAME: &'static i8 = unsafe { std::mem::transmute("\0".as_ptr()) }; ++static NONAME: &'static u8 = unsafe { std::mem::transmute("\0".as_ptr()) }; + + /// Base address of the stream semantic regsiters + static SSR_BASE: u64 = 0x204800; diff --git a/toolchain/llvm.patch b/toolchain/llvm.patch new file mode 100644 index 0000000..5020f37 --- /dev/null +++ b/toolchain/llvm.patch @@ -0,0 +1,128 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Moritz Scherer +diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp +index 1b58f3cb8ffe..cd704b719250 100644 +--- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp ++++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp +@@ -262,11 +262,11 @@ bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const Domi + vis.insert(Current); + + Instruction *T = Current->getTerminator(); +- LLVM_DEBUG(T->dump()); ++ //LLVM_DEBUG(T->dump()); + if (BranchInst *BR = dyn_cast(T)){ + if (BR->isConditional()){ + if (ICmpInst *Cmp = dyn_cast(BR->getCondition())){ //FOR NOW: only works with a single ICmpInst as branch condition operand +- LLVM_DEBUG(Cmp->dump()); ++ //LLVM_DEBUG(Cmp->dump()); + auto r = predicatedICmpOutcome(Cmp, Rep, SE); + if (r.hasValue()){ + if (r.getValue()) q.push_back(BR->getSuccessor(0)); +@@ -688,8 +688,6 @@ void AffAcc::dumpInLoop(const Loop *L) const { + else errs()<<""; + errs()<<"\n"; + errs()<<"\tloop header = "; +- if (getLoop(dim)) errs()<getHeader()->getNameOrAsOperand(); +- else errs()<<""; + errs()<<"\n"; + } + } +@@ -776,7 +774,7 @@ Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertB + if 
(!isSafeToExpandAt(getBaseAddr(dimension), InsertBefore, SE)){ + LLVM_DEBUG(dbgs()<<"data not expanable here (note: only preheader guaranteed)\n"); + LLVM_DEBUG(dbgs()<<"SCEV (dim = "<getParent()->dump()); ++ // LLVM_DEBUG(dbgs()<<"in block:\n"; InsertBefore->getParent()->dump()); + LLVM_DEBUG(dbgs()<<"before inst: "<<*InsertBefore<<"\n"); + LLVM_DEBUG(this->dump()); + llvm_unreachable("cannot expand SCEV at desired location"); +@@ -801,9 +799,9 @@ Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore + assert(isWellFormed(dimension) && dimension > 0u); + InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + if (!isSafeToExpandAt(getRep(dimension), InsertBefore, SE)) { +- getRep(dimension)->dump(); +- InsertBefore->dump(); +- InsertBefore->getParent()->dump(); ++ // getRep(dimension)->dump(); ++ // InsertBefore->dump(); ++ // InsertBefore->getParent()->dump(); + this->dump(); + } + return reps[dimension]->expandAt(ty, InsertBefore); +@@ -1032,9 +1030,9 @@ void AffineAccess::addAllConflicts(const std::vector &all) { + if (!L) continue; + if (L == outerMostExpandableExl) break; + if (!(!L || A->isWellFormed(L))){ +- if (L) LLVM_DEBUG(L->dump()); +- if (outerMostExpandableExl) LLVM_DEBUG(outerMostExpandableExl->dump()); +- LLVM_DEBUG(A->dump()); ++ //if (L) LLVM_DEBUG(L->dump()); ++ //if (outerMostExpandableExl) LLVM_DEBUG(outerMostExpandableExl->dump()); ++ //LLVM_DEBUG(A->dump()); + llvm_unreachable("this should not happen!"); + } + assert(!L || A->isWellFormed(L)); +@@ -1252,12 +1250,12 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM + //================== Affine Acces Analysis Pass for opt ======================================= + PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { + AffineAccess AA = FAM.getResult(F); +- for (const Loop *L : AA.getLI().getLoopsInPreorder()){ +- L->dump(); +- for (const 
AffAcc *A : AA.getExpandableAccesses(L)){ +- A->dumpInLoop(L); +- } +- } ++ // for (const Loop *L : AA.getLI().getLoopsInPreorder()){ ++ // //L->dump(); ++ // for (const AffAcc *A : AA.getExpandableAccesses(L)){ ++ // A->dumpInLoop(L); ++ // } ++ // } + return PreservedAnalyses::all(); + } + +diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXpulp.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXpulp.td +index 090598a2037e..c3f5abf4db39 100644 +--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXpulp.td ++++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXpulp.td +@@ -1589,6 +1589,9 @@ def : Pat<(v4i8 (build_vector GPR:$rs1, GPR:$rs2, GPR:$rs3, GPR:$rs4)), + def : Pat<(v2i16 (splat_vector GPR:$rs1)), (PV_PACK_H GPR:$rs1, GPR:$rs1)>; + def : Pat<(v4i8 (splat_vector GPR:$rs1)), (PV_ADD_SC_B X0, GPR:$rs1)>; + ++def : Pat<(v4i8 (riscv_vmv_v_x_vl (v4i8 undef), GPR:$imm, VLOpFrag)), ++ (PV_ADD_SC_B X0, (ADD (SLL X0, $vl), $imm))>; ++ + defm : GeneralSVectorPattern; + defm : GeneralSVectorPattern; + defm : GeneralSVectorPattern; +diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp +index 2c5bb14f85d7..63060155dbb8 100644 +--- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp ++++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp +@@ -865,16 +865,6 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F + if (p != conds.end()) { + BasicBlock *Ex = getSingleExitBlock(L); + assert(Ex); +- if (SSRVerbose) { +- errs() +- <<"> Function " +- <getHeader()->getParent()->getNameOrAsOperand() +- <<": Expanding SSR streams with " +- <<(L->getLoopDepth()-1) +- <<" containing loops and setup in preheader of loop with header " +- <getHeader()->getNameOrAsOperand() +- <<"\n"; +- } + cloneAndSetup(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), p->second, exps.find(L)->getSecond()); + } + } +@@ -885,4 +875,4 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F + F.addFnAttr(StringRef(SSRFnAttr)); //we 
have inserted a stream, tag accordingly + if (SSRNoInline) F.addFnAttr(Attribute::AttrKind::NoInline); + return PreservedAnalyses::none(); +-} +\ No newline at end of file ++} diff --git a/toolchain/meson-build-script-arm.txt b/toolchain/meson-build-script-arm.txt new file mode 100644 index 0000000..ac06293 --- /dev/null +++ b/toolchain/meson-build-script-arm.txt @@ -0,0 +1,20 @@ +[binaries] +c = ['clang', '-m32', '-target', 'armv7m-none-eabi', '-mcpu=cortex-m4', '-mfloat-abi=soft', '-nostdlib'] +ar = 'llvm-ar' +strip = 'llvm-strip' +exe_wrapper = ['sh', '-c', 'test -z "$PICOLIBC_TEST" || run-thumbv7m "$@"', 'run-thumbv7m'] + +[host_machine] +system = 'none' +cpu_family = 'arm' +cpu = 'arm' +endian = 'little' + +[properties] +c_args = ['-Werror=double-promotion', '-Wno-unsupported-floating-point-opt', '-fshort-enums'] +c_link_args = ['-Wl,-z,noexecstack'] +skip_sanity_check = true +default_flash_addr = '0x00000000' +default_flash_size = '0x00400000' +default_ram_addr = '0x20000000' +default_ram_size = '0x00200000' diff --git a/toolchain/meson-build-script-riscv.txt b/toolchain/meson-build-script-riscv.txt new file mode 100644 index 0000000..f21bb36 --- /dev/null +++ b/toolchain/meson-build-script-riscv.txt @@ -0,0 +1,19 @@ +[binaries] +c = ['clang', '-target', 'riscv32-unknown-elf', '-march=rv32imc_zfinx_xpulpv2', '-nostdlib'] +ar = 'llvm-ar' +strip = 'llvm-strip' + +[host_machine] +system = 'none' +cpu_family = 'riscv32' +cpu = 'riscv32' +endian = 'little' + +[properties] +c_args = ['-Werror=double-promotion', '-Wno-unsupported-floating-point-opt', '-fshort-enums', '-mno-relax'] +c_link_args = ['-Wl,-z,noexecstack'] +skip_sanity_check = true +default_flash_addr = '0x00000000' +default_flash_size = '0x00400000' +default_ram_addr = '0x20000000' +default_ram_size = '0x00200000' diff --git a/toolchain/meson-build-script.txt b/toolchain/meson-build-script.txt new file mode 100644 index 0000000..ac06293 --- /dev/null +++ b/toolchain/meson-build-script.txt @@ -0,0 +1,20 
@@ +[binaries] +c = ['clang', '-m32', '-target', 'armv7m-none-eabi', '-mcpu=cortex-m4', '-mfloat-abi=soft', '-nostdlib'] +ar = 'llvm-ar' +strip = 'llvm-strip' +exe_wrapper = ['sh', '-c', 'test -z "$PICOLIBC_TEST" || run-thumbv7m "$@"', 'run-thumbv7m'] + +[host_machine] +system = 'none' +cpu_family = 'arm' +cpu = 'arm' +endian = 'little' + +[properties] +c_args = ['-Werror=double-promotion', '-Wno-unsupported-floating-point-opt', '-fshort-enums'] +c_link_args = ['-Wl,-z,noexecstack'] +skip_sanity_check = true +default_flash_addr = '0x00000000' +default_flash_size = '0x00400000' +default_ram_addr = '0x20000000' +default_ram_size = '0x00200000' diff --git a/toolchain/snitch_cluster.patch b/toolchain/snitch_cluster.patch new file mode 100644 index 0000000..c39c525 --- /dev/null +++ b/toolchain/snitch_cluster.patch @@ -0,0 +1,63 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Moritz Scherer +diff --git a/sw/snRuntime/base.ld b/sw/snRuntime/base.ld +index d0979b7..171921d 100644 +--- a/sw/snRuntime/base.ld ++++ b/sw/snRuntime/base.ld +@@ -66,7 +66,7 @@ SECTIONS + .cbss : + { + __cbss_start = .; +- *(.cbss .cbss.*) ++ KEEP(*(.cbss .cbss.*)) + __cbss_end = .; + } >L3 + +diff --git a/sw/snRuntime/src/team.c b/sw/snRuntime/src/team.c +index a9eb840..5290e1d 100644 +--- a/sw/snRuntime/src/team.c ++++ b/sw/snRuntime/src/team.c +@@ -10,6 +10,10 @@ extern uint32_t snrt_global_core_idx(); + + extern uint32_t snrt_global_core_num(); + ++extern uint32_t snrt_global_compute_core_num(); ++ ++extern uint32_t snrt_global_compute_core_idx(); ++ + extern uint32_t snrt_cluster_idx(); + + extern uint32_t snrt_cluster_num(); +diff --git a/target/snitch_cluster/sw/runtime/rtl/src/putchar.c b/target/snitch_cluster/sw/runtime/rtl/src/putchar.c +index 0ad9500..215c8b1 100644 +--- a/target/snitch_cluster/sw/runtime/rtl/src/putchar.c ++++ 
b/target/snitch_cluster/sw/runtime/rtl/src/putchar.c +@@ -5,16 +5,19 @@ + extern uintptr_t volatile tohost, fromhost; + + // Rudimentary string buffer for putc calls. +-extern uint32_t _edram; + #define PUTC_BUFFER_LEN (1024 - sizeof(size_t)) +-struct putc_buffer_header { ++ ++typedef struct { + size_t size; + uint64_t syscall_mem[8]; +-}; +-static volatile struct putc_buffer { +- struct putc_buffer_header hdr; ++} putc_buffer_header_t; ++ ++typedef struct putc_buffer { ++ putc_buffer_header_t hdr; + char data[PUTC_BUFFER_LEN]; +-} *const putc_buffer = (void *)&_edram; ++} putc_buffer_t; ++ ++static volatile putc_buffer_t putc_buffer[SNRT_CLUSTER_NUM*SNRT_CLUSTER_CORE_NUM] __attribute__((section(".dram"))); + + // Provide an implementation for putchar. + void _putchar(char character) {