From 1e1d1800948c1df3c9726553ff35b42396f4e273 Mon Sep 17 00:00:00 2001
From: doe300
Date: Thu, 1 Nov 2018 13:41:24 +0100
Subject: [PATCH 1/6] Removes mutex lock from most VPM access, small improvements and fixes

* dumps layout of used VPM per kernel
* rewrites Emulator to handle VPM configuration per QPU
* fixes bug in elimination of bit operations
* fixes bug mapping IR operations to machine code
* fixes bug mapping volatile parameters to read-only parameters
* Emulator now tracks TMU reads per TMU

See #113
---
 src/analysis/DependencyGraph.cpp |  15 +--
 src/periphery/SFU.cpp            |   2 -
 src/periphery/VPM.cpp            | 196 +++++++++++++++++++++++--------
 src/periphery/VPM.h              |  52 +++++---
 src/tools/Emulator.cpp           | 105 +++++++++++------
 src/tools/Emulator.h             |  44 ++++---
 6 files changed, 283 insertions(+), 131 deletions(-)

diff --git a/src/analysis/DependencyGraph.cpp b/src/analysis/DependencyGraph.cpp
index 4294da62..0609abb4 100644
--- a/src/analysis/DependencyGraph.cpp
+++ b/src/analysis/DependencyGraph.cpp
@@ -303,12 +303,13 @@ static void createMutexDependencies(DependencyGraph& graph, DependencyNode& node
     const intermediate::IntermediateInstruction* lastSemaphoreAccess,
     const intermediate::IntermediateInstruction* lastMemFence)
 {
-    if((node.key->hasValueType(ValueType::REGISTER) && node.key->getOutput()->reg().isVertexPipelineMemory()) ||
-        std::any_of(node.key->getArguments().begin(), node.key->getArguments().end(),
-            [](const Value& arg) -> bool { return arg.hasRegister() && arg.reg().isVertexPipelineMemory(); }) ||
-        node.key->writesRegister(REG_MUTEX))
+    if(((node.key->hasValueType(ValueType::REGISTER) && node.key->getOutput()->reg().isVertexPipelineMemory()) ||
+        std::any_of(node.key->getArguments().begin(), node.key->getArguments().end(),
+            [](const Value& arg) -> bool { return arg.hasRegister() && arg.reg().isVertexPipelineMemory(); }) ||
+        node.key->writesRegister(REG_MUTEX)) &&
+        lastMutexLock != nullptr)
     {
-        // any VPM operation or mutex unlock must be ordered after the corresponding mutex lock
+        // any VPM operation or mutex unlock must be ordered after a previous mutex lock, if any
         auto& otherNode = graph.assertNode(lastMutexLock);
         addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::MUTEX_LOCK);
     }
@@ -497,13 +498,13 @@ static void createVPMIODependencies(DependencyGraph& graph, DependencyNode& node
         (node.key->writesRegister(REG_VPM_IO) || node.key->writesRegister(REG_VPM_DMA_STORE_ADDR) ||
             node.key->writesRegister(REG_MUTEX)))
     {
-        // any other VPM write, VPM write address setup or unlocking mutex must be executed aftre the VPM write
+        // any other VPM write, VPM write address setup or unlocking mutex must be executed after the VPM write
         auto& otherNode = graph.assertNode(lastVPMWrite);
         addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::PERIPHERY_ORDER);
     }
     if(lastVPMRead != nullptr && (node.key->readsRegister(REG_VPM_IO) || node.key->writesRegister(REG_MUTEX)))
     {
-        // any other VPM read or unlocking mutex must be executed aftre the VPM read
+        // any other VPM read or unlocking mutex must be executed after the VPM read
         auto& otherNode = graph.assertNode(lastVPMRead);
         addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::PERIPHERY_ORDER);
     }
diff --git a/src/periphery/SFU.cpp b/src/periphery/SFU.cpp
index 01e7747d..89f6be64 100644
--- a/src/periphery/SFU.cpp
+++ b/src/periphery/SFU.cpp
@@ -16,8 +16,6 @@ using namespace vc4c::operators;
 
 InstructionWalker periphery::insertSFUCall(const Register sfuReg, InstructionWalker it, const Value& arg)
 {
-    // 
TODO need to synchronize SFU ?? (per slice!) - // Also need to include the reading of r4. And if this is enclosed in mutex, the NOPs are no longer replaced? // 1. move argument to SFU register assign(it, Value(sfuReg, TYPE_FLOAT)) = arg; // 2. wait 2 instructions / don't touch r4 diff --git a/src/periphery/VPM.cpp b/src/periphery/VPM.cpp index 64bec5fe..533f21b8 100644 --- a/src/periphery/VPM.cpp +++ b/src/periphery/VPM.cpp @@ -10,7 +10,9 @@ #include "../intermediate/operators.h" #include "log.h" +#include #include +#include using namespace vc4c; using namespace vc4c::periphery; @@ -374,19 +376,11 @@ static uint8_t calculateQPUSideAddress(const DataType& type, unsigned char rowIn static NODISCARD InstructionWalker calculateElementOffset( Method& method, InstructionWalker it, const DataType& elementType, const Value& inAreaOffset, Value& elementOffset) { - if(inAreaOffset.getLiteralValue()) - { - // e.g. 32-bit type, 4 byte offset -> 1 32-bit vector offset - // e.g. byte4 type, 4 byte offset -> 1 byte-vector offset - // e.g. half-word8 type, 32 byte offset -> 2 half-word vector offset - elementOffset = Value( - Literal(inAreaOffset.getLiteralValue()->signedInt() / elementType.getPhysicalWidth()), inAreaOffset.type); - } - else - { - // e.g. 32-bit type, 4 byte offset -> division by 4 - elementOffset = assign(it, TYPE_INT32, "%vpm_element_offset") = inAreaOffset / 4_lit; - } + // e.g. 32-bit type, 4 byte offset -> 1 32-bit element offset + // e.g. byte4 type, 4 byte offset -> 1 byte element offset + // e.g. half-word8 type, 32 byte offset -> 2 half-word element offset + elementOffset = assign(it, TYPE_INT16, "%vpm_element_offset") = + inAreaOffset / Literal(elementType.getPhysicalWidth()); return it; } @@ -467,7 +461,7 @@ InstructionWalker VPM::insertWriteVPM(Method& method, InstructionWalker it, cons } InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const Value& memoryAddress, - const DataType& type, const VPMArea* area, bool useMutex, const Value& inAreaOffset) + const DataType& type, const VPMArea* area, bool useMutex, const Value& inAreaOffset, const Value& numEntries) { if(area != nullptr) area->checkAreaSize(getVPMStorageType(type).getPhysicalWidth()); @@ -487,20 +481,20 @@ InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const memoryAddress.local()->reference.first->as()->decorations, ParameterDecorations::INPUT); } + auto rowCount = numEntries.getLiteralValue() ? numEntries.getLiteralValue()->unsignedInt() : 0; + if(rowCount > std::numeric_limits::max()) + throw CompilationError(CompilationStep::GENERAL, "Cannot read more than 16 entries at a time from RAM via DMA", + numEntries.to_string()); + it = insertLockMutex(it, useMutex); // for some additional information, see // http://maazl.de/project/vc4asm/doc/VideoCoreIV-addendum.html // initialize VPM DMA for reading from host const VPMArea& realArea = area != nullptr ? 
*area : getScratchArea(); - const VPRSetup dmaSetup(realArea.toReadDMASetup(type)); - if(inAreaOffset == INT_ZERO) - { - it.emplace(new LoadImmediate(VPM_IN_SETUP_REGISTER, Literal(dmaSetup.value))); - it->addDecorations(InstructionDecorations::VPM_READ_CONFIGURATION); - it.nextInBlock(); - } - else + const VPRSetup dmaSetup(realArea.toReadDMASetup(type, static_cast(rowCount))); + Value dmaSetupBits(Literal(dmaSetup.value), TYPE_INT32); + if(inAreaOffset != INT_ZERO) { // this is the offset in byte -> calculate the offset in elements of destination-type @@ -508,10 +502,29 @@ InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const Value elementOffset = UNDEFINED_VALUE; it = calculateElementOffset(method, it, memoryAddress.type.getElementType(), inAreaOffset, elementOffset); // 2) dynamically calculate new VPM address from base and offset (add offset to setup-value) + if(!realArea.canBePackedIntoRow()) + // need to modify offset to point to next row, not next element in same row + elementOffset = assign(it, TYPE_INT32, "%vpm_row_offset") = elementOffset << 4_val; // 3) write setup with dynamic address - assign(it, VPM_IN_SETUP_REGISTER) = (Value(Literal(dmaSetup.value), TYPE_INT32) + elementOffset, - InstructionDecorations::VPM_READ_CONFIGURATION); + dmaSetupBits = assign(it, TYPE_INT32, "%vpr_setup") = + (dmaSetupBits + elementOffset, InstructionDecorations::VPM_READ_CONFIGURATION); + } + + if(!numEntries.getLiteralValue()) + { + // we need to dynamically set the number of elements to be read + + // TODO this assumes 1 row = 1 entry, is this always correct? + + // only 0-15 is supported, where value of 0 means 16 rows + // TODO this calculation treats a source of 0 or any multiple of 16 as 16 (and truncates all the higher counts) + auto numRows = assign(it, TYPE_INT8, "%vpr_setup_rows") = numEntries & 0xF_val; + auto numRowsShifted = assign(it, TYPE_INT8, "%vpr_setup_rows") = numRows << 16_val; + dmaSetupBits = assign(it, TYPE_INT32, "%vpr_setup") = dmaSetupBits + numRowsShifted; } + + assign(it, VPM_IN_SETUP_REGISTER) = (dmaSetupBits, InstructionDecorations::VPM_READ_CONFIGURATION); + const VPRSetup strideSetup(VPRStrideSetup(0)); it.emplace(new LoadImmediate(VPM_IN_SETUP_REGISTER, Literal(strideSetup.value))); it->addDecorations(InstructionDecorations::VPM_READ_CONFIGURATION); @@ -529,7 +542,7 @@ InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const } InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, const Value& memoryAddress, - const DataType& type, const VPMArea* area, bool useMutex, const Value& inAreaOffset) + const DataType& type, const VPMArea* area, bool useMutex, const Value& inAreaOffset, const Value& numEntries) { if(area != nullptr) area->checkAreaSize(getVPMStorageType(type).getPhysicalWidth()); @@ -537,6 +550,15 @@ InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, cons // a single vector can only use a maximum of 1 row updateScratchSize(1); + // TODO is the calculation of the size to copy correct? We are mixing different types (e.g. byte from memory + // instruction, consecutive memory area) with type for VPM area (rows which might not be filled completely). Same + // for reading RAM! + + auto rowCount = numEntries.getLiteralValue() ? 
numEntries.getLiteralValue()->unsignedInt() : 0; + if(rowCount > std::numeric_limits::max()) + throw CompilationError(CompilationStep::GENERAL, + "Cannot write more than 128 entries at a time into RAM via DMA", numEntries.to_string()); + if(memoryAddress.hasLocal() && memoryAddress.local() != nullptr) { // set the type of the parameter, if we can determine it @@ -553,26 +575,42 @@ InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, cons // initialize VPM DMA for writing to host const VPMArea& realArea = area != nullptr ? *area : getScratchArea(); - const VPWSetup dmaSetup(realArea.toWriteDMASetup(type)); - if(inAreaOffset == INT_ZERO) - { - it.emplace(new LoadImmediate(VPM_OUT_SETUP_REGISTER, Literal(dmaSetup.value))); - it->addDecorations(InstructionDecorations::VPM_WRITE_CONFIGURATION); - it.nextInBlock(); - } - else + const VPWSetup dmaSetup(realArea.toWriteDMASetup(type, static_cast(rowCount))); + Value dmaSetupBits(Literal(dmaSetup.value), TYPE_INT32); + if(inAreaOffset != INT_ZERO) { // this is the offset in byte -> calculate the offset in elements of destination-type // 1) convert offset in bytes to offset in elements (!! VPM stores vector-size of 16!!) Value elementOffset = UNDEFINED_VALUE; - it = calculateElementOffset(method, it, memoryAddress.type.getElementType(), inAreaOffset, elementOffset); + it = calculateElementOffset( + method, it, memoryAddress.type.getElementType().getElementType(), inAreaOffset, elementOffset); // 2) dynamically calculate new VPM address from base and offset (shift and add offset to setup-value) + if(!realArea.canBePackedIntoRow()) + // need to modify offset to point to next row, not next element in same row + elementOffset = assign(it, TYPE_INT32, "%vpm_row_offset") = elementOffset << 4_val; Value shiftedOffset = assign(it, TYPE_INT32) = elementOffset << 3_val; // 3) write setup with dynamic address - assign(it, VPM_OUT_SETUP_REGISTER) = (Value(Literal(dmaSetup.value), TYPE_INT32) + shiftedOffset, - InstructionDecorations::VPM_WRITE_CONFIGURATION); + dmaSetupBits = assign(it, TYPE_INT32, "%vpw_setup") = + (Value(Literal(dmaSetup.value), TYPE_INT32) + shiftedOffset, + InstructionDecorations::VPM_WRITE_CONFIGURATION); + } + + if(!numEntries.getLiteralValue()) + { + // we need to dynamically set the number of elements to be written + + // TODO this assumes 1 row = 1 entry, is this always correct? + + // only 0-128 is supported, where value of 0 means 128 rows + // TODO this calculation treats a source of 0 or any multiple of 128 as 128 (and truncates all the higher + // counts) + auto numRows = assign(it, TYPE_INT8, "%vpw_setup_rows") = numEntries & 0x7F_val; + auto numRowsShifted = assign(it, TYPE_INT8, "%vpw_setup_rows") = numRows << 23_val; + dmaSetupBits = assign(it, TYPE_INT32, "%vpw_setup") = dmaSetupBits + numRowsShifted; } + + assign(it, VPM_OUT_SETUP_REGISTER) = (dmaSetupBits, InstructionDecorations::VPM_WRITE_CONFIGURATION); // set stride to zero const VPWSetup strideSetup(VPWStrideSetup(0)); it.emplace(new LoadImmediate(VPM_OUT_SETUP_REGISTER, Literal(strideSetup.value))); @@ -593,6 +631,10 @@ InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, cons InstructionWalker VPM::insertCopyRAM(Method& method, InstructionWalker it, const Value& destAddress, const Value& srcAddress, const unsigned numBytes, const VPMArea* area, bool useMutex) { + // TODO copying from/to RAM can use VPM area not accessible from QPU!! 
+ // With area per QPU, so they can copy unsynchronized + // TODO test on py-videocore beforehand that access of upper VPM area works! + const auto size = getBestVectorSize(numBytes); if(area != nullptr) area->checkAreaSize(size.first.getPhysicalWidth()); @@ -703,7 +745,8 @@ bool VPMArea::canBeAccessedViaDMA() const bool VPMArea::canBePackedIntoRow() const { - return !canBeAccessedViaDMA() || getElementType().getVectorWidth() == NATIVE_VECTOR_SIZE; + // TODO proper calculation (or pass in constructor!) + return /*!canBeAccessedViaDMA() ||*/ getElementType().getVectorWidth() == NATIVE_VECTOR_SIZE; } VPWGenericSetup VPMArea::toWriteSetup(const DataType& elementType) const @@ -722,7 +765,7 @@ VPWGenericSetup VPMArea::toWriteSetup(const DataType& elementType) const return setup; } -VPWDMASetup VPMArea::toWriteDMASetup(const DataType& elementType, uint8_t numValues) const +VPWDMASetup VPMArea::toWriteDMASetup(const DataType& elementType, uint8_t numRows) const { DataType type = elementType.isUnknown() ? getElementType() : elementType; if(type.getScalarBitCount() > 32) @@ -731,15 +774,18 @@ VPWDMASetup VPMArea::toWriteDMASetup(const DataType& elementType, uint8_t numVal if(type.isUnknown()) throw CompilationError( CompilationStep::GENERAL, "Cannot generate VPW setup for unknown type", elementType.to_string()); + if(numRows > 128) + throw CompilationError(CompilationStep::GENERAL, "Cannot write more than 128 rows via DMA from VPW at a time"); // by "default", one value per row, so we need to store the number of values as number of rows uint8_t rowDepth = type.getVectorWidth(true); - uint8_t numRows = numValues; + if(canBePackedIntoRow()) { + // TODO is this still valid? // if we have the row packed, we need to calculate the row-width from the maximum row-width and the number of // elements - const unsigned totalNumElements = type.getVectorWidth(true) * numValues; + const unsigned totalNumElements = type.getVectorWidth(true) * numRows; const uint8_t elementsPerRow = getElementsInRow(type); if((totalNumElements > elementsPerRow) && (totalNumElements % elementsPerRow != 0)) throw CompilationError(CompilationStep::GENERAL, @@ -763,7 +809,7 @@ VPWDMASetup VPMArea::toWriteDMASetup(const DataType& elementType, uint8_t numVal return setup; } -VPRGenericSetup VPMArea::toReadSetup(const DataType& elementType, uint8_t numValues) const +VPRGenericSetup VPMArea::toReadSetup(const DataType& elementType, uint8_t numRows) const { DataType type = elementType.isUnknown() ? getElementType() : elementType; if(type.isUnknown()) @@ -773,13 +819,13 @@ VPRGenericSetup VPMArea::toReadSetup(const DataType& elementType, uint8_t numVal // if we can pack into a single row, do so. Otherwise set stride to beginning of next row const uint8_t stride = canBePackedIntoRow() ? 1 : static_cast(TYPE_INT32.getScalarBitCount() / type.getScalarBitCount()); - VPRGenericSetup setup(getVPMSize(type), stride, numValues, calculateQPUSideAddress(type, rowOffset, 0)); + VPRGenericSetup setup(getVPMSize(type), stride, numRows, calculateQPUSideAddress(type, rowOffset, 0)); setup.setHorizontal(IS_HORIZONTAL); setup.setLaned(!IS_PACKED); return setup; } -VPRDMASetup VPMArea::toReadDMASetup(const DataType& elementType, uint8_t numValues) const +VPRDMASetup VPMArea::toReadDMASetup(const DataType& elementType, uint8_t numRows) const { DataType type = elementType.isUnknown() ? 
getElementType() : elementType; if(type.getScalarBitCount() > 32) @@ -788,14 +834,14 @@ VPRDMASetup VPMArea::toReadDMASetup(const DataType& elementType, uint8_t numValu if(type.isUnknown()) throw CompilationError( CompilationStep::GENERAL, "Cannot generate VPW setup for unknown type", elementType.to_string()); - if(numValues > 16) + if(numRows > 16) throw CompilationError(CompilationStep::GENERAL, "Cannot read more than 16 rows via DMA into VPW at a time", - std::to_string(numValues)); + std::to_string(numRows)); // If the data is packed, have a pitch of 1 unit (e.g. 1 byte/half-word/word offset depending on type) // otherwise, always jump to the next row const uint8_t vpmPitch = canBePackedIntoRow() ? 1 : TYPE_INT32.getScalarBitCount() / type.getScalarBitCount(); - VPRDMASetup setup(getVPMDMAMode(type), type.getVectorWidth(true) % 16 /* 0 => 16 */, numValues % 16 /* 0 => 16 */, + VPRDMASetup setup(getVPMDMAMode(type), type.getVectorWidth(true) % 16 /* 0 => 16 */, numRows % 16 /* 0 => 16 */, vpmPitch % 16 /* 0 => 16 */); setup.setWordRow(rowOffset); setup.setVertical(!IS_HORIZONTAL); @@ -1075,3 +1121,61 @@ DataType VPM::getVPMStorageType(const DataType& type) inVPMType = type.toVectorType(16); return inVPMType; } + +static void writeArea(std::wostream& s, const std::string& name, unsigned width) +{ + auto sub = name.substr(0, width - 1) + "|"; + s << std::setw(width) << sub; +} + +void VPM::dumpUsage() const +{ + static const unsigned outputWidth = 128; + + logging::debug() << "VPM usage: " + << std::accumulate(areas.begin(), areas.end(), 0u, + [](unsigned sum, const std::shared_ptr& area) -> unsigned { + return sum + (area != nullptr); + }) + << " of " << VPM_NUM_ROWS << " rows:" << logging::endl; + + std::shared_ptr lastArea; + unsigned numEmpty = 0; + auto& stream = logging::debug() << "|"; + for(const auto& area : areas) + { + if(area == lastArea) + continue; + if(!area) + { + ++numEmpty; + continue; + } + if(numEmpty > 0) + { + writeArea(stream, "", (numEmpty * outputWidth) / VPM_NUM_ROWS); + numEmpty = 0; + } + lastArea = area; + std::string name; + switch(area->usageType) + { + case VPMUsage::SCRATCH: + name = "scratch"; + break; + case VPMUsage::LOCAL_MEMORY: + name = area->originalAddress ? 
area->originalAddress->name : "local"; + break; + case VPMUsage::REGISTER_SPILLING: + name = "spilling"; + break; + case VPMUsage::STACK: + name = "stack"; + break; + } + writeArea(stream, name, (area->numRows * outputWidth) / VPM_NUM_ROWS); + } + if(numEmpty > 0) + writeArea(stream, "", (numEmpty * outputWidth) / VPM_NUM_ROWS); + stream << logging::endl; +} \ No newline at end of file diff --git a/src/periphery/VPM.h b/src/periphery/VPM.h index ec8bed50..eea014c7 100644 --- a/src/periphery/VPM.h +++ b/src/periphery/VPM.h @@ -589,12 +589,12 @@ namespace vc4c * Inserts a read from the memory located at addr into the value dest */ NODISCARD InstructionWalker insertReadDMA( - Method& method, InstructionWalker it, const Value& dest, const Value& addr, const bool useMutex = true); + Method& method, InstructionWalker it, const Value& dest, const Value& addr, const bool useMutex = false); /* * Inserts write from the value src into the memory located at addr */ NODISCARD InstructionWalker insertWriteDMA( - Method& method, InstructionWalker it, const Value& src, const Value& addr, const bool useMutex = true); + Method& method, InstructionWalker it, const Value& src, const Value& addr, const bool useMutex = false); /* * Tries to find a combination of a vector of an integer-type and a number of vectors to match the given size in @@ -660,6 +660,14 @@ namespace vc4c * is lowered into VPM). */ const Local* originalAddress; + /* + * Whether data in this area is packed. + * + * If this flag is set, all space in this VPM area is assumed to be used, making it a "continuous" cache + * space. This applies e.g. for memory of vector-types with 16 elements (filling the whole row) as well as + * any memory location not accessed by the QPU at all (no need to pad up to 16-element vectors). + */ + const bool isContinuous; void checkAreaSize(unsigned requestedSize) const; @@ -694,6 +702,8 @@ namespace vc4c * When writing into VPM, a QPU always writes vectors of 16 elements. Since the DMA configuration cannot set * a stride of less than a row, we would not be able to transfer the second, third, etc. value without * copying all the junk of the remaining (unset) vector-elements of the previous values. + * + * TODO is this true at all? What about DMA write stride setup "Blockmode" bit? And DAM read setup "VPitch"? */ bool canBeAccessedViaDMA() const; @@ -715,27 +725,27 @@ namespace vc4c /* * Generates a VPM-to-RAM DMA write setup for storing the contents of the VPM area into RAM with the given - * element-type and number of values of the given type. + * element-type and number of rows of the given type. * * If the data-type is set to unknown, the element-type of the local associated with this area is used */ - VPWDMASetup toWriteDMASetup(const DataType& elementType, uint8_t numValues = 1) const; + VPWDMASetup toWriteDMASetup(const DataType& elementType, uint8_t numRows = 1) const; /* * Generates a VPM-to-QPU read setup for accessing the base-address of this VPM area for the given number of - * elements of the given data-type. + * rows of the given data-type. * * If the data-type is set to unknown, the default element-type of this area is used */ - VPRGenericSetup toReadSetup(const DataType& elementType, uint8_t numValues = 1) const; + VPRGenericSetup toReadSetup(const DataType& elementType, uint8_t numRows = 1) const; /* * Generates a RAM-to-VPM DMA read setup for loading the contents of a memory address into this VPM area - * given the element-type and numbr of values of the given type. 
+ * given the element-type and number of rows of the given type. * * If the data-type is set to unknown, the default element-type of this area is used */ - VPRDMASetup toReadDMASetup(const DataType& elementType, uint8_t numValues = 1) const; + VPRDMASetup toReadDMASetup(const DataType& elementType, uint8_t numRows = 1) const; std::string to_string() const; }; @@ -778,37 +788,38 @@ namespace vc4c * NOTE: the inAreaOffset is the offset in bytes */ NODISCARD InstructionWalker insertReadVPM(Method& method, InstructionWalker it, const Value& dest, - const VPMArea* area = nullptr, bool useMutex = true, const Value& inAreaOffset = INT_ZERO); + const VPMArea* area = nullptr, bool useMutex = false, const Value& inAreaOffset = INT_ZERO); + /* * Inserts a write from a QPU register into VPM * * NOTE: the inAreaOffset is the offset in bytes */ NODISCARD InstructionWalker insertWriteVPM(Method& method, InstructionWalker it, const Value& src, - const VPMArea* area = nullptr, bool useMutex = true, const Value& inAreaOffset = INT_ZERO); + const VPMArea* area = nullptr, bool useMutex = false, const Value& inAreaOffset = INT_ZERO); /* * Inserts a read from RAM into VPM via DMA */ NODISCARD InstructionWalker insertReadRAM(Method& method, InstructionWalker it, const Value& memoryAddress, - const DataType& type, const VPMArea* area = nullptr, bool useMutex = true, - const Value& inAreaOffset = INT_ZERO); + const DataType& type, const VPMArea* area = nullptr, bool useMutex = false, + const Value& inAreaOffset = INT_ZERO, const Value& numEntries = INT_ONE); /* * Inserts a write from VPM into RAM via DMA */ NODISCARD InstructionWalker insertWriteRAM(Method& method, InstructionWalker it, const Value& memoryAddress, - const DataType& type, const VPMArea* area = nullptr, bool useMutex = true, - const Value& inAreaOffset = INT_ZERO); + const DataType& type, const VPMArea* area = nullptr, bool useMutex = false, + const Value& inAreaOffset = INT_ZERO, const Value& numEntries = INT_ONE); /* * Inserts a copy from RAM via DMA and VPM into RAM */ NODISCARD InstructionWalker insertCopyRAM(Method& method, InstructionWalker it, const Value& destAddress, - const Value& srcAddress, unsigned numBytes, const VPMArea* area = nullptr, bool useMutex = true); + const Value& srcAddress, unsigned numBytes, const VPMArea* area = nullptr, bool useMutex = false); /* * Inserts a filling of a memory-area with a single value from VPM */ NODISCARD InstructionWalker insertFillRAM(Method& method, InstructionWalker it, const Value& memoryAddress, - const DataType& type, unsigned numCopies, const VPMArea* area = nullptr, bool useMutex = true); + const DataType& type, unsigned numCopies, const VPMArea* area = nullptr, bool useMutex = false); /* * Updates the maximum size used by the scratch area. @@ -823,6 +834,11 @@ namespace vc4c */ static DataType getVPMStorageType(const DataType& type); + /* + * Prints the currently configured usage of the VPM (areas and types) to the log output + */ + void dumpUsage() const; + private: const unsigned maximumVPMSize; std::vector> areas; @@ -877,8 +893,8 @@ namespace vc4c /* * Returns the instruction related to the current VPM access of the instruction given. * - * This function looks within the same mutex-lock block at the preceding and following instructions to find - * the instructions required for the given VPM access. + * This function looks within the same mutex-lock block (if any) at the preceding and following instructions to + * find the instructions required for the given VPM access. 
*/ VPMInstructions findRelatedVPMInstructions(InstructionWalker anyVPMInstruction, bool isVPMRead); } // namespace periphery diff --git a/src/tools/Emulator.cpp b/src/tools/Emulator.cpp index 8d6f5053..5bf4536b 100644 --- a/src/tools/Emulator.cpp +++ b/src/tools/Emulator.cpp @@ -194,15 +194,15 @@ void Registers::writeRegister(Register reg, const Value& val, std::bitset<16> el else if(reg.num == REG_UNIFORM_ADDRESS.num) qpu.uniforms.setUniformAddress(getActualValue(modifiedValue)); else if(reg.num == REG_VPM_IO.num) - qpu.vpm.writeValue(getActualValue(modifiedValue)); + qpu.vpm.writeValue(qpu.ID, getActualValue(modifiedValue)); else if(reg == REG_VPM_IN_SETUP) - qpu.vpm.setReadSetup(getActualValue(modifiedValue)); + qpu.vpm.setReadSetup(qpu.ID, getActualValue(modifiedValue)); else if(reg == REG_VPM_OUT_SETUP) - qpu.vpm.setWriteSetup(getActualValue(modifiedValue)); + qpu.vpm.setWriteSetup(qpu.ID, getActualValue(modifiedValue)); else if(reg == REG_VPM_DMA_LOAD_ADDR) - qpu.vpm.setDMAReadAddress(getActualValue(modifiedValue)); + qpu.vpm.setDMAReadAddress(qpu.ID, getActualValue(modifiedValue)); else if(reg == REG_VPM_DMA_STORE_ADDR) - qpu.vpm.setDMAWriteAddress(getActualValue(modifiedValue)); + qpu.vpm.setDMAWriteAddress(qpu.ID, getActualValue(modifiedValue)); else if(reg.num == REG_MUTEX.num) qpu.mutex.unlock(qpu.ID); else if(reg.num == REG_SFU_RECIP.num) @@ -270,13 +270,13 @@ std::pair Registers::readRegister(Register reg) if(reg.num == REG_VPM_IO.num) { if(readCache.find(REG_VPM_IO) == readCache.end()) - setReadCache(REG_VPM_IO, qpu.vpm.readValue()); + setReadCache(REG_VPM_IO, qpu.vpm.readValue(qpu.ID)); return std::make_pair(readCache.at(REG_VPM_IO), true); } if(reg == REG_VPM_DMA_LOAD_WAIT) - return std::make_pair(UNDEFINED_VALUE, qpu.vpm.waitDMARead()); + return std::make_pair(UNDEFINED_VALUE, qpu.vpm.waitDMARead(qpu.ID)); if(reg == REG_VPM_DMA_STORE_WAIT) - return std::make_pair(UNDEFINED_VALUE, qpu.vpm.waitDMAWrite()); + return std::make_pair(UNDEFINED_VALUE, qpu.vpm.waitDMAWrite(qpu.ID)); if(reg.num == REG_MUTEX.num) { if(readCache.find(REG_MUTEX) == readCache.end()) @@ -444,19 +444,36 @@ std::pair TMUs::readTMU() // need to select the first triggered read in both queues std::queue>* queue = nullptr; + bool tmuFlag = false; if(!tmu0ResponseQueue.empty()) + { queue = &tmu0ResponseQueue; + tmuFlag = false; + } if(!tmu1ResponseQueue.empty()) { if(queue == nullptr) + { queue = &tmu1ResponseQueue; + tmuFlag = true; + } else if(tmu1ResponseQueue.front().second < queue->front().second) + { queue = &tmu1ResponseQueue; + tmuFlag = true; + } } auto front = queue->front(); queue->pop(); - PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 68, "TMU read", 1); + if(!tmuFlag) + { + PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 68, "TMU0 read", 1); + } + else + { + PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 69, "TMU1 read", 1); + } return std::make_pair(front.first, true); } @@ -521,7 +538,16 @@ bool TMUs::triggerTMURead(uint8_t tmu) throw CompilationError(CompilationStep::GENERAL, "TMU response queue is full!"); auto val = requestQueue.front(); - PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 65, "TMU read trigger", val.second + 9 <= qpu.getCurrentCycle()); + if(tmu == 0) + { + PROFILE_COUNTER( + vc4c::profiler::COUNTER_EMULATOR + 65, "TMU0 read trigger", val.second + 9 <= qpu.getCurrentCycle()); + } + else + { + PROFILE_COUNTER( + vc4c::profiler::COUNTER_EMULATOR + 66, "TMU1 read trigger", val.second + 9 <= qpu.getCurrentCycle()); + } if(val.second + 9 > qpu.getCurrentCycle()) // 
block for at least 9 cycles return false; @@ -664,9 +690,9 @@ static std::pair toStride(T setup) throw CompilationError(CompilationStep::GENERAL, "Unhandled VPM type-size", std::to_string(setup.getSize())); } -Value VPM::readValue() +Value VPM::readValue(unsigned char qpu) { - periphery::VPRSetup setup = periphery::VPRSetup::fromLiteral(vpmReadSetup); + periphery::VPRSetup setup = periphery::VPRSetup::fromLiteral(vpmReadSetup[qpu]); if(setup.value == 0) logging::warn() << "VPM generic setup was not previously set: " << setup.to_string() << logging::endl; @@ -718,7 +744,7 @@ Value VPM::readValue() setup.genericSetup.setAddress( static_cast(setup.genericSetup.getAddress() + setup.genericSetup.getStride())); setup.genericSetup.setNumber(static_cast((16 + setup.genericSetup.getNumber() - 1) % 16)); - vpmReadSetup = setup.value; + vpmReadSetup[qpu] = setup.value; logging::debug() << "Read value from VPM: " << result.to_string(false, true) << logging::endl; logging::debug() << "New read setup is now: " << setup.to_string() << logging::endl; @@ -727,9 +753,9 @@ Value VPM::readValue() return result; } -void VPM::writeValue(const Value& val) +void VPM::writeValue(unsigned char qpu, const Value& val) { - periphery::VPWSetup setup = periphery::VPWSetup::fromLiteral(vpmWriteSetup); + periphery::VPWSetup setup = periphery::VPWSetup::fromLiteral(vpmWriteSetup[qpu]); if(setup.value == 0) logging::warn() << "VPM generic setup was not previously set: " << setup.to_string() << logging::endl; @@ -787,55 +813,55 @@ void VPM::writeValue(const Value& val) setup.genericSetup.setAddress( static_cast(setup.genericSetup.getAddress() + setup.genericSetup.getStride())); - vpmWriteSetup = setup.value; + vpmWriteSetup[qpu] = setup.value; logging::debug() << "Wrote value into VPM: " << val.to_string(true, true) << logging::endl; logging::debug() << "New write setup is now: " << setup.to_string() << logging::endl; PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 90, "VPM written", 1); } -void VPM::setWriteSetup(const Value& val) +void VPM::setWriteSetup(unsigned char qpu, const Value& val) { const Value& element0 = val.hasContainer() ? val.container().elements[0] : val; if(element0.isUndefined()) throw CompilationError(CompilationStep::GENERAL, "Undefined VPM setup value", val.to_string()); periphery::VPWSetup setup = periphery::VPWSetup::fromLiteral(element0.getLiteralValue()->unsignedInt()); if(setup.isDMASetup()) - dmaWriteSetup = setup.value; + dmaWriteSetup[qpu] = setup.value; else if(setup.isGenericSetup()) - vpmWriteSetup = setup.value; + vpmWriteSetup[qpu] = setup.value; else if(setup.isStrideSetup()) - writeStrideSetup = setup.value; + writeStrideSetup[qpu] = setup.value; else logging::warn() << "Writing unknown VPM write setup: " << element0.getLiteralValue()->unsignedInt() << logging::endl; logging::debug() << "Set VPM write setup: " << setup.to_string() << logging::endl; } -void VPM::setReadSetup(const Value& val) +void VPM::setReadSetup(unsigned char qpu, const Value& val) { const Value& element0 = val.hasContainer() ? 
val.container().elements[0] : val; if(element0.isUndefined()) throw CompilationError(CompilationStep::GENERAL, "Undefined VPM setup value", val.to_string()); periphery::VPRSetup setup = periphery::VPRSetup::fromLiteral(element0.getLiteralValue()->unsignedInt()); if(setup.isDMASetup()) - dmaReadSetup = setup.value; + dmaReadSetup[qpu] = setup.value; else if(setup.isGenericSetup()) - vpmReadSetup = setup.value; + vpmReadSetup[qpu] = setup.value; else if(setup.isStrideSetup()) - readStrideSetup = setup.value; + readStrideSetup[qpu] = setup.value; else logging::warn() << "Writing unknown VPM read setup: " << element0.getLiteralValue()->unsignedInt() << logging::endl; logging::debug() << "Set VPM read setup: " << setup.to_string() << logging::endl; } -void VPM::setDMAWriteAddress(const Value& val) +void VPM::setDMAWriteAddress(unsigned char qpu, const Value& val) { const Value& element0 = val.hasContainer() ? val.container().elements[0] : val; if(element0.isUndefined()) throw CompilationError(CompilationStep::GENERAL, "Undefined DMA setup value", val.to_string()); - periphery::VPWSetup setup = periphery::VPWSetup::fromLiteral(dmaWriteSetup); + periphery::VPWSetup setup = periphery::VPWSetup::fromLiteral(dmaWriteSetup[qpu]); if(setup.value == 0) logging::warn() << "VPM DMA write setup was not previously set: " << setup.to_string() << logging::endl; @@ -855,7 +881,7 @@ void VPM::setDMAWriteAddress(const Value& val) throw CompilationError( CompilationStep::GENERAL, "Accessing more than a VPM row at once is not supported", setup.to_string()); - auto stride = periphery::VPWSetup::fromLiteral(writeStrideSetup).strideSetup.getStride(); + auto stride = periphery::VPWSetup::fromLiteral(writeStrideSetup[qpu]).strideSetup.getStride(); MemoryAddress address = static_cast(element0.getLiteralValue()->unsignedInt()); @@ -880,16 +906,16 @@ void VPM::setDMAWriteAddress(const Value& val) address += stride + (typeSize * sizes.second); } - lastDMAWriteTrigger = currentCycle; + lastDMAWriteTrigger[qpu] = currentCycle; PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 100, "write DMA write address", 1); } -void VPM::setDMAReadAddress(const Value& val) +void VPM::setDMAReadAddress(unsigned char qpu, const Value& val) { const Value& element0 = val.hasContainer() ? val.container().elements[0] : val; if(element0.isUndefined()) throw CompilationError(CompilationStep::GENERAL, "Undefined DMA setup value", val.to_string()); - periphery::VPRSetup setup = periphery::VPRSetup::fromLiteral(dmaReadSetup); + periphery::VPRSetup setup = periphery::VPRSetup::fromLiteral(dmaReadSetup[qpu]); if(setup.value == 0) logging::warn() << "VPM DMA read setup was not previously set: " << setup.to_string() << logging::endl; @@ -913,7 +939,7 @@ void VPM::setDMAReadAddress(const Value& val) throw CompilationError( CompilationStep::GENERAL, "Accessing more than a VPM row at once is not supported", setup.to_string()); - auto pitch = periphery::VPRSetup::fromLiteral(readStrideSetup).strideSetup.getPitch(); + auto pitch = periphery::VPRSetup::fromLiteral(readStrideSetup[qpu]).strideSetup.getPitch(); MemoryAddress address = static_cast(element0.getLiteralValue()->unsignedInt()); @@ -937,22 +963,24 @@ void VPM::setDMAReadAddress(const Value& val) address += pitch; } - lastDMAReadTrigger = currentCycle; + lastDMAReadTrigger[qpu] = currentCycle; PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 110, "write DMA read address", 1); } -bool VPM::waitDMAWrite() const +bool VPM::waitDMAWrite(unsigned char qpu) const { // XXX how many cycles? 
- PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 120, "wait DMA write", lastDMAWriteTrigger + 12 < currentCycle); - return lastDMAWriteTrigger + 12 < currentCycle; + PROFILE_COUNTER( + vc4c::profiler::COUNTER_EMULATOR + 120, "wait DMA write", lastDMAWriteTrigger[qpu] + 12 < currentCycle); + return lastDMAWriteTrigger[qpu] + 12 < currentCycle; } -bool VPM::waitDMARead() const +bool VPM::waitDMARead(unsigned char qpu) const { // XXX how many cycles? - PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 130, "wait DMA read", lastDMAReadTrigger + 12 < currentCycle); - return lastDMAReadTrigger + 12 < currentCycle; + PROFILE_COUNTER( + vc4c::profiler::COUNTER_EMULATOR + 130, "wait DMA read", lastDMAReadTrigger[qpu] + 12 < currentCycle); + return lastDMAReadTrigger[qpu] + 12 < currentCycle; } void VPM::incrementCycle() @@ -1767,7 +1795,6 @@ bool tools::emulate(std::vector>::const_it throw CompilationError(CompilationStep::GENERAL, "Cannot use more than 12 QPUs!"); Mutex mutex; - // FIXME is SFU execution per QPU or need SFUs be locked? std::array sfus; VPM vpm(memory); Semaphores semaphores; diff --git a/src/tools/Emulator.h b/src/tools/Emulator.h index 6f4310fb..596905f7 100644 --- a/src/tools/Emulator.h +++ b/src/tools/Emulator.h @@ -167,23 +167,29 @@ namespace vc4c class VPM : private NonCopyable { public: - VPM(Memory& memory) : - memory(memory), vpmReadSetup(0), vpmWriteSetup(0), dmaReadSetup(0), dmaWriteSetup(0), - readStrideSetup(0), writeStrideSetup(0), lastDMAReadTrigger(0), lastDMAWriteTrigger(0), currentCycle(0) + VPM(Memory& memory) : memory(memory), currentCycle(0) { + vpmReadSetup.fill(0); + vpmWriteSetup.fill(0); + dmaReadSetup.fill(0); + dmaWriteSetup.fill(0); + readStrideSetup.fill(0); + writeStrideSetup.fill(0); + lastDMAReadTrigger.fill(0); + lastDMAWriteTrigger.fill(0); } - Value readValue(); - void writeValue(const Value& val); + Value readValue(unsigned char qpu); + void writeValue(unsigned char qpu, const Value& val); - void setWriteSetup(const Value& val); - void setReadSetup(const Value& val); + void setWriteSetup(unsigned char qpu, const Value& val); + void setReadSetup(unsigned char qpu, const Value& val); - void setDMAWriteAddress(const Value& val); - void setDMAReadAddress(const Value& val); + void setDMAWriteAddress(unsigned char qpu, const Value& val); + void setDMAReadAddress(unsigned char qpu, const Value& val); - NODISCARD bool waitDMAWrite() const; - NODISCARD bool waitDMARead() const; + NODISCARD bool waitDMAWrite(unsigned char qpu) const; + NODISCARD bool waitDMARead(unsigned char qpu) const; void incrementCycle(); @@ -191,14 +197,14 @@ namespace vc4c private: Memory& memory; - uint32_t vpmReadSetup; - uint32_t vpmWriteSetup; - uint32_t dmaReadSetup; - uint32_t dmaWriteSetup; - uint32_t readStrideSetup; - uint32_t writeStrideSetup; - uint32_t lastDMAReadTrigger; - uint32_t lastDMAWriteTrigger; + std::array vpmReadSetup; + std::array vpmWriteSetup; + std::array dmaReadSetup; + std::array dmaWriteSetup; + std::array readStrideSetup; + std::array writeStrideSetup; + std::array lastDMAReadTrigger; + std::array lastDMAWriteTrigger; uint32_t currentCycle; std::array, 64> cache; From 86fba4ef2139ca4c33d92d1a0b06cc30ea00cb65 Mon Sep 17 00:00:00 2001 From: doe300 Date: Fri, 2 Nov 2018 10:11:21 +0100 Subject: [PATCH 2/6] Completely rewrites memory mapping Memory access is now mapped in following steps: * Determine prefered and fall-back lowering type per memory area * Check whether lowering type can be applied, reserve resources * Map all memory access to specified 
lowering level Also disables combining of VPM/DMA writes/reads for now. See #113 Effects (test-emulator, last 2 commits): Instructions: 45160 to 45779 (+1%) Cycles: 659247 to 661193 (+0.2%) Mutex waits: 282551 to 281459 (-0.3%) --- src/intermediate/MemoryInstruction.cpp | 7 + src/normalization/AddressCalculation.cpp | 289 +++++ src/normalization/AddressCalculation.h | 144 +++ src/normalization/MemoryAccess.cpp | 1271 +++------------------- src/normalization/MemoryMapChecks.cpp | 797 ++++++++++++++ src/normalization/MemoryMappings.cpp | 541 +++++++++ src/normalization/MemoryMappings.h | 77 ++ src/normalization/Normalizer.cpp | 28 +- src/normalization/sources.list | 5 + src/optimization/Combiner.cpp | 453 -------- src/optimization/Combiner.h | 3 - src/optimization/Optimizer.cpp | 3 - test/TestMemoryAccess.cpp | 2 + testing/local_private_storage.cl | 2 + 14 files changed, 2016 insertions(+), 1606 deletions(-) create mode 100644 src/normalization/AddressCalculation.cpp create mode 100644 src/normalization/AddressCalculation.h create mode 100644 src/normalization/MemoryMapChecks.cpp create mode 100644 src/normalization/MemoryMappings.cpp create mode 100644 src/normalization/MemoryMappings.h diff --git a/src/intermediate/MemoryInstruction.cpp b/src/intermediate/MemoryInstruction.cpp index 57acd92c..d440154b 100644 --- a/src/intermediate/MemoryInstruction.cpp +++ b/src/intermediate/MemoryInstruction.cpp @@ -94,6 +94,13 @@ MemoryInstruction::MemoryInstruction( { setArgument(0, src); setArgument(1, numEntries); + + if(numEntries != INT_ONE) + { + if(op != MemoryOperation::COPY && op != MemoryOperation::FILL) + throw CompilationError( + CompilationStep::LLVM_2_IR, "Can only use the entry count for copying of filling memory", to_string()); + } } std::string MemoryInstruction::to_string() const diff --git a/src/normalization/AddressCalculation.cpp b/src/normalization/AddressCalculation.cpp new file mode 100644 index 00000000..081571e4 --- /dev/null +++ b/src/normalization/AddressCalculation.cpp @@ -0,0 +1,289 @@ +/* + * Author: doe300 + * + * See the file "LICENSE" for the full license governing this code. + */ + +#include "AddressCalculation.h" + +#include "../Locals.h" +#include "../intermediate/IntermediateInstruction.h" +#include "../intermediate/operators.h" +#include "log.h" + +using namespace vc4c; +using namespace vc4c::intermediate; +using namespace vc4c::normalization; +using namespace vc4c::operators; + +static BaseAndOffset findOffset(const Value& val) +{ + if(!val.hasLocal()) + return BaseAndOffset(); + const LocalUser* writer = val.getSingleWriter(); + if(writer != nullptr) + { + const Optional offset = writer->precalculate(8); + if(offset && offset->isLiteralValue()) + { + return BaseAndOffset(NO_VALUE, offset->getLiteralValue()->signedInt()); + } + } + return BaseAndOffset(); +} + +BaseAndOffset normalization::findBaseAndOffset(const Value& val) +{ + // TODO add support for offsets via getlocal/global_id, etc. + // need to the set base to addr + offset and the offset to the offset of the offset (e.g. param[get_local_id(0) + + // 7]) but how to determine? 
+ if(!val.hasLocal()) + return BaseAndOffset(); + if(val.local()->is() || val.local()->is() || val.local()->is()) + return BaseAndOffset(val, 0); + + // follow the references + const Local* ref = val.local()->getBase(false); + if(ref != val.local()) + return findBaseAndOffset(ref->createReference()); + if(val.local()->reference.first != nullptr && val.local()->reference.second != ANY_ELEMENT) + return BaseAndOffset(val.local()->reference.first->createReference(), val.local()->reference.second); + + const auto writers = val.local()->getUsers(LocalUse::Type::WRITER); + if(writers.size() != 1) + return BaseAndOffset(); + + // The reader can be one of several valid cases: + // 1. a move from another local -> need to follow the move + if(dynamic_cast(*writers.begin()) != nullptr) + return findBaseAndOffset(dynamic_cast(*writers.begin())->getSource()); + const auto& args = (*writers.begin())->getArguments(); + // 2. an addition with a local and a literal -> the local is the base, the literal the offset + if(dynamic_cast((*writers.begin())) != nullptr && + dynamic_cast((*writers.begin()))->op == OP_ADD && args.size() == 2 && + std::any_of(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); }) && + std::any_of( + args.begin(), args.end(), [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) + { + return BaseAndOffset( + std::find_if(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); }) + ->local() + ->getBase(false) + ->createReference(), + static_cast((*std::find_if(args.begin(), args.end(), + [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) + .getLiteralValue() + ->signedInt() / + val.type.getElementType().getPhysicalWidth())); + } + + // 3. an addition with two locals -> one is the base, the other the calculation of the literal + if(dynamic_cast((*writers.begin())) != nullptr && + dynamic_cast((*writers.begin()))->op == OP_ADD && args.size() == 2 && + std::all_of(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); })) + { + const auto offset0 = findOffset(args[0]); + const auto offset1 = findOffset(args[1]); + if(offset0.offset && args[1].hasLocal()) + return BaseAndOffset(args[1].local()->getBase(false)->createReference(), + static_cast(offset0.offset.value() / val.type.getElementType().getPhysicalWidth())); + if(offset1.offset && args[0].hasLocal()) + return BaseAndOffset(args[0].local()->getBase(false)->createReference(), + static_cast(offset1.offset.value() / val.type.getElementType().getPhysicalWidth())); + } + /* + if(writers.size() == 1) + { + // couldn't find literal offset for any direct base, try with arbitrary values + ref = val.local->getBase(true); + Optional offset = NO_VALUE; + for(const auto& arg : (*writers.begin())->getArguments()) + { + if(ref != nullptr && arg.hasLocal() && arg.local->getBase(false) == ref) + // skip finding the same base again + continue; + auto tmp = findBaseAndOffset(arg); + if(tmp.base && tmp.base->local->getBase(true) == ref && tmp.offset.is(0)) + // this parameter is the base itself, is already handled + continue; + // TODO how to combine the offsets? + // TODO also need to handle non-addition of offsets (e.g. ptr = base + (offset + size * i)) + logging::debug() << "Found offset of " << tmp.base.to_string() << " + " + << (tmp.offset ? 
tmp.offset.value() : -1) << logging::endl; + logging::debug() << "Found offset of " << tmp.base.to_string() << " with expression: " + << vc4c::Expression::createExpression(*(*writers.begin())).to_string() << + logging::endl; + } + // TODO why is this called twice? The whole function, from outside + logging::debug() << "Found base and non-literal offset: " << ref->to_string() << " - " << offset.to_string() + << logging::endl; + if(ref && (ref->residesInMemory() || (ref->is() && ref->type.isPointerType()))) + return BaseAndOffset(ref->createReference(), {}); + } + */ + + return BaseAndOffset(); +} + +MemoryType normalization::toMemoryType(periphery::VPMUsage usage) +{ + switch(usage) + { + case periphery::VPMUsage::SCRATCH: + case periphery::VPMUsage::LOCAL_MEMORY: + return MemoryType::VPM_SHARED_ACCESS; + case periphery::VPMUsage::REGISTER_SPILLING: + case periphery::VPMUsage::STACK: + return MemoryType::VPM_PER_QPU; + } + throw CompilationError(CompilationStep::NORMALIZER, + "Unknown VPM usage type to map to memory type: ", std::to_string(static_cast(usage))); +} + +InstructionWalker normalization::insertAddressToOffset( + InstructionWalker it, Method& method, Value& out, const Local* baseAddress, const MemoryInstruction* mem) +{ + auto ptrVal = mem->op == MemoryOperation::READ ? mem->getSource() : mem->getDestination(); + auto indexOp = dynamic_cast(ptrVal.getSingleWriter()); + if(!indexOp) + { + // for stores, the store itself is also a write instruction + auto writers = ptrVal.local()->getUsers(LocalUse::Type::WRITER); + if(writers.size() == 2 && writers.find(mem) != writers.end()) + { + writers.erase(mem); + indexOp = dynamic_cast(*writers.begin()); + } + } + if(ptrVal.hasLocal(baseAddress)) + { + // trivial case, the offset is zero + out = INT_ZERO; + } + else if(indexOp && indexOp->readsLocal(baseAddress) && indexOp->op == OP_ADD) + { + // for simple version where the index is base address + offset, simple use the offset directly + out = indexOp->getFirstArg().hasLocal(baseAddress) ? 
indexOp->getSecondArg().value() : indexOp->getFirstArg(); + } + else + { + // for more complex versions, calculate offset by subtracting base address from result + // address + out = assign(it, baseAddress->type, "%pointer_diff") = ptrVal - baseAddress->createReference(); + } + return it; +} + +InstructionWalker normalization::insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out, + const Local* baseAddress, MemoryType type, const MemoryInstruction* mem) +{ + Value tmpIndex = UNDEFINED_VALUE; + it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem); + if(type == MemoryType::VPM_PER_QPU) + { + // size of one stack-frame in bytes + auto stackByteSize = periphery::VPM::getVPMStorageType(baseAddress->type.getElementType()).getPhysicalWidth(); + // add offset of stack-frame + Value stackOffset = method.addNewLocal(TYPE_VOID.toPointerType(), "%stack_offset"); + Value tmp = method.addNewLocal(baseAddress->type); + assign(it, stackOffset) = mul24(Value(Literal(stackByteSize), TYPE_INT16), Value(REG_QPU_NUMBER, TYPE_INT8)); + out = assign(it, TYPE_VOID.toPointerType(), "%stack_offset") = tmpIndex + stackOffset; + } + else + { + out = tmpIndex; + } + return it; +} + +InstructionWalker normalization::insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out, + const Local* baseAddress, const Value& container, const MemoryInstruction* mem) +{ + Value tmpIndex = UNDEFINED_VALUE; + it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem); + // the index (as per index calculation) is in bytes, but we need index in elements, so divide by element size + out = assign(it, TYPE_VOID.toPointerType(), "%element_offset") = + tmpIndex / Literal(container.type.getElementType().getPhysicalWidth()); + return it; +} + +static Optional> combineAdditions( + Method& method, InstructionWalker referenceIt, FastMap& addedValues) +{ + if(addedValues.empty()) + return {}; + Optional> prevResult; + auto valIt = addedValues.begin(); + while(valIt != addedValues.end()) + { + if(prevResult) + { + auto newResult = method.addNewLocal(prevResult->first.type); + auto newFlags = intersect_flags(prevResult->second, valIt->second); + referenceIt.emplace(new Operation(OP_ADD, newResult, prevResult->first, valIt->first)); + referenceIt->addDecorations(newFlags); + referenceIt.nextInBlock(); + prevResult = std::make_pair(newResult, newFlags); + } + else + prevResult = std::make_pair(valIt->first, valIt->second); + valIt = addedValues.erase(valIt); + } + return prevResult; +} + +std::string MemoryAccessRange::to_string() const +{ + return (memoryInstruction->to_string() + + (memoryInstruction->writesRegister(REG_VPM_DMA_LOAD_ADDR) ? " - read " : " - write ")) + + (memoryObject->to_string() + + (groupUniformAddressParts.empty() ? 
" with" : " with work-group uniform offset and") + + " dynamic element range [") + + (std::to_string(offsetRange.minValue) + ", ") + (std::to_string(offsetRange.maxValue) + "]"); +} + +InstructionWalker normalization::insertAddressToWorkItemSpecificOffset( + InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range) +{ + auto dynamicParts = combineAdditions(method, it, range.dynamicAddressParts); + out = dynamicParts->first; + if(range.typeSizeShift) + out = assign(it, dynamicParts->first.type) = + (dynamicParts->first << (*range.typeSizeShift)->assertArgument(1), dynamicParts->second); + return it; +} + +InstructionWalker normalization::insertAddressToWorkGroupUniformOffset( + InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range) +{ + auto uniformParts = combineAdditions(method, it, range.groupUniformAddressParts); + out = uniformParts->first; + if(range.typeSizeShift) + out = assign(it, uniformParts->first.type) = + (uniformParts->first << (*range.typeSizeShift)->assertArgument(1), uniformParts->second); + static auto checkPointer = [](const Value& arg) -> bool { return arg.type.isPointerType(); }; + if(std::all_of( + range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer) || + std::none_of( + range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer)) + throw CompilationError(CompilationStep::NORMALIZER, "Cannot determine base address of addition", + range.baseAddressAdd->to_string()); + auto baseAddrIt = std::find_if( + range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer); + out = assign(it, range.baseAddressAdd->getOutput()->type) = + (*baseAddrIt + out, uniformParts->second, InstructionDecorations::WORK_GROUP_UNIFORM_VALUE); + return it; +} + +bool LocalUsageOrdering::operator()(const Local* l1, const Local* l2) const +{ + // prefer more usages over less usages + // since there is always only 1 writer for the local address, we prefer this over only counting readers for + // performance reasons + // TODO is this the correct way to do this? E.g. is there one usage per memory access? + if(l1->getUsers().size() > l2->getUsers().size()) + return true; + if(l1->getUsers().size() == l2->getUsers().size()) + return l1 < l2; + return false; +} \ No newline at end of file diff --git a/src/normalization/AddressCalculation.h b/src/normalization/AddressCalculation.h new file mode 100644 index 00000000..74fc493b --- /dev/null +++ b/src/normalization/AddressCalculation.h @@ -0,0 +1,144 @@ +/* + * Author: doe300 + * + * See the file "LICENSE" for the full license governing this code. 
+ */ +#ifndef VC4C_NORMALIZATION_ADDRESS_CALCULATION_H +#define VC4C_NORMALIZATION_ADDRESS_CALCULATION_H + +#include "../InstructionWalker.h" +#include "../Values.h" +#include "../analysis/ValueRange.h" +#include "../intermediate/IntermediateInstruction.h" +#include "../periphery/VPM.h" + +namespace vc4c +{ + namespace normalization + { + struct BaseAndOffset + { + Optional base; + Optional offset; + + explicit BaseAndOffset() : base(NO_VALUE), offset{} {} + + BaseAndOffset(const Optional& base, Optional offset) : base(base), offset(offset) {} + }; + + /* + * Returns the base memory address for the given value as well as the offset the value has to this location + */ + BaseAndOffset findBaseAndOffset(const Value& val); + + enum class MemoryType + { + // lower the value into a register and replace all loads with moves + QPU_REGISTER_READONLY, + // lower the value into a register and replace all loads/stores with moves + QPU_REGISTER_READWRITE, + // store in VPM in extra space per QPU!! + VPM_PER_QPU, + // store in VPM, QPUs share access to common data + VPM_SHARED_ACCESS, + // keep in RAM/global data segment, read via TMU + RAM_LOAD_TMU, + // keep in RAM/global data segment, access via VPM + RAM_READ_WRITE_VPM + }; + + struct MemoryAccess + { + FastSet accessInstructions; + MemoryType preferred; + MemoryType fallback; + }; + + MemoryType toMemoryType(periphery::VPMUsage usage); + + /* + * Converts an address (e.g. an index chain) and the corresponding base pointer to the pointer difference + * + * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes + * + * Returns (char*)address - (char*)baseAddress + */ + InstructionWalker insertAddressToOffset(InstructionWalker it, Method& method, Value& out, + const Local* baseAddress, const intermediate::MemoryInstruction* mem); + + /* + * Converts an address (e.g. an index-chain) and a base-address to the offset of the vector denoting the element + * accessed by the index-chain. In addition to #insertAddressToOffset, this function also handles multiple + * stack-frames. + * + * NOTE: The result is still the offset in bytes, since VPM#insertReadVPM and VPM#insertWriteVPM take the offset + * in bytes! + * + * Returns ((char*)address - (char*)baseAddress) + (typeSizeInBytes * stackIndex), where stackIndex is always + * zero (and therefore the second part omitted) for shared memory + */ + InstructionWalker insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out, + const Local* baseAddress, MemoryType type, const intermediate::MemoryInstruction* mem); + + /* + * Converts an address (e.g. 
index-chain) and the corresponding base-address to the element offset for an + * element of the type used in the container + * + * Return ((char*)address - (char*)baseAddress) / sizeof(elementType) + */ + InstructionWalker insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out, + const Local* baseAddress, const Value& container, const intermediate::MemoryInstruction* mem); + + // represents analysis data for the range of memory accessed per memory object + struct MemoryAccessRange + { + const Local* memoryObject; + // the memory instruction accessing the memory object + InstructionWalker memoryInstruction; + // the instruction adding the offset to the base pointer, could be the same as addressWrite + InstructionWalker baseAddressAdd; + // the instruction converting the address offset from element offset to byte offset + Optional typeSizeShift; + // the work-group uniform parts of which the address offset is calculated from + FastMap groupUniformAddressParts; + // the dynamic parts of which the address offset is calculated from + FastMap dynamicAddressParts; + // the maximum range (in elements!) the memory is accessed in + analysis::IntegerRange offsetRange{0, 0}; + + std::string to_string() const; + }; + + /* + * Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as + * specified in range) to the work-item specific part only. + * This can be seen as a specialization of #insertAddressToOffset + * + * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes + * + * * E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_local_id(0) + */ + InstructionWalker insertAddressToWorkItemSpecificOffset( + InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range); + + /* + * Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as + * specified in range) to the work-group uniform part only. + * Adding the result of this function and #insertAddressToWorkItemSpecificOffset will result in the original + * address + * + * NOTE: The result itself is in "memory-address mode", meaning the offset is the number of bytes, since it is + * meant for addressing memory + * + * * E.g. 
get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_group_id(0) + */ + InstructionWalker insertAddressToWorkGroupUniformOffset( + InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range); + + struct LocalUsageOrdering + { + bool operator()(const Local* l1, const Local* l2) const; + }; + } /* namespace normalization */ +} /* namespace vc4c */ +#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */ \ No newline at end of file diff --git a/src/normalization/MemoryAccess.cpp b/src/normalization/MemoryAccess.cpp index e621dab4..d26a2600 100644 --- a/src/normalization/MemoryAccess.cpp +++ b/src/normalization/MemoryAccess.cpp @@ -6,23 +6,16 @@ #include "MemoryAccess.h" -#include "../Expression.h" #include "../InstructionWalker.h" #include "../Module.h" #include "../Profiler.h" -#include "../analysis/ValueRange.h" -#include "../intermediate/Helper.h" #include "../intermediate/IntermediateInstruction.h" #include "../intermediate/operators.h" -#include "../periphery/TMU.h" #include "../periphery/VPM.h" +#include "AddressCalculation.h" +#include "MemoryMappings.h" #include "log.h" -#include -#include -#include -#include - using namespace vc4c; using namespace vc4c::normalization; using namespace vc4c::intermediate; @@ -36,125 +29,6 @@ using namespace vc4c::operators; * - too complex phi-nodes with pointers: clNN/im2col.cl */ -struct BaseAndOffset -{ - Optional base; - Optional offset; - - explicit BaseAndOffset() : base(NO_VALUE), offset{} {} - - BaseAndOffset(const Optional& base, Optional offset) : base(base), offset(offset) {} -}; - -static BaseAndOffset findOffset(const Value& val) -{ - if(!val.hasLocal()) - return BaseAndOffset(); - const LocalUser* writer = val.getSingleWriter(); - if(writer != nullptr) - { - const Optional offset = writer->precalculate(8); - if(offset && offset->isLiteralValue()) - { - return BaseAndOffset(NO_VALUE, offset->getLiteralValue()->signedInt()); - } - } - return BaseAndOffset(); -} - -static BaseAndOffset findBaseAndOffset(const Value& val) -{ - // TODO add support for offsets via getlocal/global_id, etc. - // need to the set base to addr + offset and the offset to the offset of the offset (e.g. param[get_local_id(0) + - // 7]) but how to determine? - if(!val.hasLocal()) - return BaseAndOffset(); - if(val.local()->is() || val.local()->is() || val.local()->is()) - return BaseAndOffset(val, 0); - - // follow the references - const Local* ref = val.local()->getBase(false); - if(ref != val.local()) - return findBaseAndOffset(ref->createReference()); - if(val.local()->reference.first != nullptr && val.local()->reference.second != ANY_ELEMENT) - return BaseAndOffset(val.local()->reference.first->createReference(), val.local()->reference.second); - - const auto writers = val.local()->getUsers(LocalUse::Type::WRITER); - if(writers.size() != 1) - return BaseAndOffset(); - - // The reader can be one of several valid cases: - // 1. a move from another local -> need to follow the move - if(dynamic_cast(*writers.begin()) != nullptr) - return findBaseAndOffset(dynamic_cast(*writers.begin())->getSource()); - const auto& args = (*writers.begin())->getArguments(); - // 2. 
an addition with a local and a literal -> the local is the base, the literal the offset - if(dynamic_cast((*writers.begin())) != nullptr && - dynamic_cast((*writers.begin()))->op == OP_ADD && args.size() == 2 && - std::any_of(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); }) && - std::any_of( - args.begin(), args.end(), [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) - { - return BaseAndOffset( - std::find_if(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); }) - ->local() - ->getBase(false) - ->createReference(), - static_cast((*std::find_if(args.begin(), args.end(), - [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) - .getLiteralValue() - ->signedInt() / - val.type.getElementType().getPhysicalWidth())); - } - - // 3. an addition with two locals -> one is the base, the other the calculation of the literal - if(dynamic_cast((*writers.begin())) != nullptr && - dynamic_cast((*writers.begin()))->op == OP_ADD && args.size() == 2 && - std::all_of(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); })) - { - const auto offset0 = findOffset(args[0]); - const auto offset1 = findOffset(args[1]); - if(offset0.offset && args[1].hasLocal()) - return BaseAndOffset(args[1].local()->getBase(false)->createReference(), - static_cast(offset0.offset.value() / val.type.getElementType().getPhysicalWidth())); - if(offset1.offset && args[0].hasLocal()) - return BaseAndOffset(args[0].local()->getBase(false)->createReference(), - static_cast(offset1.offset.value() / val.type.getElementType().getPhysicalWidth())); - } - /* - if(writers.size() == 1) - { - // couldn't find literal offset for any direct base, try with arbitrary values - ref = val.local->getBase(true); - Optional offset = NO_VALUE; - for(const auto& arg : (*writers.begin())->getArguments()) - { - if(ref != nullptr && arg.hasLocal() && arg.local->getBase(false) == ref) - // skip finding the same base again - continue; - auto tmp = findBaseAndOffset(arg); - if(tmp.base && tmp.base->local->getBase(true) == ref && tmp.offset.is(0)) - // this parameter is the base itself, is already handled - continue; - // TODO how to combine the offsets? - // TODO also need to handle non-addition of offsets (e.g. ptr = base + (offset + size * i)) - logging::debug() << "Found offset of " << tmp.base.to_string() << " + " - << (tmp.offset ? tmp.offset.value() : -1) << logging::endl; - logging::debug() << "Found offset of " << tmp.base.to_string() << " with expression: " - << vc4c::Expression::createExpression(*(*writers.begin())).to_string() << - logging::endl; - } - // TODO why is this called twice? The whole function, from outside - logging::debug() << "Found base and non-literal offset: " << ref->to_string() << " - " << offset.to_string() - << logging::endl; - if(ref && (ref->residesInMemory() || (ref->is() && ref->type.isPointerType()))) - return BaseAndOffset(ref->createReference(), {}); - } - */ - - return BaseAndOffset(); -} - struct VPMAccessGroup { bool isVPMWrite; @@ -736,653 +610,129 @@ void normalization::resolveStackAllocation( } } -enum class MemoryType -{ - // lower the value into a register and replace all loads with moves - QPU_REGISTER_READONLY, - // lower the value into a register and replace all loads/stores with moves - QPU_REGISTER_READWRITE, - // store in VPM in extra space per QPU!! 
- VPM_PER_QPU, - // store in VPM, QPUs share access to common data - VPM_SHARED_ACCESS, - // keep in RAM/global data segment, read via TMU - RAM_LOAD_TMU, - // keep in RAM/global data segment, access via VPM - RAM_READ_WRITE_VPM -}; - -static MemoryType toMemoryType(periphery::VPMUsage usage) -{ - switch(usage) - { - case periphery::VPMUsage::SCRATCH: - case periphery::VPMUsage::LOCAL_MEMORY: - return MemoryType::VPM_SHARED_ACCESS; - case periphery::VPMUsage::REGISTER_SPILLING: - case periphery::VPMUsage::STACK: - return MemoryType::VPM_PER_QPU; - } - throw CompilationError(CompilationStep::NORMALIZER, - "Unknown VPM usage type to map to memory type: ", std::to_string(static_cast(usage))); -} - +/* clang-format off */ /* - * Converts an address (e.g. an index chain) and the corresponding base pointer to the pointer difference + * Matrix of memory types and storage locations: * - * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes + * | global | local | private | constant + * buffer | - |VPM/GD | QPU/VPM | QPU/GD + * parameter | RAM |RAM/(*)| - | RAM * - * Returns (char*)address - (char*)baseAddress - */ -static InstructionWalker insertAddressToOffset( - InstructionWalker it, Method& method, Value& out, const Local* baseAddress, const MemoryInstruction* mem) -{ - auto ptrVal = mem->op == MemoryOperation::READ ? mem->getSource() : mem->getDestination(); - auto indexOp = dynamic_cast(ptrVal.getSingleWriter()); - if(!indexOp) - { - // for stores, the store itself is also a write instruction - auto writers = ptrVal.local()->getUsers(LocalUse::Type::WRITER); - if(writers.size() == 2 && writers.find(mem) != writers.end()) - { - writers.erase(mem); - indexOp = dynamic_cast(*writers.begin()); - } - } - if(ptrVal.hasLocal(baseAddress)) - { - // trivial case, the offset is zero - out = INT_ZERO; - } - else if(indexOp && indexOp->readsLocal(baseAddress) && indexOp->op == OP_ADD) - { - // for simple version where the index is base address + offset, simple use the offset directly - out = indexOp->getFirstArg().hasLocal(baseAddress) ? indexOp->getSecondArg().value() : indexOp->getFirstArg(); - } - else - { - // for more complex versions, calculate offset by subtracting base address from result - // address - out = assign(it, baseAddress->type, "%pointer_diff") = ptrVal - baseAddress->createReference(); - } - return it; -} - -/* - * Converts an address (e.g. an index-chain) and a base-address to the offset of the vector denoting the element - * accessed by the index-chain. In addition to #insertAddressToOffset, this function also handles multiple stack-frames. + * buffer is both inside and outside of function scope (where allowed) + * - : is not allowed by OpenCL + * (*) could lower into VPM if the highest index accessed is known and fits? + * GD: global data segment of kernel buffer + * RAM: load via TMU if possible (not written to), otherwise use VPM * - * NOTE: The result is still the offset in bytes, since VPM#insertReadVPM and VPM#insertWriteVPM take the offset in - * bytes! 
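To illustrate the offset helpers this hunk moves into AddressCalculation.h: all three conversions are plain pointer arithmetic. The following sketch is illustrative only and uses plain integers instead of IR values; stackFrameBytes stands for the byte size of one stack frame (VPM::getVPMStorageType(elementType).getPhysicalWidth() in the implementation) and qpuNumber for the value read from REG_QPU_NUMBER.

#include <cstdint>

// Illustrative sketch of the documented formulas, not the compiler's IR:
//   byte offset        = (char*)address - (char*)baseAddress
//   per-QPU VPM offset = byte offset + stackFrameBytes * qpuNumber
//   element offset     = byte offset / sizeof(element)
static std::uint32_t byteOffset(std::uint32_t address, std::uint32_t baseAddress)
{
    return address - baseAddress;
}

static std::uint32_t stackByteOffset(
    std::uint32_t address, std::uint32_t baseAddress, std::uint32_t stackFrameBytes, std::uint32_t qpuNumber)
{
    // every QPU addresses its own stack frame inside the per-QPU VPM area
    return byteOffset(address, baseAddress) + stackFrameBytes * qpuNumber;
}

static std::uint32_t elementOffset(std::uint32_t address, std::uint32_t baseAddress, std::uint32_t elementBytes)
{
    // index into the container in elements instead of bytes
    return byteOffset(address, baseAddress) / elementBytes;
}

For example, with a 64-byte stack frame, QPU 3 accessing baseAddress + 8 ends up at in-area byte offset 8 + 3 * 64 = 200; for shared memory the QPU term is omitted, matching the note that the stack part is dropped there.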
+ * Sources: + * https://stackoverflow.com/questions/22471466/why-program-global-scope-variables-must-be-constant#22474119 + * https://stackoverflow.com/questions/17431941/how-to-use-arrays-in-program-global-scope-in-opencl * - * Returns ((char*)address - (char*)baseAddress) + (typeSizeInBytes * stackIndex), where stackIndex is always zero (and - * therefore the second part omitted) for shared memory - */ -static InstructionWalker insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out, - const Local* baseAddress, MemoryType type, const MemoryInstruction* mem) -{ - Value tmpIndex = UNDEFINED_VALUE; - it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem); - if(type == MemoryType::VPM_PER_QPU) - { - // size of one stack-frame in bytes - auto stackByteSize = periphery::VPM::getVPMStorageType(baseAddress->type.getElementType()).getPhysicalWidth(); - // add offset of stack-frame - Value stackOffset = method.addNewLocal(TYPE_VOID.toPointerType(), "%stack_offset"); - Value tmp = method.addNewLocal(baseAddress->type); - assign(it, stackOffset) = mul24(Value(Literal(stackByteSize), TYPE_INT16), Value(REG_QPU_NUMBER, TYPE_INT8)); - out = assign(it, TYPE_VOID.toPointerType(), "%stack_offset") = tmpIndex + stackOffset; - } - else - { - out = tmpIndex; - } - return it; -} - -/* - * Converts an address (e.g. index-chain) and the corresponding base-address to the element offset for an element of the - * type used in the container + * + * Matrix of memory types and access ways: + * compile-time memory: __constant buffer with values known at compile-time + * constant memory: __constant or read-only __global/__local buffer/parameter + * private memory: __private buffer/stack allocations + * read-write memory: any other __global/__local buffer/parameter * - * Return ((char*)address - (char*)baseAddress) / sizeof(elementType) - */ -static InstructionWalker insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out, - const Local* baseAddress, const Value& container, const MemoryInstruction* mem) -{ - Value tmpIndex = UNDEFINED_VALUE; - it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem); - // the index (as per index calculation) is in bytes, but we need index in elements, so divide by element size - out = assign(it, TYPE_VOID.toPointerType(), "%element_offset") = - tmpIndex / Literal(container.type.getElementType().getPhysicalWidth()); - return it; -} - -/* - * Maps a memory access instruction to an instruction accessing RAM through VPM. + * | optimization | location | read | write | copy from | copy to | group | priority | + * compile-time memory | "normal" | GD | TMU | - | DMA/TMU(*) | - | (1) | 2 | + * | lowered load | QPU | register | - | VPM/register(*) | - | (2) | 1 | + * constant memory | "normal" | GD/RAM | TMU | - | DMA/TMU(*) | - | (1) | 2 | + * private memory | "normal" | GD | DMA | DMA | DMA | DMA | (3) | 3 | + * | lowered register | QPU | register | register | VPM/register(*) | VPM/TMU/register(*) | (2) | 1 | + * | lowered VPM | VPM | VPM | VPM | VPM/DMA | VPM/DMA | (4) | 2 | + * read-write memory | "normal" | GD/RAM | DMA | DMA | DMA | DMA | (3) | 3 | + * | lowered VPM | VPM | VPM | VPM | VPM/DMA | VPM/DMA | (4) | 1 | + * | cached VPM | VPM + GD/RAM | VPM + DMA | VPM + DMA | VPM/DMA | VPM/DMA | (4) | 2 | + * + * Special cases: + * (*) when copying from constant memory into register, TMU can be used instead. 
Copying from and to register is done inside the QPU * - * NOTE: At least one of the operands of the instruction to be mapped must be located in RAM - * NOTE: this is the least optimal mapping possible and should avoided if possible. */ -static InstructionWalker mapToVPMMemoryAccessInstructions( - Method& method, InstructionWalker it, const VPMArea* sourceArea = nullptr, const VPMArea* destArea = nullptr) -{ - if(sourceArea != nullptr && destArea != nullptr) - throw CompilationError(CompilationStep::NORMALIZER, - "Memory access with both operands located in VPM should have been handled already", it->to_string()); - const MemoryInstruction* mem = it.get(); - switch(mem->op) - { - case MemoryOperation::COPY: - { - if(!mem->getNumEntries().isLiteralValue()) - throw CompilationError(CompilationStep::OPTIMIZER, - "Copying dynamically sized memory is not yet implemented", mem->to_string()); - uint64_t numBytes = mem->getNumEntries().getLiteralValue()->unsignedInt() * - (mem->getSourceElementType().getScalarBitCount() * mem->getSourceElementType().getVectorWidth()) / 8; - if(numBytes > std::numeric_limits::max()) - throw CompilationError(CompilationStep::OPTIMIZER, "Cannot copy more than 4GB of data", mem->to_string()); - // TODO can the case occur, where the other value is not located in RAM?? - if((sourceArea || destArea) && mem->getNumEntries() != INT_ONE) - throw CompilationError(CompilationStep::OPTIMIZER, - "This version of copying from/to VPM cached data is not yet implemented", mem->to_string()); - // TODO are the following two blocks correct? - if(sourceArea) - { - Value inAreaOffset = UNDEFINED_VALUE; - it = insertAddressToStackOffset( - it, method, inAreaOffset, sourceArea->originalAddress, toMemoryType(sourceArea->usageType), mem); - // the VPM addressing for DMA access contains row and column offset. We only have row offset, so we need to - // shift the result to the correct position - inAreaOffset = assign(it, TYPE_VOID.toPointerType(), "%vpm_row_offset") = inAreaOffset << 4_val; - it = method.vpm->insertWriteRAM( - method, it, mem->getDestination(), mem->getSourceElementType(), sourceArea, true, inAreaOffset); - } - else if(destArea) - { - Value inAreaOffset = UNDEFINED_VALUE; - it = insertAddressToStackOffset( - it, method, inAreaOffset, destArea->originalAddress, toMemoryType(destArea->usageType), mem); - // the VPM addressing for DMA access contains row and column offset. We only have row offset, so we need to - // shift the result to the correct position - inAreaOffset = assign(it, TYPE_VOID.toPointerType(), "%vpm_row_offset") = inAreaOffset << 4_val; - it = method.vpm->insertReadRAM( - method, it, mem->getSource(), mem->getDestinationElementType(), destArea, true, inAreaOffset); - } - else - { - it = method.vpm->insertCopyRAM( - method, it, mem->getDestination(), mem->getSource(), static_cast(numBytes)); - } +/* clang-format on */ - auto* src = mem->getSource().hasLocal() ? mem->getSource().local()->getBase(true) : nullptr; - if(src && src->is()) - const_cast(src->as())->decorations = - add_flag(src->as()->decorations, ParameterDecorations::INPUT); - auto* dest = mem->getDestination().hasLocal() ? 
mem->getDestination().local()->getBase(true) : nullptr; - if(dest && dest->is()) - const_cast(dest->as())->decorations = - add_flag(dest->as()->decorations, ParameterDecorations::OUTPUT); - break; - } - case MemoryOperation::FILL: - { - if(!mem->getNumEntries().isLiteralValue()) - throw CompilationError(CompilationStep::OPTIMIZER, - "Filling dynamically sized memory is not yet implemented", mem->to_string()); - uint64_t numCopies = mem->getNumEntries().getLiteralValue()->unsignedInt(); - if(numCopies > std::numeric_limits::max()) - throw CompilationError(CompilationStep::OPTIMIZER, "Cannot fill more than 4GB of data", mem->to_string()); - if(destArea != nullptr) - { - // TODO should this also be handled already? - throw CompilationError( - CompilationStep::OPTIMIZER, "Filling VPM cached data is not yet implemented", mem->to_string()); - } - it.emplace(new intermediate::MutexLock(intermediate::MutexAccess::LOCK)); - it.nextInBlock(); - // TODO could optimize (e.g. for zero-initializers) by writing several bytes at once - it = method.vpm->insertWriteVPM(method, it, mem->getSource(), nullptr, false); - it = method.vpm->insertFillRAM(method, it, mem->getDestination(), mem->getSourceElementType(), - static_cast(numCopies), nullptr, false); - it.emplace(new intermediate::MutexLock(intermediate::MutexAccess::RELEASE)); - it.nextInBlock(); - auto* dest = mem->getDestination().hasLocal() ? mem->getDestination().local()->getBase(true) : nullptr; - if(dest && dest->is()) - const_cast(dest->as())->decorations = - add_flag(dest->as()->decorations, ParameterDecorations::OUTPUT); - break; - } - case MemoryOperation::READ: - { - if(sourceArea != nullptr) - throw CompilationError(CompilationStep::NORMALIZER, - "Reading from VPM mapped memory should already be handled", mem->to_string()); - it = periphery::insertReadDMA(method, it, mem->getDestination(), mem->getSource()); - auto* src = mem->getSource().hasLocal() ? mem->getSource().local()->getBase(true) : nullptr; - if(src && src->is()) - const_cast(src->as())->decorations = - add_flag(src->as()->decorations, ParameterDecorations::INPUT); - break; - } - case MemoryOperation::WRITE: - { - if(destArea != nullptr) - throw CompilationError(CompilationStep::NORMALIZER, - "Writing into VPM mapped memory should already be handled", mem->to_string()); - it = periphery::insertWriteDMA(method, it, mem->getSource(), mem->getDestination()); - auto* dest = mem->getDestination().hasLocal() ? mem->getDestination().local()->getBase(true) : nullptr; - if(dest && dest->is()) - const_cast(dest->as())->decorations = - add_flag(dest->as()->decorations, ParameterDecorations::OUTPUT); - break; - } - } - // remove MemoryInstruction - // since a copy may have another iterator to it, do not remove the element, just clear it - // the empty instruction is cleaned up in #combineVPMAccess - return mem->op == MemoryOperation::COPY ? (it.reset(nullptr), it) : it.erase(); -} - -static bool isMemoryOnlyRead(const Local* local) +void normalization::mapMemoryAccess(const Module& module, Method& method, const Configuration& config) { - auto base = local->getBase(true); - if(base->is() && has_flag(base->as()->decorations, ParameterDecorations::READ_ONLY)) - return true; - if(base->is() && base->as()->isConstant) - return true; - if(base->type.getPointerType() && base->type.getPointerType().value()->addressSpace == AddressSpace::CONSTANT) - return true; - - // TODO also check for no actual writes. Need to heed index-calculation from base! - return false; -} + /* + * 1. 
lower constant/private buffers into register + * lower global constant buffers into registers + * lower small enough private buffers to registers + * 2. generate TMU loads for read-only memory + * keep all read-only parameters in RAM, load via TMU + * also load constants via TMU, which could not be lowered into register + * 3. lower per-QPU (private) buffers into VPM + * 4. lower shared buffers (local) into VPM + * 5. cache RAM access in VPM + * 6. generate remaining instructions for RAM access via VPM scratch area + * TODO: + * 3.1 for memory located in RAM, try to group/queue reads/writes + */ + MemoryAccessMap memoryMapping; + FastSet memoryInstructions; + std::tie(memoryMapping, memoryInstructions) = determineMemoryAccess(method); -/* - * Maps all memory access instructions for the given memory location by placing it in RAM and accessing it via VPM - * - * NOTE: This is the least optimal way of accessing memory and should be avoided if possible - */ -static bool mapReadWriteToMemoryViaVPM(Method& method, const Local* local, - FastSet& memoryInstructions, FastMap& vpmMappedLocals, - FastSet& affectedBlocks) -{ - for(auto it : memoryInstructions) - { - auto mem = it.get(); - if(!mem) - // already optimized (e.g. lowered into VPM) - continue; - logging::debug() << "Generating memory access which cannot be optimized: " << mem->to_string() << logging::endl; - auto srcVpmArea = - mem->getSource().hasLocal() ? vpmMappedLocals[mem->getSource().local()->getBase(true)] : nullptr; - auto dstVpmArea = - mem->getDestination().hasLocal() ? vpmMappedLocals[mem->getDestination().local()->getBase(true)] : nullptr; - it = mapToVPMMemoryAccessInstructions(method, it, srcVpmArea, dstVpmArea); - affectedBlocks.emplace(it.getBasicBlock()); - } + FastMap infos; + infos.reserve(memoryMapping.size()); - // everything else throws errors - return true; -} + for(auto& mapping : memoryMapping) + infos.emplace(mapping.first, checkMemoryMapping(method, mapping.first, mapping.second)); -/* - * Returns the constant value which will be read from the given memory access instruction. - * - * The value is constant if: - * - the source memory location is constant - * - the index is constant or the value can be determined without knowing the exact index (e.g. all elements are the - * same) - */ -static Optional getConstantValue(const MemoryInstruction* mem) -{ - // can only read from constant global data, so the global is always the source - const Global* global = mem->getSource().local()->getBase(true)->as(); - if(mem->getSource().local()->reference.second >= 0 && global->value.hasContainer()) - // fixed index - return global->value.container().elements.at(mem->getSource().local()->reference.second); - else if(global->value.isLiteralValue()) - // scalar value - return global->value; - else if(global->value.isZeroInitializer()) - // all entries are the same - return Value::createZeroInitializer(global->value.type.getElementType()); - else if(global->value.isUndefined()) - // all entries are undefined - return Value(global->value.type.getElementType()); - else if(global->value.hasContainer() && global->value.container().isElementNumber()) - return ELEMENT_NUMBER_REGISTER; - else if(global->value.hasContainer() && global->value.container().isAllSame()) - // all entries are the same - return global->value.container().elements.at(0); - return NO_VALUE; -} + // After we fixed all the VPM areas used for specific purposes, we can check how big of a scratch size we need + // TODO rewrite scratch area to per-QPU? 
To not need mutex lock! + // Would need size of per QPU scratch are before mapping any instruction, should be possible with new + // check-all-first-map-then flow -/* - * Tries to convert the array-type pointed to by the given local to a vector-type to fit into a single register. - * - * For this conversion to succeed, the array-element type must be a scalar of bit-width <= 32-bit and the size of the - * array known to be less or equals to 16. - */ -static Optional convertSmallArrayToRegister(const Local* local) -{ - const Local* base = local->getBase(true); - if(base->type.getPointerType()) + // TODO sort locals by where to put them and then call 1. check of mapping and 2. mapping on all + for(auto& memIt : memoryInstructions) { - const auto& baseType = base->type.getPointerType().value()->elementType; - auto arrayType = baseType.getArrayType(); - if(arrayType && arrayType.value()->size <= NATIVE_VECTOR_SIZE && arrayType.value()->elementType.isScalarType()) - return arrayType.value()->elementType.toVectorType(static_cast(arrayType.value()->size)); - } - return {}; -} + auto mem = memIt.get(); + const auto srcBaseLocal = mem->getSource().hasLocal() ? mem->getSource().local()->getBase(true) : nullptr; + const auto dstBaseLocal = + mem->getDestination().hasLocal() ? mem->getDestination().local()->getBase(true) : nullptr; -/* - * Maps memory access to the given local into moves from/to the given register - * - * NOTE: This is the best optimization for memory access and should always be preferred. - * NOTE: This optimization cannot be applied if changes made to the lowered register need to be reflected to other QPUs. - */ -static InstructionWalker lowerReadWriteOfMemoryToRegister(InstructionWalker it, Method& method, const Local* local, - const Value& loweredRegister, const MemoryInstruction* mem) -{ - Value tmpIndex = UNDEFINED_VALUE; - it = insertAddressToElementOffset(it, method, tmpIndex, local, loweredRegister, mem); - if(mem->op == MemoryOperation::READ) - { - // TODO check whether index is guaranteed to be in range [0, 16[ - it = insertVectorExtraction(it, method, loweredRegister, tmpIndex, mem->getDestination()); - return it.erase(); - } - if(mem->op == MemoryOperation::WRITE) - { - // TODO need special handling for inserting multiple elements to set all new elements - it = insertVectorInsertion(it, method, loweredRegister, tmpIndex, mem->getSource()); - return it.erase(); - } - if(mem->op == MemoryOperation::COPY) - { - if(mem->getSource().hasLocal() && mem->getSource().local()->getBase(true) == local && - mem->getDestination().hasLocal() && mem->getDestination().local()->getBase(true) == local) - throw CompilationError(CompilationStep::NORMALIZER, - "Copy from and to same register lowered memory area is not supported", mem->to_string()); - if(mem->getSource().hasLocal() && mem->getSource().local()->getBase(true) == local) - { - // TODO check whether index is guaranteed to be in range [0, 16[ - auto tmp = method.addNewLocal(mem->getSourceElementType()); - it = insertVectorExtraction(it, method, loweredRegister, tmpIndex, tmp); - it.reset(new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), tmp)); - return it; - } - if(mem->getDestination().hasLocal() && mem->getDestination().local()->getBase(true) == local) - { - // TODO need special handling for inserting multiple elements to set all new elements - auto tmp = method.addNewLocal(mem->getDestinationElementType()); - it.emplace(new MemoryInstruction(MemoryOperation::READ, tmp, mem->getSource())); - it.nextInBlock(); - it 
= insertVectorInsertion(it, method, loweredRegister, tmpIndex, mem->getSource()); - return it.erase(); - } + auto srcInfoIt = srcBaseLocal ? infos.find(srcBaseLocal) : infos.end(); + const MemoryInfo& srcInfo = + srcInfoIt != infos.end() ? srcInfoIt->second : MemoryInfo{srcBaseLocal, MemoryType::QPU_REGISTER_READWRITE}; + auto dstInfoIt = dstBaseLocal ? infos.find(dstBaseLocal) : infos.end(); + const MemoryInfo& dstInfo = + dstInfoIt != infos.end() ? dstInfoIt->second : MemoryInfo{dstBaseLocal, MemoryType::QPU_REGISTER_READWRITE}; + + mapMemoryAccess(method, memIt, const_cast(mem), srcInfo, dstInfo); + // TODO mark local for prefetch/write-back (if necessary) } - throw CompilationError( - CompilationStep::NORMALIZER, "Unhandled case of lowering memory access to register", mem->to_string()); -} -/* - * Lowers access to a memory location into a register. - * - * This can be done for constant or private (stack) memory locations. - * - * NOTE: This is the best optimization for memory access and should be preferred, where applicable. - */ -static bool lowerMemoryToRegister( - Method& method, const Local* local, MemoryType type, FastSet& memoryInstructions) -{ - /* - * There are several cases of memory lowered into registers: - * - constant memory with constant index (direct value determinable) -> map to direct value - * - constant memory which fits into register but dynamic index -> map to register, index by vector rotation - * - private memory which fits into register -> map to register - * - private memory where the type can be converted to fit into register -> map to register + index by vector - * rotation - */ - auto toConvertedRegisterType = convertSmallArrayToRegister(local); - if(type == MemoryType::QPU_REGISTER_READONLY) - { - // can handle extra read on its own, no check required for other accesses - auto it = memoryInstructions.begin(); - while(it != memoryInstructions.end()) - { - const MemoryInstruction* mem = it->get(); - if(!mem) - { - // already converted (cannot happen, since this is the first round, but we don't care) - ++it; - continue; - } - if(mem->op != MemoryOperation::READ && mem->op != MemoryOperation::COPY) - throw CompilationError(CompilationStep::NORMALIZER, - "Cannot perform a non-read operation on constant memory", mem->to_string()); - logging::debug() << "Trying to lower access to constant memory into register: " << mem->to_string() - << logging::endl; - auto constantValue = getConstantValue(mem); - auto tmpIt = *it; - if(constantValue) - { - it = memoryInstructions.erase(it); - if(mem->op == MemoryOperation::COPY) - { - // since a copy always involves another memory object, this rewrite is picked up when the other - // object is processed - tmpIt.reset( - new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), constantValue.value())); - logging::debug() << "Replaced memory copy from constant memory to memory write of constant value: " - << tmpIt->to_string() << logging::endl; - } - else - { - tmpIt.reset(new MoveOperation(mem->getOutput().value(), constantValue.value())); - logging::debug() << "Replaced loading of constant memory with constant literal: " - << tmpIt->to_string() << logging::endl; - } - } - else if(mem->op == MemoryOperation::READ && local->is() && toConvertedRegisterType) - { - it = memoryInstructions.erase(it); - auto tmp = method.addNewLocal(toConvertedRegisterType.value(), "%lowered_constant"); + // TODO move this out into own optimization step?! 
+ // XXX if this is re-activated, it will probably need to be rewritten (no more guaranteed mutex), + // also no benefit of grouping VPM access (other than combining the setup instructions), + // also can combine all setups within a basic block which set up the same values (per DMA, generic, stride), except + // VPM read! + // combineVPMAccess(affectedBlocks, method); - assign(tmpIt, tmp) = local->as()->value; - tmpIt = lowerReadWriteOfMemoryToRegister(tmpIt, method, local, tmp, mem); - logging::debug() << "Replaced loading of constant memory with vector rotation of register: " - << tmpIt.copy().previousInBlock()->to_string() << logging::endl; - } - else if(mem->op == MemoryOperation::COPY && local->is() && toConvertedRegisterType) - { - it = memoryInstructions.erase(it); - auto tmp = method.addNewLocal(toConvertedRegisterType.value(), "%lowered_constant"); + method.vpm->dumpUsage(); - assign(tmpIt, tmp) = local->as()->value; - tmpIt = lowerReadWriteOfMemoryToRegister(tmpIt, method, local, tmp, mem); - logging::debug() << "Replaced copying from constant memory with vector rotation and writing of memory: " - << tmpIt.copy().previousInBlock()->to_string() << logging::endl; - } - else - { - // this can happen e.g. for memory copy - logging::debug() << "Failed to lower access to constant memory into register: " << mem->to_string() - << logging::endl; - ++it; - } - } - return memoryInstructions.empty(); - } - else if(type == MemoryType::QPU_REGISTER_READWRITE && local->is()) - { - // need to heed all access to memory area - if(local->type.isSimpleType()) - { - // fits into a single register on its own, without rewriting - const Value loweredRegister = method.addNewLocal(local->type); - for(auto it : memoryInstructions) - { - const MemoryInstruction* mem = it.get(); - if(!mem) - // instruction cannot be already converted here (either all are already converted or none) - throw CompilationError(CompilationStep::NORMALIZER, - "Invalid instruction to be lowered into register", it->to_string()); - logging::debug() << "Trying to lower access to stack allocation into register: " << mem->to_string() - << logging::endl; - bool isRead = mem->getSource().hasLocal() && mem->getSource().local()->getBase(true) == local; - bool isWritten = - mem->getDestination().hasLocal() && mem->getDestination().local()->getBase(true) == local; - switch(mem->op) - { - case MemoryOperation::COPY: - if(isRead) - // since a copy always involves another memory object, this rewrite is picked up when the other - // object is processed - it.reset(new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), loweredRegister)); - else if(isWritten) - it.reset(new MemoryInstruction(MemoryOperation::READ, loweredRegister, mem->getSource())); - break; - case MemoryOperation::FILL: - if(mem->getSource().type.isScalarType()) - { - it = insertReplication(it, mem->getSource(), loweredRegister); - it.erase(); - } - else - it.reset(new MoveOperation(loweredRegister, mem->getSource())); - break; - case MemoryOperation::READ: - it.reset(new MoveOperation(mem->getDestination(), loweredRegister)); - break; - case MemoryOperation::WRITE: - it.reset(new MoveOperation(loweredRegister, mem->getSource())); - break; - } - logging::debug() << "Replaced access to stack allocation '" << local->to_string() - << "' with: " << it->to_string() << logging::endl; - } - // the stack value always fits into a single register (is checked above) and therefore the lowering always - // succeeds - return true; - } - else if(toConvertedRegisterType) - { - 
if(std::any_of(memoryInstructions.begin(), memoryInstructions.end(), [](InstructionWalker it) -> bool { - const MemoryInstruction* mem = it.get(); - return mem->op == MemoryOperation::FILL || mem->op == MemoryOperation::COPY; - })) - { - // not supported, keep all access to use VPM/RAM - logging::debug() - << "Lowering of memory which is filled or copied into registers is not yet implemented: " - << local->to_string() << logging::endl; - return false; - } - Value loweredBuffer = method.addNewLocal(toConvertedRegisterType.value(), "%lowered_stack"); - for(auto it : memoryInstructions) - { - const MemoryInstruction* mem = it.get(); - if(!mem) - // instruction cannot be already converted here(either all are already converted or none) - throw CompilationError(CompilationStep::NORMALIZER, - "Invalid instruction to be lowered into register", it->to_string()); - logging::debug() << "Trying to lower access to stack allocation into register: " << mem->to_string() - << logging::endl; - it = lowerReadWriteOfMemoryToRegister(it, method, local, loweredBuffer, mem); - logging::debug() << "Replaced access to stack allocation '" << local->to_string() - << "' with: " << it.copy().previousInBlock()->to_string() << logging::endl; - } - // all reads and writes (with any index) can be lowered into register, if the type fits - return true; - } - else - throw CompilationError(CompilationStep::NORMALIZER, - "Unhandled case of lowering stack allocation to register", local->to_string()); - } - else - throw CompilationError( - CompilationStep::NORMALIZER, "Unhandled case of lowering to register", local->to_string()); - return false; + // TODO clean up no longer used (all kernels!) globals and stack allocations } /* - * Maps a single memory read to a TMU load - * - * NOTE: Memory locations loaded via TMU MUST NOT be written to by the same kernel (even on a different QPU)! - */ -static bool mapReadsToTMULoad( - Method& method, const Local* local, FastSet& memoryInstructions, bool tmuFlag) +void cacheWorkGroupDMAAccess(const Module& module, Method& method, const Configuration& config) { - auto it = memoryInstructions.begin(); - while(it != memoryInstructions.end()) + auto memoryAccessRanges = determineAccessRanges(method, localsMap); + for(auto& pair : memoryAccessRanges) { - const MemoryInstruction* mem = it->get(); - if(!mem) - // already converted (e.g. when constant load lowered into register) - continue; - logging::debug() << "Trying to map load from read-only memory to TMU load: " << mem->to_string() - << logging::endl; - if(mem->op != MemoryOperation::READ) + bool allUniformPartsEqual; + analysis::IntegerRange offsetRange; + std::tie(allUniformPartsEqual, offsetRange) = checkWorkGroupUniformParts(pair.second); + if(!allUniformPartsEqual) { - ++it; + logging::debug() << "Cannot cache memory location " << pair.first->to_string() + << " in VPM, since the work-group uniform parts of the address calculations differ, which " + "is not yet supported!" + << logging::endl; continue; } - auto tmpIt = periphery::insertReadVectorFromTMU( - method, *it, mem->getDestination(), mem->getSource(), tmuFlag ? periphery::TMU1 : periphery::TMU0); - it = memoryInstructions.erase(it); - tmpIt.erase(); - logging::debug() << "Replaced loading from read-only memory with TMU load: " - << tmpIt.copy().previousInBlock()->to_string() << logging::endl; - } - - return memoryInstructions.empty(); -} - -/* - * Tries to map the given memory location into VPM - * - * This is applicable for private (stack) or local memory. 
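As a rough illustration of this lowering (a simplified model, not the real VPM or VPMArea interface): accesses to a lowered buffer are rewritten to address a reserved VPM area directly, with per-QPU buffers getting one stack frame per QPU and shared buffers a single frame, so no DMA transfer is involved at all.

#include <array>
#include <cstdint>

// Simplified model only: the VPM as a flat byte array with one area reserved per
// lowered object. The real code reserves the area via VPM::addArea and computes
// the in-area offset with insertAddressToStackOffset.
struct VPMAreaModel
{
    std::array<std::uint8_t, 4096> vpm{}; // arbitrary size for this sketch
    std::uint32_t areaStart = 0;          // byte offset of the reserved area
    std::uint32_t frameBytes = 0;         // size of one stack frame, 0 for shared buffers

    std::uint8_t* access(std::uint32_t inAreaByteOffset, std::uint32_t qpuNumber)
    {
        // lowered loads/stores never touch RAM, they read/write the VPM directly
        return vpm.data() + areaStart + frameBytes * qpuNumber + inAreaByteOffset;
    }
};

This is all-or-nothing per object: as the notes below state, either every access to the local can be rewritten this way, or the object stays in RAM.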
- * - * NOTE: A memory location can only be lowered into VPM if all access to it can be lowered to VPM - * NOTE: This is to be preferred over keeping the memory location in RAM - */ -static bool lowerMemoryToVPM(Method& method, const Local* local, MemoryType type, - FastSet& memoryInstructions, FastMap& vpmAreas) -{ - // Need to make sure addressing is still correct! - if(type == MemoryType::VPM_PER_QPU && !local->is()) - throw CompilationError( - CompilationStep::NORMALIZER, "Unhandled case of per-QPU memory buffer", local->to_string()); - - // since the stack allocation is read-write, need to lower all access or none - auto vpmArea = method.vpm->addArea( - local, local->type.getElementType(), type == MemoryType::VPM_PER_QPU, method.metaData.getWorkGroupSize()); - if(vpmArea == nullptr) - // did not fit into VPM - return false; - vpmAreas.emplace(local, vpmArea); - - auto it = memoryInstructions.begin(); - while(it != memoryInstructions.end()) - { - const MemoryInstruction* mem = it->get(); - if(!mem) - // instruction cannot be already converted here (either all are already converted or none) - throw CompilationError( - CompilationStep::NORMALIZER, "Invalid instruction to be lowered into register", (*it)->to_string()); - if(type == MemoryType::VPM_PER_QPU) - logging::debug() << "Trying to lower access to stack allocation into VPM: " << mem->to_string() - << logging::endl; - else - logging::debug() << "Trying to lower access to shared local memory into VPM: " << mem->to_string() - << logging::endl; - Value inAreaOffset = UNDEFINED_VALUE; - auto tmpIt = insertAddressToStackOffset(*it, method, inAreaOffset, local, type, mem); - switch(mem->op) - { - case MemoryOperation::COPY: + if((offsetRange.maxValue - offsetRange.minValue) >= config.availableVPMSize || + (offsetRange.maxValue < offsetRange.minValue)) { // if the other local is already mapped to VPM, insert copy instruction. Otherwise let other local handle // this @@ -1395,410 +745,53 @@ static bool lowerMemoryToVPM(Method& method, const Local* local, MemoryType type CompilationStep::NORMALIZER, "Copying from/to VPM is not yet implemented", mem->to_string()); } ++it; + // this also checks for any over/underflow when converting the range to unsigned int in the next steps + logging::debug() << "Cannot cache memory location " << pair.first->to_string() + << " in VPM, the accessed range is too big: [" << offsetRange.minValue << ", " + << offsetRange.maxValue << "]" << logging::endl; continue; } - case MemoryOperation::FILL: - throw CompilationError( - CompilationStep::NORMALIZER, "Filling VPM area is not yet implemented", mem->to_string()); - case MemoryOperation::READ: - it = memoryInstructions.erase(it); - tmpIt = method.vpm->insertReadVPM(method, tmpIt, mem->getDestination(), vpmArea, true, inAreaOffset); - tmpIt.erase(); - break; - case MemoryOperation::WRITE: - it = memoryInstructions.erase(it); - tmpIt = method.vpm->insertWriteVPM(method, tmpIt, mem->getSource(), vpmArea, true, inAreaOffset); - tmpIt.erase(); - break; - } - logging::debug() << "Replaced access to memory buffer with access to VPM" << logging::endl; - } + logging::debug() << "Memory location " << pair.first->to_string() + << " is accessed via DMA in the dynamic range [" << offsetRange.minValue << ", " + << offsetRange.maxValue << "]" << logging::endl; - // even if we did not map all accesses, we fixed the local to the VPM area - // for e.g. 
copy, the other local also has a reference to this MemoryInstruction and will handle it - return memoryInstructions.empty(); -} + auto accessedType = pair.first->type.toArrayType(static_cast( + offsetRange.maxValue - offsetRange.minValue + 1 / * bounds of range are inclusive! * /)); -struct MemoryAccess -{ - FastSet accessInstructions; - MemoryType preferred; - MemoryType fallback; -}; - -// Finds the next instruction writing the given value into memory -static InstructionWalker findNextValueStore( - InstructionWalker it, const Value& src, std::size_t limit, const Local* sourceLocation) -{ - while(!it.isEndOfBlock() && limit > 0) - { - auto memInstr = it.get(); - if(memInstr != nullptr && memInstr->op == MemoryOperation::WRITE && memInstr->getSource() == src) - { - return it; - } - if(memInstr != nullptr && memInstr->getDestination().local()->getBase(true) == sourceLocation) + // TODO the local is not correct, at least not if there is a work-group uniform offset + auto vpmArea = method.vpm->addArea(pair.first, accessedType, false); + if(vpmArea == nullptr) { - // there is some other instruction writing into the memory we read, it could have been changed -> abort - // TODO can we be more precise and abort only if the same index is written?? How to determine?? - return it.getBasicBlock()->walkEnd(); - } - if(it.has() || it.has() || it.has() || it.has()) - break; - it.nextInBlock(); - --limit; - } - return it.getBasicBlock()->walkEnd(); -} - -/* - * Basic algorithm to determine the preferred and fall-back (e.g. if access-types not supported by preferred) - * way of - * a) mapping the memory regions used by this method to the available "memory" (registers, VPM, RAM) and - * b) mapping the memory access types (read, write, copy, fill) to the available memory access types (TMU, VPM, etc.) - */ -static FastMap determineMemoryAccess(Method& method) -{ - // TODO lower local/private struct-elements into VPM?! At least for single structs - logging::debug() << "Determining memory access for kernel: " << method.name << logging::endl; - FastMap mapping; - for(const auto& param : method.parameters) - { - if(!param.type.isPointerType()) + logging::debug() << "Memory location " << pair.first->to_string() << " with dynamic access range [" + << offsetRange.minValue << ", " << offsetRange.maxValue + << "] cannot be cached in VPM, since it does not fit" << logging::endl; continue; - const auto* pointerType = param.type.getPointerType().value(); - if(pointerType->addressSpace == AddressSpace::CONSTANT) - { - logging::debug() << "Constant parameter '" << param.to_string() << "' will be read from RAM via TMU" - << logging::endl; - mapping[¶m].preferred = MemoryType::RAM_LOAD_TMU; - // fall-back, e.g. for memory copy - mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - else if(pointerType->addressSpace == AddressSpace::GLOBAL) - { - if(isMemoryOnlyRead(¶m)) - { - logging::debug() << "Global parameter '" << param.to_string() - << "' without any write access will be read from RAM via TMU" << logging::endl; - mapping[¶m].preferred = MemoryType::RAM_LOAD_TMU; - // fall-back, e.g. 
for memory copy - mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - else - { - logging::debug() << "Global parameter '" << param.to_string() - << "' which is written to will be stored in RAM and accessed via VPM" << logging::endl; - mapping[¶m].preferred = MemoryType::RAM_READ_WRITE_VPM; - mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - } - else if(pointerType->addressSpace == AddressSpace::LOCAL) - { - // TODO if last access index is known and fits into VPM, set for VPM-or-RAM - logging::debug() << "Local parameter '" << param.to_string() - << "' will be stored in RAM and accessed via VPM" << logging::endl; - mapping[¶m].preferred = MemoryType::RAM_READ_WRITE_VPM; - mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; } - else - throw CompilationError( - CompilationStep::NORMALIZER, "Invalid address space for pointer parameter", param.to_string(true)); - } - InstructionWalker it = method.walkAllInstructions(); - while(!it.isEndOfMethod()) - { - if(it.has()) - { - // convert read-then-write to copy - auto memInstr = it.get(); - if(memInstr->op == MemoryOperation::READ && !memInstr->hasConditionalExecution() && - memInstr->getDestination().local()->getUsers(LocalUse::Type::READER).size() == 1) - { - auto nextIt = findNextValueStore( - it, memInstr->getDestination(), 16 /* TODO */, memInstr->getSource().local()->getBase(true)); - auto nextMemInstr = nextIt.isEndOfBlock() ? nullptr : nextIt.get(); - if(nextMemInstr != nullptr && !nextIt->hasConditionalExecution() && - nextMemInstr->op == MemoryOperation::WRITE && - nextMemInstr->getSource().getSingleWriter() == memInstr && - nextMemInstr->getSourceElementType().getPhysicalWidth() == - memInstr->getDestinationElementType().getPhysicalWidth()) - { - // TODO also extend so value read, not modified and stored (if used otherwise) is replaced with load - // (for other uses) and copy -> supports other type sizes - logging::debug() - << "Found reading of memory where the sole usage writes the value back into memory: " - << memInstr->to_string() << logging::endl; - logging::debug() << "Replacing manual copy of memory with memory copy instruction for write: " - << nextMemInstr->to_string() << logging::endl; - - const Value src = memInstr->getSource(); - it.erase(); - nextIt.reset(new MemoryInstruction( - MemoryOperation::COPY, nextMemInstr->getDestination(), src, nextMemInstr->getNumEntries())); - // continue with the next instruction after the read in the next iteration - continue; - } - } - for(const auto local : memInstr->getMemoryAreas()) - { - if(mapping.find(local) != mapping.end()) - { - // local was already processed - mapping[local].accessInstructions.emplace(it); - continue; - } - mapping[local].accessInstructions.emplace(it); - if(local->is()) - { - if(local->type.isSimpleType() || convertSmallArrayToRegister(local)) - { - logging::debug() << "Small stack value '" << local->to_string() - << "' will be stored in a register" << logging::endl; - mapping[local].preferred = MemoryType::QPU_REGISTER_READWRITE; - // we cannot pack an array into a VPM cache line, since always all 16 elements are read/written - // and we would overwrite the other elements - mapping[local].fallback = - local->type.isSimpleType() ? 
MemoryType::VPM_PER_QPU : MemoryType::RAM_READ_WRITE_VPM; - } - else if(!local->type.getElementType().getStructType()) - { - logging::debug() << "Stack value '" << local->to_string() - << "' will be stored in VPM per QPU (with fall-back to RAM via VPM)" - << logging::endl; - mapping[local].preferred = MemoryType::VPM_PER_QPU; - mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - else - { - logging::debug() << "Struct stack value '" << local->to_string() - << "' will be stored in RAM per QPU (via VPM)" << logging::endl; - mapping[local].preferred = MemoryType::RAM_READ_WRITE_VPM; - mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - } - else if(local->is()) - { - if(isMemoryOnlyRead(local)) - { - // global buffer - if(getConstantValue(memInstr)) - { - logging::debug() << "Constant element of constant buffer '" << local->to_string() - << "' will be stored in a register " << logging::endl; - mapping[local].preferred = MemoryType::QPU_REGISTER_READONLY; - mapping[local].fallback = MemoryType::RAM_LOAD_TMU; - } - else if(convertSmallArrayToRegister(local)) - { - logging::debug() << "Small constant buffer '" << local->to_string() - << "' will be stored in a register" << logging::endl; - mapping[local].preferred = MemoryType::QPU_REGISTER_READONLY; - mapping[local].fallback = MemoryType::RAM_LOAD_TMU; - } - else - { - logging::debug() << "Constant buffer '" << local->to_string() - << "' will be read from RAM via TMU" << logging::endl; - mapping[local].preferred = MemoryType::RAM_LOAD_TMU; - // fall-back, e.g. for memory copy - mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - } - else if(!local->type.getElementType().getStructType()) - { - // local buffer - logging::debug() << "Local buffer '" << local->to_string() - << "' will be stored in VPM (with fall-back to RAM via VPM)" << logging::endl; - mapping[local].preferred = MemoryType::VPM_SHARED_ACCESS; - mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - else - { - // local buffer - logging::debug() << "Local struct '" << local->to_string() << "' will be stored in RAM via VPM" - << logging::endl; - mapping[local].preferred = MemoryType::RAM_READ_WRITE_VPM; - mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; - } - } - else - // parameters MUST be handled before and there is no other type of memory objects - throw CompilationError( - CompilationStep::NORMALIZER, "Invalid local type for memory area", local->to_string(true)); - } - } - it.nextInMethod(); - } + // TODO insert load memory area into VPM at start of kernel (after all the required offsets/indices are + // calculated) + // TODO calculate address from base address and work-group uniform parts + // TODO insert store VPM into memory area at end of kernel + // TODO rewrite memory accesses to only access the correct VPM area - return mapping; -} + for(auto& entry : pair.second) + rewriteIndexCalculation(method, entry); -void normalization::mapMemoryAccess(const Module& module, Method& method, const Configuration& config) -{ - /* - * Matrix of memory types and storage locations: - * - * | global | local | private | constant - * buffer | - |VPM/GD | QPU/VPM | QPU/GD - * parameter | RAM |RAM/(*)| - | RAM - * - * buffer is both inside and outside of function scope (where allowed) - * - : is not allowed by OpenCL - * (*) could lower into VPM if the highest index accessed is known and fits? 
- * GD: global data segment of kernel buffer - * RAM: load via TMU if possible (not written to), otherwise use VPM - * - * Sources: - * https://stackoverflow.com/questions/22471466/why-program-global-scope-variables-must-be-constant#22474119 - * https://stackoverflow.com/questions/17431941/how-to-use-arrays-in-program-global-scope-in-opencl - */ - /* - * 1. lower constant/private buffers into register - * lower global constant buffers into registers - * lower small enough private buffers to registers - * 2. generate TMU loads for read-only memory - * keep all read-only parameters in RAM, load via TMU - * also load constants via TMU, which could not be lowered into register - * 3. lower per-QPU (private) buffers into VPM - * 4. lower shared buffers (local) into VPM - * 5. generate remaining instructions for RAM access via VPM - * TODO: - * 3.1 for memory located in RAM, try to group/queue reads/writes - * 3.2 also try to use VPM as cache (e.g. only write back into memory when VPM cache area full, prefetch into VPM) - * 4. final pass which actually converts VPM cache - */ - auto memoryMapping = determineMemoryAccess(method); - // stores already assigned VPM areas, e.g. for memory-copy operations - FastMap vpmMappedLocals; + // TODO now, combine access to memory with VPM access + // need to make sure, only 1 kernel accesses RAM/writes the configuration, how? + // -> need some lightweight synchronization (e.g. status value in VPM?? One kernel would need to + // poll!!) + // TODO if minValue != 0, need then to deduct it from the group-uniform address too! + // use base pointer as memory pointer (for read/write-back) and offset as VPM offset. maximum + // offset is the number of elements to copy/cache - // 1. lower into registers - auto mappingIt = memoryMapping.begin(); - while(mappingIt != memoryMapping.end()) - { - if(mappingIt->second.preferred == MemoryType::QPU_REGISTER_READONLY || - mappingIt->second.preferred == MemoryType::QPU_REGISTER_READWRITE) - { - if(lowerMemoryToRegister( - method, mappingIt->first, mappingIt->second.preferred, mappingIt->second.accessInstructions)) - mappingIt = memoryMapping.erase(mappingIt); - else if(mappingIt->second.fallback == MemoryType::QPU_REGISTER_READONLY || - mappingIt->second.fallback == MemoryType::QPU_REGISTER_READWRITE) - throw CompilationError( - CompilationStep::NORMALIZER, "Failed to lower memory to register", mappingIt->first->to_string()); - else - { - // could not lower to register, fall back to fall-back and try again - mappingIt->second.preferred = mappingIt->second.fallback; - ++mappingIt; - } - } - else - ++mappingIt; - } - // 2. load read-only parameter via TMU - mappingIt = memoryMapping.begin(); - // the flag as to which TMU to use - // TODO for better performance, this should alternate in according to the order of usage (first read use TMU0, - // second read use TMU1, ...) 
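A minimal sketch of the alternation suggested in the TODO above, assuming a simple round-robin over the order in which TMU loads are generated (the code below instead toggles a single flag per memory object):

#include <cstdint>

// Hypothetical round-robin TMU selection: the N-th generated TMU load uses TMU0
// for even N and TMU1 for odd N, so both TMUs are kept roughly equally busy.
struct TMUSelector
{
    std::uint32_t numLoads = 0;

    // false selects TMU0, true selects TMU1 (same convention as the flag below)
    bool nextUseTMU1()
    {
        return (numLoads++ % 2) == 1;
    }
};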
- bool tmuFlag = false; - // The insertion of the TMU_NOSWAP configuration is inserted in #addStartStopSegment to the start of the kernel - while(mappingIt != memoryMapping.end()) - { - if(mappingIt->second.preferred == MemoryType::RAM_LOAD_TMU) - { - if(mapReadsToTMULoad(method, mappingIt->first, mappingIt->second.accessInstructions, tmuFlag)) - { - tmuFlag = !tmuFlag; - if(mappingIt->first->is()) - { - const_cast(mappingIt->first->as())->decorations = - add_flag(mappingIt->first->as()->decorations, ParameterDecorations::INPUT); - } - mappingIt = memoryMapping.erase(mappingIt); - } - else if(mappingIt->second.fallback == MemoryType::RAM_LOAD_TMU) - throw CompilationError( - CompilationStep::NORMALIZER, "Failed to generate TMU load", mappingIt->first->to_string()); - else - { - // could not load via TMU (e.g. copy), retry with fall-back - mappingIt->second.preferred = mappingIt->second.fallback; - ++mappingIt; - } - } - else - ++mappingIt; - } - // 3. lower private memory into VPM - mappingIt = memoryMapping.begin(); - while(mappingIt != memoryMapping.end()) - { - if(mappingIt->second.preferred == MemoryType::VPM_PER_QPU) - { - // TODO could optimize by preferring the private buffer accessed more often to be in VPM - if(lowerMemoryToVPM(method, mappingIt->first, MemoryType::VPM_PER_QPU, mappingIt->second.accessInstructions, - vpmMappedLocals)) - mappingIt = memoryMapping.erase(mappingIt); - else - { - // could not lower to VPM, fall back to fall-back and try again - mappingIt->second.preferred = mappingIt->second.fallback; - ++mappingIt; - } - } - else - ++mappingIt; - } - // 4. lower local memory into VPM - mappingIt = memoryMapping.begin(); - while(mappingIt != memoryMapping.end()) - { - if(mappingIt->second.preferred == MemoryType::VPM_SHARED_ACCESS) - { - // TODO could optimize by preferring the local buffer accessed more often to be in VPM - if(lowerMemoryToVPM(method, mappingIt->first, MemoryType::VPM_SHARED_ACCESS, - mappingIt->second.accessInstructions, vpmMappedLocals)) - mappingIt = memoryMapping.erase(mappingIt); - else - { - // could not lower to VPM, fall back to fall-back and try again - mappingIt->second.preferred = mappingIt->second.fallback; - ++mappingIt; - } - } - else - ++mappingIt; - } - // 5. map remaining instructions to access RAM via VPM - // list of basic blocks where multiple VPM accesses could be combined - FastSet affectedBlocks; - mappingIt = memoryMapping.begin(); - while(mappingIt != memoryMapping.end()) - { - if(mappingIt->second.preferred == MemoryType::RAM_READ_WRITE_VPM) - { - if(mapReadWriteToMemoryViaVPM( - method, mappingIt->first, mappingIt->second.accessInstructions, vpmMappedLocals, affectedBlocks)) - mappingIt = memoryMapping.erase(mappingIt); - else - ++mappingIt; - } - else - ++mappingIt; - } + // TODO insert initial read from DMA, final write to DMA + // even for writes, need to read, since memory in between might be untouched? - if(!memoryMapping.empty()) - { - for(const auto& map : memoryMapping) - logging::error() << "Unhandled memory access type: " << map.first->to_string() << logging::endl; - throw CompilationError(CompilationStep::NORMALIZER, "Unhandled memory access types!"); + // TODO if it can be proven that all values in the range are guaranteed to be written (and not read before), + // we can skip the initial loading. This guarantee needs to hold across all work-items in a group! } - combineVPMAccess(affectedBlocks, method); - - // TODO move calculation of stack/global indices in here too? 
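On the commented-out cacheWorkGroupDMAAccess draft in this hunk: the idea is to split every address into a work-group-uniform part and a dynamic part (the groupUniformAddressParts/dynamicAddressParts of MemoryAccessRange) and to cache only the dynamic range in VPM. A hedged sketch of the range bookkeeping, with made-up names:

#include <cstdint>

// Illustrative only: the dynamic (work-item specific) offsets of one memory object,
// in elements as in MemoryAccessRange::offsetRange. In the get_global_id(0) example
// the uniform part corresponds to get_group_id(0) and the dynamic part to get_local_id(0).
struct CachedRange
{
    std::int64_t minOffset = 0; // smallest dynamic offset seen for this object
    std::int64_t maxOffset = 0; // largest dynamic offset seen for this object

    // cacheable only if the accessed range fits into the available VPM space
    bool fitsIntoVPM(std::int64_t availableVPMSize) const
    {
        return maxOffset >= minOffset && (maxOffset - minOffset) < availableVPMSize;
    }

    // offset inside the VPM area: shift so the smallest accessed offset maps to 0
    // (the draft's TODO about deducting minValue from the group-uniform address)
    std::int64_t inAreaOffset(std::int64_t dynamicOffset) const
    {
        return dynamicOffset - minOffset;
    }
};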
- - // TODO clean up no longer used (all kernels!) globals and stack allocations + // XXX } +*/ diff --git a/src/normalization/MemoryMapChecks.cpp b/src/normalization/MemoryMapChecks.cpp new file mode 100644 index 00000000..428c1f3d --- /dev/null +++ b/src/normalization/MemoryMapChecks.cpp @@ -0,0 +1,797 @@ +/* + * Author: doe300 + * + * See the file "LICENSE" for the full license governing this code. + */ + +#include "MemoryMappings.h" + +#include "../Profiler.h" +#include "../intermediate/IntermediateInstruction.h" +#include "log.h" + +using namespace vc4c; +using namespace vc4c::normalization; +using namespace vc4c::intermediate; + +using MappingCheck = MemoryInfo (*)(Method& method, const Local*, MemoryAccess&); + +static MemoryInfo canLowerToRegisterReadOnly(Method& method, const Local* baseAddr, MemoryAccess& access); +static MemoryInfo canLowerToRegisterReadWrite(Method& method, const Local* baseAddr, MemoryAccess& access); +static MemoryInfo canLowerToPrivateVPMArea(Method& method, const Local* baseAddr, MemoryAccess& access); +static MemoryInfo canLowerToSharedVPMArea(Method& method, const Local* baseAddr, MemoryAccess& access); +static MemoryInfo canMapToTMUReadOnly(Method& method, const Local* baseAddr, MemoryAccess& access); +static MemoryInfo canMapToDMAReadWrite(Method& method, const Local* baseAddr, MemoryAccess& access); + +static constexpr MappingCheck CHECKS[6] = { + canLowerToRegisterReadOnly, /* QPU_REGISTER_READONLY */ + canLowerToRegisterReadWrite, /* QPU_REGISTER_READWRITE */ + canLowerToPrivateVPMArea, /* VPM_PER_QPU */ + canLowerToSharedVPMArea, /* VPM_SHARED_ACCESS */ + canMapToTMUReadOnly, /* RAM_LOAD_TMU */ + canMapToDMAReadWrite /* RAM_READ_WRITE_VPM */ +}; + +MemoryInfo normalization::checkMemoryMapping(Method& method, const Local* baseAddr, MemoryAccess& access) +{ + return CHECKS[static_cast(access.preferred)](method, baseAddr, access); +} + +Optional normalization::getConstantValue(const Value& source) +{ + // can only read from constant global data, so the global is always the source + const Global* global = source.local()->getBase(true)->as(); + if(source.local()->reference.second >= 0 && global->value.hasContainer()) + // fixed index + return global->value.container().elements.at(source.local()->reference.second); + else if(global->value.isLiteralValue()) + // scalar value + return global->value; + else if(global->value.isZeroInitializer()) + // all entries are the same + return Value::createZeroInitializer(global->value.type.getElementType()); + else if(global->value.isUndefined()) + // all entries are undefined + return Value(global->value.type.getElementType()); + else if(global->value.hasContainer() && global->value.container().isElementNumber()) + return ELEMENT_NUMBER_REGISTER; + else if(global->value.hasContainer() && global->value.container().isAllSame()) + // all entries are the same + return global->value.container().elements.at(0); + return NO_VALUE; +} + +/* + * Tries to convert the array-type pointed to by the given local to a vector-type to fit into a single register. + * + * For this conversion to succeed, the array-element type must be a scalar of bit-width <= 32-bit and the size of the + * array known to be less or equals to 16. 
+ */ +static Optional convertSmallArrayToRegister(const Local* local) +{ + const Local* base = local->getBase(true); + if(base->type.getPointerType()) + { + const auto& baseType = base->type.getPointerType().value()->elementType; + auto arrayType = baseType.getArrayType(); + if(arrayType && arrayType.value()->size <= NATIVE_VECTOR_SIZE && arrayType.value()->elementType.isScalarType()) + return arrayType.value()->elementType.toVectorType(static_cast(arrayType.value()->size)); + } + return {}; +} + +static bool isMemoryOnlyRead(const Local* local) +{ + auto base = local->getBase(true); + if(base->is() && has_flag(base->as()->decorations, ParameterDecorations::READ_ONLY)) + return true; + + if(base->is() && base->as()->isConstant) + return true; + + if(base->type.getPointerType() && base->type.getPointerType().value()->addressSpace == AddressSpace::CONSTANT) + return true; + + // TODO also check for no actual writes. Need to heed index-calculation from base! + return false; +} + +// Finds the next instruction writing the given value into memory +static InstructionWalker findNextValueStore(InstructionWalker it, const Value& src, std::size_t limit) +{ + while(!it.isEndOfBlock() && limit > 0) + { + auto memInstr = it.get(); + if(memInstr != nullptr && memInstr->op == MemoryOperation::WRITE && memInstr->getSource() == src) + { + return it; + } + if(it.has() || it.has() || it.has() || it.has()) + break; + it.nextInBlock(); + --limit; + } + return it.getBasicBlock()->walkEnd(); +} + +std::pair> normalization::determineMemoryAccess(Method& method) +{ + // TODO lower local/private struct-elements into VPM?! At least for single structs + logging::debug() << "Determining memory access for kernel: " << method.name << logging::endl; + MemoryAccessMap mapping; + FastSet allWalkers; + for(const auto& param : method.parameters) + { + if(!param.type.isPointerType()) + continue; + const auto* pointerType = param.type.getPointerType().value(); + if(pointerType->addressSpace == AddressSpace::CONSTANT) + { + logging::debug() << "Constant parameter '" << param.to_string() << "' will be read from RAM via TMU" + << logging::endl; + mapping[¶m].preferred = MemoryType::RAM_LOAD_TMU; + // fall-back, e.g. for memory copy + mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + else if(pointerType->addressSpace == AddressSpace::GLOBAL) + { + if(isMemoryOnlyRead(¶m)) + { + logging::debug() << "Global parameter '" << param.to_string() + << "' without any write access will be read from RAM via TMU" << logging::endl; + mapping[¶m].preferred = MemoryType::RAM_LOAD_TMU; + // fall-back, e.g. 
for memory copy + mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + else + { + logging::debug() << "Global parameter '" << param.to_string() + << "' which is written to will be stored in RAM and accessed via VPM" << logging::endl; + mapping[¶m].preferred = MemoryType::RAM_READ_WRITE_VPM; + mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + } + else if(pointerType->addressSpace == AddressSpace::LOCAL) + { + logging::debug() << "Local parameter '" << param.to_string() + << "' will be stored in RAM and accessed via VPM" << logging::endl; + mapping[¶m].preferred = MemoryType::RAM_READ_WRITE_VPM; + mapping[¶m].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + else + throw CompilationError( + CompilationStep::NORMALIZER, "Invalid address space for pointer parameter", param.to_string(true)); + } + + InstructionWalker it = method.walkAllInstructions(); + while(!it.isEndOfMethod()) + { + if(it.has()) + { + // convert read-then-write to copy + auto memInstr = it.get(); + if(memInstr->op == MemoryOperation::READ && !memInstr->hasConditionalExecution() && + memInstr->getDestination().local()->getUsers(LocalUse::Type::READER).size() == 1) + { + auto nextIt = findNextValueStore(it, memInstr->getDestination(), 16 /* TODO */); + auto nextMemInstr = nextIt.isEndOfBlock() ? nullptr : nextIt.get(); + if(nextMemInstr != nullptr && !nextIt->hasConditionalExecution() && + nextMemInstr->op == MemoryOperation::WRITE && + nextMemInstr->getSource().getSingleWriter() == memInstr && + nextMemInstr->getSourceElementType().getPhysicalWidth() == + memInstr->getDestinationElementType().getPhysicalWidth()) + { + // TODO also extend so value read, not modified and stored (if used otherwise) is replaced with load + // (for other uses) and copy -> supports other type sizes + logging::debug() + << "Found reading of memory where the sole usage writes the value back into memory: " + << memInstr->to_string() << logging::endl; + logging::debug() << "Replacing manual copy of memory with memory copy instruction for write: " + << nextMemInstr->to_string() << logging::endl; + + const Value src = memInstr->getSource(); + it.erase(); + nextIt.reset(new MemoryInstruction( + MemoryOperation::COPY, nextMemInstr->getDestination(), src, nextMemInstr->getNumEntries())); + // continue with the next instruction after the read in the next iteration + continue; + } + } + for(const auto local : memInstr->getMemoryAreas()) + { + if(mapping.find(local) != mapping.end()) + { + // local was already processed + mapping[local].accessInstructions.emplace(it); + continue; + } + mapping[local].accessInstructions.emplace(it); + if(local->is()) + { + if(local->type.isSimpleType() || convertSmallArrayToRegister(local)) + { + logging::debug() << "Small stack value '" << local->to_string() + << "' will be stored in a register" << logging::endl; + mapping[local].preferred = MemoryType::QPU_REGISTER_READWRITE; + // we cannot pack an array into a VPM cache line, since always all 16 elements are read/written + // and we would overwrite the other elements + mapping[local].fallback = + local->type.isSimpleType() ? 
MemoryType::VPM_PER_QPU : MemoryType::RAM_READ_WRITE_VPM; + } + else if(!local->type.getElementType().getStructType()) + { + logging::debug() << "Stack value '" << local->to_string() + << "' will be stored in VPM per QPU (with fall-back to RAM via VPM)" + << logging::endl; + mapping[local].preferred = MemoryType::VPM_PER_QPU; + mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + else + { + logging::debug() << "Struct stack value '" << local->to_string() + << "' will be stored in RAM per QPU (via VPM)" << logging::endl; + mapping[local].preferred = MemoryType::RAM_READ_WRITE_VPM; + mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + } + else if(local->is()) + { + if(isMemoryOnlyRead(local)) + { + // global buffer + if(getConstantValue(memInstr->getSource())) + { + logging::debug() << "Constant element of constant buffer '" << local->to_string() + << "' will be stored in a register " << logging::endl; + mapping[local].preferred = MemoryType::QPU_REGISTER_READONLY; + mapping[local].fallback = MemoryType::RAM_LOAD_TMU; + } + else if(convertSmallArrayToRegister(local)) + { + logging::debug() << "Small constant buffer '" << local->to_string() + << "' will be stored in a register" << logging::endl; + mapping[local].preferred = MemoryType::QPU_REGISTER_READONLY; + mapping[local].fallback = MemoryType::RAM_LOAD_TMU; + } + else + { + logging::debug() << "Constant buffer '" << local->to_string() + << "' will be read from RAM via TMU" << logging::endl; + mapping[local].preferred = MemoryType::RAM_LOAD_TMU; + // fall-back, e.g. for memory copy + mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + } + else if(!local->type.getElementType().getStructType()) + { + // local buffer + logging::debug() << "Local buffer '" << local->to_string() + << "' will be stored in VPM (with fall-back to RAM via VPM)" << logging::endl; + mapping[local].preferred = MemoryType::VPM_SHARED_ACCESS; + mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + else + { + // local buffer + logging::debug() << "Local struct '" << local->to_string() << "' will be stored in RAM via VPM" + << logging::endl; + mapping[local].preferred = MemoryType::RAM_READ_WRITE_VPM; + mapping[local].fallback = MemoryType::RAM_READ_WRITE_VPM; + } + } + else + // parameters MUST be handled before and there is no other type of memory objects + throw CompilationError( + CompilationStep::NORMALIZER, "Invalid local type for memory area", local->to_string(true)); + } + if(it.has()) + allWalkers.emplace(it); + } + it.nextInMethod(); + } + + return std::make_pair(std::move(mapping), std::move(allWalkers)); +} + +static MemoryInfo canLowerToRegisterReadOnly(Method& method, const Local* baseAddr, MemoryAccess& access) +{ + // a) the global is a constant scalar/vector which fits into a single register + auto constant = getConstantValue(baseAddr->createReference()); + if(constant) + { + return MemoryInfo{baseAddr, MemoryType::QPU_REGISTER_READONLY, nullptr, {}, constant}; + } + // b) the global in a constant array small enough to be rewritten to fit into a single register (e.g. 
int[8]) + auto convertedType = convertSmallArrayToRegister(baseAddr); + if(convertedType) + { + // convert int[8] to int8 + Value convertedValue(baseAddr->as()->value); + convertedValue.type = *convertedType; + return MemoryInfo{baseAddr, MemoryType::QPU_REGISTER_READONLY, nullptr, {}, convertedValue, convertedType}; + } + // c) the global is a constant where all accesses have constant indices and therefore all accessed elements can be + // determined at compile time + if(std::all_of( + access.accessInstructions.begin(), access.accessInstructions.end(), [&](InstructionWalker it) -> bool { + return getConstantValue(it.get()->getSource()).has_value(); + })) + return MemoryInfo{baseAddr, MemoryType::QPU_REGISTER_READONLY}; + + // cannot lower to constant register, use fall-back + access.preferred = access.fallback; + return checkMemoryMapping(method, baseAddr, access); +} + +static MemoryInfo canLowerToRegisterReadWrite(Method& method, const Local* baseAddr, MemoryAccess& access) +{ + // a) the private memory fits into a single register + if(baseAddr->type.isScalarType()) + return MemoryInfo{baseAddr, MemoryType::QPU_REGISTER_READWRITE, nullptr, {}, + method.addNewLocal(baseAddr->type, "%lowered_stack")}; + // b) the private memory is small enough to be rewritten to fit into a single register (e.g. int[4]) + auto convertedType = convertSmallArrayToRegister(baseAddr); + if(convertedType) + return MemoryInfo{baseAddr, MemoryType::QPU_REGISTER_READWRITE, nullptr, {}, + method.addNewLocal(*convertedType, "%lowered_stack"), convertedType}; + + // cannot lower to register, use fall-back + access.preferred = access.fallback; + return checkMemoryMapping(method, baseAddr, access); +} + +static MemoryInfo canLowerToPrivateVPMArea(Method& method, const Local* baseAddr, MemoryAccess& access) +{ + auto area = + method.vpm->addArea(baseAddr, baseAddr->type.getElementType(), true, method.metaData.getWorkGroupSize()); + if(area) + return MemoryInfo{baseAddr, MemoryType::VPM_PER_QPU, area, {}, NO_VALUE, convertSmallArrayToRegister(baseAddr)}; + + // cannot lower to register, use fall-back + access.preferred = access.fallback; + return checkMemoryMapping(method, baseAddr, access); +} + +static MemoryInfo canLowerToSharedVPMArea(Method& method, const Local* baseAddr, MemoryAccess& access) +{ + auto area = method.vpm->addArea(baseAddr, baseAddr->type.getElementType(), false); + if(area) + return MemoryInfo{ + baseAddr, MemoryType::VPM_SHARED_ACCESS, area, {}, NO_VALUE, convertSmallArrayToRegister(baseAddr)}; + + // cannot lower to register, use fall-back + access.preferred = access.fallback; + return checkMemoryMapping(method, baseAddr, access); +} + +static MemoryInfo canMapToTMUReadOnly(Method& method, const Local* baseAddr, MemoryAccess& access) +{ + // TODO for better performance, the TMU flag should alternate in according to the order of usage (first read use + // TMU0, second read use TMU1, ...) 
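
(Editor's note, not part of the patch.) The thread-local flag toggled just below alternates consecutive read-only memory areas between the two texture memory units, so TMU0 and TMU1 each end up serving roughly half of the mapped areas. A tiny standalone illustration of that round-robin assignment, separate from this function:

    #include <array>

    // mimics the effect of the tmuFlag toggle: 0 selects TMU0, 1 selects TMU1
    unsigned assignTMU()
    {
        static thread_local bool useTMU1 = true;
        useTMU1 = !useTMU1;
        return useTMU1 ? 1u : 0u;
    }

    // e.g. three read-only buffers mapped in sequence get TMU0, TMU1, TMU0
    std::array<unsigned, 3> exampleAssignment()
    {
        return {assignTMU(), assignTMU(), assignTMU()};
    }
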
+    static thread_local bool tmuFlag = true;
+    tmuFlag = !tmuFlag;
+    return MemoryInfo{baseAddr, MemoryType::RAM_LOAD_TMU, nullptr, {}, {}, {}, tmuFlag};
+}
+
+static FastAccessList<MemoryAccessRange> determineAccessRanges(
+    Method& method, const Local* baseAddr, MemoryAccess& access);
+static const periphery::VPMArea* checkCacheMemoryAccessRanges(
+    Method& method, const Local* baseAddr, FastAccessList<MemoryAccessRange>& accessRanges);
+
+static MemoryInfo canMapToDMAReadWrite(Method& method, const Local* baseAddr, MemoryAccess& access)
+{
+    PROFILE_START(DetermineAccessRanges);
+    auto ranges = determineAccessRanges(method, baseAddr, access);
+    PROFILE_END(DetermineAccessRanges);
+
+    if(!ranges.empty())
+    {
+        auto area = checkCacheMemoryAccessRanges(method, baseAddr, ranges);
+        if(area)
+            // TODO also need to mark for initial load/write-back
+            return MemoryInfo{baseAddr, MemoryType::VPM_SHARED_ACCESS, area, std::move(ranges)};
+    }
+    return MemoryInfo{baseAddr, MemoryType::RAM_READ_WRITE_VPM};
+}
+
+static bool isGroupUniform(const Local* local)
+{
+    auto writers = local->getUsers(LocalUse::Type::WRITER);
+    return std::all_of(writers.begin(), writers.end(), [](const LocalUser* instr) -> bool {
+        return instr->hasDecoration(InstructionDecorations::WORK_GROUP_UNIFORM_VALUE);
+    });
+}
+
+static bool isWorkGroupUniform(const Value& val)
+{
+    return val.hasImmediate() || val.hasLiteral() ||
+        (val.hasLocal() && isGroupUniform(val.local()))
+        // XXX this is not true for the local ID UNIFORM
+        || (val.hasRegister(REG_UNIFORM));
+}
+
+static FastMap<Value, InstructionDecorations> findDirectLevelAdditionInputs(const Value& val)
+{
+    FastMap<Value, InstructionDecorations> result;
+    auto writer = val.getSingleWriter();
+    if(writer == nullptr || writer->hasDecoration(InstructionDecorations::WORK_GROUP_UNIFORM_VALUE))
+    {
+        // there is no need to split up work-group uniform values in any more detail
+        auto deco = writer ? writer->decoration : InstructionDecorations::NONE;
+        result.emplace(val,
+            add_flag(deco,
+                val.hasImmediate() || val.hasLiteral() ? InstructionDecorations::WORK_GROUP_UNIFORM_VALUE :
+                                                         InstructionDecorations::NONE));
+        if(val.hasImmediate() && val.immediate().getIntegerValue() >= 0)
+            result[val] = add_flag(result[val], InstructionDecorations::UNSIGNED_RESULT);
+        else if(val.hasLiteral() && val.literal().signedInt() >= 0)
+            result[val] = add_flag(result[val], InstructionDecorations::UNSIGNED_RESULT);
+        else if(val.hasRegister() && val.reg() == REG_UNIFORM)
+            // XXX this is not true for the local ID UNIFORM, which should never be checked here (since the actual ID
+            // always needs to be extracted via non-ADDs, e.g.
ANDs) + result[val] = add_flag(result[val], InstructionDecorations::WORK_GROUP_UNIFORM_VALUE); + return result; + } + auto move = dynamic_cast(writer); + if(move && !dynamic_cast(writer)) + return findDirectLevelAdditionInputs(move->getSource()); + + auto op = dynamic_cast(writer); + bool onlySideEffectIsReadingUniform = op && op->hasSideEffects() && !op->doesSetFlag() && + !op->signal.hasSideEffects() && + !(op->hasValueType(ValueType::REGISTER) && op->getOutput()->reg().hasSideEffectsOnWrite()) && + std::all_of(op->getArguments().begin(), op->getArguments().end(), [](const Value& arg) -> bool { + return !arg.hasRegister() || arg.reg() == REG_UNIFORM || !arg.reg().hasSideEffectsOnRead(); + }); + if(op && op->op == OP_ADD && !op->hasConditionalExecution() && + (!op->hasSideEffects() || onlySideEffectIsReadingUniform) && !op->hasPackMode() && !op->hasUnpackMode()) + { + FastMap args; + for(const auto& arg : op->getArguments()) + { + auto tmp = findDirectLevelAdditionInputs(arg); + args.insert(tmp.begin(), tmp.end()); + } + return args; + } + result.emplace(val, writer->decoration); + return result; +} + +static Optional determineAccessRange(Method& method, InstructionWalker it, InstructionWalker memIt) +{ + // 1. find writes to VPM DMA addresses with work-group uniform part in address values + if(it.has() && it->assertArgument(0).hasLocal() && + (it->assertArgument(0).local()->is() || it->assertArgument(0).local()->is())) + { + // direct write of address (e.g. all work items write to the same location) + // XXX if the memory is __local and the width of the writes is known, can lower into VPM (e.g. for data + // exchange between work-items). But then the __local memory should be set small enough to fit in the VPM + // completely, which is already handled at this point. + logging::debug() << "DMA address is directly set to a parameter/global address, cannot be " + "optimized by caching multiple accesses: " + << it->to_string() << logging::endl; + return {}; + } + MemoryAccessRange range; + range.memoryInstruction = memIt; + // if the instruction is a move, handle/skip it here, so the add with the shifted offset + + // base-pointer is found correctly + auto trackIt = it; + if(it.has() && it->assertArgument(0).getSingleWriter()) + { + auto walker = it.getBasicBlock()->findWalkerForInstruction(it->assertArgument(0).getSingleWriter(), it); + if(!walker) + { + // TODO this is actually no problem (other than finding the iterator), is it? + logging::debug() << "Unhandled case, address is calculated in a different basic-block: " << it->to_string() + << logging::endl; + return {}; + } + else + trackIt = walker.value(); + } + + auto variableArg = + std::find_if_not(trackIt->getArguments().begin(), trackIt->getArguments().end(), isWorkGroupUniform); + if(variableArg != trackIt->getArguments().end() && variableArg->getSingleWriter() != nullptr) + { + // 2. 
rewrite address so all work-group uniform parts are combined and all variable parts and + // added in the end + logging::debug() << "Found VPM DMA address write with work-group uniform operand: " << it->to_string() + << logging::endl; + Value varArg = *variableArg; + // 2.1 jump over final addition of base address if it is a parameter + if(trackIt.has() && trackIt.get()->op == OP_ADD) + { + const auto& arg0 = trackIt->assertArgument(0); + const auto& arg1 = trackIt->assertArgument(1); + if(arg0.hasLocal() && + (arg0.local()->is() || arg0.local()->is() || + arg0.local()->name == Method::GLOBAL_DATA_ADDRESS)) + { + range.memoryObject = arg0.local(); + varArg = arg1; + } + else if(arg1.hasLocal() && + (arg1.local()->is() || arg1.local()->is() || + arg1.local()->name == Method::GLOBAL_DATA_ADDRESS)) + { + range.memoryObject = arg1.local(); + varArg = arg0; + } + else if(arg0.hasRegister(REG_UNIFORM)) + { + // e.g. reading of uniform for parameter is replaced by reading uniform here (if + // parameter only used once) + range.memoryObject = trackIt->getOutput()->local()->getBase(true); + varArg = arg1; + } + else if(arg1.hasRegister(REG_UNIFORM)) + { + range.memoryObject = trackIt->getOutput()->local()->getBase(true); + varArg = arg0; + } + else + { + throw CompilationError( + CompilationStep::OPTIMIZER, "Unhandled case of memory access: ", trackIt->to_string()); + } + range.baseAddressAdd = trackIt; + } + else + { + logging::debug() << "Cannot optimize further, since add of base-address and pointer was not found: " + << it->to_string() << logging::endl; + return {}; + } + auto writer = varArg.getSingleWriter(); + // 2.2 jump over shl (if any) and remember offset + if(dynamic_cast(writer) && dynamic_cast(writer)->op == OP_SHL) + { + if(!writer->assertArgument(1).getLiteralValue() || + (1u << writer->assertArgument(1).getLiteralValue()->unsignedInt()) != + it->assertArgument(0).type.getElementType().getPhysicalWidth()) + { + // Abort, since the offset shifted does not match the type-width of the element type + logging::debug() << "Cannot optimize further, since shift-offset does not match type size: " + << it->to_string() << " and " << writer->to_string() << logging::endl; + return {}; + } + range.typeSizeShift = trackIt.getBasicBlock()->findWalkerForInstruction(writer, trackIt); + varArg = writer->assertArgument(0); + writer = varArg.getSingleWriter(); + } + // 2.3 collect all directly neighboring (and directly referenced) additions + // result is now: finalAdd + (sum(addedValues) << shiftFactor) + auto addressParts = findDirectLevelAdditionInputs(varArg); + // 2.4 calculate the maximum dynamic offset + for(const auto& val : addressParts) + { + if(!has_flag(val.second, InstructionDecorations::WORK_GROUP_UNIFORM_VALUE)) + { + range.dynamicAddressParts.emplace(val); + if(val.first.hasLocal()) + { + auto singleRange = analysis::ValueRange::getValueRange(val.first, &method); + range.offsetRange.minValue += singleRange.getIntRange()->minValue; + range.offsetRange.maxValue += singleRange.getIntRange()->maxValue; + } + else + throw CompilationError( + CompilationStep::OPTIMIZER, "Unhandled value for memory access offset", val.first.to_string()); + } + else + range.groupUniformAddressParts.emplace(val); + } + logging::debug() << range.to_string() << logging::endl; + return range; + } + return {}; +} + +static Optional findSingleWriter(InstructionWalker it, const Value& val) +{ + const IntermediateInstruction* writer = nullptr; + for(const auto& w : val.local()->getUsers(LocalUse::Type::WRITER)) + { + 
if(dynamic_cast(w)) + // store memory instructions count as writers, so ignore them + continue; + if(writer) + { + writer = nullptr; + break; + } + writer = w; + } + if(!writer) + { + logging::debug() << "Unhandled case, value does not have exactly 1 writer: " << it->to_string() + << logging::endl; + return {}; + } + auto writerIt = it.getBasicBlock()->findWalkerForInstruction(writer, it); + if(!writerIt) + { + logging::debug() << "Unhandled case, address is calculated in a different basic-block: " << it->to_string() + << logging::endl; + return {}; + } + return writerIt; +} + +static FastAccessList determineAccessRanges( + Method& method, const Local* baseAddr, MemoryAccess& access) +{ + // FIXME re-enable check for caching once rest works again + return FastAccessList{}; + // NOTE: If we cannot find one access range for a local, we cannot combine any other access ranges for this local! + static const auto copiedFromCheck = [](const InstructionWalker& it) -> bool { + return it.get()->op == MemoryOperation::COPY; + }; + FastAccessList result; + for(auto it : access.accessInstructions) + { + const auto memInstr = it.get(); + switch(memInstr->op) + { + case MemoryOperation::READ: + { + auto writerIt = findSingleWriter(it, memInstr->getSource()); + if(writerIt) + { + auto res = determineAccessRange(method, writerIt.value(), it); + if(res) + result.emplace_back(std::move(res.value())); + else + return FastAccessList{}; + } + break; + } + case MemoryOperation::WRITE: + case MemoryOperation::FILL: + { + auto writerIt = findSingleWriter(it, memInstr->getDestination()); + if(writerIt) + { + auto res = determineAccessRange(method, writerIt.value(), it); + if(res) + result.emplace_back(std::move(res.value())); + else + return FastAccessList{}; + } + break; + } + case MemoryOperation::COPY: + { + auto writerIt = findSingleWriter(it, memInstr->getSource()); + if(writerIt && + // special handling for memory which is only copied from (never read/written), since no extra space + // is required + !std::all_of(access.accessInstructions.begin(), access.accessInstructions.end(), copiedFromCheck)) + { + auto res = determineAccessRange(method, writerIt.value(), it); + if(res) + result.emplace_back(std::move(res.value())); + else + return FastAccessList{}; + } + writerIt = findSingleWriter(it, memInstr->getDestination()); + if(writerIt) + { + auto res = determineAccessRange(method, writerIt.value(), it); + if(res) + result.emplace_back(std::move(res.value())); + else + return FastAccessList{}; + } + break; + } + } + } + return result; +} + +static std::pair checkWorkGroupUniformParts( + FastAccessList& accessRanges) +{ + analysis::IntegerRange offsetRange{std::numeric_limits::max(), std::numeric_limits::min()}; + const auto& firstUniformAddresses = accessRanges.front().groupUniformAddressParts; + FastMap differingUniformParts; + bool allUniformPartsEqual = true; + for(auto& entry : accessRanges) + { + if(entry.groupUniformAddressParts != firstUniformAddresses) + { + allUniformPartsEqual = false; + for(const auto& pair : entry.groupUniformAddressParts) + { + if(firstUniformAddresses.find(pair.first) == firstUniformAddresses.end()) + differingUniformParts.emplace(pair); + } + for(const auto& pair : firstUniformAddresses) + if(entry.groupUniformAddressParts.find(pair.first) == entry.groupUniformAddressParts.end()) + differingUniformParts.emplace(pair); + } + offsetRange.minValue = std::min(offsetRange.minValue, entry.offsetRange.minValue); + offsetRange.maxValue = std::max(offsetRange.maxValue, 
entry.offsetRange.maxValue); + } + if(!allUniformPartsEqual) + { + if(std::all_of(differingUniformParts.begin(), differingUniformParts.end(), + [](const std::pair& part) -> bool { + return part.first.getLiteralValue().has_value(); + })) + { + // all work-group uniform values which differ between various accesses of the same local are literal + // values. We can use this knowledge to still allow caching the local, by converting the literals to + // dynamic offsets + for(auto& entry : accessRanges) + { + auto it = entry.groupUniformAddressParts.begin(); + while(it != entry.groupUniformAddressParts.end()) + { + if(differingUniformParts.find(it->first) != differingUniformParts.end()) + { + entry.offsetRange.minValue += it->first.getLiteralValue()->signedInt(); + entry.offsetRange.maxValue += it->first.getLiteralValue()->signedInt(); + entry.dynamicAddressParts.emplace(*it); + it = entry.groupUniformAddressParts.erase(it); + } + else + ++it; + } + } + return checkWorkGroupUniformParts(accessRanges); + } + else + return std::make_pair(false, analysis::IntegerRange{}); + } + return std::make_pair(true, offsetRange); +} + +static const periphery::VPMArea* checkCacheMemoryAccessRanges( + Method& method, const Local* baseAddr, FastAccessList& memoryAccessRanges) +{ + auto maxNumVectors = method.vpm->getMaxCacheVectors(TYPE_INT32, true); + GroupedAccessRanges result; + + bool allUniformPartsEqual; + analysis::IntegerRange offsetRange; + std::tie(allUniformPartsEqual, offsetRange) = checkWorkGroupUniformParts(memoryAccessRanges); + if(!allUniformPartsEqual) + { + logging::debug() << "Cannot cache memory location " << baseAddr->to_string() + << " in VPM, since the work-group uniform parts of the address calculations differ, which " + "is not yet supported!" + << logging::endl; + return nullptr; + } + if((offsetRange.maxValue - offsetRange.minValue) >= maxNumVectors || (offsetRange.maxValue < offsetRange.minValue)) + { + // this also checks for any over/underflow when converting the range to unsigned int in the next steps + logging::debug() << "Cannot cache memory location " << baseAddr->to_string() + << " in VPM, the accessed range is too big: [" << offsetRange.minValue << ", " + << offsetRange.maxValue << "]" << logging::endl; + return nullptr; + } + logging::debug() << "Memory location " << baseAddr->to_string() << " is accessed via DMA in the dynamic range [" + << offsetRange.minValue << ", " << offsetRange.maxValue << "]" << logging::endl; + + // TODO correct type?? Shouldn't it be baseAddr->type.getElmentType().toArrayType(...?? + auto accessedType = baseAddr->type.toArrayType( + static_cast(offsetRange.maxValue - offsetRange.minValue + 1 /* bounds of range are inclusive! 
*/)); + + // XXX the local is not correct, at least not if there is a work-group uniform offset, but since all work-items + // use the same work-group offset, it doesn't matter + auto vpmArea = method.vpm->addArea(baseAddr, accessedType, false); + if(vpmArea == nullptr) + { + logging::debug() << "Memory location " << baseAddr->to_string() << " with dynamic access range [" + << offsetRange.minValue << ", " << offsetRange.maxValue + << "] cannot be cached in VPM, since it does not fit" << logging::endl; + return nullptr; + } + return vpmArea; +} \ No newline at end of file diff --git a/src/normalization/MemoryMappings.cpp b/src/normalization/MemoryMappings.cpp new file mode 100644 index 00000000..0dd70746 --- /dev/null +++ b/src/normalization/MemoryMappings.cpp @@ -0,0 +1,541 @@ +/* + * Author: doe300 + * + * See the file "LICENSE" for the full license governing this code. + */ + +#include "MemoryMappings.h" + +#include "../intermediate/Helper.h" +#include "../intermediate/IntermediateInstruction.h" +#include "../intermediate/operators.h" +#include "../periphery/TMU.h" +#include "log.h" + +using namespace vc4c; +using namespace vc4c::normalization; +using namespace vc4c::intermediate; +using namespace vc4c::operators; + +using MemoryMapper = InstructionWalker (*)( + Method&, InstructionWalker, MemoryInstruction*, const MemoryInfo&, const MemoryInfo&); + +static InstructionWalker invalidMapping( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + throw CompilationError(CompilationStep::NORMALIZER, "Invalid memory access", mem->to_string()); +} + +static InstructionWalker lowerMemoryReadOnlyToRegister(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); +static InstructionWalker lowerMemoryReadWriteToRegister(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); +static InstructionWalker lowerMemoryCopyToRegister(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); +static InstructionWalker lowerMemoryReadToVPM(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); +static InstructionWalker lowerMemoryWriteToVPM(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); +static InstructionWalker loadMemoryViaTMU(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); +static InstructionWalker accessMemoryInRAMViaVPM(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); +static InstructionWalker mapMemoryCopy(Method& method, InstructionWalker it, MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); + +/* clang-format off */ +static constexpr MemoryMapper MAPPERS[6][4] = { + /* READ, WRITE, COPY (from), FILL */ + {lowerMemoryReadOnlyToRegister, invalidMapping, lowerMemoryReadOnlyToRegister, invalidMapping}, /* QPU_REGISTER_READONLY */ + {lowerMemoryReadWriteToRegister, lowerMemoryReadWriteToRegister, lowerMemoryCopyToRegister, lowerMemoryReadWriteToRegister}, /* QPU_REGISTER_READWRITE */ + {lowerMemoryReadToVPM, lowerMemoryWriteToVPM, mapMemoryCopy, lowerMemoryWriteToVPM}, /* VPM_PER_QPU */ + {lowerMemoryReadToVPM, lowerMemoryWriteToVPM, 
mapMemoryCopy, lowerMemoryWriteToVPM}, /* VPM_SHARED_ACCESS */ + {loadMemoryViaTMU, invalidMapping, mapMemoryCopy, invalidMapping}, /* RAM_LOAD_TMU */ + {accessMemoryInRAMViaVPM, accessMemoryInRAMViaVPM, mapMemoryCopy, accessMemoryInRAMViaVPM}, /* RAM_READ_WRITE_VPM */ +}; +/* clang-format on */ + +InstructionWalker normalization::mapMemoryAccess(Method& method, InstructionWalker it, + intermediate::MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + auto type = mem->op == MemoryOperation::READ || mem->op == MemoryOperation::COPY ? srcInfo.type : destInfo.type; + return MAPPERS[static_cast(type)][static_cast(mem->op)](method, it, mem, srcInfo, destInfo); +} + +static bool copiesWholeRegister(const Value& numEntries, const DataType& elementType, const DataType& registerType) +{ + // for copying of byte* where actually the whole vector is copied + return numEntries.getLiteralValue() && + numEntries.getLiteralValue()->unsignedInt() * elementType.getPhysicalWidth() == registerType.getPhysicalWidth(); +} + +/* + * There are several cases of memory lowered into registers: + * - constant memory with constant index (direct value determinable) -> map to direct value + * - constant memory which fits into register but dynamic index -> map to register, index by vector rotation + * - private memory which fits into register -> map to register + * - private memory where the type can be converted to fit into register -> map to register + index by vector + * rotation + */ + +/* + * Lowers access to a constant memory location into a register. + * + * This can be done for constant memory locations. + * + * NOTE: This is the best optimization for memory access and should be preferred, where applicable. + */ +static InstructionWalker lowerMemoryReadOnlyToRegister( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + if(mem->op != MemoryOperation::READ && mem->op != MemoryOperation::COPY) + throw CompilationError( + CompilationStep::NORMALIZER, "Cannot perform a non-read operation on constant memory", mem->to_string()); + + Value tmpIndex = UNDEFINED_VALUE; + it = insertAddressToElementOffset(it, method, tmpIndex, srcInfo.local, *srcInfo.mappedRegisterOrConstant, mem); + // TODO check whether index is guaranteed to be in range [0, 16[ + auto elementType = srcInfo.convertedRegisterType ? 
*srcInfo.convertedRegisterType : + srcInfo.mappedRegisterOrConstant->type.getElementType(); + auto wholeRegister = + copiesWholeRegister(mem->getNumEntries(), mem->getDestinationElementType(), *srcInfo.convertedRegisterType); + Value tmpVal(UNDEFINED_VALUE); + if(mem->op == MemoryOperation::COPY && wholeRegister) + // there is no need to calculate the index, if we copy the whole object + tmpVal = *srcInfo.convertedRegisterType; + else + { + tmpVal = method.addNewLocal(elementType, "%lowered_constant"); + it = insertVectorExtraction(it, method, *srcInfo.mappedRegisterOrConstant, tmpIndex, tmpVal); + } + + if(srcInfo.mappedRegisterOrConstant && !srcInfo.mappedRegisterOrConstant->hasLocal()) + { + if(mem->op == MemoryOperation::COPY) + { + if(!wholeRegister && mem->getNumEntries() != INT_ONE) + throw CompilationError(CompilationStep::NORMALIZER, + "Lowering copy with more than 1 entry is not yet implemented", mem->to_string()); + it.reset(new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), tmpVal)); + logging::debug() << "Replaced memory copy from constant memory to memory write of constant value: " + << it->to_string() << logging::endl; + return mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + } + if(mem->op == MemoryOperation::READ) + { + it.reset(new MoveOperation(mem->getDestination(), tmpVal)); + logging::debug() << "Replaced loading of constant memory with constant literal: " << it->to_string() + << logging::endl; + return it; + } + } + if(srcInfo.convertedRegisterType) + { + if(mem->op == MemoryOperation::READ) + { + it.reset(new MoveOperation(mem->getDestination(), tmpVal)); + logging::debug() << "Replaced loading of constant memory with vector rotation of register: " + << it->to_string() << logging::endl; + return it; + } + if(mem->op == MemoryOperation::COPY) + { + if(!wholeRegister && mem->getNumEntries() != INT_ONE) + throw CompilationError(CompilationStep::NORMALIZER, + "Lowering copy with more than 1 entry is not yet implemented", mem->to_string()); + it.reset(new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), tmpVal)); + it = mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + logging::debug() << "Replaced copying from constant memory with vector rotation and writing of memory: " + << it->to_string() << logging::endl; + return it; + } + } + auto constant = getConstantValue(mem->getSource()); + if(constant) + { + if(mem->op == MemoryOperation::COPY) + { + if(mem->getNumEntries() != INT_ONE) + throw CompilationError(CompilationStep::NORMALIZER, + "Lowering copy with more than 1 entry is not yet implemented", mem->to_string()); + // since a copy always involves another memory object, this rewrite is picked up when the other + // object is processed + it.reset(new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), *constant)); + it = mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + logging::debug() << "Replaced memory copy from constant memory to memory write of constant value: " + << it->to_string() << logging::endl; + return it; + } + else + { + it.reset(new MoveOperation(mem->getOutput().value(), *constant)); + logging::debug() << "Replaced loading of constant memory with constant literal: " << it->to_string() + << logging::endl; + return it; + } + } + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case of lowering constant memory to register", mem->to_string()); +} + +/* + * Maps memory access to the given local into moves from/to the given register + * + * NOTE: This is the best 
optimization for memory access and should always be preferred. + * NOTE: This optimization cannot be applied if changes made to the lowered register need to be reflected to other QPUs. + */ +static InstructionWalker lowerMemoryReadWriteToRegister( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + const auto& loweredInfo = mem->op == MemoryOperation::READ ? srcInfo : destInfo; + if(!loweredInfo.mappedRegisterOrConstant) + throw CompilationError(CompilationStep::NORMALIZER, + "Cannot map memory location to register without mapping register specified", mem->to_string()); + const auto& loweredRegister = loweredInfo.mappedRegisterOrConstant.value(); + const auto local = loweredInfo.local; + Value tmpIndex = UNDEFINED_VALUE; + // TODO does this also handle whole-object access (e.g. for scalar/vector memory areas) ?? + // TODO check whether index is guaranteed to be in range [0, 16[ + it = insertAddressToElementOffset(it, method, tmpIndex, local, loweredRegister, mem); + if(mem->op == MemoryOperation::READ) + { + it = insertVectorExtraction(it, method, loweredRegister, tmpIndex, mem->getDestination()); + } + else if(mem->op == MemoryOperation::WRITE) + { + // TODO need special handling for inserting multiple elements to set all new elements + it = insertVectorInsertion(it, method, loweredRegister, tmpIndex, mem->getSource()); + } + else if(mem->op == MemoryOperation::FILL && mem->getSource().type.isScalarType()) + { + it = insertReplication(it, mem->getSource(), loweredRegister); + } + else + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case of lowering memory access to register", mem->to_string()); + logging::debug() << "Replaced access to stack allocation '" << it->to_string() + << "' with: " << it.copy().previousInBlock()->to_string() << logging::endl; + return it.erase(); +} + +static InstructionWalker lowerMemoryCopyToRegister( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + if(srcInfo.local == destInfo.local) + throw CompilationError(CompilationStep::NORMALIZER, + "Copy from and to same register lowered memory area is not supported", mem->to_string()); + if(mem->op != MemoryOperation::COPY) + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case of lowering memory access to register", mem->to_string()); + if(destInfo.type == MemoryType::QPU_REGISTER_READONLY) + throw CompilationError( + CompilationStep::NORMALIZER, "Copy into read-only registers is not supported", mem->to_string()); + + auto wholeRegister = + copiesWholeRegister(mem->getNumEntries(), mem->getDestinationElementType(), *srcInfo.convertedRegisterType); + + if(!wholeRegister && mem->getNumEntries() != INT_ONE) + throw CompilationError(CompilationStep::NORMALIZER, + "Lowering copy with more than 1 entry is not yet implemented", mem->to_string()); + + logging::debug() << "Lowering copy with register-mapped memory: " << mem->to_string() << logging::endl; + + Value tmpIndex = UNDEFINED_VALUE; + if(srcInfo.mappedRegisterOrConstant) + { + // TODO check whether index is guaranteed to be in range [0, 16[ + Value tmp(UNDEFINED_VALUE); + if(wholeRegister) + tmp = *srcInfo.mappedRegisterOrConstant; + else + { + tmp = method.addNewLocal(mem->getSourceElementType()); + it = insertVectorExtraction(it, method, *srcInfo.mappedRegisterOrConstant, tmpIndex, tmp); + } + it.reset(new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), tmp)); + return 
mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + } + if(destInfo.mappedRegisterOrConstant) + { + // TODO is this ever called?? copying into register (from anywhere should be handled smewhere else) + throw CompilationError(CompilationStep::NORMALIZER, + "lowerMemoryCopyToRegister should not be called to copy into register", mem->to_string()); + // TODO need special handling for inserting multiple elements to set all new elements + auto tmp = method.addNewLocal(mem->getDestinationElementType()); + it.emplace(new MemoryInstruction(MemoryOperation::READ, tmp, mem->getSource())); + it = mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + it = insertVectorInsertion(it, method, *destInfo.mappedRegisterOrConstant, tmpIndex, mem->getSource()); + return it.erase(); + } + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case of lowering memory access to register", mem->to_string()); +} + +static InstructionWalker insertToInVPMAreaOffset( + Method& method, InstructionWalker it, Value& out, const MemoryInfo& info, const MemoryInstruction* mem) +{ + if(info.ranges) + { + auto range = std::find_if(info.ranges->begin(), info.ranges->end(), + [&](const MemoryAccessRange& range) -> bool { return range.memoryInstruction == it; }); + if(range == info.ranges->end()) + throw CompilationError(CompilationStep::NORMALIZER, + "Failed to find memory access range for VPM cached memory access", mem->to_string()); + return insertAddressToWorkItemSpecificOffset(it, method, out, const_cast(*range)); + } + return insertAddressToStackOffset(it, method, out, info.local, info.type, mem); +} + +/* + * Tries to map the given memory location into VPM + * + * This is applicable for private (stack) or local memory. + * + * NOTE: A memory location can only be lowered into VPM if all access to it can be lowered to VPM + * NOTE: This is to be preferred over keeping the memory location in RAM + */ +static InstructionWalker lowerMemoryReadToVPM( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + // Need to make sure addressing is still correct! 
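
(Editor's note, not part of the patch.) For per-QPU (stack-like) VPM areas the in-area offset additionally has to select the QPU's own copy of the buffer; conceptually the byte offset is the QPU's slot index times the per-QPU size plus the offset inside the object. A rough standalone sketch of that arithmetic, the exact code generated by insertAddressToStackOffset() may differ:

    #include <cstdint>

    // conceptual only: each QPU owns one copy of the buffer inside the shared VPM area
    uint32_t perQpuVpmOffset(uint32_t qpuIndex, uint32_t perQpuSizeInBytes, uint32_t offsetInObject)
    {
        return qpuIndex * perQpuSizeInBytes + offsetInObject;
    }

    // e.g. QPU 3 accessing byte 16 of its 64-byte slot ends up at byte 208 of the area
    static const uint32_t example = perQpuVpmOffset(3, 64, 16);
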
+ if(srcInfo.type == MemoryType::VPM_PER_QPU && !srcInfo.local->is()) + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case of per-QPU memory buffer", srcInfo.local->to_string()); + if(!srcInfo.area) + throw CompilationError(CompilationStep::NORMALIZER, "Cannot lower into VPM without VPM area", mem->to_string()); + + if(srcInfo.type == MemoryType::VPM_PER_QPU) + logging::debug() << "Lowering read of stack allocation into VPM: " << mem->to_string() << logging::endl; + else + logging::debug() << "Lowering read of shared local memory into VPM: " << mem->to_string() << logging::endl; + + Value inAreaOffset = UNDEFINED_VALUE; + it = insertToInVPMAreaOffset(method, it, inAreaOffset, srcInfo, mem); + if(mem->op == MemoryOperation::READ) + { + it = method.vpm->insertReadVPM(method, it, mem->getDestination(), srcInfo.area, false, inAreaOffset); + return it.erase(); + } + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case to lower reading of memory into VPM", mem->to_string()); +} + +static InstructionWalker lowerMemoryWriteToVPM( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + if(destInfo.type == MemoryType::VPM_PER_QPU && !destInfo.local->is()) + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case of per-QPU memory buffer", destInfo.local->to_string()); + if(!destInfo.area) + throw CompilationError(CompilationStep::NORMALIZER, "Cannot lower into VPM without VPM area", mem->to_string()); + + if(destInfo.type == MemoryType::VPM_PER_QPU) + logging::debug() << "Lowering write to stack allocation into VPM: " << mem->to_string() << logging::endl; + else + logging::debug() << "Lowering write to shared local memory into VPM: " << mem->to_string() << logging::endl; + + Value inAreaOffset = UNDEFINED_VALUE; + it = insertToInVPMAreaOffset(method, it, inAreaOffset, destInfo, mem); + if(mem->op == MemoryOperation::WRITE) + { + it = method.vpm->insertWriteVPM(method, it, mem->getSource(), destInfo.area, false, inAreaOffset); + return it.erase(); + } + if(mem->op == MemoryOperation::FILL) + { + logging::error() << "Destination: " << destInfo.local->to_string() << " - " + << static_cast(destInfo.type) << " - " + << (destInfo.area ? destInfo.area->to_string() : "") << logging::endl; + throw CompilationError( + CompilationStep::NORMALIZER, "Filling VPM area is not yet implemented", mem->to_string()); + } + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case to lower writing of memory into VPM", mem->to_string()); +} + +/* + * Maps a single memory read to a TMU load + * + * NOTE: Memory locations loaded via TMU MUST NOT be written to by the same kernel (even on a different QPU)! + */ +static InstructionWalker loadMemoryViaTMU( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + logging::debug() << "Loading from read-only memory via TMU: " << mem->to_string() << logging::endl; + if(mem->op == MemoryOperation::READ) + { + it = periphery::insertReadVectorFromTMU( + method, it, mem->getDestination(), mem->getSource(), srcInfo.tmuFlag ? periphery::TMU1 : periphery::TMU0); + return it.erase(); + } + throw CompilationError(CompilationStep::NORMALIZER, "Unhandled case to read from memory via TMU", mem->to_string()); +} + +/* + * Maps a memory access instruction to an instruction accessing RAM through VPM. 
+ *
+ * NOTE: At least one of the operands of the instruction to be mapped must be located in RAM
+ * NOTE: this is the least optimal mapping possible and should be avoided if possible.
+ */
+static InstructionWalker accessMemoryInRAMViaVPM(
+    Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo)
+{
+    logging::debug() << "Mapping access to memory located in RAM: " << mem->to_string() << logging::endl;
+    switch(mem->op)
+    {
+    case MemoryOperation::FILL:
+    {
+        if(!mem->getNumEntries().isLiteralValue())
+            throw CompilationError(CompilationStep::OPTIMIZER,
+                "Filling dynamically sized memory is not yet implemented", mem->to_string());
+        uint64_t numCopies = mem->getNumEntries().getLiteralValue()->unsignedInt();
+        if(numCopies > std::numeric_limits<unsigned>::max())
+            throw CompilationError(CompilationStep::OPTIMIZER, "Cannot fill more than 4GB of data", mem->to_string());
+        // TODO could optimize (e.g. for zero-initializers) by writing several bytes at once
+        it = method.vpm->insertWriteVPM(method, it, mem->getSource(), nullptr, false);
+        it = method.vpm->insertFillRAM(method, it, mem->getDestination(), mem->getSourceElementType(),
+            static_cast<unsigned>(numCopies), nullptr, false);
+        auto* dest = mem->getDestination().hasLocal() ? mem->getDestination().local()->getBase(true) : nullptr;
+        if(dest && dest->is<Parameter>())
+            const_cast<Parameter*>(dest->as<Parameter>())->decorations =
+                add_flag(dest->as<Parameter>()->decorations, ParameterDecorations::OUTPUT);
+        break;
+    }
+    case MemoryOperation::READ:
+    {
+        it = periphery::insertReadDMA(
+            method, it, mem->getDestination(), mem->getSource(), true /* need to lock mutex for shared scratch area */);
+        auto* src = mem->getSource().hasLocal() ? mem->getSource().local()->getBase(true) : nullptr;
+        if(src && src->is<Parameter>())
+            const_cast<Parameter*>(src->as<Parameter>())->decorations =
+                add_flag(src->as<Parameter>()->decorations, ParameterDecorations::INPUT);
+        break;
+    }
+    case MemoryOperation::WRITE:
+    {
+        it = periphery::insertWriteDMA(
+            method, it, mem->getSource(), mem->getDestination(), true /* need to lock mutex for shared scratch area */);
+        auto* dest = mem->getDestination().hasLocal() ?
mem->getDestination().local()->getBase(true) : nullptr; + if(dest && dest->is()) + const_cast(dest->as())->decorations = + add_flag(dest->as()->decorations, ParameterDecorations::OUTPUT); + break; + } + default: + throw CompilationError(CompilationStep::NORMALIZER, "Unhandled case of accessing RAM", mem->to_string()); + } + // remove MemoryInstruction + // since a copy may have another iterator to it, do not remove the element, just clear it + // the empty instruction is cleaned up in #combineVPMAccess + return it.erase(); +} + +static InstructionWalker mapMemoryCopy( + Method& method, InstructionWalker it, MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo) +{ + /* + * Handled cases: + * + * From\To | VPM | RAM | + * VPM | read + write | DMA write | + * RAM | DMA read | DMA read + DMA write | + * + */ + + // srcInRegister is handled by another function + bool destInRegister = destInfo.type == MemoryType::QPU_REGISTER_READWRITE; + bool srcInVPM = srcInfo.type == MemoryType::VPM_PER_QPU || srcInfo.type == MemoryType::VPM_SHARED_ACCESS; + bool srcInRAM = srcInfo.type == MemoryType::RAM_LOAD_TMU || srcInfo.type == MemoryType::RAM_READ_WRITE_VPM; + bool destInVPM = destInfo.type == MemoryType::VPM_PER_QPU || destInfo.type == MemoryType::VPM_SHARED_ACCESS; + bool destInRAM = destInfo.type == MemoryType::RAM_LOAD_TMU || destInfo.type == MemoryType::RAM_READ_WRITE_VPM; + + auto* src = mem->getSource().hasLocal() ? mem->getSource().local()->getBase(true) : nullptr; + if(src && src->is()) + const_cast(src->as())->decorations = + add_flag(src->as()->decorations, ParameterDecorations::INPUT); + auto* dest = mem->getDestination().hasLocal() ? mem->getDestination().local()->getBase(true) : nullptr; + if(dest && dest->is()) + const_cast(dest->as())->decorations = + add_flag(dest->as()->decorations, ParameterDecorations::OUTPUT); + + if(srcInVPM && destInVPM) + { + // copy from VPM into VPM -> VPM read + VPM write + logging::debug() << "Mapping copy from/to VPM to VPM read and VPM write: " << mem->to_string() << logging::endl; + + if(mem->getNumEntries() != INT_ONE) + // TODO could for static count insert that number of reads/writes, for dynamic need a loop! 
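
(Editor's sketch, not part of the patch.) The RAM-to-RAM branch further below sizes the DMA copy as the number of entries times the physical width of one entry and rejects anything that does not fit into 32 bits. A self-contained mirror of that calculation; the helper name is hypothetical:

    #include <cstdint>
    #include <limits>
    #include <stdexcept>

    // bytes to copy = entries * (scalar bits * vector width) / 8, limited to 32 bits
    uint32_t copySizeInBytes(uint64_t numEntries, uint32_t scalarBitCount, uint32_t vectorWidth)
    {
        const uint64_t numBytes = numEntries * (static_cast<uint64_t>(scalarBitCount) * vectorWidth) / 8u;
        if(numBytes > std::numeric_limits<uint32_t>::max())
            throw std::runtime_error("cannot copy more than 4GB of data");
        return static_cast<uint32_t>(numBytes);
    }

    // e.g. 1024 entries of a 16-element vector of 32-bit values are 65536 bytes
    static const uint32_t example = copySizeInBytes(1024, 32, 16);
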
+ throw CompilationError(CompilationStep::NORMALIZER, + "Copying within VPM with more than 1 entries is not yet implemented", mem->to_string()); + auto tmpVal = method.addNewLocal(mem->getSourceElementType(), "%vpm_copy_tmp"); + it.emplace(new MemoryInstruction(MemoryOperation::READ, tmpVal, mem->getSource())); + it = mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + it.reset(new MemoryInstruction(MemoryOperation::WRITE, mem->getDestination(), tmpVal)); + return mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + } + else if(srcInVPM && destInRAM) + { + // copy from VPM into RAM -> DMA write + logging::debug() << "Mapping copy from VPM into RAM to DMA write: " << mem->to_string() << logging::endl; + Value inAreaOffset = UNDEFINED_VALUE; + it = insertToInVPMAreaOffset(method, it, inAreaOffset, srcInfo, mem); + it = method.vpm->insertWriteRAM(method, it, mem->getDestination(), mem->getSourceElementType(), srcInfo.area, + false, inAreaOffset, mem->getNumEntries()); + return it.erase(); + } + else if(srcInRAM && destInVPM) + { + // copy from RAM into VPM -> DMA read + logging::debug() << "Mapping copy from RAM into VPM to DMA read: " << mem->to_string() << logging::endl; + Value inAreaOffset = UNDEFINED_VALUE; + it = insertToInVPMAreaOffset(method, it, inAreaOffset, destInfo, mem); + it = method.vpm->insertReadRAM(method, it, mem->getSource(), mem->getSourceElementType(), destInfo.area, false, + inAreaOffset, mem->getNumEntries()); + return it.erase(); + } + else if(srcInRAM && destInRAM) + { + // copy from RAM into RAM -> DMA read + DMA write + if(!mem->getNumEntries().isLiteralValue()) + throw CompilationError(CompilationStep::OPTIMIZER, + "Copying dynamically sized memory within RAM is not yet implemented", mem->to_string()); + uint64_t numBytes = mem->getNumEntries().getLiteralValue()->unsignedInt() * + (mem->getSourceElementType().getScalarBitCount() * mem->getSourceElementType().getVectorWidth()) / 8; + if(numBytes > std::numeric_limits::max()) + throw CompilationError(CompilationStep::OPTIMIZER, "Cannot copy more than 4GB of data", mem->to_string()); + logging::debug() << "Mapping copy from RAM into RAM to DMA read and DMA write: " << mem->to_string() + << logging::endl; + it = method.vpm->insertCopyRAM( + method, it, mem->getDestination(), mem->getSource(), static_cast(numBytes)); + return it.erase(); + } + else if(destInRegister && destInfo.convertedRegisterType) + { + // copy from VPM/RAM into register -> read from VPM/RAM + write to register + logging::debug() << "Mapping copy from VPM/RAM into register to read from VPM/RAM and register insertion: " + << mem->to_string() << logging::endl; + // TODO some general version + if(copiesWholeRegister(mem->getNumEntries(), mem->getSourceElementType(), *destInfo.convertedRegisterType)) + { + // e.g. for copying 32 bytes into float[8] register -> just read 1 float16 vector + it.reset(new MemoryInstruction(MemoryOperation::READ, *destInfo.mappedRegisterOrConstant, + Value(mem->getSource().local(), destInfo.convertedRegisterType->toPointerType()))); + return mapMemoryAccess(method, it, it.get(), srcInfo, destInfo); + } + } + else + { + logging::error() << "Source: " << (srcInfo.local ? srcInfo.local->to_string() : "?") << " - " + << static_cast(srcInfo.type) << " - " + << (srcInfo.area ? srcInfo.area->to_string() : "") << logging::endl; + + logging::error() << "Destination: " << (destInfo.local ? destInfo.local->to_string() : "?") << " - " + << static_cast(destInfo.type) << " - " + << (destInfo.area ? 
destInfo.area->to_string() : "") << logging::endl; + + throw CompilationError( + CompilationStep::NORMALIZER, "Unhandled case for handling memory copy", mem->to_string()); + } + + throw CompilationError(CompilationStep::NORMALIZER, "Need to be re-written", mem->to_string()); +} diff --git a/src/normalization/MemoryMappings.h b/src/normalization/MemoryMappings.h new file mode 100644 index 00000000..933f8793 --- /dev/null +++ b/src/normalization/MemoryMappings.h @@ -0,0 +1,77 @@ +/* + * Author: doe300 + * + * See the file "LICENSE" for the full license governing this code. + */ + +#ifndef VC4C_NORMALIZATION_MEMORY_MAPPING_H +#define VC4C_NORMALIZATION_MEMORY_MAPPING_H + +#include "AddressCalculation.h" + +namespace vc4c +{ + namespace normalization + { + /* + * Container for all the information required for a memory area to be mapped to any of the possible storage + * locations + */ + struct MemoryInfo + { + const Local* local; + // the type of how to lower the memory represented by the local + MemoryType type; + // the optional VPM area. If this is set, the memory is lowered to VPM + const periphery::VPMArea* area = nullptr; + // the optional access ranges. If set, the memory is located in RAM but cached in VPM + const Optional> ranges; + // the constant value or mapped register this local represents. + Optional mappedRegisterOrConstant = NO_VALUE; + // e.g. for arrays converted to vectors, this is the resulting vector type + Optional convertedRegisterType = {}; + // flags which TMU to be used for reading + bool tmuFlag = false; + }; + + using GroupedAccessRanges = + FastMap, const periphery::VPMArea*>>; + using MemoryAccessMap = OrderedMap; + using AccessRanges = OrderedMap, LocalUsageOrdering>; + + /* + * Basic algorithm to determine the preferred and fall-back (e.g. if access-types not supported by preferred) + * way of + * a) mapping the memory regions used by this method to the available "memory" (registers, VPM, RAM) and + * b) mapping the memory access types (read, write, copy, fill) to the available memory access types (TMU, VPM, + * etc.) + */ + std::pair> determineMemoryAccess(Method& method); + + /* + * Returns the constant value which will be read from the given memory access instruction. + * + * The value is constant if: + * - the source memory location is constant + * - the index is constant or the value can be determined without knowing the exact index (e.g. all elements are + * the same) + */ + Optional getConstantValue(const Value& source); + + /* + * Checks whether the memory location can be mapped to the preferred location specified in the MemoryAccess + * parameter. If so, required resources will be reserved. If not, the check will be performed with the fall-back + * storage location. + */ + MemoryInfo checkMemoryMapping(Method& method, const Local* baseAddr, MemoryAccess& access); + + /* + * Maps the given memory access instruction to hardware instructions according to the given source and + * destination information. 
+ */ + InstructionWalker mapMemoryAccess(Method& method, InstructionWalker it, intermediate::MemoryInstruction* mem, + const MemoryInfo& srcInfo, const MemoryInfo& destInfo); + } +} + +#endif /* VC4C_NORMALIZATION_MEMORY_MAPPING_H */ \ No newline at end of file diff --git a/src/normalization/Normalizer.cpp b/src/normalization/Normalizer.cpp index 7278543e..9df4519d 100644 --- a/src/normalization/Normalizer.cpp +++ b/src/normalization/Normalizer.cpp @@ -93,22 +93,25 @@ static void checkNormalized(Module& module, Method& method, InstructionWalker it // NOTE: The order is on purpose and must not be changed! const static std::vector> initialNormalizationSteps = { - // handles stack-allocations by calculating their offsets and indices - {"ResolveStackAllocations", resolveStackAllocation}, // intrinsifies calls to built-ins and unsupported operations {"Intrinsics", optimizations::intrinsify}, // replaces all remaining returns with jumps to the end of the kernel-function {"EliminateReturns", optimizations::eliminateReturn}, - // moves vector-containers to locals and re-directs all uses to the local - {"HandleLiteralVector", handleContainer}, - // maps access to global data to the offset in the code - {"MapGlobalDataToAddress", accessGlobalData}, // rewrites the use of literal values to either small-immediate values or loading of literals // this first run here is only required, so some loading of literals can be optimized, which is no longer possible // after the second run {"HandleImmediates", handleImmediate}, // propagates the instruction decoration whether values are work-group uniform - {"PropagateGroupUniformValues", propagateGroupUniforms}, + {"PropagateGroupUniformValues", propagateGroupUniforms}}; + +// these normalization steps are run after the memory access is converted +const static std::vector> initialNormalizationSteps2 = { + // handles stack-allocations by calculating their offsets and indices + {"ResolveStackAllocations", resolveStackAllocation}, + // maps access to global data to the offset in the code + {"MapGlobalDataToAddress", accessGlobalData}, + // moves vector-containers to locals and re-directs all uses to the local + {"HandleLiteralVector", handleContainer}, // dummy step which simply checks whether all remaining instructions are normalized {"CheckNormalized", checkNormalized}}; @@ -192,6 +195,15 @@ void Normalizer::normalizeMethod(Module& module, Method& method) const PROFILE_START(NormalizationPasses); + for(const auto& step : initialNormalizationSteps) + { + logging::debug() << logging::endl; + logging::debug() << "Running pass: " << step.first << logging::endl; + PROFILE_START_DYNAMIC(step.first); + runNormalizationStep(step.second, module, method, config); + PROFILE_END_DYNAMIC(step.first); + } + // maps all memory-accessing instructions to instructions actually performing the hardware memory-access // this step is called extra, because it needs to be run over all instructions logging::debug() << logging::endl; @@ -203,7 +215,7 @@ void Normalizer::normalizeMethod(Module& module, Method& method) const // calculate current/final stack offsets after lowering stack-accesses method.calculateStackOffsets(); - for(const auto& step : initialNormalizationSteps) + for(const auto& step : initialNormalizationSteps2) { logging::debug() << logging::endl; logging::debug() << "Running pass: " << step.first << logging::endl; diff --git a/src/normalization/sources.list b/src/normalization/sources.list index 8637a9f3..e3ed0b2a 100644 --- a/src/normalization/sources.list +++ 
b/src/normalization/sources.list @@ -1,11 +1,16 @@ target_sources(${VC4C_LIBRARY_NAME} PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/AddressCalculation.cpp + ${CMAKE_CURRENT_LIST_DIR}/AddressCalculation.h ${CMAKE_CURRENT_LIST_DIR}/Inliner.cpp ${CMAKE_CURRENT_LIST_DIR}/Inliner.h ${CMAKE_CURRENT_LIST_DIR}/LiteralValues.cpp ${CMAKE_CURRENT_LIST_DIR}/LiteralValues.h ${CMAKE_CURRENT_LIST_DIR}/MemoryAccess.cpp ${CMAKE_CURRENT_LIST_DIR}/MemoryAccess.h + ${CMAKE_CURRENT_LIST_DIR}/MemoryMapChecks.cpp + ${CMAKE_CURRENT_LIST_DIR}/MemoryMappings.cpp + ${CMAKE_CURRENT_LIST_DIR}/MemoryMappings.h ${CMAKE_CURRENT_LIST_DIR}/Normalizer.cpp ${CMAKE_CURRENT_LIST_DIR}/Normalizer.h ${CMAKE_CURRENT_LIST_DIR}/Rewrite.cpp diff --git a/src/optimization/Combiner.cpp b/src/optimization/Combiner.cpp index aa46f453..10919f3a 100644 --- a/src/optimization/Combiner.cpp +++ b/src/optimization/Combiner.cpp @@ -7,11 +7,8 @@ #include "Combiner.h" #include "../InstructionWalker.h" -#include "../analysis/ValueRange.h" #include "../intermediate/Helper.h" #include "../intermediate/operators.h" -#include "../periphery/VPM.h" -#include "Eliminator.h" #include "log.h" #include @@ -1073,453 +1070,3 @@ InstructionWalker optimizations::combineFlagWithOutput( } return it; } - -static bool isGroupUniform(const Local* local) -{ - auto writers = local->getUsers(LocalUse::Type::WRITER); - return std::all_of(writers.begin(), writers.end(), [](const LocalUser* instr) -> bool { - return instr->hasDecoration(InstructionDecorations::WORK_GROUP_UNIFORM_VALUE); - }); -} - -static bool isWorkGroupUniform(const Value& val) -{ - return val.hasImmediate() || val.hasLiteral() || - (val.hasLocal() && isGroupUniform(val.local())) - // XXX this is not true for the local ID UNIFORM - || (val.hasRegister(REG_UNIFORM)); -} - -static FastMap findDirectLevelAdditionInputs(const Value& val) -{ - FastMap result; - auto writer = val.getSingleWriter(); - if(writer == nullptr || writer->hasDecoration(InstructionDecorations::WORK_GROUP_UNIFORM_VALUE)) - { - // we have no need to split up work-group uniform values any more detailed - auto deco = writer ? writer->decoration : InstructionDecorations::NONE; - result.emplace(val, - add_flag(deco, - val.hasImmediate() || val.hasLiteral() ? 
InstructionDecorations::WORK_GROUP_UNIFORM_VALUE : - InstructionDecorations::NONE)); - if(val.hasImmediate() && val.immediate().getIntegerValue() >= 0) - result[val] = add_flag(result[val], InstructionDecorations::UNSIGNED_RESULT); - else if(val.hasLiteral() && val.literal().signedInt() >= 0) - result[val] = add_flag(result[val], InstructionDecorations::UNSIGNED_RESULT); - else if(val.hasRegister() && val.reg() == REG_UNIFORM) - // XXX this is not true for the local ID UNIFORM, which should never be checked here (since the actual ID - // needs always be extracted via non-adds) - result[val] = add_flag(result[val], InstructionDecorations::WORK_GROUP_UNIFORM_VALUE); - return result; - } - auto op = dynamic_cast(writer); - bool onlySideEffectIsReadingUniform = op && op->hasSideEffects() && !op->doesSetFlag() && - !op->signal.hasSideEffects() && - !(op->hasValueType(ValueType::REGISTER) && op->getOutput()->reg().hasSideEffectsOnWrite()) && - std::all_of(op->getArguments().begin(), op->getArguments().end(), [](const Value& arg) -> bool { - return !arg.hasRegister() || arg.reg() == REG_UNIFORM || !arg.reg().hasSideEffectsOnRead(); - }); - if(op && op->op == OP_ADD && !op->hasConditionalExecution() && - (!op->hasSideEffects() || onlySideEffectIsReadingUniform) && !op->hasPackMode() && !op->hasUnpackMode()) - { - FastMap args; - for(const auto& arg : op->getArguments()) - { - auto tmp = findDirectLevelAdditionInputs(arg); - args.insert(tmp.begin(), tmp.end()); - } - return args; - } - result.emplace(val, writer->decoration); - return result; -} - -// represents analysis data for the range of memory accessed per memory object -struct MemoryAccessRange -{ - const Local* memoryObject = nullptr; - // the instruction writing the address to VPR_ADDR or VPW_ADDR - InstructionWalker addressWrite{}; - // the instruction adding the offset to the base pointer, could be the same as addressWrite - InstructionWalker baseAddressAdd{}; - // the instruction converting the address offset from element offset to byte offset - Optional typeSizeShift{}; - // the work-group uniform parts of which the address offset is calculated from - FastMap groupUniformAddressParts{}; - // the dynamic parts of which the address offset is calculated from - FastMap dynamicAddressParts{}; - // the maximum range (in elements!) the memory is accessed in - analysis::IntegerRange offsetRange{0, 0}; - - std::string to_string() const - { - return (addressWrite->to_string() + - (addressWrite->writesRegister(REG_VPM_DMA_LOAD_ADDR) ? " - read " : " - write ")) + - (memoryObject->to_string() + - (groupUniformAddressParts.empty() ? " with" : " with work-group uniform offset and") + - " dynamic element range [") + - (std::to_string(offsetRange.minValue) + ", ") + (std::to_string(offsetRange.maxValue) + "]"); - } -}; - -struct LocalUsageOrdering -{ - bool operator()(const Local* l1, const Local* l2) const - { - // prefer more usages over less usages - // TODO is this the correct way to do this? E.g. is there one usage per memory access? - return l1->getUsers(LocalUse::Type::READER).size() > l2->getUsers(LocalUse::Type::READER).size() || l1 < l2; - } -}; - -using AccessRanges = OrderedMap, LocalUsageOrdering>; - -static AccessRanges determineAccessRanges(Method& method) -{ - // TODO if we cannot find an access range for a local, we cannot combine any other access ranges for this global! 
- AccessRanges result; - for(BasicBlock& block : method) - { - InstructionWalker it = block.walk(); - while(!it.isEndOfBlock()) - { - if(it.has() && (it->writesRegister(REG_VPM_DMA_LOAD_ADDR) || it->writesRegister(REG_VPM_DMA_STORE_ADDR))) - { - // 1. find writes to VPM DMA addresses with work-group uniform part in address values - if(std::any_of(it->getArguments().begin(), it->getArguments().end(), isWorkGroupUniform) || - it.has()) - { - if(it.has() && it->assertArgument(0).hasLocal() && - (it->assertArgument(0).local()->is() || it->assertArgument(0).local()->is())) - { - // direct write of address (e.g. all work items write to the same location - logging::debug() << "DMA address is directly set to a parameter/global address, cannot be " - "optimized by caching multiple accesses: " - << it->to_string() << logging::endl; - it.nextInBlock(); - continue; - } - MemoryAccessRange range; - range.addressWrite = it; - // if the instruction is a move, handle/skip it here, so the add with the shifted offset + - // base-pointer is found correctly - auto trackIt = it; - if(it.has() && it->assertArgument(0).getSingleWriter()) - { - auto walker = - it.getBasicBlock()->findWalkerForInstruction(it->assertArgument(0).getSingleWriter(), it); - if(!walker) - { - logging::debug() << "Unhandled case, address is calculated in a different basic-block: " - << it->to_string() << logging::endl; - it.nextInBlock(); - continue; - } - else - trackIt = walker.value(); - } - - auto variableArg = std::find_if_not( - trackIt->getArguments().begin(), trackIt->getArguments().end(), isWorkGroupUniform); - if(variableArg != trackIt->getArguments().end() && variableArg->getSingleWriter() != nullptr) - { - // 2. rewrite address so all work-group uniform parts are combined and all variable parts and - // added in the end - // TODO is this the correct criteria? We could also handle only base-pointer + local_id, for - // example - logging::debug() << "Found VPM DMA address write with work-group uniform operand: " - << it->to_string() << logging::endl; - Value varArg = *variableArg; - // 2.1 jump over final addition of base address if it is a parameter - if(trackIt.has() && trackIt.get()->op == OP_ADD) - { - const auto& arg0 = trackIt->assertArgument(0); - const auto& arg1 = trackIt->assertArgument(1); - if(arg0.hasLocal() && - (arg0.local()->is() || arg0.local()->is() || - arg0.local()->name == Method::GLOBAL_DATA_ADDRESS)) - { - range.memoryObject = arg0.local(); - varArg = arg1; - } - else if(arg1.hasLocal() && - (arg1.local()->is() || arg1.local()->is() || - arg1.local()->name == Method::GLOBAL_DATA_ADDRESS)) - { - range.memoryObject = arg1.local(); - varArg = arg0; - } - else if(arg0.hasRegister(REG_UNIFORM)) - { - // e.g. 
reading of uniform for parameter is replaced by reading uniform here (if - // parameter only used once) - range.memoryObject = trackIt->getOutput()->local()->getBase(true); - varArg = arg1; - } - else if(arg1.hasRegister(REG_UNIFORM)) - { - range.memoryObject = trackIt->getOutput()->local()->getBase(true); - varArg = arg0; - } - else - { - throw CompilationError(CompilationStep::OPTIMIZER, - "Unhandled case of memory access: ", trackIt->to_string()); - } - range.baseAddressAdd = trackIt; - } - else - { - logging::debug() - << "Cannot optimize further, since add of base-address and pointer was not found: " - << it->to_string() << logging::endl; - it.nextInBlock(); - continue; - } - auto writer = varArg.getSingleWriter(); - // 2.2 jump over shl (if any) and remember offset - if(dynamic_cast(writer) && - dynamic_cast(writer)->op == OP_SHL) - { - if(!writer->assertArgument(1).getLiteralValue() || - (1u << writer->assertArgument(1).getLiteralValue()->unsignedInt()) != - it->assertArgument(0).type.getElementType().getPhysicalWidth()) - { - // Abort, since the offset shifted does not match the type-width of the element type - logging::debug() - << "Cannot optimize further, since shift-offset does not match type size: " - << it->to_string() << " and " << writer->to_string() << logging::endl; - it.nextInBlock(); - continue; - } - range.typeSizeShift = trackIt.getBasicBlock()->findWalkerForInstruction(writer, trackIt); - varArg = writer->assertArgument(0); - // TODO is never read. Remove or use? - writer = varArg.getSingleWriter(); - } - // 2.3 collect all directly neighboring (and directly referenced) additions - // result is now: finalAdd + (sum(addedValues) << shiftFactor) - auto addressParts = findDirectLevelAdditionInputs(varArg); - if(addressParts.size() < 2) - { - // could not determine multiple inputs to add, abort - it.nextInBlock(); - continue; - } - // 2.4 calculate the maximum dynamic offset - for(const auto& val : addressParts) - { - if(!has_flag(val.second, InstructionDecorations::WORK_GROUP_UNIFORM_VALUE)) - { - range.dynamicAddressParts.emplace(val); - if(val.first.hasLocal()) - { - auto singleRange = analysis::ValueRange::getValueRange(val.first, &method); - range.offsetRange.minValue += singleRange.getIntRange()->minValue; - range.offsetRange.maxValue += singleRange.getIntRange()->maxValue; - } - else - throw CompilationError(CompilationStep::OPTIMIZER, - "Unhandled value for memory access offset", val.first.to_string()); - } - else - range.groupUniformAddressParts.emplace(val); - } - logging::debug() << range.to_string() << logging::endl; - result[range.memoryObject].emplace_back(range); - } - } - } - it.nextInBlock(); - } - } - return result; -} - -static Optional> combineAdditions( - Method& method, InstructionWalker referenceIt, FastMap& addedValues) -{ - Optional> prevResult; - auto valIt = addedValues.begin(); - while(valIt != addedValues.end()) - { - if(prevResult) - { - auto newResult = method.addNewLocal(prevResult->first.type); - auto newFlags = intersect_flags(prevResult->second, valIt->second); - referenceIt.emplace(new Operation(OP_ADD, newResult, prevResult->first, valIt->first)); - referenceIt->addDecorations(newFlags); - referenceIt.nextInBlock(); - prevResult = std::make_pair(newResult, newFlags); - } - else - prevResult = std::make_pair(valIt->first, valIt->second); - valIt = addedValues.erase(valIt); - } - return prevResult; -} - -static std::pair checkWorkGroupUniformParts( - FastAccessList& accessRanges) -{ - analysis::IntegerRange 
offsetRange{std::numeric_limits::max(), std::numeric_limits::min()}; - const auto& firstUniformAddresses = accessRanges.front().groupUniformAddressParts; - FastMap differingUniformParts; - bool allUniformPartsEqual = true; - for(auto& entry : accessRanges) - { - if(entry.groupUniformAddressParts != firstUniformAddresses) - { - allUniformPartsEqual = false; - for(const auto& pair : entry.groupUniformAddressParts) - { - if(firstUniformAddresses.find(pair.first) == firstUniformAddresses.end()) - differingUniformParts.emplace(pair); - } - for(const auto& pair : firstUniformAddresses) - if(entry.groupUniformAddressParts.find(pair.first) == entry.groupUniformAddressParts.end()) - differingUniformParts.emplace(pair); - } - offsetRange.minValue = std::min(offsetRange.minValue, entry.offsetRange.minValue); - offsetRange.maxValue = std::max(offsetRange.maxValue, entry.offsetRange.maxValue); - } - if(!allUniformPartsEqual) - { - if(std::all_of(differingUniformParts.begin(), differingUniformParts.end(), - [](const std::pair& part) -> bool { - return part.first.getLiteralValue().has_value(); - })) - { - // all work-group uniform values which differ between various accesses of the same local are literal - // values. We can use this knowledge to still allow caching the local, by converting the literals to - // dynamic offsets - for(auto& entry : accessRanges) - { - auto it = entry.groupUniformAddressParts.begin(); - while(it != entry.groupUniformAddressParts.end()) - { - if(differingUniformParts.find(it->first) != differingUniformParts.end()) - { - entry.offsetRange.minValue += it->first.getLiteralValue()->signedInt(); - entry.offsetRange.maxValue += it->first.getLiteralValue()->signedInt(); - entry.dynamicAddressParts.emplace(*it); - it = entry.groupUniformAddressParts.erase(it); - } - else - ++it; - } - } - return checkWorkGroupUniformParts(accessRanges); - } - else - return std::make_pair(false, analysis::IntegerRange{}); - } - return std::make_pair(true, offsetRange); -} - -static void rewriteIndexCalculation(Method& method, MemoryAccessRange& range) -{ - // 3. combine the additions so work-group uniform and non-uniform values are added - // separately - auto insertIt = range.typeSizeShift ? range.typeSizeShift.value() : range.baseAddressAdd; - auto firstVal = combineAdditions(method, insertIt, range.groupUniformAddressParts); - auto secondVal = combineAdditions(method, insertIt, range.dynamicAddressParts); - Optional> resultVal; - if(!range.groupUniformAddressParts.empty() || !range.dynamicAddressParts.empty()) - throw CompilationError(CompilationStep::OPTIMIZER, "Too many values remaining", - std::to_string(range.groupUniformAddressParts.size() + range.dynamicAddressParts.size())); - if(firstVal && secondVal) - { - // add work-group uniform and variable part - resultVal = std::make_pair( - method.addNewLocal(range.memoryObject->type), intersect_flags(firstVal->second, secondVal->second)); - insertIt.emplace(new Operation(OP_ADD, resultVal->first, firstVal->first, secondVal->first)); - insertIt->addDecorations(resultVal->second); - } - else if(firstVal) - resultVal = firstVal; - else if(secondVal) - resultVal = secondVal; - if(range.typeSizeShift) - (*range.typeSizeShift)->setArgument(0, resultVal->first); - else - // TODO replace index variable with new index variable - throw CompilationError( - CompilationStep::OPTIMIZER, "Not yet implemented, no shift in address calculation", range.to_string()); - - logging::debug() << "Rewrote address-calculation with indices " - << (firstVal ? 
(firstVal->first.to_string() + " (" + toString(firstVal->second) + ")") : "") - << " and " - << (secondVal ? (secondVal->first.to_string() + " (" + toString(secondVal->second) + ")") : "") - << logging::endl; -} - -bool optimizations::cacheWorkGroupDMAAccess(const Module& module, Method& method, const Configuration& config) -{ - auto memoryAccessRanges = determineAccessRanges(method); - for(auto& pair : memoryAccessRanges) - { - bool allUniformPartsEqual; - analysis::IntegerRange offsetRange; - std::tie(allUniformPartsEqual, offsetRange) = checkWorkGroupUniformParts(pair.second); - if(!allUniformPartsEqual) - { - logging::debug() << "Cannot cache memory location " << pair.first->to_string() - << " in VPM, since the work-group uniform parts of the address calculations differ, which " - "is not yet supported!" - << logging::endl; - continue; - } - if((offsetRange.maxValue - offsetRange.minValue) >= config.availableVPMSize || - (offsetRange.maxValue < offsetRange.minValue)) - { - // this also checks for any over/underflow when converting the range to unsigned int in the next steps - logging::debug() << "Cannot cache memory location " << pair.first->to_string() - << " in VPM, the accessed range is too big: [" << offsetRange.minValue << ", " - << offsetRange.maxValue << "]" << logging::endl; - continue; - } - logging::debug() << "Memory location " << pair.first->to_string() - << " is accessed via DMA in the dynamic range [" << offsetRange.minValue << ", " - << offsetRange.maxValue << "]" << logging::endl; - - auto accessedType = pair.first->type.toArrayType(static_cast( - offsetRange.maxValue - offsetRange.minValue + 1 /* bounds of range are inclusive! */)); - - // TODO the local is not correct, at least not if there is a work-group uniform offset - auto vpmArea = method.vpm->addArea(pair.first, accessedType, false); - if(vpmArea == nullptr) - { - logging::debug() << "Memory location " << pair.first->to_string() << " with dynamic access range [" - << offsetRange.minValue << ", " << offsetRange.maxValue - << "] cannot be cached in VPM, since it does not fit" << logging::endl; - continue; - } - - // TODO insert load memory area into VPM at start of kernel (after all the required offsets/indices are - // calculated) - // TODO calculate address from base address and work-group uniform parts - // TODO insert store VPM into memory area at end of kernel - // TODO rewrite memory accesses to only access the correct VPM area - - for(auto& entry : pair.second) - rewriteIndexCalculation(method, entry); - - // TODO now, combine access to memory with VPM access - // need to make sure, only 1 kernel accesses RAM/writes the configuration, how? - // -> need some lightweight synchronization (e.g. status value in VPM?? One kernel would need to - // poll!!) - // TODO if minValue != 0, need then to deduct it from the group-uniform address too! - // use base pointer as memory pointer (for read/write-back) and offset as VPM offset. maximum - // offset is the number of elements to copy/cache - - // TODO insert initial read from DMA, final write to DMA - // even for writes, need to read, since memory in between might be untouched? - - // TODO if it can be proven that all values in the range are guaranteed to be written (and not read before), - // we can skip the initial loading. This guarantee needs to hold across all work-items in a group! 
- } - - // XXX - return eliminateDeadCode(module, method, config); -} diff --git a/src/optimization/Combiner.h b/src/optimization/Combiner.h index d5f6f57e..50df797f 100644 --- a/src/optimization/Combiner.h +++ b/src/optimization/Combiner.h @@ -206,9 +206,6 @@ namespace vc4c */ InstructionWalker combineFlagWithOutput( const Module& module, Method& method, InstructionWalker it, const Configuration& config); - - // TODO documentation, TODO move somewhere else?! - bool cacheWorkGroupDMAAccess(const Module& module, Method& method, const Configuration& config); } // namespace optimizations } // namespace vc4c #endif /* COMBINER_H */ diff --git a/src/optimization/Optimizer.cpp b/src/optimization/Optimizer.cpp index ce42047a..8bfa2009 100644 --- a/src/optimization/Optimizer.cpp +++ b/src/optimization/Optimizer.cpp @@ -272,9 +272,6 @@ const std::vector<OptimizationPass> Optimizer::ALL_PASSES = { "combines loadings of the same constant value within a small range of a basic block", OptimizationType::FINAL), OptimizationPass("RemoveConstantLoadInLoops", "extract-loads-from-loops", removeConstantLoadInLoops, "move constant loads in (nested) loops outside the loops", OptimizationType::FINAL), - OptimizationPass("CacheAcrossWorkGroup", "work-group-cache", cacheWorkGroupDMAAccess, - "finds memory access across the work-group which can be cached in VPM to combine the DMA operation (WIP)", - OptimizationType::FINAL), OptimizationPass("InstructionScheduler", "schedule-instructions", reorderInstructions, "schedule instructions according to their dependencies within basic blocks (WIP, slow)", OptimizationType::FINAL), diff --git a/test/TestMemoryAccess.cpp b/test/TestMemoryAccess.cpp index 6efbb9a7..7b5361d1 100644 --- a/test/TestMemoryAccess.cpp +++ b/test/TestMemoryAccess.cpp @@ -360,6 +360,7 @@ void TestMemoryAccess::testVectorLoadStoreCharPrivate() void TestMemoryAccess::testVectorLoadStoreCharLocal() { + //FIXME manual test is correct! testPrivateLocalFunction(config, "-DTYPE=char -DSTORAGE=__local", std::bind(&TestMemoryAccess::onMismatch, this, std::placeholders::_1, std::placeholders::_2)); } @@ -375,6 +376,7 @@ void TestMemoryAccess::testVectorLoadStoreShortPrivate() } void TestMemoryAccess::testVectorLoadStoreShortLocal() { + //FIXME manual test is correct! testPrivateLocalFunction(config, "-DTYPE=short -DSTORAGE=__local", std::bind(&TestMemoryAccess::onMismatch, this, std::placeholders::_1, std::placeholders::_2)); } diff --git a/testing/local_private_storage.cl b/testing/local_private_storage.cl index 072de739..d912e36d 100644 --- a/testing/local_private_storage.cl +++ b/testing/local_private_storage.cl @@ -42,6 +42,7 @@ __constant uchar message[12] = "Hello World"; __kernel void test_constant_storage(__global uchar* out) { size_t gid = get_global_id(0); + // every kernel writes 1 single character out[gid] = message[gid]; } @@ -50,5 +51,6 @@ __kernel void test_register_storage(__global uchar* out) size_t gid = get_global_id(0); uchar4 pad = (uchar4)'\0'; uchar16 a = (uchar16)('H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', '\0', pad); + // every kernel writes 1 single character out[gid] = a[gid]; } From 016f870a48c6b092c52c1737ae5c574dc7be217a Mon Sep 17 00:00:00 2001 From: doe300 Date: Sat, 3 Nov 2018 12:03:47 +0100 Subject: [PATCH 3/6] Rewrites scratch VPM area to be per-QPU This change allows us to remove mutex locks from "direct" memory access.
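For illustration, a minimal standalone sketch of the per-QPU scratch addressing this patch introduces, mirroring the arithmetic of the addScratchOffset() helper added below; the function name and signature are hypothetical and not part of VC4C:

#include <cstdint>

// Each QPU addresses its own slice of the shared scratch area, so no mutex is needed
// for "direct" accesses. The QPU number is scaled by 32 / element-bit-width (4 for
// 8-bit, 2 for 16-bit, 1 for 32-bit elements) and added to the in-area element offset.
uint32_t scratchElementOffset(uint32_t inAreaOffset, uint32_t qpuNumber, uint32_t elementBitCount)
{
    return inAreaOffset + qpuNumber * (32u / elementBitCount);
}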
See #113 --- src/normalization/AddressCalculation.cpp | 1 + src/normalization/MemoryAccess.cpp | 13 ++- src/normalization/MemoryMappings.cpp | 8 +- src/periphery/VPM.cpp | 116 ++++++++++------------- src/periphery/VPM.h | 26 +++-- test/TestMemoryAccess.cpp | 2 - 6 files changed, 76 insertions(+), 90 deletions(-) diff --git a/src/normalization/AddressCalculation.cpp b/src/normalization/AddressCalculation.cpp index 081571e4..b889d655 100644 --- a/src/normalization/AddressCalculation.cpp +++ b/src/normalization/AddressCalculation.cpp @@ -131,6 +131,7 @@ MemoryType normalization::toMemoryType(periphery::VPMUsage usage) { case periphery::VPMUsage::SCRATCH: case periphery::VPMUsage::LOCAL_MEMORY: + case periphery::VPMUsage::MEMORY_CACHE: return MemoryType::VPM_SHARED_ACCESS; case periphery::VPMUsage::REGISTER_SPILLING: case periphery::VPMUsage::STACK: diff --git a/src/normalization/MemoryAccess.cpp b/src/normalization/MemoryAccess.cpp index d26a2600..5bfac7fb 100644 --- a/src/normalization/MemoryAccess.cpp +++ b/src/normalization/MemoryAccess.cpp @@ -256,7 +256,9 @@ static void groupVPMWrites(VPM& vpm, VPMAccessGroup& group) group.groupType.getElementType().getPhysicalWidth())); } std::size_t numRemoved = 0; - vpm.updateScratchSize(static_cast(group.addressWrites.size())); + // TODO don't actually combine reads and writes, only setups + // vpm.updateScratchSize(static_cast(group.addressWrites.size())); + throw CompilationError(CompilationStep::OPTIMIZER, "Has been broken by recent changes"); // 2. Remove all but the first generic and DMA setups for(std::size_t i = 1; i < group.genericSetups.size(); ++i) @@ -326,7 +328,9 @@ static void groupVPMReads(VPM& vpm, VPMAccessGroup& group) { VPRSetupWrapper dmaSetupValue(group.dmaSetups.at(0).get()); dmaSetupValue.dmaSetup.setNumberRows(group.genericSetups.size() % 16); - vpm.updateScratchSize(static_cast(group.genericSetups.size())); + // TODO don't actually combine reads and writes, only setups + // vpm.updateScratchSize(static_cast(group.genericSetups.size())); + throw CompilationError(CompilationStep::OPTIMIZER, "Has been broken by recent changes"); // TODO can be space-optimized, half-words and bytes can be packed into single row (VPM writes too) } std::size_t numRemoved = 0; @@ -678,11 +682,6 @@ void normalization::mapMemoryAccess(const Module& module, Method& method, const for(auto& mapping : memoryMapping) infos.emplace(mapping.first, checkMemoryMapping(method, mapping.first, mapping.second)); - // After we fixed all the VPM areas used for specific purposes, we can check how big of a scratch size we need - // TODO rewrite scratch area to per-QPU? To not need mutex lock! - // Would need size of per QPU scratch are before mapping any instruction, should be possible with new - // check-all-first-map-then flow - // TODO sort locals by where to put them and then call 1. check of mapping and 2. mapping on all for(auto& memIt : memoryInstructions) { diff --git a/src/normalization/MemoryMappings.cpp b/src/normalization/MemoryMappings.cpp index 0dd70746..274a82fb 100644 --- a/src/normalization/MemoryMappings.cpp +++ b/src/normalization/MemoryMappings.cpp @@ -403,8 +403,7 @@ static InstructionWalker accessMemoryInRAMViaVPM( } case MemoryOperation::READ: { - it = periphery::insertReadDMA( - method, it, mem->getDestination(), mem->getSource(), true /* need to lock mutex for shared scratch area */); + it = periphery::insertReadDMA(method, it, mem->getDestination(), mem->getSource()); auto* src = mem->getSource().hasLocal() ? 
mem->getSource().local()->getBase(true) : nullptr; if(src && src->is()) const_cast(src->as())->decorations = @@ -413,8 +412,7 @@ static InstructionWalker accessMemoryInRAMViaVPM( } case MemoryOperation::WRITE: { - it = periphery::insertWriteDMA( - method, it, mem->getSource(), mem->getDestination(), true /* need to lock mutex for shared scratch area */); + it = periphery::insertWriteDMA(method, it, mem->getSource(), mem->getDestination()); auto* dest = mem->getDestination().hasLocal() ? mem->getDestination().local()->getBase(true) : nullptr; if(dest && dest->is()) const_cast(dest->as())->decorations = @@ -506,7 +504,7 @@ static InstructionWalker mapMemoryCopy( logging::debug() << "Mapping copy from RAM into RAM to DMA read and DMA write: " << mem->to_string() << logging::endl; it = method.vpm->insertCopyRAM( - method, it, mem->getDestination(), mem->getSource(), static_cast(numBytes)); + method, it, mem->getDestination(), mem->getSource(), static_cast(numBytes), nullptr); return it.erase(); } else if(destInRegister && destInfo.convertedRegisterType) diff --git a/src/periphery/VPM.cpp b/src/periphery/VPM.cpp index 533f21b8..ccb6430e 100644 --- a/src/periphery/VPM.cpp +++ b/src/periphery/VPM.cpp @@ -379,8 +379,23 @@ static NODISCARD InstructionWalker calculateElementOffset( // e.g. 32-bit type, 4 byte offset -> 1 32-bit element offset // e.g. byte4 type, 4 byte offset -> 1 byte element offset // e.g. half-word8 type, 32 byte offset -> 2 half-word element offset - elementOffset = assign(it, TYPE_INT16, "%vpm_element_offset") = - inAreaOffset / Literal(elementType.getPhysicalWidth()); + if(inAreaOffset == INT_ZERO) + elementOffset = INT_ZERO; + else + elementOffset = assign(it, TYPE_INT16, "%vpm_element_offset") = + inAreaOffset / Literal(elementType.getPhysicalWidth()); + return it; +} + +static InstructionWalker addScratchOffset( + Method& method, InstructionWalker it, const Value& inAreaOffset, Value& scratchOffset, const DataType& elementType) +{ + // -> 8-bit element: add #QPU << 2 + // -> 16-bit element: add #QPU << 1 + // -> 32-bit element: add #QPU + auto tmp = assign(it, TYPE_INT8, "%scratch_offset") = + Value(REG_QPU_NUMBER, TYPE_INT8) * Literal(TYPE_INT32.getScalarBitCount() / elementType.getScalarBitCount()); + scratchOffset = assign(it, TYPE_INT32, "%scratch_offset") = inAreaOffset + tmp; return it; } @@ -390,15 +405,12 @@ InstructionWalker VPM::insertReadVPM(Method& method, InstructionWalker it, const const DataType vpmStorageType = getVPMStorageType(dest.type); if(area != nullptr) area->checkAreaSize(vpmStorageType.getPhysicalWidth()); - else - // a single vector can only use a maximum of 1 row - updateScratchSize(1); it = insertLockMutex(it, useMutex); // 1) configure reading from VPM into QPU const VPMArea& realArea = area != nullptr ? *area : getScratchArea(); const VPRSetup genericSetup(realArea.toReadSetup(dest.type)); - if(inAreaOffset == INT_ZERO) + if(inAreaOffset == INT_ZERO && area != nullptr) { it.emplace(new LoadImmediate(VPM_IN_SETUP_REGISTER, Literal(genericSetup.value))); it->addDecorations(InstructionDecorations::VPM_READ_CONFIGURATION); @@ -411,6 +423,8 @@ InstructionWalker VPM::insertReadVPM(Method& method, InstructionWalker it, const // 1) convert offset in bytes to offset in elements (!! VPM stores vector-size of 16!!) 
Value elementOffset = UNDEFINED_VALUE; it = calculateElementOffset(method, it, dest.type, inAreaOffset, elementOffset); + if(area == nullptr) + it = addScratchOffset(method, it, elementOffset, elementOffset, dest.type); // 2) dynamically calculate new VPM address from base and offset (add offset to setup-value) // 3) write setup with dynamic address assign(it, VPM_IN_SETUP_REGISTER) = (Value(Literal(genericSetup.value), TYPE_INT32) + elementOffset, @@ -428,15 +442,12 @@ InstructionWalker VPM::insertWriteVPM(Method& method, InstructionWalker it, cons const DataType vpmStorageType = getVPMStorageType(src.type); if(area != nullptr) area->checkAreaSize(vpmStorageType.getPhysicalWidth()); - else - // a single vector can only use a maximum of 1 row - updateScratchSize(1); it = insertLockMutex(it, useMutex); // 1. configure writing from QPU into VPM const VPMArea& realArea = area != nullptr ? *area : getScratchArea(); const VPWSetup genericSetup(realArea.toWriteSetup(src.type)); - if(inAreaOffset == INT_ZERO) + if(inAreaOffset == INT_ZERO && area != nullptr) { it.emplace(new LoadImmediate(VPM_OUT_SETUP_REGISTER, Literal(genericSetup.value))); it->addDecorations(InstructionDecorations::VPM_WRITE_CONFIGURATION); @@ -449,6 +460,8 @@ InstructionWalker VPM::insertWriteVPM(Method& method, InstructionWalker it, cons // 1) convert offset in bytes to offset in elements (!! VPM stores vector-size of 16!!) Value elementOffset = UNDEFINED_VALUE; it = calculateElementOffset(method, it, src.type, inAreaOffset, elementOffset); + if(area == nullptr) + it = addScratchOffset(method, it, elementOffset, elementOffset, src.type); // 2) dynamically calculate new VPM address from base and offset (add offset to setup-value) // 3) write setup with dynamic address assign(it, VPM_OUT_SETUP_REGISTER) = (Value(Literal(genericSetup.value), TYPE_INT32) + elementOffset, @@ -465,9 +478,6 @@ InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const { if(area != nullptr) area->checkAreaSize(getVPMStorageType(type).getPhysicalWidth()); - else - // a single vector can only use a maximum of 1 row - updateScratchSize(1); if(memoryAddress.hasLocal() && memoryAddress.local() != nullptr) { @@ -494,13 +504,16 @@ InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const const VPMArea& realArea = area != nullptr ? *area : getScratchArea(); const VPRSetup dmaSetup(realArea.toReadDMASetup(type, static_cast(rowCount))); Value dmaSetupBits(Literal(dmaSetup.value), TYPE_INT32); - if(inAreaOffset != INT_ZERO) + if(inAreaOffset != INT_ZERO || area == nullptr) { // this is the offset in byte -> calculate the offset in elements of destination-type // 1) convert offset in bytes to offset in elements (!! VPM stores vector-size of 16!!) 
Value elementOffset = UNDEFINED_VALUE; it = calculateElementOffset(method, it, memoryAddress.type.getElementType(), inAreaOffset, elementOffset); + if(area == nullptr) + it = addScratchOffset( + method, it, elementOffset, elementOffset, TYPE_INT32 /* we always address whole words here */); // 2) dynamically calculate new VPM address from base and offset (add offset to setup-value) if(!realArea.canBePackedIntoRow()) // need to modify offset to point to next row, not next element in same row @@ -546,9 +559,6 @@ InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, cons { if(area != nullptr) area->checkAreaSize(getVPMStorageType(type).getPhysicalWidth()); - else - // a single vector can only use a maximum of 1 row - updateScratchSize(1); // TODO is the calculation of the size to copy correct? We are mixing different types (e.g. byte from memory // instruction, consecutive memory area) with type for VPM area (rows which might not be filled completely). Same @@ -577,7 +587,7 @@ InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, cons const VPMArea& realArea = area != nullptr ? *area : getScratchArea(); const VPWSetup dmaSetup(realArea.toWriteDMASetup(type, static_cast(rowCount))); Value dmaSetupBits(Literal(dmaSetup.value), TYPE_INT32); - if(inAreaOffset != INT_ZERO) + if(inAreaOffset != INT_ZERO || area == nullptr) { // this is the offset in byte -> calculate the offset in elements of destination-type @@ -585,6 +595,9 @@ InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, cons Value elementOffset = UNDEFINED_VALUE; it = calculateElementOffset( method, it, memoryAddress.type.getElementType().getElementType(), inAreaOffset, elementOffset); + if(area == nullptr) + it = addScratchOffset( + method, it, elementOffset, elementOffset, TYPE_INT32 /* we always address whole words here */); // 2) dynamically calculate new VPM address from base and offset (shift and add offset to setup-value) if(!realArea.canBePackedIntoRow()) // need to modify offset to point to next row, not next element in same row @@ -631,15 +644,9 @@ InstructionWalker VPM::insertWriteRAM(Method& method, InstructionWalker it, cons InstructionWalker VPM::insertCopyRAM(Method& method, InstructionWalker it, const Value& destAddress, const Value& srcAddress, const unsigned numBytes, const VPMArea* area, bool useMutex) { - // TODO copying from/to RAM can use VPM area not accessible from QPU!! - // With area per QPU, so they can copy unsynchronized - // TODO test on py-videocore beforehand that access of upper VPM area works! 
- const auto size = getBestVectorSize(numBytes); if(area != nullptr) area->checkAreaSize(size.first.getPhysicalWidth()); - else - updateScratchSize(1); it = insertLockMutex(it, useMutex); @@ -670,8 +677,6 @@ InstructionWalker VPM::insertFillRAM(Method& method, InstructionWalker it, const if(area != nullptr) area->checkAreaSize(type.getPhysicalWidth()); - else - updateScratchSize(1); it = insertLockMutex(it, useMutex); it = insertWriteRAM(method, it, memoryAddress, type, area, false); @@ -712,6 +717,7 @@ DataType VPMArea::getElementType() const // is not known return TYPE_UNKNOWN; case VPMUsage::LOCAL_MEMORY: + case VPMUsage::MEMORY_CACHE: // element-type of local assigned to this area return originalAddress->type.getElementType(); case VPMUsage::REGISTER_SPILLING: @@ -740,7 +746,8 @@ uint8_t VPMArea::getElementsInRow(const DataType& elementType) const bool VPMArea::canBeAccessedViaDMA() const { - return usageType == VPMUsage::SCRATCH || getElementType().getVectorWidth() == NATIVE_VECTOR_SIZE; + return usageType == VPMUsage::SCRATCH || usageType == VPMUsage::MEMORY_CACHE || usageType == VPMUsage::COPY_CACHE || + getElementType().getVectorWidth() == NATIVE_VECTOR_SIZE; } bool VPMArea::canBePackedIntoRow() const @@ -853,13 +860,17 @@ static std::string toUsageString(VPMUsage usage, const Local* local) switch(usage) { case VPMUsage::LOCAL_MEMORY: - return (local ? local->to_string() : "(nullptr)"); + return (local ? local->to_string() : "(nullptr)") + " (lowered)"; case VPMUsage::REGISTER_SPILLING: return "register spilling"; case VPMUsage::SCRATCH: return "scratch area"; case VPMUsage::STACK: return "stack" + (local ? " " + local->to_string() : ""); + case VPMUsage::MEMORY_CACHE: + return (local ? local->to_string() : "(nullptr)") + " (cache)"; + case VPMUsage::COPY_CACHE: + return "copy cache"; } throw CompilationError( CompilationStep::GENERAL, "Unhandled VPM usage type", std::to_string(static_cast(usage))); @@ -875,7 +886,11 @@ VPM::VPM(const unsigned totalVPMSize) : maximumVPMSize(std::min(VPM_DEFAULT_SIZE { // set a size of at least 1 row, so if no scratch is used, the first area has an offset of != 0 and therefore is // different than the scratch-area - areas[0] = std::make_shared(VPMArea{VPMUsage::SCRATCH, 0, 1, nullptr}); + auto scratch = std::make_shared(VPMArea{VPMUsage::SCRATCH, 0, NUM_QPUS, nullptr}); + for(unsigned i = 0; i < NUM_QPUS; ++i) + areas[i] = scratch; + PROFILE_COUNTER( + vc4c::profiler::COUNTER_GENERAL + 90, "VPM cache size", NUM_QPUS * VPM_NUM_COLUMNS * VPM_WORD_WIDTH); } const VPMArea& VPM::getScratchArea() const @@ -931,7 +946,7 @@ const VPMArea* VPM::addArea(const Local* local, const DataType& elementType, boo // no more (big enough) free space on VPM return nullptr; - // for now align all new VPM areas at the beginning of a column + // for now align all new VPM areas at the beginning of a row auto ptr = std::make_shared(VPMArea{isStackArea ? 
VPMUsage::STACK : VPMUsage::LOCAL_MEMORY, static_cast(rowOffset.value()), numRows, local}); for(auto i = rowOffset.value(); i < (rowOffset.value() + numRows); ++i) @@ -960,26 +975,6 @@ unsigned VPM::getMaxCacheVectors(const DataType& type, bool writeAccess) const return std::min(std::min(15u, (maximumVPMSize / 16) / (type.getScalarBitCount() / 8)), numFreeRows); } -void VPM::updateScratchSize(unsigned char requestedRows) -{ - if(requestedRows > VPM_NUM_ROWS) - throw CompilationError(CompilationStep::GENERAL, - "The requested size of the scratch area exceeds the total VPM size", std::to_string(requestedRows)); - if(getMaxCacheVectors(TYPE_INT32, true) < requestedRows) - throw CompilationError(CompilationStep::GENERAL, - "The requested size of the scratch area exceeds the available VPM size", std::to_string(requestedRows)); - - if(getScratchArea().numRows < requestedRows) - { - logging::debug() << "Increased the scratch size to " << requestedRows << " rows (" << requestedRows * 64 - << " bytes)" << logging::endl; - const_cast(getScratchArea().numRows) = requestedRows; - // fill areas with scratch - for(unsigned i = 1; i < requestedRows; ++i) - areas[i] = areas[0]; - } -} - InstructionWalker VPM::insertLockMutex(InstructionWalker it, bool useMutex) const { if(useMutex) @@ -1125,7 +1120,7 @@ DataType VPM::getVPMStorageType(const DataType& type) static void writeArea(std::wostream& s, const std::string& name, unsigned width) { auto sub = name.substr(0, width - 1) + "|"; - s << std::setw(width) << sub; + s << std::setw(static_cast(width)) << sub; } void VPM::dumpUsage() const @@ -1157,23 +1152,8 @@ void VPM::dumpUsage() const numEmpty = 0; } lastArea = area; - std::string name; - switch(area->usageType) - { - case VPMUsage::SCRATCH: - name = "scratch"; - break; - case VPMUsage::LOCAL_MEMORY: - name = area->originalAddress ? area->originalAddress->name : "local"; - break; - case VPMUsage::REGISTER_SPILLING: - name = "spilling"; - break; - case VPMUsage::STACK: - name = "stack"; - break; - } - writeArea(stream, name, (area->numRows * outputWidth) / VPM_NUM_ROWS); + writeArea(stream, toUsageString(area->usageType, area->originalAddress), + (area->numRows * outputWidth) / VPM_NUM_ROWS); } if(numEmpty > 0) writeArea(stream, "", (numEmpty * outputWidth) / VPM_NUM_ROWS); diff --git a/src/periphery/VPM.h b/src/periphery/VPM.h index eea014c7..a32d4096 100644 --- a/src/periphery/VPM.h +++ b/src/periphery/VPM.h @@ -610,13 +610,22 @@ namespace vc4c * The area of the VPM to be used as cache for general DMA access. * * NOTE: - * This area needs to always be at offset 0, its size is calculated to match the required scratch size + * This area needs to always be at offset 0, its size is calculated to match the required scratch size. + * + * NOTE: + * Every QPU has its own scratch area */ SCRATCH, /* * This area is used as storage for local memory areas which fit into VPM. */ LOCAL_MEMORY, + /* + * This area is used to cache a memory area in RAM. This is behavior similar to L1/L2/etc. caches for CPUs. + * The memory will be initially loaded into the cache area and written back at the end of the kernel + * execution. + */ + MEMORY_CACHE, /* * This area is used to spill registers into. * @@ -630,7 +639,14 @@ namespace vc4c * NOTE: * Its size needs include the spilled locals for all available QPUs! */ - STACK + STACK, + /* + * This area is used for storing temporary data for copying from and to RAM. + * + * NOTE: + * These VPM areas CANNOT be accessed from QPU!! 
+ */ + COPY_CACHE }; /* @@ -821,12 +837,6 @@ namespace vc4c NODISCARD InstructionWalker insertFillRAM(Method& method, InstructionWalker it, const Value& memoryAddress, const DataType& type, unsigned numCopies, const VPMArea* area = nullptr, bool useMutex = false); - /* - * Updates the maximum size used by the scratch area. - * This can only be called until the scratch-area is locked! - */ - void updateScratchSize(unsigned char requestedRows); - /* * Since we can only access the VPM from QPU-side in vectors of 16 elements, * the type needs to be converted to a type with all element-types set to 16-element vectors diff --git a/test/TestMemoryAccess.cpp b/test/TestMemoryAccess.cpp index 7b5361d1..6efbb9a7 100644 --- a/test/TestMemoryAccess.cpp +++ b/test/TestMemoryAccess.cpp @@ -360,7 +360,6 @@ void TestMemoryAccess::testVectorLoadStoreCharPrivate() void TestMemoryAccess::testVectorLoadStoreCharLocal() { - //FIXME manual test is correct! testPrivateLocalFunction(config, "-DTYPE=char -DSTORAGE=__local", std::bind(&TestMemoryAccess::onMismatch, this, std::placeholders::_1, std::placeholders::_2)); } @@ -376,7 +375,6 @@ void TestMemoryAccess::testVectorLoadStoreShortPrivate() } void TestMemoryAccess::testVectorLoadStoreShortLocal() { - //FIXME manual test is correct! testPrivateLocalFunction(config, "-DTYPE=short -DSTORAGE=__local", std::bind(&TestMemoryAccess::onMismatch, this, std::placeholders::_1, std::placeholders::_2)); } From c873760615f819360396ea80561f2fe37a933f49 Mon Sep 17 00:00:00 2001 From: doe300 Date: Sat, 3 Nov 2018 12:08:20 +0100 Subject: [PATCH 4/6] Various fixes and improvements * fixes memory association for some special cases * fixes elimination of moves for VPM usage without mutex Effects (test-emulator, last 2 commits): Instructions: 45779 to 52510 (+14%) Cycles: 661193 to 644891 (-2%) Mutex waits: 281459 to 0 Total time (in ms): ~57192 to 57232 (+-0%) --- src/optimization/Combiner.cpp | 2 ++ src/optimization/Eliminator.cpp | 21 ++++++++++++++------- src/periphery/VPM.cpp | 8 +++++++- src/tools/Emulator.cpp | 3 +++ 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/optimization/Combiner.cpp b/src/optimization/Combiner.cpp index 10919f3a..b021ddb7 100644 --- a/src/optimization/Combiner.cpp +++ b/src/optimization/Combiner.cpp @@ -670,6 +670,8 @@ static bool canReplaceConstantLoad( bool optimizations::combineLoadingConstants(const Module& module, Method& method, const Configuration& config) { + // TODO extend/add new optimization combining constant registers (e.g. element number, qpu number, e.g. for + // TestMemoryAccess#GLOBAL_MEMORY_FUNCTION) or leave for common subexpression elimination? std::size_t threshold = config.additionalOptions.combineLoadThreshold; bool hasChanged = false; diff --git a/src/optimization/Eliminator.cpp b/src/optimization/Eliminator.cpp index 1d64249d..680e5e88 100644 --- a/src/optimization/Eliminator.cpp +++ b/src/optimization/Eliminator.cpp @@ -142,14 +142,18 @@ InstructionWalker optimizations::simplifyOperation( Optional leftAbsorbing = OpCode::getLeftAbsorbingElement(op->op); // one of the operands is the absorbing element, operation can be replaced with move - if(leftAbsorbing && firstArg.hasLiteral(leftAbsorbing->getLiteralValue().value())) + // TODO the hasLiteral() checks seem to sometimes not pass while the simple comparisons do. 
Check for "all" + // cases then remove them + if(leftAbsorbing && + (firstArg.hasLiteral(leftAbsorbing->getLiteralValue().value()) || firstArg == leftAbsorbing)) { logging::debug() << "Replacing obsolete " << op->to_string() << " with move 1" << logging::endl; it.reset((new intermediate::MoveOperation( op->getOutput().value(), leftAbsorbing.value(), op->conditional, op->setFlags)) ->addDecorations(it->decoration)); } - else if(rightAbsorbing && secondArg && secondArg->hasLiteral(rightAbsorbing->getLiteralValue().value())) + else if(rightAbsorbing && secondArg && + (secondArg->hasLiteral(rightAbsorbing->getLiteralValue().value()) || *secondArg == rightAbsorbing)) { logging::debug() << "Replacing obsolete " << op->to_string() << " with move 2" << logging::endl; it.reset((new intermediate::MoveOperation( @@ -160,7 +164,8 @@ InstructionWalker optimizations::simplifyOperation( else if(op->getOutput() && op->getOutput().value() == op->getFirstArg()) { // check whether second-arg exists and does nothing - if(rightIdentity && secondArg && secondArg->hasLiteral(rightIdentity->getLiteralValue().value())) + if(rightIdentity && secondArg && + (secondArg->hasLiteral(rightIdentity->getLiteralValue().value()) || *secondArg == rightIdentity)) { logging::debug() << "Removing obsolete " << op->to_string() << logging::endl; it.erase(); @@ -178,7 +183,8 @@ InstructionWalker optimizations::simplifyOperation( else if(op->getOutput() && op->getSecondArg() && op->getOutput().value() == op->assertArgument(1)) { // check whether first-arg does nothing - if(leftIdentity && firstArg.hasLiteral(leftIdentity->getLiteralValue().value())) + if(leftIdentity && + (firstArg.hasLiteral(leftIdentity->getLiteralValue().value()) || firstArg == leftIdentity)) { logging::debug() << "Removing obsolete " << op->to_string() << logging::endl; it.erase(); @@ -197,7 +203,8 @@ InstructionWalker optimizations::simplifyOperation( else // writes to another local -> can be replaced with move { // check whether second argument exists and does nothing - if(rightIdentity && secondArg && secondArg->hasLiteral(rightIdentity->getLiteralValue().value())) + if(rightIdentity && secondArg && + (secondArg->hasLiteral(rightIdentity->getLiteralValue().value()) || *secondArg == rightIdentity)) { logging::debug() << "Replacing obsolete " << op->to_string() << " with move 3" << logging::endl; it.reset((new intermediate::MoveOperation( @@ -205,7 +212,8 @@ InstructionWalker optimizations::simplifyOperation( ->addDecorations(it->decoration)); } // check whether first argument does nothing - else if(leftIdentity && secondArg && firstArg.hasLiteral(leftIdentity->getLiteralValue().value())) + else if(leftIdentity && op->getArgument(1) && + (firstArg.hasLiteral(leftIdentity->getLiteralValue().value()) || firstArg == leftIdentity)) { logging::debug() << "Replacing obsolete " << op->to_string() << " with move 4" << logging::endl; it.reset((new intermediate::MoveOperation( @@ -216,7 +224,6 @@ InstructionWalker optimizations::simplifyOperation( else if(op->op.isIdempotent() && secondArg && secondArg.value() == firstArg && !firstArg.hasRegister() && !firstArg.isUndefined()) { - logging::debug() << secondArg.value().to_string() << " - " << firstArg.to_string() << logging::endl; logging::debug() << "Replacing obsolete " << op->to_string() << " with move 5" << logging::endl; it.reset((new intermediate::MoveOperation( op->getOutput().value(), op->assertArgument(1), op->conditional, op->setFlags)) diff --git a/src/periphery/VPM.cpp b/src/periphery/VPM.cpp index 
ccb6430e..f93bb2c0 100644 --- a/src/periphery/VPM.cpp +++ b/src/periphery/VPM.cpp @@ -393,8 +393,14 @@ static InstructionWalker addScratchOffset( // -> 8-bit element: add #QPU << 2 // -> 16-bit element: add #QPU << 1 // -> 32-bit element: add #QPU + // we introduce a temporary here, since REG_QPU_NUMBER cannot be combined with shifting by literal anyway. This way, + // we allow other optimizations to process the instructions. Otherwise, the adjustment would split them after all + // optimizations have run + auto elemNum = method.addNewLocal(TYPE_INT8, "%qpu_number"); + it.emplace(new MoveOperation(elemNum, Value(REG_QPU_NUMBER, TYPE_INT8))); + it.nextInBlock(); auto tmp = assign(it, TYPE_INT8, "%scratch_offset") = - Value(REG_QPU_NUMBER, TYPE_INT8) * Literal(TYPE_INT32.getScalarBitCount() / elementType.getScalarBitCount()); + elemNum * Literal(TYPE_INT32.getScalarBitCount() / elementType.getScalarBitCount()); scratchOffset = assign(it, TYPE_INT32, "%scratch_offset") = inAreaOffset + tmp; return it; } diff --git a/src/tools/Emulator.cpp b/src/tools/Emulator.cpp index 5bf4536b..f4d86234 100644 --- a/src/tools/Emulator.cpp +++ b/src/tools/Emulator.cpp @@ -474,6 +474,7 @@ std::pair TMUs::readTMU() { PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 69, "TMU1 read", 1); } + logging::debug() << "Reading from TMU: " << front.first.to_string(true, true) << logging::endl; return std::make_pair(front.first, true); } @@ -585,6 +586,7 @@ Value TMUs::readMemoryAddress(const Value& address) const else res.container().elements.push_back(memory.readWord(element.getLiteralValue()->toImmediate())); } + // XXX for cosmetic/correctness, this should print the rounded-down (to word boundaries) addresses logging::debug() << "Reading via TMU from memory address " << address.to_string(false, true) << ": " << res.to_string(false, true) << logging::endl; return res; } @@ -605,6 +607,7 @@ Value SFU::readSFU() const Value val = sfuResult.value(); sfuResult = NO_VALUE; PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 70, "SFU read", 1); + logging::debug() << "Reading from SFU: " << val.to_string(true, true) << logging::endl; return val; } From 7bad8c02a310c010cae797988ac5fdac2a8ccdfe Mon Sep 17 00:00:00 2001 From: doe300 Date: Tue, 6 Nov 2018 18:29:32 +0100 Subject: [PATCH 5/6] Re-adds simple version of combining VPM setups This version will only combine writing of the same setup values, where possible. The full version is also removed, since it will become obsolete with VPM cached memory anyway (see #113).
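For illustration, a minimal, self-contained sketch of the basic idea behind this simple version, using a toy instruction model rather than VC4C's IR and ignoring the additional legality checks the real pass has to perform (intervening VPM accesses, auto-incrementing addresses, mutex boundaries); all names are hypothetical:

#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <vector>

// Toy model of an instruction writing a (possibly constant) value to some setup register
struct Instr
{
    std::string outputRegister;      // e.g. a VPM read/write setup register
    std::optional<uint32_t> literal; // constant setup value being written, if known
};

// Drop a setup write if the identical constant was already written to the same setup
// register and no other write to that register happened in between.
std::vector<Instr> combineIdenticalSetups(const std::vector<Instr>& block)
{
    std::map<std::string, uint32_t> lastKnownSetup;
    std::vector<Instr> result;
    for(const auto& instr : block)
    {
        auto known = lastKnownSetup.find(instr.outputRegister);
        if(instr.literal && known != lastKnownSetup.end() && known->second == *instr.literal)
            continue; // redundant re-write of the same setup value
        if(instr.literal)
            lastKnownSetup[instr.outputRegister] = *instr.literal;
        else
            lastKnownSetup.erase(instr.outputRegister); // dynamic value, no longer known
        result.push_back(instr);
    }
    return result;
}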
Effects (test-emulator): Instructions: 52511 to 49793 (-5%) Cycles: 644891 to 641680 (-0.5%) Total time (in ms): 62869 to 58456 (-7%) --- src/normalization/AddressCalculation.cpp | 109 ------ src/normalization/AddressCalculation.h | 15 - src/normalization/MemoryAccess.cpp | 425 ----------------------- src/optimization/Combiner.cpp | 117 ++++++- src/optimization/Combiner.h | 9 + src/optimization/Optimizer.cpp | 3 + src/periphery/VPM.h | 12 +- 7 files changed, 133 insertions(+), 557 deletions(-) diff --git a/src/normalization/AddressCalculation.cpp b/src/normalization/AddressCalculation.cpp index b889d655..7d4ff472 100644 --- a/src/normalization/AddressCalculation.cpp +++ b/src/normalization/AddressCalculation.cpp @@ -16,115 +16,6 @@ using namespace vc4c::intermediate; using namespace vc4c::normalization; using namespace vc4c::operators; -static BaseAndOffset findOffset(const Value& val) -{ - if(!val.hasLocal()) - return BaseAndOffset(); - const LocalUser* writer = val.getSingleWriter(); - if(writer != nullptr) - { - const Optional offset = writer->precalculate(8); - if(offset && offset->isLiteralValue()) - { - return BaseAndOffset(NO_VALUE, offset->getLiteralValue()->signedInt()); - } - } - return BaseAndOffset(); -} - -BaseAndOffset normalization::findBaseAndOffset(const Value& val) -{ - // TODO add support for offsets via getlocal/global_id, etc. - // need to the set base to addr + offset and the offset to the offset of the offset (e.g. param[get_local_id(0) + - // 7]) but how to determine? - if(!val.hasLocal()) - return BaseAndOffset(); - if(val.local()->is() || val.local()->is() || val.local()->is()) - return BaseAndOffset(val, 0); - - // follow the references - const Local* ref = val.local()->getBase(false); - if(ref != val.local()) - return findBaseAndOffset(ref->createReference()); - if(val.local()->reference.first != nullptr && val.local()->reference.second != ANY_ELEMENT) - return BaseAndOffset(val.local()->reference.first->createReference(), val.local()->reference.second); - - const auto writers = val.local()->getUsers(LocalUse::Type::WRITER); - if(writers.size() != 1) - return BaseAndOffset(); - - // The reader can be one of several valid cases: - // 1. a move from another local -> need to follow the move - if(dynamic_cast(*writers.begin()) != nullptr) - return findBaseAndOffset(dynamic_cast(*writers.begin())->getSource()); - const auto& args = (*writers.begin())->getArguments(); - // 2. an addition with a local and a literal -> the local is the base, the literal the offset - if(dynamic_cast((*writers.begin())) != nullptr && - dynamic_cast((*writers.begin()))->op == OP_ADD && args.size() == 2 && - std::any_of(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); }) && - std::any_of( - args.begin(), args.end(), [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) - { - return BaseAndOffset( - std::find_if(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); }) - ->local() - ->getBase(false) - ->createReference(), - static_cast((*std::find_if(args.begin(), args.end(), - [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) - .getLiteralValue() - ->signedInt() / - val.type.getElementType().getPhysicalWidth())); - } - - // 3. 
an addition with two locals -> one is the base, the other the calculation of the literal - if(dynamic_cast((*writers.begin())) != nullptr && - dynamic_cast((*writers.begin()))->op == OP_ADD && args.size() == 2 && - std::all_of(args.begin(), args.end(), [](const Value& arg) -> bool { return arg.hasLocal(); })) - { - const auto offset0 = findOffset(args[0]); - const auto offset1 = findOffset(args[1]); - if(offset0.offset && args[1].hasLocal()) - return BaseAndOffset(args[1].local()->getBase(false)->createReference(), - static_cast(offset0.offset.value() / val.type.getElementType().getPhysicalWidth())); - if(offset1.offset && args[0].hasLocal()) - return BaseAndOffset(args[0].local()->getBase(false)->createReference(), - static_cast(offset1.offset.value() / val.type.getElementType().getPhysicalWidth())); - } - /* - if(writers.size() == 1) - { - // couldn't find literal offset for any direct base, try with arbitrary values - ref = val.local->getBase(true); - Optional offset = NO_VALUE; - for(const auto& arg : (*writers.begin())->getArguments()) - { - if(ref != nullptr && arg.hasLocal() && arg.local->getBase(false) == ref) - // skip finding the same base again - continue; - auto tmp = findBaseAndOffset(arg); - if(tmp.base && tmp.base->local->getBase(true) == ref && tmp.offset.is(0)) - // this parameter is the base itself, is already handled - continue; - // TODO how to combine the offsets? - // TODO also need to handle non-addition of offsets (e.g. ptr = base + (offset + size * i)) - logging::debug() << "Found offset of " << tmp.base.to_string() << " + " - << (tmp.offset ? tmp.offset.value() : -1) << logging::endl; - logging::debug() << "Found offset of " << tmp.base.to_string() << " with expression: " - << vc4c::Expression::createExpression(*(*writers.begin())).to_string() << - logging::endl; - } - // TODO why is this called twice? 
The whole function, from outside - logging::debug() << "Found base and non-literal offset: " << ref->to_string() << " - " << offset.to_string() - << logging::endl; - if(ref && (ref->residesInMemory() || (ref->is() && ref->type.isPointerType()))) - return BaseAndOffset(ref->createReference(), {}); - } - */ - - return BaseAndOffset(); -} - MemoryType normalization::toMemoryType(periphery::VPMUsage usage) { switch(usage) diff --git a/src/normalization/AddressCalculation.h b/src/normalization/AddressCalculation.h index 74fc493b..5534705f 100644 --- a/src/normalization/AddressCalculation.h +++ b/src/normalization/AddressCalculation.h @@ -16,21 +16,6 @@ namespace vc4c { namespace normalization { - struct BaseAndOffset - { - Optional base; - Optional offset; - - explicit BaseAndOffset() : base(NO_VALUE), offset{} {} - - BaseAndOffset(const Optional& base, Optional offset) : base(base), offset(offset) {} - }; - - /* - * Returns the base memory address for the given value as well as the offset the value has to this location - */ - BaseAndOffset findBaseAndOffset(const Value& val); - enum class MemoryType { // lower the value into a register and replace all loads with moves diff --git a/src/normalization/MemoryAccess.cpp b/src/normalization/MemoryAccess.cpp index 5bfac7fb..29da40e1 100644 --- a/src/normalization/MemoryAccess.cpp +++ b/src/normalization/MemoryAccess.cpp @@ -29,419 +29,6 @@ using namespace vc4c::operators; * - too complex phi-nodes with pointers: clNN/im2col.cl */ -struct VPMAccessGroup -{ - bool isVPMWrite; - DataType groupType = TYPE_UNKNOWN; - RandomAccessList dmaSetups; - RandomAccessList genericSetups; - RandomAccessList addressWrites; - // this is the distance/offset (start of row to start of row, 1 = consecutive) between two vectors in number of - // vectors that would fit in between - int stride = 1; - - /* - * E.g. for memory-fills or -copies, the setup-instructions are re-used, - * so we need to clear the duplicates - */ - void cleanDuplicateInstructions() - { - auto it = dmaSetups.begin(); - ++it; - while(it != dmaSetups.end()) - { - if((it - 1)->get() == it->get()) - it = dmaSetups.erase(it); - else - ++it; - } - - it = genericSetups.begin(); - ++it; - while(it != genericSetups.end()) - { - if((it - 1)->get() == it->get()) - it = genericSetups.erase(it); - else - ++it; - } - - it = addressWrites.begin(); - ++it; - while(it != addressWrites.end()) - { - if((it - 1)->get() == it->get()) - it = addressWrites.erase(it); - else - ++it; - } - } -}; - -static InstructionWalker findGroupOfVPMAccess( - VPM& vpm, InstructionWalker start, const InstructionWalker end, VPMAccessGroup& group) -{ - Optional baseAddress = NO_VALUE; - int32_t nextOffset = -1; - // the number of elements between two entries in memory - group.stride = 0; - group.groupType = TYPE_UNKNOWN; - group.dmaSetups.clear(); - group.genericSetups.clear(); - group.addressWrites.clear(); - group.dmaSetups.reserve(64); - group.dmaSetups.reserve(64); - group.addressWrites.reserve(64); - - // FIXME to not build too large critical sections, only combine, if the resulting critical section: - // 1) is not too large: either in total numbers of instructions or in ratio instructions / VPW writes, since we save - // a few cycles per write (incl. 
delay for wait DMA) - - auto it = start; - for(; !it.isEndOfBlock() && it != end; it.nextInBlock()) - { - if(it.get() == nullptr) - continue; - - if(it.has()) - // memory barriers end groups, also don't check this barrier again - return it.nextInBlock(); - if(it.has()) - // semaphore accesses end groups, also don't check this instruction again - return it.nextInBlock(); - - if(!(it->writesRegister(REG_VPM_DMA_LOAD_ADDR) || it->writesRegister(REG_VPM_DMA_STORE_ADDR))) - // for simplicity, we only check for VPM addresses and find all other instructions relative to it - continue; - if(!it.has()) - throw CompilationError( - CompilationStep::OPTIMIZER, "Setting VPM address with non-move is not supported", it->to_string()); - const auto baseAndOffset = findBaseAndOffset(it.get()->getSource()); - const bool isVPMWrite = it->writesRegister(REG_VPM_DMA_STORE_ADDR); - logging::debug() << "Found base address " << baseAndOffset.base.to_string() << " with offset " - << std::to_string(baseAndOffset.offset.value_or(-1L)) << " for " - << (isVPMWrite ? "writing into" : "reading from") << " memory" << logging::endl; - - if(!baseAndOffset.base) - // this address-write could not be fixed to a base and an offset - // skip this address write for the next check - return it.nextInBlock(); - if(baseAndOffset.base && baseAndOffset.base->hasLocal() && baseAndOffset.base->local()->is() && - has_flag(baseAndOffset.base->local()->as()->decorations, ParameterDecorations::VOLATILE)) - // address points to a volatile parameter, which explicitly forbids combining reads/writes - // skip this address write for the next check - return it.nextInBlock(); - - // check if this address is consecutive to the previous one (if any) - if(baseAddress) - { - if(baseAndOffset.base && baseAddress.value() != baseAndOffset.base.value()) - // a group exists, but the base addresses don't match - break; - if(group.addressWrites.size() == 1 && baseAndOffset.offset && group.stride == 0) - { - // special case for first offset - use it to determine stride - group.stride = baseAndOffset.offset.value() - - findBaseAndOffset(group.addressWrites[0].get()->getSource()).offset.value(); - logging::debug() << "Using a stride of " << group.stride - << " elements between consecutive access to memory" << logging::endl; - } - if(!baseAndOffset.offset || baseAndOffset.offset.value() != (nextOffset * group.stride)) - // a group exists, but the offsets do not match - break; - } - - // check if the access mode (read/write) is the same as for the group - if(baseAddress && group.isVPMWrite != isVPMWrite) - break; - - auto vpmSetups = periphery::findRelatedVPMInstructions(it, !isVPMWrite); - auto genericSetup = vpmSetups.genericVPMSetup; - auto dmaSetup = vpmSetups.dmaSetup; - - // check if the VPM and DMA configurations match with the previous one - if(baseAddress) - { - if(!genericSetup || !dmaSetup) - // either there are no setups for this VPM access, or they are not loaded from literals (e.g. dynamic - // setup) - break; - if(!genericSetup->has() || - (!group.genericSetups.empty() && - genericSetup->get()->getImmediate() != - group.genericSetups.at(0).get()->getImmediate())) - // generic setups do not match - break; - if(!dmaSetup->has() || - dmaSetup->get()->getImmediate() != - group.dmaSetups.at(0).get()->getImmediate()) - // DMA setups do not match - break; - } - - // check for complex types - DataType elementType = baseAndOffset.base->type.isPointerType() ? 
- baseAndOffset.base->type.getPointerType().value()->elementType : - baseAndOffset.base->type; - elementType = elementType.getArrayType() ? elementType.getArrayType().value()->elementType : elementType; - if(!elementType.isSimpleType()) - // XXX for now, skip combining any access to complex types (here: only struct, image) - // don't check this read/write again - return it.nextInBlock(); - - // all matches so far, add to group (or create a new one) - group.isVPMWrite = isVPMWrite; - group.groupType = baseAndOffset.base->type; - baseAddress = baseAndOffset.base.value(); - group.addressWrites.push_back(it); - if(dmaSetup) - // not always given, e.g. for caching in VPM without accessing RAM - group.dmaSetups.push_back(dmaSetup.value()); - if(genericSetup) - // not always given, e.g. for copying memory without reading/writing into/from QPU - group.genericSetups.push_back(genericSetup.value()); - nextOffset = (baseAndOffset.offset.value_or(-1) / (group.stride == 0 ? 1 : group.stride)) + 1; - - if(group.isVPMWrite && group.addressWrites.size() >= vpm.getMaxCacheVectors(elementType, true)) - { - // since the current address write might be removed, skip to next instruction - // TODO could the following instruction(s) be removed too?? (See beneath) - return it.nextInBlock(); - } - if(!group.isVPMWrite && group.addressWrites.size() >= vpm.getMaxCacheVectors(elementType, false)) - { - // since the current address write might be removed, skip to next instruction - // also need to skip the consecutive DMA wait and VPM setup - do - { - it.nextInBlock(); - } while(!it.isEndOfBlock() && it.get() != nullptr && it->hasValueType(ValueType::REGISTER)); - return it; - } - } - // end group, but do check this instruction again - return it; -} - -static void groupVPMWrites(VPM& vpm, VPMAccessGroup& group) -{ - if(group.genericSetups.size() != group.addressWrites.size() || group.genericSetups.size() != group.dmaSetups.size()) - { - logging::debug() << "Number of instructions do not match for combining VPM writes!" << logging::endl; - logging::debug() << group.genericSetups.size() << " generic VPM setups, " << group.addressWrites.size() - << " VPR address writes and " << group.dmaSetups.size() << " DMA setups" << logging::endl; - return; - } - if(group.addressWrites.size() <= 1) - return; - logging::debug() << "Combining " << group.addressWrites.size() - << " writes to consecutive memory into one DMA write... " << logging::endl; - - // 1. Update DMA setup to the number of rows written - { - VPWSetupWrapper dmaSetupValue(group.dmaSetups.at(0).get()); - dmaSetupValue.dmaSetup.setUnits(static_cast(group.addressWrites.size())); - } - // 1.1 Update stride setup to the stride between rows - { - LoadImmediate* strideSetup = group.dmaSetups.at(0).copy().nextInBlock().get(); - if(strideSetup == nullptr || !strideSetup->writesRegister(REG_VPM_OUT_SETUP) || - !VPWSetup::fromLiteral(strideSetup->getImmediate().unsignedInt()).isStrideSetup()) - throw CompilationError(CompilationStep::OPTIMIZER, "Failed to find VPW DMA stride setup for DMA setup", - group.dmaSetups.at(0)->to_string()); - VPWSetupWrapper strideSetupValue(strideSetup); - // stride is the distance in bytes from end of v1 to start of v2 - strideSetupValue.strideSetup.setStride( - static_cast(static_cast(group.stride == 0 ? 
0 : group.stride - 1) * - group.groupType.getElementType().getPhysicalWidth())); - } - std::size_t numRemoved = 0; - // TODO don't actually combine reads and writes, only setups - // vpm.updateScratchSize(static_cast(group.addressWrites.size())); - throw CompilationError(CompilationStep::OPTIMIZER, "Has been broken by recent changes"); - - // 2. Remove all but the first generic and DMA setups - for(std::size_t i = 1; i < group.genericSetups.size(); ++i) - { - group.genericSetups[i].erase(); - const LoadImmediate* strideSetup = group.dmaSetups.at(i).copy().nextInBlock().get(); - if(strideSetup == nullptr || !strideSetup->writesRegister(REG_VPM_OUT_SETUP) || - !VPWSetup::fromLiteral(strideSetup->getImmediate().unsignedInt()).isStrideSetup()) - throw CompilationError(CompilationStep::OPTIMIZER, "Failed to find VPW DMA stride setup for DMA setup", - group.dmaSetups.at(i)->to_string()); - group.dmaSetups.at(i).copy().nextInBlock().erase(); - group.dmaSetups.at(i).erase(); - numRemoved += 3; - } - - // 3. remove all but the last address writes (and the following DMA waits), update the last write to write the first - // address written to - group.addressWrites.back().get()->setSource( - group.addressWrites.at(0).get()->getSource()); - for(std::size_t i = 0; i < group.addressWrites.size() - 1; ++i) - { - if(!group.addressWrites[i].copy().nextInBlock()->readsRegister( - group.isVPMWrite ? REG_VPM_DMA_STORE_WAIT : REG_VPM_DMA_LOAD_WAIT)) - throw CompilationError(CompilationStep::OPTIMIZER, "Failed to find VPW wait for address write", - group.addressWrites[i]->to_string()); - group.addressWrites[i].copy().nextInBlock().erase(); - group.addressWrites[i].erase(); - numRemoved += 2; - } - - // 4. remove all Mutex acquires and releases between the first and the last write, so memory consistency is restored - auto it = group.dmaSetups.front(); - while(!it.isEndOfBlock() && it != group.addressWrites.back()) - { - if(it.get() && it->writesRegister(REG_MUTEX)) - { - it = it.erase(); - ++numRemoved; - } - else if(it.get() && it->readsRegister(REG_MUTEX)) - { - it = it.erase(); - ++numRemoved; - } - else - it.nextInBlock(); - } - - logging::debug() << "Removed " << numRemoved << " instructions by combining VPW writes" << logging::endl; -} - -static void groupVPMReads(VPM& vpm, VPMAccessGroup& group) -{ - if(group.genericSetups.size() != group.addressWrites.size() || group.genericSetups.size() != group.dmaSetups.size()) - { - logging::debug() << "Number of instructions do not match for combining VPM reads!" << logging::endl; - logging::debug() << group.genericSetups.size() << " generic VPM setups, " << group.addressWrites.size() - << " VPR address writes and " << group.dmaSetups.size() << " DMA setups" << logging::endl; - return; - } - if(group.genericSetups.size() <= 1) - return; - logging::debug() << "Combining " << group.genericSetups.size() - << " reads of consecutive memory into one DMA read... " << logging::endl; - - // 1. 
Update DMA setup to the number of rows read - { - VPRSetupWrapper dmaSetupValue(group.dmaSetups.at(0).get()); - dmaSetupValue.dmaSetup.setNumberRows(group.genericSetups.size() % 16); - // TODO don't actually combine reads and writes, only setups - // vpm.updateScratchSize(static_cast(group.genericSetups.size())); - throw CompilationError(CompilationStep::OPTIMIZER, "Has been broken by recent changes"); - // TODO can be space-optimized, half-words and bytes can be packed into single row (VPM writes too) - } - std::size_t numRemoved = 0; - - // 1.1 Update generic Setup to the number of rows read - { - VPRSetupWrapper genericSetup(group.genericSetups.at(0).get()); - genericSetup.genericSetup.setNumber(group.genericSetups.size() % 16); - } - - // 1.2 Update stride setup for the stride used - { - LoadImmediate* strideSetup = group.dmaSetups.at(0).copy().nextInBlock().get(); - if(strideSetup == nullptr || !strideSetup->writesRegister(REG_VPM_IN_SETUP) || - !VPRSetup::fromLiteral(strideSetup->getImmediate().unsignedInt()).isStrideSetup()) - throw CompilationError(CompilationStep::OPTIMIZER, "Failed to find VPR DMA stride setup for DMA setup", - group.dmaSetups.at(0)->to_string()); - VPRSetupWrapper strideSetupValue(strideSetup); - // in contrast to writing memory, the pitch is the distance from start to start of successive rows - strideSetupValue.strideSetup.setPitch(static_cast( - static_cast(group.stride) * group.groupType.getElementType().getPhysicalWidth())); - } - - // 2. Remove all but the first generic and DMA setups - for(std::size_t i = 1; i < group.genericSetups.size(); ++i) - { - group.genericSetups[i].erase(); - const LoadImmediate* strideSetup = group.dmaSetups.at(i).copy().nextInBlock().get(); - if(strideSetup == nullptr || !strideSetup->writesRegister(REG_VPM_IN_SETUP) || - !VPRSetup::fromLiteral(strideSetup->getImmediate().unsignedInt()).isStrideSetup()) - throw CompilationError(CompilationStep::OPTIMIZER, "Failed to find VPR DMA stride setup for DMA setup", - group.dmaSetups.at(i)->to_string()); - group.dmaSetups.at(i).copy().nextInBlock().erase(); - group.dmaSetups.at(i).erase(); - numRemoved += 2; - } - - // 3. remove all Mutex acquires and releases between the first and the last write, so memory consistency is restored - auto it = group.addressWrites.front(); - while(!it.isEndOfBlock() && it != group.addressWrites.back()) - { - if(it.get() && it->writesRegister(REG_MUTEX)) - { - it = it.erase(); - ++numRemoved; - } - else if(it.get() && it->readsRegister(REG_MUTEX)) - { - it = it.erase(); - ++numRemoved; - } - else - it.nextInBlock(); - } - - // 4. remove all but the first address writes (and the following DMA writes) - for(std::size_t i = 1; i < group.addressWrites.size(); ++i) - { - if(!group.addressWrites[i].copy().nextInBlock()->readsRegister( - group.isVPMWrite ? 
REG_VPM_DMA_STORE_WAIT : REG_VPM_DMA_LOAD_WAIT)) - throw CompilationError(CompilationStep::OPTIMIZER, "Failed to find VPR wait for address write", - group.addressWrites[i]->to_string()); - group.addressWrites[i].copy().nextInBlock().erase(); - group.addressWrites[i].erase(); - numRemoved += 2; - } - - logging::debug() << "Removed " << numRemoved << " instructions by combining VPR reads" << logging::endl; -} - -/* - * Combine consecutive configuration of VPW/VPR with the same settings - * - * In detail, this combines VPM read/writes of uniform type of access (read or write), uniform data-type and consecutive - * memory-addresses - * - * NOTE: Combining VPM accesses merges their mutex-lock blocks which can cause other QPUs to stall for a long time. - * Also, this optimization currently only supports access memory <-> QPU, data exchange between only memory and VPM are - * not optimized - */ -static void combineVPMAccess(FastSet& blocks, Method& method) -{ - // combine configurations of VPM (VPW/VPR) which have the same values - - // TODO for now, this cannot handle RAM->VPM, VPM->RAM only access as well as VPM->QPU or QPU->VPM - - // run within all basic blocks - for(BasicBlock* block : blocks) - { - auto it = block->walk(); - while(!it.isEndOfBlock()) - { - VPMAccessGroup group; - it = findGroupOfVPMAccess(*method.vpm.get(), it, block->walkEnd(), group); - if(group.addressWrites.size() > 1) - { - group.cleanDuplicateInstructions(); - if(group.isVPMWrite) - groupVPMWrites(*method.vpm.get(), group); - else - groupVPMReads(*method.vpm.get(), group); - } - } - } - - // clean up empty instructions - method.cleanEmptyInstructions(); - PROFILE_COUNTER( - vc4c::profiler::COUNTER_GENERAL + 80, "Scratch memory size (in rows)", method.vpm->getScratchArea().numRows); -} - InstructionWalker normalization::accessGlobalData( const Module& module, Method& method, InstructionWalker it, const Configuration& config) { @@ -480,11 +67,6 @@ InstructionWalker normalization::accessGlobalData( void normalization::spillLocals(const Module& module, Method& method, const Configuration& config) { static const std::size_t MINIMUM_THRESHOLD = 128; /* TODO some better limit */ - // TODO need to know how much of the VPM is still free (need per-kernel VPM object) - // also need to heed not to write/read into/from VPM from within a block of DMA reads/writes - // XXX or revert: run this before #combineVPMAccess and reserve as much VPM as required (dynamically, how? or first - // determine size, then spill) and use the remainder for #combineVPMAccess also need to regard writing into VPM - // (without DMA) for #combineVPMAccess, so the VPM configurations do not conflict /* * 1. find all candidate locals for spilling: @@ -701,13 +283,6 @@ void normalization::mapMemoryAccess(const Module& module, Method& method, const // TODO mark local for prefetch/write-back (if necessary) } - // TODO move this out into own optimization step?! - // XXX if this is re-activated, it will probably need to be rewritten (no more guaranteed mutex), - // also no benefit of grouping VPM access (other than combining the setup instructions), - // also can combine all setups within a basic block which set up the same values (per DMA, generic, stride), except - // VPM read! - // combineVPMAccess(affectedBlocks, method); - method.vpm->dumpUsage(); // TODO clean up no longer used (all kernels!) 
globals and stack allocations diff --git a/src/optimization/Combiner.cpp b/src/optimization/Combiner.cpp index b021ddb7..1fd36609 100644 --- a/src/optimization/Combiner.cpp +++ b/src/optimization/Combiner.cpp @@ -9,6 +9,7 @@ #include "../InstructionWalker.h" #include "../intermediate/Helper.h" #include "../intermediate/operators.h" +#include "../periphery/VPM.h" #include "log.h" #include @@ -670,8 +671,6 @@ static bool canReplaceConstantLoad( bool optimizations::combineLoadingConstants(const Module& module, Method& method, const Configuration& config) { - // TODO extend/add new optimization combining constant registers (e.g. element number, qpu number, e.g. for - // TestMemoryAccess#GLOBAL_MEMORY_FUNCTION) or leave for common subexpression elimination? std::size_t threshold = config.additionalOptions.combineLoadThreshold; bool hasChanged = false; @@ -1072,3 +1071,117 @@ InstructionWalker optimizations::combineFlagWithOutput( } return it; } + +bool optimizations::combineVPMSetupWrites(const Module& module, Method& method, const Configuration& config) +{ + bool changedInstructions = false; + for(auto& bb : method) + { + Optional lastDMAReadSetup; + Optional lastReadStrideSetup; + Optional lastDMAWriteSetup; + Optional lastWriteStrideSetup; + // NOTE: cannot simply remove successive VPM read/write setup, since a) the VPM address is auto-incremented and + // b) the read setup specifies the exact number of rows to read + + auto it = bb.walk(); + while(!it.isEndOfBlock()) + { + if(it.has() && (it->writesRegister(REG_VPM_IN_SETUP) || it->writesRegister(REG_VPM_OUT_SETUP))) + { + bool readSetup = it->writesRegister(REG_VPM_IN_SETUP); + // this can for now only optimize constant setups + auto setupBits = it->getArgument(0) ? it->assertArgument(0).getLiteralValue() : Optional{}; + if(setupBits && (it.has() || it.has())) + { + if(readSetup) + { + auto setup = periphery::VPRSetup::fromLiteral(setupBits->unsignedInt()); + if(setup.isDMASetup()) + { + if(lastDMAReadSetup == setup.dmaSetup) + { + logging::debug() + << "Removing duplicate writing of same DMA read setup: " << it->to_string() + << logging::endl; + it = it.erase(); + changedInstructions = true; + continue; + } + lastDMAReadSetup = setup.dmaSetup; + } + else if(setup.isStrideSetup()) + { + if(lastReadStrideSetup == setup.strideSetup) + { + logging::debug() + << "Removing duplicate writing of same DMA read stride setup: " << it->to_string() + << logging::endl; + it = it.erase(); + changedInstructions = true; + continue; + } + lastReadStrideSetup = setup.strideSetup; + } + } + else + { + auto setup = periphery::VPWSetup::fromLiteral(setupBits->unsignedInt()); + if(setup.isDMASetup()) + { + if(lastDMAWriteSetup == setup.dmaSetup) + { + logging::debug() + << "Removing duplicate writing of same DMA write setup: " << it->to_string() + << logging::endl; + it = it.erase(); + changedInstructions = true; + continue; + } + lastDMAWriteSetup = setup.dmaSetup; + } + else if(setup.isStrideSetup()) + { + if(lastWriteStrideSetup == setup.strideSetup) + { + logging::debug() + << "Removing duplicate writing of same DMA write stride setup: " << it->to_string() + << logging::endl; + it = it.erase(); + changedInstructions = true; + continue; + } + lastWriteStrideSetup = setup.strideSetup; + } + } + } + else if(it.has() && + std::none_of( + it->getArguments().begin(), it->getArguments().end(), [readSetup](const Value& arg) -> bool { + auto writer = arg.getSingleWriter(); + auto val = writer ? 
writer->precalculate(1) : arg; + if(val && val->getLiteralValue()) + if(readSetup && + periphery::VPRSetup::fromLiteral(val->getLiteralValue()->unsignedInt()) + .isGenericSetup()) + return true; + if(!readSetup && + periphery::VPWSetup::fromLiteral(val->getLiteralValue()->unsignedInt()) + .isGenericSetup()) + return true; + return false; + })) + { + // we have a VPM IO setup write which we do not know to be a generic setup. + // to be on the safe side, clear last setups + lastDMAReadSetup = {}; + lastDMAWriteSetup = {}; + lastReadStrideSetup = {}; + lastWriteStrideSetup = {}; + } + } + it.nextInBlock(); + } + } + return changedInstructions; +} diff --git a/src/optimization/Combiner.h b/src/optimization/Combiner.h index 50df797f..a618af45 100644 --- a/src/optimization/Combiner.h +++ b/src/optimization/Combiner.h @@ -206,6 +206,15 @@ namespace vc4c */ InstructionWalker combineFlagWithOutput( const Module& module, Method& method, InstructionWalker it, const Configuration& config); + + /* + * Combines writing of same VPM configurations within a basic block. + * + * Since VPM (and DMA) configurations are persistent (except VPM read/write config), there is no need to write + * the same configuration several times. + */ + bool combineVPMSetupWrites(const Module& module, Method& method, const Configuration& config); + } // namespace optimizations } // namespace vc4c #endif /* COMBINER_H */ diff --git a/src/optimization/Optimizer.cpp b/src/optimization/Optimizer.cpp index 8bfa2009..61c0b083 100644 --- a/src/optimization/Optimizer.cpp +++ b/src/optimization/Optimizer.cpp @@ -233,6 +233,8 @@ const std::vector Optimizer::ALL_PASSES = { OptimizationType::INITIAL), OptimizationPass("MergeBasicBlocks", "merge-blocks", mergeAdjacentBasicBlocks, "merges adjacent basic blocks if there are no other conflicting transitions", OptimizationType::INITIAL), + OptimizationPass("CombineVPMSetups", "combine-vpm-setups", combineVPMSetupWrites, + "combines duplicate VPM setup writes", OptimizationType::INITIAL), /* * The second block executes optimizations only within a single basic block. * These optimizations may be executed in a loop until there are not more changes to the instructions @@ -307,6 +309,7 @@ std::set Optimizer::getPasses(OptimizationLevel level) passes.emplace("single-steps"); passes.emplace("reorder"); passes.emplace("combine"); + passes.emplace("combine-vpm-setups"); FALL_THROUGH case OptimizationLevel::NONE: // TODO this is not an optimization, more a normalization step. 
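One detail worth spelling out: the pass only deduplicates the DMA and stride setups, never the generic VPM read/write setups, as stated in the NOTE at the top of the loop. A generic read setup arms an exact number of rows and the VPM address auto-increments with each read, so even a textually identical setup written again later is still required. A toy model (invented API, not the emulator) of that behaviour:

// Toy model of why generic VPM read setups cannot be removed as duplicates: each setup
// arms a fixed number of reads and the address auto-increments, so the second batch of
// reads needs its own setup even if the written value is identical.
#include <cstdint>
#include <stdexcept>

class ToyVPMReader
{
    uint32_t armedRows = 0;
    uint32_t address = 0;

public:
    void writeGenericReadSetup(uint32_t numRows, uint32_t startAddress)
    {
        armedRows = numRows;
        address = startAddress;
    }

    uint32_t readRow()
    {
        if(armedRows == 0)
            throw std::runtime_error("VPM read without an armed setup");
        --armedRows;
        return address++; // the row address that was consumed, then auto-increment
    }
};

// E.g. re-reading the same 4 rows twice requires writing the identical generic setup a
// second time, because the first setup's row count is exhausted after 4 reads.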
diff --git a/src/periphery/VPM.h b/src/periphery/VPM.h index a32d4096..509a0b26 100644 --- a/src/periphery/VPM.h +++ b/src/periphery/VPM.h @@ -28,7 +28,7 @@ namespace vc4c * * see Broadcom spec, table 32 */ - class VPWGenericSetup : private Bitfield + class VPWGenericSetup : public Bitfield { public: VPWGenericSetup(uint8_t size, uint8_t stride, uint8_t address = 0) : Bitfield(0) @@ -117,7 +117,7 @@ namespace vc4c * * see Broadcom spec, table 34 */ - class VPWDMASetup : private Bitfield + class VPWDMASetup : public Bitfield { public: VPWDMASetup(uint8_t mode, uint8_t depth, uint8_t units = 1) : Bitfield(0) @@ -214,7 +214,7 @@ namespace vc4c * * see Broadcom spec, table 35 */ - class VPWStrideSetup : private Bitfield + class VPWStrideSetup : public Bitfield { public: explicit VPWStrideSetup(uint16_t stride = 0) : Bitfield(0) @@ -296,7 +296,7 @@ namespace vc4c * * see Broadcom spec, table 33 */ - class VPRGenericSetup : private Bitfield + class VPRGenericSetup : public Bitfield { public: VPRGenericSetup(uint8_t size, uint8_t stride, uint8_t numVectors = 1, uint8_t address = 0) : Bitfield(0) @@ -374,7 +374,7 @@ namespace vc4c * * see Broadcom spec, table 36 */ - class VPRDMASetup : private Bitfield + class VPRDMASetup : public Bitfield { public: VPRDMASetup( @@ -473,7 +473,7 @@ namespace vc4c * * see Broadcom spec, table 37 */ - class VPRStrideSetup : private Bitfield + class VPRStrideSetup : public Bitfield { public: explicit VPRStrideSetup(uint16_t stride = 0) : Bitfield(0) From 92e2c8b14e193c14fab8fe1c1b0c7dcfe620ef0d Mon Sep 17 00:00:00 2001 From: doe300 Date: Fri, 16 Nov 2018 11:57:15 +0100 Subject: [PATCH 6/6] Fixes rebase issues, copies fixes from master --- src/normalization/MemoryMapChecks.cpp | 12 ++++++++++-- src/normalization/MemoryMappings.h | 4 ++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/normalization/MemoryMapChecks.cpp b/src/normalization/MemoryMapChecks.cpp index 428c1f3d..3c87b394 100644 --- a/src/normalization/MemoryMapChecks.cpp +++ b/src/normalization/MemoryMapChecks.cpp @@ -97,7 +97,8 @@ static bool isMemoryOnlyRead(const Local* local) } // Finds the next instruction writing the given value into memory -static InstructionWalker findNextValueStore(InstructionWalker it, const Value& src, std::size_t limit) +static InstructionWalker findNextValueStore( + InstructionWalker it, const Value& src, std::size_t limit, const Local* sourceLocation) { while(!it.isEndOfBlock() && limit > 0) { @@ -106,6 +107,12 @@ static InstructionWalker findNextValueStore(InstructionWalker it, const Value& s { return it; } + if(memInstr != nullptr && memInstr->getDestination().local()->getBase(true) == sourceLocation) + { + // there is some other instruction writing into the memory we read, it could have been changed -> abort + // TODO can we be more precise and abort only if the same index is written?? How to determine?? + return it.getBasicBlock()->walkEnd(); + } if(it.has() || it.has() || it.has() || it.has()) break; it.nextInBlock(); @@ -173,7 +180,8 @@ std::pair> normalization::determineM if(memInstr->op == MemoryOperation::READ && !memInstr->hasConditionalExecution() && memInstr->getDestination().local()->getUsers(LocalUse::Type::READER).size() == 1) { - auto nextIt = findNextValueStore(it, memInstr->getDestination(), 16 /* TODO */); + auto nextIt = findNextValueStore( + it, memInstr->getDestination(), 16 /* TODO */, memInstr->getSource().local()->getBase(true)); auto nextMemInstr = nextIt.isEndOfBlock() ? 
nullptr : nextIt.get(); if(nextMemInstr != nullptr && !nextIt->hasConditionalExecution() && nextMemInstr->op == MemoryOperation::WRITE && diff --git a/src/normalization/MemoryMappings.h b/src/normalization/MemoryMappings.h index 933f8793..424824f6 100644 --- a/src/normalization/MemoryMappings.h +++ b/src/normalization/MemoryMappings.h @@ -71,7 +71,7 @@ namespace vc4c */ InstructionWalker mapMemoryAccess(Method& method, InstructionWalker it, intermediate::MemoryInstruction* mem, const MemoryInfo& srcInfo, const MemoryInfo& destInfo); - } -} + } // namespace normalization +} // namespace vc4c #endif /* VC4C_NORMALIZATION_MEMORY_MAPPING_H */ \ No newline at end of file
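As a closing illustration of the findNextValueStore change above: the new sourceLocation parameter aborts the read-plus-store combination whenever some other instruction writes into the memory that was read, because the value to be stored may then no longer match what was read. A plain C++ analogy of the guarded pattern (not VC4C IR, names are made up):

// Hypothetical example: folding the READ + WRITE pair into one memory-to-memory copy is
// only valid if nothing writes into the source area in between.
#include <cstddef>
#include <cstdint>

void readThenStore(int32_t* in, int32_t* out, std::size_t i)
{
    int32_t tmp = in[i]; // MemoryOperation::READ into a temporary
    in[i] = 0;           // intervening write into the memory area that was just read
    out[i] = tmp;        // MemoryOperation::WRITE of the previously read value

    // Rewriting this as a direct copy from in[i] to out[i] would store 0 instead of the
    // original value, which is exactly the case the added check aborts on by returning
    // the end-of-block walker.
}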