Removes mutex locks from most memory accesses #120

Open · wants to merge 6 commits into base: master
15 changes: 8 additions & 7 deletions src/analysis/DependencyGraph.cpp
@@ -303,12 +303,13 @@ static void createMutexDependencies(DependencyGraph& graph, DependencyNode& node
const intermediate::IntermediateInstruction* lastSemaphoreAccess,
const intermediate::IntermediateInstruction* lastMemFence)
{
if((node.key->hasValueType(ValueType::REGISTER) && node.key->getOutput()->reg().isVertexPipelineMemory()) ||
std::any_of(node.key->getArguments().begin(), node.key->getArguments().end(),
[](const Value& arg) -> bool { return arg.hasRegister() && arg.reg().isVertexPipelineMemory(); }) ||
node.key->writesRegister(REG_MUTEX))
if(((node.key->hasValueType(ValueType::REGISTER) && node.key->getOutput()->reg().isVertexPipelineMemory()) ||
std::any_of(node.key->getArguments().begin(), node.key->getArguments().end(),
[](const Value& arg) -> bool { return arg.hasRegister() && arg.reg().isVertexPipelineMemory(); }) ||
node.key->writesRegister(REG_MUTEX)) &&
lastMutexLock != nullptr)
{
// any VPM operation or mutex unlock must be ordered after the corresponding mutex lock
// any VPM operation or mutex unlock must be ordered after a previous mutex lock, if any
auto& otherNode = graph.assertNode(lastMutexLock);
addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::MUTEX_LOCK);
}
@@ -497,13 +498,13 @@ static void createVPMIODependencies(DependencyGraph& graph, DependencyNode& node
(node.key->writesRegister(REG_VPM_IO) || node.key->writesRegister(REG_VPM_DMA_STORE_ADDR) ||
node.key->writesRegister(REG_MUTEX)))
{
// any other VPM write, VPM write address setup or unlocking mutex must be executed aftre the VPM write
// any other VPM write, VPM write address setup or unlocking mutex must be executed after the VPM write
auto& otherNode = graph.assertNode(lastVPMWrite);
addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::PERIPHERY_ORDER);
}
if(lastVPMRead != nullptr && (node.key->readsRegister(REG_VPM_IO) || node.key->writesRegister(REG_MUTEX)))
{
// any other VPM read or unlocking mutex must be executed aftre the VPM read
// any other VPM read or unlocking mutex must be executed after the VPM read
auto& otherNode = graph.assertNode(lastVPMRead);
addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::PERIPHERY_ORDER);
}
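Taken in isolation, the change to createMutexDependencies only creates the ordering edge to a mutex lock when such a lock actually precedes the VPM access, which with this PR can now legitimately be absent. A minimal stand-alone sketch of that guard (simplified types, not the actual vc4c classes):

```cpp
#include <iostream>

// Simplified stand-alone model (not the vc4c types): an instruction only needs an
// ordering edge to a mutex lock if such a lock actually precedes it.
struct Instruction
{
    bool accessesVPM;
    bool writesMutex;
};

static bool needsMutexLockDependency(const Instruction& inst, const Instruction* lastMutexLock)
{
    return (inst.accessesVPM || inst.writesMutex) && lastMutexLock != nullptr;
}

int main()
{
    Instruction lock{false, true};
    Instruction vpmAccess{true, false};
    std::cout << needsMutexLockDependency(vpmAccess, &lock) << '\n';   // 1 - ordered after the preceding lock
    std::cout << needsMutexLockDependency(vpmAccess, nullptr) << '\n'; // 0 - no preceding lock, no edge to create
    return 0;
}
```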
7 changes: 7 additions & 0 deletions src/intermediate/MemoryInstruction.cpp
@@ -94,6 +94,13 @@ MemoryInstruction::MemoryInstruction(
{
setArgument(0, src);
setArgument(1, numEntries);

if(numEntries != INT_ONE)
{
if(op != MemoryOperation::COPY && op != MemoryOperation::FILL)
throw CompilationError(
CompilationStep::LLVM_2_IR, "Can only use the entry count for copying or filling memory", to_string());
}
}

std::string MemoryInstruction::to_string() const
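The added constructor check rejects an entry count other than one for anything but COPY and FILL. A minimal stand-alone sketch of the same rule (hypothetical names, not the vc4c API):

```cpp
#include <stdexcept>

// Simplified stand-alone model of the new constructor check: an entry count other
// than 1 is only meaningful for COPY and FILL, every other operation accesses
// exactly one entry.
enum class MemoryOperation { READ, WRITE, COPY, FILL };

static void checkEntryCount(MemoryOperation op, unsigned numEntries)
{
    if(numEntries != 1 && op != MemoryOperation::COPY && op != MemoryOperation::FILL)
        throw std::invalid_argument("Can only use the entry count for copying or filling memory");
}

int main()
{
    checkEntryCount(MemoryOperation::COPY, 16); // accepted - copies 16 entries
    checkEntryCount(MemoryOperation::READ, 16); // throws - a read always accesses a single entry
    return 0;
}
```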
181 changes: 181 additions & 0 deletions src/normalization/AddressCalculation.cpp
@@ -0,0 +1,181 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/

#include "AddressCalculation.h"

#include "../Locals.h"
#include "../intermediate/IntermediateInstruction.h"
#include "../intermediate/operators.h"
#include "log.h"

using namespace vc4c;
using namespace vc4c::intermediate;
using namespace vc4c::normalization;
using namespace vc4c::operators;

MemoryType normalization::toMemoryType(periphery::VPMUsage usage)
{
switch(usage)
{
case periphery::VPMUsage::SCRATCH:
case periphery::VPMUsage::LOCAL_MEMORY:
case periphery::VPMUsage::MEMORY_CACHE:
return MemoryType::VPM_SHARED_ACCESS;
case periphery::VPMUsage::REGISTER_SPILLING:
case periphery::VPMUsage::STACK:
return MemoryType::VPM_PER_QPU;
}
throw CompilationError(CompilationStep::NORMALIZER,
"Unknown VPM usage type to map to memory type: ", std::to_string(static_cast<int>(usage)));
}

InstructionWalker normalization::insertAddressToOffset(
InstructionWalker it, Method& method, Value& out, const Local* baseAddress, const MemoryInstruction* mem)
{
auto ptrVal = mem->op == MemoryOperation::READ ? mem->getSource() : mem->getDestination();
auto indexOp = dynamic_cast<const Operation*>(ptrVal.getSingleWriter());
if(!indexOp)
{
// for stores, the store itself is also a write instruction
auto writers = ptrVal.local()->getUsers(LocalUse::Type::WRITER);
if(writers.size() == 2 && writers.find(mem) != writers.end())
{
writers.erase(mem);
indexOp = dynamic_cast<const Operation*>(*writers.begin());
}
}
if(ptrVal.hasLocal(baseAddress))
{
// trivial case, the offset is zero
out = INT_ZERO;
}
else if(indexOp && indexOp->readsLocal(baseAddress) && indexOp->op == OP_ADD)
{
// for the simple version where the index is base address + offset, simply use the offset directly
out = indexOp->getFirstArg().hasLocal(baseAddress) ? indexOp->getSecondArg().value() : indexOp->getFirstArg();
}
else
{
// for more complex versions, calculate offset by subtracting base address from result
// address
out = assign(it, baseAddress->type, "%pointer_diff") = ptrVal - baseAddress->createReference();
}
return it;
}

InstructionWalker normalization::insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, MemoryType type, const MemoryInstruction* mem)
{
Value tmpIndex = UNDEFINED_VALUE;
it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem);
if(type == MemoryType::VPM_PER_QPU)
{
// size of one stack-frame in bytes
auto stackByteSize = periphery::VPM::getVPMStorageType(baseAddress->type.getElementType()).getPhysicalWidth();
// add offset of stack-frame
Value stackOffset = method.addNewLocal(TYPE_VOID.toPointerType(), "%stack_offset");
Value tmp = method.addNewLocal(baseAddress->type);
assign(it, stackOffset) = mul24(Value(Literal(stackByteSize), TYPE_INT16), Value(REG_QPU_NUMBER, TYPE_INT8));
out = assign(it, TYPE_VOID.toPointerType(), "%stack_offset") = tmpIndex + stackOffset;
}
else
{
out = tmpIndex;
}
return it;
}

InstructionWalker normalization::insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, const Value& container, const MemoryInstruction* mem)
{
Value tmpIndex = UNDEFINED_VALUE;
it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem);
// the index (as per index calculation) is in bytes, but we need index in elements, so divide by element size
out = assign(it, TYPE_VOID.toPointerType(), "%element_offset") =
tmpIndex / Literal(container.type.getElementType().getPhysicalWidth());
return it;
}

static Optional<std::pair<Value, InstructionDecorations>> combineAdditions(
Method& method, InstructionWalker referenceIt, FastMap<Value, InstructionDecorations>& addedValues)
{
if(addedValues.empty())
return {};
Optional<std::pair<Value, InstructionDecorations>> prevResult;
auto valIt = addedValues.begin();
while(valIt != addedValues.end())
{
if(prevResult)
{
auto newResult = method.addNewLocal(prevResult->first.type);
auto newFlags = intersect_flags(prevResult->second, valIt->second);
referenceIt.emplace(new Operation(OP_ADD, newResult, prevResult->first, valIt->first));
referenceIt->addDecorations(newFlags);
referenceIt.nextInBlock();
prevResult = std::make_pair(newResult, newFlags);
}
else
prevResult = std::make_pair(valIt->first, valIt->second);
valIt = addedValues.erase(valIt);
}
return prevResult;
}

std::string MemoryAccessRange::to_string() const
{
return (memoryInstruction->to_string() +
(memoryInstruction->writesRegister(REG_VPM_DMA_LOAD_ADDR) ? " - read " : " - write ")) +
(memoryObject->to_string() +
(groupUniformAddressParts.empty() ? " with" : " with work-group uniform offset and") +
" dynamic element range [") +
(std::to_string(offsetRange.minValue) + ", ") + (std::to_string(offsetRange.maxValue) + "]");
}

InstructionWalker normalization::insertAddressToWorkItemSpecificOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range)
{
auto dynamicParts = combineAdditions(method, it, range.dynamicAddressParts);
out = dynamicParts->first;
if(range.typeSizeShift)
out = assign(it, dynamicParts->first.type) =
(dynamicParts->first << (*range.typeSizeShift)->assertArgument(1), dynamicParts->second);
return it;
}

InstructionWalker normalization::insertAddressToWorkGroupUniformOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range)
{
auto uniformParts = combineAdditions(method, it, range.groupUniformAddressParts);
out = uniformParts->first;
if(range.typeSizeShift)
out = assign(it, uniformParts->first.type) =
(uniformParts->first << (*range.typeSizeShift)->assertArgument(1), uniformParts->second);
static auto checkPointer = [](const Value& arg) -> bool { return arg.type.isPointerType(); };
if(std::all_of(
range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer) ||
std::none_of(
range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer))
throw CompilationError(CompilationStep::NORMALIZER, "Cannot determine base address of addition",
range.baseAddressAdd->to_string());
auto baseAddrIt = std::find_if(
range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer);
out = assign(it, range.baseAddressAdd->getOutput()->type) =
(*baseAddrIt + out, uniformParts->second, InstructionDecorations::WORK_GROUP_UNIFORM_VALUE);
return it;
}

bool LocalUsageOrdering::operator()(const Local* l1, const Local* l2) const
{
// prefer more usages over fewer usages
// since there is always only 1 writer for the local address, we prefer this over only counting readers for
// performance reasons
// TODO is this the correct way to do this? E.g. is there one usage per memory access?
if(l1->getUsers().size() > l2->getUsers().size())
return true;
if(l1->getUsers().size() == l2->getUsers().size())
return l1 < l2;
return false;
}
129 changes: 129 additions & 0 deletions src/normalization/AddressCalculation.h
@@ -0,0 +1,129 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4C_NORMALIZATION_ADDRESS_CALCULATION_H
#define VC4C_NORMALIZATION_ADDRESS_CALCULATION_H

#include "../InstructionWalker.h"
#include "../Values.h"
#include "../analysis/ValueRange.h"
#include "../intermediate/IntermediateInstruction.h"
#include "../periphery/VPM.h"

namespace vc4c
{
namespace normalization
{
enum class MemoryType
{
// lower the value into a register and replace all loads with moves
QPU_REGISTER_READONLY,
// lower the value into a register and replace all loads/stores with moves
QPU_REGISTER_READWRITE,
// store in VPM in extra space per QPU!!
VPM_PER_QPU,
// store in VPM, QPUs share access to common data
VPM_SHARED_ACCESS,
// keep in RAM/global data segment, read via TMU
RAM_LOAD_TMU,
// keep in RAM/global data segment, access via VPM
RAM_READ_WRITE_VPM
};

struct MemoryAccess
{
FastSet<InstructionWalker> accessInstructions;
MemoryType preferred;
MemoryType fallback;
};

MemoryType toMemoryType(periphery::VPMUsage usage);

/*
* Converts an address (e.g. an index chain) and the corresponding base pointer to the pointer difference
*
* NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
*
* Returns (char*)address - (char*)baseAddress
*/
InstructionWalker insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, const intermediate::MemoryInstruction* mem);

/*
* Converts an address (e.g. an index-chain) and a base-address to the offset of the vector denoting the element
* accessed by the index-chain. In addition to #insertAddressToOffset, this function also handles multiple
* stack-frames.
*
* NOTE: The result is still the offset in bytes, since VPM#insertReadVPM and VPM#insertWriteVPM take the offset
* in bytes!
*
* Returns ((char*)address - (char*)baseAddress) + (typeSizeInBytes * stackIndex), where stackIndex is always
* zero (and therefore the second part omitted) for shared memory
*/
InstructionWalker insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, MemoryType type, const intermediate::MemoryInstruction* mem);

/*
* Converts an address (e.g. index-chain) and the corresponding base-address to the element offset for an
* element of the type used in the container
*
* Returns ((char*)address - (char*)baseAddress) / sizeof(elementType)
*/
InstructionWalker insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, const Value& container, const intermediate::MemoryInstruction* mem);

// represents analysis data for the range of memory accessed per memory object
struct MemoryAccessRange
{
const Local* memoryObject;
// the memory instruction accessing the memory object
InstructionWalker memoryInstruction;
// the instruction adding the offset to the base pointer, could be the same as addressWrite
InstructionWalker baseAddressAdd;
// the instruction converting the address offset from element offset to byte offset
Optional<InstructionWalker> typeSizeShift;
// the work-group uniform parts from which the address offset is calculated
FastMap<Value, intermediate::InstructionDecorations> groupUniformAddressParts;
// the dynamic parts from which the address offset is calculated
FastMap<Value, intermediate::InstructionDecorations> dynamicAddressParts;
// the maximum range (in elements!) the memory is accessed in
analysis::IntegerRange offsetRange{0, 0};

std::string to_string() const;
};

/*
* Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as
* specified in range) to the work-item specific part only.
* This can be seen as a specialization of #insertAddressToOffset
*
* NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
*
* E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_local_id(0)
*/
InstructionWalker insertAddressToWorkItemSpecificOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range);

/*
* Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as
* specified in range) to the work-group uniform part only.
* Adding the result of this function and #insertAddressToWorkItemSpecificOffset will result in the original
* address
*
* NOTE: The result itself is in "memory-address mode", meaning the offset is the number of bytes, since it is
* meant for addressing memory
*
* E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_group_id(0)
*/
InstructionWalker insertAddressToWorkGroupUniformOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range);

struct LocalUsageOrdering
{
bool operator()(const Local* l1, const Local* l2) const;
};
} /* namespace normalization */
} /* namespace vc4c */
#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */
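insertAddressToWorkItemSpecificOffset and insertAddressToWorkGroupUniformOffset split one address offset into a work-item specific part and a work-group uniform part whose sum reproduces the original address. A host-side sketch with illustrative values:

```cpp
#include <cstdint>
#include <iostream>

// Host-side sketch of the address split documented above: the byte offset of an
// accessed address decomposes into a work-group uniform part (same for all
// work-items of a group) and a work-item specific part. Values are illustrative only.
int main()
{
    const uint32_t baseAddress = 0x1000;
    const uint32_t groupUniformOffset = 0x200; // e.g. derived from get_group_id(0)
    const uint32_t workItemOffset = 0x0C;      // e.g. derived from get_local_id(0)

    const uint32_t address = baseAddress + groupUniformOffset + workItemOffset;

    // insertAddressToWorkGroupUniformOffset yields baseAddress + groupUniformOffset,
    // insertAddressToWorkItemSpecificOffset yields workItemOffset; summing them
    // reproduces the original address.
    std::cout << ((baseAddress + groupUniformOffset) + workItemOffset == address) << '\n'; // 1
    return 0;
}
```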