Skip to content

Commit

Permalink
Rewrites memory access mapping
Browse files Browse the repository at this point in the history
Memory access is now mapped in the following steps:

* Determine preferred and fall-back lowering type per memory area (e.g. register, VPM, TMU, DMA)
* Check whether lowering type can be applied, reserve resources
* Map all memory access to specified lowering level

Copies useful changes from #120 without changing the semantics.
  • Loading branch information
doe300 committed May 18, 2019
1 parent be96155 commit 22ad51e
Show file tree
Hide file tree
Showing 8 changed files with 1,888 additions and 1,055 deletions.
153 changes: 153 additions & 0 deletions src/normalization/AddressCalculation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/

#include "AddressCalculation.h"

#include "../InstructionWalker.h"
#include "../intermediate/operators.h"
#include "../periphery/VPM.h"
#include "log.h"

using namespace vc4c;
using namespace vc4c::intermediate;
using namespace vc4c::normalization;
using namespace vc4c::operators;

/*
 * Maps the usage type of a VPM area to the memory access type used by the normalization steps.
 *
 * Scratch and local-memory areas are shared between all QPUs, while register-spilling and
 * stack areas have a separate region per QPU.
 */
MemoryAccessType normalization::toMemoryAccessType(periphery::VPMUsage usage)
{
    switch(usage)
    {
    case periphery::VPMUsage::SCRATCH:
    case periphery::VPMUsage::LOCAL_MEMORY:
        // accessed by all QPUs via a common VPM area
        return MemoryAccessType::VPM_SHARED_ACCESS;
    case periphery::VPMUsage::REGISTER_SPILLING:
    case periphery::VPMUsage::STACK:
        // dedicated VPM region per QPU (e.g. one stack-frame per QPU)
        return MemoryAccessType::VPM_PER_QPU;
    }
    // unreachable for valid enum values, but guards against out-of-range input
    throw CompilationError(CompilationStep::NORMALIZER,
        "Unknown VPM usage type to map to memory type: ", std::to_string(static_cast<int>(usage)));
}

/*
 * Converts the given address value to the byte-offset relative to the given base address,
 * inserting any required calculation instructions at the given position.
 *
 * out         output parameter receiving the value holding the byte-offset
 * baseAddress the base address of the accessed memory object
 * mem         the memory instruction the address belongs to (needed to skip the store itself
 *             when looking for the instruction calculating the address)
 * ptrValue    the address value to convert
 */
InstructionWalker normalization::insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, const MemoryInstruction* mem, const Value& ptrValue)
{
    auto indexOp = dynamic_cast<const Operation*>(ptrValue.getSingleWriter());
    if(!indexOp)
    {
        // for stores, the store itself is also a write instruction, so there may be 2 writers:
        // the store and the actual index calculation - skip the store to find the calculation
        auto writers = ptrValue.local()->getUsers(LocalUse::Type::WRITER);
        if(writers.size() == 2 && writers.find(mem) != writers.end())
        {
            writers.erase(mem);
            indexOp = dynamic_cast<const Operation*>(*writers.begin());
        }
    }
    if(ptrValue.hasLocal(baseAddress))
    {
        // trivial case, the address is the base address itself, so the offset is zero
        out = INT_ZERO;
    }
    else if(indexOp && indexOp->readsLocal(baseAddress) && indexOp->op == OP_ADD)
    {
        // for the simple version where the index is base address + offset, simply use the offset directly
        out = indexOp->getFirstArg().hasLocal(baseAddress) ? indexOp->getSecondArg().value() : indexOp->getFirstArg();
    }
    else
    {
        // for more complex versions, calculate offset by subtracting base address from result
        // address
        out = assign(it, baseAddress->type, "%pointer_diff") = ptrValue - baseAddress->createReference();
    }
    return it;
}

/*
 * Converts the given address to the byte-offset within the (possibly per-QPU) memory area.
 *
 * For per-QPU memory (MemoryAccessType::VPM_PER_QPU, e.g. stack allocations), the start offset
 * of this QPU's stack-frame is added on top of the plain address offset. For all other access
 * types the plain offset is returned unchanged.
 */
InstructionWalker normalization::insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, MemoryAccessType type, const MemoryInstruction* mem, const Value& ptrValue)
{
    Value tmpIndex = UNDEFINED_VALUE;
    it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem, ptrValue);
    if(type == MemoryAccessType::VPM_PER_QPU)
    {
        // size of one stack-frame in bytes
        auto stackByteSize = periphery::VPM::getVPMStorageType(baseAddress->type.getElementType()).getPhysicalWidth();
        // add offset of this QPU's stack-frame: stack-frame size * QPU number
        Value stackOffset = method.addNewLocal(TYPE_VOID_POINTER, "%stack_offset");
        assign(it, stackOffset) = mul24(Value(Literal(stackByteSize), TYPE_INT16), Value(REG_QPU_NUMBER, TYPE_INT8));
        out = assign(it, TYPE_VOID_POINTER, "%stack_offset") = tmpIndex + stackOffset;
    }
    else
    {
        // shared memory: all QPUs use the same offset
        out = tmpIndex;
    }
    return it;
}

/*
 * Converts the given address to the offset in elements (of the container's element type)
 * relative to the given base address.
 */
InstructionWalker normalization::insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, const Value& container, const MemoryInstruction* mem, const Value& ptrValue)
{
    Value tmpIndex = UNDEFINED_VALUE;
    it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem, ptrValue);
    // the index (as per index calculation) is in bytes, but we need index in elements, so divide by element size
    out = assign(it, TYPE_VOID_POINTER, "%element_offset") =
        tmpIndex / Literal(container.type.getElementType().getPhysicalWidth());
    return it;
}

/*
 * Inserts instructions summing up all the given values into a single value.
 *
 * Returns the value holding the sum together with the intersection of all the summands'
 * decorations, or an empty optional if no values were given.
 *
 * NOTE: The map of added values is consumed (it is empty after this call).
 */
static Optional<std::pair<Value, InstructionDecorations>> combineAdditions(
    Method& method, InstructionWalker referenceIt, FastMap<Value, InstructionDecorations>& addedValues)
{
    if(addedValues.empty())
        return {};
    Optional<std::pair<Value, InstructionDecorations>> prevResult;
    auto valIt = addedValues.begin();
    while(valIt != addedValues.end())
    {
        if(prevResult)
        {
            // add the next value to the sum calculated so far, keeping only the decorations valid for both
            auto newFlags = intersect_flags(prevResult->second, valIt->second);
            auto newResult = assign(referenceIt, prevResult->first.type) = (prevResult->first + valIt->first, newFlags);
            prevResult = std::make_pair(newResult, newFlags);
        }
        else
            // first value, nothing to add yet
            prevResult = std::make_pair(valIt->first, valIt->second);
        valIt = addedValues.erase(valIt);
    }
    return prevResult;
}

/*
 * Builds a human-readable description of this memory access range, e.g. for debug logging.
 */
std::string MemoryAccessRange::to_string() const
{
    std::string repr = memoryInstruction->to_string();
    // NOTE: writing the DMA load address means we are reading from memory
    repr += memoryInstruction->writesRegister(REG_VPM_DMA_LOAD_ADDR) ? " - read " : " - write ";
    repr += memoryObject->to_string();
    repr += groupUniformAddressParts.empty() ? " with" : " with work-group uniform offset and";
    repr += " dynamic element range [";
    repr += std::to_string(offsetRange.minValue);
    repr += ", ";
    repr += std::to_string(offsetRange.maxValue);
    repr += "]";
    return repr;
}

/*
 * Converts an address which contains work-group uniform and work-item specific parts (as
 * specified in the given range) to the work-item specific part only.
 *
 * Sums up all dynamic (work-item specific) address parts and, if the range records a shift
 * converting element offsets to byte offsets, re-applies that shift to the sum.
 *
 * FIX: combineAdditions() returns an empty optional when there are no dynamic address parts;
 * the previous code unconditionally dereferenced the optional, which is undefined behavior in
 * that case. We now fall back to a zero offset (the address is fully work-group uniform).
 */
InstructionWalker normalization::insertAddressToWorkItemSpecificOffset(
    InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range)
{
    auto dynamicParts = combineAdditions(method, it, range.dynamicAddressParts);
    if(!dynamicParts)
    {
        // no work-item specific parts at all -> the work-item specific offset is zero
        out = INT_ZERO;
        return it;
    }
    out = dynamicParts->first;
    if(range.typeSizeShift)
        // re-apply the element-offset -> byte-offset conversion to the combined dynamic offset
        out = assign(it, dynamicParts->first.type) =
            (dynamicParts->first << (*range.typeSizeShift)->assertArgument(1), dynamicParts->second);
    return it;
}

/*
 * Strict weak ordering sorting locals with more users first.
 *
 * Since there is always only 1 writer for the local address, comparing the total user count is
 * equivalent to (and cheaper than) counting only the readers.
 * TODO is this the correct way to do this? E.g. is there one usage per memory access?
 */
bool LocalUsageOrdering::operator()(const Local* l1, const Local* l2) const
{
    const auto numUsers1 = l1->getUsers().size();
    const auto numUsers2 = l2->getUsers().size();
    if(numUsers1 != numUsers2)
        // prefer more usages over less usages
        return numUsers1 > numUsers2;
    // fall back to pointer comparison to guarantee a strict total order for equal usage counts
    return l1 < l2;
}
125 changes: 125 additions & 0 deletions src/normalization/AddressCalculation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4C_NORMALIZATION_ADDRESS_CALCULATION_H
#define VC4C_NORMALIZATION_ADDRESS_CALCULATION_H

#include "../InstructionWalker.h"
#include "../Values.h"
#include "../analysis/ValueRange.h"
#include "../intermediate/IntermediateInstruction.h"
#include "../performance.h"

namespace vc4c
{
    namespace periphery
    {
        enum class VPMUsage : unsigned char;
    } // namespace periphery

    namespace normalization
    {
        /**
         * Enum for the different ways of how to access memory areas
         */
        enum class MemoryAccessType
        {
            // lower the value into a register and replace all loads with moves
            QPU_REGISTER_READONLY,
            // lower the value into a register and replace all loads/stores with moves
            QPU_REGISTER_READWRITE,
            // store in VPM, with a separate area per QPU (e.g. for stack allocations)
            VPM_PER_QPU,
            // store in VPM, QPUs share access to common data
            VPM_SHARED_ACCESS,
            // keep in RAM/global data segment, read via TMU
            RAM_LOAD_TMU,
            // keep in RAM/global data segment, access via VPM
            RAM_READ_WRITE_VPM
        };

        // per-memory-area bookkeeping of the accessing instructions and the chosen lowering types
        struct MemoryAccess
        {
            // all instructions accessing this memory area
            FastSet<InstructionWalker> accessInstructions;
            // the preferred way of lowering the access (applied if resources allow)
            MemoryAccessType preferred;
            // the fall-back lowering used if the preferred type cannot be applied
            MemoryAccessType fallback;
        };

        // maps the usage type of a VPM area to the corresponding memory access type
        MemoryAccessType toMemoryAccessType(periphery::VPMUsage usage);

        /*
         * Converts an address (e.g. an index chain) and the corresponding base pointer to the pointer difference
         *
         * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
         *
         * Returns (char*)address - (char*)baseAddress
         */
        NODISCARD InstructionWalker insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, const intermediate::MemoryInstruction* mem, const Value& ptrValue);

        /*
         * Converts an address (e.g. an index-chain) and a base-address to the offset of the vector denoting the element
         * accessed by the index-chain. In addition to #insertAddressToOffset, this function also handles multiple
         * stack-frames.
         *
         * NOTE: The result is still the offset in bytes, since VPM#insertReadVPM and VPM#insertWriteVPM take the offset
         * in bytes!
         *
         * Returns ((char*)address - (char*)baseAddress) + (typeSizeInBytes * stackIndex), where stackIndex is always
         * zero (and therefore the second part omitted) for shared memory
         */
        NODISCARD InstructionWalker insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, MemoryAccessType type, const intermediate::MemoryInstruction* mem,
            const Value& ptrValue);

        /*
         * Converts an address (e.g. index-chain) and the corresponding base-address to the element offset for an
         * element of the type used in the container
         *
         * Return ((char*)address - (char*)baseAddress) / sizeof(elementType)
         */
        NODISCARD InstructionWalker insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, const Value& container, const intermediate::MemoryInstruction* mem,
            const Value& ptrValue);

        // represents analysis data for the range of memory accessed per memory object
        struct MemoryAccessRange
        {
            const Local* memoryObject;
            // the memory instruction accessing the memory object
            InstructionWalker memoryInstruction;
            // the instruction adding the offset to the base pointer
            InstructionWalker baseAddressAdd;
            // the instruction converting the address offset from element offset to byte offset
            Optional<InstructionWalker> typeSizeShift;
            // the work-group uniform parts of which the address offset is calculated from
            FastMap<Value, intermediate::InstructionDecorations> groupUniformAddressParts;
            // the dynamic parts (specific to the work-item) of which the address offset is calculated from
            FastMap<Value, intermediate::InstructionDecorations> dynamicAddressParts;
            // the maximum range (in elements!) the memory is accessed in
            analysis::IntegerRange offsetRange{0, 0};

            std::string to_string() const;
        };

        /*
         * Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as
         * specified in range) to the work-item specific part only.
         * This can be seen as a specialization of #insertAddressToOffset
         *
         * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
         *
         * E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_local_id(0)
         */
        NODISCARD InstructionWalker insertAddressToWorkItemSpecificOffset(
            InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range);

        // strict weak ordering preferring locals with more users (see implementation for details)
        struct LocalUsageOrdering
        {
            bool operator()(const Local* l1, const Local* l2) const;
        };
    } /* namespace normalization */
} /* namespace vc4c */
#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */
Loading

0 comments on commit 22ad51e

Please sign in to comment.