Skip to content

Commit

Permalink
Rewrites memory access mapping
Browse files Browse the repository at this point in the history
Memory access is now mapped in the following steps:

* Determine preferred and fall-back lowering type per memory area (e.g. register, VPM, TMU, DMA)
* Check whether lowering type can be applied, reserve resources
* Map all memory access to specified lowering level

Copies useful changes from #120 without changing the semantics.
  • Loading branch information
doe300 committed May 18, 2019
1 parent be96155 commit 22ad51e
Show file tree
Hide file tree
Showing 8 changed files with 1,888 additions and 1,055 deletions.
153 changes: 153 additions & 0 deletions src/normalization/AddressCalculation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/

#include "AddressCalculation.h"

#include "../InstructionWalker.h"
#include "../intermediate/operators.h"
#include "../periphery/VPM.h"
#include "log.h"

using namespace vc4c;
using namespace vc4c::intermediate;
using namespace vc4c::normalization;
using namespace vc4c::operators;

/*
 * Maps the usage type of a VPM area to the memory access type used by the normalization steps.
 *
 * Scratch and local-memory areas are shared between all QPUs, while register-spilling and
 * stack areas have a separate region per QPU.
 */
MemoryAccessType normalization::toMemoryAccessType(periphery::VPMUsage usage)
{
    switch(usage)
    {
    case periphery::VPMUsage::SCRATCH:
    case periphery::VPMUsage::LOCAL_MEMORY:
        // accessed by all QPUs via a common VPM area
        return MemoryAccessType::VPM_SHARED_ACCESS;
    case periphery::VPMUsage::REGISTER_SPILLING:
    case periphery::VPMUsage::STACK:
        // dedicated VPM region per QPU (e.g. one stack-frame per QPU)
        return MemoryAccessType::VPM_PER_QPU;
    }
    // unreachable for valid enum values, but guards against out-of-range input
    throw CompilationError(CompilationStep::NORMALIZER,
        "Unknown VPM usage type to map to memory type: ", std::to_string(static_cast<int>(usage)));
}

/*
 * Converts the given address value to the byte-offset relative to the given base address,
 * inserting any required calculation instructions at the given position.
 *
 * out         output parameter receiving the value holding the byte-offset
 * baseAddress the base address of the accessed memory object
 * mem         the memory instruction the address belongs to (needed to skip the store itself
 *             when looking for the instruction calculating the address)
 * ptrValue    the address value to convert
 */
InstructionWalker normalization::insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, const MemoryInstruction* mem, const Value& ptrValue)
{
    auto indexOp = dynamic_cast<const Operation*>(ptrValue.getSingleWriter());
    if(!indexOp)
    {
        // for stores, the store itself is also a write instruction, so there may be 2 writers:
        // the store and the actual index calculation - skip the store to find the calculation
        auto writers = ptrValue.local()->getUsers(LocalUse::Type::WRITER);
        if(writers.size() == 2 && writers.find(mem) != writers.end())
        {
            writers.erase(mem);
            indexOp = dynamic_cast<const Operation*>(*writers.begin());
        }
    }
    if(ptrValue.hasLocal(baseAddress))
    {
        // trivial case, the address is the base address itself, so the offset is zero
        out = INT_ZERO;
    }
    else if(indexOp && indexOp->readsLocal(baseAddress) && indexOp->op == OP_ADD)
    {
        // for the simple version where the index is base address + offset, simply use the offset directly
        out = indexOp->getFirstArg().hasLocal(baseAddress) ? indexOp->getSecondArg().value() : indexOp->getFirstArg();
    }
    else
    {
        // for more complex versions, calculate offset by subtracting base address from result
        // address
        out = assign(it, baseAddress->type, "%pointer_diff") = ptrValue - baseAddress->createReference();
    }
    return it;
}

/*
 * Converts the given address to the byte-offset within the (possibly per-QPU) memory area.
 *
 * For per-QPU memory (MemoryAccessType::VPM_PER_QPU, e.g. stack allocations), the start offset
 * of this QPU's stack-frame is added on top of the plain address offset. For all other access
 * types the plain offset is returned unchanged.
 */
InstructionWalker normalization::insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, MemoryAccessType type, const MemoryInstruction* mem, const Value& ptrValue)
{
    Value tmpIndex = UNDEFINED_VALUE;
    it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem, ptrValue);
    if(type == MemoryAccessType::VPM_PER_QPU)
    {
        // size of one stack-frame in bytes
        auto stackByteSize = periphery::VPM::getVPMStorageType(baseAddress->type.getElementType()).getPhysicalWidth();
        // add offset of this QPU's stack-frame: stack-frame size * QPU number
        Value stackOffset = method.addNewLocal(TYPE_VOID_POINTER, "%stack_offset");
        assign(it, stackOffset) = mul24(Value(Literal(stackByteSize), TYPE_INT16), Value(REG_QPU_NUMBER, TYPE_INT8));
        out = assign(it, TYPE_VOID_POINTER, "%stack_offset") = tmpIndex + stackOffset;
    }
    else
    {
        // shared memory: all QPUs use the same offset
        out = tmpIndex;
    }
    return it;
}

/*
 * Converts the given address to the offset in elements (of the container's element type)
 * relative to the given base address.
 */
InstructionWalker normalization::insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, const Value& container, const MemoryInstruction* mem, const Value& ptrValue)
{
    Value tmpIndex = UNDEFINED_VALUE;
    it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem, ptrValue);
    // the index (as per index calculation) is in bytes, but we need index in elements, so divide by element size
    out = assign(it, TYPE_VOID_POINTER, "%element_offset") =
        tmpIndex / Literal(container.type.getElementType().getPhysicalWidth());
    return it;
}

/*
 * Inserts instructions summing up all the given values into a single value.
 *
 * Returns the value holding the sum together with the intersection of all the summands'
 * decorations, or an empty optional if no values were given.
 *
 * NOTE: The map of added values is consumed (it is empty after this call).
 */
static Optional<std::pair<Value, InstructionDecorations>> combineAdditions(
    Method& method, InstructionWalker referenceIt, FastMap<Value, InstructionDecorations>& addedValues)
{
    if(addedValues.empty())
        return {};
    Optional<std::pair<Value, InstructionDecorations>> prevResult;
    auto valIt = addedValues.begin();
    while(valIt != addedValues.end())
    {
        if(prevResult)
        {
            // add the next value to the sum calculated so far, keeping only the decorations valid for both
            auto newFlags = intersect_flags(prevResult->second, valIt->second);
            auto newResult = assign(referenceIt, prevResult->first.type) = (prevResult->first + valIt->first, newFlags);
            prevResult = std::make_pair(newResult, newFlags);
        }
        else
            // first value, nothing to add yet
            prevResult = std::make_pair(valIt->first, valIt->second);
        valIt = addedValues.erase(valIt);
    }
    return prevResult;
}

/*
 * Builds a human-readable description of this memory access range, e.g. for debug logging.
 */
std::string MemoryAccessRange::to_string() const
{
    std::string repr = memoryInstruction->to_string();
    // NOTE: writing the DMA load address means we are reading from memory
    repr += memoryInstruction->writesRegister(REG_VPM_DMA_LOAD_ADDR) ? " - read " : " - write ";
    repr += memoryObject->to_string();
    repr += groupUniformAddressParts.empty() ? " with" : " with work-group uniform offset and";
    repr += " dynamic element range [";
    repr += std::to_string(offsetRange.minValue);
    repr += ", ";
    repr += std::to_string(offsetRange.maxValue);
    repr += "]";
    return repr;
}

/*
 * Converts an address which contains work-group uniform and work-item specific parts (as
 * specified in the given range) to the work-item specific part only.
 *
 * Sums up all dynamic (work-item specific) address parts and, if the range records a shift
 * converting element offsets to byte offsets, re-applies that shift to the sum.
 *
 * FIX: combineAdditions() returns an empty optional when there are no dynamic address parts;
 * the previous code unconditionally dereferenced the optional, which is undefined behavior in
 * that case. We now fall back to a zero offset (the address is fully work-group uniform).
 */
InstructionWalker normalization::insertAddressToWorkItemSpecificOffset(
    InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range)
{
    auto dynamicParts = combineAdditions(method, it, range.dynamicAddressParts);
    if(!dynamicParts)
    {
        // no work-item specific parts at all -> the work-item specific offset is zero
        out = INT_ZERO;
        return it;
    }
    out = dynamicParts->first;
    if(range.typeSizeShift)
        // re-apply the element-offset -> byte-offset conversion to the combined dynamic offset
        out = assign(it, dynamicParts->first.type) =
            (dynamicParts->first << (*range.typeSizeShift)->assertArgument(1), dynamicParts->second);
    return it;
}

/*
 * Strict weak ordering sorting locals with more users first.
 *
 * Since there is always only 1 writer for the local address, comparing the total user count is
 * equivalent to (and cheaper than) counting only the readers.
 * TODO is this the correct way to do this? E.g. is there one usage per memory access?
 */
bool LocalUsageOrdering::operator()(const Local* l1, const Local* l2) const
{
    const auto numUsers1 = l1->getUsers().size();
    const auto numUsers2 = l2->getUsers().size();
    if(numUsers1 != numUsers2)
        // prefer more usages over less usages
        return numUsers1 > numUsers2;
    // fall back to pointer comparison to guarantee a strict total order for equal usage counts
    return l1 < l2;
}
125 changes: 125 additions & 0 deletions src/normalization/AddressCalculation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4C_NORMALIZATION_ADDRESS_CALCULATION_H
#define VC4C_NORMALIZATION_ADDRESS_CALCULATION_H

#include "../InstructionWalker.h"
#include "../Values.h"
#include "../analysis/ValueRange.h"
#include "../intermediate/IntermediateInstruction.h"
#include "../performance.h"

namespace vc4c
{
    namespace periphery
    {
        enum class VPMUsage : unsigned char;
    } // namespace periphery

    namespace normalization
    {
        /**
         * Enum for the different ways of how to access memory areas
         */
        enum class MemoryAccessType
        {
            // lower the value into a register and replace all loads with moves
            QPU_REGISTER_READONLY,
            // lower the value into a register and replace all loads/stores with moves
            QPU_REGISTER_READWRITE,
            // store in VPM, with a separate area per QPU (e.g. for stack allocations)
            VPM_PER_QPU,
            // store in VPM, QPUs share access to common data
            VPM_SHARED_ACCESS,
            // keep in RAM/global data segment, read via TMU
            RAM_LOAD_TMU,
            // keep in RAM/global data segment, access via VPM
            RAM_READ_WRITE_VPM
        };

        // per-memory-area bookkeeping of the accessing instructions and the chosen lowering types
        struct MemoryAccess
        {
            // all instructions accessing this memory area
            FastSet<InstructionWalker> accessInstructions;
            // the preferred way of lowering the access (applied if resources allow)
            MemoryAccessType preferred;
            // the fall-back lowering used if the preferred type cannot be applied
            MemoryAccessType fallback;
        };

        // maps the usage type of a VPM area to the corresponding memory access type
        MemoryAccessType toMemoryAccessType(periphery::VPMUsage usage);

        /*
         * Converts an address (e.g. an index chain) and the corresponding base pointer to the pointer difference
         *
         * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
         *
         * Returns (char*)address - (char*)baseAddress
         */
        NODISCARD InstructionWalker insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, const intermediate::MemoryInstruction* mem, const Value& ptrValue);

        /*
         * Converts an address (e.g. an index-chain) and a base-address to the offset of the vector denoting the element
         * accessed by the index-chain. In addition to #insertAddressToOffset, this function also handles multiple
         * stack-frames.
         *
         * NOTE: The result is still the offset in bytes, since VPM#insertReadVPM and VPM#insertWriteVPM take the offset
         * in bytes!
         *
         * Returns ((char*)address - (char*)baseAddress) + (typeSizeInBytes * stackIndex), where stackIndex is always
         * zero (and therefore the second part omitted) for shared memory
         */
        NODISCARD InstructionWalker insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, MemoryAccessType type, const intermediate::MemoryInstruction* mem,
            const Value& ptrValue);

        /*
         * Converts an address (e.g. index-chain) and the corresponding base-address to the element offset for an
         * element of the type used in the container
         *
         * Return ((char*)address - (char*)baseAddress) / sizeof(elementType)
         */
        NODISCARD InstructionWalker insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, const Value& container, const intermediate::MemoryInstruction* mem,
            const Value& ptrValue);

        // represents analysis data for the range of memory accessed per memory object
        struct MemoryAccessRange
        {
            const Local* memoryObject;
            // the memory instruction accessing the memory object
            InstructionWalker memoryInstruction;
            // the instruction adding the offset to the base pointer
            InstructionWalker baseAddressAdd;
            // the instruction converting the address offset from element offset to byte offset
            Optional<InstructionWalker> typeSizeShift;
            // the work-group uniform parts of which the address offset is calculated from
            FastMap<Value, intermediate::InstructionDecorations> groupUniformAddressParts;
            // the dynamic parts (specific to the work-item) of which the address offset is calculated from
            FastMap<Value, intermediate::InstructionDecorations> dynamicAddressParts;
            // the maximum range (in elements!) the memory is accessed in
            analysis::IntegerRange offsetRange{0, 0};

            std::string to_string() const;
        };

        /*
         * Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as
         * specified in range) to the work-item specific part only.
         * This can be seen as a specialization of #insertAddressToOffset
         *
         * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
         *
         * E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_local_id(0)
         */
        NODISCARD InstructionWalker insertAddressToWorkItemSpecificOffset(
            InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range);

        // strict weak ordering preferring locals with more users (see implementation for details)
        struct LocalUsageOrdering
        {
            bool operator()(const Local* l1, const Local* l2) const;
        };
    } /* namespace normalization */
} /* namespace vc4c */
#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */
Loading

0 comments on commit 22ad51e

Please sign in to comment.