-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Memory access is now mapped in the following steps: * Determine the preferred and fall-back lowering type per memory area (e.g. register, VPM, TMU, DMA) * Check whether the lowering type can be applied, and reserve resources * Map all memory accesses to the selected lowering level. Copies useful changes from #120 without changing the semantics.
- Loading branch information
Showing
8 changed files
with
1,888 additions
and
1,055 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
/* | ||
* Author: doe300 | ||
* | ||
* See the file "LICENSE" for the full license governing this code. | ||
*/ | ||
|
||
#include "AddressCalculation.h" | ||
|
||
#include "../InstructionWalker.h" | ||
#include "../intermediate/operators.h" | ||
#include "../periphery/VPM.h" | ||
#include "log.h" | ||
|
||
using namespace vc4c; | ||
using namespace vc4c::intermediate; | ||
using namespace vc4c::normalization; | ||
using namespace vc4c::operators; | ||
|
||
MemoryAccessType normalization::toMemoryAccessType(periphery::VPMUsage usage)
{
    // Maps a VPM usage kind to the matching memory-access lowering type: scratch and general
    // local memory are shared between all QPUs, while register-spilling and stack areas are
    // private to a single QPU.
    if(usage == periphery::VPMUsage::SCRATCH || usage == periphery::VPMUsage::LOCAL_MEMORY)
        return MemoryAccessType::VPM_SHARED_ACCESS;
    if(usage == periphery::VPMUsage::REGISTER_SPILLING || usage == periphery::VPMUsage::STACK)
        return MemoryAccessType::VPM_PER_QPU;
    throw CompilationError(CompilationStep::NORMALIZER,
        "Unknown VPM usage type to map to memory type: ", std::to_string(static_cast<int>(usage)));
}
|
||
InstructionWalker normalization::insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, const MemoryInstruction* mem, const Value& ptrValue)
{
    // Calculates the byte offset of ptrValue relative to baseAddress, i.e. (char*)ptrValue -
    // (char*)baseAddress, writing the result into out. Any instructions required for the
    // calculation are inserted at it; the (possibly advanced) walker is returned.
    auto indexOp = dynamic_cast<const Operation*>(ptrValue.getSingleWriter());
    if(!indexOp)
    {
        // for stores, the store itself is also a write instruction
        auto writers = ptrValue.local()->getUsers(LocalUse::Type::WRITER);
        if(writers.size() == 2 && writers.find(mem) != writers.end())
        {
            // exactly two writers, one of which is the memory access itself -> the remaining
            // writer is the actual address calculation
            writers.erase(mem);
            indexOp = dynamic_cast<const Operation*>(*writers.begin());
        }
    }
    if(ptrValue.hasLocal(baseAddress))
    {
        // trivial case, the offset is zero
        out = INT_ZERO;
    }
    else if(indexOp && indexOp->readsLocal(baseAddress) && indexOp->op == OP_ADD)
    {
        // for simple version where the index is base address + offset, simple use the offset directly
        // (whichever operand of the add is not the base address)
        out = indexOp->getFirstArg().hasLocal(baseAddress) ? indexOp->getSecondArg().value() : indexOp->getFirstArg();
    }
    else
    {
        // for more complex versions, calculate offset by subtracting base address from result
        // address
        out = assign(it, baseAddress->type, "%pointer_diff") = ptrValue - baseAddress->createReference();
    }
    return it;
}
|
||
InstructionWalker normalization::insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, MemoryAccessType type, const MemoryInstruction* mem, const Value& ptrValue)
{
    // Calculates the byte offset into the VPM area for the given address and writes it into out.
    // For per-QPU areas the offset of the executing QPU's own stack-frame is added on top of the
    // plain address difference.
    Value tmpIndex = UNDEFINED_VALUE;
    it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem, ptrValue);
    if(type == MemoryAccessType::VPM_PER_QPU)
    {
        // size of one stack-frame in bytes
        auto stackByteSize = periphery::VPM::getVPMStorageType(baseAddress->type.getElementType()).getPhysicalWidth();
        // add offset of stack-frame
        Value stackOffset = method.addNewLocal(TYPE_VOID_POINTER, "%stack_offset");
        // stack-frame offset = stack-frame size * QPU number (every QPU has its own frame)
        assign(it, stackOffset) = mul24(Value(Literal(stackByteSize), TYPE_INT16), Value(REG_QPU_NUMBER, TYPE_INT8));
        out = assign(it, TYPE_VOID_POINTER, "%stack_offset") = tmpIndex + stackOffset;
    }
    else
    {
        // shared VPM area -> no per-QPU stack-frame offset to apply
        out = tmpIndex;
    }
    return it;
}
|
||
InstructionWalker normalization::insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
    const Local* baseAddress, const Value& container, const MemoryInstruction* mem, const Value& ptrValue)
{
    // First determine the plain byte offset from the base address...
    Value byteOffset = UNDEFINED_VALUE;
    it = insertAddressToOffset(it, method, byteOffset, baseAddress, mem, ptrValue);
    // ...then convert it to an element offset, since the index calculation works in bytes but the
    // caller needs the index in elements -> divide by the physical element size
    auto elementWidth = Literal(container.type.getElementType().getPhysicalWidth());
    out = assign(it, TYPE_VOID_POINTER, "%element_offset") = byteOffset / elementWidth;
    return it;
}
|
||
// Sums all entries of addedValues into a single value by inserting 'add' instructions at
// referenceIt, intersecting the instruction decorations of all parts along the way.
// NOTE: consumes addedValues (the map is emptied). Returns an empty Optional if there were no
// values to combine at all.
static Optional<std::pair<Value, InstructionDecorations>> combineAdditions(
    Method& method, InstructionWalker referenceIt, FastMap<Value, InstructionDecorations>& addedValues)
{
    if(addedValues.empty())
        return {};
    Optional<std::pair<Value, InstructionDecorations>> prevResult;
    auto valIt = addedValues.begin();
    while(valIt != addedValues.end())
    {
        if(prevResult)
        {
            // only decorations common to all parts may be applied to the combined sum
            auto newFlags = intersect_flags(prevResult->second, valIt->second);
            auto newResult = assign(referenceIt, prevResult->first.type) = (prevResult->first + valIt->first, newFlags);
            prevResult = std::make_pair(newResult, newFlags);
        }
        else
            // first entry, nothing to add to yet
            prevResult = std::make_pair(valIt->first, valIt->second);
        valIt = addedValues.erase(valIt);
    }
    return prevResult;
}
|
||
std::string MemoryAccessRange::to_string() const | ||
{ | ||
return (memoryInstruction->to_string() + | ||
(memoryInstruction->writesRegister(REG_VPM_DMA_LOAD_ADDR) ? " - read " : " - write ")) + | ||
(memoryObject->to_string() + | ||
(groupUniformAddressParts.empty() ? " with" : " with work-group uniform offset and") + | ||
" dynamic element range [") + | ||
(std::to_string(offsetRange.minValue) + ", ") + (std::to_string(offsetRange.maxValue) + "]"); | ||
} | ||
|
||
InstructionWalker normalization::insertAddressToWorkItemSpecificOffset(
    InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range)
{
    // Rewrites the address calculation to only contain the work-item specific (dynamic) parts,
    // e.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) becomes get_local_id(0).
    // NOTE: the resulting offset in out is still in "memory-address mode" (number of bytes).
    auto dynamicParts = combineAdditions(method, it, range.dynamicAddressParts);
    if(!dynamicParts)
        // guard against dereferencing an empty Optional: combineAdditions returns an empty result
        // if there are no dynamic address parts at all
        throw CompilationError(CompilationStep::NORMALIZER,
            "Cannot calculate work-item specific offset without dynamic address parts", range.to_string());
    out = dynamicParts->first;
    if(range.typeSizeShift)
        // re-apply the element-offset to byte-offset conversion (shift by the type-size exponent)
        out = assign(it, dynamicParts->first.type) =
            (dynamicParts->first << (*range.typeSizeShift)->assertArgument(1), dynamicParts->second);
    return it;
}
|
||
bool LocalUsageOrdering::operator()(const Local* l1, const Local* l2) const | ||
{ | ||
// prefer more usages over less usages | ||
// since there is always only 1 writer for the local address, we prefer this over only counting readers for | ||
// performance reasons | ||
// TODO is this the correct way to do this? E.g. is there one usage per memory access? | ||
if(l1->getUsers().size() > l2->getUsers().size()) | ||
return true; | ||
if(l1->getUsers().size() == l2->getUsers().size()) | ||
return l1 < l2; | ||
return false; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
/* | ||
* Author: doe300 | ||
* | ||
* See the file "LICENSE" for the full license governing this code. | ||
*/ | ||
#ifndef VC4C_NORMALIZATION_ADDRESS_CALCULATION_H
#define VC4C_NORMALIZATION_ADDRESS_CALCULATION_H

#include "../InstructionWalker.h"
#include "../Values.h"
#include "../analysis/ValueRange.h"
#include "../intermediate/IntermediateInstruction.h"
#include "../performance.h"

namespace vc4c
{
    namespace periphery
    {
        // forward declaration, the actual definition resides in periphery/VPM.h
        enum class VPMUsage : unsigned char;
    } // namespace periphery

    namespace normalization
    {
        /**
         * Enum for the different ways of how to access memory areas
         */
        enum class MemoryAccessType
        {
            // lower the value into a register and replace all loads with moves
            QPU_REGISTER_READONLY,
            // lower the value into a register and replace all loads/stores with moves
            QPU_REGISTER_READWRITE,
            // store in VPM, in extra space per QPU (e.g. for per-QPU stack-frames)
            VPM_PER_QPU,
            // store in VPM, QPUs share access to common data
            VPM_SHARED_ACCESS,
            // keep in RAM/global data segment, read via TMU
            RAM_LOAD_TMU,
            // keep in RAM/global data segment, access via VPM
            RAM_READ_WRITE_VPM
        };

        // Groups all accesses to a single memory area together with the lowering types to apply
        struct MemoryAccess
        {
            // all instructions reading from or writing to the memory area
            FastSet<InstructionWalker> accessInstructions;
            // the preferred way of lowering accesses to this memory area
            MemoryAccessType preferred;
            // the fall-back lowering to apply if the preferred type cannot be used (e.g. lack of resources)
            MemoryAccessType fallback;
        };

        // Maps the given VPM usage kind to the corresponding memory access lowering type
        MemoryAccessType toMemoryAccessType(periphery::VPMUsage usage);

        /*
         * Converts an address (e.g. an index chain) and the corresponding base pointer to the pointer difference
         *
         * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
         *
         * Returns (char*)address - (char*)baseAddress
         */
        NODISCARD InstructionWalker insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, const intermediate::MemoryInstruction* mem, const Value& ptrValue);

        /*
         * Converts an address (e.g. an index-chain) and a base-address to the offset of the vector denoting the element
         * accessed by the index-chain. In addition to #insertAddressToOffset, this function also handles multiple
         * stack-frames.
         *
         * NOTE: The result is still the offset in bytes, since VPM#insertReadVPM and VPM#insertWriteVPM take the offset
         * in bytes!
         *
         * Returns ((char*)address - (char*)baseAddress) + (typeSizeInBytes * stackIndex), where stackIndex is always
         * zero (and therefore the second part omitted) for shared memory
         */
        NODISCARD InstructionWalker insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, MemoryAccessType type, const intermediate::MemoryInstruction* mem,
            const Value& ptrValue);

        /*
         * Converts an address (e.g. index-chain) and the corresponding base-address to the element offset for an
         * element of the type used in the container
         *
         * Return ((char*)address - (char*)baseAddress) / sizeof(elementType)
         */
        NODISCARD InstructionWalker insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
            const Local* baseAddress, const Value& container, const intermediate::MemoryInstruction* mem,
            const Value& ptrValue);

        // represents analysis data for the range of memory accessed per memory object
        struct MemoryAccessRange
        {
            // the memory object (base address) whose access range was analyzed
            const Local* memoryObject;
            // the memory instruction accessing the memory object
            InstructionWalker memoryInstruction;
            // the instruction adding the offset to the base pointer
            InstructionWalker baseAddressAdd;
            // the instruction converting the address offset from element offset to byte offset (if any)
            Optional<InstructionWalker> typeSizeShift;
            // the work-group uniform parts of which the address offset is calculated from
            FastMap<Value, intermediate::InstructionDecorations> groupUniformAddressParts;
            // the dynamic parts (specific to the work-item) of which the address offset is calculated from
            FastMap<Value, intermediate::InstructionDecorations> dynamicAddressParts;
            // the maximum range (in elements!) the memory is accessed in
            analysis::IntegerRange offsetRange{0, 0};

            // Returns a human-readable description of this access range
            std::string to_string() const;
        };

        /*
         * Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as
         * specified in range) to the work-item specific part only.
         * This can be seen as a specialization of #insertAddressToOffset
         *
         * NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
         *
         * E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_local_id(0)
         */
        NODISCARD InstructionWalker insertAddressToWorkItemSpecificOffset(
            InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range);

        // Strict-weak ordering of locals by their number of users (locals with more users sort first)
        struct LocalUsageOrdering
        {
            // Returns true if l1 is ordered before l2 (see implementation for the exact criteria)
            bool operator()(const Local* l1, const Local* l2) const;
        };
    } /* namespace normalization */
} /* namespace vc4c */
#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */
Oops, something went wrong.