Removes mutex locks from most memory accesses #120

Open · wants to merge 6 commits into base: master
15 changes: 8 additions & 7 deletions src/analysis/DependencyGraph.cpp
@@ -303,12 +303,13 @@ static void createMutexDependencies(DependencyGraph& graph, DependencyNode& node
const intermediate::IntermediateInstruction* lastSemaphoreAccess,
const intermediate::IntermediateInstruction* lastMemFence)
{
if((node.key->hasValueType(ValueType::REGISTER) && node.key->getOutput()->reg().isVertexPipelineMemory()) ||
std::any_of(node.key->getArguments().begin(), node.key->getArguments().end(),
[](const Value& arg) -> bool { return arg.hasRegister() && arg.reg().isVertexPipelineMemory(); }) ||
node.key->writesRegister(REG_MUTEX))
if(((node.key->hasValueType(ValueType::REGISTER) && node.key->getOutput()->reg().isVertexPipelineMemory()) ||
std::any_of(node.key->getArguments().begin(), node.key->getArguments().end(),
[](const Value& arg) -> bool { return arg.hasRegister() && arg.reg().isVertexPipelineMemory(); }) ||
node.key->writesRegister(REG_MUTEX)) &&
lastMutexLock != nullptr)
{
// any VPM operation or mutex unlock must be ordered after the corresponding mutex lock
// any VPM operation or mutex unlock must be ordered after a previous mutex lock, if any
auto& otherNode = graph.assertNode(lastMutexLock);
addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::MUTEX_LOCK);
}
@@ -497,13 +498,13 @@ static void createVPMIODependencies(DependencyGraph& graph, DependencyNode& node
(node.key->writesRegister(REG_VPM_IO) || node.key->writesRegister(REG_VPM_DMA_STORE_ADDR) ||
node.key->writesRegister(REG_MUTEX)))
{
// any other VPM write, VPM write address setup or unlocking mutex must be executed aftre the VPM write
// any other VPM write, VPM write address setup or unlocking mutex must be executed after the VPM write
auto& otherNode = graph.assertNode(lastVPMWrite);
addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::PERIPHERY_ORDER);
}
if(lastVPMRead != nullptr && (node.key->readsRegister(REG_VPM_IO) || node.key->writesRegister(REG_MUTEX)))
{
// any other VPM read or unlocking mutex must be executed aftre the VPM read
// any other VPM read or unlocking mutex must be executed after the VPM read
auto& otherNode = graph.assertNode(lastVPMRead);
addDependency(otherNode.getOrCreateEdge(&node).data, DependencyType::PERIPHERY_ORDER);
}
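Taken in isolation, the change to createMutexDependencies only creates the ordering edge to a mutex lock when such a lock actually precedes the VPM access, which with this PR can now legitimately be absent. A minimal stand-alone sketch of that guard (simplified types, not the actual vc4c classes):

```cpp
#include <iostream>

// Simplified stand-alone model (not the vc4c types): an instruction only needs an
// ordering edge to a mutex lock if such a lock actually precedes it.
struct Instruction
{
    bool accessesVPM;
    bool writesMutex;
};

static bool needsMutexLockDependency(const Instruction& inst, const Instruction* lastMutexLock)
{
    return (inst.accessesVPM || inst.writesMutex) && lastMutexLock != nullptr;
}

int main()
{
    Instruction lock{false, true};
    Instruction vpmAccess{true, false};
    std::cout << needsMutexLockDependency(vpmAccess, &lock) << '\n';   // 1 - ordered after the preceding lock
    std::cout << needsMutexLockDependency(vpmAccess, nullptr) << '\n'; // 0 - no preceding lock, no edge to create
    return 0;
}
```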
7 changes: 7 additions & 0 deletions src/intermediate/MemoryInstruction.cpp
@@ -94,6 +94,13 @@ MemoryInstruction::MemoryInstruction(
{
setArgument(0, src);
setArgument(1, numEntries);

if(numEntries != INT_ONE)
{
if(op != MemoryOperation::COPY && op != MemoryOperation::FILL)
throw CompilationError(
CompilationStep::LLVM_2_IR, "Can only use the entry count for copying or filling memory", to_string());
}
}

std::string MemoryInstruction::to_string() const
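The added constructor check rejects an entry count other than one for anything but COPY and FILL. A minimal stand-alone sketch of the same rule (hypothetical names, not the vc4c API):

```cpp
#include <stdexcept>

// Simplified stand-alone model of the new constructor check: an entry count other
// than 1 is only meaningful for COPY and FILL, every other operation accesses
// exactly one entry.
enum class MemoryOperation { READ, WRITE, COPY, FILL };

static void checkEntryCount(MemoryOperation op, unsigned numEntries)
{
    if(numEntries != 1 && op != MemoryOperation::COPY && op != MemoryOperation::FILL)
        throw std::invalid_argument("Can only use the entry count for copying or filling memory");
}

int main()
{
    checkEntryCount(MemoryOperation::COPY, 16); // accepted - copies 16 entries
    checkEntryCount(MemoryOperation::READ, 16); // throws - a read always accesses a single entry
    return 0;
}
```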
181 changes: 181 additions & 0 deletions src/normalization/AddressCalculation.cpp
@@ -0,0 +1,181 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/

#include "AddressCalculation.h"

#include "../Locals.h"
#include "../intermediate/IntermediateInstruction.h"
#include "../intermediate/operators.h"
#include "log.h"

using namespace vc4c;
using namespace vc4c::intermediate;
using namespace vc4c::normalization;
using namespace vc4c::operators;

MemoryType normalization::toMemoryType(periphery::VPMUsage usage)
{
switch(usage)
{
case periphery::VPMUsage::SCRATCH:
case periphery::VPMUsage::LOCAL_MEMORY:
case periphery::VPMUsage::MEMORY_CACHE:
return MemoryType::VPM_SHARED_ACCESS;
case periphery::VPMUsage::REGISTER_SPILLING:
case periphery::VPMUsage::STACK:
return MemoryType::VPM_PER_QPU;
}
throw CompilationError(CompilationStep::NORMALIZER,
"Unknown VPM usage type to map to memory type: ", std::to_string(static_cast<int>(usage)));
}

InstructionWalker normalization::insertAddressToOffset(
InstructionWalker it, Method& method, Value& out, const Local* baseAddress, const MemoryInstruction* mem)
{
auto ptrVal = mem->op == MemoryOperation::READ ? mem->getSource() : mem->getDestination();
auto indexOp = dynamic_cast<const Operation*>(ptrVal.getSingleWriter());
if(!indexOp)
{
// for stores, the store itself is also a write instruction
auto writers = ptrVal.local()->getUsers(LocalUse::Type::WRITER);
if(writers.size() == 2 && writers.find(mem) != writers.end())
{
writers.erase(mem);
indexOp = dynamic_cast<const Operation*>(*writers.begin());
}
}
if(ptrVal.hasLocal(baseAddress))
{
// trivial case, the offset is zero
out = INT_ZERO;
}
else if(indexOp && indexOp->readsLocal(baseAddress) && indexOp->op == OP_ADD)
{
// for the simple version where the index is base address + offset, simply use the offset directly
out = indexOp->getFirstArg().hasLocal(baseAddress) ? indexOp->getSecondArg().value() : indexOp->getFirstArg();
}
else
{
// for more complex versions, calculate offset by subtracting base address from result
// address
out = assign(it, baseAddress->type, "%pointer_diff") = ptrVal - baseAddress->createReference();
}
return it;
}

InstructionWalker normalization::insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, MemoryType type, const MemoryInstruction* mem)
{
Value tmpIndex = UNDEFINED_VALUE;
it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem);
if(type == MemoryType::VPM_PER_QPU)
{
// size of one stack-frame in bytes
auto stackByteSize = periphery::VPM::getVPMStorageType(baseAddress->type.getElementType()).getPhysicalWidth();
// add offset of stack-frame
Value stackOffset = method.addNewLocal(TYPE_VOID.toPointerType(), "%stack_offset");
Value tmp = method.addNewLocal(baseAddress->type);
assign(it, stackOffset) = mul24(Value(Literal(stackByteSize), TYPE_INT16), Value(REG_QPU_NUMBER, TYPE_INT8));
out = assign(it, TYPE_VOID.toPointerType(), "%stack_offset") = tmpIndex + stackOffset;
}
else
{
out = tmpIndex;
}
return it;
}

InstructionWalker normalization::insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, const Value& container, const MemoryInstruction* mem)
{
Value tmpIndex = UNDEFINED_VALUE;
it = insertAddressToOffset(it, method, tmpIndex, baseAddress, mem);
// the index (as per index calculation) is in bytes, but we need index in elements, so divide by element size
out = assign(it, TYPE_VOID.toPointerType(), "%element_offset") =
tmpIndex / Literal(container.type.getElementType().getPhysicalWidth());
return it;
}

static Optional<std::pair<Value, InstructionDecorations>> combineAdditions(
Method& method, InstructionWalker referenceIt, FastMap<Value, InstructionDecorations>& addedValues)
{
if(addedValues.empty())
return {};
Optional<std::pair<Value, InstructionDecorations>> prevResult;
auto valIt = addedValues.begin();
while(valIt != addedValues.end())
{
if(prevResult)
{
auto newResult = method.addNewLocal(prevResult->first.type);
auto newFlags = intersect_flags(prevResult->second, valIt->second);
referenceIt.emplace(new Operation(OP_ADD, newResult, prevResult->first, valIt->first));
referenceIt->addDecorations(newFlags);
referenceIt.nextInBlock();
prevResult = std::make_pair(newResult, newFlags);
}
else
prevResult = std::make_pair(valIt->first, valIt->second);
valIt = addedValues.erase(valIt);
}
return prevResult;
}

std::string MemoryAccessRange::to_string() const
{
return (memoryInstruction->to_string() +
(memoryInstruction->writesRegister(REG_VPM_DMA_LOAD_ADDR) ? " - read " : " - write ")) +
(memoryObject->to_string() +
(groupUniformAddressParts.empty() ? " with" : " with work-group uniform offset and") +
" dynamic element range [") +
(std::to_string(offsetRange.minValue) + ", ") + (std::to_string(offsetRange.maxValue) + "]");
}

InstructionWalker normalization::insertAddressToWorkItemSpecificOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range)
{
auto dynamicParts = combineAdditions(method, it, range.dynamicAddressParts);
out = dynamicParts->first;
if(range.typeSizeShift)
out = assign(it, dynamicParts->first.type) =
(dynamicParts->first << (*range.typeSizeShift)->assertArgument(1), dynamicParts->second);
return it;
}

InstructionWalker normalization::insertAddressToWorkGroupUniformOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range)
{
auto uniformParts = combineAdditions(method, it, range.groupUniformAddressParts);
out = uniformParts->first;
if(range.typeSizeShift)
out = assign(it, uniformParts->first.type) =
(uniformParts->first << (*range.typeSizeShift)->assertArgument(1), uniformParts->second);
static auto checkPointer = [](const Value& arg) -> bool { return arg.type.isPointerType(); };
if(std::all_of(
range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer) ||
std::none_of(
range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer))
throw CompilationError(CompilationStep::NORMALIZER, "Cannot determine base address of addition",
range.baseAddressAdd->to_string());
auto baseAddrIt = std::find_if(
range.baseAddressAdd->getArguments().begin(), range.baseAddressAdd->getArguments().end(), checkPointer);
out = assign(it, range.baseAddressAdd->getOutput()->type) =
(*baseAddrIt + out, uniformParts->second, InstructionDecorations::WORK_GROUP_UNIFORM_VALUE);
return it;
}

bool LocalUsageOrdering::operator()(const Local* l1, const Local* l2) const
{
// prefer more usages over fewer usages
// since there is always only 1 writer for the local address, we prefer this over only counting readers for
// performance reasons
// TODO is this the correct way to do this? E.g. is there one usage per memory access?
if(l1->getUsers().size() > l2->getUsers().size())
return true;
if(l1->getUsers().size() == l2->getUsers().size())
return l1 < l2;
return false;
}
129 changes: 129 additions & 0 deletions src/normalization/AddressCalculation.h
@@ -0,0 +1,129 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4C_NORMALIZATION_ADDRESS_CALCULATION_H
#define VC4C_NORMALIZATION_ADDRESS_CALCULATION_H

#include "../InstructionWalker.h"
#include "../Values.h"
#include "../analysis/ValueRange.h"
#include "../intermediate/IntermediateInstruction.h"
#include "../periphery/VPM.h"

namespace vc4c
{
namespace normalization
{
enum class MemoryType
{
// lower the value into a register and replace all loads with moves
QPU_REGISTER_READONLY,
// lower the value into a register and replace all loads/stores with moves
QPU_REGISTER_READWRITE,
// store in VPM in extra space per QPU!!
VPM_PER_QPU,
// store in VPM, QPUs share access to common data
VPM_SHARED_ACCESS,
// keep in RAM/global data segment, read via TMU
RAM_LOAD_TMU,
// keep in RAM/global data segment, access via VPM
RAM_READ_WRITE_VPM
};

struct MemoryAccess
{
FastSet<InstructionWalker> accessInstructions;
MemoryType preferred;
MemoryType fallback;
};

MemoryType toMemoryType(periphery::VPMUsage usage);

/*
* Converts an address (e.g. an index chain) and the corresponding base pointer to the pointer difference
*
* NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
*
* Returns (char*)address - (char*)baseAddress
*/
InstructionWalker insertAddressToOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, const intermediate::MemoryInstruction* mem);

/*
* Converts an address (e.g. an index-chain) and a base-address to the offset of the vector denoting the element
* accessed by the index-chain. In addition to #insertAddressToOffset, this function also handles multiple
* stack-frames.
*
* NOTE: The result is still the offset in bytes, since VPM#insertReadVPM and VPM#insertWriteVPM take the offset
* in bytes!
*
* Returns ((char*)address - (char*)baseAddress) + (typeSizeInBytes * stackIndex), where stackIndex is always
* zero (and therefore the second part omitted) for shared memory
*/
InstructionWalker insertAddressToStackOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, MemoryType type, const intermediate::MemoryInstruction* mem);

/*
* Converts an address (e.g. index-chain) and the corresponding base-address to the element offset for an
* element of the type used in the container
*
* Returns ((char*)address - (char*)baseAddress) / sizeof(elementType)
*/
InstructionWalker insertAddressToElementOffset(InstructionWalker it, Method& method, Value& out,
const Local* baseAddress, const Value& container, const intermediate::MemoryInstruction* mem);

// represents analysis data for the range of memory accessed per memory object
struct MemoryAccessRange
{
const Local* memoryObject;
// the memory instruction accessing the memory object
InstructionWalker memoryInstruction;
// the instruction adding the offset to the base pointer, could be the same as addressWrite
InstructionWalker baseAddressAdd;
// the instruction converting the address offset from element offset to byte offset
Optional<InstructionWalker> typeSizeShift;
// the work-group uniform parts from which the address offset is calculated
FastMap<Value, intermediate::InstructionDecorations> groupUniformAddressParts;
// the dynamic parts from which the address offset is calculated
FastMap<Value, intermediate::InstructionDecorations> dynamicAddressParts;
// the maximum range (in elements!) the memory is accessed in
analysis::IntegerRange offsetRange{0, 0};

std::string to_string() const;
};

/*
* Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as
* specified in range) to the work-item specific part only.
* This can be seen as a specialization of #insertAddressToOffset
*
* NOTE: The result itself is still in "memory-address mode", meaning the offset is the number of bytes
*
* E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_local_id(0)
*/
InstructionWalker insertAddressToWorkItemSpecificOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range);

/*
* Converts an address (e.g. index-chain) which contains work-group uniform and work-item specific parts (as
* specified in range) to the work-group uniform part only.
* Adding the result of this function and #insertAddressToWorkItemSpecificOffset will result in the original
* address
*
* NOTE: The result itself is in "memory-address mode", meaning the offset is the number of bytes, since it is
* meant for addressing memory
*
* E.g. get_global_id(0) (= get_group_id(0) + get_local_id(0)) will be converted to get_group_id(0)
*/
InstructionWalker insertAddressToWorkGroupUniformOffset(
InstructionWalker it, Method& method, Value& out, MemoryAccessRange& range);

struct LocalUsageOrdering
{
bool operator()(const Local* l1, const Local* l2) const;
};
} /* namespace normalization */
} /* namespace vc4c */
#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */
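insertAddressToWorkItemSpecificOffset and insertAddressToWorkGroupUniformOffset split one address offset into a work-item specific part and a work-group uniform part whose sum reproduces the original address. A host-side sketch with illustrative values:

```cpp
#include <cstdint>
#include <iostream>

// Host-side sketch of the address split documented above: the byte offset of an
// accessed address decomposes into a work-group uniform part (same for all
// work-items of a group) and a work-item specific part. Values are illustrative only.
int main()
{
    const uint32_t baseAddress = 0x1000;
    const uint32_t groupUniformOffset = 0x200; // e.g. derived from get_group_id(0)
    const uint32_t workItemOffset = 0x0C;      // e.g. derived from get_local_id(0)

    const uint32_t address = baseAddress + groupUniformOffset + workItemOffset;

    // insertAddressToWorkGroupUniformOffset yields baseAddress + groupUniformOffset,
    // insertAddressToWorkItemSpecificOffset yields workItemOffset; summing them
    // reproduces the original address.
    std::cout << ((baseAddress + groupUniformOffset) + workItemOffset == address) << '\n'; // 1
    return 0;
}
```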