diff --git a/src/Expression.cpp b/src/Expression.cpp
index 9bdba743..4d46c947 100644
--- a/src/Expression.cpp
+++ b/src/Expression.cpp
@@ -6,6 +6,8 @@
 using namespace vc4c;
 
 constexpr OpCode Expression::FAKEOP_UMUL;
+constexpr OpCode Expression::FAKEOP_MUL;
+constexpr OpCode Expression::FAKEOP_DIV;
 
 SubExpression::SubExpression(const Optional<Value>& val) : Base(VariantNamespace::monostate{})
 {
diff --git a/src/Expression.h b/src/Expression.h
index 4459a7df..1f33bebe 100644
--- a/src/Expression.h
+++ b/src/Expression.h
@@ -109,6 +109,9 @@ namespace vc4c
 
         // A fake operation to indicate an unsigned multiplication
        static constexpr OpCode FAKEOP_UMUL{"umul", 132, 132, 2, false, false, FlagBehavior::NONE};
+        static constexpr OpCode FAKEOP_MUL{"mul", 132, 132, 2, false, false, FlagBehavior::NONE};
+        static constexpr OpCode FAKEOP_DIV{"div", 132, 132, 2, false, false, FlagBehavior::NONE};
+
         OpCode code;
         SubExpression arg0;
         SubExpression arg1{};
diff --git a/src/normalization/Normalizer.cpp b/src/normalization/Normalizer.cpp
index 6401eea2..d5a79f19 100644
--- a/src/normalization/Normalizer.cpp
+++ b/src/normalization/Normalizer.cpp
@@ -16,6 +16,7 @@
 #include "../optimization/ControlFlow.h"
 #include "../optimization/Eliminator.h"
 #include "../optimization/Reordering.h"
+#include "../intermediate/operators.h"
 #include "../spirv/SPIRVBuiltins.h"
 #include "Inliner.h"
 #include "LiteralValues.h"
@@ -23,6 +24,8 @@
 #include "MemoryAccess.h"
 #include "Rewrite.h"
+
+#include "../optimization/Combiner.h"
 
 #include "log.h"
 
 #include
@@ -30,6 +33,8 @@
 using namespace vc4c;
 using namespace vc4c::normalization;
+using namespace vc4c::periphery;
+using namespace vc4c::operators;
 
 static bool checkWorkGroupUniform(const Value& arg)
 {
@@ -253,6 +258,16 @@ void Normalizer::normalize(Module& module) const
         PROFILE_COUNTER_WITH_PREV(vc4c::profiler::COUNTER_NORMALIZATION + 2, "Eliminate Phi-nodes (after)",
             method->countInstructions(), vc4c::profiler::COUNTER_NORMALIZATION + 1);
     }
+
+    {
+        // TODO: move this optimization to appropriate location
+        auto kernels = module.getKernels();
+        for(Method* kernelFunc : kernels)
+        {
+            optimizations::combineDMALoads(module, *kernelFunc, config);
+        }
+    }
+
     auto kernels = module.getKernels();
     // 2. inline kernel-functions
     for(Method* kernelFunc : kernels)
@@ -266,6 +281,7 @@ void Normalizer::normalize(Module& module) const
         PROFILE_COUNTER_WITH_PREV(vc4c::profiler::COUNTER_NORMALIZATION + 5, "Inline (after)",
             kernel.countInstructions(), vc4c::profiler::COUNTER_NORMALIZATION + 4);
     }
+
     // 3. run other normalization steps on kernel functions
     const auto f = [&module, this](Method* kernelFunc) -> void { normalizeMethod(module, *kernelFunc); };
     ThreadPool::scheduleAll("Normalization", kernels, f, THREAD_LOGGER.get());
diff --git a/src/optimization/Combiner.cpp b/src/optimization/Combiner.cpp
index d1f3e28b..da209506 100644
--- a/src/optimization/Combiner.cpp
+++ b/src/optimization/Combiner.cpp
@@ -6,17 +6,20 @@
 
 #include "Combiner.h"
 
+#include "../Expression.h"
 #include "../InstructionWalker.h"
 #include "../analysis/MemoryAnalysis.h"
 #include "../intermediate/Helper.h"
 #include "../intermediate/operators.h"
 #include "../periphery/VPM.h"
+#include "../spirv/SPIRVHelper.h"
 #include "Eliminator.h"
 
 #include "log.h"
 
 #include
 #include
 #include
+#include <regex>
 
 // TODO combine y = (x >> n) << n with and
 // same for y = (x << n) >> n (at least of n constant)
@@ -27,6 +30,7 @@
 using namespace vc4c;
 using namespace vc4c::optimizations;
 using namespace vc4c::intermediate;
 using namespace vc4c::operators;
+using namespace vc4c::periphery;
 
 // Taken from https://stackoverflow.com/questions/2835469/how-to-perform-rotate-shift-in-c?noredirect=1&lq=1
 constexpr static uint32_t rotate_left_halfword(uint32_t value, uint8_t shift) noexcept
 {
@@ -1121,6 +1125,518 @@ InstructionWalker optimizations::combineArithmeticOperations(
     return it;
 }
 
+SubExpression makeValueBinaryOpFromLocal(Value& left, const OpCode& binOp, Value& right)
+{
+    return SubExpression(std::make_shared<Expression>(binOp, SubExpression(left), SubExpression(right)));
+}
+
+// try to convert shl to mul and return it as a SubExpression
+SubExpression shlToMul(const Value& value, const intermediate::Operation* op)
+{
+    auto left = op->getFirstArg();
+    auto right = *op->getSecondArg();
+    int shiftValue = 0;
+    if(auto lit = right.checkLiteral())
+    {
+        shiftValue = lit->signedInt();
+    }
+    else if(auto imm = right.checkImmediate())
+    {
+        shiftValue = imm->getIntegerValue().value_or(0);
+    }
+
+    if(shiftValue > 0)
+    {
+        auto right = Value(Literal(1 << shiftValue), TYPE_INT32);
+        return makeValueBinaryOpFromLocal(left, Expression::FAKEOP_MUL, right);
+    }
+    else
+    {
+        return SubExpression(value);
+    }
+}
+
+SubExpression iiToExpr(const Value& value, const LocalUser* inst)
+{
+    // add, sub, shr, shl, asr
+    if(auto op = dynamic_cast<const intermediate::Operation*>(inst))
+    {
+        if(op->op == OP_ADD || op->op == OP_SUB)
+        {
+            auto left = op->getFirstArg();
+            auto right = *op->getSecondArg();
+            return makeValueBinaryOpFromLocal(left, op->op, right);
+        }
+        else if(op->op == OP_OR) // Treat `or` as `add`
+        {
+            auto left = op->getFirstArg();
+            auto right = *op->getSecondArg();
+            return makeValueBinaryOpFromLocal(left, OP_ADD, right);
+        }
+        else if(op->op == OP_SHL)
+        {
+            // convert shl to mul
+            return shlToMul(value, op);
+            // TODO: shr, asr
+        }
+        else
+        {
+            // Other operations (e.g. shr, asr) are not handled yet, return value as-is.
+            return SubExpression(value);
+        }
+    }
+    // mul, div
+    else if(auto op = dynamic_cast<const intermediate::IntrinsicOperation*>(inst))
+    {
+        OpCode binOp = OP_NOP;
+        if(op->opCode == "mul")
+        {
+            binOp = Expression::FAKEOP_MUL;
+        }
+        else if(op->opCode == "div")
+        {
+            binOp = Expression::FAKEOP_DIV;
+        }
+        else
+        {
+            // If op is neither mul nor div, return value as-is.
+ return SubExpression(value); + } + + auto left = op->getFirstArg(); + auto right = *op->getSecondArg(); + return makeValueBinaryOpFromLocal(left, binOp, right); + } + + return SubExpression(value); +} + +Optional getIntegerFromExpression(const SubExpression& expr) +{ + if(auto value = expr.checkValue()) + { + if(auto lit = value->checkLiteral()) + { + return Optional(lit->signedInt()); + } + else if(auto imm = value->checkImmediate()) + { + return imm->getIntegerValue(); + } + } + return Optional(); +} + +// signed, value +class ExpandedExprs : public std::vector> +{ +public: + std::string to_string() const + { + std::stringstream ss; + for(auto& p : *this) + { + ss << (p.first ? "+" : "-") << p.second.to_string(); + } + return ss.str(); + } +}; + +void expandExpression(const SubExpression& subExpr, ExpandedExprs& expanded) +{ + if(auto expr = subExpr.checkExpression()) + { + ExpandedExprs leftEE, rightEE; + auto& left = expr->arg0; + auto& right = expr->arg1; + auto& op = expr->code; + + expandExpression(left, leftEE); + expandExpression(right, rightEE); + + auto getInteger = [](const std::pair& v) { + std::function(const int&)> addSign = [&](const int& num) { + return make_optional(v.first ? num : -num); + }; + return getIntegerFromExpression(v.second) & addSign; + }; + + auto leftNum = (leftEE.size() == 1) ? getInteger(leftEE[0]) : Optional(); + auto rightNum = (rightEE.size() == 1) ? getInteger(rightEE[0]) : Optional(); + + auto append = [](ExpandedExprs& ee1, ExpandedExprs& ee2) { ee1.insert(ee1.end(), ee2.begin(), ee2.end()); }; + + if(leftNum && rightNum) + { + int l = leftNum.value_or(0); + int r = rightNum.value_or(0); + int num = 0; + + if(op == OP_ADD) + { + num = l + r; + } + else if(op == OP_SUB) + { + num = l - r; + } + else if(op == Expression::FAKEOP_MUL) + { + num = l * r; + } + else if(op == Expression::FAKEOP_DIV) + { + num = l / r; + } + else + { + throw CompilationError(CompilationStep::OPTIMIZER, "Unknown operation", op.name); + } + + // TODO: Care other types + auto value = Value(Literal(std::abs(num)), TYPE_INT32); + SubExpression foldedExpr(value); + expanded.push_back(std::make_pair(true, foldedExpr)); + } + else + { + if(op == OP_ADD) + { + append(expanded, leftEE); + append(expanded, rightEE); + } + else if(op == OP_SUB) + { + append(expanded, leftEE); + + for(auto& e : rightEE) + { + e.first = !e.first; + } + append(expanded, rightEE); + } + else if(op == Expression::FAKEOP_MUL) + { + if(leftNum || rightNum) + { + int num = 0; + ExpandedExprs* ee = nullptr; + if(leftNum) + { + num = leftNum.value_or(0); + ee = &rightEE; + } + else + { + num = rightNum.value_or(0); + ee = &leftEE; + } + for(int i = 0; i < num; i++) + { + append(expanded, *ee); + } + } + else + { + expanded.push_back( + std::make_pair(true, SubExpression(std::make_shared(op, left, right)))); + } + } + else if(op == Expression::FAKEOP_DIV) + { + expanded.push_back(std::make_pair(true, SubExpression(std::make_shared(op, left, right)))); + } + else + { + throw CompilationError(CompilationStep::OPTIMIZER, "Unknown operation", op.name); + } + } + } + else if(auto value = subExpr.checkValue()) + { + expanded.push_back(std::make_pair(true, subExpr)); + } + else + { + throw CompilationError(CompilationStep::OPTIMIZER, "Cannot expand expression", subExpr.to_string()); + } +} + +void calcValueExpr(ExpandedExprs& expanded) +{ + // ExpandedExprs expanded; + // expandExpression(expr, expanded); + + // for(auto& p : expanded) + // logging::debug() << (p.first ? 
"+" : "-") << p.second->to_string() << " "; + // logging::debug() << logging::endl; + + for(auto p = expanded.begin(); p != expanded.end();) + { + auto comp = std::find_if(expanded.begin(), expanded.end(), [&p](const std::pair& other) { + return p->first != other.first && p->second == other.second; + }); + if(comp != expanded.end()) + { + expanded.erase(comp); + p = expanded.erase(p); + } + else + { + p++; + } + } + + // SubExpression result(INT_ZERO); + // for(auto& p : expanded) + // { + // result = SubExpression(std::make_shared(p.first ? OP_ADD : OP_SUB, result, p.second)); + // } + // + // return result; +} + +SubExpression replaceLocalToExpr(const SubExpression& subExpr, const Value& local, SubExpression newExpr) +{ + if(auto expr = subExpr.checkExpression()) + { + return SubExpression(std::make_shared(expr->code, + replaceLocalToExpr(expr->arg0, local, newExpr), + replaceLocalToExpr(expr->arg1, local, newExpr))); + + } + else if(auto replacee = subExpr.checkLocal()) + { + if (auto replacer = local.checkLocal()) { + if (*replacee == *replacer) { + return newExpr; + } + } + } + + return subExpr; +} + +void optimizations::combineDMALoads(const Module& module, Method& method, const Configuration& config) +{ + using namespace std; + using namespace VariantNamespace; + + const std::regex vloadReg("vload(2|3|4|8|16)"); + + for(auto& bb : method) + { + // loadInstrs, offsetValues, addrValue + map, vector, Optional>> vloads; + + for(auto& it : bb) + { + // Find all vloadn calls + if(auto call = dynamic_cast(it.get())) + { + auto name = vc4c::spirv::demangleFunctionName(call->methodName); + + std::smatch m; + if(std::regex_search(name, m, vloadReg)) + { + int n = std::stoi(m.str(1)); + + // TODO: Check whether all second argument values are equal. + + auto& vload = vloads[n]; + auto& loadInstrs = get<0>(vload); + auto& offsetValues = get<1>(vload); + auto& addrValue = get<2>(vload); + + if(!addrValue.has_value()) + { + addrValue = call->getArgument(1); + } + else if(addrValue != call->getArgument(1)) + { + continue; + } + + offsetValues.push_back(call->assertArgument(0)); + loadInstrs.push_back(call); + } + } + } + + for(auto& p : vloads) + { + auto vectorLength = p.first; + auto& vload = p.second; + auto& loadInstrs = get<0>(vload); + auto& offsetValues = get<1>(vload); + auto& addrValue = get<2>(vload); + + if(offsetValues.size() <= 1) + continue; + + for(auto& inst : loadInstrs) + { + logging::debug() << inst->to_string() << logging::endl; + } + + std::vector> addrExprs; + + for(auto& addrValue : offsetValues) + { + if(auto loc = addrValue.checkLocal()) + { + if(auto writer = loc->getSingleWriter()) + { + addrExprs.push_back(std::make_pair(addrValue, iiToExpr(addrValue, writer))); + } + else + { + addrExprs.push_back(std::make_pair(addrValue, SubExpression(addrValue))); + } + } + else + { + // TODO: is it ok? 
+ addrExprs.push_back(std::make_pair(addrValue, SubExpression(addrValue))); + } + } + + for(auto& current : addrExprs) + { + for(auto& other : addrExprs) + { + current.second = replaceLocalToExpr(current.second, other.first, other.second); + } + } + + for(auto& pair : addrExprs) + { + logging::debug() << pair.first.to_string() << " = " << pair.second.to_string() << logging::endl; + } + + ExpandedExprs diff; + bool eqDiff = true; + for(size_t i = 1; i < addrExprs.size(); i++) + { + auto x = addrExprs[i - 1].second; + auto y = addrExprs[i].second; + auto diffExpr = SubExpression(std::make_shared(OP_SUB, y, x)); + + ExpandedExprs currentDiff; + expandExpression(diffExpr, currentDiff); + + calcValueExpr(currentDiff); + + // Apply calcValueExpr again for integer literals. + SubExpression currentExpr(INT_ZERO); + for(auto& p : currentDiff) + { + currentExpr = + SubExpression(std::make_shared(p.first ? OP_ADD : OP_SUB, currentExpr, p.second)); + } + currentDiff.clear(); + expandExpression(currentExpr, currentDiff); + calcValueExpr(currentDiff); + + // logging::debug() << currentDiff.to_string() << ", " << diff.to_string() << logging::endl; + + if(i == 1) + { + diff = std::move(currentDiff); + } + else if(currentDiff != diff) + { + eqDiff = false; + break; + } + } + + logging::debug() << addrExprs.size() << " loads are " << (eqDiff ? "" : "not ") + << "equal difference: " << diff.to_string() << logging::endl; + + if(eqDiff) + { + // The form of diff should be "0 (+/-) expressions...", then remove the value 0 at most right. + // ExpandedExprs expanded; + // expandExpression(diff, expanded); + // for (auto& ex : expanded) { + // logging::debug() << "ex = " << ex.second.to_string() << logging::endl; + // } + if(diff.size() == 1) + { + auto diffExpr = diff[0].second; + + // logging::debug() << "diff = " << diff.to_string() << logging::endl; + + auto term = diffExpr.getConstantExpression(); + auto mpValue = term.has_value() ? term->getConstantValue() : Optional{}; + auto mpLiteral = mpValue.has_value() ? 
                        mpValue->getLiteralValue() : Optional<Literal>{};
+
+                    if(mpLiteral)
+                    {
+                        if(mpLiteral->unsignedInt() < (1u << 12))
+                        {
+                            auto it = bb.walk();
+                            bool firstCall = true;
+                            while(!it.isEndOfBlock())
+                            {
+                                auto call = it.get();
+                                if(call && std::find(loadInstrs.begin(), loadInstrs.end(), call) != loadInstrs.end())
+                                {
+                                    it.erase();
+
+                                    auto output = *call->getOutput();
+                                    if(firstCall)
+                                    {
+                                        firstCall = false;
+
+                                        auto addrArg = call->assertArgument(1);
+
+                                        auto elemType = addrArg.type.getElementType();
+                                        auto vectorSize = elemType.getInMemoryWidth() * vectorLength;
+
+                                        // TODO: limit loadInstrs.size()
+                                        Value offset = assign(it, TYPE_INT32) =
+                                            offsetValues[0] * Literal(vectorLength * elemType.getInMemoryWidth());
+                                        Value addr = assign(it, TYPE_INT32) = offset + addrArg;
+
+                                        uint16_t memoryPitch =
+                                            static_cast<uint16_t>(mpLiteral->unsignedInt()) * vectorSize;
+
+                                        DataType VectorType{
+                                            elemType.getInMemoryWidth() * DataType::BYTE, vectorLength, false};
+
+                                        uint64_t rows = loadInstrs.size();
+                                        auto entries = Value(Literal(static_cast<uint32_t>(rows)), TYPE_INT32);
+                                        it = method.vpm->insertReadRAM(method, it, addr, VectorType, nullptr, true,
+                                            INT_ZERO, entries, Optional<uint16_t>(memoryPitch));
+
+                                        VPMArea area(VPMUsage::SCRATCH, 0, static_cast<uint8_t>(rows));
+                                        it = method.vpm->insertReadVPM(method, it, output, &area, true);
+                                    }
+                                    else
+                                    {
+                                        // TODO: gather these instructions in one mutex lock
+                                        it = method.vpm->insertLockMutex(it, true);
+                                        assign(it, output) = VPM_IO_REGISTER;
+                                        it = method.vpm->insertUnlockMutex(it, true);
+                                    }
+                                }
+                                else
+                                {
+                                    it.nextInBlock();
+                                }
+                            }
+
+                            logging::debug() << loadInstrs.size() << " loads are combined" << logging::endl;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 static Optional> combineAdditions(
     Method& method, InstructionWalker referenceIt, FastMap& addedValues)
 {
diff --git a/src/optimization/Combiner.h b/src/optimization/Combiner.h
index bac4ca98..e9b213c0 100644
--- a/src/optimization/Combiner.h
+++ b/src/optimization/Combiner.h
@@ -154,6 +154,36 @@ namespace vc4c
         InstructionWalker combineArithmeticOperations(
             const Module& module, Method& method, InstructionWalker it, const Configuration& config);
 
+        /*
+         * Combines multiple vloadn calls into one DMA/VPM load. This is available only for constant value offsets.
+         *
+         * Example:
+         *   %call = _Z7vload16jPU3AS1Kf(i32 2, (g) f32* %in) ; vload16
+         *   %call2 = _Z7vload16jPU3AS1Kf(i32 3, (g) f32* %in)
+         *   %call3 = _Z7vload16jPU3AS1Kf(i32 4, (g) f32* %in)
+         *
+         * becomes:
+         *   %tmp.405 = add i32 128, (p) f32* %in
+         *   mutex_acq
+         *   register vpr_setup = vdr_setup(rows: 3, columns: 16 words, address: h32(0,0), vpitch: 1)
+         *   register vpr_setup = loadi vdr_setup(memory pitch: 64 bytes)
+         *   register vpr_addr = i32 %tmp.405
+         *   register - = register vpr_wait
+         *   mutex_rel
+         *   mutex_acq
+         *   register vpr_setup = loadi vpm_setup(num: 3, size: 16 words, stride: 1 rows, address: h32(0))
+         *   %tmp.404 = register vpm
+         *   mutex_rel
+         *   mutex_acq
+         *   %tmp.403 = register vpm
+         *   mutex_rel
+         *   mutex_acq
+         *   %tmp.402 = register vpm
+         *   mutex_rel
+         *
+         */
+        void combineDMALoads(const Module& module, Method& method, const Configuration& config);
+
         // TODO documentation, TODO move somewhere else?!
bool cacheWorkGroupDMAAccess(const Module& module, Method& method, const Configuration& config); } // namespace optimizations diff --git a/src/optimization/sources.list b/src/optimization/sources.list index 3a0fe012..79821bc8 100644 --- a/src/optimization/sources.list +++ b/src/optimization/sources.list @@ -8,4 +8,4 @@ target_sources(${VC4C_LIBRARY_NAME} ${CMAKE_CURRENT_LIST_DIR}/Optimizer.cpp ${CMAKE_CURRENT_LIST_DIR}/Reordering.cpp ${CMAKE_CURRENT_LIST_DIR}/InstructionScheduler.cpp -) \ No newline at end of file +) diff --git a/src/periphery/VPM.cpp b/src/periphery/VPM.cpp index 52507e19..88c742c9 100644 --- a/src/periphery/VPM.cpp +++ b/src/periphery/VPM.cpp @@ -675,7 +675,7 @@ InstructionWalker VPM::insertWriteVPM(Method& method, InstructionWalker it, cons } InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const Value& memoryAddress, DataType type, - const VPMArea* area, bool useMutex, const Value& inAreaOffset, const Value& numEntries) + const VPMArea* area, bool useMutex, const Value& inAreaOffset, const Value& numEntries, Optional memoryPitch) { if(area != nullptr) // FIXME this needs to have the numEntries added and the correct type!!! @@ -744,7 +744,8 @@ InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const if(numEntries != INT_ONE) // NOTE: This for read the pitch (start-to-start) and for write the stride (end-to-start) is set, we need to set // this to the data size, but not required for write setup! - strideSetup.strideSetup = VPRStrideSetup(static_cast(type.getInMemoryWidth())); + // strideSetup.strideSetup = VPRStrideSetup(static_cast(type.getInMemoryWidth())); + strideSetup.strideSetup = VPRStrideSetup(static_cast(memoryPitch.value_or(type.getInMemoryWidth()))); it.emplace(new LoadImmediate(VPM_IN_SETUP_REGISTER, Literal(strideSetup.value))); it->addDecorations(InstructionDecorations::VPM_READ_CONFIGURATION); it.nextInBlock(); @@ -1169,15 +1170,17 @@ VPWDMASetup VPMArea::toWriteDMASetup(DataType elementType, uint8_t numRows) cons return setup; } -VPRGenericSetup VPMArea::toReadSetup(DataType elementType, uint8_t numRows) const +VPRGenericSetup VPMArea::toReadSetup(DataType elementType/*, uint8_t numRows*/) const { + uint8_t numRows_ = numRows; + elementType = simplifyComplexTypes(elementType); DataType type = elementType.isUnknown() ? getElementType() : elementType; if(type.getScalarBitCount() > 32) { // 64-bit integer vectors are stored as 2 rows of 32-bit integer vectors in VPM type = DataType{32, type.getVectorWidth(), type.isFloatingType()}; - numRows = 2 * numRows; + numRows_ = 2 * numRows_; } if(type.isUnknown()) throw CompilationError( @@ -1186,7 +1189,10 @@ VPRGenericSetup VPMArea::toReadSetup(DataType elementType, uint8_t numRows) cons // if we can pack into a single row, do so. Otherwise set stride to beginning of next row const uint8_t stride = canBePackedIntoRow() ? 
1 : static_cast(TYPE_INT32.getScalarBitCount() / type.getScalarBitCount()); - VPRGenericSetup setup(getVPMSize(type), stride, numRows, calculateQPUSideAddress(type, rowOffset, 0)); + + if (numRows_ >= 16) numRows_ = 1; + + VPRGenericSetup setup(getVPMSize(type), stride, numRows_, calculateQPUSideAddress(type, rowOffset, 0)); setup.setHorizontal(IS_HORIZONTAL); setup.setLaned(!IS_PACKED); return setup; diff --git a/src/periphery/VPM.h b/src/periphery/VPM.h index be6af3ea..4057c782 100644 --- a/src/periphery/VPM.h +++ b/src/periphery/VPM.h @@ -328,7 +328,8 @@ namespace vc4c * * see Broadcom spec, table 33 */ - class VPRGenericSetup : private Bitfield + // class VPRGenericSetup : private Bitfield + class VPRGenericSetup : public Bitfield { public: VPRGenericSetup(uint8_t size, uint8_t stride, uint8_t numVectors = 1, uint8_t address = 0) : Bitfield(0) @@ -408,7 +409,9 @@ namespace vc4c * * see Broadcom spec, table 36 */ - class VPRDMASetup : private Bitfield + // class VPRDMASetup : private Bitfield + // TODO: Changed to public, is it ok? + class VPRDMASetup : public Bitfield { public: VPRDMASetup( @@ -801,7 +804,7 @@ namespace vc4c * * If the data-type is set to unknown, the default element-type of this area is used */ - VPRGenericSetup toReadSetup(DataType elementType, uint8_t numRows = 1) const; + VPRGenericSetup toReadSetup(DataType elementType/*, uint8_t numRows = 1*/) const; /* * Generates a RAM-to-VPM DMA read setup for loading the contents of a memory address into this VPM area @@ -866,7 +869,7 @@ namespace vc4c */ NODISCARD InstructionWalker insertReadRAM(Method& method, InstructionWalker it, const Value& memoryAddress, DataType type, const VPMArea* area = nullptr, bool useMutex = true, - const Value& inAreaOffset = INT_ZERO, const Value& numEntries = INT_ONE); + const Value& inAreaOffset = INT_ZERO, const Value& numEntries = INT_ONE, Optional memoryPitch = {}); /* * Inserts a write from VPM into RAM via DMA */ @@ -909,12 +912,12 @@ namespace vc4c */ void dumpUsage() const; + InstructionWalker insertLockMutex(InstructionWalker it, bool useMutex) const; + InstructionWalker insertUnlockMutex(InstructionWalker it, bool useMutex) const; + private: const unsigned maximumVPMSize; std::vector> areas; - - InstructionWalker insertLockMutex(InstructionWalker it, bool useMutex) const; - InstructionWalker insertUnlockMutex(InstructionWalker it, bool useMutex) const; }; /* diff --git a/test/TestOptimizationSteps.cpp b/test/TestOptimizationSteps.cpp index 5dcd4682..3dfa87ed 100644 --- a/test/TestOptimizationSteps.cpp +++ b/test/TestOptimizationSteps.cpp @@ -5,6 +5,7 @@ */ #include "TestOptimizationSteps.h" +#include "Bitfield.h" #include "Expression.h" #include "Method.h" #include "Module.h" @@ -14,9 +15,12 @@ #include "optimization/ControlFlow.h" #include "optimization/Eliminator.h" #include "optimization/Flags.h" +#include "periphery/VPM.h" #include +#include "log.h" + using namespace vc4c; using namespace vc4c::optimizations; using namespace vc4c::operators; @@ -35,6 +39,7 @@ TestOptimizationSteps::TestOptimizationSteps() TEST_ADD(TestOptimizationSteps::testEliminateBitOperations); TEST_ADD(TestOptimizationSteps::testCombineRotations); TEST_ADD(TestOptimizationSteps::testLoopInvariantCodeMotion); + TEST_ADD(TestOptimizationSteps::testCombineDMALoads); } static bool checkEquals( @@ -1989,3 +1994,219 @@ void TestOptimizationSteps::testLoopInvariantCodeMotion() it.nextInMethod(); TEST_ASSERT(!!it.get()); } + +void TestOptimizationSteps::testCombineDMALoads() +{ + using namespace vc4c::intermediate; 
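+
+    // Helper: runs combineDMALoads() on the given method and checks that the vloadN calls were merged
+    // into a single DMA transfer - exactly one stride setup, one DMA setup and one generic VPR setup
+    // per basic block, plus one VPM read per original vload call.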
+ + auto testCombineDMALoadsSub = [&](Module& module, Method& inputMethod, Configuration& config, DataType vectorType) { + + uint8_t elementBitCount = vectorType.getElementType().getScalarBitCount(); + uint8_t dmaSetupMode = 0; + uint8_t vpitch = 1; + uint8_t vprSize = 0; + uint8_t vprStride = 0; + switch(elementBitCount) + { + case 8: + dmaSetupMode = 4; + vpitch = 4; + vprSize = 0; + vprStride = 4; + break; + case 16: + dmaSetupMode = 2; + vpitch = 2; + vprSize = 1; + vprStride = 2; + break; + case 32: + dmaSetupMode = 0; + vpitch = 1; + vprSize = 2; + vprStride = 1; + break; + } + + const int numOfLoads = 3; + periphery::VPRDMASetup expectedDMASetup(dmaSetupMode, vectorType.getVectorWidth() % 16, numOfLoads, vpitch, 0); + periphery::VPRGenericSetup expectedVPRSetup(vprSize, vprStride, numOfLoads, 0); + + inputMethod.dumpInstructions(); + + combineDMALoads(module, inputMethod, config); + + inputMethod.dumpInstructions(); + + for(auto& bb : inputMethod) + { + int numOfDMASetup = 0; + int numOfStrideSetup = 0; + int numOfVPRSetup = 0; + int numOfVPMRead = 0; + + for(auto& it : bb) + { + if(auto move = dynamic_cast(it.get())) + { + auto source = move->getSource(); + if(source.getLiteralValue() && + (move->getOutput()->hasRegister(REG_VPM_IN_SETUP) || + has_flag(move->decoration, InstructionDecorations::VPM_READ_CONFIGURATION))) + { + auto dmaSetup = + periphery::VPRSetup::fromLiteral(source.getLiteralValue()->unsignedInt()).dmaSetup; + TEST_ASSERT_EQUALS(expectedDMASetup, dmaSetup); + + numOfDMASetup++; + } + else if(auto reg = source.checkRegister()) + { + // VPM Read + if(reg->file != RegisterFile::ACCUMULATOR && reg->num == 48) + { + numOfVPMRead++; + } + } + } + else if(auto load = dynamic_cast(it.get())) + { + if(load->type == LoadType::REPLICATE_INT32 && + (load->getOutput()->hasRegister(REG_VPM_IN_SETUP) || + has_flag(load->decoration, InstructionDecorations::VPM_READ_CONFIGURATION))) + { + auto vpr = periphery::VPRSetup::fromLiteral(load->getImmediate().unsignedInt()); + if(vpr.isStrideSetup()) + { + TEST_ASSERT_EQUALS(vectorType.getInMemoryWidth(), vpr.strideSetup.getPitch()); + numOfStrideSetup++; + } + if(vpr.isGenericSetup()) + { + TEST_ASSERT_EQUALS(expectedVPRSetup, vpr.genericSetup); + numOfVPRSetup++; + } + } + } + } + + TEST_ASSERT_EQUALS(1, numOfDMASetup); + TEST_ASSERT_EQUALS(1, numOfStrideSetup); + TEST_ASSERT_EQUALS(1, numOfVPRSetup); + TEST_ASSERT_EQUALS(numOfLoads, numOfVPMRead); + } + }; + + auto putMethodCall = [](Method& inputMethod, InstructionWalker& inIt, const DataType& vectorType, + std::string funcName, std::vector&& args) { + auto res = inputMethod.addNewLocal(vectorType); + inIt.emplace((new intermediate::MethodCall(std::move(res), std::move(funcName), std::move(args)))); + }; + + const DataType Float16{DataType::WORD, 16, true}; + const DataType Float8{DataType::WORD, 8, true}; + const DataType Uchar16{DataType::BYTE, 16, false}; + + // vload16(size_t, const float*) + const std::string vload16f = "_Z7vload16jPU3AS1Kf"; + // vload8(size_t, const float*) + const std::string vload8f = "_Z6vload8jPU3AS1Kf"; + // vload16(size_t, const uchar*) + const std::string vload16uc = "_Z7vload16jPU3AS1Kh"; + + Configuration config{}; + + { + // vload16f * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType FloatPtr = inputMethod.createPointerType(TYPE_FLOAT); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, FloatPtr, "%in") = UNIFORM_REGISTER; + + putMethodCall(inputMethod, 
inIt, Float16, vload16f, {0_val, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {1_val, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {2_val, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Float16); + } + + { + // vload8f * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType FloatPtr = inputMethod.createPointerType(TYPE_FLOAT); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, FloatPtr, "%in") = UNIFORM_REGISTER; + + putMethodCall(inputMethod, inIt, Float8, vload8f, {0_val, in}); + putMethodCall(inputMethod, inIt, Float8, vload8f, {1_val, in}); + putMethodCall(inputMethod, inIt, Float8, vload8f, {2_val, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Float8); + } + + { + // vload16uc * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType Int8Ptr = inputMethod.createPointerType(TYPE_INT8); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, Int8Ptr, "%in") = UNIFORM_REGISTER; + + putMethodCall(inputMethod, inIt, Uchar16, vload16uc, {0_val, in}); + putMethodCall(inputMethod, inIt, Uchar16, vload16uc, {1_val, in}); + putMethodCall(inputMethod, inIt, Uchar16, vload16uc, {2_val, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Uchar16); + } + + { + // vload16f * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType FloatPtr = inputMethod.createPointerType(TYPE_FLOAT); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, FloatPtr, "%in") = UNIFORM_REGISTER; + + auto offset1 = assign(inIt, TYPE_INT32, "%offset1") = 42_val; + auto offset2 = assign(inIt, TYPE_INT32, "%offset2") = offset1 + 1_val; + auto offset3 = assign(inIt, TYPE_INT32, "%offset3") = offset1 + 2_val; + + putMethodCall(inputMethod, inIt, Float16, vload16f, {offset3, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {offset2, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {offset1, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Float16); + } + + // { + // // expand + // + // Literal l(2); + // Value a(l, TYPE_INT32); + // Value b = 3_val; + // SubExpression expr( + // new ValueBinaryOp(makeValueBinaryOpFromLocal(a, ValueBinaryOp::BinaryOp::Add, b), + // ValueBinaryOp::BinaryOp::Sub, std::make_shared(1_val))); + // ValueExpr::ExpandedExprs expanded; + // expr->expand(expanded); + // + // TEST_ASSERT_EQUALS(1, expanded.size()); + // + // auto n = expanded[0].second->getInteger(); + // TEST_ASSERT_EQUALS(4, n.value_or(0)); + // } +} diff --git a/test/TestOptimizationSteps.h b/test/TestOptimizationSteps.h index 531c8f2d..2118dc9b 100644 --- a/test/TestOptimizationSteps.h +++ b/test/TestOptimizationSteps.h @@ -32,6 +32,7 @@ class TestOptimizationSteps : public Test::Suite void testEliminateMoves(); void testEliminateDeadCode(); void testLoopInvariantCodeMotion(); + void testCombineDMALoads(); private: void testMethodsEquals(vc4c::Method& m1, vc4c::Method& m2);