diff --git a/src/Expression.cpp b/src/Expression.cpp
index 9bdba743..4d46c947 100644
--- a/src/Expression.cpp
+++ b/src/Expression.cpp
@@ -6,6 +6,8 @@
 using namespace vc4c;
 
 constexpr OpCode Expression::FAKEOP_UMUL;
+constexpr OpCode Expression::FAKEOP_MUL;
+constexpr OpCode Expression::FAKEOP_DIV;
 
 SubExpression::SubExpression(const Optional<Value>& val) : Base(VariantNamespace::monostate{})
 {
diff --git a/src/Expression.h b/src/Expression.h
index 4459a7df..1f33bebe 100644
--- a/src/Expression.h
+++ b/src/Expression.h
@@ -109,6 +109,9 @@ namespace vc4c
 
         // A fake operation to indicate an unsigned multiplication
        static constexpr OpCode FAKEOP_UMUL{"umul", 132, 132, 2, false, false, FlagBehavior::NONE};
+        static constexpr OpCode FAKEOP_MUL{"mul", 132, 132, 2, false, false, FlagBehavior::NONE};
+        static constexpr OpCode FAKEOP_DIV{"div", 132, 132, 2, false, false, FlagBehavior::NONE};
+
         OpCode code;
         SubExpression arg0;
         SubExpression arg1{};
diff --git a/src/normalization/Normalizer.cpp b/src/normalization/Normalizer.cpp
index 6401eea2..d5a79f19 100644
--- a/src/normalization/Normalizer.cpp
+++ b/src/normalization/Normalizer.cpp
@@ -16,6 +16,7 @@
 #include "../optimization/ControlFlow.h"
 #include "../optimization/Eliminator.h"
 #include "../optimization/Reordering.h"
+#include "../intermediate/operators.h"
 #include "../spirv/SPIRVBuiltins.h"
 #include "Inliner.h"
 #include "LiteralValues.h"
@@ -23,6 +24,8 @@
 #include "MemoryAccess.h"
 #include "Rewrite.h"
+
+#include "../optimization/Combiner.h"
 
 #include "log.h"
 
 #include
@@ -30,6 +33,8 @@
 using namespace vc4c;
 using namespace vc4c::normalization;
+using namespace vc4c::periphery;
+using namespace vc4c::operators;
 
 static bool checkWorkGroupUniform(const Value& arg)
 {
@@ -253,6 +258,16 @@ void Normalizer::normalize(Module& module) const
         PROFILE_COUNTER_WITH_PREV(vc4c::profiler::COUNTER_NORMALIZATION + 2, "Eliminate Phi-nodes (after)",
             method->countInstructions(), vc4c::profiler::COUNTER_NORMALIZATION + 1);
     }
+
+    {
+        // TODO: move this optimization to appropriate location
+        auto kernels = module.getKernels();
+        for(Method* kernelFunc : kernels)
+        {
+            optimizations::combineDMALoads(module, *kernelFunc, config);
+        }
+    }
+
     auto kernels = module.getKernels();
     // 2. inline kernel-functions
     for(Method* kernelFunc : kernels)
@@ -266,6 +281,7 @@ void Normalizer::normalize(Module& module) const
         PROFILE_COUNTER_WITH_PREV(vc4c::profiler::COUNTER_NORMALIZATION + 5, "Inline (after)",
             kernel.countInstructions(), vc4c::profiler::COUNTER_NORMALIZATION + 4);
     }
+
     // 3. run other normalization steps on kernel functions
     const auto f = [&module, this](Method* kernelFunc) -> void { normalizeMethod(module, *kernelFunc); };
     ThreadPool::scheduleAll("Normalization", kernels, f, THREAD_LOGGER.get());
diff --git a/src/optimization/Combiner.cpp b/src/optimization/Combiner.cpp
index d1f3e28b..da209506 100644
--- a/src/optimization/Combiner.cpp
+++ b/src/optimization/Combiner.cpp
@@ -6,17 +6,20 @@
 
 #include "Combiner.h"
 
+#include "../Expression.h"
 #include "../InstructionWalker.h"
 #include "../analysis/MemoryAnalysis.h"
 #include "../intermediate/Helper.h"
 #include "../intermediate/operators.h"
 #include "../periphery/VPM.h"
+#include "../spirv/SPIRVHelper.h"
 #include "Eliminator.h"
 
 #include "log.h"
 
 #include
 #include
 #include
+#include <regex>
 
 // TODO combine y = (x >> n) << n with and
 // same for y = (x << n) >> n (at least of n constant)
@@ -27,6 +30,7 @@
 using namespace vc4c;
 using namespace vc4c::optimizations;
 using namespace vc4c::intermediate;
 using namespace vc4c::operators;
+using namespace vc4c::periphery;
 
 // Taken from https://stackoverflow.com/questions/2835469/how-to-perform-rotate-shift-in-c?noredirect=1&lq=1
 constexpr static uint32_t rotate_left_halfword(uint32_t value, uint8_t shift) noexcept
 {
@@ -1121,6 +1125,518 @@ InstructionWalker optimizations::combineArithmeticOperations(
     return it;
 }
 
+SubExpression makeValueBinaryOpFromLocal(Value& left, const OpCode& binOp, Value& right)
+{
+    return SubExpression(std::make_shared<Expression>(binOp, SubExpression(left), SubExpression(right)));
+}
+
+// try to convert shl to mul and return it as a SubExpression
+SubExpression shlToMul(const Value& value, const intermediate::Operation* op)
+{
+    auto left = op->getFirstArg();
+    auto right = *op->getSecondArg();
+    int shiftValue = 0;
+    if(auto lit = right.checkLiteral())
+    {
+        shiftValue = lit->signedInt();
+    }
+    else if(auto imm = right.checkImmediate())
+    {
+        shiftValue = imm->getIntegerValue().value_or(0);
+    }
+
+    if(shiftValue > 0)
+    {
+        auto right = Value(Literal(1 << shiftValue), TYPE_INT32);
+        return makeValueBinaryOpFromLocal(left, Expression::FAKEOP_MUL, right);
+    }
+    else
+    {
+        return SubExpression(value);
+    }
+}
+
+SubExpression iiToExpr(const Value& value, const LocalUser* inst)
+{
+    // add, sub, shr, shl, asr
+    if(auto op = dynamic_cast<const intermediate::Operation*>(inst))
+    {
+        if(op->op == OP_ADD || op->op == OP_SUB)
+        {
+            auto left = op->getFirstArg();
+            auto right = *op->getSecondArg();
+            return makeValueBinaryOpFromLocal(left, op->op, right);
+        }
+        else if(op->op == OP_OR) // Treat `or` as `add`
+        {
+            auto left = op->getFirstArg();
+            auto right = *op->getSecondArg();
+            return makeValueBinaryOpFromLocal(left, OP_ADD, right);
+        }
+        else if(op->op == OP_SHL)
+        {
+            // convert shl to mul
+            return shlToMul(value, op);
+            // TODO: shr, asr
+        }
+        else
+        {
+            // Other operations (e.g. shr, asr) are not handled yet, return value as-is.
+            return SubExpression(value);
+        }
+    }
+    // mul, div
+    else if(auto op = dynamic_cast<const intermediate::IntrinsicOperation*>(inst))
+    {
+        OpCode binOp = OP_NOP;
+        if(op->opCode == "mul")
+        {
+            binOp = Expression::FAKEOP_MUL;
+        }
+        else if(op->opCode == "div")
+        {
+            binOp = Expression::FAKEOP_DIV;
+        }
+        else
+        {
+            // If op is neither mul nor div, return value as-is.
+ return SubExpression(value); + } + + auto left = op->getFirstArg(); + auto right = *op->getSecondArg(); + return makeValueBinaryOpFromLocal(left, binOp, right); + } + + return SubExpression(value); +} + +Optional getIntegerFromExpression(const SubExpression& expr) +{ + if(auto value = expr.checkValue()) + { + if(auto lit = value->checkLiteral()) + { + return Optional(lit->signedInt()); + } + else if(auto imm = value->checkImmediate()) + { + return imm->getIntegerValue(); + } + } + return Optional(); +} + +// signed, value +class ExpandedExprs : public std::vector> +{ +public: + std::string to_string() const + { + std::stringstream ss; + for(auto& p : *this) + { + ss << (p.first ? "+" : "-") << p.second.to_string(); + } + return ss.str(); + } +}; + +void expandExpression(const SubExpression& subExpr, ExpandedExprs& expanded) +{ + if(auto expr = subExpr.checkExpression()) + { + ExpandedExprs leftEE, rightEE; + auto& left = expr->arg0; + auto& right = expr->arg1; + auto& op = expr->code; + + expandExpression(left, leftEE); + expandExpression(right, rightEE); + + auto getInteger = [](const std::pair& v) { + std::function(const int&)> addSign = [&](const int& num) { + return make_optional(v.first ? num : -num); + }; + return getIntegerFromExpression(v.second) & addSign; + }; + + auto leftNum = (leftEE.size() == 1) ? getInteger(leftEE[0]) : Optional(); + auto rightNum = (rightEE.size() == 1) ? getInteger(rightEE[0]) : Optional(); + + auto append = [](ExpandedExprs& ee1, ExpandedExprs& ee2) { ee1.insert(ee1.end(), ee2.begin(), ee2.end()); }; + + if(leftNum && rightNum) + { + int l = leftNum.value_or(0); + int r = rightNum.value_or(0); + int num = 0; + + if(op == OP_ADD) + { + num = l + r; + } + else if(op == OP_SUB) + { + num = l - r; + } + else if(op == Expression::FAKEOP_MUL) + { + num = l * r; + } + else if(op == Expression::FAKEOP_DIV) + { + num = l / r; + } + else + { + throw CompilationError(CompilationStep::OPTIMIZER, "Unknown operation", op.name); + } + + // TODO: Care other types + auto value = Value(Literal(std::abs(num)), TYPE_INT32); + SubExpression foldedExpr(value); + expanded.push_back(std::make_pair(true, foldedExpr)); + } + else + { + if(op == OP_ADD) + { + append(expanded, leftEE); + append(expanded, rightEE); + } + else if(op == OP_SUB) + { + append(expanded, leftEE); + + for(auto& e : rightEE) + { + e.first = !e.first; + } + append(expanded, rightEE); + } + else if(op == Expression::FAKEOP_MUL) + { + if(leftNum || rightNum) + { + int num = 0; + ExpandedExprs* ee = nullptr; + if(leftNum) + { + num = leftNum.value_or(0); + ee = &rightEE; + } + else + { + num = rightNum.value_or(0); + ee = &leftEE; + } + for(int i = 0; i < num; i++) + { + append(expanded, *ee); + } + } + else + { + expanded.push_back( + std::make_pair(true, SubExpression(std::make_shared(op, left, right)))); + } + } + else if(op == Expression::FAKEOP_DIV) + { + expanded.push_back(std::make_pair(true, SubExpression(std::make_shared(op, left, right)))); + } + else + { + throw CompilationError(CompilationStep::OPTIMIZER, "Unknown operation", op.name); + } + } + } + else if(auto value = subExpr.checkValue()) + { + expanded.push_back(std::make_pair(true, subExpr)); + } + else + { + throw CompilationError(CompilationStep::OPTIMIZER, "Cannot expand expression", subExpr.to_string()); + } +} + +void calcValueExpr(ExpandedExprs& expanded) +{ + // ExpandedExprs expanded; + // expandExpression(expr, expanded); + + // for(auto& p : expanded) + // logging::debug() << (p.first ? 
"+" : "-") << p.second->to_string() << " "; + // logging::debug() << logging::endl; + + for(auto p = expanded.begin(); p != expanded.end();) + { + auto comp = std::find_if(expanded.begin(), expanded.end(), [&p](const std::pair& other) { + return p->first != other.first && p->second == other.second; + }); + if(comp != expanded.end()) + { + expanded.erase(comp); + p = expanded.erase(p); + } + else + { + p++; + } + } + + // SubExpression result(INT_ZERO); + // for(auto& p : expanded) + // { + // result = SubExpression(std::make_shared(p.first ? OP_ADD : OP_SUB, result, p.second)); + // } + // + // return result; +} + +SubExpression replaceLocalToExpr(const SubExpression& subExpr, const Value& local, SubExpression newExpr) +{ + if(auto expr = subExpr.checkExpression()) + { + return SubExpression(std::make_shared(expr->code, + replaceLocalToExpr(expr->arg0, local, newExpr), + replaceLocalToExpr(expr->arg1, local, newExpr))); + + } + else if(auto replacee = subExpr.checkLocal()) + { + if (auto replacer = local.checkLocal()) { + if (*replacee == *replacer) { + return newExpr; + } + } + } + + return subExpr; +} + +void optimizations::combineDMALoads(const Module& module, Method& method, const Configuration& config) +{ + using namespace std; + using namespace VariantNamespace; + + const std::regex vloadReg("vload(2|3|4|8|16)"); + + for(auto& bb : method) + { + // loadInstrs, offsetValues, addrValue + map, vector, Optional>> vloads; + + for(auto& it : bb) + { + // Find all vloadn calls + if(auto call = dynamic_cast(it.get())) + { + auto name = vc4c::spirv::demangleFunctionName(call->methodName); + + std::smatch m; + if(std::regex_search(name, m, vloadReg)) + { + int n = std::stoi(m.str(1)); + + // TODO: Check whether all second argument values are equal. + + auto& vload = vloads[n]; + auto& loadInstrs = get<0>(vload); + auto& offsetValues = get<1>(vload); + auto& addrValue = get<2>(vload); + + if(!addrValue.has_value()) + { + addrValue = call->getArgument(1); + } + else if(addrValue != call->getArgument(1)) + { + continue; + } + + offsetValues.push_back(call->assertArgument(0)); + loadInstrs.push_back(call); + } + } + } + + for(auto& p : vloads) + { + auto vectorLength = p.first; + auto& vload = p.second; + auto& loadInstrs = get<0>(vload); + auto& offsetValues = get<1>(vload); + auto& addrValue = get<2>(vload); + + if(offsetValues.size() <= 1) + continue; + + for(auto& inst : loadInstrs) + { + logging::debug() << inst->to_string() << logging::endl; + } + + std::vector> addrExprs; + + for(auto& addrValue : offsetValues) + { + if(auto loc = addrValue.checkLocal()) + { + if(auto writer = loc->getSingleWriter()) + { + addrExprs.push_back(std::make_pair(addrValue, iiToExpr(addrValue, writer))); + } + else + { + addrExprs.push_back(std::make_pair(addrValue, SubExpression(addrValue))); + } + } + else + { + // TODO: is it ok? 
+ addrExprs.push_back(std::make_pair(addrValue, SubExpression(addrValue))); + } + } + + for(auto& current : addrExprs) + { + for(auto& other : addrExprs) + { + current.second = replaceLocalToExpr(current.second, other.first, other.second); + } + } + + for(auto& pair : addrExprs) + { + logging::debug() << pair.first.to_string() << " = " << pair.second.to_string() << logging::endl; + } + + ExpandedExprs diff; + bool eqDiff = true; + for(size_t i = 1; i < addrExprs.size(); i++) + { + auto x = addrExprs[i - 1].second; + auto y = addrExprs[i].second; + auto diffExpr = SubExpression(std::make_shared(OP_SUB, y, x)); + + ExpandedExprs currentDiff; + expandExpression(diffExpr, currentDiff); + + calcValueExpr(currentDiff); + + // Apply calcValueExpr again for integer literals. + SubExpression currentExpr(INT_ZERO); + for(auto& p : currentDiff) + { + currentExpr = + SubExpression(std::make_shared(p.first ? OP_ADD : OP_SUB, currentExpr, p.second)); + } + currentDiff.clear(); + expandExpression(currentExpr, currentDiff); + calcValueExpr(currentDiff); + + // logging::debug() << currentDiff.to_string() << ", " << diff.to_string() << logging::endl; + + if(i == 1) + { + diff = std::move(currentDiff); + } + else if(currentDiff != diff) + { + eqDiff = false; + break; + } + } + + logging::debug() << addrExprs.size() << " loads are " << (eqDiff ? "" : "not ") + << "equal difference: " << diff.to_string() << logging::endl; + + if(eqDiff) + { + // The form of diff should be "0 (+/-) expressions...", then remove the value 0 at most right. + // ExpandedExprs expanded; + // expandExpression(diff, expanded); + // for (auto& ex : expanded) { + // logging::debug() << "ex = " << ex.second.to_string() << logging::endl; + // } + if(diff.size() == 1) + { + auto diffExpr = diff[0].second; + + // logging::debug() << "diff = " << diff.to_string() << logging::endl; + + auto term = diffExpr.getConstantExpression(); + auto mpValue = term.has_value() ? term->getConstantValue() : Optional{}; + auto mpLiteral = mpValue.has_value() ? 
                        mpValue->getLiteralValue() : Optional<Literal>{};
+
+                    if(mpLiteral)
+                    {
+                        if(mpLiteral->unsignedInt() < (1u << 12))
+                        {
+                            auto it = bb.walk();
+                            bool firstCall = true;
+                            while(!it.isEndOfBlock())
+                            {
+                                auto call = it.get();
+                                if(call && std::find(loadInstrs.begin(), loadInstrs.end(), call) != loadInstrs.end())
+                                {
+                                    it.erase();
+
+                                    auto output = *call->getOutput();
+                                    if(firstCall)
+                                    {
+                                        firstCall = false;
+
+                                        auto addrArg = call->assertArgument(1);
+
+                                        auto elemType = addrArg.type.getElementType();
+                                        auto vectorSize = elemType.getInMemoryWidth() * vectorLength;
+
+                                        // TODO: limit loadInstrs.size()
+                                        Value offset = assign(it, TYPE_INT32) =
+                                            offsetValues[0] * Literal(vectorLength * elemType.getInMemoryWidth());
+                                        Value addr = assign(it, TYPE_INT32) = offset + addrArg;
+
+                                        uint16_t memoryPitch =
+                                            static_cast<uint16_t>(mpLiteral->unsignedInt()) * vectorSize;
+
+                                        DataType VectorType{
+                                            elemType.getInMemoryWidth() * DataType::BYTE, vectorLength, false};
+
+                                        uint64_t rows = loadInstrs.size();
+                                        auto entries = Value(Literal(static_cast<uint32_t>(rows)), TYPE_INT32);
+                                        it = method.vpm->insertReadRAM(method, it, addr, VectorType, nullptr, true,
+                                            INT_ZERO, entries, Optional<uint16_t>(memoryPitch));
+
+                                        VPMArea area(VPMUsage::SCRATCH, 0, static_cast<uint8_t>(rows));
+                                        it = method.vpm->insertReadVPM(method, it, output, &area, true);
+                                    }
+                                    else
+                                    {
+                                        // TODO: gather these instructions in one mutex lock
+                                        it = method.vpm->insertLockMutex(it, true);
+                                        assign(it, output) = VPM_IO_REGISTER;
+                                        it = method.vpm->insertUnlockMutex(it, true);
+                                    }
+                                }
+                                else
+                                {
+                                    it.nextInBlock();
+                                }
+                            }
+
+                            logging::debug() << loadInstrs.size() << " loads are combined" << logging::endl;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 static Optional> combineAdditions(
     Method& method, InstructionWalker referenceIt, FastMap& addedValues)
 {
diff --git a/src/optimization/Combiner.h b/src/optimization/Combiner.h
index bac4ca98..e9b213c0 100644
--- a/src/optimization/Combiner.h
+++ b/src/optimization/Combiner.h
@@ -154,6 +154,36 @@ namespace vc4c
         InstructionWalker combineArithmeticOperations(
             const Module& module, Method& method, InstructionWalker it, const Configuration& config);
 
+        /*
+         * Combines multiple vloadn calls into one DMA/VPM load. This is available only for constant value offsets.
+         *
+         * Example:
+         *   %call = _Z7vload16jPU3AS1Kf(i32 2, (g) f32* %in) ; vload16
+         *   %call2 = _Z7vload16jPU3AS1Kf(i32 3, (g) f32* %in)
+         *   %call3 = _Z7vload16jPU3AS1Kf(i32 4, (g) f32* %in)
+         *
+         * becomes:
+         *   %tmp.405 = add i32 128, (p) f32* %in
+         *   mutex_acq
+         *   register vpr_setup = vdr_setup(rows: 3, columns: 16 words, address: h32(0,0), vpitch: 1)
+         *   register vpr_setup = loadi vdr_setup(memory pitch: 64 bytes)
+         *   register vpr_addr = i32 %tmp.405
+         *   register - = register vpr_wait
+         *   mutex_rel
+         *   mutex_acq
+         *   register vpr_setup = loadi vpm_setup(num: 3, size: 16 words, stride: 1 rows, address: h32(0))
+         *   %tmp.404 = register vpm
+         *   mutex_rel
+         *   mutex_acq
+         *   %tmp.403 = register vpm
+         *   mutex_rel
+         *   mutex_acq
+         *   %tmp.402 = register vpm
+         *   mutex_rel
+         *
+         */
+        void combineDMALoads(const Module& module, Method& method, const Configuration& config);
+
         // TODO documentation, TODO move somewhere else?!
bool cacheWorkGroupDMAAccess(const Module& module, Method& method, const Configuration& config); } // namespace optimizations diff --git a/src/optimization/sources.list b/src/optimization/sources.list index 3a0fe012..79821bc8 100644 --- a/src/optimization/sources.list +++ b/src/optimization/sources.list @@ -8,4 +8,4 @@ target_sources(${VC4C_LIBRARY_NAME} ${CMAKE_CURRENT_LIST_DIR}/Optimizer.cpp ${CMAKE_CURRENT_LIST_DIR}/Reordering.cpp ${CMAKE_CURRENT_LIST_DIR}/InstructionScheduler.cpp -) \ No newline at end of file +) diff --git a/src/periphery/VPM.cpp b/src/periphery/VPM.cpp index 52507e19..88c742c9 100644 --- a/src/periphery/VPM.cpp +++ b/src/periphery/VPM.cpp @@ -675,7 +675,7 @@ InstructionWalker VPM::insertWriteVPM(Method& method, InstructionWalker it, cons } InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const Value& memoryAddress, DataType type, - const VPMArea* area, bool useMutex, const Value& inAreaOffset, const Value& numEntries) + const VPMArea* area, bool useMutex, const Value& inAreaOffset, const Value& numEntries, Optional memoryPitch) { if(area != nullptr) // FIXME this needs to have the numEntries added and the correct type!!! @@ -744,7 +744,8 @@ InstructionWalker VPM::insertReadRAM(Method& method, InstructionWalker it, const if(numEntries != INT_ONE) // NOTE: This for read the pitch (start-to-start) and for write the stride (end-to-start) is set, we need to set // this to the data size, but not required for write setup! - strideSetup.strideSetup = VPRStrideSetup(static_cast(type.getInMemoryWidth())); + // strideSetup.strideSetup = VPRStrideSetup(static_cast(type.getInMemoryWidth())); + strideSetup.strideSetup = VPRStrideSetup(static_cast(memoryPitch.value_or(type.getInMemoryWidth()))); it.emplace(new LoadImmediate(VPM_IN_SETUP_REGISTER, Literal(strideSetup.value))); it->addDecorations(InstructionDecorations::VPM_READ_CONFIGURATION); it.nextInBlock(); @@ -1169,15 +1170,17 @@ VPWDMASetup VPMArea::toWriteDMASetup(DataType elementType, uint8_t numRows) cons return setup; } -VPRGenericSetup VPMArea::toReadSetup(DataType elementType, uint8_t numRows) const +VPRGenericSetup VPMArea::toReadSetup(DataType elementType/*, uint8_t numRows*/) const { + uint8_t numRows_ = numRows; + elementType = simplifyComplexTypes(elementType); DataType type = elementType.isUnknown() ? getElementType() : elementType; if(type.getScalarBitCount() > 32) { // 64-bit integer vectors are stored as 2 rows of 32-bit integer vectors in VPM type = DataType{32, type.getVectorWidth(), type.isFloatingType()}; - numRows = 2 * numRows; + numRows_ = 2 * numRows_; } if(type.isUnknown()) throw CompilationError( @@ -1186,7 +1189,10 @@ VPRGenericSetup VPMArea::toReadSetup(DataType elementType, uint8_t numRows) cons // if we can pack into a single row, do so. Otherwise set stride to beginning of next row const uint8_t stride = canBePackedIntoRow() ? 
1 : static_cast(TYPE_INT32.getScalarBitCount() / type.getScalarBitCount()); - VPRGenericSetup setup(getVPMSize(type), stride, numRows, calculateQPUSideAddress(type, rowOffset, 0)); + + if (numRows_ >= 16) numRows_ = 1; + + VPRGenericSetup setup(getVPMSize(type), stride, numRows_, calculateQPUSideAddress(type, rowOffset, 0)); setup.setHorizontal(IS_HORIZONTAL); setup.setLaned(!IS_PACKED); return setup; diff --git a/src/periphery/VPM.h b/src/periphery/VPM.h index be6af3ea..4057c782 100644 --- a/src/periphery/VPM.h +++ b/src/periphery/VPM.h @@ -328,7 +328,8 @@ namespace vc4c * * see Broadcom spec, table 33 */ - class VPRGenericSetup : private Bitfield + // class VPRGenericSetup : private Bitfield + class VPRGenericSetup : public Bitfield { public: VPRGenericSetup(uint8_t size, uint8_t stride, uint8_t numVectors = 1, uint8_t address = 0) : Bitfield(0) @@ -408,7 +409,9 @@ namespace vc4c * * see Broadcom spec, table 36 */ - class VPRDMASetup : private Bitfield + // class VPRDMASetup : private Bitfield + // TODO: Changed to public, is it ok? + class VPRDMASetup : public Bitfield { public: VPRDMASetup( @@ -801,7 +804,7 @@ namespace vc4c * * If the data-type is set to unknown, the default element-type of this area is used */ - VPRGenericSetup toReadSetup(DataType elementType, uint8_t numRows = 1) const; + VPRGenericSetup toReadSetup(DataType elementType/*, uint8_t numRows = 1*/) const; /* * Generates a RAM-to-VPM DMA read setup for loading the contents of a memory address into this VPM area @@ -866,7 +869,7 @@ namespace vc4c */ NODISCARD InstructionWalker insertReadRAM(Method& method, InstructionWalker it, const Value& memoryAddress, DataType type, const VPMArea* area = nullptr, bool useMutex = true, - const Value& inAreaOffset = INT_ZERO, const Value& numEntries = INT_ONE); + const Value& inAreaOffset = INT_ZERO, const Value& numEntries = INT_ONE, Optional memoryPitch = {}); /* * Inserts a write from VPM into RAM via DMA */ @@ -909,12 +912,12 @@ namespace vc4c */ void dumpUsage() const; + InstructionWalker insertLockMutex(InstructionWalker it, bool useMutex) const; + InstructionWalker insertUnlockMutex(InstructionWalker it, bool useMutex) const; + private: const unsigned maximumVPMSize; std::vector> areas; - - InstructionWalker insertLockMutex(InstructionWalker it, bool useMutex) const; - InstructionWalker insertUnlockMutex(InstructionWalker it, bool useMutex) const; }; /* diff --git a/test/TestOptimizationSteps.cpp b/test/TestOptimizationSteps.cpp index 5dcd4682..3dfa87ed 100644 --- a/test/TestOptimizationSteps.cpp +++ b/test/TestOptimizationSteps.cpp @@ -5,6 +5,7 @@ */ #include "TestOptimizationSteps.h" +#include "Bitfield.h" #include "Expression.h" #include "Method.h" #include "Module.h" @@ -14,9 +15,12 @@ #include "optimization/ControlFlow.h" #include "optimization/Eliminator.h" #include "optimization/Flags.h" +#include "periphery/VPM.h" #include +#include "log.h" + using namespace vc4c; using namespace vc4c::optimizations; using namespace vc4c::operators; @@ -35,6 +39,7 @@ TestOptimizationSteps::TestOptimizationSteps() TEST_ADD(TestOptimizationSteps::testEliminateBitOperations); TEST_ADD(TestOptimizationSteps::testCombineRotations); TEST_ADD(TestOptimizationSteps::testLoopInvariantCodeMotion); + TEST_ADD(TestOptimizationSteps::testCombineDMALoads); } static bool checkEquals( @@ -1989,3 +1994,219 @@ void TestOptimizationSteps::testLoopInvariantCodeMotion() it.nextInMethod(); TEST_ASSERT(!!it.get()); } + +void TestOptimizationSteps::testCombineDMALoads() +{ + using namespace vc4c::intermediate; 
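+
+    // Helper: runs combineDMALoads() on the given method and checks that the vloadN calls were merged
+    // into a single DMA transfer - exactly one stride setup, one DMA setup and one generic VPR setup
+    // per basic block, plus one VPM read per original vload call.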
+ + auto testCombineDMALoadsSub = [&](Module& module, Method& inputMethod, Configuration& config, DataType vectorType) { + + uint8_t elementBitCount = vectorType.getElementType().getScalarBitCount(); + uint8_t dmaSetupMode = 0; + uint8_t vpitch = 1; + uint8_t vprSize = 0; + uint8_t vprStride = 0; + switch(elementBitCount) + { + case 8: + dmaSetupMode = 4; + vpitch = 4; + vprSize = 0; + vprStride = 4; + break; + case 16: + dmaSetupMode = 2; + vpitch = 2; + vprSize = 1; + vprStride = 2; + break; + case 32: + dmaSetupMode = 0; + vpitch = 1; + vprSize = 2; + vprStride = 1; + break; + } + + const int numOfLoads = 3; + periphery::VPRDMASetup expectedDMASetup(dmaSetupMode, vectorType.getVectorWidth() % 16, numOfLoads, vpitch, 0); + periphery::VPRGenericSetup expectedVPRSetup(vprSize, vprStride, numOfLoads, 0); + + inputMethod.dumpInstructions(); + + combineDMALoads(module, inputMethod, config); + + inputMethod.dumpInstructions(); + + for(auto& bb : inputMethod) + { + int numOfDMASetup = 0; + int numOfStrideSetup = 0; + int numOfVPRSetup = 0; + int numOfVPMRead = 0; + + for(auto& it : bb) + { + if(auto move = dynamic_cast(it.get())) + { + auto source = move->getSource(); + if(source.getLiteralValue() && + (move->getOutput()->hasRegister(REG_VPM_IN_SETUP) || + has_flag(move->decoration, InstructionDecorations::VPM_READ_CONFIGURATION))) + { + auto dmaSetup = + periphery::VPRSetup::fromLiteral(source.getLiteralValue()->unsignedInt()).dmaSetup; + TEST_ASSERT_EQUALS(expectedDMASetup, dmaSetup); + + numOfDMASetup++; + } + else if(auto reg = source.checkRegister()) + { + // VPM Read + if(reg->file != RegisterFile::ACCUMULATOR && reg->num == 48) + { + numOfVPMRead++; + } + } + } + else if(auto load = dynamic_cast(it.get())) + { + if(load->type == LoadType::REPLICATE_INT32 && + (load->getOutput()->hasRegister(REG_VPM_IN_SETUP) || + has_flag(load->decoration, InstructionDecorations::VPM_READ_CONFIGURATION))) + { + auto vpr = periphery::VPRSetup::fromLiteral(load->getImmediate().unsignedInt()); + if(vpr.isStrideSetup()) + { + TEST_ASSERT_EQUALS(vectorType.getInMemoryWidth(), vpr.strideSetup.getPitch()); + numOfStrideSetup++; + } + if(vpr.isGenericSetup()) + { + TEST_ASSERT_EQUALS(expectedVPRSetup, vpr.genericSetup); + numOfVPRSetup++; + } + } + } + } + + TEST_ASSERT_EQUALS(1, numOfDMASetup); + TEST_ASSERT_EQUALS(1, numOfStrideSetup); + TEST_ASSERT_EQUALS(1, numOfVPRSetup); + TEST_ASSERT_EQUALS(numOfLoads, numOfVPMRead); + } + }; + + auto putMethodCall = [](Method& inputMethod, InstructionWalker& inIt, const DataType& vectorType, + std::string funcName, std::vector&& args) { + auto res = inputMethod.addNewLocal(vectorType); + inIt.emplace((new intermediate::MethodCall(std::move(res), std::move(funcName), std::move(args)))); + }; + + const DataType Float16{DataType::WORD, 16, true}; + const DataType Float8{DataType::WORD, 8, true}; + const DataType Uchar16{DataType::BYTE, 16, false}; + + // vload16(size_t, const float*) + const std::string vload16f = "_Z7vload16jPU3AS1Kf"; + // vload8(size_t, const float*) + const std::string vload8f = "_Z6vload8jPU3AS1Kf"; + // vload16(size_t, const uchar*) + const std::string vload16uc = "_Z7vload16jPU3AS1Kh"; + + Configuration config{}; + + { + // vload16f * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType FloatPtr = inputMethod.createPointerType(TYPE_FLOAT); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, FloatPtr, "%in") = UNIFORM_REGISTER; + + putMethodCall(inputMethod, 
inIt, Float16, vload16f, {0_val, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {1_val, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {2_val, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Float16); + } + + { + // vload8f * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType FloatPtr = inputMethod.createPointerType(TYPE_FLOAT); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, FloatPtr, "%in") = UNIFORM_REGISTER; + + putMethodCall(inputMethod, inIt, Float8, vload8f, {0_val, in}); + putMethodCall(inputMethod, inIt, Float8, vload8f, {1_val, in}); + putMethodCall(inputMethod, inIt, Float8, vload8f, {2_val, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Float8); + } + + { + // vload16uc * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType Int8Ptr = inputMethod.createPointerType(TYPE_INT8); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, Int8Ptr, "%in") = UNIFORM_REGISTER; + + putMethodCall(inputMethod, inIt, Uchar16, vload16uc, {0_val, in}); + putMethodCall(inputMethod, inIt, Uchar16, vload16uc, {1_val, in}); + putMethodCall(inputMethod, inIt, Uchar16, vload16uc, {2_val, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Uchar16); + } + + { + // vload16f * 3 + + Module module{config}; + Method inputMethod(module); + + const DataType FloatPtr = inputMethod.createPointerType(TYPE_FLOAT); + + auto inIt = inputMethod.createAndInsertNewBlock(inputMethod.end(), "%dummy").walkEnd(); + auto in = assign(inIt, FloatPtr, "%in") = UNIFORM_REGISTER; + + auto offset1 = assign(inIt, TYPE_INT32, "%offset1") = 42_val; + auto offset2 = assign(inIt, TYPE_INT32, "%offset2") = offset1 + 1_val; + auto offset3 = assign(inIt, TYPE_INT32, "%offset3") = offset1 + 2_val; + + putMethodCall(inputMethod, inIt, Float16, vload16f, {offset3, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {offset2, in}); + putMethodCall(inputMethod, inIt, Float16, vload16f, {offset1, in}); + + testCombineDMALoadsSub(module, inputMethod, config, Float16); + } + + // { + // // expand + // + // Literal l(2); + // Value a(l, TYPE_INT32); + // Value b = 3_val; + // SubExpression expr( + // new ValueBinaryOp(makeValueBinaryOpFromLocal(a, ValueBinaryOp::BinaryOp::Add, b), + // ValueBinaryOp::BinaryOp::Sub, std::make_shared(1_val))); + // ValueExpr::ExpandedExprs expanded; + // expr->expand(expanded); + // + // TEST_ASSERT_EQUALS(1, expanded.size()); + // + // auto n = expanded[0].second->getInteger(); + // TEST_ASSERT_EQUALS(4, n.value_or(0)); + // } +} diff --git a/test/TestOptimizationSteps.h b/test/TestOptimizationSteps.h index 531c8f2d..2118dc9b 100644 --- a/test/TestOptimizationSteps.h +++ b/test/TestOptimizationSteps.h @@ -32,6 +32,7 @@ class TestOptimizationSteps : public Test::Suite void testEliminateMoves(); void testEliminateDeadCode(); void testLoopInvariantCodeMotion(); + void testCombineDMALoads(); private: void testMethodsEquals(vc4c::Method& m1, vc4c::Method& m2);