From 2fb50d64c791ccde00f0c15af05d031cfaa3bfc6 Mon Sep 17 00:00:00 2001 From: andyD123 Date: Mon, 8 Jul 2024 22:18:08 +0100 Subject: [PATCH] Add extra multi reduce functions (#14) * adding extra multi reduce fuctions * adding more versions of multi reduce with different nuimbers of unrol and arguments * remove unused variables, clean up accumulate/ multi reduction * commit of variadic reduction * clean up variadic reduce project --- .gitignore | 10 + VariadicReducrion/VariadicReducrion.vcxproj | 350 ++++++ .../VariadicReducrion.vcxproj.filters | 22 + VariadicReducrion/VariadicReduction.cpp | 515 ++++++++ VectorTest/DR3_tests.cpp | 263 ++++ Vectorisation.sln | 62 + Vectorisation/VecX/accumulate_transform.h | 1074 ++++++++++++++++- Vectorisation/VecX/dr3.h | 84 ++ Vectorisation/Vectorisation.vcxproj | 2 +- 9 files changed, 2348 insertions(+), 34 deletions(-) create mode 100644 VariadicReducrion/VariadicReducrion.vcxproj create mode 100644 VariadicReducrion/VariadicReducrion.vcxproj.filters create mode 100644 VariadicReducrion/VariadicReduction.cpp diff --git a/.gitignore b/.gitignore index 47b6041..a4105ca 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,13 @@ /scratch/r003mi2 /scratch/scratch.inspxeproj /debug.log +/cmake-build-release/.cmake/api/v1/reply +/cmake-build-release/.cmake/api/v1/query +/cmake-build-release/accumulateExample/CMakeFiles/accumulateExample.dir/accumulate_example.cpp.obj +/cmake-build-release +/VariadicReducrion/e000 +/VariadicReducrion/Intel® VTune™ Profiler Results/VariadicReducrion +/VariadicReducrion/x64 +/DawnCache +/config +/GPUCache diff --git a/VariadicReducrion/VariadicReducrion.vcxproj b/VariadicReducrion/VariadicReducrion.vcxproj new file mode 100644 index 0000000..d5f64c0 --- /dev/null +++ b/VariadicReducrion/VariadicReducrion.vcxproj @@ -0,0 +1,350 @@ + + + + + clang-cl23 + Win32 + + + clang-cl23 + x64 + + + Debug + Win32 + + + ICC2023 + Win32 + + + ICC2023 + x64 + + + Release-23 + Win32 + + + Release-23 + x64 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + Win32Proj + {271cf3d5-72ff-4657-9325-4206b8d5c84f} + VariadicReducrion + 10.0 + VariadicReduction + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + Application + false + v143 + true + Unicode + + + Application + false + v143 + true + Unicode + + + Application + false + v143 + true + Unicode + + + Application + true + v143 + Unicode + + + Application + false + ClangCL + true + Unicode + + + Application + false + v143 + true + Unicode + + + Application + false + ClangCL + true + Unicode + + + Application + false + Intel C++ Compiler 2023 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions512 + stdcpp17 + Precise + AnySuitable + Speed + Default + false + + + Console + true + true + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions512 + stdcpp17 + Precise + AnySuitable + Speed + Default + false + + + Console + true + true + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions512 + stdcpp17 + Precise + AnySuitable + Speed + Default + false + + + Console + true + true + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions512 + stdcpp17 + Precise + AnySuitable + Speed + Default + false + + + Console + true + true + true + + + + + + + + {837beacf-bced-4ab2-baf5-a7254eb6756b} + + + + + + \ No newline at end of file diff --git a/VariadicReducrion/VariadicReducrion.vcxproj.filters b/VariadicReducrion/VariadicReducrion.vcxproj.filters new file mode 100644 index 0000000..e757fe9 --- /dev/null +++ b/VariadicReducrion/VariadicReducrion.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/VariadicReducrion/VariadicReduction.cpp b/VariadicReducrion/VariadicReduction.cpp new file mode 100644 index 0000000..ea95c99 --- /dev/null +++ b/VariadicReducrion/VariadicReduction.cpp @@ -0,0 +1,515 @@ +// VariadicReducrion.cpp : This file contains the 'main' function. Program execution begins and ends there. +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "../Vectorisation/VecX/dr3.h" +#include "../Vectorisation/VecX/accumulate_transform.h" +#include "../Vectorisation/VecX/error_utils.h" + + +#include "../Vectorisation/VecX/zip_utils.h" + +//#include "norm.h" + + +/* +Uncomment one of the Using namespace lines below to select the instruction set that you wish to run +Those ending in F have float type as underlying, those ending with D have a double. + +The project is set to compile using the AVX512 enhanced instruction set. The namespace selection +choses the type of the intrinsics that are used to instantiate lambdas. + +If your hardware does not support AVX512 chose the next level down AVX2 and avoid using namespaces +DRC::VecD8D or DRC::VecF16F which will cause generation of code with instructions that your computer doesn't support. + +check device manager/processor to determine what processor you have and check against web site +https://ark.intel.com/content/www/us/en/ark/products/123550/intel-xeon-silver-4114-processor-13-75m-cache-2-20-ghz.html +or +https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html + +//Next uncomment example that you wish to run. +You can change projects setting for the compiler debug/release are for VS2019, clang and ICC2022 + +precision of results compare for float types is incorrect. + +*/ + +//pick an instruction set for intrinsics by selecting a name space +//using namespace DRC::VecLDb; //long double +//using namespace DRC::VecDb; //double +//using namespace DRC::VecD2D; //sse2 double +//using namespace DRC::VecD4D; //avx2 double +//using namespace DRC::VecF8F; // avx2 float + +using namespace DRC::VecD8D; //avx512 double +//using namespace DRC::VecF16F; //avx512 float + + +using FLOAT = typename InstructionTraits::FloatType; + + + +const double billion = 1000000000.0; + + +template +bool vectorsEqual(const std::vector& C1, const std::vector& C2, const std::vector& C3) +{ + bool testOK = true; + const double ERR = 1e-13; //for examples + if (C1.size() != C2.size()) + { + return false; + } + + if (C3.size() != C2.size()) + { + return false; + } + + for (size_t i = 0; i < C3.size(); i++) + { + auto err1 = fabs((C1[i] - C2[i]) / (C2[i] + C1[i])); + auto err2 = fabs((C1[i] - C3[i]) / (C1[i] + C3[i])); + + if ((err1 > ERR) || (err2 > ERR)) + { + testOK = false; + std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n"; + break; + } + } + + return testOK; + +} + + +long double getErr(const std::vector& t) +{ + ignore(t); + return 2e-11; +} + + +double getErr(const std::vector& t) +{ + ignore(t); + return 2e-11; +} + + +float getErr(const std::vector& t) +{ + ignore(t); + return 1.0; +} + + +template +bool vectorsEqualD(const std::vector& C1, const std::vector& C2, const std::vector& C3, const std::vector& input, T ERR = 1e-13) +{ + + + ERR = getErr(C1); + + bool testOK = true; + + if (C1.size() != C2.size()) + { + std::cout << "wrong size C1,C2" << C1.size() << ", " << C2.size() << std::endl; + return false; + } + + if (C3.size() != C2.size()) + { + std::cout << "wrong size C2,C3" << C2.size() << ", " << C3.size() << std::endl; + return false; + } + + for (size_t i = 0; i < C3.size(); i++) + { + auto err1 = fabs((C1[i] - C2[i]) / (fabs(C2[i]) + fabs(C1[i]))); + auto err2 = fabs((C1[i] - C3[i]) / (fabs(C1[i]) + fabs(C3[i]))); + + if ((err1 > ERR) || (err2 > ERR)) + { + testOK = false; + std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n"; + std::cout << "\n val @ " << i << " C1[i] =" << C1[i] << ", C2[i] = " << C2[i] << ", C3[i] = " << C3[i] << "input val=" << input[i] << "\n"; + std::cout << std::endl; + break; + } + } + + return testOK; + +} + +bool valuesAreEqual(double x, double y, double tol = 1e-14) +{ + + auto err1 = fabs((x - y) / (fabs(x) + fabs(y))); + + return (err1 > tol) ? false : true; +} + + +auto getRandomShuffledVector(int SZ, int instance_number = 0) +{ + using FloatType = typename InstructionTraits::FloatType; + + + static std::map > vectors; + + + int key = 10 * SZ + instance_number; + //store vectors with key 10 times size and add on 0-9 integer for instance of different random vector + + if (SZ < 0) + { + vectors.clear(); + SZ = 0; + } + + + if (vectors.find(key) != vectors.end()) + { + return vectors[key]; + } + else + { + std::vector v(SZ, VecXX::SCALA_TYPE(6.66)); + for (int i = 0; i < SZ; i++) { v[i] += /*FloatType(SZ / 2)*/ +i; } + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(begin(v), end(v), g); + vectors[key] = v; + return v; + } +} + + + + + + + +auto numOps = [](int TEST_LOOP_SZ, int SZ) { return static_cast(double(TEST_LOOP_SZ) * double(SZ)); }; + + +double getnull(double) +{ + return 0.0; +} + +using Calc_Values = std::map; +using Calc_Values_V = std::map >; +using Mapped_Performance_Results = std::map >; // array size v vector +using Mapped_Stats = std::map >; // size -.pair ( throughput , std dev of through put) + +struct RunResults +{ + Mapped_Performance_Results m_raw_results; + Calc_Values m_calc_results; +}; + +struct RunResultsVec +{ + Mapped_Performance_Results m_raw_results; + Calc_Values_V m_calc_results; +}; + +class TimerGuard +{ + double& m_runTime; + std::chrono::steady_clock::time_point m_startTme; + +public: + TimerGuard(double& runTime) : m_runTime(runTime), m_startTme(std::chrono::steady_clock::now()) { runTime = 0.; } + + ~TimerGuard() + { + auto endTime = std::chrono::steady_clock::now(); + auto runtime = endTime - m_startTme; + m_runTime = runtime.count() / billion; + } +}; + + +auto runFunctionOverDifferentSize = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ) +{ + + RunResults results; + + for (int j = 0; j < testRepeats; ++j) + { + int VEC_SZ = vec_start_size; + for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ) + { + auto res = func(VEC_SZ, testLoopSZ); + auto calculation_rate = res.second; + auto calc_value = res.first; + results.m_raw_results[VEC_SZ].push_back(calculation_rate); + if (j == 0) + { + results.m_calc_results[VEC_SZ] = static_cast(calc_value); + } + } + } + return results; +}; + +/* +auto runFunctionOverDifferentSizeVec = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ) + { + + RunResultsVec results; + + for (int j = 0; j < testRepeats; ++j) + { + int VEC_SZ = vec_start_size; + for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ) + { + auto res = func(VEC_SZ, testLoopSZ); + auto calculation_rate = res.second; + auto calc_value = res.first; + results.m_raw_results[VEC_SZ].push_back(calculation_rate); + + if (j == 0) + { + std::vector tmp = res.first; + results.m_calc_results[VEC_SZ] = tmp; + } + + } + } + return results; + }; + +*/ + +auto performanceStats = [](const Mapped_Performance_Results& raw_results) +{ + + Mapped_Stats stats; + + for (const auto& item : raw_results) + { + double sum = 0; + double sum_sqrd = 0; + double N = 0.0; + for (const auto run_rate : item.second) + { + sum += run_rate; + sum_sqrd += (run_rate * run_rate); + N++; + } + + double avg = sum / N; + double varSqrd = sum_sqrd + (avg * avg * N) - (2.0 * avg * sum); + double var = std::sqrt(varSqrd / (N - 1.)); + + stats[item.first] = { avg ,var }; + + } + return stats; +}; + + + +//example doing multi reduction operation after load +//should be faster +void doMultiStats(int runType =7) +{ + + const long TEST_LOOP_SZ = 10;// 100;// 100;//400;// 1000; + const int repeatRuns = 20;// 20; + const int vectorStepSize = 200; + const int maxVectorSize = 20000; + const int minVectorSize = 400; + + getRandomShuffledVector(-1); // reset random input vectors + + + auto accumulate_run = [&](int VEC_SZ, long TEST_LOOP_SZ) + { + double time = 0.; + double res = 0.; + double res_min = 0.; + volatile double total = 0.; //sum to prevent optimizing out + + auto v1 = getRandomShuffledVector(VEC_SZ, 0); + + auto sum_vals = [](double accumulate, double val) {return accumulate + val; }; + auto sumSqr = [](double accumulate, double val) {return accumulate + val * val; }; + auto sumCube = [](double accumulate, double val) {return accumulate + val * val * val; }; + auto sumQuart = [](double accumulate, double val) {return accumulate + val * val * val * val; }; + + //warm up + for (long l = 0; l < 100; l++) + { + auto sum_val = std::reduce( v1.begin(), v1.end(), 0.0, sum_vals); + auto sum_val1 = std::reduce( v1.begin(), v1.end(), 0.0, sumSqr); + auto sum_val2 = std::reduce( v1.begin(), v1.end(), 0.0, sumCube); + auto sum_val3 = std::reduce( v1.begin(), v1.end(), 0.0, sumQuart); + + total = sum_val + sum_val1 + sum_val2 + sum_val3; + } + + + { + TimerGuard timer(time); + for (long l = 0; l < TEST_LOOP_SZ; l++) + { + + auto sum_val = std::reduce( v1.begin(), v1.end(), 0.0, sum_vals); + auto sum_val1 = std::reduce( v1.begin(), v1.end(), 0.0, sumSqr); + auto sum_val2 = std::reduce( v1.begin(), v1.end(), 0.0, sumCube); + auto sum_val3 = std::reduce( v1.begin(), v1.end(), 0.0, sumQuart); + + + total = sum_val + sum_val1 + sum_val2 + sum_val3; + //compute combined value over all results to check tests and stop short circuit + } + } + + ignore(total); + ignore(res_min); + + return std::make_pair(res, numOps(TEST_LOOP_SZ, VEC_SZ) / time); + }; + + + + auto DR3_accumulate = [&](int SZ, long TEST_LOOP_SZ) + { + double time = 0.; + volatile double res = 0.; + + + auto sum = [](auto accumulate, auto val) {return accumulate + val; }; + auto sumSqr = [](auto accumulate, auto val) {return accumulate + val * val; }; + auto sumCube = [](auto accumulate, auto val) {return accumulate + val * val * val; }; + auto sumQuart = [](auto accumulate, auto val) {return accumulate + val * val * val * val; }; + + auto v1 = getRandomShuffledVector(SZ, 0); // std stl vector double or float + VecXX vec(v1); + + + for (long l = 0; l < 100; l++) + { + double sum_mnn = reduce(vec, sum); + double sumSqr_mnn = reduce(vec, sumSqr); + double sum_Cube = reduce(vec, sumCube); + double sum_Qrt = reduce(vec, sumQuart); + + res = 0.5 * (sum_mnn + sumSqr_mnn + sum_Cube + sum_Qrt); + } + + + { + TimerGuard timer(time); + for (long l = 0; l < TEST_LOOP_SZ; l++) + { + double sum_mnn = reduce(vec, sum); + double sumSqr_mnn = reduce(vec, sumSqr); + double sum_Cube = reduce(vec, sumCube); + double sum_Qrt = reduce(vec, sumQuart); + + res = 0.5 * (sum_mnn + sumSqr_mnn + sum_Cube + sum_Qrt); + } + } + + return std::make_pair(res, numOps(TEST_LOOP_SZ, SZ) / time); + + }; + + auto DR3_accumulate_multi = [&](int SZ, long TEST_LOOP_SZ) + { + double time = 0.; + volatile double res = 0.; + + auto sum = [](auto accumulate, auto val) {return accumulate + val; }; + auto sumSqr = [](auto accumulate, auto val) {return accumulate + val * val; }; + auto sumCube = [](auto accumulate, auto val) {return accumulate + val * val * val; }; + auto sumQuart = [](auto accumulate, auto val) {return accumulate + val * val * val * val; }; + + auto v1 = getRandomShuffledVector(SZ, 0); // std stl vector double or float + VecXX vec(v1); + + auto ress = reduceM(vec, sum, sumSqr, sumCube, sumQuart); + //warm up + for (long l = 0; l < 100; l++) + { + ress = reduceM(vec, sum, sumSqr, sumCube, sumQuart); + } + + + { + TimerGuard timer(time); + for (long l = 0; l < TEST_LOOP_SZ; l++) + { + ress = reduceM(vec, sum, sumSqr, sumCube, sumQuart); + + double sum_mnn = std::get<0>(ress); + double sumSqr_mnn = std::get<1>(ress); + double sum_Cube = std::get<2>(ress); + double sum_Qrt = std::get<3>(ress); + + res = 0.5 * (sum_mnn + sumSqr_mnn + sum_Cube + sum_Qrt); + } + } + + return std::make_pair(res, numOps(TEST_LOOP_SZ, SZ) / time); + }; + + + auto run_res_stl = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, accumulate_run, TEST_LOOP_SZ); + auto stats_stl = performanceStats(run_res_stl.m_raw_results); + + auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_accumulate, TEST_LOOP_SZ); + auto stats_DR3_perf = performanceStats(dr3_raw_results.m_raw_results); + + + auto dr3_raw_results_mult = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_accumulate_multi, TEST_LOOP_SZ); + auto stats_DR3_perf_mult = performanceStats(dr3_raw_results_mult.m_raw_results); + + + //print out results + for (const auto& perf_stl : stats_stl) + { + auto valDr3 = dr3_raw_results.m_calc_results[perf_stl.first]; + auto valStl = run_res_stl.m_calc_results[perf_stl.first]; + auto valDr3Mult = dr3_raw_results_mult.m_calc_results[perf_stl.first]; + auto strMatch = valuesAreEqual(valDr3, valStl, valDr3Mult) ? "calcs match" : "cal difference"; + std::cout << " std::max_element, size " << perf_stl.first << ", " << perf_stl.second.first << ", + - ," << perf_stl.second.second + << "\t \t DR3 reduce, size " << perf_stl.first << ", " << stats_DR3_perf[perf_stl.first].first << ", + - ," << stats_DR3_perf[perf_stl.first].second + << "\t \t DR3 reduce_mult, size " << perf_stl.first << ", " << stats_DR3_perf_mult[perf_stl.first].first << ", + - ," << stats_DR3_perf_mult[perf_stl.first].second + << ", numerical check : " << strMatch << "\n"; + } + +} + + + +int main() +{ + doMultiStats(7); + return 0; +} + + diff --git a/VectorTest/DR3_tests.cpp b/VectorTest/DR3_tests.cpp index 00ef779..0a8f294 100644 --- a/VectorTest/DR3_tests.cpp +++ b/VectorTest/DR3_tests.cpp @@ -2087,6 +2087,269 @@ TEST(TestDR3, testReduce_Views) testReduce_Vw(65); } +//////////////////////////// + +void testReduce_VecM_2(int SZ) +{ + + + std::vector input(SZ, asNumber(0.0)); + std::iota(begin(input), end(input), asNumber(0.0)); + + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(begin(input), end(input), g); + VecXX testVec(input); + VecXX negTestVec = -testVec + asNumber(SZ / 2); + + auto SUM = [](auto x, auto y) { return x + y; }; + auto SUM2 = [](auto x, auto y) { return x + y; }; + auto MAX = [](auto x, auto y) { return iff((x > y), x, y); }; + auto MIN = [](auto x, auto y) { return iff((x < y), x, y); }; + auto SUM_SQR = [](auto x, auto y) { return x + y*y; }; + + double smSqr = 0.0; + for (auto x : input) + { + smSqr += SUM_SQR(smSqr, x); + } + + auto resMulti_2 = reduceM(testVec, SUM, MAX); + auto SUM_M = std::get<0>(resMulti_2); + auto MAX_M = std::get<1>(resMulti_2); + auto expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(SUM_M, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(MAX_M, asNumber(asNumber(SZ - 1))); + + + //auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN); + auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN); + auto res_lambda_0 = std::get<0>(resMulti_3); + auto res_lambda_1 = std::get<1>(resMulti_3); + auto res_lambda_2 = std::get<2>(resMulti_3); + expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(res_lambda_0, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_1, asNumber(asNumber(SZ - 1))); + EXPECT_NUMERIC_EQ(res_lambda_2, asNumber(0)); + + + //////////////////// + + auto resMulti_4 = reduceM(testVec, SUM, MAX, MIN, SUM2); + auto res_lambda_0_4 = std::get<0>(resMulti_4); + auto res_lambda_1_4 = std::get<1>(resMulti_4); + auto res_lambda_2_4 = std::get<2>(resMulti_4); + auto res_lambda_3_4 = std::get<3>(resMulti_4); + expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(res_lambda_3_4, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_0_4, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_1_4, asNumber(asNumber(SZ - 1))); + EXPECT_NUMERIC_EQ(res_lambda_2_4, asNumber(0)); + +} + +void testReduce_VecM_1(int SZ) +{ + + + std::vector input(SZ, asNumber(0.0)); + std::iota(begin(input), end(input), asNumber(0.0)); + + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(begin(input), end(input), g); + VecXX testVec(input); + VecXX negTestVec = -testVec + asNumber(SZ / 2); + + auto SUM = [](auto x, auto y) { return x + y; }; + auto SUM2 = [](auto x, auto y) { return x + y; }; + auto MAX = [](auto x, auto y) { return iff((x > y), x, y); }; + auto MIN = [](auto x, auto y) { return iff((x < y), x, y); }; + auto SUM_SQR = [](auto x, auto y) { return x + y * y; }; + + double smSqr = 0.0; + for (auto x : input) + { + smSqr += SUM_SQR(smSqr, x); + } + + auto resMulti_2 = reduceM1(testVec, SUM, MAX); + auto SUM_M = std::get<0>(resMulti_2); + auto MAX_M = std::get<1>(resMulti_2); + auto expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(SUM_M, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(MAX_M, asNumber(asNumber(SZ - 1))); + + + //auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN); + auto resMulti_3 = reduceM1(testVec, SUM, MAX, MIN); + auto res_lambda_0 = std::get<0>(resMulti_3); + auto res_lambda_1 = std::get<1>(resMulti_3); + auto res_lambda_2 = std::get<2>(resMulti_3); + expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(res_lambda_0, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_1, asNumber(asNumber(SZ - 1))); + EXPECT_NUMERIC_EQ(res_lambda_2, asNumber(0)); + + + //////////////////// + + auto resMulti_4 = reduceM1(testVec, SUM, MAX, MIN, SUM2); + auto res_lambda_0_4 = std::get<0>(resMulti_4); + auto res_lambda_1_4 = std::get<1>(resMulti_4); + auto res_lambda_2_4 = std::get<2>(resMulti_4); + auto res_lambda_3_4 = std::get<3>(resMulti_4); + expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(res_lambda_3_4, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_0_4, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_1_4, asNumber(asNumber(SZ - 1))); + EXPECT_NUMERIC_EQ(res_lambda_2_4, asNumber(0)); + +} + + +void testReduce_VecM_22(int SZ) +{ + + + std::vector input(SZ, asNumber(0.0)); + std::iota(begin(input), end(input), asNumber(0.0)); + + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(begin(input), end(input), g); + VecXX testVec(input); + VecXX negTestVec = -testVec + asNumber(SZ / 2); + + auto SUM = [](auto x, auto y) { return x + y; }; + auto SUM2 = [](auto x, auto y) { return x + y; }; + auto MAX = [](auto x, auto y) { return iff((x > y), x, y); }; + auto MIN = [](auto x, auto y) { return iff((x < y), x, y); }; + auto SUM_SQR = [](auto x, auto y) { return x + y * y; }; + + double smSqr = 0.0; + for (auto x : input) + { + smSqr += SUM_SQR(smSqr, x); + } + + auto resMulti_2 = reduceM2(testVec, SUM, MAX); + auto SUM_M = std::get<0>(resMulti_2); + auto MAX_M = std::get<1>(resMulti_2); + auto expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(SUM_M, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(MAX_M, asNumber(asNumber(SZ - 1))); + + + //auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN); + auto resMulti_3 = reduceM2(testVec, SUM, MAX, MIN); + auto res_lambda_0 = std::get<0>(resMulti_3); + auto res_lambda_1 = std::get<1>(resMulti_3); + auto res_lambda_2 = std::get<2>(resMulti_3); + expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(res_lambda_0, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_1, asNumber(asNumber(SZ - 1))); + EXPECT_NUMERIC_EQ(res_lambda_2, asNumber(0)); + + + //////////////////// + + auto resMulti_4 = reduceM2(testVec, SUM, MAX, MIN, SUM2); + auto res_lambda_0_4 = std::get<0>(resMulti_4); + auto res_lambda_1_4 = std::get<1>(resMulti_4); + auto res_lambda_2_4 = std::get<2>(resMulti_4); + auto res_lambda_3_4 = std::get<3>(resMulti_4); + expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0)); + EXPECT_NUMERIC_EQ(res_lambda_3_4, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_0_4, asNumber(expectedSum)); + EXPECT_NUMERIC_EQ(res_lambda_1_4, asNumber(asNumber(SZ - 1))); + EXPECT_NUMERIC_EQ(res_lambda_2_4, asNumber(0)); + +} + + + + +/* +tests mult with x4 unroll +*/ +TEST(TestDR3, testReduceM_VecXX) +{ + + testReduce_VecM_2(2); + testReduce_VecM_2(3); + testReduce_VecM_2(4); + + + for (int SZ = 3; SZ < 33; SZ++) + { + testReduce_VecM_2(SZ); + } + + testReduce_VecM_2(34); + testReduce_VecM_2(63); + testReduce_VecM_2(64); + testReduce_VecM_2(65); + + + //// + + //TO DO TEST with unroll x2 + // and unrol x1 + +} +/////////////////////////////////// + + + +/* +tests mult with x1 unroll +*/ +TEST(TestDR3, testReduceM1_VecXX) +{ + testReduce_VecM_1(32); + testReduce_VecM_1(8); + testReduce_VecM_1(2); + testReduce_VecM_1(3); + testReduce_VecM_1(4); + + + for (int SZ = 3; SZ < 33; SZ++) + { + testReduce_VecM_1(SZ); + } + + testReduce_VecM_1(34); + testReduce_VecM_1(63); + testReduce_VecM_1(64); + testReduce_VecM_1(65); +} + + +TEST(TestDR3, testReduceM2_VecXX) +{ + testReduce_VecM_22(32); + testReduce_VecM_22(8); + testReduce_VecM_22(2); + testReduce_VecM_22(3); + testReduce_VecM_22(4); + + + for (int SZ = 3; SZ < 33; SZ++) + { + testReduce_VecM_22(SZ); + } + + testReduce_VecM_22(34); + testReduce_VecM_22(63); + testReduce_VecM_22(64); + testReduce_VecM_22(65); +} + + + + + void testTransformReduceUnitary_Vec(int SZ) { diff --git a/Vectorisation.sln b/Vectorisation.sln index ceccbfb..9d9a3f4 100644 --- a/Vectorisation.sln +++ b/Vectorisation.sln @@ -37,6 +37,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "curveExample", "curveExampl EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "scratch", "scratch\scratch.vcxproj", "{0EE7A152-AE30-4285-A5B1-6252ABC7A418}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "VariadicReducrion", "VariadicReducrion\VariadicReducrion.vcxproj", "{271CF3D5-72FF-4657-9325-4206B8D5C84F}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution cl23-Debug|ARM64 = cl23-Debug|ARM64 @@ -643,6 +645,66 @@ Global {0EE7A152-AE30-4285-A5B1-6252ABC7A418}.Release-23|x64.Build.0 = Release-23|x64 {0EE7A152-AE30-4285-A5B1-6252ABC7A418}.Release-23|x86.ActiveCfg = Release|Win32 {0EE7A152-AE30-4285-A5B1-6252ABC7A418}.Release-23|x86.Build.0 = Release|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.ActiveCfg = clang-cl23|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.Build.0 = clang-cl23|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|ARM64.ActiveCfg = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|ARM64.Build.0 = Debug|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.ActiveCfg = ICC2023|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.Build.0 = ICC2023|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.ActiveCfg = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.Build.0 = Debug|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|ARM64.ActiveCfg = Release|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|ARM64.Build.0 = Release|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.ActiveCfg = Release|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.Build.0 = Release|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.ActiveCfg = Release|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.Build.0 = Release|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|ARM64.ActiveCfg = Release|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|ARM64.Build.0 = Release|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.ActiveCfg = Release-23|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.Build.0 = Release-23|x64 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.ActiveCfg = Release|Win32 + {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/Vectorisation/VecX/accumulate_transform.h b/Vectorisation/VecX/accumulate_transform.h index 727c483..098f575 100644 --- a/Vectorisation/VecX/accumulate_transform.h +++ b/Vectorisation/VecX/accumulate_transform.h @@ -3110,9 +3110,154 @@ VecView ApplySelectTransformUR_XC(const VecView& rhs1, OP& co } +//////experimental unrolled double transform accumulation +template< typename INS_VEC, typename TF, typename TF2, typename OP, typename OP2> +typename std::tuple::FloatType, typename InstructionTraits::FloatType> +ApplyTransformAccumulate2UR_X2(const Vec& rhs1, TF& opTrans, OP& oper, TF2& opTrans2, OP2& oper2) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue() }; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + int step = 4 * width; + + INS_VEC RHS1; + INS_VEC RES; + + INS_VEC RHS2; + INS_VEC RES1; + + INS_VEC RHS3; + INS_VEC RES2; + + INS_VEC RHS4; + INS_VEC RES3; + + // + INS_VEC RHS1_A; + INS_VEC RES_A; + + INS_VEC RHS2_A; + INS_VEC RES1_A; + + INS_VEC RHS3_A; + INS_VEC RES2_A; + + INS_VEC RHS4_A; + INS_VEC RES3_A; + + // + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES1.load_a(pRhs1 + i + width); + RES2.load_a(pRhs1 + i + width * 2); + RES3.load_a(pRhs1 + i + width * 3); + + + RES_A = opTrans2(RES); + RES1_A = opTrans2(RES1); + RES2_A = opTrans2(RES2); + RES3_A = opTrans2(RES3); + + + RES = opTrans(RES); + RES1 = opTrans(RES1); + RES2 = opTrans(RES2); + RES3 = opTrans(RES3); + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, opTrans(RHS1)); + RES_A = oper2(RES_A, opTrans2(RHS1)); + + RHS2.load_a(pRhs1 + i + width); + RES1 = oper(RES1, opTrans(RHS2)); + RES1_A = oper2(RES1_A, opTrans2(RHS2)); + + RHS3.load_a(pRhs1 + i + width * 2); + RES2 = oper(RES2, opTrans(RHS3)); + RES2_A = oper2(RES2_A, opTrans2(RHS3)); + + RHS4.load_a(pRhs1 + i + width * 3); + RES3 = oper(RES3, opTrans(RHS4)); + RES3_A = oper2(RES3_A, opTrans2(RHS4)); + + } + + // odd bits + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, opTrans(RHS1)); + RES_A = oper2(RES_A, opTrans2(RHS1)); + } + + RES = oper(RES, RES1); + RES2 = oper(RES2, RES3); + RES = oper(RES, RES2); + + RES_A = oper2(RES_A, RES1_A); + RES2_A = oper2(RES2_A, RES3_A); + RES_A = oper2(RES_A, RES2_A); + + } + else + { + RES.load_a(pRhs1); + RES_A = opTrans2(RES); + RES = opTrans(RES); + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, opTrans(RHS1)); + RES_A = oper2(RES_A, opTrans2(RHS1)); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + //this is a bug need to transform + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + } + + return { result, result_A }; +} +/// un-rolled x4 +/////////////////////////////////////////////// //////experimental unrolled double accumulation template< typename INS_VEC, typename OP, typename OP2> typename std::tuple::FloatType, typename InstructionTraits::FloatType> @@ -3251,18 +3396,15 @@ typename std::tuple::FloatType, typename Ins - -////////////// - -//////experimental unrolled double transform accumulation -template< typename INS_VEC, typename TF, typename TF2, typename OP, typename OP2> -typename std::tuple::FloatType, typename InstructionTraits::FloatType> -ApplyTransformAccumulate2UR_X2(const Vec& rhs1, TF& opTrans, OP& oper, TF2& opTrans2, OP2& oper2) +//////experimental unrolled triple accumulation/ reduction +template< typename INS_VEC, typename OP, typename OP2, typename OP3> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +ApplyAccumulate2UR_X2(const Vec& rhs1, OP& oper, OP2& oper2, OP3& oper3) { check_vector(rhs1); if (isScalar(rhs1)) // nothing to accumulate with so just return value { - return { rhs1.getScalarValue(),rhs1.getScalarValue() }; + return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue() }; } int sz = rhs1.size(); @@ -3297,6 +3439,11 @@ ApplyTransformAccumulate2UR_X2(const Vec& rhs1, TF& opTrans, OP& oper, // + INS_VEC RES_B; + INS_VEC RES1_B; + INS_VEC RES2_B; + INS_VEC RES3_B; + int i = 0; if (sz >= step * 2) @@ -3309,16 +3456,15 @@ ApplyTransformAccumulate2UR_X2(const Vec& rhs1, TF& opTrans, OP& oper, RES3.load_a(pRhs1 + i + width * 3); - RES_A = opTrans2(RES); - RES1_A = opTrans2(RES1); - RES2_A = opTrans2(RES2); - RES3_A = opTrans2(RES3); - + RES_A = RES; + RES1_A = RES1; + RES2_A = RES2; + RES3_A = RES3; - RES = opTrans(RES); - RES1 = opTrans(RES1); - RES2 = opTrans(RES2); - RES3 = opTrans(RES3); + RES_B = RES; + RES1_B = RES1; + RES2_B = RES2; + RES3_B = RES3; } i += step; @@ -3326,20 +3472,24 @@ ApplyTransformAccumulate2UR_X2(const Vec& rhs1, TF& opTrans, OP& oper, for (; i <= (rhsSZ - step); i += step) { RHS1.load_a(pRhs1 + i); - RES = oper(RES, opTrans(RHS1)); - RES_A = oper2(RES_A, opTrans2(RHS1)); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); RHS2.load_a(pRhs1 + i + width); - RES1 = oper(RES1, opTrans(RHS2)); - RES1_A = oper2(RES1_A, opTrans2(RHS2)); + RES1 = oper(RES1, RHS2); + RES1_A = oper2(RES1_A, RHS2); + RES1_B = oper3(RES1_B, RHS2); RHS3.load_a(pRhs1 + i + width * 2); - RES2 = oper(RES2, opTrans(RHS3)); - RES2_A = oper2(RES2_A, opTrans2(RHS3)); + RES2 = oper(RES2, RHS3); + RES2_A = oper2(RES2_A, RHS3); + RES2_B = oper3(RES2_B, RHS3); RHS4.load_a(pRhs1 + i + width * 3); - RES3 = oper(RES3, opTrans(RHS4)); - RES3_A = oper2(RES3_A, opTrans2(RHS4)); + RES3 = oper(RES3, RHS4); + RES3_A = oper2(RES3_A, RHS4); + RES3_B = oper3(RES3_B, RHS4); } @@ -3347,8 +3497,9 @@ ApplyTransformAccumulate2UR_X2(const Vec& rhs1, TF& opTrans, OP& oper, for (; i <= rhsSZ - width; i += width) { RHS1.load_a(pRhs1 + i); - RES = oper(RES, opTrans(RHS1)); - RES_A = oper2(RES_A, opTrans2(RHS1)); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); } RES = oper(RES, RES1); @@ -3359,43 +3510,900 @@ ApplyTransformAccumulate2UR_X2(const Vec& rhs1, TF& opTrans, OP& oper, RES2_A = oper2(RES2_A, RES3_A); RES_A = oper2(RES_A, RES2_A); + + RES_B = oper3(RES_B, RES1_B); + RES2_B = oper3(RES2_B, RES3_B); + RES_B = oper3(RES_B, RES2_B); + } else { RES.load_a(pRhs1); - RES_A = opTrans2(RES); - RES= opTrans(RES); + RES_A = RES; + RES_B = RES; i += width; // odd bits for (; i <= sz - width; i += width) { RHS1.load_a(pRhs1 + i); - RES = oper(RES, opTrans(RHS1)); - RES_A = oper2(RES_A, opTrans2(RHS1)); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); } } typename InstructionTraits::FloatType result = RES[0]; typename InstructionTraits::FloatType result_A = RES_A[0]; + typename InstructionTraits::FloatType result_B = RES_B[0]; int min_wdth = std::min(sz, width); //across vectors lanes // not assuming horizontal version exist for (int j = 1; j < min_wdth; ++j) { result = ApplyBinaryOperationVec(result, RES[j], oper); result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + result_B = ApplyBinaryOperationVec(result_B, RES_B[j], oper3); } //end bits for vecs not filling padding for (; i < rhs1.size(); ++i) { - //this is a bug need to transform result = ApplyBinaryOperationVec(pRhs1[i], result, oper); result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + result_B = ApplyBinaryOperationVec(pRhs1[i], result_B, oper3); } - return { result, result_A }; + return { result, result_A, result_B }; } + +//////experimental unrolled quad accumulation/ reduction +template< typename INS_VEC, typename OP, typename OP2, typename OP3, typename OP4> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +ApplyAccumulate2UR_X2(const Vec& rhs1, OP& oper, OP2& oper2, OP3& oper3, OP4& oper4) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue(), rhs1.getScalarValue(),rhs1.getScalarValue() }; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + int step = 4 * width; + + INS_VEC RHS1; + INS_VEC RES; + + INS_VEC RHS2; + INS_VEC RES1; + + INS_VEC RHS3; + INS_VEC RES2; + + INS_VEC RHS4; + INS_VEC RES3; + + // + INS_VEC RHS1_A; + INS_VEC RES_A; + + INS_VEC RHS2_A; + INS_VEC RES1_A; + + INS_VEC RHS3_A; + INS_VEC RES2_A; + + INS_VEC RHS4_A; + INS_VEC RES3_A; + + // + + INS_VEC RES_B; + INS_VEC RES1_B; + INS_VEC RES2_B; + INS_VEC RES3_B; + + INS_VEC RES_C; + INS_VEC RES1_C; + INS_VEC RES2_C; + INS_VEC RES3_C; + + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES1.load_a(pRhs1 + i + width); + RES2.load_a(pRhs1 + i + width * 2); + RES3.load_a(pRhs1 + i + width * 3); + + + RES_A = RES; + RES1_A = RES1; + RES2_A = RES2; + RES3_A = RES3; + + RES_B = RES; + RES1_B = RES1; + RES2_B = RES2; + RES3_B = RES3; + + RES_C = RES; + RES1_C = RES1; + RES2_C = RES2; + RES3_C = RES3; + + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + + RHS2.load_a(pRhs1 + i + width); + RES1 = oper(RES1, RHS2); + RES1_A = oper2(RES1_A, RHS2); + RES1_B = oper3(RES1_B, RHS2); + RES1_C = oper4(RES1_C, RHS2); + + RHS3.load_a(pRhs1 + i + width * 2); + RES2 = oper(RES2, RHS3); + RES2_A = oper2(RES2_A, RHS3); + RES2_B = oper3(RES2_B, RHS3); + RES2_C = oper4(RES2_C, RHS3); + + RHS4.load_a(pRhs1 + i + width * 3); + RES3 = oper(RES3, RHS4); + RES3_A = oper2(RES3_A, RHS4); + RES3_B = oper3(RES3_B, RHS4); + RES3_C = oper4(RES3_C, RHS4); + + } + + // odd bits + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + } + + RES = oper(RES, RES1); + RES2 = oper(RES2, RES3); + RES = oper(RES, RES2); + + RES_A = oper2(RES_A, RES1_A); + RES2_A = oper2(RES2_A, RES3_A); + RES_A = oper2(RES_A, RES2_A); + + + RES_B = oper3(RES_B, RES1_B); + RES2_B = oper3(RES2_B, RES3_B); + RES_B = oper3(RES_B, RES2_B); + + RES_C = oper4(RES_C, RES1_C); + RES2_C = oper4(RES2_C, RES3_C); + RES_C = oper4(RES_C, RES2_C); + + } + else + { + RES.load_a(pRhs1); + RES_A = RES; + RES_B = RES; + RES_C = RES; + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + typename InstructionTraits::FloatType result_B = RES_B[0]; + typename InstructionTraits::FloatType result_C = RES_C[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + result_B = ApplyBinaryOperationVec(result_B, RES_B[j], oper3); + result_C = ApplyBinaryOperationVec(result_C, RES_C[j], oper4); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + result_B = ApplyBinaryOperationVec(pRhs1[i], result_B, oper3); + result_C = ApplyBinaryOperationVec(pRhs1[i], result_C, oper4); + } + + return { result, result_A, result_B, result_C}; +} + + +/// un-rolled x2 +/////////////////////////////////////////////// +//////experimental unrolled double accumulation +template< typename INS_VEC, typename OP, typename OP2> +typename std::tuple::FloatType, typename InstructionTraits::FloatType > +ApplyAccumulate2UR_X2_2(const Vec& rhs1, OP& oper, OP2& oper2) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue() }; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + + //int step = 4 * width; + int step = 2 * width; + + INS_VEC RHS1; + INS_VEC RES; + + INS_VEC RHS2; + INS_VEC RES1; + + INS_VEC RHS1_A; + INS_VEC RES_A; + + INS_VEC RHS2_A; + INS_VEC RES1_A; + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES1.load_a(pRhs1 + i + width); + + RES_A = RES; + RES1_A = RES1; + + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + + RHS2.load_a(pRhs1 + i + width); + RES1 = oper(RES1, RHS2); + RES1_A = oper2(RES1_A, RHS2); + + } + + // odd bits + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + } + + RES = oper(RES, RES1); + RES_A = oper2(RES_A, RES1_A); + + } + else + { + RES.load_a(pRhs1); + RES_A = RES; + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + } + + return { result, result_A }; +} + + + +//////experimental unrolled triple accumulation/ reduction +template< typename INS_VEC, typename OP, typename OP2, typename OP3> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +ApplyAccumulate2UR_X2_2(const Vec& rhs1, OP& oper, OP2& oper2, OP3& oper3) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue() }; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + //int step = 4 * width; + int step = 2 * width; + + INS_VEC RHS1; + INS_VEC RES; + + INS_VEC RHS2; + INS_VEC RES1; + + INS_VEC RHS1_A; + INS_VEC RES_A; + + INS_VEC RHS2_A; + INS_VEC RES1_A; + + + INS_VEC RES_B; + INS_VEC RES1_B; + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES1.load_a(pRhs1 + i + width); + + RES_A = RES; + RES1_A = RES1; + + RES_B = RES; + RES1_B = RES1; + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + + RHS2.load_a(pRhs1 + i + width); + RES1 = oper(RES1, RHS2); + RES1_A = oper2(RES1_A, RHS2); + RES1_B = oper3(RES1_B, RHS2); + + } + + // odd bits + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + } + + RES = oper(RES, RES1); + RES_A = oper2(RES_A, RES1_A); + RES_B = oper3(RES_B, RES1_B); + + } + else + { + RES.load_a(pRhs1); + RES_A = RES; + RES_B = RES; + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + typename InstructionTraits::FloatType result_B = RES_B[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + result_B = ApplyBinaryOperationVec(result_B, RES_B[j], oper3); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + result_B = ApplyBinaryOperationVec(pRhs1[i], result_B, oper3); + } + + return { result, result_A, result_B }; +} + +// + +//////experimental unrolled quad accumulation/ reduction +template< typename INS_VEC, typename OP, typename OP2, typename OP3, typename OP4> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +ApplyAccumulate2UR_X2_2(const Vec& rhs1, OP& oper, OP2& oper2, OP3& oper3, OP4& oper4) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue(), rhs1.getScalarValue(),rhs1.getScalarValue() }; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + + int step = 2 * width; + + INS_VEC RHS1; + INS_VEC RES; + + INS_VEC RHS2; + INS_VEC RES1; + + INS_VEC RHS1_A; + INS_VEC RES_A; + + INS_VEC RHS2_A; + INS_VEC RES1_A; + + INS_VEC RES_B; + INS_VEC RES1_B; + + INS_VEC RES_C; + INS_VEC RES1_C; + + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES1.load_a(pRhs1 + i + width); + + RES_A = RES; + RES1_A = RES1; + + RES_B = RES; + RES1_B = RES1; + + RES_C = RES; + RES1_C = RES1; + + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + + RHS2.load_a(pRhs1 + i + width); + RES1 = oper(RES1, RHS2); + RES1_A = oper2(RES1_A, RHS2); + RES1_B = oper3(RES1_B, RHS2); + RES1_C = oper4(RES1_C, RHS2); + + } + + // odd bits + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + } + + RES = oper(RES, RES1); + RES_A = oper2(RES_A, RES1_A); + RES_B = oper3(RES_B, RES1_B); + RES_C = oper4(RES_C, RES1_C); + + } + else + { + RES.load_a(pRhs1); + RES_A = RES; + RES_B = RES; + RES_C = RES; + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + typename InstructionTraits::FloatType result_B = RES_B[0]; + typename InstructionTraits::FloatType result_C = RES_C[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + result_B = ApplyBinaryOperationVec(result_B, RES_B[j], oper3); + result_C = ApplyBinaryOperationVec(result_C, RES_C[j], oper4); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + result_B = ApplyBinaryOperationVec(pRhs1[i], result_B, oper3); + result_C = ApplyBinaryOperationVec(pRhs1[i], result_C, oper4); + } + + return { result, result_A, result_B, result_C }; +} + + +//////////////////////////// unrolled x1 + +/////////////////////////////////////////////// +//////experimental unrolled double accumulation +template< typename INS_VEC, typename OP, typename OP2> +typename std::tuple::FloatType, typename InstructionTraits::FloatType > +ApplyAccumulate2UR_X2_1(const Vec& rhs1, OP& oper, OP2& oper2) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue() }; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + + int step = width; + + INS_VEC RHS1; + INS_VEC RES; + INS_VEC RES_A; + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES_A = RES; + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + } + + // odd bits not needed + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + } + + } + else + { + RES.load_a(pRhs1); + RES_A = RES; + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + } + + return { result, result_A }; +} + + + +//////experimental unrolled triple accumulation/ reduction +template< typename INS_VEC, typename OP, typename OP2, typename OP3> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +ApplyAccumulate2UR_X2_1(const Vec& rhs1, OP& oper, OP2& oper2, OP3& oper3) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue()}; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + + int step = width; + + INS_VEC RHS1; + INS_VEC RES; + INS_VEC RES_A; + INS_VEC RES_B; + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES_A = RES; + RES_B = RES; + + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + + } + + // odd bits + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + } + + + } + else + { + RES.load_a(pRhs1); + RES_A = RES; + RES_B = RES; + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + typename InstructionTraits::FloatType result_B = RES_B[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + result_B = ApplyBinaryOperationVec(result_B, RES_B[j], oper3); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + result_B = ApplyBinaryOperationVec(pRhs1[i], result_B, oper3); + } + + return { result, result_A, result_B }; +} + + +//////experimental unrolled quad accumulation/ reduction +template< typename INS_VEC, typename OP, typename OP2, typename OP3, typename OP4> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +ApplyAccumulate2UR_X2_1(const Vec& rhs1, OP& oper, OP2& oper2, OP3& oper3, OP4& oper4) +{ + check_vector(rhs1); + if (isScalar(rhs1)) // nothing to accumulate with so just return value + { + return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue() ,rhs1.getScalarValue() }; + } + + int sz = rhs1.size(); + auto pRhs1 = rhs1.start(); + const int width = InstructionTraits::width; + + int step = width; + + INS_VEC RHS1; + INS_VEC RES; + INS_VEC RES_A; + INS_VEC RES_B; + INS_VEC RES_C; + + int i = 0; + + if (sz >= step * 2) + { + //initialise first set of registers + { + RES.load_a(pRhs1 + i); + RES_A = RES; + RES_B = RES; + RES_C = RES; + + } + + i += step; + int rhsSZ = rhs1.size(); + for (; i <= (rhsSZ - step); i += step) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + + } + + // odd bits + for (; i <= rhsSZ - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper4(RES_C, RHS1); + } + } + else + { + RES.load_a(pRhs1); + RES_A = RES; + RES_B = RES; + RES_C = RES; + + i += width; + // odd bits + for (; i <= sz - width; i += width) + { + RHS1.load_a(pRhs1 + i); + RES = oper(RES, RHS1); + RES_A = oper2(RES_A, RHS1); + RES_B = oper3(RES_B, RHS1); + RES_C = oper3(RES_C, RHS1); + } + + } + + typename InstructionTraits::FloatType result = RES[0]; + typename InstructionTraits::FloatType result_A = RES_A[0]; + typename InstructionTraits::FloatType result_B = RES_B[0]; + typename InstructionTraits::FloatType result_C = RES_C[0]; + int min_wdth = std::min(sz, width); + //across vectors lanes // not assuming horizontal version exist + for (int j = 1; j < min_wdth; ++j) + { + result = ApplyBinaryOperationVec(result, RES[j], oper); + result_A = ApplyBinaryOperationVec(result_A, RES_A[j], oper2); + result_B = ApplyBinaryOperationVec(result_B, RES_B[j], oper3); + result_C = ApplyBinaryOperationVec(result_C, RES_C[j], oper4); + } + + //end bits for vecs not filling padding + for (; i < rhs1.size(); ++i) + { + result = ApplyBinaryOperationVec(pRhs1[i], result, oper); + result_A = ApplyBinaryOperationVec(pRhs1[i], result_A, oper2); + result_B = ApplyBinaryOperationVec(pRhs1[i], result_B, oper3); + result_C = ApplyBinaryOperationVec(pRhs1[i], result_C, oper4); + } + + return { result, result_A, result_B, result_C }; +} diff --git a/Vectorisation/VecX/dr3.h b/Vectorisation/VecX/dr3.h index b9c5a6a..80becc2 100644 --- a/Vectorisation/VecX/dr3.h +++ b/Vectorisation/VecX/dr3.h @@ -722,8 +722,92 @@ reduceM(const Vec& rhs, OP& oper, OP1& oper1) } +//////experimental unrolled triple accumulation +template< typename INS_VEC, typename OP, typename OP1, typename OP2> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +reduceM(const Vec& rhs, OP& oper, OP1& oper1, OP2& oper2) +{ + return ApplyAccumulate2UR_X2(rhs, oper, oper1, oper2); + +} + + +//////experimental unrolled quad accumulation +template< typename INS_VEC, typename OP, typename OP1, typename OP2, typename OP3> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +reduceM(const Vec& rhs, OP& oper, OP1& oper1, OP2& oper2, OP3& oper3) +{ + return ApplyAccumulate2UR_X2(rhs, oper, oper1, oper2, oper3); + +} + +////////////////////////////// better dispatch to do +//////experimental unrolled double accumulation +template< typename INS_VEC, typename OP, typename OP1> +typename std::tuple::FloatType, typename InstructionTraits::FloatType> +reduceM1(const Vec& rhs, OP& oper, OP1& oper1) +{ + return ApplyAccumulate2UR_X2_1(rhs, oper, oper1); + +} + + +//////experimental unrolled triple accumulation +template< typename INS_VEC, typename OP, typename OP1, typename OP2> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +reduceM1(const Vec& rhs, OP& oper, OP1& oper1, OP2& oper2) +{ + return ApplyAccumulate2UR_X2_1(rhs, oper, oper1, oper2); + +} + + +//////experimental unrolled quad accumulation +template< typename INS_VEC, typename OP, typename OP1, typename OP2, typename OP3> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +reduceM1(const Vec& rhs, OP& oper, OP1& oper1, OP2& oper2, OP3& oper3) +{ + return ApplyAccumulate2UR_X2_1(rhs, oper, oper1, oper2, oper3); + +} + +////////////////////////////////////////// + +////////////////////////////// better dispatch to do +//////experimental unrolled double accumulation +template< typename INS_VEC, typename OP, typename OP1> +typename std::tuple::FloatType, typename InstructionTraits::FloatType> +reduceM2(const Vec& rhs, OP& oper, OP1& oper1) +{ + return ApplyAccumulate2UR_X2_2(rhs, oper, oper1); + +} + + +//////experimental unrolled triple accumulation +template< typename INS_VEC, typename OP, typename OP1, typename OP2> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +reduceM2(const Vec& rhs, OP& oper, OP1& oper1, OP2& oper2) +{ + return ApplyAccumulate2UR_X2_2(rhs, oper, oper1, oper2); + +} + + +//////experimental unrolled quad accumulation +template< typename INS_VEC, typename OP, typename OP1, typename OP2, typename OP3> +typename std::tuple::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType, typename InstructionTraits::FloatType> +reduceM2(const Vec& rhs, OP& oper, OP1& oper1, OP2& oper2, OP3& oper3) +{ + return ApplyAccumulate2UR_X2_2(rhs, oper, oper1, oper2, oper3); + +} + + + +////////////////////////////////// //unitary transform template< typename INS_VEC, typename OPT, typename OP> typename InstructionTraits::FloatType transformReduce1(const Vec& rhs1, OPT& operTransform, OP& operAcc, typename InstructionTraits::FloatType initVal = InstructionTraits::nullValue, bool singularInit = true) diff --git a/Vectorisation/Vectorisation.vcxproj b/Vectorisation/Vectorisation.vcxproj index 6a808c1..30ab090 100644 --- a/Vectorisation/Vectorisation.vcxproj +++ b/Vectorisation/Vectorisation.vcxproj @@ -287,7 +287,7 @@ StaticLibrary false - v142 + ClangCL true Unicode false