From 2fb50d64c791ccde00f0c15af05d031cfaa3bfc6 Mon Sep 17 00:00:00 2001
From: andyD123 <andreedrakeford@hotmail.com>
Date: Mon, 8 Jul 2024 22:18:08 +0100
Subject: [PATCH] Add extra multi reduce functions (#14)

* adding extra multi reduce fuctions

* adding more versions of multi reduce with different nuimbers of unrol and arguments

* remove unused variables, clean up accumulate/ multi reduction

* commit of variadic reduction

* clean up variadic reduce project
---
 .gitignore                                    |   10 +
 VariadicReducrion/VariadicReducrion.vcxproj   |  350 ++++++
 .../VariadicReducrion.vcxproj.filters         |   22 +
 VariadicReducrion/VariadicReduction.cpp       |  515 ++++++++
 VectorTest/DR3_tests.cpp                      |  263 ++++
 Vectorisation.sln                             |   62 +
 Vectorisation/VecX/accumulate_transform.h     | 1074 ++++++++++++++++-
 Vectorisation/VecX/dr3.h                      |   84 ++
 Vectorisation/Vectorisation.vcxproj           |    2 +-
 9 files changed, 2348 insertions(+), 34 deletions(-)
 create mode 100644 VariadicReducrion/VariadicReducrion.vcxproj
 create mode 100644 VariadicReducrion/VariadicReducrion.vcxproj.filters
 create mode 100644 VariadicReducrion/VariadicReduction.cpp
diff --git a/.gitignore b/.gitignore
index 47b6041..a4105ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,13 @@
 /scratch/r003mi2
 /scratch/scratch.inspxeproj
 /debug.log
+/cmake-build-release/.cmake/api/v1/reply
+/cmake-build-release/.cmake/api/v1/query
+/cmake-build-release/accumulateExample/CMakeFiles/accumulateExample.dir/accumulate_example.cpp.obj
+/cmake-build-release
+/VariadicReducrion/e000
+/VariadicReducrion/Intel® VTune™ Profiler Results/VariadicReducrion
+/VariadicReducrion/x64
+/DawnCache
+/config
+/GPUCache
diff --git a/VariadicReducrion/VariadicReducrion.vcxproj b/VariadicReducrion/VariadicReducrion.vcxproj
new file mode 100644
index 0000000..d5f64c0
--- /dev/null
+++ b/VariadicReducrion/VariadicReducrion.vcxproj
@@ -0,0 +1,350 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="clang-cl23|Win32">
+      <Configuration>clang-cl23</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="clang-cl23|x64">
+      <Configuration>clang-cl23</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="ICC2023|Win32">
+      <Configuration>ICC2023</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="ICC2023|x64">
+      <Configuration>ICC2023</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release-23|Win32">
+      <Configuration>Release-23</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release-23|x64">
+      <Configuration>Release-23</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>17.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{271cf3d5-72ff-4657-9325-4206b8d5c84f}</ProjectGuid>
+    <RootNamespace>VariadicReducrion</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+    <ProjectName>VariadicReduction</ProjectName>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-23|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='clang-cl23|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ICC2023|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>ClangCL</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-23|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='clang-cl23|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>ClangCL</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ICC2023|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>Intel C++ Compiler 2023</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-23|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='clang-cl23|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='ICC2023|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-23|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='clang-cl23|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='ICC2023|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-23|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='clang-cl23|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='ICC2023|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <FloatingPointModel>Precise</FloatingPointModel>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <StructMemberAlignment>Default</StructMemberAlignment>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-23|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <FloatingPointModel>Precise</FloatingPointModel>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <StructMemberAlignment>Default</StructMemberAlignment>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='clang-cl23|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <FloatingPointModel>Precise</FloatingPointModel>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <StructMemberAlignment>Default</StructMemberAlignment>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='ICC2023|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <FloatingPointModel>Precise</FloatingPointModel>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <StructMemberAlignment>Default</StructMemberAlignment>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="VariadicReduction.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Vectorisation\Vectorisation.vcxproj">
+      <Project>{837beacf-bced-4ab2-baf5-a7254eb6756b}</Project>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/VariadicReducrion/VariadicReducrion.vcxproj.filters b/VariadicReducrion/VariadicReducrion.vcxproj.filters
new file mode 100644
index 0000000..e757fe9
--- /dev/null
+++ b/VariadicReducrion/VariadicReducrion.vcxproj.filters
@@ -0,0 +1,22 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="VariadicReduction.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/VariadicReducrion/VariadicReduction.cpp b/VariadicReducrion/VariadicReduction.cpp
new file mode 100644
index 0000000..ea95c99
--- /dev/null
+++ b/VariadicReducrion/VariadicReduction.cpp
@@ -0,0 +1,515 @@
+// VariadicReducrion.cpp : This file contains the 'main' function. Program execution begins and ends there.
+//
+
+#include <algorithm>
+#include <random>
+#include <numeric>
+#include <iterator>
+#include <iostream>
+#include <vector>
+#include <chrono>
+#include <iomanip>  
+#include <thread>
+#include <map>
+#include <cstring>
+#include <execution>
+
+
+#include "../Vectorisation/VecX/dr3.h"
+#include "../Vectorisation/VecX/accumulate_transform.h"
+#include "../Vectorisation/VecX/error_utils.h"
+
+
+#include "../Vectorisation/VecX/zip_utils.h"
+
+//#include "norm.h"
+
+
+/*
+Uncomment one of the Using namespace lines below to select the instruction set that you wish to run
+Those ending in F have float type as underlying, those ending with D have a double.
+
+The project is set to compile using the AVX512  enhanced instruction set. The namespace selection
+choses the type of the intrinsics that are used to instantiate lambdas.
+
+If your hardware does not support AVX512 chose the next level down AVX2 and avoid using namespaces
+DRC::VecD8D or DRC::VecF16F which will cause generation of code with instructions that your computer doesn't support.
+
+check device manager/processor to determine what processor you have and check against web site
+https://ark.intel.com/content/www/us/en/ark/products/123550/intel-xeon-silver-4114-processor-13-75m-cache-2-20-ghz.html
+or
+https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html
+
+//Next uncomment example that you wish to run.
+You can change projects setting for the compiler debug/release  are for VS2019, clang  and ICC2022
+
+precision of results compare for float types is incorrect.
+
+*/
+
+//pick an instruction set for intrinsics by selecting a name space
+//using namespace DRC::VecLDb;  //long double 
+//using namespace DRC::VecDb; //double
+//using namespace DRC::VecD2D;  //sse2   double
+//using namespace DRC::VecD4D;	//avx2   double
+//using namespace DRC::VecF8F;	// avx2  float
+
+using namespace DRC::VecD8D;  //avx512 double
+//using namespace DRC::VecF16F; //avx512   float
+
+
+using FLOAT = typename InstructionTraits<VecXX::INS>::FloatType;
+
+
+
+const double billion = 1000000000.0;
+
+
+template<typename T>
+bool vectorsEqual(const std::vector<T>& C1, const std::vector<T>& C2, const std::vector<T>& C3)
+{
+	bool  testOK = true;
+	const double ERR = 1e-13; //for examples
+	if (C1.size() != C2.size())
+	{
+		return false;
+	}
+
+	if (C3.size() != C2.size())
+	{
+		return false;
+	}
+
+	for (size_t i = 0; i < C3.size(); i++)
+	{
+		auto err1 = fabs((C1[i] - C2[i]) / (C2[i] + C1[i]));
+		auto err2 = fabs((C1[i] - C3[i]) / (C1[i] + C3[i]));
+
+		if ((err1 > ERR) || (err2 > ERR))
+		{
+			testOK = false;
+			std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n";
+			break;
+		}
+	}
+
+	return testOK;
+
+}
+
+
+long double getErr(const  std::vector<long double>& t)
+{
+	ignore(t);
+	return 2e-11;
+}
+
+
+double getErr(const  std::vector<double>& t)
+{
+	ignore(t);
+	return 2e-11;
+}
+
+
+float getErr(const  std::vector<float>& t)
+{
+	ignore(t);
+	return 1.0;
+}
+
+
+template<typename T>
+bool vectorsEqualD(const std::vector<T>& C1, const std::vector<T>& C2, const std::vector<T>& C3, const std::vector<T>& input, T ERR = 1e-13)
+{
+
+
+	ERR = getErr(C1);
+
+	bool  testOK = true;
+
+	if (C1.size() != C2.size())
+	{
+		std::cout << "wrong size C1,C2" << C1.size() << ", " << C2.size() << std::endl;
+		return false;
+	}
+
+	if (C3.size() != C2.size())
+	{
+		std::cout << "wrong size C2,C3" << C2.size() << ", " << C3.size() << std::endl;
+		return false;
+	}
+
+	for (size_t i = 0; i < C3.size(); i++)
+	{
+		auto err1 = fabs((C1[i] - C2[i]) / (fabs(C2[i]) + fabs(C1[i])));
+		auto err2 = fabs((C1[i] - C3[i]) / (fabs(C1[i]) + fabs(C3[i])));
+
+		if ((err1 > ERR) || (err2 > ERR))
+		{
+			testOK = false;
+			std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n";
+			std::cout << "\n val @ " << i << " C1[i] =" << C1[i] << ", C2[i] = " << C2[i] << ", C3[i] = " << C3[i] << "input val=" << input[i] << "\n";
+			std::cout << std::endl;
+			break;
+		}
+	}
+
+	return testOK;
+
+}
+
+bool valuesAreEqual(double x, double y, double tol = 1e-14)
+{
+
+	auto err1 = fabs((x - y) / (fabs(x) + fabs(y)));
+
+	return (err1 > tol) ? false : true;
+}
+
+
+auto getRandomShuffledVector(int SZ, int instance_number = 0)
+{
+	using FloatType = typename InstructionTraits<VecXX::INS>::FloatType;
+
+
+	static std::map<int, std::vector<FloatType> > vectors;
+
+
+	int key = 10 * SZ + instance_number;
+	//store vectors with key 10 times size  and add on 0-9 integer for instance of different random vector
+
+	if (SZ < 0)
+	{
+		vectors.clear();
+		SZ = 0;
+	}
+
+
+	if (vectors.find(key) != vectors.end())
+	{
+		return vectors[key];
+	}
+	else
+	{
+		std::vector<FloatType>  v(SZ, VecXX::SCALA_TYPE(6.66));
+		for (int i = 0; i < SZ; i++) { v[i] += /*FloatType(SZ / 2)*/ +i; }
+		std::random_device rd;
+		std::mt19937 g(rd());
+		std::shuffle(begin(v), end(v), g);
+		vectors[key] = v;
+		return v;
+	}
+}
+
+
+
+
+
+
+
+auto numOps = [](int TEST_LOOP_SZ, int SZ) { return  static_cast<int>(double(TEST_LOOP_SZ) * double(SZ)); };
+
+
+double getnull(double)
+{
+	return 0.0;
+}
+
+using Calc_Values = std::map<int, FLOAT>;
+using Calc_Values_V = std::map<int, std::vector<FLOAT> >;
+using  Mapped_Performance_Results = std::map<int, std::vector<double> >; // array size  v vector<throughput for runs>
+using Mapped_Stats = std::map<int, std::pair<double, double> >; // size -.pair ( throughput ,  std dev of through put)
+
+struct RunResults
+{
+	Mapped_Performance_Results m_raw_results;
+	Calc_Values  m_calc_results;
+};
+
+struct RunResultsVec
+{
+	Mapped_Performance_Results m_raw_results;
+	Calc_Values_V  m_calc_results;
+};
+
+class TimerGuard
+{
+	double& m_runTime;
+	std::chrono::steady_clock::time_point  m_startTme;
+
+public:
+	TimerGuard(double& runTime) : m_runTime(runTime), m_startTme(std::chrono::steady_clock::now()) { runTime = 0.; }
+
+	~TimerGuard()
+	{
+		auto endTime = std::chrono::steady_clock::now();
+		auto runtime = endTime - m_startTme;
+		m_runTime = runtime.count() / billion;
+	}
+};
+
+
+auto runFunctionOverDifferentSize = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ)
+{
+
+	RunResults results;
+
+	for (int j = 0; j < testRepeats; ++j)
+	{
+		int VEC_SZ = vec_start_size;
+		for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ)
+		{
+			auto res = func(VEC_SZ, testLoopSZ);
+			auto calculation_rate = res.second;
+			auto calc_value = res.first;
+			results.m_raw_results[VEC_SZ].push_back(calculation_rate);
+			if (j == 0)
+			{
+				results.m_calc_results[VEC_SZ] = static_cast<FLOAT>(calc_value);
+			}
+		}
+	}
+	return results;
+};
+
+/*
+auto runFunctionOverDifferentSizeVec = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ)
+	{
+
+		RunResultsVec results;
+
+		for (int j = 0; j < testRepeats; ++j)
+		{
+			int VEC_SZ = vec_start_size;
+			for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ)
+			{
+				auto res = func(VEC_SZ, testLoopSZ);
+				auto calculation_rate = res.second;
+				auto calc_value = res.first;
+				results.m_raw_results[VEC_SZ].push_back(calculation_rate);
+
+				if (j == 0)
+				{
+					std::vector<FLOAT> tmp = res.first;
+					results.m_calc_results[VEC_SZ] = tmp;
+				}
+
+			}
+		}
+		return results;
+	};
+
+*/
+
+auto performanceStats = [](const Mapped_Performance_Results& raw_results)
+{
+
+	Mapped_Stats stats;
+
+	for (const auto& item : raw_results)
+	{
+		double sum = 0;
+		double sum_sqrd = 0;
+		double N = 0.0;
+		for (const auto run_rate : item.second)
+		{
+			sum += run_rate;
+			sum_sqrd += (run_rate * run_rate);
+			N++;
+		}
+
+		double avg = sum / N;
+		double varSqrd = sum_sqrd + (avg * avg * N) - (2.0 * avg * sum);
+		double var = std::sqrt(varSqrd / (N - 1.));
+
+		stats[item.first] = { avg ,var };
+
+	}
+	return stats;
+};
+
+
+
+//example doing  multi reduction operation after load
+//should be faster 
+void doMultiStats(int runType =7)
+{
+
+	const long TEST_LOOP_SZ = 10;// 100;// 100;//400;// 1000;
+	const int repeatRuns = 20;// 20;
+	const int vectorStepSize = 200;
+	const int maxVectorSize = 20000;
+	const int minVectorSize = 400;
+
+	getRandomShuffledVector(-1); // reset  random input vectors
+
+
+		auto accumulate_run = [&](int VEC_SZ, long TEST_LOOP_SZ)
+		{
+			double time = 0.;
+			double res = 0.;
+			double res_min = 0.;
+			volatile  double total = 0.;  //sum to prevent optimizing out
+		
+			auto v1 = getRandomShuffledVector(VEC_SZ, 0);
+
+			auto sum_vals = [](double accumulate, double val) {return accumulate + val; };
+			auto sumSqr = [](double accumulate, double val) {return accumulate + val * val; };
+			auto sumCube = [](double accumulate, double val) {return accumulate + val * val * val; };
+			auto sumQuart = [](double accumulate, double val) {return accumulate + val * val * val * val; };
+
+			//warm up
+			for (long l = 0; l < 100; l++)
+			{
+				auto sum_val = std::reduce( v1.begin(), v1.end(), 0.0, sum_vals);
+				auto sum_val1 = std::reduce( v1.begin(), v1.end(), 0.0, sumSqr);
+				auto sum_val2 = std::reduce( v1.begin(), v1.end(), 0.0, sumCube);
+				auto sum_val3 = std::reduce( v1.begin(), v1.end(), 0.0, sumQuart);
+				
+				total = sum_val + sum_val1 + sum_val2 + sum_val3;
+			}
+
+
+			{
+				TimerGuard timer(time);
+				for (long l = 0; l < TEST_LOOP_SZ; l++)
+				{
+
+					auto sum_val = std::reduce( v1.begin(), v1.end(), 0.0, sum_vals);
+					auto sum_val1 = std::reduce( v1.begin(), v1.end(), 0.0, sumSqr);
+					auto sum_val2 = std::reduce( v1.begin(), v1.end(), 0.0, sumCube);
+					auto sum_val3 = std::reduce( v1.begin(), v1.end(), 0.0, sumQuart);
+
+
+					total = sum_val + sum_val1 + sum_val2 + sum_val3; 
+					//compute combined value over all results  to check tests and stop short circuit
+				}
+			}
+
+			ignore(total);
+			ignore(res_min);
+
+			return  std::make_pair(res, numOps(TEST_LOOP_SZ, VEC_SZ) / time);
+		};
+
+
+
+		auto DR3_accumulate = [&](int SZ, long TEST_LOOP_SZ)
+		{
+			double time = 0.;
+			volatile  double res = 0.;
+		
+
+			auto sum = [](auto accumulate, auto val) {return accumulate + val; };
+			auto sumSqr = [](auto accumulate, auto val) {return accumulate + val * val; };
+			auto sumCube = [](auto accumulate, auto val) {return accumulate + val * val * val; };
+			auto sumQuart = [](auto accumulate, auto val) {return accumulate + val * val * val * val; };
+
+			auto v1 = getRandomShuffledVector(SZ, 0); // std stl vector double or float 
+			VecXX vec(v1);
+
+
+			for (long l = 0; l < 100; l++)
+			{
+				double sum_mnn = reduce(vec, sum);
+				double sumSqr_mnn = reduce(vec, sumSqr);
+				double sum_Cube = reduce(vec, sumCube);
+				double sum_Qrt = reduce(vec, sumQuart);
+
+				res = 0.5 * (sum_mnn + sumSqr_mnn + sum_Cube + sum_Qrt);
+			}
+
+
+			{
+				TimerGuard timer(time);
+				for (long l = 0; l < TEST_LOOP_SZ; l++)
+				{
+					double sum_mnn = reduce(vec, sum);
+					double sumSqr_mnn = reduce(vec, sumSqr);
+					double sum_Cube = reduce(vec, sumCube);
+					double sum_Qrt = reduce(vec, sumQuart);
+
+					res = 0.5 * (sum_mnn + sumSqr_mnn + sum_Cube + sum_Qrt);
+				}
+			}
+
+			return std::make_pair(res, numOps(TEST_LOOP_SZ, SZ) / time);
+
+		};
+
+		auto DR3_accumulate_multi = [&](int SZ, long TEST_LOOP_SZ)
+			{
+				double time = 0.;
+				volatile  double res = 0.;
+
+				auto sum = [](auto accumulate, auto val) {return accumulate + val; };
+				auto sumSqr = [](auto accumulate, auto val) {return accumulate + val * val; };
+				auto sumCube = [](auto accumulate, auto val) {return accumulate + val * val * val; };
+				auto sumQuart = [](auto accumulate, auto val) {return accumulate + val * val * val * val; };
+
+				auto v1 = getRandomShuffledVector(SZ, 0); // std stl vector double or float 
+				VecXX vec(v1);
+
+				auto ress = reduceM(vec, sum, sumSqr, sumCube, sumQuart);
+				//warm up
+				for (long l = 0; l < 100; l++)
+				{
+					ress = reduceM(vec, sum, sumSqr, sumCube, sumQuart);
+				}
+
+
+				{
+					TimerGuard timer(time);
+					for (long l = 0; l < TEST_LOOP_SZ; l++)
+					{
+						ress = reduceM(vec, sum, sumSqr, sumCube, sumQuart);
+
+						double sum_mnn = std::get<0>(ress);
+						double sumSqr_mnn = std::get<1>(ress);
+						double sum_Cube = std::get<2>(ress);
+						double sum_Qrt = std::get<3>(ress);
+
+						res = 0.5 * (sum_mnn + sumSqr_mnn + sum_Cube + sum_Qrt);
+					}
+				}
+
+				return std::make_pair(res, numOps(TEST_LOOP_SZ, SZ) / time);
+			};
+
+
+			auto run_res_stl = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, accumulate_run, TEST_LOOP_SZ);
+			auto stats_stl = performanceStats(run_res_stl.m_raw_results);
+
+			auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_accumulate, TEST_LOOP_SZ);
+			auto stats_DR3_perf = performanceStats(dr3_raw_results.m_raw_results);
+
+
+			auto dr3_raw_results_mult = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_accumulate_multi, TEST_LOOP_SZ);
+			auto stats_DR3_perf_mult = performanceStats(dr3_raw_results_mult.m_raw_results);
+
+
+			//print out results
+			for (const auto& perf_stl : stats_stl)
+			{
+				auto  valDr3 = dr3_raw_results.m_calc_results[perf_stl.first];
+				auto  valStl = run_res_stl.m_calc_results[perf_stl.first];
+				auto  valDr3Mult = dr3_raw_results_mult.m_calc_results[perf_stl.first];
+				auto strMatch = valuesAreEqual(valDr3, valStl, valDr3Mult) ? "calcs match" : "cal difference";
+				std::cout << "  std::max_element, size " << perf_stl.first << ", " << perf_stl.second.first << ", + - ," << perf_stl.second.second
+					<< "\t \t DR3 reduce, size " << perf_stl.first << ", " << stats_DR3_perf[perf_stl.first].first << ",  + - ," << stats_DR3_perf[perf_stl.first].second
+					<< "\t \t DR3 reduce_mult, size " << perf_stl.first << ", " << stats_DR3_perf_mult[perf_stl.first].first << ",  + - ," << stats_DR3_perf_mult[perf_stl.first].second
+					<< ", numerical check : " << strMatch << "\n";
+			}
+	
+}
+
+
+
+int main()
+{
+	doMultiStats(7);
+	return 0;
+}
+
+
diff --git a/VectorTest/DR3_tests.cpp b/VectorTest/DR3_tests.cpp
index 00ef779..0a8f294 100644
--- a/VectorTest/DR3_tests.cpp
+++ b/VectorTest/DR3_tests.cpp
@@ -2087,6 +2087,269 @@ TEST(TestDR3, testReduce_Views)
 	testReduce_Vw(65);
 }
 
+////////////////////////////
+
+void testReduce_VecM_2(int SZ)
+{
+
+
+	std::vector<Numeric> input(SZ, asNumber(0.0));
+	std::iota(begin(input), end(input), asNumber(0.0));
+
+	std::random_device rd;
+	std::mt19937 g(rd());
+	std::shuffle(begin(input), end(input), g);
+	VecXX testVec(input);
+	VecXX negTestVec = -testVec + asNumber(SZ / 2);
+
+	auto SUM = [](auto x, auto y) { return x + y; };
+	auto SUM2 = [](auto x, auto y) { return x + y; };
+	auto MAX = [](auto x, auto y) { return iff((x > y), x, y); };
+	auto MIN = [](auto x, auto y) { return iff((x < y), x, y); };
+	auto SUM_SQR = [](auto x, auto y) { return x + y*y; };
+
+	double smSqr = 0.0;
+	for (auto x : input)
+	{
+		smSqr += SUM_SQR(smSqr, x);
+	}
+
+	auto resMulti_2 = reduceM(testVec, SUM, MAX);
+	auto SUM_M = std::get<0>(resMulti_2);
+	auto MAX_M = std::get<1>(resMulti_2);
+	auto expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(SUM_M, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(MAX_M, asNumber(asNumber(SZ - 1)));
+
+
+	//auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN);
+	auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN);
+	 auto res_lambda_0 = std::get<0>(resMulti_3);
+	 auto res_lambda_1 = std::get<1>(resMulti_3);
+	 auto res_lambda_2 = std::get<2>(resMulti_3);
+	 expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(res_lambda_0, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_1, asNumber(asNumber(SZ - 1)));
+	EXPECT_NUMERIC_EQ(res_lambda_2, asNumber(0)); 
+
+
+	////////////////////
+	
+	auto resMulti_4 = reduceM(testVec, SUM, MAX, MIN, SUM2);
+	auto res_lambda_0_4 = std::get<0>(resMulti_4);
+	auto res_lambda_1_4 = std::get<1>(resMulti_4);
+	auto res_lambda_2_4 = std::get<2>(resMulti_4);
+	auto res_lambda_3_4 = std::get<3>(resMulti_4);
+	expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(res_lambda_3_4, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_0_4, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_1_4, asNumber(asNumber(SZ - 1)));
+	EXPECT_NUMERIC_EQ(res_lambda_2_4, asNumber(0));
+	
+}
+
+void testReduce_VecM_1(int SZ)
+{
+
+
+	std::vector<Numeric> input(SZ, asNumber(0.0));
+	std::iota(begin(input), end(input), asNumber(0.0));
+
+	std::random_device rd;
+	std::mt19937 g(rd());
+	std::shuffle(begin(input), end(input), g);
+	VecXX testVec(input);
+	VecXX negTestVec = -testVec + asNumber(SZ / 2);
+
+	auto SUM = [](auto x, auto y) { return x + y; };
+	auto SUM2 = [](auto x, auto y) { return x + y; };
+	auto MAX = [](auto x, auto y) { return iff((x > y), x, y); };
+	auto MIN = [](auto x, auto y) { return iff((x < y), x, y); };
+	auto SUM_SQR = [](auto x, auto y) { return x + y * y; };
+
+	double smSqr = 0.0;
+	for (auto x : input)
+	{
+		smSqr += SUM_SQR(smSqr, x);
+	}
+
+	auto resMulti_2 = reduceM1(testVec, SUM, MAX);
+	auto SUM_M = std::get<0>(resMulti_2);
+	auto MAX_M = std::get<1>(resMulti_2);
+	auto expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(SUM_M, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(MAX_M, asNumber(asNumber(SZ - 1)));
+
+
+	//auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN);
+	auto resMulti_3 = reduceM1(testVec, SUM, MAX, MIN);
+	auto res_lambda_0 = std::get<0>(resMulti_3);
+	auto res_lambda_1 = std::get<1>(resMulti_3);
+	auto res_lambda_2 = std::get<2>(resMulti_3);
+	expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(res_lambda_0, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_1, asNumber(asNumber(SZ - 1)));
+	EXPECT_NUMERIC_EQ(res_lambda_2, asNumber(0));
+
+
+	////////////////////
+
+	auto resMulti_4 = reduceM1(testVec, SUM, MAX, MIN, SUM2);
+	auto res_lambda_0_4 = std::get<0>(resMulti_4);
+	auto res_lambda_1_4 = std::get<1>(resMulti_4);
+	auto res_lambda_2_4 = std::get<2>(resMulti_4);
+	auto res_lambda_3_4 = std::get<3>(resMulti_4);
+	expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(res_lambda_3_4, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_0_4, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_1_4, asNumber(asNumber(SZ - 1)));
+	EXPECT_NUMERIC_EQ(res_lambda_2_4, asNumber(0));
+
+}
+
+
+void testReduce_VecM_22(int SZ)
+{
+
+
+	std::vector<Numeric> input(SZ, asNumber(0.0));
+	std::iota(begin(input), end(input), asNumber(0.0));
+
+	std::random_device rd;
+	std::mt19937 g(rd());
+	std::shuffle(begin(input), end(input), g);
+	VecXX testVec(input);
+	VecXX negTestVec = -testVec + asNumber(SZ / 2);
+
+	auto SUM = [](auto x, auto y) { return x + y; };
+	auto SUM2 = [](auto x, auto y) { return x + y; };
+	auto MAX = [](auto x, auto y) { return iff((x > y), x, y); };
+	auto MIN = [](auto x, auto y) { return iff((x < y), x, y); };
+	auto SUM_SQR = [](auto x, auto y) { return x + y * y; };
+
+	double smSqr = 0.0;
+	for (auto x : input)
+	{
+		smSqr += SUM_SQR(smSqr, x);
+	}
+
+	auto resMulti_2 = reduceM2(testVec, SUM, MAX);
+	auto SUM_M = std::get<0>(resMulti_2);
+	auto MAX_M = std::get<1>(resMulti_2);
+	auto expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(SUM_M, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(MAX_M, asNumber(asNumber(SZ - 1)));
+
+
+	//auto resMulti_3 = reduceM(testVec, SUM, MAX, MIN);
+	auto resMulti_3 = reduceM2(testVec, SUM, MAX, MIN);
+	auto res_lambda_0 = std::get<0>(resMulti_3);
+	auto res_lambda_1 = std::get<1>(resMulti_3);
+	auto res_lambda_2 = std::get<2>(resMulti_3);
+	expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(res_lambda_0, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_1, asNumber(asNumber(SZ - 1)));
+	EXPECT_NUMERIC_EQ(res_lambda_2, asNumber(0));
+
+
+	////////////////////
+
+	auto resMulti_4 = reduceM2(testVec, SUM, MAX, MIN, SUM2);
+	auto res_lambda_0_4 = std::get<0>(resMulti_4);
+	auto res_lambda_1_4 = std::get<1>(resMulti_4);
+	auto res_lambda_2_4 = std::get<2>(resMulti_4);
+	auto res_lambda_3_4 = std::get<3>(resMulti_4);
+	expectedSum = asNumber((SZ - (long)(1)) * (SZ / 2.0));
+	EXPECT_NUMERIC_EQ(res_lambda_3_4, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_0_4, asNumber(expectedSum));
+	EXPECT_NUMERIC_EQ(res_lambda_1_4, asNumber(asNumber(SZ - 1)));
+	EXPECT_NUMERIC_EQ(res_lambda_2_4, asNumber(0));
+
+}
+
+
+
+
+/*
+tests mult with x4 unroll
+*/
+TEST(TestDR3, testReduceM_VecXX)
+{
+
+	testReduce_VecM_2(2);
+	testReduce_VecM_2(3);
+	testReduce_VecM_2(4);
+
+
+	for (int SZ = 3; SZ < 33; SZ++)
+	{
+		testReduce_VecM_2(SZ);
+	}
+
+	testReduce_VecM_2(34);
+	testReduce_VecM_2(63);
+	testReduce_VecM_2(64);
+	testReduce_VecM_2(65);
+	
+
+	////
+
+	//TO DO TEST with unroll x2
+	//	and unrol x1
+	
+}
+///////////////////////////////////
+
+
+
+/*
+tests mult with x1 unroll
+*/
+TEST(TestDR3, testReduceM1_VecXX)
+{
+	testReduce_VecM_1(32);
+	testReduce_VecM_1(8);
+	testReduce_VecM_1(2);
+	testReduce_VecM_1(3);
+	testReduce_VecM_1(4);
+
+
+	for (int SZ = 3; SZ < 33; SZ++)
+	{
+		testReduce_VecM_1(SZ);
+	}
+	
+	testReduce_VecM_1(34);
+	testReduce_VecM_1(63);
+	testReduce_VecM_1(64);
+	testReduce_VecM_1(65);
+}
+
+
+TEST(TestDR3, testReduceM2_VecXX)
+{
+	testReduce_VecM_22(32);
+	testReduce_VecM_22(8);
+	testReduce_VecM_22(2);
+	testReduce_VecM_22(3);
+	testReduce_VecM_22(4);
+
+
+	for (int SZ = 3; SZ < 33; SZ++)
+	{
+		testReduce_VecM_22(SZ);
+	}
+
+	testReduce_VecM_22(34);
+	testReduce_VecM_22(63);
+	testReduce_VecM_22(64);
+	testReduce_VecM_22(65);
+}
+
+
+
+
+
 
 void testTransformReduceUnitary_Vec(int SZ)
 {
diff --git a/Vectorisation.sln b/Vectorisation.sln
index ceccbfb..9d9a3f4 100644
--- a/Vectorisation.sln
+++ b/Vectorisation.sln
@@ -37,6 +37,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "curveExample", "curveExampl
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "scratch", "scratch\scratch.vcxproj", "{0EE7A152-AE30-4285-A5B1-6252ABC7A418}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "VariadicReducrion", "VariadicReducrion\VariadicReducrion.vcxproj", "{271CF3D5-72FF-4657-9325-4206B8D5C84F}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		cl23-Debug|ARM64 = cl23-Debug|ARM64
@@ -643,6 +645,66 @@ Global
 		{0EE7A152-AE30-4285-A5B1-6252ABC7A418}.Release-23|x64.Build.0 = Release-23|x64
 		{0EE7A152-AE30-4285-A5B1-6252ABC7A418}.Release-23|x86.ActiveCfg = Release|Win32
 		{0EE7A152-AE30-4285-A5B1-6252ABC7A418}.Release-23|x86.Build.0 = Release|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.cl23-Debug|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.ActiveCfg = clang-cl23|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.Build.0 = clang-cl23|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug23|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.IC2022|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2022|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|ARM64.ActiveCfg = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|ARM64.Build.0 = Debug|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.ActiveCfg = ICC2023|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.Build.0 = ICC2023|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.ActiveCfg = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.Build.0 = Debug|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|ARM64.ActiveCfg = Release|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|ARM64.Build.0 = Release|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.ActiveCfg = Release|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.Build.0 = Release|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.ActiveCfg = Release|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.Build.0 = Release|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|ARM64.ActiveCfg = Release|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|ARM64.Build.0 = Release|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.ActiveCfg = Release-23|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.Build.0 = Release-23|x64
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.ActiveCfg = Release|Win32
+		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/Vectorisation/VecX/accumulate_transform.h b/Vectorisation/VecX/accumulate_transform.h
index 727c483..098f575 100644
--- a/Vectorisation/VecX/accumulate_transform.h
+++ b/Vectorisation/VecX/accumulate_transform.h
@@ -3110,9 +3110,154 @@ VecView<INS_VEC>  ApplySelectTransformUR_XC(const VecView<INS_VEC>& rhs1, OP& co
 }
 
 
+//////experimental unrolled  double transform accumulation
+template< typename INS_VEC, typename TF, typename TF2, typename OP, typename OP2>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+ApplyTransformAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, TF& opTrans, OP& oper, TF2& opTrans2, OP2& oper2)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue() };
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+	int step = 4 * width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+
+	INS_VEC RHS2;
+	INS_VEC RES1;
+
+	INS_VEC RHS3;
+	INS_VEC RES2;
+
+	INS_VEC RHS4;
+	INS_VEC RES3;
+
+	//
+	INS_VEC RHS1_A;
+	INS_VEC RES_A;
+
+	INS_VEC RHS2_A;
+	INS_VEC RES1_A;
+
+	INS_VEC RHS3_A;
+	INS_VEC RES2_A;
+
+	INS_VEC RHS4_A;
+	INS_VEC RES3_A;
+
+	//
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES1.load_a(pRhs1 + i + width);
+			RES2.load_a(pRhs1 + i + width * 2);
+			RES3.load_a(pRhs1 + i + width * 3);
+
+
+			RES_A = opTrans2(RES);
+			RES1_A = opTrans2(RES1);
+			RES2_A = opTrans2(RES2);
+			RES3_A = opTrans2(RES3);
+
+
+			RES = opTrans(RES);
+			RES1 = opTrans(RES1);
+			RES2 = opTrans(RES2);
+			RES3 = opTrans(RES3);
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, opTrans(RHS1));
+			RES_A = oper2(RES_A, opTrans2(RHS1));
+
+			RHS2.load_a(pRhs1 + i + width);
+			RES1 = oper(RES1, opTrans(RHS2));
+			RES1_A = oper2(RES1_A, opTrans2(RHS2));
+
+			RHS3.load_a(pRhs1 + i + width * 2);
+			RES2 = oper(RES2, opTrans(RHS3));
+			RES2_A = oper2(RES2_A, opTrans2(RHS3));
+
+			RHS4.load_a(pRhs1 + i + width * 3);
+			RES3 = oper(RES3, opTrans(RHS4));
+			RES3_A = oper2(RES3_A, opTrans2(RHS4));
+
+		}
+
+		// odd bits
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, opTrans(RHS1));
+			RES_A = oper2(RES_A, opTrans2(RHS1));
+		}
+
+		RES = oper(RES, RES1);
+		RES2 = oper(RES2, RES3);
+		RES = oper(RES, RES2);
+
+		RES_A = oper2(RES_A, RES1_A);
+		RES2_A = oper2(RES2_A, RES3_A);
+		RES_A = oper2(RES_A, RES2_A);
+
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = opTrans2(RES);
+		RES = opTrans(RES);
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, opTrans(RHS1));
+			RES_A = oper2(RES_A, opTrans2(RHS1));
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		//this is a bug need to transform
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+	}
+
+	return { result, result_A };
+}
 
 
 
+///   un-rolled  x4 
+///////////////////////////////////////////////
 //////experimental unrolled  double accumulation
 template< typename INS_VEC, typename OP, typename OP2>
 typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
@@ -3251,18 +3396,15 @@ typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename Ins
 
 
 
-
-//////////////
-
-//////experimental unrolled  double transform accumulation
-template< typename INS_VEC, typename TF, typename TF2, typename OP, typename OP2>
-typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
-ApplyTransformAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, TF& opTrans, OP& oper, TF2& opTrans2, OP2& oper2)
+//////experimental unrolled  triple accumulation/ reduction
+template< typename INS_VEC, typename OP, typename OP2, typename OP3>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+ApplyAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2, OP3& oper3)
 {
 	check_vector(rhs1);
 	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
 	{
-		return { rhs1.getScalarValue(),rhs1.getScalarValue() };
+		return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue() };
 	}
 
 	int sz = rhs1.size();
@@ -3297,6 +3439,11 @@ ApplyTransformAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, TF& opTrans, OP& oper,
 
 	//
 
+	INS_VEC RES_B;
+	INS_VEC RES1_B;
+	INS_VEC RES2_B;
+	INS_VEC RES3_B;
+
 	int i = 0;
 
 	if (sz >= step * 2)
@@ -3309,16 +3456,15 @@ ApplyTransformAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, TF& opTrans, OP& oper,
 			RES3.load_a(pRhs1 + i + width * 3);
 
 
-			RES_A = opTrans2(RES);
-			RES1_A = opTrans2(RES1);
-			RES2_A = opTrans2(RES2);
-			RES3_A = opTrans2(RES3);
-
+			RES_A = RES;
+			RES1_A = RES1;
+			RES2_A = RES2;
+			RES3_A = RES3;
 
-			RES = opTrans(RES);
-			RES1 = opTrans(RES1);
-			RES2 = opTrans(RES2);
-			RES3 = opTrans(RES3);
+			RES_B = RES;
+			RES1_B = RES1;
+			RES2_B = RES2;
+			RES3_B = RES3;
 		}
 
 		i += step;
@@ -3326,20 +3472,24 @@ ApplyTransformAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, TF& opTrans, OP& oper,
 		for (; i <= (rhsSZ - step); i += step)
 		{
 			RHS1.load_a(pRhs1 + i);
-			RES = oper(RES, opTrans(RHS1));
-			RES_A = oper2(RES_A, opTrans2(RHS1));
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
 
 			RHS2.load_a(pRhs1 + i + width);
-			RES1 = oper(RES1, opTrans(RHS2));
-			RES1_A = oper2(RES1_A, opTrans2(RHS2));
+			RES1 = oper(RES1, RHS2);
+			RES1_A = oper2(RES1_A, RHS2);
+			RES1_B = oper3(RES1_B, RHS2);
 
 			RHS3.load_a(pRhs1 + i + width * 2);
-			RES2 = oper(RES2, opTrans(RHS3));
-			RES2_A = oper2(RES2_A, opTrans2(RHS3));
+			RES2 = oper(RES2, RHS3);
+			RES2_A = oper2(RES2_A, RHS3);
+			RES2_B = oper3(RES2_B, RHS3);
 
 			RHS4.load_a(pRhs1 + i + width * 3);
-			RES3 = oper(RES3, opTrans(RHS4));
-			RES3_A = oper2(RES3_A, opTrans2(RHS4));
+			RES3 = oper(RES3, RHS4);
+			RES3_A = oper2(RES3_A, RHS4);
+			RES3_B = oper3(RES3_B, RHS4);
 
 		}
 
@@ -3347,8 +3497,9 @@ ApplyTransformAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, TF& opTrans, OP& oper,
 		for (; i <= rhsSZ - width; i += width)
 		{
 			RHS1.load_a(pRhs1 + i);
-			RES = oper(RES, opTrans(RHS1));
-			RES_A = oper2(RES_A, opTrans2(RHS1));
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
 		}
 
 		RES = oper(RES, RES1);
@@ -3359,43 +3510,900 @@ ApplyTransformAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, TF& opTrans, OP& oper,
 		RES2_A = oper2(RES2_A, RES3_A);
 		RES_A = oper2(RES_A, RES2_A);
 
+
+		RES_B = oper3(RES_B, RES1_B);
+		RES2_B = oper3(RES2_B, RES3_B);
+		RES_B = oper3(RES_B, RES2_B);
+
 	}
 	else
 	{
 		RES.load_a(pRhs1);
-		RES_A = opTrans2(RES);
-		RES= opTrans(RES);
+		RES_A = RES;
+		RES_B = RES;
 
 		i += width;
 		// odd bits
 		for (; i <= sz - width; i += width)
 		{
 			RHS1.load_a(pRhs1 + i);
-			RES = oper(RES, opTrans(RHS1));
-			RES_A = oper2(RES_A, opTrans2(RHS1));
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
 		}
 
 	}
 
 	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
 	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_B = RES_B[0];
 	int min_wdth = std::min(sz, width);
 	//across vectors lanes  // not assuming horizontal version exist
 	for (int j = 1; j < min_wdth; ++j)
 	{
 		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
 		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(result_B, RES_B[j], oper3);
 	}
 
 	//end bits for vecs not filling padding
 	for (; i < rhs1.size(); ++i)
 	{
-		//this is a bug need to transform
 		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
 		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(pRhs1[i], result_B, oper3);
 	}
 
-	return { result, result_A };
+	return { result, result_A, result_B };
 }
 
 
+
+//////experimental unrolled quad  accumulation/ reduction
+template< typename INS_VEC, typename OP, typename OP2, typename OP3, typename OP4>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+ApplyAccumulate2UR_X2(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2, OP3& oper3, OP4& oper4)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue(),  rhs1.getScalarValue(),rhs1.getScalarValue() };
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+	int step = 4 * width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+
+	INS_VEC RHS2;
+	INS_VEC RES1;
+
+	INS_VEC RHS3;
+	INS_VEC RES2;
+
+	INS_VEC RHS4;
+	INS_VEC RES3;
+
+	//
+	INS_VEC RHS1_A;
+	INS_VEC RES_A;
+
+	INS_VEC RHS2_A;
+	INS_VEC RES1_A;
+
+	INS_VEC RHS3_A;
+	INS_VEC RES2_A;
+
+	INS_VEC RHS4_A;
+	INS_VEC RES3_A;
+
+	//
+
+	INS_VEC RES_B;
+	INS_VEC RES1_B;
+	INS_VEC RES2_B;
+	INS_VEC RES3_B;
+
+	INS_VEC RES_C;
+	INS_VEC RES1_C;
+	INS_VEC RES2_C;
+	INS_VEC RES3_C;
+
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES1.load_a(pRhs1 + i + width);
+			RES2.load_a(pRhs1 + i + width * 2);
+			RES3.load_a(pRhs1 + i + width * 3);
+
+
+			RES_A = RES;
+			RES1_A = RES1;
+			RES2_A = RES2;
+			RES3_A = RES3;
+
+			RES_B = RES;
+			RES1_B = RES1;
+			RES2_B = RES2;
+			RES3_B = RES3;
+
+			RES_C = RES;
+			RES1_C = RES1;
+			RES2_C = RES2;
+			RES3_C = RES3;
+
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+
+			RHS2.load_a(pRhs1 + i + width);
+			RES1 = oper(RES1, RHS2);
+			RES1_A = oper2(RES1_A, RHS2);
+			RES1_B = oper3(RES1_B, RHS2);
+			RES1_C = oper4(RES1_C, RHS2);
+
+			RHS3.load_a(pRhs1 + i + width * 2);
+			RES2 = oper(RES2, RHS3);
+			RES2_A = oper2(RES2_A, RHS3);
+			RES2_B = oper3(RES2_B, RHS3);
+			RES2_C = oper4(RES2_C, RHS3);
+
+			RHS4.load_a(pRhs1 + i + width * 3);
+			RES3 = oper(RES3, RHS4);
+			RES3_A = oper2(RES3_A, RHS4);
+			RES3_B = oper3(RES3_B, RHS4);
+			RES3_C = oper4(RES3_C, RHS4);
+
+		}
+
+		// odd bits
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+		}
+
+		RES = oper(RES, RES1);
+		RES2 = oper(RES2, RES3);
+		RES = oper(RES, RES2);
+
+		RES_A = oper2(RES_A, RES1_A);
+		RES2_A = oper2(RES2_A, RES3_A);
+		RES_A = oper2(RES_A, RES2_A);
+
+
+		RES_B = oper3(RES_B, RES1_B);
+		RES2_B = oper3(RES2_B, RES3_B);
+		RES_B = oper3(RES_B, RES2_B);
+
+		RES_C = oper4(RES_C, RES1_C);
+		RES2_C = oper4(RES2_C, RES3_C);
+		RES_C = oper4(RES_C, RES2_C);
+
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = RES;
+		RES_B = RES;
+		RES_C = RES;
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_B = RES_B[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_C = RES_C[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(result_B, RES_B[j], oper3);
+		result_C = ApplyBinaryOperationVec<INS_VEC, OP4>(result_C, RES_C[j], oper4);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(pRhs1[i], result_B, oper3);
+		result_C = ApplyBinaryOperationVec<INS_VEC, OP4>(pRhs1[i], result_C, oper4);
+	}
+
+	return { result, result_A, result_B,  result_C};
+}
+
+
+///   un-rolled  x2 
+///////////////////////////////////////////////
+//////experimental unrolled  double accumulation
+template< typename INS_VEC, typename OP, typename OP2>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType >
+ApplyAccumulate2UR_X2_2(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue() };
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+	
+	//int step = 4 * width;
+	int step = 2 * width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+
+	INS_VEC RHS2;
+	INS_VEC RES1;
+
+	INS_VEC RHS1_A;
+	INS_VEC RES_A;
+
+	INS_VEC RHS2_A;
+	INS_VEC RES1_A;
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES1.load_a(pRhs1 + i + width);
+
+			RES_A = RES;
+			RES1_A = RES1;
+
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+
+			RHS2.load_a(pRhs1 + i + width);
+			RES1 = oper(RES1, RHS2);
+			RES1_A = oper2(RES1_A, RHS2);
+
+		}
+
+		// odd bits
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+		}
+
+		RES = oper(RES, RES1);
+		RES_A = oper2(RES_A, RES1_A);
+
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = RES;
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+	}
+
+	return { result, result_A };
+}
+
+
+
+//////experimental unrolled  triple accumulation/ reduction
+template< typename INS_VEC, typename OP, typename OP2, typename OP3>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+ApplyAccumulate2UR_X2_2(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2, OP3& oper3)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue() };
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+	//int step = 4 * width;
+	int step = 2 * width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+
+	INS_VEC RHS2;
+	INS_VEC RES1;
+
+	INS_VEC RHS1_A;
+	INS_VEC RES_A;
+
+	INS_VEC RHS2_A;
+	INS_VEC RES1_A;
+
+
+	INS_VEC RES_B;
+	INS_VEC RES1_B;
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES1.load_a(pRhs1 + i + width);
+
+			RES_A = RES;
+			RES1_A = RES1;
+
+			RES_B = RES;
+			RES1_B = RES1;
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+
+			RHS2.load_a(pRhs1 + i + width);
+			RES1 = oper(RES1, RHS2);
+			RES1_A = oper2(RES1_A, RHS2);
+			RES1_B = oper3(RES1_B, RHS2);
+
+		}
+
+		// odd bits
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+		}
+
+		RES = oper(RES, RES1);
+		RES_A = oper2(RES_A, RES1_A);
+		RES_B = oper3(RES_B, RES1_B);
+
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = RES;
+		RES_B = RES;
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_B = RES_B[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(result_B, RES_B[j], oper3);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(pRhs1[i], result_B, oper3);
+	}
+
+	return { result, result_A, result_B };
+}
+
+//
+
+//////experimental unrolled quad  accumulation/ reduction
+template< typename INS_VEC, typename OP, typename OP2, typename OP3, typename OP4>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+ApplyAccumulate2UR_X2_2(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2, OP3& oper3, OP4& oper4)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue(), rhs1.getScalarValue(),rhs1.getScalarValue() };
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+	
+	int step = 2 * width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+
+	INS_VEC RHS2;
+	INS_VEC RES1;
+
+	INS_VEC RHS1_A;
+	INS_VEC RES_A;
+
+	INS_VEC RHS2_A;
+	INS_VEC RES1_A;
+
+	INS_VEC RES_B;
+	INS_VEC RES1_B;
+
+	INS_VEC RES_C;
+	INS_VEC RES1_C;
+
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES1.load_a(pRhs1 + i + width);
+
+			RES_A = RES;
+			RES1_A = RES1;
+
+			RES_B = RES;
+			RES1_B = RES1;
+
+			RES_C = RES;
+			RES1_C = RES1;
+
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+
+			RHS2.load_a(pRhs1 + i + width);
+			RES1 = oper(RES1, RHS2);
+			RES1_A = oper2(RES1_A, RHS2);
+			RES1_B = oper3(RES1_B, RHS2);
+			RES1_C = oper4(RES1_C, RHS2);
+
+		}
+
+		// odd bits
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+		}
+
+		RES = oper(RES, RES1);
+		RES_A = oper2(RES_A, RES1_A);
+		RES_B = oper3(RES_B, RES1_B);
+		RES_C = oper4(RES_C, RES1_C);
+
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = RES;
+		RES_B = RES;
+		RES_C = RES;
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_B = RES_B[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_C = RES_C[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(result_B, RES_B[j], oper3);
+		result_C = ApplyBinaryOperationVec<INS_VEC, OP4>(result_C, RES_C[j], oper4);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(pRhs1[i], result_B, oper3);
+		result_C = ApplyBinaryOperationVec<INS_VEC, OP4>(pRhs1[i], result_C, oper4);
+	}
+
+	return { result, result_A, result_B,  result_C };
+}
+
+
+////////////////////////////   unrolled x1
+
+///////////////////////////////////////////////
+//////experimental unrolled  double accumulation
+template< typename INS_VEC, typename OP, typename OP2>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType >
+ApplyAccumulate2UR_X2_1(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue() };
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+
+	int step =  width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+	INS_VEC RES_A;
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES_A = RES;
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+		}
+		
+		// odd bits not needed
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+		}
+
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = RES;
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+	}
+
+	return { result, result_A };
+}
+
+
+
+//////experimental unrolled  triple accumulation/ reduction
+template< typename INS_VEC, typename OP, typename OP2, typename OP3>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+ApplyAccumulate2UR_X2_1(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2, OP3& oper3)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue()};
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+	
+	int step =  width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+	INS_VEC RES_A;
+	INS_VEC RES_B;
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES_A = RES;
+			RES_B = RES;
+		
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+
+		}
+
+		// odd bits
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+		}
+
+
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = RES;
+		RES_B = RES;
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_B = RES_B[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(result_B, RES_B[j], oper3);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(pRhs1[i], result_B, oper3);
+	}
+
+	return { result, result_A, result_B };
+}
+
+
+//////experimental unrolled  quad accumulation/ reduction
+template< typename INS_VEC, typename OP, typename OP2, typename OP3, typename OP4>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType,  typename InstructionTraits<INS_VEC>::FloatType>
+ApplyAccumulate2UR_X2_1(const Vec<INS_VEC>& rhs1, OP& oper, OP2& oper2, OP3& oper3, OP4& oper4)
+{
+	check_vector(rhs1);
+	if (isScalar(rhs1)) // nothing to accumulate with so just return  value
+	{
+		return { rhs1.getScalarValue(),rhs1.getScalarValue() ,rhs1.getScalarValue() ,rhs1.getScalarValue() };
+	}
+
+	int sz = rhs1.size();
+	auto pRhs1 = rhs1.start();
+	const int width = InstructionTraits<INS_VEC>::width;
+
+	int step = width;
+
+	INS_VEC RHS1;
+	INS_VEC RES;
+	INS_VEC RES_A;
+	INS_VEC RES_B;
+	INS_VEC RES_C;
+
+	int i = 0;
+
+	if (sz >= step * 2)
+	{
+		//initialise first set of registers
+		{
+			RES.load_a(pRhs1 + i);
+			RES_A = RES;
+			RES_B = RES;
+			RES_C = RES;
+
+		}
+
+		i += step;
+		int rhsSZ = rhs1.size();
+		for (; i <= (rhsSZ - step); i += step)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+
+		}
+
+		// odd bits
+		for (; i <= rhsSZ - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper4(RES_C, RHS1);
+		}
+	}
+	else
+	{
+		RES.load_a(pRhs1);
+		RES_A = RES;
+		RES_B = RES;
+		RES_C = RES;
+
+		i += width;
+		// odd bits
+		for (; i <= sz - width; i += width)
+		{
+			RHS1.load_a(pRhs1 + i);
+			RES = oper(RES, RHS1);
+			RES_A = oper2(RES_A, RHS1);
+			RES_B = oper3(RES_B, RHS1);
+			RES_C = oper3(RES_C, RHS1);
+		}
+
+	}
+
+	typename InstructionTraits<INS_VEC>::FloatType result = RES[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_A = RES_A[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_B = RES_B[0];
+	typename InstructionTraits<INS_VEC>::FloatType result_C = RES_C[0];
+	int min_wdth = std::min(sz, width);
+	//across vectors lanes  // not assuming horizontal version exist
+	for (int j = 1; j < min_wdth; ++j)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(result, RES[j], oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(result_A, RES_A[j], oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(result_B, RES_B[j], oper3);
+		result_C = ApplyBinaryOperationVec<INS_VEC, OP4>(result_C, RES_C[j], oper4);
+	}
+
+	//end bits for vecs not filling padding
+	for (; i < rhs1.size(); ++i)
+	{
+		result = ApplyBinaryOperationVec<INS_VEC, OP>(pRhs1[i], result, oper);
+		result_A = ApplyBinaryOperationVec<INS_VEC, OP2>(pRhs1[i], result_A, oper2);
+		result_B = ApplyBinaryOperationVec<INS_VEC, OP3>(pRhs1[i], result_B, oper3);
+		result_C = ApplyBinaryOperationVec<INS_VEC, OP4>(pRhs1[i], result_C, oper4);
+	}
+
+	return { result, result_A, result_B, result_C };
+}
diff --git a/Vectorisation/VecX/dr3.h b/Vectorisation/VecX/dr3.h
index b9c5a6a..80becc2 100644
--- a/Vectorisation/VecX/dr3.h
+++ b/Vectorisation/VecX/dr3.h
@@ -722,8 +722,92 @@ reduceM(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1)
 }
 
 
+//////experimental unrolled  triple accumulation
+template< typename INS_VEC, typename OP, typename OP1, typename OP2>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1, OP2& oper2)
+{
+	return ApplyAccumulate2UR_X2(rhs, oper, oper1, oper2);
+
+}
+
+
+//////experimental unrolled  quad accumulation
+template< typename INS_VEC, typename OP, typename OP1, typename OP2, typename OP3>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1, OP2& oper2, OP3& oper3)
+{
+	return ApplyAccumulate2UR_X2(rhs, oper, oper1, oper2, oper3);
+
+}
+
+////////////////////////////// better dispatch to do 
+//////experimental unrolled  double accumulation
+template< typename INS_VEC, typename OP, typename OP1>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM1(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1)
+{
+	return ApplyAccumulate2UR_X2_1(rhs, oper, oper1);
+
+}
+
+
+//////experimental unrolled  triple accumulation
+template< typename INS_VEC, typename OP, typename OP1, typename OP2>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM1(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1, OP2& oper2)
+{
+	return ApplyAccumulate2UR_X2_1(rhs, oper, oper1, oper2);
+
+}
+
+
+//////experimental unrolled  quad accumulation
+template< typename INS_VEC, typename OP, typename OP1, typename OP2, typename OP3>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM1(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1, OP2& oper2, OP3& oper3)
+{
+	return ApplyAccumulate2UR_X2_1(rhs, oper, oper1, oper2, oper3);
+
+}
+
+//////////////////////////////////////////
+
+////////////////////////////// better dispatch to do 
+//////experimental unrolled  double accumulation
+template< typename INS_VEC, typename OP, typename OP1>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM2(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1)
+{
+	return ApplyAccumulate2UR_X2_2(rhs, oper, oper1);
+
+}
+
+
+//////experimental unrolled  triple accumulation
+template< typename INS_VEC, typename OP, typename OP1, typename OP2>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM2(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1, OP2& oper2)
+{
+	return ApplyAccumulate2UR_X2_2(rhs, oper, oper1, oper2);
+
+}
+
+
+//////experimental unrolled  quad accumulation
+template< typename INS_VEC, typename OP, typename OP1, typename OP2, typename OP3>
+typename std::tuple<typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType, typename InstructionTraits<INS_VEC>::FloatType>
+reduceM2(const Vec<INS_VEC>& rhs, OP& oper, OP1& oper1, OP2& oper2, OP3& oper3)
+{
+	return ApplyAccumulate2UR_X2_2(rhs, oper, oper1, oper2, oper3);
+
+}
+
+
+
 
 
+//////////////////////////////////
 //unitary transform
 template< typename INS_VEC, typename OPT, typename OP>
 typename InstructionTraits<INS_VEC>::FloatType transformReduce1(const Vec<INS_VEC>& rhs1, OPT& operTransform, OP& operAcc, typename InstructionTraits<INS_VEC>::FloatType initVal = InstructionTraits<INS_VEC>::nullValue, bool singularInit = true)
diff --git a/Vectorisation/Vectorisation.vcxproj b/Vectorisation/Vectorisation.vcxproj
index 6a808c1..30ab090 100644
--- a/Vectorisation/Vectorisation.vcxproj
+++ b/Vectorisation/Vectorisation.vcxproj
@@ -287,7 +287,7 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>ClangCL</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
     <EnableASAN>false</EnableASAN>