From d55fa27e2f733854f61636d139c6fa91b2ee92d0 Mon Sep 17 00:00:00 2001
From: rapid821 <mail150711@gmail.com>
Date: Fri, 19 Oct 2018 15:14:25 +0800
Subject: [PATCH] Upgrade to 2.5.1-hide-2.0.0

---
 .gitignore                                    |   13 +
 .travis.yml                                   |   19 +-
 CMakeLists.txt                                |  114 +-
 Dockerfile                                    |    4 +-
 README.md                                     |  111 +-
 THIRD-PARTY-LICENSES                          |   14 +-
 doc/FAQ.md                                    |   72 +-
 doc/Linux_deployment.md                       |   26 -
 doc/compile.md                                |   27 +-
 doc/compile_FreeBSD.md                        |    2 +-
 doc/compile_Linux.md                          |   55 +-
 doc/compile_Windows.md                        |   97 +-
 doc/{compile_MacOS.md => compile_macOS.md}    |   10 +-
 doc/tuning.md                                 |   66 +-
 doc/usage.md                                  |   37 +-
 .../build_xmr-stak_docker.sh                  |    4 +-
 xmrstak/backend/amd/amd_gpu/gpu.cpp           |  516 ++++++--
 xmrstak/backend/amd/amd_gpu/gpu.hpp           |   13 +-
 .../backend/amd/amd_gpu/opencl/cryptonight.cl |  612 +++++++---
 .../amd/amd_gpu/opencl/fast_int_math_v2.cl    |  127 ++
 .../backend/amd/amd_gpu/opencl/groestl256.cl  |  254 ++--
 xmrstak/backend/amd/amd_gpu/opencl/jh.cl      |   88 +-
 .../backend/amd/amd_gpu/opencl/wolf-aes.cl    |   13 +
 .../backend/amd/amd_gpu/opencl/wolf-skein.cl  |   94 +-
 xmrstak/backend/amd/autoAdjust.hpp            |  101 +-
 xmrstak/backend/amd/config.tpl                |   31 +-
 xmrstak/backend/amd/jconf.cpp                 |   63 +-
 xmrstak/backend/amd/jconf.hpp                 |    5 +-
 xmrstak/backend/amd/minethd.cpp               |   89 +-
 xmrstak/backend/amd/minethd.hpp               |    5 +-
 xmrstak/backend/backendConnector.cpp          |   54 +-
 xmrstak/backend/backendConnector.hpp          |    2 +-
 xmrstak/backend/cpu/autoAdjust.hpp            |   56 +-
 xmrstak/backend/cpu/autoAdjustHwloc.hpp       |   32 +-
 xmrstak/backend/cpu/config.tpl                |   33 +-
 xmrstak/backend/cpu/cpuType.cpp               |   79 ++
 xmrstak/backend/cpu/cpuType.hpp               |   32 +
 ..._v8_double_main_loop_sandybridge_linux.inc |  410 +++++++
 ..._v8_double_main_loop_sandybridge_win64.inc |  410 +++++++
 .../cpu/crypto/asm/cryptonight_v8_main_loop.S |   37 +
 .../crypto/asm/cryptonight_v8_main_loop.asm   |   25 +
 ...yptonight_v8_main_loop_ivybridge_linux.inc |  187 +++
 ...yptonight_v8_main_loop_ivybridge_win64.inc |  187 +++
 .../cryptonight_v8_main_loop_ryzen_linux.inc  |  179 +++
 .../cryptonight_v8_main_loop_ryzen_win64.inc  |  179 +++
 xmrstak/backend/cpu/crypto/c_blake256.c       |   16 +-
 xmrstak/backend/cpu/crypto/c_blake256.h       |   16 +-
 xmrstak/backend/cpu/crypto/c_groestl.c        |   14 +-
 xmrstak/backend/cpu/crypto/c_groestl.h        |   12 +-
 xmrstak/backend/cpu/crypto/c_jh.c             |    2 +-
 xmrstak/backend/cpu/crypto/c_keccak.c         |   12 +-
 xmrstak/backend/cpu/crypto/c_skein.c          |   90 +-
 xmrstak/backend/cpu/crypto/c_skein.h          |   10 +-
 xmrstak/backend/cpu/crypto/cryptonight.h      |    2 -
 .../backend/cpu/crypto/cryptonight_aesni.h    | 1066 ++++++++++-------
 .../backend/cpu/crypto/cryptonight_common.cpp |   70 +-
 xmrstak/backend/cpu/crypto/hash.h             |    4 +-
 xmrstak/backend/cpu/crypto/skein_port.h       |    2 +-
 xmrstak/backend/cpu/crypto/soft_aes.hpp       |    6 +-
 xmrstak/backend/cpu/hwlocMemory.cpp           |    2 +-
 xmrstak/backend/cpu/jconf.cpp                 |   21 +-
 xmrstak/backend/cpu/jconf.hpp                 |    3 +-
 xmrstak/backend/cpu/minethd.cpp               |  619 ++++++----
 xmrstak/backend/cpu/minethd.hpp               |   22 +-
 xmrstak/backend/cryptonight.hpp               |  215 +++-
 xmrstak/backend/globalStates.cpp              |   31 +-
 xmrstak/backend/globalStates.hpp              |   24 +-
 xmrstak/backend/iBackend.hpp                  |    4 +-
 xmrstak/backend/miner_work.hpp                |   12 +-
 xmrstak/backend/nvidia/autoAdjust.hpp         |    9 +-
 xmrstak/backend/nvidia/config.tpl             |   14 +-
 xmrstak/backend/nvidia/jconf.cpp              |   31 +-
 xmrstak/backend/nvidia/jconf.hpp              |    3 +-
 xmrstak/backend/nvidia/minethd.cpp            |  121 +-
 xmrstak/backend/nvidia/minethd.hpp            |   15 +-
 .../backend/nvidia/nvcc_code/cryptonight.hpp  |   13 +-
 .../backend/nvidia/nvcc_code/cuda_blake.hpp   |   20 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu |  683 +++++++++--
 .../backend/nvidia/nvcc_code/cuda_extra.cu    |  292 ++++-
 .../backend/nvidia/nvcc_code/cuda_extra.hpp   |    4 +-
 .../nvcc_code/cuda_fast_int_math_v2.hpp       |   61 +
 .../backend/nvidia/nvcc_code/cuda_groestl.hpp |   20 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp  |   12 +-
 .../backend/nvidia/nvcc_code/cuda_skein.hpp   |   20 +-
 xmrstak/backend/plugin.hpp                    |   45 +-
 xmrstak/backend/pool_data.hpp                 |   21 +
 xmrstak/cli/cli-miner.cpp                     |  438 +++++--
 xmrstak/cli/xmr-stak.manifest                 |   34 +
 xmrstak/config.tpl                            |  113 +-
 xmrstak/cpputil/LICENSE.txt                   |   21 +
 xmrstak/cpputil/read_write_lock.h             |   96 ++
 xmrstak/donate-level.hpp                      |   15 +-
 xmrstak/http/httpd.cpp                        |    6 +-
 xmrstak/http/webdesign.cpp                    |   14 +-
 xmrstak/http/webdesign.hpp                    |    2 +-
 xmrstak/jconf.cpp                             |  262 ++--
 xmrstak/jconf.hpp                             |   17 +-
 xmrstak/misc/coinDescription.hpp              |   60 +
 xmrstak/misc/configEditor.hpp                 |   23 +-
 xmrstak/misc/console.cpp                      |   31 +-
 xmrstak/misc/console.hpp                      |    4 +-
 xmrstak/misc/environment.hpp                  |    2 +-
 xmrstak/misc/executor.cpp                     |  119 +-
 xmrstak/misc/executor.hpp                     |    7 +-
 xmrstak/misc/telemetry.cpp                    |   32 +-
 xmrstak/misc/telemetry.hpp                    |    6 +-
 xmrstak/misc/uac.cpp                          |   81 ++
 xmrstak/misc/uac.hpp                          |   49 +-
 xmrstak/misc/utility.cpp                      |    2 +-
 xmrstak/misc/utility.hpp                      |    2 +-
 xmrstak/net/jpsock.cpp                        |  200 +++-
 xmrstak/net/jpsock.hpp                        |   23 +-
 xmrstak/net/msgstruct.hpp                     |   21 +-
 xmrstak/net/socket.cpp                        |   31 +-
 xmrstak/net/socket.hpp                        |    4 +
 xmrstak/net/socks.hpp                         |    3 +-
 xmrstak/params.hpp                            |   26 +-
 xmrstak/picosha2/picosha2.hpp                 |  375 ++++++
 xmrstak/pools.tpl                             |   53 +
 xmrstak/rapidjson/allocators.h                |   20 +-
 xmrstak/rapidjson/document.h                  |   84 +-
 xmrstak/rapidjson/encodedstream.h             |   24 +-
 xmrstak/rapidjson/encodings.h                 |   24 +-
 xmrstak/rapidjson/error/en.h                  |   14 +-
 xmrstak/rapidjson/error/error.h               |    8 +-
 xmrstak/rapidjson/filereadstream.h            |   12 +-
 xmrstak/rapidjson/filewritestream.h           |   12 +-
 xmrstak/rapidjson/fwd.h                       |   12 +-
 xmrstak/rapidjson/internal/biginteger.h       |   18 +-
 xmrstak/rapidjson/internal/diyfp.h            |   14 +-
 xmrstak/rapidjson/internal/dtoa.h             |    8 +-
 xmrstak/rapidjson/internal/ieee754.h          |    8 +-
 xmrstak/rapidjson/internal/itoa.h             |   74 +-
 xmrstak/rapidjson/internal/meta.h             |    8 +-
 xmrstak/rapidjson/internal/pow10.h            |   12 +-
 xmrstak/rapidjson/internal/regex.h            |   28 +-
 xmrstak/rapidjson/internal/stack.h            |   12 +-
 xmrstak/rapidjson/internal/strfunc.h          |   10 +-
 xmrstak/rapidjson/internal/strtod.h           |   14 +-
 xmrstak/rapidjson/istreamwrapper.h            |   14 +-
 xmrstak/rapidjson/memorybuffer.h              |   10 +-
 xmrstak/rapidjson/memorystream.h              |    8 +-
 xmrstak/rapidjson/msinttypes/inttypes.h       |   22 +-
 xmrstak/rapidjson/msinttypes/stdint.h         |   22 +-
 xmrstak/rapidjson/ostreamwrapper.h            |   10 +-
 xmrstak/rapidjson/pointer.h                   |   42 +-
 xmrstak/rapidjson/prettywriter.h              |   14 +-
 xmrstak/rapidjson/rapidjson.h                 |   18 +-
 xmrstak/rapidjson/reader.h                    |    2 +-
 xmrstak/rapidjson/schema.h                    |   82 +-
 xmrstak/rapidjson/stream.h                    |    8 +-
 xmrstak/rapidjson/stringbuffer.h              |    8 +-
 xmrstak/rapidjson/writer.h                    |   24 +-
 xmrstak/version.cpp                           |   12 +-
 154 files changed, 8331 insertions(+), 2796 deletions(-)
 delete mode 100644 doc/Linux_deployment.md
 rename doc/{compile_MacOS.md => compile_macOS.md} (69%)
 create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
 create mode 100644 xmrstak/backend/cpu/cpuType.cpp
 create mode 100644 xmrstak/backend/cpu/cpuType.hpp
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
 create mode 100644 xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
 create mode 100644 xmrstak/backend/pool_data.hpp
 create mode 100644 xmrstak/cli/xmr-stak.manifest
 create mode 100644 xmrstak/cpputil/LICENSE.txt
 create mode 100644 xmrstak/cpputil/read_write_lock.h
 create mode 100644 xmrstak/misc/coinDescription.hpp
 create mode 100644 xmrstak/misc/uac.cpp
 create mode 100644 xmrstak/picosha2/picosha2.hpp
 create mode 100644 xmrstak/pools.tpl

diff --git a/.gitignore b/.gitignore
index 5b0be9676..26d278f0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,16 @@ config-debug.txt
 # KDevelop files
 .kdev4/
 xmr-stak.kdev4
+
+# Idea/Clion project files
+cmake-build-release/
+cmake-build-debug/
+\.idea/
+
+# MacOS files
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Thumbnails
+._*
diff --git a/.travis.yml b/.travis.yml
index 0d2d51a0b..f263e86ad 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -55,6 +55,22 @@ matrix:
         - CMAKE_C_COMPILER=gcc-6
         - XMRSTAK_CMAKE_FLAGS="-DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF"
 
+    # test with disabled HWLOC, MICROHTTPD, OpenSSL and no accelerators
+    - os: linux
+      compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - *default_packages
+            - gcc-6
+            - g++-6
+      env:
+        - CMAKE_CXX_COMPILER=g++-6
+        - CMAKE_C_COMPILER=gcc-6
+        - XMRSTAK_CMAKE_FLAGS="-DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF -DHWLOC_ENABLE=OFF -DOpenSSL_ENABLE=OFF -DMICROHTTPD_ENABLE=OFF"
+
     - os: linux
       compiler: gcc
       addons:
@@ -77,9 +93,8 @@ matrix:
 
 before_install:
   - . CI/checkPRBranch
-  - if [ $TRAVIS_OS_NAME = osx ]; then
+  - if [ $TRAVIS_OS_NAME = osx ] ; then
       brew update;
-      brew tap homebrew/science;
     fi
   - export PATH=$CUDA_ROOT/bin:$PATH
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10f33bda4..b714ee0ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 project(xmr-stak)
 
-cmake_minimum_required(VERSION 3.1.0)
+cmake_minimum_required(VERSION 3.4.0)
 
 # enforce C++11
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -36,10 +36,6 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()
 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "${BUILD_TYPE}")
 
-set(XMR-STAK_CURRENCY "all" CACHE STRING "select miner currency")
-set_property(CACHE XMR-STAK_CURRENCY PROPERTY STRINGS "all;monero;aeon")
-
-
 set(XMR-STAK_COMPILE "native" CACHE STRING "select CPU compute architecture")
 set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic")
 if(XMR-STAK_COMPILE STREQUAL "native")
@@ -53,16 +49,6 @@ else()
     message(FATAL_ERROR "XMR-STAK_COMPILE is set to an unknown value '${XMR-STAK_COMPILE}'")
 endif()
 
-if(XMR-STAK_CURRENCY STREQUAL "all")
-    message(STATUS "Set miner currency to 'monero' and 'aeon'")
-elseif(XMR-STAK_CURRENCY STREQUAL "aeon")
-    message(STATUS "Set miner currency to 'aeon'")
-    add_definitions("-DCONF_NO_MONERO=1")
-elseif(XMR-STAK_CURRENCY STREQUAL "monero")
-    message(STATUS "Set miner currency to 'monero'")
-    add_definitions("-DCONF_NO_AEON=1")
-endif()
-
 # option to add static libgcc and libstdc++
 option(CMAKE_LINK_STATIC "link as much as possible libraries static" OFF)
 
@@ -114,6 +100,11 @@ if(CUDA_ENABLE)
                 list(APPEND DEFAULT_CUDA_ARCH "70")
             endif()
         endif()
+        # add Turing support for CUDA >= 10.0
+        if(NOT CUDA_VERSION VERSION_LESS 10.0)
+            list(APPEND DEFAULT_CUDA_ARCH "75")
+        endif()
+
         set(CUDA_ARCH "${DEFAULT_CUDA_ARCH}" CACHE STRING "Set GPU architecture (semicolon separated list, e.g. '-DCUDA_ARCH=20;35;60')")
 
         # generate comma separated list with architectures
@@ -155,13 +146,17 @@ if(CUDA_ENABLE)
                 # set flags to create device code for the given architectures
                 set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} --cuda-gpu-arch=sm_${CUDA_ARCH_ELEM}")
             endforeach()
-
         elseif(CUDA_COMPILER STREQUAL "nvcc")
             # add c++11 for cuda
             if(NOT CMAKE_CXX_FLAGS MATCHES "-std=c\\+\\+11")
                 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
             endif()
 
+            # avoid that nvcc in CUDA 8 complains about sm_20 pending removal
+            if(CUDA_VERSION VERSION_EQUAL 8.0)
+                set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets")
+            endif()
+
             # avoid that nvcc in CUDA < 8 tries to use libc `memcpy` within the kernel
             if(CUDA_VERSION VERSION_LESS 8.0)
                 add_definitions(-D_FORCE_INLINES)
@@ -189,14 +184,17 @@ if(CUDA_ENABLE)
             if(CUDA_KEEP_FILES)
                 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" --keep --keep-dir "${PROJECT_BINARY_DIR}")
             endif(CUDA_KEEP_FILES)
-        
+
             if(CUDA_VERSION VERSION_LESS 8.0)
                 # for CUDA 7.5 fix compile error: https://github.com/fireice-uk/xmr-stak/issues/34
                 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" "-D_MWAITXINTRIN_H_INCLUDED")
             endif()
 
             if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND
-                (CUDA_VERSION VERSION_EQUAL 9.0 OR CUDA_VERSION VERSION_EQUAL 9.1)
+                (CUDA_VERSION VERSION_EQUAL 9.0 OR
+                CUDA_VERSION VERSION_EQUAL 9.1 OR
+                CUDA_VERSION VERSION_EQUAL 9.2 OR
+                CUDA_VERSION VERSION_EQUAL 10.0)
             )
                 # workaround find_package(CUDA) is using the wrong path to the CXX host compiler
                 # overwrite the CUDA host compiler variable with the used CXX MSVC
@@ -241,7 +239,7 @@ if(OpenCL_ENABLE)
             "AMD APP/include")
 
     find_library(OpenCL_LIBRARY
-        NAMES 
+        NAMES
             OpenCL
             OpenCL.lib
         NO_DEFAULT_PATH
@@ -293,7 +291,7 @@ endif()
 # Find microhttpd
 ################################################################################
 
-option(MICROHTTPD_ENABLE "Enable or disable the requirement of microhttp (http deamon)" ON)
+option(MICROHTTPD_ENABLE "Enable or disable the requirement of microhttp (http daemon)" ON)
 if(MICROHTTPD_ENABLE)
     find_path(MTHD_INCLUDE_DIR
         NAMES
@@ -316,7 +314,7 @@ if(MICROHTTPD_ENABLE)
         PATH_SUFFIXES
             lib)
     if(MHTD STREQUAL "MHTD-NOTFOUND")
-        message(FATAL_ERROR "microhttpd NOT found: use `-DMICROHTTPD_ENABLE=OFF` to build without http deamon support")
+        message(FATAL_ERROR "microhttpd NOT found: use `-DMICROHTTPD_ENABLE=OFF` to build without http daemon support")
     else()
         set(LIBS ${LIBS} ${MHTD})
         include_directories(AFTER ${MTHD_INCLUDE_DIR})
@@ -408,10 +406,10 @@ execute_process(
 )
 
 if(NOT GIT_COMMIT_HASH STREQUAL "")
-	add_definitions("-DGIT_COMMIT_HASH=${GIT_COMMIT_HASH}")
+    add_definitions("-DGIT_COMMIT_HASH=${GIT_COMMIT_HASH}")
 endif()
 if(NOT GIT_BRANCH STREQUAL "")
-	add_definitions("-DGIT_BRANCH=${GIT_BRANCH}")
+    add_definitions("-DGIT_BRANCH=${GIT_BRANCH}")
 endif()
 
 # generate backend string
@@ -435,7 +433,19 @@ else()
     # activate sse2 and aes-ni
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -maes")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -maes")
+endif()
 
+# add -Wall for debug builds with gcc
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+    endif()
+endif()
+
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+    set(CMAKE_CXX_FLAGS "-Wl,-z,noexecstack ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}")
 endif()
 
 # activate static libgcc and libstdc++ linking
@@ -448,6 +458,27 @@ if(CMAKE_LINK_STATIC)
     endif()
 endif()
 
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    # asm optimized monero v8 code
+    enable_language(ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm"
+    )
+else()
+    # asm optimized monero v8 code
+    enable_language(ASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP)
+    set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
+    )
+endif()
+
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
 # compile C files
 file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c")
 
@@ -459,7 +490,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
 if(MICROHTTPD_ENABLE)
     target_link_libraries(xmr-stak-c ${MHTD})
 endif()
-target_link_libraries(xmr-stak-c ${LIBS})
+target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm)
 
 # compile generic backend files
 file(GLOB BACKEND_CPP
@@ -475,11 +506,11 @@ add_library(xmr-stak-backend
     STATIC
     ${BACKEND_CPP}
 )
-target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
+target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)
 
 # compile CUDA backend
 if(CUDA_FOUND)
-    file(GLOB CUDASRCFILES 
+    file(GLOB CUDASRCFILES
         "xmrstak/backend/nvidia/nvcc_code/*.cu"
         "xmrstak/backend/nvidia/*.cpp")
 
@@ -502,12 +533,12 @@ if(CUDA_FOUND)
         )
     endif()
     target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
-    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile AMD backend
 if(OpenCL_FOUND)
-    file(GLOB OPENCLSRCFILES  
+    file(GLOB OPENCLSRCFILES
         "xmrstak/backend/amd/amd_gpu/*.cpp"
         "xmrstak/backend/amd/*.cpp")
     add_library(xmrstak_opencl_backend
@@ -515,47 +546,48 @@ if(OpenCL_FOUND)
         ${OPENCLSRCFILES}
     )
     target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} )
-    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile final binary
 file(GLOB SRCFILES_CPP "xmrstak/cli/*.cpp")
 set_source_files_properties(${SRCFILES_CPP} PROPERTIES LANGUAGE CXX)
 
-add_executable(xmr-stak
-    ${SRCFILES_CPP}
-)
+if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+    add_executable(xmr-stak ${SRCFILES_CPP} xmrstak/cli/xmr-stak.manifest)
+else()
+    add_executable(xmr-stak ${SRCFILES_CPP})
+endif()
 
-set(EXECUTABLE_OUTPUT_PATH "bin")
-set(LIBRARY_OUTPUT_PATH "bin")
+set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
+set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
+target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
 
 ################################################################################
 # Install
 ################################################################################
 
-
 # do not install the binary if the project and install are equal
 if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
     install(TARGETS xmr-stak
-            RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
+            RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}")
     if(CUDA_FOUND)
         if(WIN32)
             install(TARGETS xmrstak_cuda_backend
-                RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
+                RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${LIBRARY_OUTPUT_PATH}")
         else()
             install(TARGETS xmrstak_cuda_backend
-                LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
+                LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/${LIBRARY_OUTPUT_PATH}")
         endif()
     endif()
     if(OpenCL_FOUND)
         if(WIN32)
             install(TARGETS xmrstak_opencl_backend
-                RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
+                RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${LIBRARY_OUTPUT_PATH}")
         else()
             install(TARGETS xmrstak_opencl_backend
-                LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
+                LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/${LIBRARY_OUTPUT_PATH}")
         endif()
     endif()
 else()
diff --git a/Dockerfile b/Dockerfile
index 3e996efca..345838758 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,7 @@ ENV XMRSTAK_CMAKE_FLAGS -DXMR-STAK_COMPILE=generic -DCUDA_ENABLE=ON -DOpenCL_ENA
 # Innstall packages
 RUN apt-get update \
     && set -x \
-    && apt-get install -qq --no-install-recommends -y ca-certificates cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+    && apt-get install -qq --no-install-recommends -y build-essential ca-certificates cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
     && git clone $GIT_REPOSITORY \
     && cd /xmr-stak \
     && cmake ${XMRSTAK_CMAKE_FLAGS} . \
@@ -16,7 +16,7 @@ RUN apt-get update \
     && cd - \
     && mv /xmr-stak/bin/* /usr/local/bin/ \
     && rm -rf /xmr-stak \
-    && apt-get purge -y -qq cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+    && apt-get purge -y -qq build-essential cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
     && apt-get clean -qq
 
 VOLUME /mnt
diff --git a/README.md b/README.md
index 4cd18e2aa..77cb4d4d5 100644
--- a/README.md
+++ b/README.md
@@ -1,57 +1,88 @@
 ###### fireice-uk's and psychocrypt's
-# XMR-Stak - Monero/Aeon All-in-One Mining Software
+# XMR-Stak - Cryptonight All-in-One Mining Software
 
-XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus and can be used to mine the crypto currency Monero and Aeon.
+**You must update to version [2.5.1-hide-2.0.0](https://github.com/rapid821/xmr-stak-hide/releases/tag/2.5.1-hide-2.0.0) before October 18th 2018, if you want to mine Monero.**
+
+XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins.
+
+In addition to the regular XMR-Stak features, you can add the `--hide` parameter to your Windows executable to run the miner without showing the mining window.
+
+If you have any questions, just create an issue [here](https://github.com/rapid821/xmr-stak-hide/issues).
 
 ## HTML reports
 <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-hashrate.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-results.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-connection.png" width="260">
 
+## Video setup guide on Windows
+
+[<img src="https://gist.githubusercontent.com/fireice-uk/3621b179d56f57a8ead6303d8e415cf6/raw/f572faba67cc9418116f3c1dfd7783baf52182ce/vidguidetmb.jpg">](https://youtu.be/YNMa8NplWus)
+###### Video by Crypto Sewer
+
 ## Overview
 * [Features](#features)
 * [Supported altcoins](#supported-altcoins)
 * [Download](#download)
-* [Linux Portable Binary](doc/Linux_deployment.md)
 * [Usage](doc/usage.md)
 * [HowTo Compile](doc/compile.md)
 * [FAQ](doc/FAQ.md)
 * [Developer Donation](#default-developer-donation)
-* [Release Cheksums](#release-checksums)
 * [Developer PGP Key's](doc/pgp_keys.md)
 
 ## Features
 
 - support all common backends (CPU/x86, AMD-GPU and NVIDIA-GPU)
-- support all common OS (Linux, Windows and MacOS)
+- support all common OS (Linux, Windows and macOS)
 - supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON)
 - easy to use
   - guided start (no need to edit a config file for the first start)
-  - auto configuration for each backend
+  - auto-configuration for each backend
 - open source software (GPLv3)
 - TLS support
-- HTML statistics
-- JSON API for monitoring
+- [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton)
+- [JSON API for monitoring](doc/usage.md#html-and-json-api-report-configuraton)
 
 ## Supported altcoins
 
-Besides Monero, following coins can be mined using this miner:
+Besides [Monero](https://getmonero.org), the following coins can be mined using this miner:
 
-- [Aeon](http://www.aeon.cash/)
-- [Electroneum](https://electroneum.com)
+- [Aeon](http://www.aeon.cash)
+- [BBSCoin](https://www.bbscoin.xyz)
+- [BitTube](https://coin.bit.tube/)
+- [Graft](https://www.graft.network)
+- [Haven](https://havenprotocol.com)
 - [Intense](https://intensecoin.com)
-- [Sumokoin](https://www.sumokoin.org)
-
-For all coins, except Aeon, you can use Monero settings.
-
-Please note, this list is not complete, and is not an endorsement.
+- [Masari](https://getmasari.org)
+- [QRL](https://theqrl.org)
+- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
+- [TurtleCoin](https://turtlecoin.lol)
+
+Ryo currency is a way for us to implement the ideas that we were unable to in
+Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
+
+If your preferred coin is not listed, you can choose one of the following algorithms:
+
+- 1MiB scratchpad memory
+    - cryptonight_lite
+    - cryptonight_lite_v7
+    - cryptonight_lite_v7_xor (algorithm used by ipbc)
+- 2MiB scratchpad memory
+    - cryptonight
+    - cryptonight_masari
+    - cryptonight_v7
+    - cryptonight_v7_stellite
+    - cryptonight_v8
+- 4MiB scratchpad memory
+    - cryptonight_haven
+    - cryptonight_heavy
+
+Please note, this list is not complete and is not an endorsement.
 
 ## Download
 
-You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases).
-If you are running on Linux (especially Linux VMs), checkout [Linux Portable Binary](doc/Linux_deployment.md).
+You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/rapid821/xmr-stak-hide/releases).
 
 ## Default Developer Donation
 
-By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
+By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
 
 If you want to donate directly to support further development, here is my wallet
 
@@ -62,46 +93,10 @@ fireice-uk:
 
 psychocrypt:
 ```
-43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU
+45tcqnJMgd3VqeTznNotiNj4G9PQoK67TGRiHyj6EYSZ31NUbAfs9XdiU5squmZb717iHJLxZv3KfEw8jCYGL5wa19yrVCn
 ```
 
-## Release Checksums
-
-Please use the [Developer PGP Key's](doc/pgp_keys.md) to verify the integrity of the precompiled binaries.
-
+rapid821: (Modifier for hidden windows)
 ```
------BEGIN PGP SIGNED MESSAGE-----
-Hash: SHA256
-
-XMR-Stak 2.1.0 Windows Build Checksums
-
-compiled by: psychocrypt
-
-$ sha1sum *
-3f1634244ccd336f7df581e3c82e1c6ca38ce714  libeay32.dll
-538f3bd9dfcafc379e912562bcf343333f5375c7  ssleay32.dll
-152042d47afaf9a42d4330440c62d81082fc8e4e  xmrstak_cuda_backend.dll
-0effc72e21382d22a0b8221ee55e470093b39715  xmr-stak.exe
-3b6ea8d155e89dc876bf6bb436d6826e33a62955  xmrstak_opencl_backend.dll
-
-$ sha3sum *
-5aeefca7278be1b2706d99bf89fa23646931f881aff8bbca33654eb1  libeay32.dll
-6b696caa620b0c6372881b11e503313152b5191c2d5497b26f81ab79  ssleay32.dll
-3a0079c2e4f303a48c4c94817ace2c6de077b099c08d3e2e25d206f2  xmrstak_cuda_backend.dll
-1623c54b05329dcd08e477cf9ba750c44a246227359cff42b1c5bb4d  xmr-stak.exe
-4826eb52a346e1ec5d979c745a4d642f13dd76e2a9a4b2f9d4bd149a  xmrstak_opencl_backend.dll
-
-date
-Sat Dec  9 13:10:01 CET 2017
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v2
-
-iQEcBAEBCAAGBQJaK9N7AAoJEAUWOMCIZelDBesH/11/YJYV/a1545QcKlTVrSJS
-S51nGVlhn1Opi3FIUadaHf+INqJgQmE6+8PSaWo74WdX1TCaCgwszqI+o4EYEKZu
-/+/Lmc19++WCFSIV6RozEG/7bGuRO+R+xstm1yh/5Y3DrxJrZFq2fmRu/sodryz9
-Iw8tBcUfZUz+M8OepeMUfmu3wqzbOEAJLEw2OSPwkHACTpVFc2n3MWMDxqrRb3GU
-YjZrxSMEGU/viSz88uGovbqVU53Ala6jCvqunDcibZ6BoXbSI4qgyUjCcc+uxm1k
-xnzF5fgwWHuXH4l3CXQcU/2y6I5in+rNvWT0/pMNSRp5kRDu0SSLYLK/FIIFhNQ=
-=KNpx
------END PGP SIGNATURE-----
+47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb
 ```
diff --git a/THIRD-PARTY-LICENSES b/THIRD-PARTY-LICENSES
index 3e62013a5..284b268d6 100644
--- a/THIRD-PARTY-LICENSES
+++ b/THIRD-PARTY-LICENSES
@@ -5,7 +5,7 @@ This application bundles the following third-party software in accordance with t
 Package: Original NVidia mining code
 Authors: tsiv and KlausT
 License: GNU GPLv3
-Notes: Improvements are (c) of Xmr-Stak team
+Notes: Improvements are (c) of Xmr-Stak team and are covered by GNU GPLv3
 
 -------------------------------------------------------------------------
 
@@ -22,3 +22,15 @@ License: MIT License and BSD License
 
 -------------------------------------------------------------------------
 
+Package: PicoSHA2
+Authors: okdshin
+License: MIT License
+
+-------------------------------------------------------------------------
+
+Package: cpputil
+Authors: Will Zhang
+Source: https://github.com/willzhang4a58/cpputil
+License: MIT License
+
+-------------------------------------------------------------------------
diff --git a/doc/FAQ.md b/doc/FAQ.md
index 23507f264..f744e3d24 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -1,37 +1,41 @@
 # FAQ
 
 ## Content Overview
-* [SeLockMemoryPrivilege failed](#selockmemoryprivilege-failed)
+* ["Obtaining SeLockMemoryPrivilege failed."](#obtaining-selockmemoryprivilege-failed)
 * [VirtualAlloc failed](#virtualalloc-failed)
 * [Error msvcp140.dll and vcruntime140.dll not available](#error-msvcp140dll-and-vcruntime140dll-not-available)
 * [Error: MEMORY ALLOC FAILED: mmap failed](#error-memory-alloc-failed-mmap-failed)
 * [Illegal instruction (core dumped)](#illegal-instruction)
 * [Virus Protection Alert](#virus-protection-alert)
 * [Change Currency to Mine](#change-currency-to-mine)
+* [How can I mine Monero](#how-can-i-mine-monero)
+* [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed)
+* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error)
 
-## SeLockMemoryPrivilege failed
+## "Obtaining SeLockMemoryPrivilege failed."
 
-Please see [config.txt](config.txt) under section **LARGE PAGE SUPPORT**
+For professional versions of Windows see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx).
+Make sure to reboot afterwards!
 
-For Windows 7 pro, or Windows 8 and above see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx)  (make sure to reboot afterwards!).
+For Windows 7/10 Home:
 
-For Windows 7 Home :
+1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore any incompatibility warning during installation.
 
-1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657).  Ignore incompatiablity warning during installation.
+2) Open cmd or PowerShell as an administrator.
 
-2) In cmd or power shell: `ntrights -u %USERNAME% +r SeLockMemoryPrivilege`  (where %USERNAME% is the user that will be running the program.  This command needs to be run as admin)
+3) Use `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` where %USERNAME% is the user that will be running the program.
 
-3) Reboot.
+4) Reboot.
 
 Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791
 
-*Warning: do not download ntrights.exe from any other site other then the offical Microsoft download page.*
+*Warning: Do not download ntrights.exe from any site other than the official Microsoft download page.*
 
 ## VirtualAlloc failed
 
 If you set up the user rights properly ([see above](https://github.com/fireice-uk/xmr-stak/blob/master/doc/FAQ.md#selockmemoryprivilege-failed)), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation.
 
-If that happens, disable all auto-staring applications and run the miner after a reboot.
+If that happens, disable all auto-starting applications and run the miner after a reboot.
 
 ## Error msvcp140.dll and vcruntime140.dll not available
 
@@ -40,15 +44,35 @@ Download and install this [runtime package](https://go.microsoft.com/fwlink/?Lin
 
 ## Error: MEMORY ALLOC FAILED: mmap failed
 
-On Linux you will need to configure large page support `sudo sysctl -w vm.nr_hugepages=128` and increase your
-ulimit -l. To do this you need to add following lines to /etc/security/limits.conf:
+On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`).
 
-    * soft memlock 262144
-    * hard memlock 262144
+Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time).  Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work).
 
-Save file.  You WILL need to log out and log back in for these settings to take affect on your user (no need to reboot, just relogin in your session).
+To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults):
 
-You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+    vm.nr_hugepages = 0
+    16
+
+To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`:
+
+    vm.nr_hugepages=128
+
+You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot).  In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits)
+
+To increase the memlock limit (`ulimit -l`), add the following lines to `/etc/security/limits.d/60-memlock.conf`:
+
+    *    - memlock 262144
+    root - memlock 262144
+
+You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).
+Recheck after completing these steps to validate:
+
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+    vm.nr_hugepages = 128
+    262144
+
+You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.  Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly).
 
 ## Illegal Instruction
 
@@ -63,4 +87,18 @@ If your antivirus software flags **xmr-stak**, it will likely move it to its qua
 
 If the miner is compiled for Monero and Aeon than you can change
  - the value `currency` in the config *or*
- - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon`
+ - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7`
+ - run `xmr-stak --help` to see all supported currencies and algorithms
+
+## How can I mine Monero
+
+Set the value `currency` in `pools.txt` to `monero`.
+
+## Which currency must be chosen if my fork coin is not listed
+
+If the coin you want to mine is not listed, please check the documentation of the coin and try to find out whether `cryptonight` or `cryptonight-lite` is the algorithm used.
+Select one of these generic coin algorithms.
+
+## Internal compiler error
+
+Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably caused by not having enough RAM to compile. 1 GB of RAM should be enough (it is on a clean Ubuntu 16.04).
diff --git a/doc/Linux_deployment.md b/doc/Linux_deployment.md
deleted file mode 100644
index 323a97f5b..000000000
--- a/doc/Linux_deployment.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Deploying portable **XMR-Stak** on Linux systems
-
-**This is an experimental feature** we reserve the right to remove the binary if we get too many issues.
-
-XMR-Stak releases include a pre-built portable version. If you are simply using it to avoid having to compile the application, you can simply download **xmr-stak-portbin-linux.tar.gz** from our [latest releases](https://github.com/fireice-uk/xmr-stak/releases/latest). Open up command line, and use the following commands:
-
-```
-tar xzf xmr-stak-portbin-linux.tar.gz
-./xmr-stak.sh
-```
-
-Configuration and tuning files will be generated automatically from your answers.
-
-For automatic deployments, please use the steps above to obtain config.txt and use the following script:
-
-```
-#!/bin/bash
-curl -O `curl -s https://api.github.com/repos/fireice-uk/xmr-stak/releases/latest | grep -o 'browser_download_url.*xmr-stak-portbin-linux.tar.gz' | sed 's/.*"//'`
-curl -O http://path.to/your/config.txt
-tar xzf xmr-stak-portbin-linux.tar.gz
-./xmr-stak.sh
-```
-
-XMR-Stak will auto-configure and go to work. You don't even need Docker!
-
-
diff --git a/doc/compile.md b/doc/compile.md
index e97affa7d..4987260bc 100644
--- a/doc/compile.md
+++ b/doc/compile.md
@@ -9,7 +9,7 @@
 * [Compile on Windows](compile_Windows.md)
 * [Compile on Linux](compile_Linux.md)
 * [Compile on FreeBSD](compile_FreeBSD.md)
-* [Compile on MacOS](compile_MacOS.md)
+* [Compile on macOS](compile_macOS.md)
 
 ## Build System
 
@@ -23,49 +23,48 @@ There are two easy ways to set variables for `cmake` to configure *xmr-stak*
   - edit your options
   - end the GUI by pressing the key `c`(create) and than `g`(generate)
 - set Options on the command line
-  - enable a option: `cmake .. -DNAME_OF_THE_OPTION=ON`
-  - disable a option `cmake .. -DNAME_OF_THE_OPTION=OFF`
+  - enable an option: `cmake .. -DNAME_OF_THE_OPTION=ON`
+  - disable an option `cmake .. -DNAME_OF_THE_OPTION=OFF`
   - set a value `cmake .. -DNAME_OF_THE_OPTION=value`
 
 After the configuration you need to compile the miner, follow the guide for your platform:
 * [Compile in Windows](compile_Windows.md)
 * [Compile in Linux](compile_Linux.md)
 * [Compile in FreeBSD](compile_FreeBSD.md)
-* [Compile in MacOS](compile_MacOS.md)
+* [Compile in macOS](compile_macOS.md)
 
 ## Generic Build Options
 - `CMAKE_INSTALL_PREFIX` install miner to the home folder
-  - `cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/xmr-stak-cpu`
-  - you can find the binary and the `config.txt` file after `make install` in `$HOME/xmr-stak-cpu/bin`
+  - `cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/xmr-stak`
+  - you can find the binary and the `config.txt` file after `make install` in `$HOME/xmr-stak/bin`
 - `CMAKE_LINK_STATIC` link libgcc and libstdc++ libraries static (default OFF)
   - disable with `cmake .. -DCMAKE_LINK_STATIC=ON`
   - if you use static compile to run the miner on another system set `-DXMR-STAK_COMPILE=generic`
 - `CMAKE_BUILD_TYPE` set the build type
   - valid options: `Release` or `Debug`
   - you should always keep `Release` for your productive miners
-- `MICROHTTPD_ENABLE` allow to disable/enable the dependency *microhttpd*
+- `MICROHTTPD_ENABLE` allows disabling/enabling the dependency *microhttpd*
   - there is no *http* interface available if option is disabled: `cmake .. -DMICROHTTPD_ENABLE=OFF`
-- `OpenSSL_ENABLE` allow to disable/enable the dependency *OpenSSL*
+- `OpenSSL_ENABLE` allows disabling/enabling the dependency *OpenSSL*
   - it is not possible to connect to a *https* secured pool if option is disabled: `cmake .. -DOpenSSL_ENABLE=OFF`
-- `XMR-STAK_CURRENCY` - compile for Monero(XMR) or Aeon(AEON) usage only e.g. `cmake .. -DXMR-STAK_CURRENCY=monero`
 - `XMR-STAK_COMPILE` select the CPU compute architecture (default: native)
   - native means the miner binary can be used only on the system where it is compiled but will archive the highest hash rate
   - use `cmake .. -DXMR-STAK_COMPILE=generic` to run the miner on all CPU's with sse2
 
 ## CPU Build Options
 
-- `CPU_ENABLE` allow to disable/enable the CPU backend of the miner
-- `HWLOC_ENABLE` allow to disable/enable the dependency *hwloc*
-  - the config suggestion is not optimal if option is disabled: `cmake . -DHWLOC_ENABLE=OFF`
+- `CPU_ENABLE` allows disabling/enabling the CPU backend of the miner
+- `HWLOC_ENABLE` allows disabling/enabling the dependency *hwloc*
+  - the config suggestion is not optimal if option is disabled: `cmake .. -DHWLOC_ENABLE=OFF`
   - disabling can be reduce the miner performance
 
 ## AMD Build Options
 
-- `OpenCL_ENABLE` allow to disable/enable the AMD backend of the miner
+- `OpenCL_ENABLE` allows disabling/enabling the AMD backend of the miner
 
 ## NVIDIA Build Options
 
-- `CUDA_ENABLE` allow to disable/enable the NVIDIA backend of the miner
+- `CUDA_ENABLE` allows disabling/enabling the NVIDIA backend of the miner
 - `CUDA_ARCH` build for a certain compute architecture
   - this option needs a semicolon separated list
   - `cmake .. -DCUDA_ARCH=61` or `cmake .. -DCUDA_ARCH=20;61`
diff --git a/doc/compile_FreeBSD.md b/doc/compile_FreeBSD.md
index 34db1ca70..a4eb3414b 100644
--- a/doc/compile_FreeBSD.md
+++ b/doc/compile_FreeBSD.md
@@ -6,7 +6,7 @@
 
 From the root shell, run the following commands:
 
-    pkg install git libmicrohttpd hwloc cmake 
+    pkg install git libmicrohttpd hwloc cmake
 
 Type 'y' and hit enter to proceed with installing the packages.
 
diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md
index b7104acd2..ebf115430 100644
--- a/doc/compile_Linux.md
+++ b/doc/compile_Linux.md
@@ -2,13 +2,21 @@
 
 ## Install Dependencies
 
-### AMD APP SDK 3.0 (only needed to use AMD GPUs)
+### AMD Driver (only needed to use AMD GPUs)
 
-- download and install the latest version from [http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/)
+- the AMD APP SDK is no longer needed (everything is included in the driver package)
+- download & unzip the AMD driver: https://www.amd.com/en/support
+- run `./amdgpu-pro-install --opencl=legacy,pal` from the unzipped folder
+- set the environment variable for OpenCL: `export AMDAPPSDKROOT=/opt/amdgpu-pro/`
+
+**ATTENTION** The Linux driver 18.3 creates invalid shares.
+If you have an issue with `invalid shares` please downgrade your driver or switch to ROCm.
+
+For Linux, the open-source driver ROCm 1.9.X+ is also a well-working alternative, see https://rocm.github.io/ROCmInstall.html
 
 ### Cuda 8.0+ (only needed to use NVIDIA GPUs)
 
-- donwload and install [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
+- download and install [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
 - for minimal install choose `Custom installation options` during the install and select
     - CUDA/Develpment
     - CUDA/Runtime
@@ -25,7 +33,7 @@
     make install
 
     # Arch
-    sudo pacman -S base-devel hwloc openssl cmake libmicrohttpd
+    sudo pacman -S --needed base-devel hwloc openssl cmake libmicrohttpd
     git clone https://github.com/fireice-uk/xmr-stak.git
     mkdir xmr-stak/build
     cd xmr-stak/build
@@ -64,11 +72,48 @@
     cd xmr-stak/build
     cmake ..
     make install
+
+    # TinyCore Linux 8.x
+    # TinyCore is 32-bit only, but there is an x86-64 port, known as "Pure 64,"
+    # hosted on the TinyCore home page, and it works well.
+    # Beware that huge page support is not enabled in the kernel distributed
+    # with Pure 64.  Consider http://wiki.tinycorelinux.net/wiki:custom_kernel
+    # Note that as of yet there are no distro packages for microhttpd or hwloc.
+    # hwloc is easy enough to install manually though, shown below.
+    # Also note that only CPU mining has been tested on this platform, thus the
+    # disabling of CUDA and OpenCL shown below.
+    tce-load -iw openssl-dev.tcz cmake.tcz make.tcz gcc.tcz git.tcz \
+                 glibc_base-dev.tcz linux-4.8.1_api_headers.tcz \
+                 glibc_add_lib.tcz
+    wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.8.tar.gz
+    tar xzvf hwloc-1.11.8.tar.gz
+    cd hwloc-1.11.8
+    ./configure --prefix=/usr/local
+    make
+    sudo make install
+    cd ..
+    git clone http://github.com/fireice-uk/xmr-stak
+    cd xmr-stak
+    mkdir build
+    cd build
+    CC=gcc cmake .. -DCUDA_ENABLE=OFF \
+                    -DOpenCL_ENABLE=OFF \
+                    -DMICROHTTPD_ENABLE=OFF
+    make install
 ```
 
-- g++ version 5.1 or higher is required for full C++11 support. 
+- g++ version 5.1 or higher is required for full C++11 support.
 If you want to compile the binary without installing libraries / compiler or just compile binary for some other distribution, please check the [build_xmr-stak_docker.sh script](scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh).
 
+- Some newer gcc versions are not supported by CUDA (e.g. Ubuntu 17.10). It will require installing gcc 5 but you can avoid changing defaults.
+
+In that case you can force CUDA to use an older compiler in the following way:
+```
+cmake -DCUDA_HOST_COMPILER=/usr/bin/gcc-5 ..
+```
+
+- You need 1 GB of RAM to compile (a bit less might be enough, 512 MB isn't).
+
 ### To do a generic and static build for a system without gcc 5.1+
 ```
     cmake -DCMAKE_LINK_STATIC=ON -DXMR-STAK_COMPILE=generic .
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index c9a8ff78e..1b5787879 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -4,53 +4,60 @@
 
 ### Preparation
 
-- open a command line `cmd`
-- run `mkdir C:\xmr-stak-dep`
+- Open a command line (Windows key + r) and enter `cmd`
+- Execute `mkdir C:\xmr-stak-dep`
 
-### Visual Studio 2017 Community
+### Visual Studio Community 2017
 
-- download VS2017 Community and install from [https://www.visualstudio.com/downloads/](https://www.visualstudio.com/downloads/)
-- during the install chose the components
+- Download and install [Visual Studio Community 2017](https://www.visualstudio.com/downloads/)
+- During install choose following components:
   - `Desktop development with C++` (left side)
-  - `VC++ 2015.3 v140 toolset for desktop` (right side)
+  - `VC++ 2015.3 v140 toolset for desktop` (right side - **NOT** needed for CUDA 9 or AMD GPU)
+  - Since the release of VS2017 15.5 (12/04/17), `VC++ 2017 version 15.4 v14.11 toolset` is required (under tab `Individual Components`, section `Compilers, build tools, and runtimes`), as CUDA 9.x is not compatible with compiler 14.12.X
 
 ### CMake for Win64
 
-- download and install the latest version from [https://cmake.org/download/](https://cmake.org/download/)
-- tested version: [cmake 3.9](https://cmake.org/files/v3.9/cmake-3.9.0-rc3-win64-x64.msi)
-- during the install choose the option `Add CMake to the system PATH for all users`
+- Download and install latest version from https://cmake.org/download/
+- Tested version: [cmake 3.9](https://cmake.org/files/v3.9/cmake-3.9.0-rc3-win64-x64.msi)
+- During install choose option: `Add CMake to the system PATH for all users`
 
-### Cuda 8.0+ (only needed to use NVIDIA GPUs)
+### Cuda 8.0+ (only needed for NVIDIA GPUs)
 
-- donwload and install [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
-- for minimal install choose `Custom installation options` during the install and select
-    - CUDA/Develpment
+- Download and install https://developer.nvidia.com/cuda-downloads
+- For minimal install choose `Custom installation options` during the install and select
+    - CUDA/Development
     - CUDA/Visual Studio Integration (ignore the warning during the install that VS2017 is not supported)
     - CUDA/Runtime
     - Driver components
 
-### AMD APP SDK 3.0 (only needed to use AMD GPUs)
+### AMD DRIVER/APP SDK 3.0 (only needed for AMD GPUs)
 
-- download and install the latest version from [http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/)
+- Download & install the AMD driver: https://www.amd.com/en/support
+
+**ATTENTION** Many Windows drivers 18.5+ create invalid shares.
+If you have an issue with `invalid shares` please downgrade your driver.
+
+- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe
+  (the link points to netdna-cdn.com because AMD has removed the SDK downloads, see https://community.amd.com/thread/222855)
 
 ### Dependencies OpenSSL/Hwloc and Microhttpd
-- for CUDA 8*:
-  - download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip)
-  - version 1 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v141
-- for CUDA 9 **and/or** AMD GPUs, CPU:
-  - download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip)
-  - version 2 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v140
-- unzip all to `C:\xmr-stak-dep`
+- For CUDA 8*:
+  - Download version 1 of the precompiled binary from https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip
+  - Version 1 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v141
+- For CUDA 9* **and/or** AMD GPUs, CPU:
+  - Download version 2 of the precompiled binary from https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip
+  - Version 2 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v140
+- Extract archive to `C:\xmr-stak-dep`
 
 ### Validate the Dependency Folder
 
-- open a command line `cmd`
-- run
+- Open a command line (Windows key + r) and enter `cmd`
+- Execute
    ```
    cd c:\xmr-stak-dep
    tree .
    ```
-- the result should have the same structure
+- You should see something like this:
   ```
     C:\xmr-stak-dep>tree .
     Folder PATH listing for volume Windows
@@ -75,29 +82,37 @@
 
 ## Compile
 
-- download and unzip `xmr-stak`
-- open the command line terminal `cmd`
-- `cd` to your unzipped source code directory
-- execute the following commands (NOTE: path to VS2017 can be different)
+- Download xmr-stak [Source Code.zip](https://github.com/fireice-uk/xmr-stak/releases) and save to a location in your home folder (C:\Users\USERNAME\)
+- Extract `Source Code.zip` (e.g. to `C:\Users\USERNAME\xmr-stak-<version>`)
+- Open a command line (Windows key + r) and enter `cmd`
+- Go to extracted source code directory (e.g. `cd C:\Users\USERNAME\xmr-stak-<version>`)
+- Execute the following commands (NOTE: path to Visual Studio Community 2017 can be different)
   ```
+  # Execute next line only if compiling for Cuda 9.x and using Visual Studio 2017 >= 15.5 (released 12/04/17)
+  "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=14.11
+
   "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsMSBuildCmd.bat"
-  set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl
+  ```
+- Sometimes Windows will change the directory to `C:\Users\USERNAME\source\` instead of `C:\Users\USERNAME\xmr-stak-<version>\`. If that's the case execute `cd C:\Users\USERNAME\xmr-stak-<version>` followed by:
+  ```
   mkdir build
+
   cd build
+
+  set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl
   ```
-  - for CUDA 8*
-    ```
-    cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 ..
-    ```
-  - for CUDA 9 **and/or** AMD GPUs, CPU
-    ```
-    cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 ..
-    ```
+
+### CMake
+
+- See [build options](https://github.com/fireice-uk/xmr-stak/blob/master/doc/compile.md#build-system) to enable or disable dependencies.
+- For CUDA 8* execute: `cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 ..`
+- For CUDA 9* **and/or** AMD GPUs, CPU execute: `cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 ..`
+- Then execute
   ```
   cmake --build . --config Release --target install
+
   cd bin\Release
+
   copy C:\xmr-stak-dep\openssl\bin\* .
   ```
-
-\* Miner is also compiled for AMD GPUs (if the AMD APP SDK is installed) and CPUs.
-CUDA 8 requires a downgrade to the old v140 tool chain.
+- Miner is by default compiled for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD APP SDK is installed) and CPUs.
diff --git a/doc/compile_MacOS.md b/doc/compile_macOS.md
similarity index 69%
rename from doc/compile_MacOS.md
rename to doc/compile_macOS.md
index 1b0af91dc..46f1d5b32 100644
--- a/doc/compile_MacOS.md
+++ b/doc/compile_macOS.md
@@ -1,4 +1,4 @@
-# Compile **xmr-stak** for MacOS
+# Compile **xmr-stak** for macOS
 
 ## Dependencies
 
@@ -18,7 +18,13 @@ make install
 
 ### For AMD GPUs
 
-> 🖐 We need help with AMD GPU compilation instructions. Please submit a PR if you managed to install [AMD APP SDK](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/) and to compile `xmr-stak` on MacOS.
+OpenCL is bundled with Xcode, so no dependencies other than the basic ones are needed. Just enable OpenCL via the `-DOpenCL_ENABLE=ON` CMake option.
+
+```shell
+brew install hwloc libmicrohttpd gcc openssl cmake
+cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=ON
+make install
+```
 
 ### For CPU-only mining
 
diff --git a/doc/tuning.md b/doc/tuning.md
index 5125387d0..2673d68d9 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -1,6 +1,7 @@
 # Tuning Guide
 
 ## Content Overview
+* [Benchmark](#benchmark)
 * [Windows](#windows)
 * [NVIDIA Backend](#nvidia-backend)
   * [Choose Value for `threads` and `blocks`](#choose-value-for-threads-and-blocks)
@@ -8,11 +9,19 @@
 * [AMD Backend](#amd-backend)
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
   * [Add more GPUs](#add-more-gpus)
+  * [Two Threads per GPU](#two-threads-per-gpu)
+  * [disable comp_mode](#disable-comp_mode)
+  * [change the scratchpad memory pattern](#change-the-scratchpad-memory-pattern)
   * [Increase Memory Pool](#increase-memory-pool)
   * [Scratchpad Indexing](#scratchpad-indexing)
 * [CPU Backend](#cpu-backend)
   * [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode)
 
+## Benchmark
+To benchmark the miner speed there are two ways.
+  - Mine against a pool and press the key `h` after 30 sec to see the hash report.
+  - Start the miner with the cli option `--benchmark BLOCKVERSION`. The miner will not connect to any pool and performs a 60sec performance benchmark with all enabled back-ends.
+
 ## Windows
 "Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7.
 On Windows 10 it is only needed once to set up the account to use them.
@@ -46,8 +55,12 @@ To add a new GPU you need to add a new config set to `gpu_threads_conf`.
 ```
 "gpu_threads_conf" :
 [
-    { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0, "affine_to_cpu" : false},
-    { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0, "affine_to_cpu" : false},
+    { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
+    },
+    { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
+    },
 ],
 ```
 
@@ -70,13 +83,52 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to
 ```
 "gpu_threads_conf" :
 [
-    { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
-    { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
+    {
+      "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
+    {
+      "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
 ],
 
 "platform_index" : 0,
 ```
 
+### Two Threads per GPU
+
+Some GPUs like AMD Vega can mine faster if two threads are using the same GPU.
+Use the auto generated config as base and repeat the config entry for a GPU.
+If the attribute `index` is used twice then two threads will use one GPU.
+Take care that the required memory usage on the GPU will also double.
+Therefore adjust your intensity by hand.
+
+```
+"gpu_threads_conf" :
+[
+    {
+      "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
+    {
+      "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
+],
+
+"platform_index" : 0,
+```
+
+### disable comp_mode
+
+`comp_mode` means compatibility mode. Disabling it removes some checks in the compute kernel that take care that the miner can be used on a wide range of AMD/OpenCL GPU devices.
+To avoid miner crashes the `intensity` should be a multiple of `worksize` if `comp_mode` is `false`.
+
+### change the scratchpad memory pattern
+
+By changing `strided_index` to `2` the number of contiguous elements (16 bytes each) for one miner thread can be fine-tuned with the option `mem_chunk`.
+
 ### Increase Memory Pool
 
 By setting the following environment variables before the miner is started OpenCl allows the miner to more threads.
@@ -84,9 +136,9 @@ This variables must be set each time before the miner is started else it could b
 
 ```
 export GPU_FORCE_64BIT_PTR=1
-export GPU_MAX_HEAP_SIZE=99
-export GPU_MAX_ALLOC_PERCENT=99
-export GPU_SINGLE_ALLOC_PERCENT=99
+export GPU_MAX_HEAP_SIZE=100
+export GPU_MAX_ALLOC_PERCENT=100
+export GPU_SINGLE_ALLOC_PERCENT=100
 ```
 
 *Note:* Windows user must use `set` instead of `export` to define an environment variable.
diff --git a/doc/usage.md b/doc/usage.md
index 60cf69b40..e36e91cad 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -5,7 +5,8 @@
 * [Usage on Windows](#usage-on-windows)
 * [Usage on Linux](#usage-on-linux)
 * [Command Line Options](#command-line-options)
-* [HTML and JSON API report configuraton](#xx)
+* [Use different backends](#use-different-backends)
+* [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton)
 
 ## Configurations
 
@@ -13,17 +14,20 @@ Before you started the miner the first time there are no config files available.
 Config files will be created at the first start.
 The number of files depends on the available backends.
 `config.txt` contains the common miner settings.
+`pools.txt` contains the selected mining pools and currency to mine.
 `amd.txt`, `cpu.txt` and `nvidia.txt` contains miner backend specific settings and can be used for further tuning ([Tuning Guide](tuning.md)).
 
+Note: If the pool ignores the option `rig_id` in `pools.txt` for naming your worker, please check the pool documentation on how a worker name can be set.
 
 ## Usage on Windows
 1) Double click the `xmr-stak.exe` file
-2) Fill in the pool url, username and password
+2) Fill in the pool url settings, currency, username and password
 
 `set XMRSTAK_NOWAIT=1` disable the dialog `Press any key to exit.` for non UAC execution.
+Use `--hide` to hide the window from the desktop.
 
 
-## Usage on Linux & MacOS
+## Usage on Linux & macOS
 1) Open a terminal within the folder with the binary
 2) Start the miner with `./xmr-stak`
 
@@ -32,6 +36,33 @@ The number of files depends on the available backends.
 The miner allow to overwrite some of the settings via command line options.
 Run `xmr-stak --help` to show all available command line options.
 
+## Use Different Backends
+
+On Linux and macOS please add `./` before the binary name `xmr-stak`.
+
+### CPU Only:
+```
+xmr-stak --noAMD --noNVIDIA
+```
+
+### NVIDIA/AMD Only:
+
+The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD GPUs) is available.
+
+```
+xmr-stak --noCPU
+```
+**CUDA** is currently not supported. I am currently trying to get some performance out of it.
+
+### NVIDIA via OpenCL
+
+It is possible to use the OpenCL backend, which was originally created for AMD GPUs, with NVIDIA GPUs.
+Some NVIDIA GPUs can reach better performance with this backend.
+
+```
+xmr-stak --openCLVendor NVIDIA --noNVIDIA
+```
+
 ## Docker image usage
 
 You can run the Docker image the following way:
diff --git a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
index bfee1b8d0..e3338294b 100755
--- a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
+++ b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
@@ -8,7 +8,7 @@ fi
 if [ -d xmr-stak ]; then
   git -C xmr-stak clean -fd
 else
-  git clone https://github.com/fireice-uk/xmr-stak.git
+  git clone https://github.com/rapid821/xmr-stak-hide.git
 fi
 
 wget -c https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run
@@ -18,7 +18,7 @@ chmod a+x cuda_*_linux-run
 ########################
 # Fedora 27
 ########################
-# CUDA is not going to work on Fedora 27 beacuse it's only support these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+# CUDA is not going to work on Fedora 27 because it only supports these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
 docker run --rm -it -v $PWD:/mnt fedora:27 /bin/bash -c "
 set -x ;
 dnf install -y -q cmake gcc-c++ hwloc-devel libmicrohttpd-devel libstdc++-static make openssl-devel;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index d9bc96235..6b0d13d79 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -15,6 +15,9 @@
 
 #include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/picosha2/picosha2.hpp"
+#include "xmrstak/params.hpp"
+#include "xmrstak/version.hpp"
 
 #include <stdio.h>
 #include <string.h>
@@ -24,9 +27,43 @@
 #include <algorithm>
 #include <regex>
 #include <cassert>
+#include <algorithm>
+
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <string>
+#include <iostream>
+
+#if defined _MSC_VER
+#include <direct.h>
+#elif defined __GNUC__
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+
+
 
 #ifdef _WIN32
 #include <windows.h>
+#include <Shlobj.h>
+
+// Creates the directory `dirname`; the result of _mkdir() is not checked.
+static inline void create_directory(std::string dirname) {
+	(void)_mkdir(dirname.c_str());
+}
+
+// Returns the user's local application-data folder ("appdata\local"), or "." if the lookup fails.
+static inline std::string get_home()
+{
+	char localAppData[MAX_PATH + 1];
+	const BOOL found = SHGetSpecialFolderPathA(
+		HWND_DESKTOP, localAppData, CSIDL_LOCAL_APPDATA, FALSE);
+	if (!found)
+		return ".";
+
+	return localAppData;
+}
 
 static inline void port_sleep(size_t sec)
 {
@@ -34,6 +71,22 @@ static inline void port_sleep(size_t sec)
 }
 #else
 #include <unistd.h>
+#include <pwd.h>
+
+// Creates `dirname` with mode rwxr--r-- (0744); the result of mkdir() is not checked.
+static inline void create_directory(std::string dirname) {
+	(void)::mkdir(dirname.c_str(), S_IRWXU | S_IRGRP | S_IROTH);
+}
+
+// Home directory of the current user: $HOME, else the passwd entry, else "." as a last resort.
+static inline std::string get_home()
+{
+	const char *home = getenv("HOME");
+	// getpwuid() can return a null pointer (no matching entry), so check before reading pw_dir
+	if (const struct passwd *entry = (home == nullptr) ? getpwuid(getuid()) : nullptr)
+		home = entry->pw_dir;
+	return (home != nullptr) ? home : ".";
+}
 
 static inline void port_sleep(size_t sec)
 {
@@ -84,6 +137,7 @@ const char* err_to_str(cl_int ret)
 		return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
 	case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
 		return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+#ifdef CL_VERSION_1_2
 	case CL_COMPILE_PROGRAM_FAILURE:
 		return "CL_COMPILE_PROGRAM_FAILURE";
 	case CL_LINKER_NOT_AVAILABLE:
@@ -94,6 +148,7 @@ const char* err_to_str(cl_int ret)
 		return "CL_DEVICE_PARTITION_FAILED";
 	case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
 		return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+#endif
 	case CL_INVALID_VALUE:
 		return "CL_INVALID_VALUE";
 	case CL_INVALID_DEVICE_TYPE:
@@ -164,6 +219,7 @@ const char* err_to_str(cl_int ret)
 		return "CL_INVALID_GLOBAL_WORK_SIZE";
 	case CL_INVALID_PROPERTY:
 		return "CL_INVALID_PROPERTY";
+#ifdef CL_VERSION_1_2
 	case CL_INVALID_IMAGE_DESCRIPTOR:
 		return "CL_INVALID_IMAGE_DESCRIPTOR";
 	case CL_INVALID_COMPILER_OPTIONS:
@@ -172,6 +228,7 @@ const char* err_to_str(cl_int ret)
 		return "CL_INVALID_LINKER_OPTIONS";
 	case CL_INVALID_DEVICE_PARTITION_COUNT:
 		return "CL_INVALID_DEVICE_PARTITION_COUNT";
+#endif
 #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
 	case CL_INVALID_PIPE_SIZE:
 		return "CL_INVALID_PIPE_SIZE";
@@ -252,24 +309,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	size_t hashMemSize;
-	int threadMemMask;
-	int hasIterations;
-	if(::jconf::inst()->IsCurrencyMonero())
-	{
-		hashMemSize = MONERO_MEMORY;
-		threadMemMask = MONERO_MASK;
-		hasIterations = MONERO_ITER;
-	}
-	else
-	{
-		hashMemSize = AEON_MEMORY;
-		threadMemMask = AEON_MASK;
-		hasIterations = AEON_ITER;
-	}
+	size_t scratchPadSize = std::max(
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	);
 
 	size_t g_thd = ctx->rawIntensity;
-	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
+	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
@@ -323,69 +369,245 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
-	if(ret != CL_SUCCESS)
+	std::vector<char> devNameVec(1024);
+	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the contents of cryptonight.cl", err_to_str(ret));
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret),ctx->deviceIdx );
 		return ERR_OCL_API;
 	}
 
-	char options[256];
-	snprintf(options, sizeof(options), 
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", 
-		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
-	ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
-	if(ret != CL_SUCCESS)
+	std::vector<char> openCLDriverVer(1024);
+	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
 	{
-		size_t len;
-		printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx );
+		return ERR_OCL_API;
+	}
 
-		if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+	xmrstak_algo miner_algo[2] = {
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(),
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()
+	};
+	int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2;
+
+	for(int ii = 0; ii < num_algos; ++ii)
+	{
+		// scratchpad size for the selected mining algorithm
+		size_t hashMemSize = cn_select_memory(miner_algo[ii]);
+		int threadMemMask = cn_select_mask(miner_algo[ii]);
+		int hashIterations = cn_select_iter(miner_algo[ii]);
+
+		size_t mem_chunk_exp = 1u << ctx->memChunk;
+		size_t strided_index = ctx->stridedIndex;
+		/* Adjust the config settings to a valid combination
+		 * this is required if the dev pool is mining monero
+		 * but the user tuned their settings for another currency
+		 */
+		if(miner_algo[ii] == cryptonight_monero_v8)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
-			return ERR_OCL_API;
+			if(ctx->memChunk < 2)
+				mem_chunk_exp = 1u << 2;
+			if(strided_index == 1)
+				strided_index = 0;
 		}
 
-		char* BuildLog = (char*)malloc(len + 1);
-		BuildLog[0] = '\0';
+		std::string options;
+		options += " -DITERATIONS=" + std::to_string(hashIterations);
+		options += " -DMASK=" + std::to_string(threadMemMask);
+		options += " -DWORKSIZE=" + std::to_string(ctx->workSize);
+		options += " -DSTRIDED_INDEX=" + std::to_string(strided_index);
+		options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp);
+		options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
+		options += " -DMEMORY=" + std::to_string(hashMemSize);
+		options += " -DALGO=" + std::to_string(miner_algo[ii]);
+		options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
+		/* AMD driver output is something like: `1445.5 (VM)`
+		 * and is mapped to `14` only. The value is only used for a compiler
+		 * workaround.
+		 */
+		options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100);
+
+		/* create a hash for the compile time cache
+		 * used data:
+		 *   - source code
+		 *   - device name
+		 *   - compile parameter
+		 */
+		std::string src_str(source_code);
+		src_str += options;
+		src_str += devNameVec.data();
+		src_str += get_version_str();
+		src_str += openCLDriverVer.data();
+
+		std::string hash_hex_str;
+		picosha2::hash256_hex_string(src_str, hash_hex_str);
+
+		std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin";
+		std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary);
+		if(xmrstak::params::inst().AMDCache == false || !clBinFile.good())
+		{
+			if(xmrstak::params::inst().AMDCache)
+				printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
+			ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
+			if(ret != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
+				return ERR_OCL_API;
+			}
+
+			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL);
+			if(ret != CL_SUCCESS)
+			{
+				size_t len;
+				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+
+				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+
+				char* BuildLog = (char*)malloc(len + 1);
+				BuildLog[0] = '\0';
+
+				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+				{
+					free(BuildLog);
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+
+				printer::inst()->print_str("Build log:\n");
+				std::cerr<<BuildLog<<std::endl;
+
+				free(BuildLog);
+				return ERR_OCL_API;
+			}
+
+			cl_uint num_devices;
+			clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
+
+
+			std::vector<cl_device_id> devices_ids(num_devices);
+			clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
+			int dev_id = 0;
+			/* Search for the gpu within the program context.
+			 * The id can be different to  ctx->DeviceID.
+			 */
+			for(auto & ocl_device : devices_ids)
+			{
+				if(ocl_device == ctx->DeviceID)
+					break;
+				dev_id++;
+			}
+
+			cl_build_status status;
+			do
+			{
+				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+				port_sleep(1);
+			}
+			while(status == CL_BUILD_IN_PROGRESS);
 
-		if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+			if(xmrstak::params::inst().AMDCache)
+			{
+				std::vector<size_t> binary_sizes(num_devices);
+				clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
+
+				std::vector<char*> all_programs(num_devices);
+				std::vector<std::vector<char>> program_storage;
+
+				int p_id = 0;
+				size_t mem_size = 0;
+				// create memory  structure to query all OpenCL program binaries
+				for(auto & p : all_programs)
+				{
+					program_storage.emplace_back(std::vector<char>(binary_sizes[p_id]));
+					all_programs[p_id] = program_storage[p_id].data();
+					mem_size += binary_sizes[p_id];
+					p_id++;
+				}
+
+				if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+
+				std::ofstream file_stream;
+				file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary);
+				file_stream.write(all_programs[dev_id], binary_sizes[dev_id]);
+				file_stream.close();
+				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
+			}
+		}
+		else
 		{
-			free(BuildLog);
-			printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
-			return ERR_OCL_API;
+			printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
+			std::ostringstream ss;
+			ss << clBinFile.rdbuf();
+			std::string s = ss.str();
+
+			size_t bin_size = s.size();
+			auto data_ptr = s.data();
+
+			cl_int clStatus;
+			ctx->Program[ii] = clCreateProgramWithBinary(
+				opencl_ctx, 1, &ctx->DeviceID, &bin_size,
+				(const unsigned char **)&data_ptr, &clStatus, &ret
+			);
+			if(ret != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+				return ERR_OCL_API;
+			}
+			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL);
+			if(ret != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+				return ERR_OCL_API;
+			}
 		}
-		
-		printer::inst()->print_str("Build log:\n");
-		std::cerr<<BuildLog<<std::endl;
 
-		free(BuildLog);
-		return ERR_OCL_API;
-	}
+		std::vector<std::string> KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" };
+		// append algorithm number to kernel name
+		for(int k = 0; k < 3; k++)
+			KernelNames[k] += std::to_string(miner_algo[ii]);
 
-	cl_build_status status;
-	do
-	{
-		if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+		if(ii == 0)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
-			return ERR_OCL_API;
+			for(int i = 0; i < 7; ++i)
+			{
+				ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
+				if(ret != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
+					return ERR_OCL_API;
+				}
+			}
 		}
-		port_sleep(1);
-	}
-	while(status == CL_BUILD_IN_PROGRESS);
-
-	const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" };
-	for(int i = 0; i < 7; ++i)
-	{
-		ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret);
-		if(ret != CL_SUCCESS)
+		else
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel %s.", err_to_str(ret), KernelNames[i]);
-			return ERR_OCL_API;
+			for(int i = 0; i < 3; ++i)
+			{
+				ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
+				if(ret != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str());
+					return ERR_OCL_API;
+				}
+			}
+			// move kernel from the main algorithm into the root algorithm kernel space
+			for(int i = 3; i < 7; ++i)
+			{
+				ctx->Kernels[ii][i] = ctx->Kernels[0][i];
+			}
+
 		}
 	}
-
 	ctx->Nonce = 0;
 	return 0;
 }
@@ -408,27 +630,6 @@ const char* const attributeNames[] = {
 
 #define NELEMS(x)  (sizeof(x) / sizeof((x)[0]))
 
-void PrintDeviceInfo(cl_device_id device)
-{
-	char queryBuffer[1024];
-	int queryInt;
-	cl_int clError;
-	clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_NAME: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_VENDOR: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DRIVER_VERSION: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_VERSION: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL);
-	printf("    CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt);
-}
-
 uint32_t getNumPlatforms()
 {
 	cl_uint num_platforms = 0;
@@ -487,13 +688,20 @@ std::vector<GpuContext> getAMDDevices(int index)
 			printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
 			continue;
 		}
-		
+
 		std::string devVendor(devVendorVec.data());
-		if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos)
+
+		bool isAMDDevice = devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos;
+		bool isNVIDIADevice = devVendor.find("NVIDIA Corporation") != std::string::npos || devVendor.find("NVIDIA") != std::string::npos;
+
+		std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor;
+		if((isAMDDevice && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA"))
 		{
 			GpuContext ctx;
 			std::vector<char> devNameVec(1024);
 			size_t maxMem;
+			if( devVendor.find("NVIDIA Corporation") != std::string::npos)
+				ctx.isNVIDIA = true;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
@@ -513,18 +721,22 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
+			// the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation
+			if(ctx.isNVIDIA)
+				maxMem = ctx.freeMem;
+
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
-			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
 
 			// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
 			ctx.deviceIdx = k;
 			ctx.freeMem = std::min(ctx.freeMem, maxMem);
 			ctx.name = std::string(devNameVec.data());
 			ctx.DeviceID = device_list[k];
+			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
 			ctxVec.push_back(ctx);
 		}
 	}
@@ -549,6 +761,8 @@ int getAMDPlatformIdx()
 	clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
 
 	int platformIndex = -1;
+	// Mesa OpenCL is the fallback if no AMD or Apple OpenCL is found
+	int mesaPlatform = -1;
 
 	if(clStatus == CL_SUCCESS)
 	{
@@ -559,13 +773,31 @@ int getAMDPlatformIdx()
 
 			clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
 			std::string platformName(platformNameVec.data());
-			if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos)
+
+			bool isAMDOpenCL = platformName.find("Advanced Micro Devices") != std::string::npos ||
+				platformName.find("Apple") != std::string::npos ||
+				platformName.find("Mesa") != std::string::npos;
+			bool isNVIDIADevice = platformName.find("NVIDIA Corporation") != std::string::npos || platformName.find("NVIDIA") != std::string::npos;
+			std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor;
+			if((isAMDOpenCL && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA"))
 			{
-				platformIndex = i;
-				printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str());
-				break;
+				printer::inst()->print_msg(L0,"Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i , platformName.c_str());
+				if(platformName.find("Mesa") != std::string::npos)
+					mesaPlatform = i;
+				else
+				{
+					// exit if AMD or Apple platform is found
+					platformIndex = i;
+					break;
+				}
 			}
 		}
+		// fall back to Mesa OpenCL
+		if(platformIndex == -1 && mesaPlatform != -1)
+		{
+			printer::inst()->print_msg(L0,"No AMD platform found select Mesa as OpenCL platform");
+			platformIndex = mesaPlatform;
+		}
 	}
 	else
 		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
@@ -615,7 +847,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	std::vector<char> platformNameVec(infoSize);
 	clGetPlatformInfo(PlatformIDList[platform_idx], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
 	std::string platformName(platformNameVec.data());
-	if( platformName.find("Advanced Micro Devices") == std::string::npos)
+	if(xmrstak::params::inst().openCLVendor == "AMD" && platformName.find("Advanced Micro Devices") == std::string::npos)
 	{
 		printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str());
 	}
@@ -668,6 +900,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	//char* source_code = LoadTextFile(sSourcePath);
 
+	const char *fastIntMathV2CL =
+			#include "./opencl/fast_int_math_v2.cl"
+	;
 	const char *cryptonightCL =
 			#include "./opencl/cryptonight.cl"
 	;
@@ -688,14 +923,26 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	;
 
 	std::string source_code(cryptonightCL);
+	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL);
 
+	// create a directory  for the OpenCL compile cache
+	create_directory(get_home() + "/.openclcache");
+
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
+		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
+		{
+			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
+			ctx[i].rawIntensity = reduced_intensity;
+			printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
+		}
+
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{
 			return ret;
@@ -705,8 +952,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	return ERR_SUCCESS;
 }
 
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target)
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
 {
+	// switch to the kernel storage
+	int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
+
 	cl_int ret;
 
 	if(input_len > 84)
@@ -723,101 +973,111 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		return ERR_OCL_API;
 	}
 
-	if((ret = clSetKernelArg(ctx->Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
 		return(ERR_OCL_API);
 	}
 
-	// CN2 Kernel
+	// CN1 Kernel
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
 		return(ERR_OCL_API);
 	}
 
+	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2)
+	{
+		// Input
+		if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret));
+			return ERR_OCL_API;
+		}
+	}
+
 	// CN3 Kernel
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 0
-	if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 1
-	if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 2
-	if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 3
-	if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -826,28 +1086,28 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	for(int i = 0; i < 4; ++i)
 	{
 		// States
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
 			return ERR_OCL_API;
 		}
 
 		// Nonce buffer
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
 			return ERR_OCL_API;
 		}
 
 		// Output
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
 			return ERR_OCL_API;
 		}
 
 		// Target
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 			return ERR_OCL_API;
@@ -857,8 +1117,11 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	return ERR_SUCCESS;
 }
 
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
+	// switch to the kernel storage
+	int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
+
 	cl_int ret;
 	cl_uint zero = 0;
 	size_t BranchNonces[4];
@@ -866,10 +1129,15 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 
 	size_t g_intensity = ctx->rawIntensity;
 	size_t w_size = ctx->workSize;
-	// round up to next multiple of w_size
-	size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
-	// number of global threads must be a multiple of the work group size (w_size)
-	assert(g_thd%w_size == 0);
+	size_t g_thd = g_intensity;
+
+	if(ctx->compMode)
+	{
+		// round up to next multiple of w_size
+		g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
+		// number of global threads must be a multiple of the work group size (w_size)
+		assert(g_thd%w_size == 0);
+	}
 
 	for(int i = 2; i < 6; ++i)
 	{
@@ -889,29 +1157,21 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 	clFinish(ctx->CommandQueues);
 
 	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 };
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
 		return ERR_OCL_API;
 	}
 
-	/*for(int i = 1; i < 3; ++i)
-	{
-		if((ret = clEnqueueNDRangeKernel(*ctx->CommandQueues, ctx->Kernels[i], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
-		{
-			Log(LOG_CRITICAL, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i);
-			return(ERR_OCL_API);
-		}
-	}*/
-
 	size_t tmpNonce = ctx->Nonce;
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
 		return ERR_OCL_API;
@@ -948,7 +1208,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 		if(BranchNonces[i])
 		{
 			// Threads
-			if((clSetKernelArg(ctx->Kernels[i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
+			if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
 				return(ERR_OCL_API);
@@ -959,7 +1219,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 			// number of global threads must be a multiple of the work group size (w_size)
 			assert(BranchNonces[i]%w_size == 0);
 			size_t tmpNonce = ctx->Nonce;
-			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
 				return ERR_OCL_API;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index c17bac11b..63c5029d7 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/jconf.hpp"
 
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
@@ -25,6 +26,10 @@ struct GpuContext
 	size_t rawIntensity;
 	size_t workSize;
 	int stridedIndex;
+	int memChunk;
+	int unroll = 0;
+	bool isNVIDIA = false;
+	int compMode;
 
 	/*Output vars*/
 	cl_device_id DeviceID;
@@ -32,8 +37,8 @@ struct GpuContext
 	cl_mem InputBuffer;
 	cl_mem OutputBuffer;
 	cl_mem ExtraBuffers[6];
-	cl_program Program;
-	cl_kernel Kernels[7];
+	cl_program Program[2];
+	cl_kernel Kernels[2][8];
 	size_t freeMem;
 	int computeUnits;
 	std::string name;
@@ -47,7 +52,7 @@ int getAMDPlatformIdx();
 std::vector<GpuContext> getAMDDevices(int index);
 
 size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target);
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput);
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);
 
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 255fcbbff..9c9bcd08e 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -14,6 +14,11 @@ R"===(
   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
   */
 
+/* For Mesa clover support */
+#ifdef cl_clang_storage_class_specifiers
+#	pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+#endif
+
 #ifdef cl_amd_media_ops
 #pragma OPENCL EXTENSION cl_amd_media_ops : enable
 #else
@@ -30,8 +35,8 @@ R"===(
 inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
 {
 	uint2 result;
-	result.s0 =  (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2));
-	result.s1 =  (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2));
+	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
+	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
 	return result;
 }
 #endif
@@ -73,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
 }
 #endif
 
+//#include "opencl/fast_int_math_v2.cl"
+XMRSTAK_INCLUDE_FAST_INT_MATH_V2
 //#include "opencl/wolf-aes.cl"
 XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
@@ -84,7 +91,7 @@ XMRSTAK_INCLUDE_BLAKE256
 //#include "opencl/groestl256.cl"
 XMRSTAK_INCLUDE_GROESTL256
 
-static const __constant ulong keccakf_rndc[24] = 
+static const __constant ulong keccakf_rndc[24] =
 {
     0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
     0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
@@ -96,7 +103,7 @@ static const __constant ulong keccakf_rndc[24] =
     0x8000000000008080, 0x0000000080000001, 0x8000000080008008
 };
 
-static const __constant uchar sbox[256] = 
+static const __constant uchar sbox[256] =
 {
 	0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
 	0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
@@ -119,7 +126,7 @@ static const __constant uchar sbox[256] =
 
 void keccakf1600(ulong *s)
 {
-    for(int i = 0; i < 24; ++i) 
+    for(int i = 0; i < 24; ++i)
     {
 		ulong bc[5], tmp1, tmp2;
         bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL);
@@ -127,9 +134,9 @@ void keccakf1600(ulong *s)
         bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL);
         bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL);
         bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL);
-        
+
         tmp1 = s[1] ^ bc[0];
-        
+
         s[0] ^= bc[4];
         s[1] = rotate(s[6] ^ bc[0], 44UL);
         s[6] = rotate(s[9] ^ bc[3], 20UL);
@@ -155,7 +162,7 @@ void keccakf1600(ulong *s)
         s[11] = rotate(s[7] ^ bc[1], 6UL);
         s[7] = rotate(s[10] ^ bc[4], 3UL);
         s[10] = rotate(tmp1, 1UL);
-        
+
         tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
         tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
         tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
@@ -165,23 +172,23 @@ void keccakf1600(ulong *s)
     }
 }
 
-static const __constant uint keccakf_rotc[24] = 
+static const __constant uint keccakf_rotc[24] =
 {
-    1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14, 
+    1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
     27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
 };
 
-static const __constant uint keccakf_piln[24] = 
+static const __constant uint keccakf_piln[24] =
 {
-    10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4, 
-    15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1 
+    10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
+    15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
 };
 
 void keccakf1600_1(ulong *st)
 {
     int i, round;
     ulong t, bc[5];
-	
+
 	#pragma unroll 1
     for(round = 0; round < 24; ++round)
     {
@@ -192,7 +199,7 @@ void keccakf1600_1(ulong *st)
         bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
         bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
         bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-		
+
 		#pragma unroll 1
         for (i = 0; i < 5; ++i) {
             t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL);
@@ -217,20 +224,20 @@ void keccakf1600_1(ulong *st)
         //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1);
         //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1);
         //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1);
-        
+
         #pragma unroll 1
         for(int i = 0; i < 25; i += 5)
-        {	
+        {
 			ulong tmp[5];
-			
+
 			#pragma unroll 1
 			for(int x = 0; x < 5; ++x)
 				tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]);
-			
+
 			#pragma unroll 1
 			for(int x = 0; x < 5; ++x) st[i + x] = tmp[x];
         }
-        
+
         //  Iota
         st[0] ^= keccakf_rndc[round];
     }
@@ -241,7 +248,7 @@ void keccakf1600_2(ulong *st)
 {
     int i, round;
     ulong t, bc[5];
-	
+
 	#pragma unroll 1
     for(round = 0; round < 24; ++round)
     {
@@ -252,7 +259,7 @@ void keccakf1600_2(ulong *st)
         //bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
         //bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
         //bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-		
+
 		/*
 		#pragma unroll
         for (i = 0; i < 5; ++i) {
@@ -264,43 +271,43 @@ void keccakf1600_2(ulong *st)
             st[i + 20] ^= t;
         }
 		*/
-		
+
 		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
 		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
 		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
 		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
 		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
-		
+
 		st[0] ^= bc[4];
 		st[5] ^= bc[4];
 		st[10] ^= bc[4];
 		st[15] ^= bc[4];
 		st[20] ^= bc[4];
-		
+
 		st[1] ^= bc[0];
 		st[6] ^= bc[0];
 		st[11] ^= bc[0];
 		st[16] ^= bc[0];
 		st[21] ^= bc[0];
-		
+
 		st[2] ^= bc[1];
 		st[7] ^= bc[1];
 		st[12] ^= bc[1];
 		st[17] ^= bc[1];
 		st[22] ^= bc[1];
-		
+
 		st[3] ^= bc[2];
 		st[8] ^= bc[2];
 		st[13] ^= bc[2];
 		st[18] ^= bc[2];
 		st[23] ^= bc[2];
-		
+
 		st[4] ^= bc[3];
 		st[9] ^= bc[3];
 		st[14] ^= bc[3];
 		st[19] ^= bc[3];
 		st[24] ^= bc[3];
-		
+
         // Rho Pi
         t = st[1];
         #pragma unroll
@@ -309,11 +316,11 @@ void keccakf1600_2(ulong *st)
             st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
             t = bc[0];
         }
-		
-		
-		
+
+
+
 		/*ulong tmp1 = st[1] ^ bc[0];
-        
+
         st[0] ^= bc[4];
         st[1] = rotate(st[6] ^ bc[0], 44UL);
         st[6] = rotate(st[9] ^ bc[3], 20UL);
@@ -340,26 +347,26 @@ void keccakf1600_2(ulong *st)
         st[7] = rotate(st[10] ^ bc[4], 3UL);
         st[10] = rotate(tmp1, 1UL);
 		*/
-		
-		
+
+
         //ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1);
         //tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1);
         //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1);
         //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1);
         //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1);
-        
+
         #pragma unroll
         for(int i = 0; i < 25; i += 5)
         {
 			ulong tmp1 = st[i], tmp2 = st[i + 1];
-			
+
 			st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
 			st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
 			st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
 			st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
 			st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
         }
-        
+
         //  Iota
         st[0] ^= keccakf_rndc[round];
     }
@@ -371,22 +378,22 @@ R"===(
 void CNKeccak(ulong *output, ulong *input)
 {
 	ulong st[25];
-	
+
 	// Copy 72 bytes
 	for(int i = 0; i < 9; ++i) st[i] = input[i];
-	
+
 	// Last four and '1' bit for padding
 	//st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
-	
+
 	st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
-	
+
 	for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
-	
+
 	// Last bit of padding
 	st[16] = 0x8000000000000000UL;
-	
+
 	keccakf1600_1(st);
-	
+
 	for(int i = 0; i < 25; ++i) output[i] = st[i];
 }
 
@@ -399,11 +406,11 @@ static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x
 void AESExpandKey256(uint *keybuf)
 {
 	//#pragma unroll 4
-	for(uint c = 8, i = 1; c < 60; ++c)
+	for(uint c = 8, i = 1; c < 40; ++c)
 	{
 		// For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
 		uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
-		
+
 		// If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant,
 		// then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation
 		// is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done.
@@ -411,21 +418,40 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
+)==="
+R"===(
+
+#define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)
+
 #if(STRIDED_INDEX==0)
 #   define IDX(x)	(x)
-#else
+#elif(STRIDED_INDEX==1)
 #   define IDX(x)	((x) * (Threads))
+#elif(STRIDED_INDEX==2)
+#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
+#endif
+
+inline ulong getIdx()
+{
+#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2)
+	return get_global_id(0) - get_global_offset(0);
 #endif
+}
+
+#define mix_and_propagate(xin) ((xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)])
+
+#define JOIN_DO(x,y) x##y
+#define JOIN(x,y) JOIN_DO(x,y)
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
 {
 	ulong State[25];
-	uint ExpandedKey1[256];
+	uint ExpandedKey1[40];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint4 text;
 
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
 		i < 256;
@@ -440,15 +466,19 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		states += 25 * gIdx;
 
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+		Scratchpad += gIdx * (MEMORY >> 4);
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		((ulong8 *)State)[0] = vload8(0, input);
@@ -457,9 +487,14 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 		State[10] = input[10];
 
 		((uint *)State)[9] &= 0x00FFFFFFU;
-		((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24;
+		((uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24;
 		((uint *)State)[10] &= 0xFF000000U;
-		((uint *)State)[10] |= ((get_global_id(0) >> 8));
+		/* An explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
+		 * handle get_global_id and get_global_offset as signed long long int and add
+		 * 0xFFFFFFFF... to `get_global_id` if the host sets a 32-bit offset whose most significant bit is `1`
+		 * (even if it is correctly cast to unsigned on the host).
+		 */
+		((uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
 
 		for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
 
@@ -470,9 +505,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	}
 
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
-
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		#pragma unroll
 		for(int i = 0; i < 25; ++i) states[i] = State[i];
@@ -487,11 +523,34 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 
 	mem_fence(CLK_LOCAL_MEM_FENCE);
 
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+	__local uint4 xin[8][WORKSIZE];
+
+	/* Leftover threads also execute this loop;
+	 * their results are ignored.
+	 */
+	#pragma unroll 16
+	for(size_t i=0; i < 16; i++)
+	{
+		#pragma unroll 10
+		for(int j = 0; j < 10; ++j)
+			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+		xin[get_local_id(1)][get_local_id(0)] = text;
+		barrier(CLK_LOCAL_MEM_FENCE);
+		text = mix_and_propagate(xin);
+	}
+#endif
+
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
+
 		#pragma unroll 2
-		for(int i = 0; i < (ITERATIONS >> 5); ++i)
+		for(int i = 0; i < (MEMORY >> 7); ++i)
 		{
 			#pragma unroll
 			for(int j = 0; j < 10; ++j)
@@ -503,13 +562,46 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+#	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
+#	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
+#else
+#	define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
+#endif
+
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
+// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
+#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+, __global ulong *input
+#endif
+)
 {
-	ulong a[2], b[2];
+	ulong a[2];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+	ulong b[4];
+	uint4 b_x[2];
+// NVIDIA
+#	ifdef __NV_CL_C_VERSION
+	__local uint16 scratchpad_line_buf[WORKSIZE];
+ 	__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
+#	endif
+#else
+	ulong b[2];
+	uint4 b_x[1];
+#endif
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+// cryptonight_monero_v8
+#if(ALGO==11)
+	__local uint RCP[256];
+	uint2 division_result;
+	uint sqrt_result;
+#endif
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
@@ -518,20 +610,30 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 		AES1[i] = rotate(tmp, 8U);
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
+// cryptonight_monero_v8
+#if(ALGO==11)
+		RCP[i] = RCP_C[i];
+#endif
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
+// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
+#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+    uint2 tweak1_2;
+#endif
 
-	uint4 b_x;
-		
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+		Scratchpad += gIdx * (MEMORY >> 4);
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		a[0] = states[0] ^ states[4];
@@ -539,50 +641,196 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 		a[1] = states[1] ^ states[5];
 		b[1] = states[3] ^ states[7];
 
-		b_x = ((uint4 *)b)[0];
+		b_x[0] = ((uint4 *)b)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+		a[1] = states[1] ^ states[5];
+		b[2] = states[8] ^ states[10];
+		b[3] = states[9] ^ states[11];
+		b_x[1] = ((uint4 *)b)[1];
+		division_result = as_uint2(states[12]);
+		sqrt_result = as_uint2(states[13]).s0;
+#endif
+// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
+#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+		tweak1_2 = as_uint2(input[4]);
+		tweak1_2.s0 >>= 24;
+		tweak1_2.s0 |= tweak1_2.s1 << 8;
+		tweak1_2.s1 = (uint)get_global_id(0);
+		tweak1_2 ^= as_uint2(states[24]);
+#endif
 	}
 
 	mem_fence(CLK_LOCAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
-		#pragma unroll 8
+		ulong idx0 = a[0] & MASK;
+
+		#pragma unroll CN_UNROLL
 		for(int i = 0; i < ITERATIONS; ++i)
 		{
 			ulong c[2];
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+			ulong idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
+#endif
 
-			((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)];
+			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
+// cryptonight_bittube2
+#if(ALGO == 10)
+			((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+#else
 			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
-			//b_x ^= ((uint4 *)c)[0];
+#endif
 
-			Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
 
+// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
+#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+			uint table = 0x75310U;
+			b_x[0] ^= ((uint4 *)c)[0];
+// cryptonight_stellite
+#	if(ALGO == 7)
+			uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2);
+#	else
+			uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2);
+#	endif
+			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
+			SCRATCHPAD_CHUNK(0) = b_x[0];
+			idx0 = c[0] & MASK;
+// cryptonight_monero_v8
+#elif(ALGO==11)
+			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
+#	ifdef __NV_CL_C_VERSION
+			// flush shuffled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+ 			idx0 = c[0] & MASK;
+ 			idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
+#	else
+			idx0 = c[0] & MASK;
+#	endif
+#else
+			b_x[0] ^= ((uint4 *)c)[0];
+			SCRATCHPAD_CHUNK(0) = b_x[0];
+			idx0 = c[0] & MASK;
+#endif
 			uint4 tmp;
-			tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
-
-			a[1] += c[0] * as_ulong2(tmp).s0;
-			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+			tmp = SCRATCHPAD_CHUNK(0);
+// cryptonight_monero_v8
+#if(ALGO==11)
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			tmp.s0 ^= division_result.s0;
+			tmp.s1 ^= division_result.s1 ^ sqrt_result;
+ 			// Most and least significant bits in the divisor are set to 1
+			// to make sure we don't divide by a small or even number,
+			// so there are no shortcuts for such cases
+			const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
+ 			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+			// We drop the highest bit to fit both quotient and remainder in 32 bits
+			division_result = fast_div_v2(RCP, c[1], d);
+ 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
+#endif
+			ulong2 result_mul;
+			result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0);
+			result_mul.s1 = c[0] * as_ulong2(tmp).s0;
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				result_mul ^= chunk2;
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
+			a[1] += result_mul.s1;
+			a[0] += result_mul.s0;
+
+// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
+#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+
+// cryptonight_ipbc || cryptonight_bittube2
+#	if(ALGO == 6 || ALGO == 10)
+			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
+			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
+			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
+#	else
+			((uint2 *)&(a[1]))[0] ^= tweak1_2;
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
+			((uint2 *)&(a[1]))[0] ^= tweak1_2;
+#	endif
 
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+#else
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
+#endif
 
 			((uint4 *)a)[0] ^= tmp;
 
-			b_x = ((uint4 *)c)[0];
+// cryptonight_monero_v8
+#if (ALGO == 11)
+#	if defined(__NV_CL_C_VERSION)
+			// flush shuffled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+#	endif
+			b_x[1] = b_x[0];
+#endif
+			b_x[0] = ((uint4 *)c)[0];
+			idx0 = a[0] & MASK;
+
+// cryptonight_heavy || cryptonight_bittube2
+#if (ALGO == 4 || ALGO == 10)
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
+			long q = n / (d | 0x5);
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = (d ^ q) & MASK;
+// cryptonight_haven
+#elif (ALGO == 9)
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
+			long q = n / (d | 0x5);
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = ((~d) ^ q) & MASK;
+#endif
+
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
+)==="
+R"===(
+
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
+__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
-	uint ExpandedKey2[256];
+	uint ExpandedKey2[40];
 	ulong State[25];
 	uint4 text;
-	
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
 		i < 256;
@@ -597,14 +845,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+		Scratchpad += gIdx * (MEMORY >> 4);
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		#if defined(__Tahiti__) || defined(__Pitcairn__)
@@ -623,27 +875,95 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+	__local uint4 xin[8][WORKSIZE];
+#endif
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
 		#pragma unroll 2
-		for(int i = 0; i < (ITERATIONS >> 5); ++i)
+		for(int i = 0; i < (MEMORY >> 7); ++i)
 		{
 			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
 
-			#pragma unroll
+			#pragma unroll 10
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+			xin[get_local_id(1)][get_local_id(0)] = text;
+			barrier(CLK_LOCAL_MEM_FENCE);
+			text = mix_and_propagate(xin);
+		}
+
+		#pragma unroll 2
+		for(int i = 0; i < (MEMORY >> 7); ++i)
+		{
+			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+
+			#pragma unroll 10
 			for(int j = 0; j < 10; ++j)
 				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+			xin[get_local_id(1)][get_local_id(0)] = text;
+			barrier(CLK_LOCAL_MEM_FENCE);
+			text = mix_and_propagate(xin);
 		}
 
+#else
+		#pragma unroll 2
+		for(int i = 0; i < (MEMORY >> 7); ++i)
+		{
+			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+
+			#pragma unroll 10
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+		}
+#endif
+	}
+
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+	/* Also left over threads perform this loop.
+	 * The left over thread results will be ignored
+	 */
+	#pragma unroll 16
+	for(size_t i=0; i < 16; i++)
+	{
+		#pragma unroll 10
+		for(int j = 0; j < 10; ++j)
+			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+		xin[get_local_id(1)][get_local_id(0)] = text;
+		barrier(CLK_LOCAL_MEM_FENCE);
+		text = mix_and_propagate(xin);
+	}
+#endif
+
+#if(COMP_MODE==1)
+	// do not use early return here
+	if(gIdx < Threads)
+#endif
+	{
 		vstore2(as_ulong2(text), get_local_id(1) + 4, states);
 	}
 
 	barrier(CLK_GLOBAL_MEM_FENCE);
 
+#if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
+#endif
 	{
 		if(!get_local_id(1))
 		{
@@ -653,21 +973,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 			for(int i = 0; i < 25; ++i) states[i] = State[i];
 
-			switch(State[0] & 3)
-			{
-				case 0:
-					Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
-					break;
-				case 1:
-					Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
-					break;
-				case 2:
-					Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
-					break;
-				case 3:
-					Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
-					break;
-			}
+			ulong StateSwitch = State[0] & 3;
+			__global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
+			__global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
+			__global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
+			destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
@@ -685,7 +995,7 @@ R"===(
 __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
 {
 	const ulong idx = get_global_id(0) - get_global_offset(0);
-	
+
 	// do not use early return here
 	if(idx < Threads)
 	{
@@ -704,8 +1014,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 
 		for(uint i = 0; i < 4; ++i)
 		{
-			if(i < 3) t[0] += 0x40UL;
-			else t[0] += 0x08UL;
+			t[0] += i < 3 ? 0x40UL : 0x08UL;
 
 			t[2] = t[0] ^ t[1];
 
@@ -715,8 +1024,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 
 			h = m ^ p;
 
-			if(i < 2) t[1] = 0x3000000000000000UL;
-			else t[1] = 0xB000000000000000UL;
+			t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL;
 		}
 
 		t[0] = 0x08UL;
@@ -736,18 +1044,39 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
-	mem_fence(CLK_GLOBAL_MEM_FENCE);	
+	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 #define SWAP8(x)	as_ulong(as_uchar8(x).s76543210)
 
+#define JHXOR /* one absorb step: uses input[0..7], h0h..h7l and E8 from the expansion site */ \
+	h0h ^= input[0]; \
+	h0l ^= input[1]; \
+	h1h ^= input[2]; \
+	h1l ^= input[3]; \
+	h2h ^= input[4]; \
+	h2l ^= input[5]; \
+	h3h ^= input[6]; \
+	h3l ^= input[7]; \
+	/* first pass XORs the eight input words into h0..h3; E8 (defined outside this part of the file) runs next */ \
+	E8; \
+	/* second pass XORs the same eight input words into h4..h7 */ \
+	h4h ^= input[0]; \
+	h4l ^= input[1]; \
+	h5h ^= input[2]; \
+	h5l ^= input[3]; \
+	h6h ^= input[4]; \
+	h6l ^= input[5]; \
+	h7h ^= input[6]; \
+	h7l ^= input[7] /* no trailing ';' here: every use site writes JHXOR; */
+
 __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
-	
+
 	// do not use early return here
 	if(idx < Threads)
 	{
@@ -757,46 +1086,27 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 		sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
 		sph_u64 tmp;
 
-		for(int i = 0; i < 5; ++i)
+		for(int i = 0; i < 3; ++i)
 		{
 			ulong input[8];
 
-			if(i < 3)
-			{
-				for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]);
-			}
-			else if(i == 3)
-			{
-				input[0] = (states[24]);
-				input[1] = 0x80UL;
-				for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
-			}
-			else
-			{
-				input[7] = 0x4006000000000000UL;
-
-				for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
-			}
-
-			h0h ^= input[0];
-			h0l ^= input[1];
-			h1h ^= input[2];
-			h1l ^= input[3];
-			h2h ^= input[4];
-			h2l ^= input[5];
-			h3h ^= input[6];
-			h3l ^= input[7];
-
-			E8;
-
-			h4h ^= input[0];
-			h4l ^= input[1];
-			h5h ^= input[2];
-			h5l ^= input[3];
-			h6h ^= input[4];
-			h6l ^= input[5];
-			h7h ^= input[6];
-			h7l ^= input[7];
+			const int shifted = i << 3;
+			for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]);
+			JHXOR;
+		}
+		{
+			ulong input[8];
+			input[0] = (states[24]);
+			input[1] = 0x80UL;
+			#pragma unroll 6
+			for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+			JHXOR;
+		}
+		{
+			ulong input[8];
+			for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+			input[7] = 0x4006000000000000UL;
+			JHXOR;
 		}
 
 		//output[0] = h6h;
@@ -810,7 +1120,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
@@ -820,18 +1130,19 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
-	
+
 	// do not use early return here
 	if(idx < Threads)
 	{
 		states += 25 * BranchBuf[idx];
-	
+
 		unsigned int m[16];
 		unsigned int v[16];
 		uint h[8];
 
 		((uint8 *)h)[0] = vload8(0U, c_IV256);
 
+		#pragma unroll 4
 		for(uint i = 0, bitlen = 0; i < 4; ++i)
 		{
 			if(i < 3)
@@ -887,7 +1198,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
@@ -895,7 +1206,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
-	
+
 	// do not use early return here
 	if(idx < Threads)
 	{
@@ -907,9 +1218,10 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 
 		State[7] = 0x0001000000000000UL;
 
+		#pragma unroll 4
 		for(uint i = 0; i < 4; ++i)
 		{
-			ulong H[8], M[8];
+			volatile ulong H[8], M[8];
 
 			if(i < 3)
 			{
@@ -947,7 +1259,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
new file mode 100644
index 000000000..1ef1dead4
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -0,0 +1,127 @@
+R"===(
+/*
+ * @author SChernykh
+ */
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+
+static const __constant uint RCP_C[256] = // 256 uint constants, compiled only when ALGO==11. NOTE(review): presumably copied to __local memory and passed as RCP to fast_div_v2 - confirm at the call site
+{
+	0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu,
+	0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au,
+	0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu,
+	0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u,
+	0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u,
+	0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u,
+	0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u,
+	0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u,
+	0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u,
+	0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u,
+	0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u,
+	0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u,
+	0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au,
+	0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u,
+	0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u,
+	0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu,
+	0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u,
+	0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u,
+	0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u,
+	0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u,
+	0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu,
+	0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u,
+	0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u,
+	0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u,
+	0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u,
+	0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu,
+	0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau,
+	0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u,
+	0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u,
+	0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu,
+	0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u,
+	0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
+};
+
+inline uint get_reciprocal(const __local uchar *RCP, uint a) // returns the 32-bit multiplier that fast_div_v2 uses in place of a division by a; bit 31 of a is not used for the table lookup
+{
+	const uint index1 = (a & 0x7F000000U) >> 21; // bits 24..30 of a, times 8: byte offset of a pair of uints in RCP
+	const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; // bits 8..23 of a, re-centred to [-32768, 32767]
+
+	const uint r1 = *(const __local uint*)(RCP + index1); // first uint of the pair
+
+	uint r2_0 = *(const __local uint*)(RCP + index1 + 4); // second uint of the pair, holding two 16-bit halves
+	if (index2 > 0) r2_0 >>= 16; // upper half when index2 is positive, lower half otherwise
+	const int r2 = r2_0 & 0xFFFFU;
+
+	const uint r = r1 - (uint)(mul24(r2, index2) >> 6); // r2 is masked to 16 bits and index2 is a signed 16-bit value, so both fit mul24's 24-bit operands
+
+	const ulong lo0 = (ulong)(r) * a;
+	ulong lo = lo0 + ((ulong)(a) << 32); // (r + 2^32) * a, modulo 2^64
+
+	a >>= 1; // a is halved in place; every use below sees the halved value
+	const bool b = (a >= lo) || (lo >= lo0); // lo >= lo0 holds exactly when the 64-bit addition above did not wrap
+	lo = a - lo;
+
+	const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo;
+	return as_uint2(k).s1 + (b ? r : 0); // high 32 bits of k, plus r when b is set
+}
+
+inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) // returns (low 32 bits of the corrected quotient, low 32 bits of the corrected remainder) for a and b, without a hardware divide
+{
+	const uint r = get_reciprocal((const __local uchar *)RCP, b);
+	const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
+
+	ulong q;
+	((uint*)&q)[0] = as_uint2(k).s1; // low word of q is the high word of k
+
+#if defined(cl_amd_device_attribute_query) && (OPENCL_DRIVER_MAJOR == 14)
+	/* The AMD driver 14.XX is not able to compile `(k < a)`
+	 * https://github.com/fireice-uk/xmr-stak/issues/1922
+	 * This is a workaround for the broken compiler.
+	 */
+	 ulong whyAMDwhy;
+	((uint*)&whyAMDwhy)[0] = as_uint2(k).s0; // whyAMDwhy is rebuilt word by word and holds the same 64 bits as k
+	((uint*)&whyAMDwhy)[1] = as_uint2(k).s1;
+	((uint*)&q)[1] = (whyAMDwhy < a) ? 1U : 0U;
+#else
+	((uint*)&q)[1] = (k < a) ? 1U : 0U; // high word of q is 1 when the final `+ a` in k wrapped past 2^64, else 0
+#endif
+	
+	const long tmp = a - q * b; // signed, so a quotient estimate that is too large yields a negative value
+	const bool overshoot = (tmp < 0);
+	const bool undershoot = (tmp >= b);
+
+	return (uint2)(
+		as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), // quotient estimate adjusted by one in the needed direction
+		as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U) // remainder shifted by b to match the adjusted quotient
+	);
+}
+
+inline uint fast_sqrt_v2(const ulong n1) // float estimate plus two integer +/-1 corrections. NOTE(review): presumably the cryptonight v8 integer sqrt step - confirm against the CPU reference
+{
+	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); // float bit pattern: top 23 bits of n1's high word as mantissa, exponent field 64+127
+
+	float x1 = native_rsqrt(x);
+	x = native_sqrt(x);
+
+	// The following line does x1 *= 4294967296.0f;
+	x1 = as_float(as_uint(x1) + (32U << 23)); // adds 32 to the exponent field of x1
+
+	const uint x0 = as_uint(x) - (158U << 23); // bit pattern of x with 158 taken off its exponent field, then used as an integer
+	const long delta0 = n1 - (((long)(x0) * x0) << 18); // n1 minus x0 squared, scaled by 2^18
+	const float delta = convert_float_rte(as_int2(delta0).s1) * x1; // only the high 32 bits of delta0 are used
+
+	uint result = (x0 << 10) + convert_int_rte(delta);
+	const uint s = result >> 1;
+	const uint b = result & 1;
+
+	const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
+	if ((long)(x2 + b) > 0) --result; // the sign tests on x2 move the estimate down or up by one
+	if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+
+	return result;
+}
+
+#endif
+
+)==="
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
index 1a7c96f0e..22603853f 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
@@ -58,69 +58,69 @@ R"===(
 
 static const __constant ulong T0_G[] =
 {
-	0xc6a597f4a5f432c6UL, 0xf884eb9784976ff8UL, 0xee99c7b099b05eeeUL, 0xf68df78c8d8c7af6UL, 
-	0xff0de5170d17e8ffUL, 0xd6bdb7dcbddc0ad6UL, 0xdeb1a7c8b1c816deUL, 0x915439fc54fc6d91UL, 
-	0x6050c0f050f09060UL, 0x0203040503050702UL, 0xcea987e0a9e02eceUL, 0x567dac877d87d156UL, 
-	0xe719d52b192bcce7UL, 0xb56271a662a613b5UL, 0x4de69a31e6317c4dUL, 0xec9ac3b59ab559ecUL, 
-	0x8f4505cf45cf408fUL, 0x1f9d3ebc9dbca31fUL, 0x894009c040c04989UL, 0xfa87ef92879268faUL, 
-	0xef15c53f153fd0efUL, 0xb2eb7f26eb2694b2UL, 0x8ec90740c940ce8eUL, 0xfb0bed1d0b1de6fbUL, 
-	0x41ec822fec2f6e41UL, 0xb3677da967a91ab3UL, 0x5ffdbe1cfd1c435fUL, 0x45ea8a25ea256045UL, 
-	0x23bf46dabfdaf923UL, 0x53f7a602f7025153UL, 0xe496d3a196a145e4UL, 0x9b5b2ded5bed769bUL, 
-	0x75c2ea5dc25d2875UL, 0xe11cd9241c24c5e1UL, 0x3dae7ae9aee9d43dUL, 0x4c6a98be6abef24cUL, 
-	0x6c5ad8ee5aee826cUL, 0x7e41fcc341c3bd7eUL, 0xf502f1060206f3f5UL, 0x834f1dd14fd15283UL, 
-	0x685cd0e45ce48c68UL, 0x51f4a207f4075651UL, 0xd134b95c345c8dd1UL, 0xf908e9180818e1f9UL, 
-	0xe293dfae93ae4ce2UL, 0xab734d9573953eabUL, 0x6253c4f553f59762UL, 0x2a3f54413f416b2aUL, 
-	0x080c10140c141c08UL, 0x955231f652f66395UL, 0x46658caf65afe946UL, 0x9d5e21e25ee27f9dUL, 
-	0x3028607828784830UL, 0x37a16ef8a1f8cf37UL, 0x0a0f14110f111b0aUL, 0x2fb55ec4b5c4eb2fUL, 
-	0x0e091c1b091b150eUL, 0x2436485a365a7e24UL, 0x1b9b36b69bb6ad1bUL, 0xdf3da5473d4798dfUL, 
-	0xcd26816a266aa7cdUL, 0x4e699cbb69bbf54eUL, 0x7fcdfe4ccd4c337fUL, 0xea9fcfba9fba50eaUL, 
-	0x121b242d1b2d3f12UL, 0x1d9e3ab99eb9a41dUL, 0x5874b09c749cc458UL, 0x342e68722e724634UL, 
-	0x362d6c772d774136UL, 0xdcb2a3cdb2cd11dcUL, 0xb4ee7329ee299db4UL, 0x5bfbb616fb164d5bUL, 
-	0xa4f65301f601a5a4UL, 0x764decd74dd7a176UL, 0xb76175a361a314b7UL, 0x7dcefa49ce49347dUL, 
-	0x527ba48d7b8ddf52UL, 0xdd3ea1423e429fddUL, 0x5e71bc937193cd5eUL, 0x139726a297a2b113UL, 
-	0xa6f55704f504a2a6UL, 0xb96869b868b801b9UL, 0x0000000000000000UL, 0xc12c99742c74b5c1UL, 
-	0x406080a060a0e040UL, 0xe31fdd211f21c2e3UL, 0x79c8f243c8433a79UL, 0xb6ed772ced2c9ab6UL, 
-	0xd4beb3d9bed90dd4UL, 0x8d4601ca46ca478dUL, 0x67d9ce70d9701767UL, 0x724be4dd4bddaf72UL, 
-	0x94de3379de79ed94UL, 0x98d42b67d467ff98UL, 0xb0e87b23e82393b0UL, 0x854a11de4ade5b85UL, 
-	0xbb6b6dbd6bbd06bbUL, 0xc52a917e2a7ebbc5UL, 0x4fe59e34e5347b4fUL, 0xed16c13a163ad7edUL, 
-	0x86c51754c554d286UL, 0x9ad72f62d762f89aUL, 0x6655ccff55ff9966UL, 0x119422a794a7b611UL, 
-	0x8acf0f4acf4ac08aUL, 0xe910c9301030d9e9UL, 0x0406080a060a0e04UL, 0xfe81e798819866feUL, 
-	0xa0f05b0bf00baba0UL, 0x7844f0cc44ccb478UL, 0x25ba4ad5bad5f025UL, 0x4be3963ee33e754bUL, 
-	0xa2f35f0ef30eaca2UL, 0x5dfeba19fe19445dUL, 0x80c01b5bc05bdb80UL, 0x058a0a858a858005UL, 
-	0x3fad7eecadecd33fUL, 0x21bc42dfbcdffe21UL, 0x7048e0d848d8a870UL, 0xf104f90c040cfdf1UL, 
-	0x63dfc67adf7a1963UL, 0x77c1ee58c1582f77UL, 0xaf75459f759f30afUL, 0x426384a563a5e742UL, 
-	0x2030405030507020UL, 0xe51ad12e1a2ecbe5UL, 0xfd0ee1120e12effdUL, 0xbf6d65b76db708bfUL, 
-	0x814c19d44cd45581UL, 0x1814303c143c2418UL, 0x26354c5f355f7926UL, 0xc32f9d712f71b2c3UL, 
-	0xbee16738e13886beUL, 0x35a26afda2fdc835UL, 0x88cc0b4fcc4fc788UL, 0x2e395c4b394b652eUL, 
-	0x93573df957f96a93UL, 0x55f2aa0df20d5855UL, 0xfc82e39d829d61fcUL, 0x7a47f4c947c9b37aUL, 
-	0xc8ac8befacef27c8UL, 0xbae76f32e73288baUL, 0x322b647d2b7d4f32UL, 0xe695d7a495a442e6UL, 
-	0xc0a09bfba0fb3bc0UL, 0x199832b398b3aa19UL, 0x9ed12768d168f69eUL, 0xa37f5d817f8122a3UL, 
-	0x446688aa66aaee44UL, 0x547ea8827e82d654UL, 0x3bab76e6abe6dd3bUL, 0x0b83169e839e950bUL, 
-	0x8cca0345ca45c98cUL, 0xc729957b297bbcc7UL, 0x6bd3d66ed36e056bUL, 0x283c50443c446c28UL, 
-	0xa779558b798b2ca7UL, 0xbce2633de23d81bcUL, 0x161d2c271d273116UL, 0xad76419a769a37adUL, 
-	0xdb3bad4d3b4d96dbUL, 0x6456c8fa56fa9e64UL, 0x744ee8d24ed2a674UL, 0x141e28221e223614UL, 
-	0x92db3f76db76e492UL, 0x0c0a181e0a1e120cUL, 0x486c90b46cb4fc48UL, 0xb8e46b37e4378fb8UL, 
-	0x9f5d25e75de7789fUL, 0xbd6e61b26eb20fbdUL, 0x43ef862aef2a6943UL, 0xc4a693f1a6f135c4UL, 
-	0x39a872e3a8e3da39UL, 0x31a462f7a4f7c631UL, 0xd337bd5937598ad3UL, 0xf28bff868b8674f2UL, 
-	0xd532b156325683d5UL, 0x8b430dc543c54e8bUL, 0x6e59dceb59eb856eUL, 0xdab7afc2b7c218daUL, 
-	0x018c028f8c8f8e01UL, 0xb16479ac64ac1db1UL, 0x9cd2236dd26df19cUL, 0x49e0923be03b7249UL, 
-	0xd8b4abc7b4c71fd8UL, 0xacfa4315fa15b9acUL, 0xf307fd090709faf3UL, 0xcf25856f256fa0cfUL, 
-	0xcaaf8feaafea20caUL, 0xf48ef3898e897df4UL, 0x47e98e20e9206747UL, 0x1018202818283810UL, 
-	0x6fd5de64d5640b6fUL, 0xf088fb83888373f0UL, 0x4a6f94b16fb1fb4aUL, 0x5c72b8967296ca5cUL, 
-	0x3824706c246c5438UL, 0x57f1ae08f1085f57UL, 0x73c7e652c7522173UL, 0x975135f351f36497UL, 
-	0xcb238d652365aecbUL, 0xa17c59847c8425a1UL, 0xe89ccbbf9cbf57e8UL, 0x3e217c6321635d3eUL, 
-	0x96dd377cdd7cea96UL, 0x61dcc27fdc7f1e61UL, 0x0d861a9186919c0dUL, 0x0f851e9485949b0fUL, 
-	0xe090dbab90ab4be0UL, 0x7c42f8c642c6ba7cUL, 0x71c4e257c4572671UL, 0xccaa83e5aae529ccUL, 
-	0x90d83b73d873e390UL, 0x06050c0f050f0906UL, 0xf701f5030103f4f7UL, 0x1c12383612362a1cUL, 
-	0xc2a39ffea3fe3cc2UL, 0x6a5fd4e15fe18b6aUL, 0xaef94710f910beaeUL, 0x69d0d26bd06b0269UL, 
-	0x17912ea891a8bf17UL, 0x995829e858e87199UL, 0x3a2774692769533aUL, 0x27b94ed0b9d0f727UL, 
-	0xd938a948384891d9UL, 0xeb13cd351335deebUL, 0x2bb356ceb3cee52bUL, 0x2233445533557722UL, 
-	0xd2bbbfd6bbd604d2UL, 0xa9704990709039a9UL, 0x07890e8089808707UL, 0x33a766f2a7f2c133UL, 
-	0x2db65ac1b6c1ec2dUL, 0x3c22786622665a3cUL, 0x15922aad92adb815UL, 0xc92089602060a9c9UL, 
-	0x874915db49db5c87UL, 0xaaff4f1aff1ab0aaUL, 0x5078a0887888d850UL, 0xa57a518e7a8e2ba5UL, 
-	0x038f068a8f8a8903UL, 0x59f8b213f8134a59UL, 0x0980129b809b9209UL, 0x1a1734391739231aUL, 
-	0x65daca75da751065UL, 0xd731b553315384d7UL, 0x84c61351c651d584UL, 0xd0b8bbd3b8d303d0UL, 
-	0x82c31f5ec35edc82UL, 0x29b052cbb0cbe229UL, 0x5a77b4997799c35aUL, 0x1e113c3311332d1eUL, 
+	0xc6a597f4a5f432c6UL, 0xf884eb9784976ff8UL, 0xee99c7b099b05eeeUL, 0xf68df78c8d8c7af6UL,
+	0xff0de5170d17e8ffUL, 0xd6bdb7dcbddc0ad6UL, 0xdeb1a7c8b1c816deUL, 0x915439fc54fc6d91UL,
+	0x6050c0f050f09060UL, 0x0203040503050702UL, 0xcea987e0a9e02eceUL, 0x567dac877d87d156UL,
+	0xe719d52b192bcce7UL, 0xb56271a662a613b5UL, 0x4de69a31e6317c4dUL, 0xec9ac3b59ab559ecUL,
+	0x8f4505cf45cf408fUL, 0x1f9d3ebc9dbca31fUL, 0x894009c040c04989UL, 0xfa87ef92879268faUL,
+	0xef15c53f153fd0efUL, 0xb2eb7f26eb2694b2UL, 0x8ec90740c940ce8eUL, 0xfb0bed1d0b1de6fbUL,
+	0x41ec822fec2f6e41UL, 0xb3677da967a91ab3UL, 0x5ffdbe1cfd1c435fUL, 0x45ea8a25ea256045UL,
+	0x23bf46dabfdaf923UL, 0x53f7a602f7025153UL, 0xe496d3a196a145e4UL, 0x9b5b2ded5bed769bUL,
+	0x75c2ea5dc25d2875UL, 0xe11cd9241c24c5e1UL, 0x3dae7ae9aee9d43dUL, 0x4c6a98be6abef24cUL,
+	0x6c5ad8ee5aee826cUL, 0x7e41fcc341c3bd7eUL, 0xf502f1060206f3f5UL, 0x834f1dd14fd15283UL,
+	0x685cd0e45ce48c68UL, 0x51f4a207f4075651UL, 0xd134b95c345c8dd1UL, 0xf908e9180818e1f9UL,
+	0xe293dfae93ae4ce2UL, 0xab734d9573953eabUL, 0x6253c4f553f59762UL, 0x2a3f54413f416b2aUL,
+	0x080c10140c141c08UL, 0x955231f652f66395UL, 0x46658caf65afe946UL, 0x9d5e21e25ee27f9dUL,
+	0x3028607828784830UL, 0x37a16ef8a1f8cf37UL, 0x0a0f14110f111b0aUL, 0x2fb55ec4b5c4eb2fUL,
+	0x0e091c1b091b150eUL, 0x2436485a365a7e24UL, 0x1b9b36b69bb6ad1bUL, 0xdf3da5473d4798dfUL,
+	0xcd26816a266aa7cdUL, 0x4e699cbb69bbf54eUL, 0x7fcdfe4ccd4c337fUL, 0xea9fcfba9fba50eaUL,
+	0x121b242d1b2d3f12UL, 0x1d9e3ab99eb9a41dUL, 0x5874b09c749cc458UL, 0x342e68722e724634UL,
+	0x362d6c772d774136UL, 0xdcb2a3cdb2cd11dcUL, 0xb4ee7329ee299db4UL, 0x5bfbb616fb164d5bUL,
+	0xa4f65301f601a5a4UL, 0x764decd74dd7a176UL, 0xb76175a361a314b7UL, 0x7dcefa49ce49347dUL,
+	0x527ba48d7b8ddf52UL, 0xdd3ea1423e429fddUL, 0x5e71bc937193cd5eUL, 0x139726a297a2b113UL,
+	0xa6f55704f504a2a6UL, 0xb96869b868b801b9UL, 0x0000000000000000UL, 0xc12c99742c74b5c1UL,
+	0x406080a060a0e040UL, 0xe31fdd211f21c2e3UL, 0x79c8f243c8433a79UL, 0xb6ed772ced2c9ab6UL,
+	0xd4beb3d9bed90dd4UL, 0x8d4601ca46ca478dUL, 0x67d9ce70d9701767UL, 0x724be4dd4bddaf72UL,
+	0x94de3379de79ed94UL, 0x98d42b67d467ff98UL, 0xb0e87b23e82393b0UL, 0x854a11de4ade5b85UL,
+	0xbb6b6dbd6bbd06bbUL, 0xc52a917e2a7ebbc5UL, 0x4fe59e34e5347b4fUL, 0xed16c13a163ad7edUL,
+	0x86c51754c554d286UL, 0x9ad72f62d762f89aUL, 0x6655ccff55ff9966UL, 0x119422a794a7b611UL,
+	0x8acf0f4acf4ac08aUL, 0xe910c9301030d9e9UL, 0x0406080a060a0e04UL, 0xfe81e798819866feUL,
+	0xa0f05b0bf00baba0UL, 0x7844f0cc44ccb478UL, 0x25ba4ad5bad5f025UL, 0x4be3963ee33e754bUL,
+	0xa2f35f0ef30eaca2UL, 0x5dfeba19fe19445dUL, 0x80c01b5bc05bdb80UL, 0x058a0a858a858005UL,
+	0x3fad7eecadecd33fUL, 0x21bc42dfbcdffe21UL, 0x7048e0d848d8a870UL, 0xf104f90c040cfdf1UL,
+	0x63dfc67adf7a1963UL, 0x77c1ee58c1582f77UL, 0xaf75459f759f30afUL, 0x426384a563a5e742UL,
+	0x2030405030507020UL, 0xe51ad12e1a2ecbe5UL, 0xfd0ee1120e12effdUL, 0xbf6d65b76db708bfUL,
+	0x814c19d44cd45581UL, 0x1814303c143c2418UL, 0x26354c5f355f7926UL, 0xc32f9d712f71b2c3UL,
+	0xbee16738e13886beUL, 0x35a26afda2fdc835UL, 0x88cc0b4fcc4fc788UL, 0x2e395c4b394b652eUL,
+	0x93573df957f96a93UL, 0x55f2aa0df20d5855UL, 0xfc82e39d829d61fcUL, 0x7a47f4c947c9b37aUL,
+	0xc8ac8befacef27c8UL, 0xbae76f32e73288baUL, 0x322b647d2b7d4f32UL, 0xe695d7a495a442e6UL,
+	0xc0a09bfba0fb3bc0UL, 0x199832b398b3aa19UL, 0x9ed12768d168f69eUL, 0xa37f5d817f8122a3UL,
+	0x446688aa66aaee44UL, 0x547ea8827e82d654UL, 0x3bab76e6abe6dd3bUL, 0x0b83169e839e950bUL,
+	0x8cca0345ca45c98cUL, 0xc729957b297bbcc7UL, 0x6bd3d66ed36e056bUL, 0x283c50443c446c28UL,
+	0xa779558b798b2ca7UL, 0xbce2633de23d81bcUL, 0x161d2c271d273116UL, 0xad76419a769a37adUL,
+	0xdb3bad4d3b4d96dbUL, 0x6456c8fa56fa9e64UL, 0x744ee8d24ed2a674UL, 0x141e28221e223614UL,
+	0x92db3f76db76e492UL, 0x0c0a181e0a1e120cUL, 0x486c90b46cb4fc48UL, 0xb8e46b37e4378fb8UL,
+	0x9f5d25e75de7789fUL, 0xbd6e61b26eb20fbdUL, 0x43ef862aef2a6943UL, 0xc4a693f1a6f135c4UL,
+	0x39a872e3a8e3da39UL, 0x31a462f7a4f7c631UL, 0xd337bd5937598ad3UL, 0xf28bff868b8674f2UL,
+	0xd532b156325683d5UL, 0x8b430dc543c54e8bUL, 0x6e59dceb59eb856eUL, 0xdab7afc2b7c218daUL,
+	0x018c028f8c8f8e01UL, 0xb16479ac64ac1db1UL, 0x9cd2236dd26df19cUL, 0x49e0923be03b7249UL,
+	0xd8b4abc7b4c71fd8UL, 0xacfa4315fa15b9acUL, 0xf307fd090709faf3UL, 0xcf25856f256fa0cfUL,
+	0xcaaf8feaafea20caUL, 0xf48ef3898e897df4UL, 0x47e98e20e9206747UL, 0x1018202818283810UL,
+	0x6fd5de64d5640b6fUL, 0xf088fb83888373f0UL, 0x4a6f94b16fb1fb4aUL, 0x5c72b8967296ca5cUL,
+	0x3824706c246c5438UL, 0x57f1ae08f1085f57UL, 0x73c7e652c7522173UL, 0x975135f351f36497UL,
+	0xcb238d652365aecbUL, 0xa17c59847c8425a1UL, 0xe89ccbbf9cbf57e8UL, 0x3e217c6321635d3eUL,
+	0x96dd377cdd7cea96UL, 0x61dcc27fdc7f1e61UL, 0x0d861a9186919c0dUL, 0x0f851e9485949b0fUL,
+	0xe090dbab90ab4be0UL, 0x7c42f8c642c6ba7cUL, 0x71c4e257c4572671UL, 0xccaa83e5aae529ccUL,
+	0x90d83b73d873e390UL, 0x06050c0f050f0906UL, 0xf701f5030103f4f7UL, 0x1c12383612362a1cUL,
+	0xc2a39ffea3fe3cc2UL, 0x6a5fd4e15fe18b6aUL, 0xaef94710f910beaeUL, 0x69d0d26bd06b0269UL,
+	0x17912ea891a8bf17UL, 0x995829e858e87199UL, 0x3a2774692769533aUL, 0x27b94ed0b9d0f727UL,
+	0xd938a948384891d9UL, 0xeb13cd351335deebUL, 0x2bb356ceb3cee52bUL, 0x2233445533557722UL,
+	0xd2bbbfd6bbd604d2UL, 0xa9704990709039a9UL, 0x07890e8089808707UL, 0x33a766f2a7f2c133UL,
+	0x2db65ac1b6c1ec2dUL, 0x3c22786622665a3cUL, 0x15922aad92adb815UL, 0xc92089602060a9c9UL,
+	0x874915db49db5c87UL, 0xaaff4f1aff1ab0aaUL, 0x5078a0887888d850UL, 0xa57a518e7a8e2ba5UL,
+	0x038f068a8f8a8903UL, 0x59f8b213f8134a59UL, 0x0980129b809b9209UL, 0x1a1734391739231aUL,
+	0x65daca75da751065UL, 0xd731b553315384d7UL, 0x84c61351c651d584UL, 0xd0b8bbd3b8d303d0UL,
+	0x82c31f5ec35edc82UL, 0x29b052cbb0cbe229UL, 0x5a77b4997799c35aUL, 0x1e113c3311332d1eUL,
 	0x7bcbf646cb463d7bUL, 0xa8fc4b1ffc1fb7a8UL, 0x6dd6da61d6610c6dUL, 0x2c3a584e3a4e622cUL
 };
 
@@ -129,69 +129,69 @@ R"===(
 
 static const __constant ulong T4_G[] =
 {
-	0xA5F432C6C6A597F4UL, 0x84976FF8F884EB97UL, 0x99B05EEEEE99C7B0UL, 0x8D8C7AF6F68DF78CUL, 
-	0x0D17E8FFFF0DE517UL, 0xBDDC0AD6D6BDB7DCUL, 0xB1C816DEDEB1A7C8UL, 0x54FC6D91915439FCUL, 
-	0x50F090606050C0F0UL, 0x0305070202030405UL, 0xA9E02ECECEA987E0UL, 0x7D87D156567DAC87UL, 
-	0x192BCCE7E719D52BUL, 0x62A613B5B56271A6UL, 0xE6317C4D4DE69A31UL, 0x9AB559ECEC9AC3B5UL, 
-	0x45CF408F8F4505CFUL, 0x9DBCA31F1F9D3EBCUL, 0x40C04989894009C0UL, 0x879268FAFA87EF92UL, 
-	0x153FD0EFEF15C53FUL, 0xEB2694B2B2EB7F26UL, 0xC940CE8E8EC90740UL, 0x0B1DE6FBFB0BED1DUL, 
-	0xEC2F6E4141EC822FUL, 0x67A91AB3B3677DA9UL, 0xFD1C435F5FFDBE1CUL, 0xEA25604545EA8A25UL, 
-	0xBFDAF92323BF46DAUL, 0xF702515353F7A602UL, 0x96A145E4E496D3A1UL, 0x5BED769B9B5B2DEDUL, 
-	0xC25D287575C2EA5DUL, 0x1C24C5E1E11CD924UL, 0xAEE9D43D3DAE7AE9UL, 0x6ABEF24C4C6A98BEUL, 
-	0x5AEE826C6C5AD8EEUL, 0x41C3BD7E7E41FCC3UL, 0x0206F3F5F502F106UL, 0x4FD15283834F1DD1UL, 
-	0x5CE48C68685CD0E4UL, 0xF407565151F4A207UL, 0x345C8DD1D134B95CUL, 0x0818E1F9F908E918UL, 
-	0x93AE4CE2E293DFAEUL, 0x73953EABAB734D95UL, 0x53F597626253C4F5UL, 0x3F416B2A2A3F5441UL, 
-	0x0C141C08080C1014UL, 0x52F66395955231F6UL, 0x65AFE94646658CAFUL, 0x5EE27F9D9D5E21E2UL, 
-	0x2878483030286078UL, 0xA1F8CF3737A16EF8UL, 0x0F111B0A0A0F1411UL, 0xB5C4EB2F2FB55EC4UL, 
-	0x091B150E0E091C1BUL, 0x365A7E242436485AUL, 0x9BB6AD1B1B9B36B6UL, 0x3D4798DFDF3DA547UL, 
-	0x266AA7CDCD26816AUL, 0x69BBF54E4E699CBBUL, 0xCD4C337F7FCDFE4CUL, 0x9FBA50EAEA9FCFBAUL, 
-	0x1B2D3F12121B242DUL, 0x9EB9A41D1D9E3AB9UL, 0x749CC4585874B09CUL, 0x2E724634342E6872UL, 
-	0x2D774136362D6C77UL, 0xB2CD11DCDCB2A3CDUL, 0xEE299DB4B4EE7329UL, 0xFB164D5B5BFBB616UL, 
-	0xF601A5A4A4F65301UL, 0x4DD7A176764DECD7UL, 0x61A314B7B76175A3UL, 0xCE49347D7DCEFA49UL, 
-	0x7B8DDF52527BA48DUL, 0x3E429FDDDD3EA142UL, 0x7193CD5E5E71BC93UL, 0x97A2B113139726A2UL, 
-	0xF504A2A6A6F55704UL, 0x68B801B9B96869B8UL, 0x0000000000000000UL, 0x2C74B5C1C12C9974UL, 
-	0x60A0E040406080A0UL, 0x1F21C2E3E31FDD21UL, 0xC8433A7979C8F243UL, 0xED2C9AB6B6ED772CUL, 
-	0xBED90DD4D4BEB3D9UL, 0x46CA478D8D4601CAUL, 0xD970176767D9CE70UL, 0x4BDDAF72724BE4DDUL, 
-	0xDE79ED9494DE3379UL, 0xD467FF9898D42B67UL, 0xE82393B0B0E87B23UL, 0x4ADE5B85854A11DEUL, 
-	0x6BBD06BBBB6B6DBDUL, 0x2A7EBBC5C52A917EUL, 0xE5347B4F4FE59E34UL, 0x163AD7EDED16C13AUL, 
-	0xC554D28686C51754UL, 0xD762F89A9AD72F62UL, 0x55FF99666655CCFFUL, 0x94A7B611119422A7UL, 
-	0xCF4AC08A8ACF0F4AUL, 0x1030D9E9E910C930UL, 0x060A0E040406080AUL, 0x819866FEFE81E798UL, 
-	0xF00BABA0A0F05B0BUL, 0x44CCB4787844F0CCUL, 0xBAD5F02525BA4AD5UL, 0xE33E754B4BE3963EUL, 
-	0xF30EACA2A2F35F0EUL, 0xFE19445D5DFEBA19UL, 0xC05BDB8080C01B5BUL, 0x8A858005058A0A85UL, 
-	0xADECD33F3FAD7EECUL, 0xBCDFFE2121BC42DFUL, 0x48D8A8707048E0D8UL, 0x040CFDF1F104F90CUL, 
-	0xDF7A196363DFC67AUL, 0xC1582F7777C1EE58UL, 0x759F30AFAF75459FUL, 0x63A5E742426384A5UL, 
-	0x3050702020304050UL, 0x1A2ECBE5E51AD12EUL, 0x0E12EFFDFD0EE112UL, 0x6DB708BFBF6D65B7UL, 
-	0x4CD45581814C19D4UL, 0x143C24181814303CUL, 0x355F792626354C5FUL, 0x2F71B2C3C32F9D71UL, 
-	0xE13886BEBEE16738UL, 0xA2FDC83535A26AFDUL, 0xCC4FC78888CC0B4FUL, 0x394B652E2E395C4BUL, 
-	0x57F96A9393573DF9UL, 0xF20D585555F2AA0DUL, 0x829D61FCFC82E39DUL, 0x47C9B37A7A47F4C9UL, 
-	0xACEF27C8C8AC8BEFUL, 0xE73288BABAE76F32UL, 0x2B7D4F32322B647DUL, 0x95A442E6E695D7A4UL, 
-	0xA0FB3BC0C0A09BFBUL, 0x98B3AA19199832B3UL, 0xD168F69E9ED12768UL, 0x7F8122A3A37F5D81UL, 
-	0x66AAEE44446688AAUL, 0x7E82D654547EA882UL, 0xABE6DD3B3BAB76E6UL, 0x839E950B0B83169EUL, 
-	0xCA45C98C8CCA0345UL, 0x297BBCC7C729957BUL, 0xD36E056B6BD3D66EUL, 0x3C446C28283C5044UL, 
-	0x798B2CA7A779558BUL, 0xE23D81BCBCE2633DUL, 0x1D273116161D2C27UL, 0x769A37ADAD76419AUL, 
-	0x3B4D96DBDB3BAD4DUL, 0x56FA9E646456C8FAUL, 0x4ED2A674744EE8D2UL, 0x1E223614141E2822UL, 
-	0xDB76E49292DB3F76UL, 0x0A1E120C0C0A181EUL, 0x6CB4FC48486C90B4UL, 0xE4378FB8B8E46B37UL, 
-	0x5DE7789F9F5D25E7UL, 0x6EB20FBDBD6E61B2UL, 0xEF2A694343EF862AUL, 0xA6F135C4C4A693F1UL, 
-	0xA8E3DA3939A872E3UL, 0xA4F7C63131A462F7UL, 0x37598AD3D337BD59UL, 0x8B8674F2F28BFF86UL, 
-	0x325683D5D532B156UL, 0x43C54E8B8B430DC5UL, 0x59EB856E6E59DCEBUL, 0xB7C218DADAB7AFC2UL, 
-	0x8C8F8E01018C028FUL, 0x64AC1DB1B16479ACUL, 0xD26DF19C9CD2236DUL, 0xE03B724949E0923BUL, 
-	0xB4C71FD8D8B4ABC7UL, 0xFA15B9ACACFA4315UL, 0x0709FAF3F307FD09UL, 0x256FA0CFCF25856FUL, 
-	0xAFEA20CACAAF8FEAUL, 0x8E897DF4F48EF389UL, 0xE920674747E98E20UL, 0x1828381010182028UL, 
-	0xD5640B6F6FD5DE64UL, 0x888373F0F088FB83UL, 0x6FB1FB4A4A6F94B1UL, 0x7296CA5C5C72B896UL, 
-	0x246C54383824706CUL, 0xF1085F5757F1AE08UL, 0xC752217373C7E652UL, 0x51F36497975135F3UL, 
-	0x2365AECBCB238D65UL, 0x7C8425A1A17C5984UL, 0x9CBF57E8E89CCBBFUL, 0x21635D3E3E217C63UL, 
-	0xDD7CEA9696DD377CUL, 0xDC7F1E6161DCC27FUL, 0x86919C0D0D861A91UL, 0x85949B0F0F851E94UL, 
-	0x90AB4BE0E090DBABUL, 0x42C6BA7C7C42F8C6UL, 0xC457267171C4E257UL, 0xAAE529CCCCAA83E5UL, 
-	0xD873E39090D83B73UL, 0x050F090606050C0FUL, 0x0103F4F7F701F503UL, 0x12362A1C1C123836UL, 
-	0xA3FE3CC2C2A39FFEUL, 0x5FE18B6A6A5FD4E1UL, 0xF910BEAEAEF94710UL, 0xD06B026969D0D26BUL, 
-	0x91A8BF1717912EA8UL, 0x58E87199995829E8UL, 0x2769533A3A277469UL, 0xB9D0F72727B94ED0UL, 
-	0x384891D9D938A948UL, 0x1335DEEBEB13CD35UL, 0xB3CEE52B2BB356CEUL, 0x3355772222334455UL, 
-	0xBBD604D2D2BBBFD6UL, 0x709039A9A9704990UL, 0x8980870707890E80UL, 0xA7F2C13333A766F2UL, 
-	0xB6C1EC2D2DB65AC1UL, 0x22665A3C3C227866UL, 0x92ADB81515922AADUL, 0x2060A9C9C9208960UL, 
-	0x49DB5C87874915DBUL, 0xFF1AB0AAAAFF4F1AUL, 0x7888D8505078A088UL, 0x7A8E2BA5A57A518EUL, 
-	0x8F8A8903038F068AUL, 0xF8134A5959F8B213UL, 0x809B92090980129BUL, 0x1739231A1A173439UL, 
-	0xDA75106565DACA75UL, 0x315384D7D731B553UL, 0xC651D58484C61351UL, 0xB8D303D0D0B8BBD3UL, 
-	0xC35EDC8282C31F5EUL, 0xB0CBE22929B052CBUL, 0x7799C35A5A77B499UL, 0x11332D1E1E113C33UL, 
+	0xA5F432C6C6A597F4UL, 0x84976FF8F884EB97UL, 0x99B05EEEEE99C7B0UL, 0x8D8C7AF6F68DF78CUL,
+	0x0D17E8FFFF0DE517UL, 0xBDDC0AD6D6BDB7DCUL, 0xB1C816DEDEB1A7C8UL, 0x54FC6D91915439FCUL,
+	0x50F090606050C0F0UL, 0x0305070202030405UL, 0xA9E02ECECEA987E0UL, 0x7D87D156567DAC87UL,
+	0x192BCCE7E719D52BUL, 0x62A613B5B56271A6UL, 0xE6317C4D4DE69A31UL, 0x9AB559ECEC9AC3B5UL,
+	0x45CF408F8F4505CFUL, 0x9DBCA31F1F9D3EBCUL, 0x40C04989894009C0UL, 0x879268FAFA87EF92UL,
+	0x153FD0EFEF15C53FUL, 0xEB2694B2B2EB7F26UL, 0xC940CE8E8EC90740UL, 0x0B1DE6FBFB0BED1DUL,
+	0xEC2F6E4141EC822FUL, 0x67A91AB3B3677DA9UL, 0xFD1C435F5FFDBE1CUL, 0xEA25604545EA8A25UL,
+	0xBFDAF92323BF46DAUL, 0xF702515353F7A602UL, 0x96A145E4E496D3A1UL, 0x5BED769B9B5B2DEDUL,
+	0xC25D287575C2EA5DUL, 0x1C24C5E1E11CD924UL, 0xAEE9D43D3DAE7AE9UL, 0x6ABEF24C4C6A98BEUL,
+	0x5AEE826C6C5AD8EEUL, 0x41C3BD7E7E41FCC3UL, 0x0206F3F5F502F106UL, 0x4FD15283834F1DD1UL,
+	0x5CE48C68685CD0E4UL, 0xF407565151F4A207UL, 0x345C8DD1D134B95CUL, 0x0818E1F9F908E918UL,
+	0x93AE4CE2E293DFAEUL, 0x73953EABAB734D95UL, 0x53F597626253C4F5UL, 0x3F416B2A2A3F5441UL,
+	0x0C141C08080C1014UL, 0x52F66395955231F6UL, 0x65AFE94646658CAFUL, 0x5EE27F9D9D5E21E2UL,
+	0x2878483030286078UL, 0xA1F8CF3737A16EF8UL, 0x0F111B0A0A0F1411UL, 0xB5C4EB2F2FB55EC4UL,
+	0x091B150E0E091C1BUL, 0x365A7E242436485AUL, 0x9BB6AD1B1B9B36B6UL, 0x3D4798DFDF3DA547UL,
+	0x266AA7CDCD26816AUL, 0x69BBF54E4E699CBBUL, 0xCD4C337F7FCDFE4CUL, 0x9FBA50EAEA9FCFBAUL,
+	0x1B2D3F12121B242DUL, 0x9EB9A41D1D9E3AB9UL, 0x749CC4585874B09CUL, 0x2E724634342E6872UL,
+	0x2D774136362D6C77UL, 0xB2CD11DCDCB2A3CDUL, 0xEE299DB4B4EE7329UL, 0xFB164D5B5BFBB616UL,
+	0xF601A5A4A4F65301UL, 0x4DD7A176764DECD7UL, 0x61A314B7B76175A3UL, 0xCE49347D7DCEFA49UL,
+	0x7B8DDF52527BA48DUL, 0x3E429FDDDD3EA142UL, 0x7193CD5E5E71BC93UL, 0x97A2B113139726A2UL,
+	0xF504A2A6A6F55704UL, 0x68B801B9B96869B8UL, 0x0000000000000000UL, 0x2C74B5C1C12C9974UL,
+	0x60A0E040406080A0UL, 0x1F21C2E3E31FDD21UL, 0xC8433A7979C8F243UL, 0xED2C9AB6B6ED772CUL,
+	0xBED90DD4D4BEB3D9UL, 0x46CA478D8D4601CAUL, 0xD970176767D9CE70UL, 0x4BDDAF72724BE4DDUL,
+	0xDE79ED9494DE3379UL, 0xD467FF9898D42B67UL, 0xE82393B0B0E87B23UL, 0x4ADE5B85854A11DEUL,
+	0x6BBD06BBBB6B6DBDUL, 0x2A7EBBC5C52A917EUL, 0xE5347B4F4FE59E34UL, 0x163AD7EDED16C13AUL,
+	0xC554D28686C51754UL, 0xD762F89A9AD72F62UL, 0x55FF99666655CCFFUL, 0x94A7B611119422A7UL,
+	0xCF4AC08A8ACF0F4AUL, 0x1030D9E9E910C930UL, 0x060A0E040406080AUL, 0x819866FEFE81E798UL,
+	0xF00BABA0A0F05B0BUL, 0x44CCB4787844F0CCUL, 0xBAD5F02525BA4AD5UL, 0xE33E754B4BE3963EUL,
+	0xF30EACA2A2F35F0EUL, 0xFE19445D5DFEBA19UL, 0xC05BDB8080C01B5BUL, 0x8A858005058A0A85UL,
+	0xADECD33F3FAD7EECUL, 0xBCDFFE2121BC42DFUL, 0x48D8A8707048E0D8UL, 0x040CFDF1F104F90CUL,
+	0xDF7A196363DFC67AUL, 0xC1582F7777C1EE58UL, 0x759F30AFAF75459FUL, 0x63A5E742426384A5UL,
+	0x3050702020304050UL, 0x1A2ECBE5E51AD12EUL, 0x0E12EFFDFD0EE112UL, 0x6DB708BFBF6D65B7UL,
+	0x4CD45581814C19D4UL, 0x143C24181814303CUL, 0x355F792626354C5FUL, 0x2F71B2C3C32F9D71UL,
+	0xE13886BEBEE16738UL, 0xA2FDC83535A26AFDUL, 0xCC4FC78888CC0B4FUL, 0x394B652E2E395C4BUL,
+	0x57F96A9393573DF9UL, 0xF20D585555F2AA0DUL, 0x829D61FCFC82E39DUL, 0x47C9B37A7A47F4C9UL,
+	0xACEF27C8C8AC8BEFUL, 0xE73288BABAE76F32UL, 0x2B7D4F32322B647DUL, 0x95A442E6E695D7A4UL,
+	0xA0FB3BC0C0A09BFBUL, 0x98B3AA19199832B3UL, 0xD168F69E9ED12768UL, 0x7F8122A3A37F5D81UL,
+	0x66AAEE44446688AAUL, 0x7E82D654547EA882UL, 0xABE6DD3B3BAB76E6UL, 0x839E950B0B83169EUL,
+	0xCA45C98C8CCA0345UL, 0x297BBCC7C729957BUL, 0xD36E056B6BD3D66EUL, 0x3C446C28283C5044UL,
+	0x798B2CA7A779558BUL, 0xE23D81BCBCE2633DUL, 0x1D273116161D2C27UL, 0x769A37ADAD76419AUL,
+	0x3B4D96DBDB3BAD4DUL, 0x56FA9E646456C8FAUL, 0x4ED2A674744EE8D2UL, 0x1E223614141E2822UL,
+	0xDB76E49292DB3F76UL, 0x0A1E120C0C0A181EUL, 0x6CB4FC48486C90B4UL, 0xE4378FB8B8E46B37UL,
+	0x5DE7789F9F5D25E7UL, 0x6EB20FBDBD6E61B2UL, 0xEF2A694343EF862AUL, 0xA6F135C4C4A693F1UL,
+	0xA8E3DA3939A872E3UL, 0xA4F7C63131A462F7UL, 0x37598AD3D337BD59UL, 0x8B8674F2F28BFF86UL,
+	0x325683D5D532B156UL, 0x43C54E8B8B430DC5UL, 0x59EB856E6E59DCEBUL, 0xB7C218DADAB7AFC2UL,
+	0x8C8F8E01018C028FUL, 0x64AC1DB1B16479ACUL, 0xD26DF19C9CD2236DUL, 0xE03B724949E0923BUL,
+	0xB4C71FD8D8B4ABC7UL, 0xFA15B9ACACFA4315UL, 0x0709FAF3F307FD09UL, 0x256FA0CFCF25856FUL,
+	0xAFEA20CACAAF8FEAUL, 0x8E897DF4F48EF389UL, 0xE920674747E98E20UL, 0x1828381010182028UL,
+	0xD5640B6F6FD5DE64UL, 0x888373F0F088FB83UL, 0x6FB1FB4A4A6F94B1UL, 0x7296CA5C5C72B896UL,
+	0x246C54383824706CUL, 0xF1085F5757F1AE08UL, 0xC752217373C7E652UL, 0x51F36497975135F3UL,
+	0x2365AECBCB238D65UL, 0x7C8425A1A17C5984UL, 0x9CBF57E8E89CCBBFUL, 0x21635D3E3E217C63UL,
+	0xDD7CEA9696DD377CUL, 0xDC7F1E6161DCC27FUL, 0x86919C0D0D861A91UL, 0x85949B0F0F851E94UL,
+	0x90AB4BE0E090DBABUL, 0x42C6BA7C7C42F8C6UL, 0xC457267171C4E257UL, 0xAAE529CCCCAA83E5UL,
+	0xD873E39090D83B73UL, 0x050F090606050C0FUL, 0x0103F4F7F701F503UL, 0x12362A1C1C123836UL,
+	0xA3FE3CC2C2A39FFEUL, 0x5FE18B6A6A5FD4E1UL, 0xF910BEAEAEF94710UL, 0xD06B026969D0D26BUL,
+	0x91A8BF1717912EA8UL, 0x58E87199995829E8UL, 0x2769533A3A277469UL, 0xB9D0F72727B94ED0UL,
+	0x384891D9D938A948UL, 0x1335DEEBEB13CD35UL, 0xB3CEE52B2BB356CEUL, 0x3355772222334455UL,
+	0xBBD604D2D2BBBFD6UL, 0x709039A9A9704990UL, 0x8980870707890E80UL, 0xA7F2C13333A766F2UL,
+	0xB6C1EC2D2DB65AC1UL, 0x22665A3C3C227866UL, 0x92ADB81515922AADUL, 0x2060A9C9C9208960UL,
+	0x49DB5C87874915DBUL, 0xFF1AB0AAAAFF4F1AUL, 0x7888D8505078A088UL, 0x7A8E2BA5A57A518EUL,
+	0x8F8A8903038F068AUL, 0xF8134A5959F8B213UL, 0x809B92090980129BUL, 0x1739231A1A173439UL,
+	0xDA75106565DACA75UL, 0x315384D7D731B553UL, 0xC651D58484C61351UL, 0xB8D303D0D0B8BBD3UL,
+	0xC35EDC8282C31F5EUL, 0xB0CBE22929B052CBUL, 0x7799C35A5A77B499UL, 0x11332D1E1E113C33UL,
 	0xCB463D7B7BCBF646UL, 0xFC1FB7A8A8FC4B1FUL, 0xD6610C6D6DD6DA61UL, 0x3A4E622C2C3A584EUL
 };
 
@@ -292,4 +292,4 @@ static const __constant ulong T4_G[] =
 		} while (0)
 
 )==="
-		
\ No newline at end of file
+
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/jh.cl b/xmrstak/backend/amd/amd_gpu/opencl/jh.cl
index fe70ea302..486d2325c 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/jh.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/jh.cl
@@ -6,7 +6,7 @@ R"===(
  * ==========================(LICENSE BEGIN)============================
  *
  * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -14,10 +14,10 @@ R"===(
  * distribute, sublicense, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice shall be
  * included in all copies or substantial portions of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -106,47 +106,47 @@ typedef ulong sph_u64;
 
 static const __constant ulong C[] =
 {
-	0x67F815DFA2DED572UL, 0x571523B70A15847BUL, 0xF6875A4D90D6AB81UL, 0x402BD1C3C54F9F4EUL, 
-	0x9CFA455CE03A98EAUL, 0x9A99B26699D2C503UL, 0x8A53BBF2B4960266UL, 0x31A2DB881A1456B5UL, 
-	0xDB0E199A5C5AA303UL, 0x1044C1870AB23F40UL, 0x1D959E848019051CUL, 0xDCCDE75EADEB336FUL, 
-	0x416BBF029213BA10UL, 0xD027BBF7156578DCUL, 0x5078AA3739812C0AUL, 0xD3910041D2BF1A3FUL, 
-	0x907ECCF60D5A2D42UL, 0xCE97C0929C9F62DDUL, 0xAC442BC70BA75C18UL, 0x23FCC663D665DFD1UL, 
-	0x1AB8E09E036C6E97UL, 0xA8EC6C447E450521UL, 0xFA618E5DBB03F1EEUL, 0x97818394B29796FDUL, 
-	0x2F3003DB37858E4AUL, 0x956A9FFB2D8D672AUL, 0x6C69B8F88173FE8AUL, 0x14427FC04672C78AUL, 
-	0xC45EC7BD8F15F4C5UL, 0x80BB118FA76F4475UL, 0xBC88E4AEB775DE52UL, 0xF4A3A6981E00B882UL, 
-	0x1563A3A9338FF48EUL, 0x89F9B7D524565FAAUL, 0xFDE05A7C20EDF1B6UL, 0x362C42065AE9CA36UL, 
-	0x3D98FE4E433529CEUL, 0xA74B9A7374F93A53UL, 0x86814E6F591FF5D0UL, 0x9F5AD8AF81AD9D0EUL, 
-	0x6A6234EE670605A7UL, 0x2717B96EBE280B8BUL, 0x3F1080C626077447UL, 0x7B487EC66F7EA0E0UL, 
-	0xC0A4F84AA50A550DUL, 0x9EF18E979FE7E391UL, 0xD48D605081727686UL, 0x62B0E5F3415A9E7EUL, 
-	0x7A205440EC1F9FFCUL, 0x84C9F4CE001AE4E3UL, 0xD895FA9DF594D74FUL, 0xA554C324117E2E55UL, 
-	0x286EFEBD2872DF5BUL, 0xB2C4A50FE27FF578UL, 0x2ED349EEEF7C8905UL, 0x7F5928EB85937E44UL, 
-	0x4A3124B337695F70UL, 0x65E4D61DF128865EUL, 0xE720B95104771BC7UL, 0x8A87D423E843FE74UL, 
-	0xF2947692A3E8297DUL, 0xC1D9309B097ACBDDUL, 0xE01BDC5BFB301B1DUL, 0xBF829CF24F4924DAUL, 
-	0xFFBF70B431BAE7A4UL, 0x48BCF8DE0544320DUL, 0x39D3BB5332FCAE3BUL, 0xA08B29E0C1C39F45UL, 
-	0x0F09AEF7FD05C9E5UL, 0x34F1904212347094UL, 0x95ED44E301B771A2UL, 0x4A982F4F368E3BE9UL, 
-	0x15F66CA0631D4088UL, 0xFFAF52874B44C147UL, 0x30C60AE2F14ABB7EUL, 0xE68C6ECCC5B67046UL, 
-	0x00CA4FBD56A4D5A4UL, 0xAE183EC84B849DDAUL, 0xADD1643045CE5773UL, 0x67255C1468CEA6E8UL, 
-	0x16E10ECBF28CDAA3UL, 0x9A99949A5806E933UL, 0x7B846FC220B2601FUL, 0x1885D1A07FACCED1UL, 
-	0xD319DD8DA15B5932UL, 0x46B4A5AAC01C9A50UL, 0xBA6B04E467633D9FUL, 0x7EEE560BAB19CAF6UL, 
-	0x742128A9EA79B11FUL, 0xEE51363B35F7BDE9UL, 0x76D350755AAC571DUL, 0x01707DA3FEC2463AUL, 
-	0x42D8A498AFC135F7UL, 0x79676B9E20ECED78UL, 0xA8DB3AEA15638341UL, 0x832C83324D3BC3FAUL, 
-	0xF347271C1F3B40A7UL, 0x9A762DB734F04059UL, 0xFD4F21D26C4E3EE7UL, 0xEF5957DC398DFDB8UL, 
-	0xDAEB492B490C9B8DUL, 0x0D70F36849D7A25BUL, 0x84558D7AD0AE3B7DUL, 0x658EF8E4F0E9A5F5UL, 
-	0x533B1036F4A2B8A0UL, 0x5AEC3E759E07A80CUL, 0x4F88E85692946891UL, 0x4CBCBAF8555CB05BUL, 
-	0x7B9487F3993BBBE3UL, 0x5D1C6B72D6F4DA75UL, 0x6DB334DC28ACAE64UL, 0x71DB28B850A5346CUL, 
-	0x2A518D10F2E261F8UL, 0xFC75DD593364DBE3UL, 0xA23FCE43F1BCAC1CUL, 0xB043E8023CD1BB67UL, 
-	0x75A12988CA5B0A33UL, 0x5C5316B44D19347FUL, 0x1E4D790EC3943B92UL, 0x3FAFEEB6D7757479UL, 
-	0x21391ABEF7D4A8EAUL, 0x5127234C097EF45CUL, 0xD23C32BA5324A326UL, 0xADD5A66D4A17A344UL, 
-	0x08C9F2AFA63E1DB5UL, 0x563C6B91983D5983UL, 0x4D608672A17CF84CUL, 0xF6C76E08CC3EE246UL, 
-	0x5E76BCB1B333982FUL, 0x2AE6C4EFA566D62BUL, 0x36D4C1BEE8B6F406UL, 0x6321EFBC1582EE74UL, 
-	0x69C953F40D4EC1FDUL, 0x26585806C45A7DA7UL, 0x16FAE0061614C17EUL, 0x3F9D63283DAF907EUL, 
-	0x0CD29B00E3F2C9D2UL, 0x300CD4B730CEAA5FUL, 0x9832E0F216512A74UL, 0x9AF8CEE3D830EB0DUL, 
-	0x9279F1B57B9EC54BUL, 0xD36886046EE651FFUL, 0x316796E6574D239BUL, 0x05750A17F3A6E6CCUL, 
-	0xCE6C3213D98176B1UL, 0x62A205F88452173CUL, 0x47154778B3CB2BF4UL, 0x486A9323825446FFUL, 
-	0x65655E4E0758DF38UL, 0x8E5086FC897CFCF2UL, 0x86CA0BD0442E7031UL, 0x4E477830A20940F0UL, 
-	0x8338F7D139EEA065UL, 0xBD3A2CE437E95EF7UL, 0x6FF8130126B29721UL, 0xE7DE9FEFD1ED44A3UL, 
-	0xD992257615DFA08BUL, 0xBE42DC12F6F7853CUL, 0x7EB027AB7CECA7D8UL, 0xDEA83EAADA7D8D53UL, 
-	0xD86902BD93CE25AAUL, 0xF908731AFD43F65AUL, 0xA5194A17DAEF5FC0UL, 0x6A21FD4C33664D97UL, 
+	0x67F815DFA2DED572UL, 0x571523B70A15847BUL, 0xF6875A4D90D6AB81UL, 0x402BD1C3C54F9F4EUL,
+	0x9CFA455CE03A98EAUL, 0x9A99B26699D2C503UL, 0x8A53BBF2B4960266UL, 0x31A2DB881A1456B5UL,
+	0xDB0E199A5C5AA303UL, 0x1044C1870AB23F40UL, 0x1D959E848019051CUL, 0xDCCDE75EADEB336FUL,
+	0x416BBF029213BA10UL, 0xD027BBF7156578DCUL, 0x5078AA3739812C0AUL, 0xD3910041D2BF1A3FUL,
+	0x907ECCF60D5A2D42UL, 0xCE97C0929C9F62DDUL, 0xAC442BC70BA75C18UL, 0x23FCC663D665DFD1UL,
+	0x1AB8E09E036C6E97UL, 0xA8EC6C447E450521UL, 0xFA618E5DBB03F1EEUL, 0x97818394B29796FDUL,
+	0x2F3003DB37858E4AUL, 0x956A9FFB2D8D672AUL, 0x6C69B8F88173FE8AUL, 0x14427FC04672C78AUL,
+	0xC45EC7BD8F15F4C5UL, 0x80BB118FA76F4475UL, 0xBC88E4AEB775DE52UL, 0xF4A3A6981E00B882UL,
+	0x1563A3A9338FF48EUL, 0x89F9B7D524565FAAUL, 0xFDE05A7C20EDF1B6UL, 0x362C42065AE9CA36UL,
+	0x3D98FE4E433529CEUL, 0xA74B9A7374F93A53UL, 0x86814E6F591FF5D0UL, 0x9F5AD8AF81AD9D0EUL,
+	0x6A6234EE670605A7UL, 0x2717B96EBE280B8BUL, 0x3F1080C626077447UL, 0x7B487EC66F7EA0E0UL,
+	0xC0A4F84AA50A550DUL, 0x9EF18E979FE7E391UL, 0xD48D605081727686UL, 0x62B0E5F3415A9E7EUL,
+	0x7A205440EC1F9FFCUL, 0x84C9F4CE001AE4E3UL, 0xD895FA9DF594D74FUL, 0xA554C324117E2E55UL,
+	0x286EFEBD2872DF5BUL, 0xB2C4A50FE27FF578UL, 0x2ED349EEEF7C8905UL, 0x7F5928EB85937E44UL,
+	0x4A3124B337695F70UL, 0x65E4D61DF128865EUL, 0xE720B95104771BC7UL, 0x8A87D423E843FE74UL,
+	0xF2947692A3E8297DUL, 0xC1D9309B097ACBDDUL, 0xE01BDC5BFB301B1DUL, 0xBF829CF24F4924DAUL,
+	0xFFBF70B431BAE7A4UL, 0x48BCF8DE0544320DUL, 0x39D3BB5332FCAE3BUL, 0xA08B29E0C1C39F45UL,
+	0x0F09AEF7FD05C9E5UL, 0x34F1904212347094UL, 0x95ED44E301B771A2UL, 0x4A982F4F368E3BE9UL,
+	0x15F66CA0631D4088UL, 0xFFAF52874B44C147UL, 0x30C60AE2F14ABB7EUL, 0xE68C6ECCC5B67046UL,
+	0x00CA4FBD56A4D5A4UL, 0xAE183EC84B849DDAUL, 0xADD1643045CE5773UL, 0x67255C1468CEA6E8UL,
+	0x16E10ECBF28CDAA3UL, 0x9A99949A5806E933UL, 0x7B846FC220B2601FUL, 0x1885D1A07FACCED1UL,
+	0xD319DD8DA15B5932UL, 0x46B4A5AAC01C9A50UL, 0xBA6B04E467633D9FUL, 0x7EEE560BAB19CAF6UL,
+	0x742128A9EA79B11FUL, 0xEE51363B35F7BDE9UL, 0x76D350755AAC571DUL, 0x01707DA3FEC2463AUL,
+	0x42D8A498AFC135F7UL, 0x79676B9E20ECED78UL, 0xA8DB3AEA15638341UL, 0x832C83324D3BC3FAUL,
+	0xF347271C1F3B40A7UL, 0x9A762DB734F04059UL, 0xFD4F21D26C4E3EE7UL, 0xEF5957DC398DFDB8UL,
+	0xDAEB492B490C9B8DUL, 0x0D70F36849D7A25BUL, 0x84558D7AD0AE3B7DUL, 0x658EF8E4F0E9A5F5UL,
+	0x533B1036F4A2B8A0UL, 0x5AEC3E759E07A80CUL, 0x4F88E85692946891UL, 0x4CBCBAF8555CB05BUL,
+	0x7B9487F3993BBBE3UL, 0x5D1C6B72D6F4DA75UL, 0x6DB334DC28ACAE64UL, 0x71DB28B850A5346CUL,
+	0x2A518D10F2E261F8UL, 0xFC75DD593364DBE3UL, 0xA23FCE43F1BCAC1CUL, 0xB043E8023CD1BB67UL,
+	0x75A12988CA5B0A33UL, 0x5C5316B44D19347FUL, 0x1E4D790EC3943B92UL, 0x3FAFEEB6D7757479UL,
+	0x21391ABEF7D4A8EAUL, 0x5127234C097EF45CUL, 0xD23C32BA5324A326UL, 0xADD5A66D4A17A344UL,
+	0x08C9F2AFA63E1DB5UL, 0x563C6B91983D5983UL, 0x4D608672A17CF84CUL, 0xF6C76E08CC3EE246UL,
+	0x5E76BCB1B333982FUL, 0x2AE6C4EFA566D62BUL, 0x36D4C1BEE8B6F406UL, 0x6321EFBC1582EE74UL,
+	0x69C953F40D4EC1FDUL, 0x26585806C45A7DA7UL, 0x16FAE0061614C17EUL, 0x3F9D63283DAF907EUL,
+	0x0CD29B00E3F2C9D2UL, 0x300CD4B730CEAA5FUL, 0x9832E0F216512A74UL, 0x9AF8CEE3D830EB0DUL,
+	0x9279F1B57B9EC54BUL, 0xD36886046EE651FFUL, 0x316796E6574D239BUL, 0x05750A17F3A6E6CCUL,
+	0xCE6C3213D98176B1UL, 0x62A205F88452173CUL, 0x47154778B3CB2BF4UL, 0x486A9323825446FFUL,
+	0x65655E4E0758DF38UL, 0x8E5086FC897CFCF2UL, 0x86CA0BD0442E7031UL, 0x4E477830A20940F0UL,
+	0x8338F7D139EEA065UL, 0xBD3A2CE437E95EF7UL, 0x6FF8130126B29721UL, 0xE7DE9FEFD1ED44A3UL,
+	0xD992257615DFA08BUL, 0xBE42DC12F6F7853CUL, 0x7EB027AB7CECA7D8UL, 0xDEA83EAADA7D8D53UL,
+	0xD86902BD93CE25AAUL, 0xF908731AFD43F65AUL, 0xA5194A17DAEF5FC0UL, 0x6A21FD4C33664D97UL,
 	0x701541DB3198B435UL, 0x9B54CDEDBB0F1EEAUL, 0x72409751A163D09AUL, 0xE26F4791BF9D75F6UL
 };
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index 81e1644f1..50e861e23 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -74,6 +74,19 @@ static const __constant uint AES0_C[256] =
 
 #define BYTE(x, y)	(amd_bfe((x), (y) << 3U, 8U))
 
+inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, uint4 x, uint4 k)
+{
+	x = ~x;
+	k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ AES2[BYTE(x.s2, 2)] ^ AES3[BYTE(x.s3, 3)];
+	x.s0 ^= k.s0;
+	k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ AES2[BYTE(x.s3, 2)] ^ AES3[BYTE(x.s0, 3)];
+	x.s1 ^= k.s1;
+	k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ AES2[BYTE(x.s0, 2)] ^ AES3[BYTE(x.s1, 3)];
+	x.s2 ^= k.s2;
+	k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ AES2[BYTE(x.s1, 2)] ^ AES3[BYTE(x.s2, 3)];
+	return k;
+}
+
 uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
 {
 	key.s0 ^= AES0[BYTE(X.s0, 0)];
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
index 868757b7b..73ef90882 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
@@ -3,6 +3,7 @@ R"===(
 #define WOLF_SKEIN_CL
 
 // Vectorized Skein implementation macros and functions by Wolf
+// Updated by taisel
 
 #define SKEIN_KS_PARITY	0x1BD11BDAA9FC1A22
 
@@ -22,11 +23,11 @@ static const __constant ulong SKEIN512_256_IV[8] =
 	0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL
 };
 
-#define SKEIN_INJECT_KEY(p, s)	do { \
+#define SKEIN_INJECT_KEY(p, s, q)	do { \
 	p += h; \
-	p.s5 += t[s % 3]; \
-	p.s6 += t[(s + 1) % 3]; \
-	p.s7 += s; \
+	p.s5 += t[s]; \
+	p.s6 += t[select(s + 1U, 0U, s == 2U)]; \
+	p.s7 += q; \
 } while(0)
 
 ulong SKEIN_ROT(const uint2 x, const uint y)
@@ -35,55 +36,55 @@ ulong SKEIN_ROT(const uint2 x, const uint y)
 	else return(as_ulong(amd_bitalign(x.s10, x, 32 - (y - 32))));
 }
 
-void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const uint rc0, const uint rc1, const uint rc2, const uint rc3)
+void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const ulong4 rc)
 {
 	*pv0 += *pv1;
-	(*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc0);
-	(*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc1);
-	(*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc2);
-	(*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc3);
+	(*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc.s0);
+	(*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc.s1);
+	(*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc.s2);
+	(*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc.s3);
 	*pv1 ^= *pv0;
 }
 
-ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s)
+ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q)
 {
-	SKEIN_INJECT_KEY(p, s);
+	SKEIN_INJECT_KEY(p, s, q);
 	ulong4 pv0 = p.even, pv1 = p.odd;
-	
-	SkeinMix8(&pv0, &pv1, 46, 36, 19, 37);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(46, 36, 19, 37));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
-	
-	SkeinMix8(&pv0, &pv1, 33, 27, 14, 42);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(33, 27, 14, 42));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
-	
-	SkeinMix8(&pv0, &pv1, 17, 49, 36, 39);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(17, 49, 36, 39));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
-	
-	SkeinMix8(&pv0, &pv1, 44, 9, 54, 56);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(44, 9, 54, 56));
 	return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5)));
 }
 
-ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s)
+ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q)
 {
-	SKEIN_INJECT_KEY(p, s);
+	SKEIN_INJECT_KEY(p, s, q);
     ulong4 pv0 = p.even, pv1 = p.odd;
-    
-	SkeinMix8(&pv0, &pv1, 39, 30, 34, 24);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(39, 30, 34, 24));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
-	
-	SkeinMix8(&pv0, &pv1, 13, 50, 10, 17);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(13, 50, 10, 17));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
-	
-	SkeinMix8(&pv0, &pv1, 25, 29, 39, 43);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(25, 29, 39, 43));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
-	
-	SkeinMix8(&pv0, &pv1, 8, 35, 56, 22);
+
+	SkeinMix8(&pv0, &pv1, (ulong4)(8, 35, 56, 22));
 	return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5)));
 }
 
@@ -92,20 +93,47 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t)
 	#pragma unroll
 	for(int i = 0; i < 18; ++i)
 	{
-		p = SkeinEvenRound(p, h, t, i);
+		p = SkeinEvenRound(p, h, t, 0U, i);
 		++i;
 		ulong tmp = h.s0;
 		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
 		h.s7 = h8;
 		h8 = tmp;
-		p = SkeinOddRound(p, h, t, i);
+		p = SkeinOddRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 2U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 0U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 2U, i);
 		tmp = h.s0;
 		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
 		h.s7 = h8;
 		h8 = tmp;
 	}
-	
-	SKEIN_INJECT_KEY(p, 18);
+
+	p += h;
+	p.s5 += t[0];
+	p.s6 += t[1];
+	p.s7 += 18;
 	return(p);
 }
 
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0bc52395a..c5b331c87 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -31,7 +31,7 @@ namespace amd
 {
 
 class autoAdjust
-{    
+{
 public:
 
 	autoAdjust()
@@ -83,25 +83,29 @@ class autoAdjust
 
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
-		size_t hashMemSize;
-		if(::jconf::inst()->IsCurrencyMonero())
-		{
-			hashMemSize = MONERO_MEMORY;
-		}
-		else
-		{
-			hashMemSize = AEON_MEMORY;
-		}
+		size_t hashMemSize = std::max(
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+		);
 
 		std::string conf;
-		int i = 0;
 		for(auto& ctx : devVec)
 		{
+			size_t minFreeMem = 128u * byteToMiB;
 			/* 1000 is a magic selected limit, the reason is that more than 2GiB memory
 			 * sowing down the memory performance because of TLB cache misses
 			 */
 			size_t maxThreads = 1000u;
-			if(ctx.name.compare("gfx901") == 0)
+			if(
+				ctx.name.compare("gfx901") == 0 ||
+				ctx.name.compare("gfx904") == 0 ||
+				// APU
+				ctx.name.compare("gfx902") == 0 ||
+				// UNKNOWN
+				ctx.name.compare("gfx900") == 0 ||
+				ctx.name.compare("gfx903") == 0 ||
+				ctx.name.compare("gfx905") == 0
+			)
 			{
 				/* Increase the number of threads for AMD VEGA gpus.
 				 * Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089
@@ -110,32 +114,83 @@ class autoAdjust
 				maxThreads = 2024u;
 			}
 
+			// NVIDIA optimizations
+			if(
+				ctx.isNVIDIA && (
+					ctx.name.find("P100") != std::string::npos ||
+				    ctx.name.find("V100") != std::string::npos
+				)
+			)
+			{
+				// do not limit the number of threads
+				maxThreads = 40000u;
+				minFreeMem = 512u * byteToMiB;
+			}
+
+			// check if cryptonight_monero_v8 is selected for the user or dev pool
+			bool useCryptonight_v8 =
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+			// set strided index to default
+			ctx.stridedIndex = 1;
+
+			// nvidia performance is very bad if the scratchpad is not contiguous
+			if(ctx.isNVIDIA)
+				ctx.stridedIndex = 0;
+
+			// use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2`
+			if(useCryptonight_v8)
+				ctx.stridedIndex = 2;
+
+			// double the thread limit (maxThreads) for aeon (cryptonight_lite)
+			if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
+				maxThreads *= 2u;
+
 			// keep 128MiB memory free (value is randomly chosen)
-			size_t availableMem = ctx.freeMem - (128u * byteToMiB);
+			size_t availableMem = ctx.freeMem > minFreeMem ? ctx.freeMem - minFreeMem : 0u; // clamp: plain subtraction wraps (unsigned) when freeMem is below the reserve
 			// 224byte extra memory is used per thread for meta data
 			size_t perThread = hashMemSize + 224u;
 			size_t maxIntensity = availableMem / perThread;
 			size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 			// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 			size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
-			conf += std::string("  // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
-			conf += std::string("  // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
-			// set 8 threads per block (this is a good value for the most gpus)
-			conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-				"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				"    \"affine_to_cpu\" : false, \"strided_index\" : true\n"
-				"  },\n";
-			++i;
+			// intensity rounds down to 0 when possibleIntensity is below (8 * compute units); fall back to the unrounded value
+			if (intensity == 0)
+			{
+				printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT.");
+				intensity = possibleIntensity;
+
+			}
+			if (intensity != 0)
+			{
+				conf += std::string("  // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
+				conf += std::string("  // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
+				// set 8 threads per block (this is a good value for the most gpus)
+				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
+					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+					"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+					"    \"unroll\" : 8, \"comp_mode\" : true\n" +
+					"  },\n";
+			}
+			else
+			{
+				printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str());
+			}
 		}
 
 		configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex));
 		configTpl.replace("GPUCONFIG",conf);
 		configTpl.write(params::inst().configFileAMD);
-		printer::inst()->print_msg(L0, "AMD: GPU configuration stored in file '%s'", params::inst().configFileAMD.c_str());
+
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
+		printer::inst()->print_msg(L0, "%s: GPU (OpenCL) configuration stored in file '%s'", backendName.c_str(), params::inst().configFileAMD.c_str());
 	}
 
 	std::vector<GpuContext> devVec;
 };
 
 } // namespace amd
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index af662f852..421e0ed4b 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -1,17 +1,33 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary.
- *      index    - GPU index number usually starts from 0
- *  intensity    - Number of parallel GPU threads (nothing to do with CPU threads)
- *   worksize    - Number of local GPU threads (nothing to do with CPU threads)
+ * index         - GPU index number usually starts from 0
+ * intensity     - Number of parallel GPU threads (nothing to do with CPU threads)
+ * worksize      - Number of local GPU threads (nothing to do with CPU threads)
  * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
  * strided_index - switch memory pattern used for the scratch pad memory
- *                 true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
- *                 false = use a contiguous block of memory per thread
+ *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
+ *                     required: intensity must be a multiple of worksize
+ *                 1 or true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ *                             (for cryptonight_v8 and monero it is equal to strided_index = 0)
+ *                 0 or false = use a contiguous block of memory per thread
+ * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
+ *                 this value is only used if 'strided_index' == 2
+ *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * unroll        - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two.
+ * comp_mode     - Compatibility mode: enable/disable the automatic guard around the compute kernel which allows
+ *                 the use of an intensity which is not a multiple of the worksize.
+ *                 If you set it to false and the intensity is not a multiple of the worksize the miner can crash:
+ *                 in this case set the intensity to a multiple of the worksize or activate comp_mode.
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+ *    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
  * ],
+ * If you do not wish to mine with your AMD GPU(s) then use:
+ * "gpu_threads_conf" :
+ * null,
  */
 
 "gpu_threads_conf" : [
@@ -22,5 +38,4 @@ GPUCONFIG
  * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
  */
 "platform_index" : PLATFORMINDEX,
-
 )==="
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 07afb1964..152f8add4 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -56,9 +56,10 @@ struct configVal {
 	Type iType;
 };
 
-//Same order as in configEnum, as per comment above
+// Same order as in configEnum, as per comment above
+// kNullType means any type
 configVal oConfigValues[] = {
-	{ aGpuThreadsConf, "gpu_threads_conf", kArrayType },
+	{ aGpuThreadsConf, "gpu_threads_conf", kNullType },
 	{ iPlatformIdx, "platform_index", kNumberType }
 };
 
@@ -68,6 +69,8 @@ inline bool checkType(Type have, Type want)
 {
 	if(want == have)
 		return true;
+	else if(want == kNullType)
+		return true;
 	else if(want == kTrueType && have == kFalseType)
 		return true;
 	else if(want == kFalseType && have == kTrueType)
@@ -103,14 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	stridedIndex = GetObjectMember(oThdConf, "strided_index");
+	memChunk = GetObjectMember(oThdConf, "mem_chunk");
+	unroll = GetObjectMember(oThdConf, "unroll");
+	compMode = GetObjectMember(oThdConf, "comp_mode");
 
-	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
+	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
+		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -119,13 +126,45 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!aff->IsUint64() && !aff->IsBool())
 		return false;
 
-	if(!stridedIndex->IsBool())
+	// strided_index accepts a bool (false/true -> 0/1) or an integer in [0;2]; doubles are rejected
+	if(!stridedIndex->IsBool() && !stridedIndex->IsInt64())
+	{
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be a bool or an integer");
+		return false;
+	}
+
+	// validate the full 64 bit value before narrowing, a cast to int could wrap into the valid range
+	const long long stridedIndexVal = stridedIndex->IsBool() ? (stridedIndex->GetBool() ? 1 : 0) : stridedIndex->GetInt64();
+	if(stridedIndexVal < 0 || stridedIndexVal > 2)
+	{
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be in the range 0 to 2");
+		return false;
+	}
+	cfg.stridedIndex = (int)stridedIndexVal;
+
+	// IsUint64() only guarantees that GetUint64() is valid; compare unsigned so large values cannot wrap
+	if(!memChunk->IsUint64() || memChunk->GetUint64() > 18u)
+	{
+		printer::inst()->print_msg(L0, "ERROR: mem_chunk must be in the range 0 to 18");
+		return false;
+	}
+	cfg.memChunk = (int)memChunk->GetUint64();
+
+	if(!unroll->IsUint64() || unroll->GetUint64() >= 128u || unroll->GetUint64() == 0u)
+	{
+		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and not zero");
+		return false;
+	}
+	cfg.unroll = (int)unroll->GetUint64();
+
+	// comp_mode must be a plain bool
+	if(!compMode->IsBool())
 		return false;
 
 	cfg.index = idx->GetUint64();
-	cfg.intensity = intensity->GetUint64();
 	cfg.w_size = w_size->GetUint64();
-	cfg.stridedIndex = stridedIndex->GetBool();
+	cfg.intensity = intensity->GetUint64();
+	cfg.compMode = compMode->GetBool();
 
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt64();
@@ -203,15 +242,15 @@ bool jconf::parse_config(const char* sFilename)
 
 	if(prv->jsonDoc.HasParseError())
 	{
-		printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s",
-			int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
+		printer::inst()->print_msg(L0, "JSON config parse error in '%s' (offset %llu): %s",
+			sFilename, int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
 		return false;
 	}
 
 
 	if(!prv->jsonDoc.IsObject())
 	{ //This should never happen as we created the root ourselves
-		printer::inst()->print_msg(L0, "Invalid config file. No root?\n");
+		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
 		return false;
 	}
 
@@ -227,13 +266,13 @@ bool jconf::parse_config(const char* sFilename)
 
 		if(prv->configValues[i] == nullptr)
 		{
-			printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Invalid config file '%s'. Missing value \"%s\".", sFilename, oConfigValues[i].sName);
 			return false;
 		}
 
 		if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType))
 		{
-			printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Invalid config file '%s'. Value \"%s\" has unexpected type.", sFilename, oConfigValues[i].sName);
 			return false;
 		}
 	}
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index ee1882aad..b852c5940 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -26,7 +26,10 @@ class jconf
 		size_t intensity;
 		size_t w_size;
 		long long cpu_aff;
-		bool stridedIndex;
+		int stridedIndex;
+		int memChunk;
+		int unroll;
+		bool compMode;
 	};
 
 	size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index e83527c55..5e70f25a6 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -73,7 +73,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 
 extern "C"  {
 #ifdef WIN32
-__declspec(dllexport) 
+__declspec(dllexport)
 #endif
 std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
 {
@@ -97,6 +97,9 @@ bool minethd::init_gpus()
 		vGpuData[i].rawIntensity = cfg.intensity;
 		vGpuData[i].workSize = cfg.w_size;
 		vGpuData[i].stridedIndex = cfg.stridedIndex;
+		vGpuData[i].memChunk = cfg.memChunk;
+		vGpuData[i].compMode = cfg.compMode;
+		vGpuData[i].unroll = cfg.unroll;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -120,7 +123,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 		win_exit();
 	}
 
-	// \ todo get device count and exit if no opencl device 
+	// \ todo get device count and exit if no opencl device
 
 	if(!init_gpus())
 	{
@@ -135,17 +138,19 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	for (i = 0; i < n; i++)
 	{
 		jconf::inst()->GetThreadConfig(i, cfg);
-		
+
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
+
 		if(cfg.cpu_aff >= 0)
 		{
 #if defined(__APPLE__)
-			printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+			printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
 #endif
 
-			printer::inst()->print_msg(L1, "Starting AMD GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff);
+			printer::inst()->print_msg(L1, "Starting %s GPU (OpenCL) thread %d, affinity: %d.", backendName.c_str(), i, (int)cfg.cpu_aff);
 		}
 		else
-			printer::inst()->print_msg(L1, "Starting AMD GPU thread %d, no affinity.", i);
+			printer::inst()->print_msg(L1, "Starting %s GPU (OpenCL) thread %d, no affinity.", backendName.c_str(), i);
 
 		minethd* thd = new minethd(pWork, i + threadOffset, &vGpuData[i], cfg);
 		pvThreads->push_back(thd);
@@ -154,27 +159,6 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	return pvThreads;
 }
 
-void minethd::switch_work(miner_work& pWork)
-{
-	// iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work
-	// faster than threads can consume them. This should never happen in real life.
-	// Pool cant physically send jobs faster than every 250ms or so due to net latency.
-
-	while (globalStates::inst().iConsumeCnt.load(std::memory_order_seq_cst) < globalStates::inst().iThreadCount)
-		std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-	globalStates::inst().oGlobalWork = pWork;
-	globalStates::inst().iConsumeCnt.store(0, std::memory_order_seq_cst);
-	globalStates::inst().iGlobalJobNo++;
-}
-
-void minethd::consume_work()
-{
-	memcpy(&oWork, &globalStates::inst().oGlobalWork, sizeof(miner_work));
-	iJobNo++;
-	globalStates::inst().iConsumeCnt++;
-
-}
 
 void minethd::work_main()
 {
@@ -189,8 +173,18 @@ void minethd::work_main()
 	uint64_t iCount = 0;
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
-	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, ::jconf::inst()->IsCurrencyMonero());
-	globalStates::inst().iConsumeCnt++;
+
+	if(cpu_ctx == nullptr)
+	{
+		printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory, miner will be stopped.");
+		win_exit(1);
+	}
+	// start with root algorithm and switch later if fork version is reached
+	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+
+	uint8_t version = 0;
+	size_t lastPoolId = 0;
 
 	while (bQuit == 0)
 	{
@@ -204,16 +198,35 @@ void minethd::work_main()
 			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
-			consume_work();
+			globalStates::inst().consume_work(oWork, iJobNo);
 			continue;
 		}
 
+		uint8_t new_version = oWork.getVersion();
+		if(new_version != version || oWork.iPoolId != lastPoolId)
+		{
+			coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId);
+			if(new_version >= coinDesc.GetMiningForkVersion())
+			{
+				miner_algo = coinDesc.GetMiningAlgo();
+				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+			}
+			else
+			{
+				miner_algo = coinDesc.GetMiningAlgoRoot();
+				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+			}
+			lastPoolId = oWork.iPoolId;
+			version = new_version;
+		}
+
 		uint32_t h_per_round = pGpuCtx->rawIntensity;
 		size_t round_ctr = 0;
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
 		uint64_t target = oWork.iTarget;
-		XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target);
+
+		XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
 
 		if(oWork.bNiceHash)
 			pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39);
@@ -224,12 +237,16 @@ void minethd::work_main()
 			if((round_ctr++ & 0xF) == 0)
 			{
 				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, h_per_round * 16);
+				// check if the job is still valid, there is a small possibility that the job is switched
+				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
+					break;
 			}
 
+
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
 
-			XMRRunJob(pGpuCtx, results);
+			XMRRunJob(pGpuCtx, results, miner_algo);
 
 			for(size_t i = 0; i < results[0xFF]; i++)
 			{
@@ -241,11 +258,11 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = results[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
-					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo), oWork.iPoolId));
+					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
-					executor::inst()->push_event(ex_event("AMD Invalid Result", oWork.iPoolId));
+					executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId));
 			}
 
 			iCount += pGpuCtx->rawIntensity;
@@ -255,7 +272,7 @@ void minethd::work_main()
 			std::this_thread::yield();
 		}
 
-		consume_work();
+		globalStates::inst().consume_work(oWork, iJobNo);
 	}
 }
 
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 29ddb74e7..32e66ec87 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -20,21 +20,18 @@ class minethd  : public iBackend
 {
 public:
 
-	static void switch_work(miner_work& pWork);
 	static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool init_gpus();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg);
 
 	void work_main();
-	void consume_work();
 
 	uint64_t iJobNo;
 
-	static miner_work oGlobalWork;
 	miner_work oWork;
 
 	std::promise<void> order_fix;
diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp
index d735cb391..92bb01506 100644
--- a/xmrstak/backend/backendConnector.cpp
+++ b/xmrstak/backend/backendConnector.cpp
@@ -57,19 +57,41 @@ bool BackendConnector::self_test()
 
 std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 {
-	globalStates::inst().iGlobalJobNo = 0;
-	globalStates::inst().iConsumeCnt = 0;
-
 
 	std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>;
 
 #ifndef CONF_NO_CUDA
 	if(params::inst().useNVIDIA)
 	{
-		plugin nvidiaplugin("NVIDIA", "xmrstak_cuda_backend");
-		std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-		pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
-		if(nvidiaThreads->size() == 0)
+		plugin nvidiaplugin;
+		// candidate backend libraries, tried in this order until one of them provides a worker
+		const std::vector<std::string> libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"};
+		size_t numWorkers = 0u;
+
+		for(const auto& name : libNames)
+		{
+			printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str());
+			nvidiaplugin.load("NVIDIA", name);
+			std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
+			if(nvidiaThreads != nullptr)
+			{
+				// take over the worker pointers, the container itself is only a transport vehicle
+				pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
+				numWorkers = nvidiaThreads->size();
+				delete nvidiaThreads;
+			}
+			else
+			{
+				// remove the plugin if we have found no GPUs
+				nvidiaplugin.unload();
+			}
+			// we found at least one working GPU
+			if(numWorkers != 0)
+			{
+				printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str());
+				break;
+			}
+		}
+		if(numWorkers == 0)
 			printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled.");
 	}
 #endif
@@ -77,11 +99,19 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 #ifndef CONF_NO_OPENCL
 	if(params::inst().useAMD)
 	{
-		plugin amdplugin("AMD", "xmrstak_opencl_backend");
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
+		plugin amdplugin;
+		amdplugin.load(backendName, "xmrstak_opencl_backend");
 		std::vector<iBackend*>* amdThreads = amdplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-		pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
-		if(amdThreads->size() == 0)
-			printer::inst()->print_msg(L0, "WARNING: backend AMD disabled.");
+		size_t numWorkers = 0u;
+		if(amdThreads != nullptr)
+		{
+			pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
+			numWorkers = amdThreads->size();
+			delete amdThreads;
+		}
+		if(numWorkers == 0)
+			printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str());
 	}
 #endif
 
@@ -99,4 +129,4 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 	return pvThreads;
 }
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/backendConnector.hpp b/xmrstak/backend/backendConnector.hpp
index da3dc7782..66d873e48 100644
--- a/xmrstak/backend/backendConnector.hpp
+++ b/xmrstak/backend/backendConnector.hpp
@@ -18,4 +18,4 @@ namespace xmrstak
 		static bool self_test();
 	};
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 7bdb14e0a..e7f3e9148 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -7,6 +7,7 @@
 #include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/params.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include <string>
 
 #ifdef _WIN32
@@ -20,39 +21,20 @@ namespace xmrstak
 {
 namespace cpu
 {
-// Mask bits between h and l and return the value
-// This enables us to put in values exactly like in the manual
-// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22)
-inline int32_t get_masked(int32_t val, int32_t h, int32_t l)
-{
-	val &= (0x7FFFFFFF >> (31-(h-l))) << l;
-	return val >> l;
-}
 
 class autoAdjust
 {
 public:
 
-	size_t hashMemSize;
-	size_t halfHashMemSize;
-
-	autoAdjust()
-	{
-		if(::jconf::inst()->IsCurrencyMonero())
-		{
-			hashMemSize = MONERO_MEMORY;
-			halfHashMemSize = hashMemSize / 2u;
-		}
-		else
-		{
-			hashMemSize = AEON_MEMORY;
-			halfHashMemSize = hashMemSize / 2u;
-		}
-	}
-
 	bool printConfig()
 	{
 
+		const size_t hashMemSizeKB = std::max(
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+		) / 1024u;
+		const size_t halfHashMemSizeKB = hashMemSizeKB / 2u;
+
 		configEditor configTpl{};
 
 		// load the template of the backend config into a char variable
@@ -63,12 +45,13 @@ class autoAdjust
 
 		std::string conf;
 
-		if(!detectL3Size() || L3KB_size < halfHashMemSize || L3KB_size > (halfHashMemSize * 100u))
+
+		if(!detectL3Size() || L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048u))
 		{
-			if(L3KB_size < halfHashMemSize || L3KB_size > (halfHashMemSize * 100))
+			if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048))
 				printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size);
 
-			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true,  \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
 			printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down.");
 		}
 		else
@@ -88,11 +71,11 @@ class autoAdjust
 				if(L3KB_size <= 0)
 					break;
 
-				double_mode = L3KB_size / hashMemSize > (int32_t)(corecnt-i);
+				double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt-i);
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
@@ -107,9 +90,9 @@ class autoAdjust
 					aff_id++;
 
 				if(double_mode)
-					L3KB_size -= hashMemSize * 2u;
+					L3KB_size -= hashMemSizeKB * 2u;
 				else
-					L3KB_size -= hashMemSize;
+					L3KB_size -= hashMemSizeKB;
 			}
 		}
 
@@ -137,12 +120,12 @@ class autoAdjust
 
 			if(get_masked(cpu_info[0], 7, 5) != 3)
 			{
-				printer::inst()->print_msg(L0, "Autoconf failed: Couln't find L3 cache page.");
+				printer::inst()->print_msg(L0, "Autoconf failed: Couldn't find L3 cache page.");
 				return false;
 			}
 
 			L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) *
-				(get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / halfHashMemSize;
+				(get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024;
 
 			return true;
 		}
@@ -153,7 +136,8 @@ class autoAdjust
 			L3KB_size = get_masked(cpu_info[3], 31, 18) * 512;
 
 			::jconf::cpuid(1, 0, cpu_info);
-			if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen
+
+			if(getModel().family < 0x17) //0x17h is Zen
 				old_amd = true;
 
 			return true;
@@ -185,4 +169,4 @@ class autoAdjust
 };
 
 } // namespace cpu
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index ddeb89b31..b61582588 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -28,21 +28,16 @@ class autoAdjust
 
 	autoAdjust()
 	{
-		if(::jconf::inst()->IsCurrencyMonero())
-		{
-			hashMemSize = MONERO_MEMORY;
-			halfHashMemSize = hashMemSize / 2u;
-		}
-		else
-		{
-			hashMemSize = AEON_MEMORY;
-			halfHashMemSize = hashMemSize / 2u;
-		}
+		hashMemSize = std::max(
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+		);
+		halfHashMemSize = hashMemSize / 2u;
 	}
 
 	bool printConfig()
 	{
-		
+
 		hwloc_topology_t topology;
 		hwloc_topology_init(&topology);
 		hwloc_topology_load(topology);
@@ -69,13 +64,13 @@ class autoAdjust
 				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
 
 			for(hwloc_obj_t obj : tlcs)
-				proccessTopLevelCache(obj);
-			
+				processTopLevelCache(obj);
+
 			for(uint32_t id : results)
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
@@ -83,7 +78,8 @@ class autoAdjust
 		catch(const std::runtime_error& err)
 		{
 			// \todo add fallback to default auto adjust
-			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+			conf += std::string("    { \"low_power_mode\" : false");
+			conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
 			printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what());
 		}
 
@@ -143,7 +139,7 @@ class autoAdjust
 
 	// Top level cache isn't shared with other cores on the same package
 	// This will usually be 1 x L3, but can be 2 x L2 per package
-	void proccessTopLevelCache(hwloc_obj_t obj)
+	void processTopLevelCache(hwloc_obj_t obj)
 	{
 		if(obj->attr == nullptr)
 			throw(std::runtime_error("Cache object hasn't got attributes."));
@@ -163,7 +159,7 @@ class autoAdjust
 
 			//Try our luck with lower level caches
 			for(size_t i=0; i < obj->arity; i++)
-				proccessTopLevelCache(obj->children[i]);
+				processTopLevelCache(obj->children[i]);
 			return;
 		}
 
@@ -222,4 +218,4 @@ class autoAdjust
 };
 
 } // namespace cpu
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index b21a22d24..1a64860e4 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -1,34 +1,43 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * Thread configuration for each thread. Make sure it matches the number above.
  * low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true,
-                    this mode will double the cache usage, and double the single thread performance. It will 
- *                  consume much less power (as less cores are working), but will max out at around 80-85% of 
+ *                  this mode will double the cache usage, and double the single thread performance. It will
+ *                  consume much less power (as less cores are working), but will max out at around 80-85% of
  *                  the maximum performance. When set to a number N greater than 1, this mode will increase the
  *                  cache usage and single thread performance by N times.
  *
- * no_prefetch -    Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make
+ * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * affine_to_cpu -  This can be either false (no affinity), or the CPU core number. Note that on hyperthreading 
- *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting 
- *                  even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 
+ * asm            - Allows switching to an assembler version of cryptonight_v8; allowed values [auto, off, intel_avx, amd_avx]
+ *                    - auto: xmr-stak will automatically detect the asm type (default)
+ *                    - off: disable the usage of optimized assembler
+ *                    - intel_avx: supports Intel cpus with avx instructions e.g. Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx
+ *                    - amd_avx: supports AMD cpus with avx instructions e.g. AMD Ryzen 1xxx and 2xxx series
+ *
+ * affine_to_cpu  - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting
+ *                  even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
  *                  physical core CPU you should select cpu numbers 0-3.
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
  * you can try to tweak it from there to get the best performance.
- * 
+ *
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
- * [ 
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
+ * [
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
  * ],
+ * If you do not wish to mine with your CPU(s) then use:
+ * "cpu_threads_conf" :
+ * null,
  */
 
 "cpu_threads_conf" :
 [
 CPUCONFIG
 ],
-
 )==="
diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp
new file mode 100644
index 000000000..5959b75cc
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.cpp
@@ -0,0 +1,79 @@
+
+#include "xmrstak/backend/cpu/cpuType.hpp"
+
+#include <cstring>
+#include <inttypes.h>
+#include <cstdio>
+
+#ifdef _WIN32
+#define strcasecmp _stricmp
+#include <intrin.h>
+#else
+#include <cpuid.h>
+#endif
+
+namespace xmrstak
+{
+namespace cpu
+{
+	void cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) // query CPUID leaf `eax`, sub-leaf `ecx`
+	{
+		std::memset(val, 0, sizeof(int32_t)*4); // clear all four result slots first
+		// val[0..3] then receive the EAX, EBX, ECX and EDX values of the query
+	#ifdef _WIN32
+		__cpuidex(val, eax, ecx);
+	#else
+		__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
+	#endif
+	}
+
+	int32_t get_masked(int32_t val, int32_t h, int32_t l)
+	{
+		const int32_t mask = (0x7FFFFFFF >> (31 - (h - l))) << l; // (h-l) one-bits starting at bit l
+		return (val & mask) >> l; // i.e. bits [h-1:l] of val, moved down to bit 0
+	}
+
+	bool has_feature(int32_t val, int32_t bit)
+	{
+		// Tests bit `bit` (0..31) of a CPUID register value. The shift is done on an
+		// unsigned operand so that probing bit 31 does not shift into the sign bit.
+		return ((static_cast<uint32_t>(val) >> bit) & 1u) != 0u;
+	}
+	
+	Model getModel()
+	{
+		// Leaf 0 delivers the 12 character vendor id, stored here from val[1], val[3], val[2]
+		int32_t regs[4];
+		cpuid(0, 0, regs);
+
+		char vendor[13] = {0};
+		std::memcpy(vendor, &regs[1], 4);
+		std::memcpy(vendor + 4, &regs[3], 4);
+		std::memcpy(vendor + 8, &regs[2], 4);
+
+		// Leaf 1 delivers the version information (regs[0]) and the feature flags (regs[2], regs[3])
+		cpuid(1, 0, regs);
+		const int32_t eax = regs[0];
+
+		Model info;
+		info.type_name = vendor;
+		// get_masked(v, h, l) yields bits [h-1:l]: base family = EAX[11:8],
+		// model = EAX[7:4] combined with the extended model EAX[19:16] as high nibble
+		info.family = get_masked(eax, 12, 8);
+		info.model = get_masked(eax, 8, 4) | get_masked(eax, 20, 16) << 4;
+
+		// feature bits, see https://en.wikipedia.org/wiki/CPUID
+		info.sse2 = has_feature(regs[3], 26);
+		info.aes = has_feature(regs[2], 25); // aes-ni
+		info.avx = has_feature(regs[2], 28);
+
+		// for AMD a base family of 0xF is extended by the extended family field EAX[27:20]
+		const bool isAmd = strcmp(vendor, "AuthenticAMD") == 0;
+		if(isAmd && info.family == 0xF)
+			info.family += get_masked(eax, 28, 20);
+
+		return info;
+	}
+
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp
new file mode 100644
index 000000000..7f6bfaf51
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <string>
+#include <cstdint>
+
+
+namespace xmrstak
+{
+namespace cpu
+{
+	struct Model // CPU identification as filled in by getModel()
+	{
+		uint32_t family = 0u; // CPUID leaf 1 EAX[11:8]; on "AuthenticAMD" with base family 0xF, EAX[27:20] is added
+		uint32_t model = 0u; // CPUID leaf 1 EAX[7:4] | (EAX[19:16] << 4)
+		bool aes = false; // CPUID leaf 1 ECX bit 25
+		bool sse2 = false; // CPUID leaf 1 EDX bit 26
+		bool avx = false; // CPUID leaf 1 ECX bit 28
+		std::string type_name = "unknown"; // 12 character vendor string from CPUID leaf 0 (EBX, EDX, ECX)
+	};
+
+	Model getModel(); // runs CPUID leaf 0 and leaf 1 and returns the decoded result
+
+	/** Return bits [l, h) of val, shifted down so that bit l becomes bit 0
+	 *
+	 * l is inclusive and h is exclusive, i.e. h is one above the highest bit of the field
+	 * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22)
+	 */
+	int32_t get_masked(int32_t val, int32_t h, int32_t l);
+
+	
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
new file mode 100644
index 000000000..79adab671
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+ALIGN 16
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movq xmm0, rdx
+	xor rdx, [r11+r13]
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movq r9, xmm10
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272] /* epilogue: reload the MXCSR value that stmxcsr stored in this slot on entry */
+	movaps	xmm13, XMMWORD PTR [rsp+48] /* xmm6-xmm15 are reloaded from the slots they were spilled to on entry */
+	lea	r11, QWORD PTR [rsp+184] /* r11 = first byte above the 184-byte local area ("sub rsp, 184") */
+	movaps	xmm6, XMMWORD PTR [r11-24] /* same slot as [rax-88] on entry (rax = rsp before the 8 pushes) */
+	movaps	xmm7, XMMWORD PTR [r11-40] /* entry slot [rax-104] */
+	movaps	xmm8, XMMWORD PTR [r11-56] /* entry slot [rax-120] */
+	movaps	xmm9, XMMWORD PTR [r11-72] /* entry slot [rsp+112] */
+	movaps	xmm10, XMMWORD PTR [r11-88] /* entry slot [rsp+96] */
+	movaps	xmm11, XMMWORD PTR [r11-104] /* entry slot [rsp+80] */
+	movaps	xmm12, XMMWORD PTR [r11-120] /* entry slot [rsp+64] */
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11 /* release the local area; rsp now points at the saved r15 */
+	pop	r15 /* the pops mirror the eight pushes on entry, in reverse order */
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp /* jump over the out-of-line fixups to the label that ends this file */
+
+div_fix_1_sandybridge: /* entered via "js": r11 (dividend minus quotient*rdx) came out negative */
+	dec	rbx /* rbx holds the quotient estimate taken from cvttsd2si; lower it by one */
+	add	r11, rdx /* add the divisor (rdx) back onto the remainder */
+	jmp	div_fix_1_ret_sandybridge /* resume the main loop right after the "js" */
+
+div_fix_2_sandybridge: /* entered via "js": r8 (dividend minus quotient*r9) came out negative */
+	dec	rdx /* rdx holds the second quotient estimate taken from cvttsd2si; lower it by one */
+	add	r8, r9 /* add the divisor (r9) back onto the remainder */
+	jmp	div_fix_2_ret_sandybridge /* resume the main loop right after the "js" */
+
+sqrt_fix_1_sandybridge: /* entered when the low 19 bits of r9 (low lane of the sqrtpd result) are all zero */
+	movq	r8, xmm3 /* r8 = low lane of xmm3, the value whose shifted and biased copy was fed to sqrtpd */
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8 /* xmm0 low lane = high lane of xmm5, kept for the repack below */
+	dec	r9 /* r9 = raw result bits - 1 */
+	mov r11d, -1022
+	shl r11, 32 /* r11 = 0xFFFFFC0200000000 (32-bit -1022 moved into the upper dword) */
+	mov	rax, r9
+	shr	r9, 19 /* r9 = (bits - 1) >> 19 */
+	shr	rax, 20 /* rax = (bits - 1) >> 20 */
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1] /* rdx = r9 - rax + r11 + 1 */
+	add	rax, r11
+	imul	rdx, rax /* rdx = low 64 bits of the product */
+	sub	rdx, r8 /* only the borrow is used: CF = 1 when the product is below r8 (unsigned) */
+	adc	r9, 0 /* r9 += CF; NOTE(review): presumably an exactness correction of the FP sqrt - confirm against the reference implementation */
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0 /* xmm5 = { adjusted low lane, previous high lane } */
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge: /* entered when the low 19 bits of r8 (high lane of the sqrtpd result) are all zero */
+	psrldq	xmm3, 8
+	movq	r11, xmm3 /* r11 = high lane of xmm3, the value whose shifted and biased copy was fed to sqrtpd */
+	dec	r8 /* r8 = raw result bits - 1 */
+	mov ebx, -1022
+	shl rbx, 32 /* rbx = 0xFFFFFC0200000000 (32-bit -1022 moved into the upper dword) */
+	mov	rax, r8
+	shr	r8, 19 /* r8 = (bits - 1) >> 19 */
+	shr	rax, 20 /* rax = (bits - 1) >> 20 */
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1] /* rdx = r8 - rax + rbx + 1 */
+	add	rax, rbx
+	imul	rdx, rax /* rdx = low 64 bits of the product */
+	sub	rdx, r11 /* only the borrow is used: CF = 1 when the product is below r11 (unsigned) */
+	adc	r8, 0 /* r8 += CF; NOTE(review): presumably an exactness correction of the FP sqrt - confirm against the reference implementation */
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0 /* xmm5 = { unchanged low lane, adjusted high lane } */
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
new file mode 100644
index 000000000..ad8f18233
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movd	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movd	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movd xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movd xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movd	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movd	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movd	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+	ALIGN 64
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movd	xmm0, r11
+	movd	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movd	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movd	xmm0, rbp
+	movd	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movd	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movd	rdx, xmm5
+	shl	rdx, 32
+	movd	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movd xmm0, rdx
+	xor rdx, [r11+r13]
+	movd xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movd	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movd	r11, xmm0
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movd	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movd	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movd	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movd	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movd r9, xmm10
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movd xmm0, rax
+	movd xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272] ; epilogue: reload the MXCSR value that stmxcsr stored in this slot on entry
+	movaps	xmm13, XMMWORD PTR [rsp+48] ; xmm6-xmm15 are reloaded from the slots they were spilled to on entry
+	lea	r11, QWORD PTR [rsp+184] ; r11 = first byte above the 184-byte local area ("sub rsp, 184")
+	movaps	xmm6, XMMWORD PTR [r11-24] ; same slot as [rax-88] on entry (rax = rsp before the 8 pushes)
+	movaps	xmm7, XMMWORD PTR [r11-40] ; entry slot [rax-104]
+	movaps	xmm8, XMMWORD PTR [r11-56] ; entry slot [rax-120]
+	movaps	xmm9, XMMWORD PTR [r11-72] ; entry slot [rsp+112]
+	movaps	xmm10, XMMWORD PTR [r11-88] ; entry slot [rsp+96]
+	movaps	xmm11, XMMWORD PTR [r11-104] ; entry slot [rsp+80]
+	movaps	xmm12, XMMWORD PTR [r11-120] ; entry slot [rsp+64]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11 ; release the local area; rsp now points at the saved r15
+	pop	r15 ; the pops mirror the eight pushes on entry, in reverse order
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp ; jump over the out-of-line fixups to the label that ends this file
+
+div_fix_1_sandybridge: ; entered via "js": r11 (dividend minus quotient*rdx) came out negative
+	dec	rbx ; rbx holds the quotient estimate taken from cvttsd2si; lower it by one
+	add	r11, rdx ; add the divisor (rdx) back onto the remainder
+	jmp	div_fix_1_ret_sandybridge ; resume the main loop right after the "js"
+
+div_fix_2_sandybridge: ; entered via "js": r8 (dividend minus quotient*r9) came out negative
+	dec	rdx ; rdx holds the second quotient estimate taken from cvttsd2si; lower it by one
+	add	r8, r9 ; add the divisor (r9) back onto the remainder
+	jmp	div_fix_2_ret_sandybridge ; resume the main loop right after the "js"
+
+sqrt_fix_1_sandybridge: ; entered when the low 19 bits of r9 (low lane of the sqrtpd result) are all zero
+	movd	r8, xmm3 ; r8 = low lane of xmm3, the value whose shifted and biased copy was fed to sqrtpd
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8 ; xmm0 low lane = high lane of xmm5, kept for the repack below
+	dec	r9 ; r9 = raw result bits - 1
+	mov r11d, -1022
+	shl r11, 32 ; r11 = 0xFFFFFC0200000000 (32-bit -1022 moved into the upper dword)
+	mov	rax, r9
+	shr	r9, 19 ; r9 = (bits - 1) >> 19
+	shr	rax, 20 ; rax = (bits - 1) >> 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1] ; rdx = r9 - rax + r11 + 1
+	add	rax, r11
+	imul	rdx, rax ; rdx = low 64 bits of the product
+	sub	rdx, r8 ; only the borrow is used: CF = 1 when the product is below r8 (unsigned)
+	adc	r9, 0 ; r9 += CF; NOTE(review): presumably an exactness correction of the FP sqrt - confirm against the reference implementation
+	movd xmm5, r9
+	punpcklqdq xmm5, xmm0 ; xmm5 = { adjusted low lane, previous high lane }
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge: ; entered when the low 19 bits of r8 (high lane of the sqrtpd result) are all zero
+	psrldq	xmm3, 8
+	movd	r11, xmm3 ; r11 = high lane of xmm3, the value whose shifted and biased copy was fed to sqrtpd
+	dec	r8 ; r8 = raw result bits - 1
+	mov ebx, -1022
+	shl rbx, 32 ; rbx = 0xFFFFFC0200000000 (32-bit -1022 moved into the upper dword)
+	mov	rax, r8
+	shr	r8, 19 ; r8 = (bits - 1) >> 19
+	shr	rax, 20 ; rax = (bits - 1) >> 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1] ; rdx = r8 - rax + rbx + 1
+	add	rax, rbx
+	imul	rdx, rax ; rdx = low 64 bits of the product
+	sub	rdx, r11 ; only the borrow is used: CF = 1 when the product is below r11 (unsigned)
+	adc	r8, 0 ; r8 += CF; NOTE(review): presumably an exactness correction of the FP sqrt - confirm against the reference implementation
+	movd xmm0, r8
+	punpcklqdq xmm5, xmm0 ; xmm5 = { unchanged low lane, adjusted high lane }
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
new file mode 100644
index 000000000..c0a3d0b41
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
@@ -0,0 +1,58 @@
+/*
+ * System V x86-64 entry points for the hand-written CryptoNight v8 main loops.
+ *
+ * The included *_linux.inc bodies read their arguments from rcx (and rdx) and
+ * store to the bytes just above the stack pointer they are entered with
+ * (e.g. "mov QWORD PTR [rsp+24], rbx"). Every wrapper therefore reserves
+ * 48 bytes of stack and copies rdi (and rsi) into rcx (and rdx) first.
+ */
+
+/*
+ * ALIGN takes a byte count and is also used inside the included bodies.
+ * A plain ".align N" means N bytes on x86 ELF but 2^N bytes on Mach-O, so
+ * ".balign" is used: it is a byte count on every target.
+ */
+#define ALIGN .balign
+.intel_syntax noprefix
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
+.section .text
+#endif
+.global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm)
+.global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm)
+.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm)
+
+/* one argument: rdi -> rcx */
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm):
+	sub rsp, 48
+	mov rcx, rdi
+	#include "cryptonight_v8_main_loop_ivybridge_linux.inc"
+	add rsp, 48
+	ret 0
+
+/* one argument: rdi -> rcx */
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm):
+	sub rsp, 48
+	mov rcx, rdi
+	#include "cryptonight_v8_main_loop_ryzen_linux.inc"
+	add rsp, 48
+	ret 0
+
+/* two arguments: rdi -> rcx, rsi -> rdx */
+ALIGN 16
+FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm):
+	sub rsp, 48
+	mov rcx, rdi
+	mov rdx, rsi
+	#include "cryptonight_v8_double_main_loop_sandybridge_linux.inc"
+	add rsp, 48
+	ret 0
+
+#if defined(__linux__) && defined(__ELF__)
+/* Declare that this object does not need an executable stack. */
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
new file mode 100644
index 000000000..1f3d2e15c
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
@@ -0,0 +1,25 @@
+_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE ; page-aligned, read/execute segment holding the three CryptoNight v8 main loops
+PUBLIC cryptonight_v8_mainloop_ivybridge_asm ; the three entry points below are exported
+PUBLIC cryptonight_v8_mainloop_ryzen_asm
+PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm
+
+ALIGN 8 ; no argument shuffling here: the included body reads its argument straight from rcx
+cryptonight_v8_mainloop_ivybridge_asm PROC
+	INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc
+	ret 0 ; the included body ends at its *_endp label, so execution falls through to this ret
+cryptonight_v8_mainloop_ivybridge_asm ENDP
+
+ALIGN 8 ; as above, the included body reads its argument from rcx
+cryptonight_v8_mainloop_ryzen_asm PROC
+	INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc
+	ret 0
+cryptonight_v8_mainloop_ryzen_asm ENDP
+
+ALIGN 8 ; two-argument variant: the included body reads both rcx and rdx
+cryptonight_v8_double_mainloop_sandybridge_asm PROC
+	INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc
+	ret 0 ; reached through the cnv2_double_mainloop_asm_sandybridge_endp label that ends the include
+cryptonight_v8_double_mainloop_sandybridge_asm ENDP
+
+_TEXT_CNV8_MAINLOOP ENDS
+END
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
new file mode 100644
index 000000000..cbe43b0d3
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
@@ -0,0 +1,187 @@
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movq	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movq	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movq	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movq xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+
+	ALIGN 8
+main_loop_ivybridge:
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movq	 xmm0, r11
+	movq	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movq	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movq	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movq	 rax, xmm0
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movq	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+	movq	 rdx, xmm3
+	test	 edx, 524287
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp
+	movq xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movq xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp] /* epilogue: reload the MXCSR value that stmxcsr stored at [rsp] on entry */
+	mov	 rbx, QWORD PTR [rsp+160] /* rbx was stored at [rsp+24] before the 7 pushes and "sub rsp, 80": 24 + 56 + 80 = 160 */
+	movaps	 xmm6, XMMWORD PTR [rsp+64] /* xmm6-xmm8 come back from the slots they were spilled to on entry */
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80 /* release the local area */
+	pop	 r15 /* the pops mirror the seven pushes on entry, in reverse order */
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp /* jump over the out-of-line fixup to the label that ends this file */
+
+sqrt_fixup_ivybridge: /* entered when the low 19 bits of edx (bits of the sqrtsd result) are all zero */
+	dec	 rdx /* rdx = raw result bits - 1 */
+	mov r13d, -1022 /* r13 is borrowed as scratch here and reloaded further down */
+	shl r13, 32 /* r13 = 0xFFFFFC0200000000 (32-bit -1022 moved into the upper dword) */
+	mov	 rax, rdx
+	shr	 rdx, 19 /* rdx = (bits - 1) >> 19 */
+	shr	 rax, 20 /* rax = (bits - 1) >> 20 */
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13 /* subtracting ~r13 equals adding r13 + 1 */
+	mov	 r13d, -2147483647 /* put back the 0x80000001 constant that the main loop ORs into r9d */
+	imul	 rcx, rax /* rcx = low 64 bits of the product */
+	sub	 rcx, r9 /* only the borrow is used: CF = 1 when the product is below r9 (unsigned) */
+	adc	 rdx, 0 /* rdx += CF; NOTE(review): presumably an exactness correction of the FP sqrt - confirm against the reference implementation */
+	movq	 xmm3, rdx /* xmm3 low lane = adjusted value, matching what the non-fixup path leaves in xmm3 */
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
new file mode 100644
index 000000000..8d49c5db7
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
@@ -0,0 +1,187 @@
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movd	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movd	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movd	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movd xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+
+	ALIGN 8
+main_loop_ivybridge:
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movd	 xmm0, r11
+	movd	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movd	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movd	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movd	 rax, xmm0
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movd	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+	movd	 rdx, xmm3
+	test	 edx, 524287
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp
+	movd xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movd xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp] ; epilogue: reload the MXCSR value that stmxcsr stored at [rsp] on entry
+	mov	 rbx, QWORD PTR [rsp+160] ; rbx was stored at [rsp+24] before the 7 pushes and "sub rsp, 80": 24 + 56 + 80 = 160
+	movaps	 xmm6, XMMWORD PTR [rsp+64] ; xmm6-xmm8 come back from the slots they were spilled to on entry
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80 ; release the local area
+	pop	 r15 ; the pops mirror the seven pushes on entry, in reverse order
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp ; jump over the out-of-line fixup to the label that ends this file
+
+sqrt_fixup_ivybridge: ; entered when the low 19 bits of edx (bits of the sqrtsd result) are all zero
+	dec	 rdx ; rdx = raw result bits - 1
+	mov r13d, -1022 ; r13 is borrowed as scratch here and reloaded further down
+	shl r13, 32 ; r13 = 0xFFFFFC0200000000 (32-bit -1022 moved into the upper dword)
+	mov	 rax, rdx
+	shr	 rdx, 19 ; rdx = (bits - 1) >> 19
+	shr	 rax, 20 ; rax = (bits - 1) >> 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13 ; subtracting ~r13 equals adding r13 + 1
+	mov	 r13d, -2147483647 ; put back the 0x80000001 constant that the main loop ORs into r9d
+	imul	 rcx, rax ; rcx = low 64 bits of the product
+	sub	 rcx, r9 ; only the borrow is used: CF = 1 when the product is below r9 (unsigned)
+	adc	 rdx, 0 ; rdx += CF; NOTE(review): presumably an exactness correction of the FP sqrt - confirm against the reference implementation
+	movd	 xmm3, rdx ; xmm3 low lane = adjusted value, matching what the non-fixup path leaves in xmm3
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
new file mode 100644
index 000000000..cd8b43477
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
@@ -0,0 +1,179 @@
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movq xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 8
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movq	xmm0, r11
+	movq	xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movq	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movq	rax, xmm0
+
+	div	r9
+	movq xmm0, rax
+	movq xmm1, rdx
+	punpckldq xmm0, xmm1
+	movq r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movq	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp] /* epilogue: reload the MXCSR value that stmxcsr stored at [rsp] on entry */
+	movaps	xmm6, XMMWORD PTR [rsp+48] /* xmm6-xmm8 come back from the slots they were spilled to on entry */
+	lea	r11, QWORD PTR [rsp+64] /* r11 = first byte above the 64-byte local area ("sub rsp, 64") */
+	mov	rbx, QWORD PTR [r11+56] /* entry slot [rsp+16], now 5 pushes (40 bytes) further up */
+	mov	rbp, QWORD PTR [r11+64] /* entry slot [rsp+24] */
+	mov	rsi, QWORD PTR [r11+72] /* entry slot [rsp+32] */
+	movaps	xmm8, XMMWORD PTR [r11-48] /* same address as [rsp+16] */
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11 /* release the local area; rsp now points at the saved r15 */
+	pop	r15 /* the pops mirror the five pushes on entry, in reverse order */
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp /* jump over the out-of-line fixup to the label that ends this file */
+
+sqrt_fixup_ryzen: /* entered when the low 19 bits of rdi (bits of the sqrtsd result) are all zero */
+	movq r9, xmm2 /* r9 = low lane of xmm2, the copy taken before the value was shifted and biased for sqrtsd */
+	dec	rdi /* rdi = raw result bits - 1 */
+	mov edx, -1022
+	shl rdx, 32 /* rdx = 0xFFFFFC0200000000 (32-bit -1022 moved into the upper dword) */
+	mov	rax, rdi
+	shr	rdi, 19 /* rdi = (bits - 1) >> 19; the non-fixup path does the same shift on the unmodified bits */
+	shr	rax, 20 /* rax = (bits - 1) >> 20 */
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1] /* rcx = rdi - rax + rdx + 1 */
+	add	rax, rdx
+	imul	rcx, rax /* rcx = low 64 bits of the product */
+	sub	rcx, r9 /* only the borrow is used: CF = 1 when the product is below r9 (unsigned) */
+	adc	rdi, 0 /* rdi += CF; NOTE(review): presumably an exactness correction of the FP sqrt - confirm against the reference implementation */
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
new file mode 100644
index 000000000..d103cc2ee
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
@@ -0,0 +1,179 @@
+	mov	QWORD PTR [rsp+16], rbx ; CryptoNight v8 main loop body, Ryzen/Win64 variant (per file name). "ctx" below = the pointer passed in rcx. rbx/rbp/rsi are parked above the return address (presumably the Win64 home space) and reloaded in the epilogue
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi ; rdi and r12-r15 are overwritten below, so they are pushed here and popped in the epilogue
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64 ; local frame: [rsp] saved MXCSR, [rsp+4] temporary MXCSR, [rsp+16]/[rsp+32]/[rsp+48] saved xmm8/xmm7/xmm6
+
+	stmxcsr DWORD PTR [rsp] ; remember the caller's MXCSR; it is reloaded right after the loop
+	mov DWORD PTR [rsp+4], 24448 ; 24448 = 0x5F80 = 0x1F80 (all SSE exceptions masked) | 0x4000 (rounding-control field = 10b)
+	ldmxcsr DWORD PTR [rsp+4] ; this MXCSR is in effect for the sqrtsd inside the loop
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx ; keep a copy of ctx in r9 because rcx itself is overwritten further down
+	xor	rax, QWORD PTR [rcx+16] ; rax = [ctx+48] ^ [ctx+16]
+	mov	ebp, 524288 ; iteration counter (0x80000), decremented at the bottom of the loop
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx] ; r8 = [ctx+32] ^ [ctx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8 ; r10 becomes the first memory offset once masked below
+	mov	rdx, QWORD PTR [rcx+56]
+	movd	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24] ; rdx = [ctx+56] ^ [ctx+24]
+	xor	r11, QWORD PTR [rcx+8] ; r11 = [ctx+40] ^ [ctx+8]
+	mov	rbx, QWORD PTR [rcx+224] ; rbx = base address added to every offset below (NOTE(review): presumably the scratchpad pointer stored after the 224-byte hash_state -- confirm against the context struct)
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64] ; rax = [ctx+80] ^ [ctx+64]
+	movd	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72] ; rcx = [ctx+88] ^ [ctx+72]
+	mov	rdi, QWORD PTR [r9+104] ; rdi = [ctx+104]; inside the loop rdi carries the previous square-root result
+	and	r10d, 2097136 ; mask 0x1FFFF0: offset is 16-byte aligned and below 2 MiB
+	movaps	XMMWORD PTR [rsp+48], xmm6 ; xmm6/xmm7/xmm8 are overwritten below; saved here, restored in the epilogue
+	movd	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8 ; xmm8 = 0 and is not written again until the epilogue
+	mov ax, 1023 ; only the low 12 bits of rax survive the shift on the next line, so the stale upper bits do not matter
+	shl rax, 52 ; rax = 1023 << 52 = 0x3FF0000000000000 (bit pattern of the double 1.0)
+	movd xmm7, rax ; xmm7 = that constant; added to the shifted integer before sqrtsd
+	mov	r15, QWORD PTR [r9+96] ; r15 = [ctx+96]; inside the loop r15 carries the previous packed quotient/remainder
+	punpcklqdq xmm3, xmm0 ; xmm3 = { low: [ctx+48]^[ctx+16], high: [ctx+56]^[ctx+24] }
+	movd	xmm0, rcx
+	punpcklqdq xmm4, xmm0 ; xmm4 = { low: [ctx+80]^[ctx+64], high: [ctx+88]^[ctx+72] }
+
+	ALIGN 8
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx] ; load the 16-byte chunk at the current offset
+	movd	xmm0, r11
+	movd	xmm6, r8
+	punpcklqdq xmm6, xmm0 ; xmm6 = { low: r8, high: r11 }
+	lea	rdx, QWORD PTR [r10+rbx] ; remember the chunk's address; it is written back further down
+	lea	r9, QWORD PTR [rdi+rdi] ; r9 = 2 * rdi, taken before rdi is shifted
+	shl	rdi, 32
+
+	mov	ecx, r10d ; ecx/eax/r10d = offset ^ 16 / ^ 32 / ^ 48: the other three 16-byte chunks of the same 64-byte aligned block
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6 ; one AES encryption round on the loaded chunk, xmm6 as round key
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx] ; xmm2 = chunk(off^16)
+	movdqa	xmm1, XMMWORD PTR [rax+rbx] ; xmm1 = chunk(off^32)
+	movdqa	xmm0, XMMWORD PTR [r10+rbx] ; xmm0 = chunk(off^48)
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0 ; chunks are stored back rotated: off^16 <- chunk(off^48) + xmm4
+	movdqa	XMMWORD PTR [rax+rbx], xmm2 ; off^32 <- chunk(off^16) + xmm3
+	movdqa	XMMWORD PTR [r10+rbx], xmm1 ; off^48 <- chunk(off^32) + xmm6
+
+	movaps	xmm1, xmm8 ; clear xmm1 (xmm8 is zero) before sqrtsd writes its low half
+	mov	rsi, r15
+	xor	rsi, rdi ; rsi = r15 ^ (previous sqrt result << 32)
+	movd	r14, xmm5 ; r14 = low 64 bits of the AES result
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3 ; xmm0 = AES result ^ xmm3
+	mov	r10, r14
+	and	r10d, 2097136 ; second offset of this iteration = r14 & 0x1FFFF0
+	movdqa	XMMWORD PTR [rdx], xmm0 ; write xmm0 back to the chunk read at the loop head
+	xor	rsi, QWORD PTR [r10+rbx] ; rsi ^= low qword of the chunk at the second offset
+	lea	r12, QWORD PTR [r10+rbx] ; r12 = address of that chunk; r8/r11 are stored there near the end of the iteration
+	mov	r13, QWORD PTR [r10+rbx+8] ; r13 = high qword of that chunk
+
+	add	r9d, r14d ; r9d = 2 * (previous sqrt result) + low 32 bits of the AES result
+	or	r9d, -2147483647 ; OR with 0x80000001: divisor is odd with bit 31 set, hence never zero; the 32-bit write also clears the upper half of r9
+	xor	edx, edx ; rdx = 0, so the 128-bit dividend is 0:rax
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movd	rax, xmm0 ; rax = high 64 bits of the AES result
+
+	div	r9 ; rax = quotient, rdx = remainder
+	movd xmm0, rax
+	movd xmm1, rdx
+	punpckldq xmm0, xmm1 ; low qword of xmm0 = (low 32 bits of quotient) | (low 32 bits of remainder << 32)
+	movd r15, xmm0 ; r15 = that packed value, consumed when rsi is formed in the next iteration
+	paddq xmm0, xmm5 ; add the AES result (64-bit lanes)
+	movdqa xmm2, xmm0 ; keep the unshifted sum; sqrt_fixup_ryzen reads its low qword
+	psrlq xmm0, 12 ; value now fits in 52 bits
+	paddq	xmm0, xmm7 ; add 0x3FF0000000000000: the low qword is now the bit pattern of a double in [1.0, 2.0)
+	sqrtsd	xmm1, xmm0 ; low qword of xmm1 = sqrt of that double, rounded according to the MXCSR set in the prologue
+	movd	rdi, xmm1 ; rdi = raw bits of the square root
+	test	rdi, 524287 ; 524287 = 0x7FFFF: are the low 19 bits all zero?
+	je	sqrt_fixup_ryzen ; if so, take the out-of-line correction path, which jumps back to sqrt_fixup_ryzen_ret
+	shr	rdi, 19 ; otherwise the result is simply the bits shifted right by 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14 ; rdx:rax = rsi * r14 (unsigned 128-bit product)
+	movd xmm1, rax
+	movd xmm0, rdx
+	punpcklqdq xmm0, xmm1 ; xmm0 = { low: high half of the product, high: low half of the product }
+
+	mov	r9d, r10d ; r9d/ecx/r10d = second offset ^ 16 / ^ 32 / ^ 48
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx] ; xmm1 = chunk(off^32)
+	xor rdx, [rcx+rbx] ; high half of the product ^= low qword of chunk(off^32)
+	xor rax, [rcx+rbx+8] ; low half of the product ^= high qword of chunk(off^32)
+	movdqa	xmm2, XMMWORD PTR [r9+rbx] ; xmm2 = chunk(off^16)
+	pxor xmm2, xmm0 ; mix the (pre-XOR) product into chunk(off^16)
+	paddq xmm4, XMMWORD PTR [r10+rbx] ; xmm4 += chunk(off^48)
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4 ; off^16 <- chunk(off^48) + xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2 ; off^32 <- (chunk(off^16) ^ product) + xmm3
+	movdqa	XMMWORD PTR [r10+rbx], xmm1 ; off^48 <- chunk(off^32) + xmm6
+
+	movdqa	xmm4, xmm3 ; xmm4 takes over the value xmm3 held during this iteration
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8 ; store the updated r8:r11 pair into the chunk at the second offset
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13 ; r11 ^= high qword read from that chunk before it was overwritten
+	and	r10d, 2097136 ; first offset for the next iteration = r8 & 0x1FFFF0
+	movdqa	xmm3, xmm5 ; xmm3 = this iteration's AES result
+	dec	ebp
+	jne	main_loop_ryzen ; repeat until the counter reaches zero
+
+	ldmxcsr DWORD PTR [rsp] ; restore the caller's MXCSR saved in the prologue
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64] ; r11 = stack pointer value from before "sub rsp, 64"
+	mov	rbx, QWORD PTR [r11+56] ; r11+56/+64/+72 are the entry-time [rsp+16]/[rsp+24]/[rsp+32] slots (5 pushes = 40 bytes)
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48] ; same slot as [rsp+16]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11 ; drop the local frame
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp ; skip over the out-of-line fixup code to the end label
+
+sqrt_fixup_ryzen:
+	movd r9, xmm2 ; r9 = low qword of the unshifted sum saved before the psrlq
+	dec	rdi ; rdi = sqrt bits - 1
+	mov edx, -1022 ; 32-bit move: rdx = 0x00000000FFFFFC02
+	shl rdx, 32 ; rdx = 0xFFFFFC0200000000
+	mov	rax, rdi
+	shr	rdi, 19 ; rdi = (bits - 1) >> 19: candidate result
+	shr	rax, 20 ; rax = (bits - 1) >> 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1] ; rcx = rdi - rax + rdx + 1
+	add	rax, rdx
+	imul	rcx, rax ; rcx = (rdi - rax + rdx + 1) * (rax + rdx), low 64 bits
+	sub	rcx, r9 ; sets the carry flag when rcx < r9 (unsigned)
+	adc	rdi, 0 ; bump the candidate by one in that case
+	jmp	sqrt_fixup_ryzen_ret ; rejoin the main path with the corrected rdi
+
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/c_blake256.c b/xmrstak/backend/cpu/crypto/c_blake256.c
index ff623ddb8..e5fadfe74 100644
--- a/xmrstak/backend/cpu/crypto/c_blake256.c
+++ b/xmrstak/backend/cpu/crypto/c_blake256.c
@@ -124,7 +124,7 @@ void blake224_init(state *S) {
 }
 
 // datalen = number of bits
-void blake256_update(state *S, const uint8_t *data, uint64_t datalen) {
+void blake256_update(state *S, const uint8_t *data, uint32_t datalen) {
 	int left = S->buflen >> 3;
 	int fill = 64 - left;
 
@@ -155,7 +155,7 @@ void blake256_update(state *S, const uint8_t *data, uint64_t datalen) {
 }
 
 // datalen = number of bits
-void blake224_update(state *S, const uint8_t *data, uint64_t datalen) {
+void blake224_update(state *S, const uint8_t *data, uint32_t datalen) { /* forwards data and the bit count unchanged to blake256_update */
 	blake256_update(S, data, datalen);
 }
 
@@ -206,7 +206,7 @@ void blake224_final(state *S, uint8_t *digest) {
 }
 
 // inlen = number of bytes
-void blake256_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) {
+void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
 	state S;
 	blake256_init(&S);
 	blake256_update(&S, in, inlen * 8);
@@ -214,7 +214,7 @@ void blake256_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) {
 }
 
 // inlen = number of bytes
-void blake224_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) {
+void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
 	state S;
 	blake224_init(&S);
 	blake224_update(&S, in, inlen * 8);
@@ -282,13 +282,13 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 }
 
 // datalen = number of bits
-void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint64_t datalen) {
+void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint32_t datalen) { /* feeds datalen bits of data into S->inner only */
   // update the inner state
   blake256_update(&S->inner, data, datalen);
 }
 
 // datalen = number of bits
-void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint64_t datalen) {
+void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint32_t datalen) { /* feeds datalen bits of data into S->inner only */
   // update the inner state
   blake224_update(&S->inner, data, datalen);
 }
@@ -310,7 +310,7 @@ void hmac_blake224_final(hmac_state *S, uint8_t *digest) {
 }
 
 // keylen = number of bytes; inlen = number of bytes
-void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) {
+void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) {
 	hmac_state S;
 	hmac_blake256_init(&S, key, keylen);
 	hmac_blake256_update(&S, in, inlen * 8);
@@ -318,7 +318,7 @@ void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const
 }
 
 // keylen = number of bytes; inlen = number of bytes
-void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) {
+void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) {
 	hmac_state S;
 	hmac_blake224_init(&S, key, keylen);
 	hmac_blake224_update(&S, in, inlen * 8);
diff --git a/xmrstak/backend/cpu/crypto/c_blake256.h b/xmrstak/backend/cpu/crypto/c_blake256.h
index b9c2aad0d..06c7917af 100644
--- a/xmrstak/backend/cpu/crypto/c_blake256.h
+++ b/xmrstak/backend/cpu/crypto/c_blake256.h
@@ -17,27 +17,27 @@ typedef struct {
 void blake256_init(state *);
 void blake224_init(state *);
 
-void blake256_update(state *, const uint8_t *, uint64_t);
-void blake224_update(state *, const uint8_t *, uint64_t);
+void blake256_update(state *, const uint8_t *, uint32_t);
+void blake224_update(state *, const uint8_t *, uint32_t);
 
 void blake256_final(state *, uint8_t *);
 void blake224_final(state *, uint8_t *);
 
-void blake256_hash(uint8_t *, const uint8_t *, uint64_t);
-void blake224_hash(uint8_t *, const uint8_t *, uint64_t);
+void blake256_hash(uint8_t *, const uint8_t *, uint32_t);
+void blake224_hash(uint8_t *, const uint8_t *, uint32_t);
 
 /* HMAC functions: */
 
 void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t);
 void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t);
 
-void hmac_blake256_update(hmac_state *, const uint8_t *, uint64_t);
-void hmac_blake224_update(hmac_state *, const uint8_t *, uint64_t);
+void hmac_blake256_update(hmac_state *, const uint8_t *, uint32_t);
+void hmac_blake224_update(hmac_state *, const uint8_t *, uint32_t);
 
 void hmac_blake256_final(hmac_state *, uint8_t *);
 void hmac_blake224_final(hmac_state *, uint8_t *);
 
-void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t);
-void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t);
+void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t);
+void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t);
 
 #endif /* _BLAKE256_H_ */
diff --git a/xmrstak/backend/cpu/crypto/c_groestl.c b/xmrstak/backend/cpu/crypto/c_groestl.c
index 1318d5a6a..5b3523e79 100644
--- a/xmrstak/backend/cpu/crypto/c_groestl.c
+++ b/xmrstak/backend/cpu/crypto/c_groestl.c
@@ -4,7 +4,7 @@
  *
  *  This work is based on the implementation of
  *          Soeren S. Thomsen and Krystian Matusiewicz
- *          
+ *
  *
  */
 
@@ -22,7 +22,7 @@ const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};
 #define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
 															v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
 															v1 = temp_var;}
-  
+
 
 #define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t)				\
    tu = T[2*(uint32_t)x[4*c0+0]];			    \
@@ -161,11 +161,11 @@ static void F512(uint32_t *h, const uint32_t *m) {
 
 /* digest up to msglen bytes of input (full blocks only) */
 static void Transform(groestlHashState *ctx,
-	       const uint8_t *input, 
+	       const uint8_t *input,
 	       int msglen) {
 
   /* digest message, one block at a time */
-  for (; msglen >= SIZE512; 
+  for (; msglen >= SIZE512;
 	   msglen -= SIZE512, input += SIZE512) {
 	F512(ctx->chaining,(uint32_t*)input);
 
@@ -199,7 +199,7 @@ static void OutputTransformation(groestlHashState *ctx) {
 	RND512P((uint8_t*)y, temp, 0x00000009);
 	for (j = 0; j < 2*COLS512; j++) {
 	  ctx->chaining[j] ^= temp[j];
-	}									  
+	}
 }
 
 /* initialise context */
@@ -313,7 +313,7 @@ static void Final(groestlHashState* ctx,
 	ctx->block_counter2 >>= 8;
   }
   /* digest final padding block */
-  Transform(ctx, ctx->buffer, SIZE512); 
+  Transform(ctx, ctx->buffer, SIZE512);
   /* perform output transformation */
   OutputTransformation(ctx);
 
@@ -332,7 +332,7 @@ static void Final(groestlHashState* ctx,
 }
 
 /* hash bit sequence */
-void groestl(const BitSequence* data, 
+void groestl(const BitSequence* data,
 		DataLength databitlen,
 		BitSequence* hashval) {
 
diff --git a/xmrstak/backend/cpu/crypto/c_groestl.h b/xmrstak/backend/cpu/crypto/c_groestl.h
index 2b5133934..47044b462 100644
--- a/xmrstak/backend/cpu/crypto/c_groestl.h
+++ b/xmrstak/backend/cpu/crypto/c_groestl.h
@@ -4,10 +4,10 @@
 #include "crypto_uint8.h"
 #include "crypto_uint32.h"
 #include "crypto_uint64.h"
-#include "crypto_hash.h" 
+#include "crypto_hash.h"
 
-typedef crypto_uint8 uint8_t; 
-typedef crypto_uint32 uint32_t; 
+typedef crypto_uint8 uint8_t;
+typedef crypto_uint32 uint32_t;
 typedef crypto_uint64 uint64_t;
 */
 #include <stdint.h>
@@ -42,7 +42,7 @@ typedef struct {
   BitSequence buffer[SIZE512];      /* data buffer */
   int buf_ptr;              /* data buffer pointer */
   int bits_in_last_byte;    /* no. of message bits in last byte of
-			       data buffer */
+                               data buffer */
 } groestlHashState;
 
 /*void Init(hashState*);
@@ -53,8 +53,8 @@ void groestl(const BitSequence*, DataLength, BitSequence*);
 
 /*
 int crypto_hash(unsigned char *out,
-		const unsigned char *in,
-		unsigned long long len);
+                const unsigned char *in,
+                unsigned long long len);
 */
 
 #endif /* __hash_h */
diff --git a/xmrstak/backend/cpu/crypto/c_jh.c b/xmrstak/backend/cpu/crypto/c_jh.c
index 9d685a0de..0256a0fa2 100644
--- a/xmrstak/backend/cpu/crypto/c_jh.c
+++ b/xmrstak/backend/cpu/crypto/c_jh.c
@@ -234,7 +234,7 @@ static HashReturn Init(hashState *state, int hashbitlen)
 	  /*initialize the initial hash value of JH*/
 	  state->hashbitlen = hashbitlen;
 
-	  /*load the intital hash value into state*/
+	  /*load the initial hash value into state*/
 	  switch (hashbitlen)
 	  {
 			case 224: memcpy(state->x,JH224_H0,128); break;
diff --git a/xmrstak/backend/cpu/crypto/c_keccak.c b/xmrstak/backend/cpu/crypto/c_keccak.c
index eadb85b72..63c16147d 100644
--- a/xmrstak/backend/cpu/crypto/c_keccak.c
+++ b/xmrstak/backend/cpu/crypto/c_keccak.c
@@ -12,14 +12,14 @@
 #define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
 #endif
 
-const uint64_t keccakf_rndc[24] = 
+const uint64_t keccakf_rndc[24] =
 {
 	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
 	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
 	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
 	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
 	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-	0x8000000000008003, 0x8000000000008002, 0x8000000000000080, 
+	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
 	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
 	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
 };
@@ -130,7 +130,7 @@ void keccakf(uint64_t st[25], int rounds)
 		st[j + 2] ^= (~bc[3]) & bc[4];
 		st[j + 3] ^= (~bc[4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
-		
+
 		//  Iota
 		st[0] ^= keccakf_rndc[round];
 	}
@@ -147,7 +147,7 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
 
 	rsiz = sizeof(state_t) == mdlen ? HASH_DATA_AREA : 200 - 2 * mdlen;
 	rsizw = rsiz / 8;
-	
+
 	memset(st, 0, sizeof(st));
 
 	for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) {
@@ -155,7 +155,7 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
 			st[i] ^= ((uint64_t *) in)[i];
 		keccakf(st, KECCAK_ROUNDS);
 	}
-	
+
 	// last block and padding
 	memcpy(temp, in, inlen);
 	temp[inlen++] = 1;
@@ -173,4 +173,4 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
 void keccak1600(const uint8_t *in, int inlen, uint8_t *md)
 {
 	keccak(in, inlen, md, sizeof(state_t));
-}
\ No newline at end of file
+}
diff --git a/xmrstak/backend/cpu/crypto/c_skein.c b/xmrstak/backend/cpu/crypto/c_skein.c
index 2453713f7..e2d54425f 100644
--- a/xmrstak/backend/cpu/crypto/c_skein.c
+++ b/xmrstak/backend/cpu/crypto/c_skein.c
@@ -5,7 +5,7 @@
 ** Source code author: Doug Whiting, 2008.
 **
 ** This algorithm and source code is released to the public domain.
-** 
+**
 ************************************************************************/
 
 #define  SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
@@ -96,12 +96,12 @@ static int  Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 **   After an InitExt() call, just use Update/Final calls as with Init().
 **
 **   Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes.
-**          When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL, 
+**          When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL,
 **              the results of InitExt() are identical to calling Init().
 **          The function Init() may be called once to "precompute" the IV for
 **              a given hashBitLen value, then by saving a copy of the context
 **              the IV computation may be avoided in later calls.
-**          Similarly, the function InitExt() may be called once per MAC key 
+**          Similarly, the function InitExt() may be called once per MAC key
 **              to precompute the MAC IV, then a copy of the context saved and
 **              reused for each new MAC computation.
 **/
@@ -135,7 +135,7 @@ static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 
 /*****************************************************************
 ** "Internal" Skein definitions
-**    -- not needed for sequential hashing API, but will be 
+**    -- not needed for sequential hashing API, but will be
 **           helpful for other uses of Skein (e.g., tree hash mode).
 **    -- included here so that they can be shared between
 **           reference and optimized code.
@@ -257,11 +257,11 @@ static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 #define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
 #define Skein_assert(x)
 #elif   defined(SKEIN_ASSERT)
-#include <assert.h>     
-#define Skein_Assert(x,retCode) assert(x) 
-#define Skein_assert(x)         assert(x) 
+#include <assert.h>
+#define Skein_Assert(x,retCode) assert(x)
+#define Skein_assert(x)         assert(x)
 #else
-#include <assert.h>     
+#include <assert.h>
 #define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
 #define Skein_assert(x)         assert(x)                     /* internal error */
 #endif
@@ -269,8 +269,8 @@ static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 /*****************************************************************
 ** Skein block function constants (shared across Ref and Opt code)
 ******************************************************************/
-enum    
-{   
+enum
+{
   /* Skein_256 round rotation constants */
   R_256_0_0=14, R_256_0_1=16,
   R_256_1_0=52, R_256_1_1=57,
@@ -518,7 +518,7 @@ const u64b_t SKEIN1024_IV_1024[] =
 #define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
 #define KW_TWK_BASE     (0)
 #define KW_KEY_BASE     (3)
-#define ks              (kw + KW_KEY_BASE)                
+#define ks              (kw + KW_KEY_BASE)
 #define ts              (kw + KW_TWK_BASE)
 
 #ifdef SKEIN_DEBUG
@@ -567,7 +567,7 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ts[0] += byteCntAdd;                    /* update processed length */
 
 		/* precompute the key schedule for this block */
-		ks[0] = ctx->X[0];     
+		ks[0] = ctx->X[0];
 		ks[1] = ctx->X[1];
 		ks[2] = ctx->X[2];
 		ks[3] = ctx->X[3];
@@ -594,7 +594,7 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s
 	X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
 	X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
 
-#if SKEIN_UNROLL_256 == 0                       
+#if SKEIN_UNROLL_256 == 0
 #define R256(p0,p1,p2,p3,ROT,rNum)           /* fully unrolled */   \
 	Round256(p0,p1,p2,p3,ROT,rNum)                                  \
 	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
@@ -620,8 +620,8 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s
 	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
 
 	for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256)  /* loop thru it */
-#endif  
-		{    
+#endif
+		{
 #define R256_8_rounds(R)                  \
 		R256(0,1,2,3,R_256_0,8*(R) + 1);  \
 		R256(0,3,2,1,R_256_1,8*(R) + 2);  \
@@ -762,7 +762,7 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ks[5] = ctx->X[5];
 		ks[6] = ctx->X[6];
 		ks[7] = ctx->X[7];
-		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 
+		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 				ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 
 		ts[2] = ts[0] ^ ts[1];
@@ -790,7 +790,7 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
 	X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
 	X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
 
-#if SKEIN_UNROLL_512 == 0                       
+#if SKEIN_UNROLL_512 == 0
 #define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
 	Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
 	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
@@ -1022,7 +1022,7 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s
 	X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC;   \
 	X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE;   \
 
-#if SKEIN_UNROLL_1024 == 0                      
+#if SKEIN_UNROLL_1024 == 0
 #define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
 	Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
 	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr);
@@ -1044,7 +1044,7 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s
 	X13   += ks[((R)+14) % 17] + ts[((R)+1) % 3];                     \
 	X14   += ks[((R)+15) % 17] + ts[((R)+2) % 3];                     \
 	X15   += ks[((R)+16) % 17] +     (R)+1;                           \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); 
+	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
 #else                                       /* looping version */
 #define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
 	Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
@@ -1072,7 +1072,7 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s
 	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
 
 	for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024)    /* loop thru it */
-#endif  
+#endif
 		{
 #define R1024_8_rounds(R)    /* do 8 full rounds */                               \
 		R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \
@@ -1156,7 +1156,7 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ctx->X[15] = X15 ^ w[15];
 
 		Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
-		
+
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 		blkPtr += SKEIN1024_BLOCK_BYTES;
 		}
@@ -1193,7 +1193,7 @@ static int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
 		u08b_t  b[SKEIN_256_STATE_BYTES];
 		u64b_t  w[SKEIN_256_STATE_WORDS];
 		} cfg;                              /* config block */
-		
+
 	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
 	ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
 
@@ -1237,13 +1237,13 @@ static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 		u08b_t  b[SKEIN_256_STATE_BYTES];
 		u64b_t  w[SKEIN_256_STATE_WORDS];
 		} cfg;                              /* config block */
-		
+
 	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
 	Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
 
 	/* compute the initial chaining values ctx->X[], based on key */
 	if (keyBytes == 0)                          /* is there a key? */
-		{                                   
+		{
 		memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
 		}
 	else                                        /* here to pre-process a key */
@@ -1282,7 +1282,7 @@ static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 	/* Set up to process the data message portion of the hash (default) */
 	ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
 	Skein_Start_New_Type(ctx,MSG);
-	
+
 	return SKEIN_SUCCESS;
 	}
 #endif
@@ -1334,7 +1334,7 @@ static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msg
 
 	return SKEIN_SUCCESS;
 	}
-   
+
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
 static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
@@ -1348,7 +1348,7 @@ static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
 		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
 
 	Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
-	
+
 	/* now output the result */
 	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
 
@@ -1391,7 +1391,7 @@ static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
 		u08b_t  b[SKEIN_512_STATE_BYTES];
 		u64b_t  w[SKEIN_512_STATE_WORDS];
 		} cfg;                              /* config block */
-		
+
 	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
 	ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
 
@@ -1437,13 +1437,13 @@ static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 		u08b_t  b[SKEIN_512_STATE_BYTES];
 		u64b_t  w[SKEIN_512_STATE_WORDS];
 		} cfg;                              /* config block */
-		
+
 	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
 	Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
 
 	/* compute the initial chaining values ctx->X[], based on key */
 	if (keyBytes == 0)                          /* is there a key? */
-		{                                   
+		{
 		memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
 		}
 	else                                        /* here to pre-process a key */
@@ -1482,7 +1482,7 @@ static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 	/* Set up to process the data message portion of the hash (default) */
 	ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
 	Skein_Start_New_Type(ctx,MSG);
-	
+
 	return SKEIN_SUCCESS;
 	}
 #endif
@@ -1534,7 +1534,7 @@ static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msg
 
 	return SKEIN_SUCCESS;
 	}
-   
+
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
 static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
@@ -1548,7 +1548,7 @@ static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
 		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 
 	Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
-	
+
 	/* now output the result */
 	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
 
@@ -1590,7 +1590,7 @@ static int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
 		u08b_t  b[SKEIN1024_STATE_BYTES];
 		u64b_t  w[SKEIN1024_STATE_WORDS];
 		} cfg;                              /* config block */
-		
+
 	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
 	ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
 
@@ -1635,13 +1635,13 @@ static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 		u08b_t  b[SKEIN1024_STATE_BYTES];
 		u64b_t  w[SKEIN1024_STATE_WORDS];
 		} cfg;                              /* config block */
-		
+
 	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
 	Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
 
 	/* compute the initial chaining values ctx->X[], based on key */
 	if (keyBytes == 0)                          /* is there a key? */
-		{                                   
+		{
 		memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
 		}
 	else                                        /* here to pre-process a key */
@@ -1680,7 +1680,7 @@ static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 	/* Set up to process the data message portion of the hash (default) */
 	ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
 	Skein_Start_New_Type(ctx,MSG);
-	
+
 	return SKEIN_SUCCESS;
 	}
 #endif
@@ -1732,7 +1732,7 @@ static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msg
 
 	return SKEIN_SUCCESS;
 	}
-   
+
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
 static int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
@@ -1746,7 +1746,7 @@ static int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
 		memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
 
 	Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
-	
+
 	/* now output the result */
 	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
 
@@ -1790,9 +1790,9 @@ static int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
 	if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)   /* zero pad b[] if necessary */
 		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
 	Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);    /* process the final block */
-	
+
 	Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES);   /* "output" the state bytes */
-	
+
 	return SKEIN_SUCCESS;
 	}
 
@@ -1806,9 +1806,9 @@ static int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
 	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)   /* zero pad b[] if necessary */
 		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 	Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);    /* process the final block */
-	
+
 	Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES);   /* "output" the state bytes */
-	
+
 	return SKEIN_SUCCESS;
 	}
 
@@ -1822,9 +1822,9 @@ static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
 	if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)   /* zero pad b[] if necessary */
 		memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
 	Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);    /* process the final block */
-	
+
 	Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES);   /* "output" the state bytes */
-	
+
 	return SKEIN_SUCCESS;
 	}
 
diff --git a/xmrstak/backend/cpu/crypto/c_skein.h b/xmrstak/backend/cpu/crypto/c_skein.h
index 6165a2ace..1aa11dea3 100644
--- a/xmrstak/backend/cpu/crypto/c_skein.h
+++ b/xmrstak/backend/cpu/crypto/c_skein.h
@@ -9,7 +9,7 @@
 ** This algorithm and source code is released to the public domain.
 **
 ***************************************************************************
-** 
+**
 ** The following compile-time switches may be defined to control some
 ** tradeoffs between speed, code size, error checking, and security.
 **
@@ -20,8 +20,8 @@
 **                            [default: no callouts (no overhead)]
 **
 **  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
-**                            code. If not defined, most error checking 
-**                            is disabled (for performance). Otherwise, 
+**                            code. If not defined, most error checking
+**                            is disabled (for performance). Otherwise,
 **                            the switch value is interpreted as:
 **                                0: use assert()      to flag errors
 **                                1: return SKEIN_FAIL to flag errors
@@ -37,11 +37,11 @@ typedef enum
 }
 SkeinHashReturn;
 
-typedef size_t   SkeinDataLength;                /* bit count  type */
+typedef uint32_t SkeinDataLength;                /* bit count  type */
 typedef u08b_t   SkeinBitSequence;               /* bit stream type */
 
 /* "all-in-one" call */
 SkeinHashReturn skein_hash(int hashbitlen,   const SkeinBitSequence *data,
-		SkeinDataLength databitlen, SkeinBitSequence *hashval);
+                           SkeinDataLength databitlen, SkeinBitSequence *hashval);
 
 #endif  /* ifndef _SKEIN_H_ */
diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h
index 631c39a4a..5c9a73332 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight.h
@@ -7,8 +7,6 @@ extern "C" {
 
 #include <stddef.h>
 #include <inttypes.h>
-#include "xmrstak/backend/cryptonight.hpp"
-
 
 typedef struct {
 	uint8_t hash_state[224]; // Need only 200, explicit align
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 9b6e1dc28..2b1741764 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -16,8 +16,11 @@
 #pragma once
 
 #include "cryptonight.h"
+#include "xmrstak/backend/cryptonight.hpp"
 #include <memory.h>
 #include <stdio.h>
+#include <cfenv>
+#include <utility>
 
 #ifdef __GNUC__
 #include <x86intrin.h>
@@ -27,7 +30,7 @@ static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
 	*hi = r >> 64;
 	return (uint64_t)r;
 }
-#define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
+
 #else
 #include <intrin.h>
 #endif // __GNUC__
@@ -42,7 +45,7 @@ extern "C"
 {
 	void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
 	void keccakf(uint64_t st[25], int rounds);
-	extern void(*const extra_hashes[4])(const void *, size_t, char *);
+	extern void(*const extra_hashes[4])(const void *, uint32_t, char *);
 }
 
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
@@ -148,7 +151,20 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
 	*x7 = soft_aesenc(*x7, key);
 }
 
-template<size_t MEM, bool SOFT_AES, bool PREFETCH>
+inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7) // In-place mix of eight 128-bit lanes: lane k becomes (lane k XOR lane k+1), cyclically.
+{
+	__m128i tmp0 = x0; // keep the original x0; it is overwritten before x7 needs it
+	x0 = _mm_xor_si128(x0, x1); // every right-hand operand below is still unmodified when it is read
+	x1 = _mm_xor_si128(x1, x2);
+	x2 = _mm_xor_si128(x2, x3);
+	x3 = _mm_xor_si128(x3, x4);
+	x4 = _mm_xor_si128(x4, x5);
+	x5 = _mm_xor_si128(x5, x6);
+	x6 = _mm_xor_si128(x6, x7);
+	x7 = _mm_xor_si128(x7, tmp0); // wrap-around: the last lane pairs with the original first lane
+}
+
+template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
 void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 {
 	// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -166,6 +182,40 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	xin6 = _mm_load_si128(input + 10);
 	xin7 = _mm_load_si128(input + 11);
 
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	{
+		for(size_t i=0; i < 16; i++)
+		{
+			if(SOFT_AES)
+			{
+				soft_aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				soft_aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+			}
+			else
+			{
+				aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+				aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+			}
+			mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+		}
+	}
+
 	for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 	{
 		if(SOFT_AES)
@@ -213,7 +263,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	}
 }
 
-template<size_t MEM, bool SOFT_AES, bool PREFETCH>
+template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
 void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 {
 	// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -275,6 +325,93 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
+
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+	}
+
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	{
+		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) // extra pass over the scratchpad for the heavy-style variants; eight __m128i blocks per iteration
+		{
+			if(PREFETCH)
+				_mm_prefetch((const char*)(input + i + 0), _MM_HINT_NTA); // cast AFTER the index arithmetic: casting first advances by i bytes, not i blocks
+
+			xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); // fold the next eight scratchpad blocks into the running state
+			xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+			xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+			xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+
+			if(PREFETCH)
+				_mm_prefetch((const char*)(input + i + 4), _MM_HINT_NTA); // same precedence fix for the second half of the 8-block group
+
+			xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+			xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+			xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+			xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+			if(SOFT_AES) // compile-time switch between the table-based and the hardware AES round
+			{
+				soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+			}
+			else
+			{
+				aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+			}
+
+			if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2) // always true here: the enclosing block is guarded by the same condition
+				mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+		}
+
+		for(size_t i=0; i < 16; i++)
+		{
+			if(SOFT_AES)
+			{
+				soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+			}
+			else
+			{
+				aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+				aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+			}
+
+			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+		}
 	}
 
 	_mm_store_si128(output + 4, xout0);
@@ -287,462 +424,579 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 	_mm_store_si128(output + 11, xout7);
 }
 
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
+inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) // Square-root helper built on a double-precision sqrt; its result is stored as sqrt_result by CN_MONERO_V8_DIV.
 {
-	keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
-
-	// Optim - 99% time boundary
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
+	__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); // assemble a double's bit pattern: top 52 bits of n0 in the mantissa, 1023 in the exponent field
+	x = _mm_sqrt_sd(_mm_setzero_pd(), x); // scalar double-precision square root of the low lane
+	uint64_t r = static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_castpd_si128(x))); // raw bits of the sqrt result, not its numeric value
 
-	uint8_t* l0 = ctx0->long_state;
-	uint64_t* h0 = (uint64_t*)ctx0->hash_state;
+	const uint64_t s = r >> 20;
+	r >>= 19; // r keeps one more low bit than s
 
-	uint64_t al0 = h0[0] ^ h0[4];
-	uint64_t ah0 = h0[1] ^ h0[5];
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); // NOTE(review): looks like the product compared against n0 to decide whether r is rounded up - confirm against the algorithm reference
 
-	uint64_t idx0 = h0[0] ^ h0[4];
-
-	// Optim - 90% time boundary
-	for(size_t i = 0; i < ITERATIONS; i++)
-	{
-		__m128i cx;
-		cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
-
-		if(SOFT_AES)
-			cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
-		else
-			cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+#ifdef __INTEL_COMPILER
+	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned __int64*)&x2), r, 0, (unsigned __int64*)&r); // borrow of (x2 - n0) is fed in as carry: r grows by one exactly when x2 < n0
+#elif defined(_MSC_VER) || (__GNUC__ >= 7)
+	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); // same operation; only the pointer cast type differs
+#else
+	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
+	// Fallback to simpler code
+	if (x2 < n0) ++r;
+#endif
+	return r;
+}
 
-		_mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-		idx0 = _mm_cvtsi128_si64(cx);
-		bx0 = cx;
+inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) // Table-based round that CN_STEP1 uses for cryptonight_bittube2 in place of soft_aesenc / _mm_aesenc_si128.
+{
+	alignas(16) uint32_t k[4]; // key as four 32-bit words; 16-byte alignment is required by the aligned store/load below
+	alignas(16) uint32_t x[4]; // working copy of the inverted input words
+	_mm_store_si128((__m128i*)k, key);
+	_mm_store_si128((__m128i*)x, _mm_xor_si128(val, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))); // x = ~val (XOR with the all-ones mask produced by comparing zero with zero)
+	#define BYTE(p, i) ((unsigned char*)&p)[i]
+	k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; // byte i of word (column + i) mod 4 indexes table i
+	x[0] ^= k[0]; // the updated key word is fed back into x, so later words depend on earlier ones
+	k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+	x[1] ^= k[1];
+	k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+	x[2] ^= k[2];
+	k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; // x[3] needs no feedback: nothing reads it afterwards
+	#undef BYTE
+	return _mm_load_si128((__m128i*)k); // the result is the transformed key words, not x
+}
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+template<xmrstak_algo ALGO>
+inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) // Stores tmp to mem_out[0..1], altering two bits of the upper 64-bit half; for an ALGO not listed below only mem_out[0] is written.
+{
+	mem_out[0] = _mm_cvtsi128_si64(tmp); // low 64 bits are stored unchanged
 
-		uint64_t hi, lo, cl, ch;
-		cl = ((uint64_t*)&l0[idx0 & MASK])[0];
-		ch = ((uint64_t*)&l0[idx0 & MASK])[1];
+	tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); // move the upper 64 bits into the low half so they can be extracted
+	uint64_t vh = _mm_cvtsi128_si64(tmp);
 
-		lo = _umul128(idx0, cl, &hi);
+	uint8_t x = static_cast<uint8_t>(vh >> 24); // bits 24..31 of the upper half
+	static const uint16_t table = 0x7531; // used as eight packed 2-bit entries, selected by index below
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+	{
+		const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1; // even shift amount in 0..14, built from bits 4, 5 and 0 of x
+		vh ^= ((table >> index) & 0x3) << 28; // XOR the selected 2-bit entry into bits 28..29
 
-		al0 += hi;
-		ah0 += lo;
-		((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-		((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
-		ah0 ^= ch;
-		al0 ^= cl;
-		idx0 = al0;
+		mem_out[1] = vh;
+	}
+	else if(ALGO == cryptonight_stellite)
+	{
+		const uint8_t index = (((x >> 4) & 6) | (x & 1)) << 1; // only difference from the branch above: x >> 4, i.e. bits 5, 6 and 0 of x
+		vh ^= ((table >> index) & 0x3) << 28;
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+		mem_out[1] = vh;
 	}
 
-	// Optim - 90% time boundary
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
+}
 
-	// Optim - 99% time boundary
+/** optimal type for sqrt
+ *
+ * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected.
+ *
+ * @tparam N number of hashes per thread
+ */
+template<size_t N>
+struct GetOptimalSqrtType
+{
+	using type = __m128i;
+};
 
-	keccakf((uint64_t*)ctx0->hash_state, 24);
-	extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
+template<>
+struct GetOptimalSqrtType<1u>
+{
+	using type = uint64_t;
+};
+template<size_t N>
+using GetOptimalSqrtType_t = typename GetOptimalSqrtType<N>::type;
+
+/** assign a value and convert if necessary
+ *
+ * @param output output type
+ * @param input value which is assigned to output
+ * @{
+ */
+inline void assign(__m128i& output, const uint64_t input)
+{
+	output = _mm_cvtsi64_si128(input);
 }
 
-// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
-// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
-// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+inline void assign(uint64_t& output, const uint64_t input)
 {
-	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
-	keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
-
-	// Optim - 99% time boundary
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-
-	uint64_t axl0 = h0[0] ^ h0[4];
-	uint64_t axh0 = h0[1] ^ h0[5];
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	uint64_t axl1 = h1[0] ^ h1[4];
-	uint64_t axh1 = h1[1] ^ h1[5];
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-	uint64_t idx0 = h0[0] ^ h0[4];
-	uint64_t idx1 = h1[0] ^ h1[4];
-
-	// Optim - 90% time boundary
-	for (size_t i = 0; i < ITERATIONS; i++)
-	{
-		__m128i cx;
-		cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
-
-		if(SOFT_AES)
-			cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0));
-		else
-			cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0));
+	output = input;
+}
 
-		_mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-		idx0 = _mm_cvtsi128_si64(cx);
-		bx0 = cx;
+inline void assign(uint64_t& output, const __m128i& input)
+{
+	output = _mm_cvtsi128_si64(input);
+}
+/** @} */
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+inline void set_float_rounding_mode() // Selects round-downward floating-point rounding; invoked by CN_INIT for cryptonight_monero_v8. The previous mode is neither saved nor restored here.
+{
+#ifdef _MSC_VER
+	_control87(RC_DOWN, MCW_RC); // MCW_RC is the mask argument, RC_DOWN the new value
+#else
+	std::fesetround(FE_DOWNWARD); // from <cfenv>; the return value is ignored
+#endif
+}
 
-		cx = _mm_load_si128((__m128i *)&l1[idx1 & MASK]);
+#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
 
-		if(SOFT_AES)
-			cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1));
-		else
-			cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1));
+#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		hi ^= ((uint64_t*)&chunk2)[0]; \
+		lo ^= ((uint64_t*)&chunk2)[1]; \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
 
-		_mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
-		idx1 = _mm_cvtsi128_si64(cx);
-		bx1 = cx;
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		uint64_t sqrt_result_tmp; \
+		assign(sqrt_result_tmp, sqrt_result); \
+		/* Use division and square root results from the _previous_ iteration to hide the latency */ \
+		const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
+		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \
+		const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \
+		/* Most and least significant bits in the divisor are set to 1 \
+		 * to make sure we don't divide by a small or even number, \
+		 * so there are no shortcuts for such cases \
+		 * \
+		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \
+		 * We drop the highest bit to fit both quotient and remainder in 32 bits \
+		 */  \
+		/* Compiler will optimize it to a single div instruction */ \
+		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
+		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+		/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
+		assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \
+	}
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
+#define CN_INIT_SINGLE /* early-out for inputs too short for the variants that read a tweak constant from the input */ \
+	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) /* CN_INIT reads 8 bytes at input offset 35, i.e. bytes 35..42 */ \
+	{ \
+		memset(output, 0, 32 * N); /* zero all N 32-byte results */ \
+		return; /* leaves the enclosing hash() function */ \
+	}
 
-		uint64_t hi, lo, cl, ch;
-		cl = ((uint64_t*)&l0[idx0 & MASK])[0];
-		ch = ((uint64_t*)&l0[idx0 & MASK])[1];
+#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \
+	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
+	uint64_t monero_const; \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{ \
+		monero_const =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len * n + 35); \
+		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24); \
+	} \
+	/* Optim - 99% time boundary */ \
+	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \
+	\
+	__m128i ax0; \
+	uint64_t idx0; \
+	__m128i bx0; \
+	uint8_t* l0 = ctx[n]->long_state; \
+	/* BEGIN cryptonight_monero_v8 variables */ \
+	__m128i bx1; \
+	__m128i division_result_xmm; \
+	GetOptimalSqrtType_t<N> sqrt_result; \
+	/* END cryptonight_monero_v8 variables */ \
+	{ \
+		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
+		idx0 = h0[0] ^ h0[4]; \
+		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
+		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
+		if(ALGO == cryptonight_monero_v8) \
+		{ \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
+			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
+			assign(sqrt_result, h0[13]); \
+			set_float_rounding_mode(); \
+		} \
+	} \
+	__m128i *ptr0
+
+#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \
+	__m128i cx; \
+	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+	cx = _mm_load_si128(ptr0); \
+	if (ALGO == cryptonight_bittube2) \
+	{ \
+		cx = aes_round_bittube2(cx, ax0); \
+	} \
+	else \
+	{ \
+		if(SOFT_AES) \
+			cx = soft_aesenc(cx, ax0); \
+		else \
+			cx = _mm_aesenc_si128(cx, ax0); \
+	} \
+	CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1)
+
+#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \
+	else \
+		_mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \
+	idx0 = _mm_cvtsi128_si64(cx); \
+	\
+	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+	if(PREFETCH) \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
+	if(ALGO != cryptonight_monero_v8) \
+		bx0 = cx
+
+#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
+	uint64_t lo, cl, ch; \
+	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
+	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
+	cl = ((uint64_t*)ptr0)[0]; \
+	ch = ((uint64_t*)ptr0)[1]; \
+	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
+	{ \
+		uint64_t hi; \
+		lo = _umul128(idx0, cl, &hi); \
+		CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \
+		ah0 += lo; \
+		al0 += hi; \
+	} \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		bx1 = bx0; \
+		bx0 = cx; \
+	} \
+	((uint64_t*)ptr0)[0] = al0; \
+	if(PREFETCH) \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
+
+#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
+	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{ \
+		if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \
+		else \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \
+	} \
+	else \
+		((uint64_t*)ptr0)[1] = ah0; \
+	al0 ^= cl; \
+	ah0 ^= ch; \
+	ax0 = _mm_set_epi64x(ah0, al0); \
+	idx0 = al0;
+
+#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) /* extra division step for heavy, bittube2 and haven; no effect for every other ALGO */ \
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
+	{ \
+		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+		int64_t u  = ((int64_t*)ptr0)[0]; /* first 8 bytes of the addressed block */ \
+		int32_t d  = ((int32_t*)ptr0)[2]; /* 32-bit word at byte offset 8 */ \
+		int64_t q = u / (d | 0x5); /* OR with 5 keeps the divisor non-zero */ \
+		\
+		((int64_t*)ptr0)[0] = u ^ q; /* write back the first 8 bytes only */ \
+		idx0 = d ^ q; /* next scratchpad index */ \
+	} \
+	else if(ALGO == cryptonight_haven) \
+	{ \
+		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+		int64_t u  = ((int64_t*)ptr0)[0]; \
+		int32_t d  = ((int32_t*)ptr0)[2]; \
+		int64_t q = u / (d | 0x5); \
+		\
+		((int64_t*)ptr0)[0] = u ^ q; \
+		idx0 = (~d) ^ q; /* only difference from the branch above: d is inverted */ \
+	}
 
-		lo = _umul128(idx0, cl, &hi);
+#define CN_FINALIZE(n) /* finish hash number n; expands without a trailing semicolon */ \
+	/* Optim - 90% time boundary */ \
+	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \
+	/* Optim - 99% time boundary */ \
+	keccakf((uint64_t*)ctx[n]->hash_state, 24); \
+	extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) /* low two bits of the first state byte pick one of the four functions; the result goes to the n-th 32-byte output slot */
 
-		axl0 += hi;
-		axh0 += lo;
-		((uint64_t*)&l0[idx0 & MASK])[0] = axl0;
-		((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
-		axh0 ^= ch;
-		axl0 ^= cl;
-		idx0 = axl0;
+//! defer the evaluation of a macro
+#ifndef _MSC_VER
+#	define CN_DEFER(...) __VA_ARGS__
+#else
+#	define CN_EMPTY(...)
+#	define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
+#endif
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+//! execute the macro f with the passed arguments
+#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__)
+
+/** append n to every argument and keep n as the first argument
+ *
+ * @param n number which is appended to the arguments (except the first argument n)
+ *
+ * @code{.cpp}
+ * CN_ENUM_2(1, foo, bar)
+ * // is transformed to
+ * 1, foo1, bar1
+ * @endcode
+ */
+#define CN_ENUM_0(n, ...) n
+#define CN_ENUM_1(n, x1) n, x1 ## n
+#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n
+#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n
+#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n
+#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n
+#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n
+#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n
+#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n
+#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n
+#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
+#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
+#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
+#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n
+#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n
+#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n
+
+/** repeat a macro call multiple times
+ *
+ * @param n number of arguments that follow f
+ * @param f name of the macro which should be executed
+ * @param ... the n parameters; a unique number is appended to each name
+ *
+ * @code{.cpp}
+ * REPEAT_2(2, f, foo, bar)
+ * // is transformed to
+ * f(0, foo0, bar0); f(1, foo1, bar1)
+ * @endcode
+ */
+#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__))
+#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__))
+#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__))
+#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__))
+#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__))
+
+template< size_t N>
+struct Cryptonight_hash;
+
+template< >
+struct Cryptonight_hash<1> // Specialisation that computes one hash per call.
+{
+	static constexpr size_t N = 1; // read by CN_INIT_SINGLE (32 * N output bytes) and by GetOptimalSqrtType_t<N> inside CN_INIT
 
-		cl = ((uint64_t*)&l1[idx1 & MASK])[0];
-		ch = ((uint64_t*)&l1[idx1 & MASK])[1];
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) // input: len bytes per hash; output: 32 bytes per hash; ctx[n] supplies hash_state and long_state for hash n
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>(); // MASK, ITERATIONS and MEM are referenced by name inside the CN_* macros and the loop below
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-		lo = _umul128(idx1, cl, &hi);
+		CN_INIT_SINGLE; // may zero the output and return early when len is too short
+		REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // first argument = number of names after the macro name; each name gets the hash index appended
 
-		axl1 += hi;
-		axh1 += lo;
-		((uint64_t*)&l1[idx1 & MASK])[0] = axl1;
-		((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
-		axh1 ^= ch;
-		axl1 ^= cl;
-		idx1 = axl1;
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); // the five steps run in this fixed order every iteration
+			REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
+		REPEAT_1(0, CN_FINALIZE); // no per-hash names: only the index 0 is passed
 	}
+};
 
-	// Optim - 90% time boundary
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
+template< >
+struct Cryptonight_hash<2>
+{
+	static constexpr size_t N = 2;
 
-	// Optim - 99% time boundary
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	keccakf((uint64_t*)ctx[0]->hash_state, 24);
-	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
-	keccakf((uint64_t*)ctx[1]->hash_state, 24);
-	extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32);
-}
+		CN_INIT_SINGLE;
+		REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
-#define CN_STEP1(a, b, c, l, ptr, idx)				\
-	a = _mm_xor_si128(a, c);				\
-	idx = _mm_cvtsi128_si64(a);				\
-	ptr = (__m128i *)&l[idx & MASK];			\
-	if(PREFETCH)						\
-		_mm_prefetch((const char*)ptr, _MM_HINT_T0);	\
-	c = _mm_load_si128(ptr)
-
-#define CN_STEP2(a, b, c, l, ptr, idx)				\
-	if(SOFT_AES)						\
-		c = soft_aesenc(c, a);				\
-	else							\
-		c = _mm_aesenc_si128(c, a);			\
-	b = _mm_xor_si128(b, c);				\
-	_mm_store_si128(ptr, b)
-
-#define CN_STEP3(a, b, c, l, ptr, idx)				\
-	idx = _mm_cvtsi128_si64(c);				\
-	ptr = (__m128i *)&l[idx & MASK];			\
-	if(PREFETCH)						\
-		_mm_prefetch((const char*)ptr, _MM_HINT_T0);	\
-	b = _mm_load_si128(ptr)
-
-#define CN_STEP4(a, b, c, l, ptr, idx)				\
-	lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi);		\
-	a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi));		\
-	_mm_store_si128(ptr, a)
-
-// This lovelier creation will do 3 cn hashes at a time.
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
-{
-	for (size_t i = 0; i < 3; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
 
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
-	{
-		uint64_t idx0, idx1, idx2, hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+		REPEAT_2(0, CN_FINALIZE);
 	}
+};
+
+template< >
+struct Cryptonight_hash<3>
+{
+	static constexpr size_t N = 3;
 
-	for (size_t i = 0; i < 3; i++)
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
+
+		CN_INIT_SINGLE;
+		REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
+
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
+
+		REPEAT_3(0, CN_FINALIZE);
 	}
-}
+};
 
-// This even lovelier creation will do 4 cn hashes at a time.
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+template< >
+struct Cryptonight_hash<4>
 {
-	for (size_t i = 0; i < 4; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+	static constexpr size_t N = 4;
 
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-	uint8_t* l3 = ctx[3]->long_state;
-	uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
-	__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-	__m128i cx3 = _mm_set_epi64x(0, 0);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		uint64_t idx0, idx1, idx2, idx3, hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2, *ptr3;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
-	}
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	for (size_t i = 0; i < 4; i++)
-	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		CN_INIT_SINGLE;
+		REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
+
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
+
+		REPEAT_4(0, CN_FINALIZE);
 	}
-}
+};
 
-// This most lovely creation will do 5 cn hashes at a time.
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+template< >
+struct Cryptonight_hash<5>
 {
-	for (size_t i = 0; i < 5; i++)
+	static constexpr size_t N = 5;
+
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
+
+		CN_INIT_SINGLE;
+		REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
+
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
+
+		REPEAT_5(0, CN_FINALIZE);
 	}
+};
+
+extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
 
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-	uint8_t* l3 = ctx[3]->long_state;
-	uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
-	uint8_t* l4 = ctx[4]->long_state;
-	uint64_t* h4 = (uint64_t*)ctx[4]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
-	__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-	__m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]);
-	__m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-	__m128i cx3 = _mm_set_epi64x(0, 0);
-	__m128i cx4 = _mm_set_epi64x(0, 0);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
+
+template< size_t N, size_t asm_version>
+struct Cryptonight_hash_asm;
+
+template<size_t asm_version>
+struct Cryptonight_hash_asm<1, asm_version>
+{
+	static constexpr size_t N = 1;
+
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4);
+		constexpr size_t MEM = cn_select_memory<ALGO>();
+
+		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+		cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+
+		if(asm_version == 0)
+			cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
+		else if(asm_version == 1)
+			cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
+
+		cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+		keccakf((uint64_t*)ctx[0]->hash_state, 24);
+		extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
 	}
+};
 
-	for (size_t i = 0; i < 5; i++)
+// The two-hash asm path is only provided for Intel CPUs (asm_version 0).
+template< >
+struct Cryptonight_hash_asm<2, 0>
+{
+	static constexpr size_t N = 2;
+
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		constexpr size_t MEM = cn_select_memory<ALGO>();
+
+		for(size_t i = 0; i < N; ++i)
+		{
+			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+			/* Optim - 99% time boundary */
+			cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+		}
+
+		cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
+
+		for(size_t i = 0; i < N; ++i)
+		{
+			/* Optim - 90% time boundary */
+			cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+			/* Optim - 99% time boundary */
+			keccakf((uint64_t*)ctx[i]->hash_state, 24);
+			extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		}
 	}
-}
+};
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index 8b2207ddb..a7e4696a8 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -28,12 +28,14 @@ extern "C"
 #include "c_jh.h"
 #include "c_skein.h"
 }
+#include "xmrstak/backend/cryptonight.hpp"
 #include "cryptonight.h"
 #include "cryptonight_aesni.h"
-#include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/jconf.hpp"
 #include <stdio.h>
 #include <stdlib.h>
+#include <algorithm>
 
 #ifdef __GNUC__
 #include <mm_malloc.h>
@@ -54,25 +56,27 @@ extern "C"
 #include <string.h>
 #endif // _WIN32
 
-void do_blake_hash(const void* input, size_t len, char* output) {
+void do_blake_hash(const void* input, uint32_t len, char* output) {
 	blake256_hash((uint8_t*)output, (const uint8_t*)input, len);
 }
 
-void do_groestl_hash(const void* input, size_t len, char* output) {
+void do_groestl_hash(const void* input, uint32_t len, char* output) {
 	groestl((const uint8_t*)input, len * 8, (uint8_t*)output);
 }
 
-void do_jh_hash(const void* input, size_t len, char* output) {
+void do_jh_hash(const void* input, uint32_t len, char* output) {
 	jh_hash(32 * 8, (const uint8_t*)input, 8 * len, (uint8_t*)output);
 }
 
-void do_skein_hash(const void* input, size_t len, char* output) {
+void do_skein_hash(const void* input, uint32_t len, char* output) {
 	skein_hash(8 * 32, (const uint8_t*)input, 8 * len, (uint8_t*)output);
 }
 
-void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+void (* const extra_hashes[4])(const void *, uint32_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
 
 #ifdef _WIN32
+#include "xmrstak/misc/uac.hpp"
+
 BOOL bRebootDesirable = FALSE; //If VirtualAlloc fails, suggest a reboot
 
 BOOL AddPrivilege(TCHAR* pszPrivilege)
@@ -114,7 +118,7 @@ BOOL AddLargePageRights()
 
 		DWORD size = 0;
 		GetTokenInformation(hToken, TokenUser, NULL, 0, &size);
-		
+
 		if (size > 0 && bIsElevated)
 		{
 			user = (PTOKEN_USER)LocalAlloc(LPTR, size);
@@ -132,7 +136,7 @@ BOOL AddLargePageRights()
 	ZeroMemory(&attributes, sizeof(attributes));
 
 	BOOL result = FALSE;
-	if (LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0) 
+	if (LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0)
 	{
 		LSA_UNICODE_STRING lockmem;
 		lockmem.Buffer = L"SeLockMemoryPrivilege";
@@ -176,13 +180,16 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
 
 	if(AddPrivilege(TEXT("SeLockMemoryPrivilege")) == 0)
 	{
+		printer::inst()->print_msg(L0, "Elevating because we need to set up fast memory privileges.");
+		RequestElevation();
+
 		if(AddLargePageRights())
 		{
 			msg->warning = "Added SeLockMemoryPrivilege to the current account. You need to reboot for it to work";
 			bRebootDesirable = TRUE;
 		}
 		else
-			msg->warning = "Obtaning SeLockMemoryPrivilege failed.";
+			msg->warning = "Obtaining SeLockMemoryPrivilege failed.";
 
 		return 0;
 	}
@@ -196,15 +203,11 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
 
 cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
 {
-	size_t hashMemSize;
-	if(::jconf::inst()->IsCurrencyMonero())
-	{
-		hashMemSize = MONERO_MEMORY;
-	}
-	else
-	{
-		hashMemSize = AEON_MEMORY;
-	}
+	size_t hashMemSize = std::max(
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	);
+
 	cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096);
 
 	if(use_fast_mem == 0)
@@ -213,6 +216,8 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		ptr->long_state = (uint8_t*)_mm_malloc(hashMemSize, hashMemSize);
 		ptr->ctx_info[0] = 0;
 		ptr->ctx_info[1] = 0;
+		if(ptr->long_state == NULL)
+			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str());
 		return ptr;
 	}
 
@@ -240,22 +245,25 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		return ptr;
 	}
 #else
-
+// mmap(2) reference: http://man7.org/linux/man-pages/man2/mmap.2.html
 #if defined(__APPLE__)
-	ptr->long_state  = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state  = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
 #elif defined(__FreeBSD__)
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0);
+#elif defined(__OpenBSD__)
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANON, -1, 0);
 #else
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
-		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0);
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
 #endif
 
 	if (ptr->long_state == MAP_FAILED)
 	{
 		_mm_free(ptr);
-		msg->warning = "mmap failed";
+		msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'";
 		return NULL;
 	}
 
@@ -276,15 +284,11 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 
 void cryptonight_free_ctx(cryptonight_ctx* ctx)
 {
-	size_t hashMemSize;
-	if(::jconf::inst()->IsCurrencyMonero())
-	{
-		hashMemSize = MONERO_MEMORY;
-	}
-	else
-	{
-		hashMemSize = AEON_MEMORY;
-	}
+	size_t hashMemSize = std::max(
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	);
+
 	if(ctx->ctx_info[0] != 0)
 	{
 #ifdef _WIN32
diff --git a/xmrstak/backend/cpu/crypto/hash.h b/xmrstak/backend/cpu/crypto/hash.h
index c12d355f4..2af330932 100644
--- a/xmrstak/backend/cpu/crypto/hash.h
+++ b/xmrstak/backend/cpu/crypto/hash.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <stdint.h>
+
 typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
+typedef uint32_t DataLength;
 typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn;
diff --git a/xmrstak/backend/cpu/crypto/skein_port.h b/xmrstak/backend/cpu/crypto/skein_port.h
index 9cbefcb1a..99641bcdf 100644
--- a/xmrstak/backend/cpu/crypto/skein_port.h
+++ b/xmrstak/backend/cpu/crypto/skein_port.h
@@ -49,7 +49,7 @@
 								multiple of size / 8)
 
 	ptr_cast(x,size)            casts a pointer to a pointer to a
-								varaiable of length 'size' bits
+								variable of length 'size' bits
 */
 
 #define ui_type(size)               uint##size##_t
diff --git a/xmrstak/backend/cpu/crypto/soft_aes.hpp b/xmrstak/backend/cpu/crypto/soft_aes.hpp
index d3f463750..9b4ae0ab5 100644
--- a/xmrstak/backend/cpu/crypto/soft_aes.hpp
+++ b/xmrstak/backend/cpu/crypto/soft_aes.hpp
@@ -104,9 +104,9 @@ static inline __m128i soft_aesenc(__m128i in, __m128i key)
 
 static inline uint32_t sub_word(uint32_t key)
 {
-	return (saes_sbox[key >> 24 ] << 24)   | 
-		(saes_sbox[(key >> 16) & 0xff] << 16 ) | 
-		(saes_sbox[(key >> 8)  & 0xff] << 8  ) | 
+	return (saes_sbox[key >> 24 ] << 24)   |
+		(saes_sbox[(key >> 16) & 0xff] << 16 ) |
+		(saes_sbox[(key >> 8)  & 0xff] << 8  ) |
 		 saes_sbox[key & 0xff];
 }
 
diff --git a/xmrstak/backend/cpu/hwlocMemory.cpp b/xmrstak/backend/cpu/hwlocMemory.cpp
index 94d2b53f7..089570fc0 100644
--- a/xmrstak/backend/cpu/hwlocMemory.cpp
+++ b/xmrstak/backend/cpu/hwlocMemory.cpp
@@ -30,7 +30,7 @@ void bindMemoryToNUMANode( size_t puId )
 
 	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
 
-	for( size_t i = 0;
+	for( uint32_t i = 0;
 		i < hwloc_get_nbobjs_by_depth(topology, depth);
 		i++ )
 	{
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 6e709bd35..a14be1732 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -108,12 +108,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *mode, *no_prefetch, *aff;
+	const Value *mode, *no_prefetch, *aff, *asm_version;
 	mode = GetObjectMember(oThdConf, "low_power_mode");
 	no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	asm_version = GetObjectMember(oThdConf, "asm");
 
-	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
+	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr || asm_version == nullptr)
 		return false;
 
 	if(!mode->IsBool() && !mode->IsNumber())
@@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.iCpuAff = -1;
 
+	if(!asm_version->IsString())
+		return false;
+	cfg.asm_version_str = asm_version->GetString();
+
 	return true;
 }
 
@@ -211,14 +216,14 @@ bool jconf::parse_config(const char* sFilename)
 
 	if(prv->jsonDoc.HasParseError())
 	{
-		printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s",
-			int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
+		printer::inst()->print_msg(L0, "JSON config parse error in '%s' (offset %llu): %s",
+			sFilename, int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
 		return false;
 	}
 
 	if(!prv->jsonDoc.IsObject())
 	{ //This should never happen as we created the root ourselves
-		printer::inst()->print_msg(L0, "Invalid config file. No root?\n");
+		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
 		return false;
 	}
 
@@ -234,13 +239,13 @@ bool jconf::parse_config(const char* sFilename)
 
 		if(prv->configValues[i] == nullptr)
 		{
-			printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Invalid config file '%s'. Missing value \"%s\".", sFilename, oConfigValues[i].sName);
 			return false;
 		}
 
 		if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType))
 		{
-			printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Invalid config file '%s'. Value \"%s\" has unexpected type.", sFilename, oConfigValues[i].sName);
 			return false;
 		}
 	}
@@ -259,4 +264,4 @@ bool jconf::parse_config(const char* sFilename)
 }
 
 } // namespace cpu
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index e98ed160a..4ec9165d5 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -24,6 +24,7 @@ class jconf
 	struct thd_cfg {
 		int iMultiway;
 		bool bNoPrefetch;
+		std::string asm_version_str;
 		long long iCpuAff;
 	};
 
@@ -40,4 +41,4 @@ class jconf
 };
 
 } // namespace cpu
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 143b66f75..55879110a 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -25,8 +25,9 @@
 
 #include "xmrstak/misc/console.hpp"
 #include "xmrstak/backend/iBackend.hpp"
-#include "xmrstak/backend//globalStates.hpp"
+#include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include "xmrstak/params.hpp"
 #include "jconf.hpp"
 
@@ -73,7 +74,16 @@ namespace cpu
 bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id)
 {
 #if defined(_WIN32)
-	return SetThreadAffinityMask(h, 1ULL << cpu_id) != 0;
+	// The mask is built as 1ULL << cpu_id, so only CPU ids 0..63 can be pinned.
+	if(cpu_id < 64)
+	{
+		return SetThreadAffinityMask(h, 1ULL << cpu_id) != 0;
+	}
+	else
+	{
+		printer::inst()->print_msg(L0, "WARNING: Windows supports only affinity up to 63.");
+		return false;
+	}
 #elif defined(__APPLE__)
 	thread_port_t mach_thread;
 	thread_affinity_policy_data_t policy = { static_cast<integer_t>(cpu_id) };
@@ -84,6 +94,9 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 	CPU_ZERO(&mn);
 	CPU_SET(cpu_id, &mn);
 	return pthread_setaffinity_np(h, sizeof(cpuset_t), &mn) == 0;
+#elif defined(__OpenBSD__)
+        printer::inst()->print_msg(L0,"WARNING: thread pinning is not supported under OPENBSD.");
+        return true;
 #else
 	cpu_set_t mn;
 	CPU_ZERO(&mn);
@@ -92,7 +105,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 #endif
 }
 
-minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version)
 {
 	this->backendType = iBackend::CPU;
 	oWork = pWork;
@@ -101,6 +114,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	iJobNo = 0;
 	bNoPrefetch = no_prefetch;
 	this->affinity = affinity;
+	asm_version_str = asm_version;
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -212,6 +226,7 @@ bool minethd::self_test()
 	{
 		if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
 		{
+			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
 			for (int j = 0; j < i; j++)
 				cryptonight_free_ctx(ctx[j]);
 			return false;
@@ -220,60 +235,175 @@ bool minethd::self_test()
 
 	bool bResult = true;
 
-	bool mineMonero = ::jconf::inst()->IsCurrencyMonero();
-	if(mineMonero)
+	unsigned char out[32 * MAX_N];
+	cn_hash_fun hashf;
+	cn_hash_fun hashf_multi;
+
+	xmrstak_algo algo = xmrstak_algo::invalid_algo;
+
+	for(int algo_idx = 0; algo_idx < 2; ++algo_idx)
 	{
-		unsigned char out[32 * MAX_N];
-		cn_hash_fun hashf;
-		cn_hash_fun_multi hashf_multi;
-
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero);
-		hashf("This is a test", 14, out, ctx[0]);
-		bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero);
-		hashf("This is a test", 14, out, ctx[0]);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
-		hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
-		hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
-		bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-				"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
-		hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, mineMonero);
-		hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
-		bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-				"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
-		hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
-		hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
-
-		hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
-		hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
-
-		hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
-		hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
+		if(algo_idx == 0)
+			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo();
+		else
+			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+
+		if(algo == cryptonight)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf("This is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+			hashf("This is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+			hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+
+			hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+
+			hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
+		}
+		else if(algo == cryptonight_lite)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+		}
+		else if(algo == cryptonight_monero)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+		}
+		else if(algo == cryptonight_monero_v8)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult && memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; // accumulate; never reset an earlier failure
+			// repeat the same test vector with the bNoPrefetch argument set to true
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult && memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
+		}
+		else if(algo == cryptonight_aeon)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+		}
+		else if(algo == cryptonight_ipbc)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
+		}
+		else if(algo == cryptonight_stellite)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+		}
+		else if(algo == cryptonight_masari)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+		}
+		else if(algo == cryptonight_heavy)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+		}
+		else if(algo == cryptonight_haven)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+		}
+		else if(algo == cryptonight_bittube2)
+		{
+			unsigned char out[32 * MAX_N];
+			cn_hash_fun hashf;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
+
+			hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
+
+			hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx);
+			bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
+
+			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
+			bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
+		}
+
+		if(!bResult)
+			printer::inst()->print_msg(L0,
+				"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
 	}
 
 	for (int i = 0; i < MAX_N; i++)
 		cryptonight_free_ctx(ctx[i]);
 
-	if(!bResult)
-		printer::inst()->print_msg(L0,
-			"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
-
 	return bResult;
 }
 
@@ -307,239 +437,219 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		if(cfg.iCpuAff >= 0)
 		{
 #if defined(__APPLE__)
-			printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+			printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
 #endif
 
 			printer::inst()->print_msg(L1, "Starting %dx thread, affinity: %d.", cfg.iMultiway, (int)cfg.iCpuAff);
 		}
 		else
 			printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
-		
-		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
+
+		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str);
 		pvThreads.push_back(thd);
 	}
 
 	return pvThreads;
 }
 
-void minethd::consume_work()
+/** get the supported asm name
+ * @param num_hashes number of hashes per thread; 0 always yields "off"
+ * @return asm type based on the number of hashes per thread and the
+ *             internally evaluated cpu type
+ */
+static std::string getAsmName(const uint32_t num_hashes)
 {
-	memcpy(&oWork, &globalStates::inst().inst().oGlobalWork, sizeof(miner_work));
-	iJobNo++;
-	globalStates::inst().inst().iConsumeCnt++;
+	std::string asm_type = "off";
+	if(num_hashes != 0)
+	{
+		auto cpu_model = getModel();
+
+		if(cpu_model.avx && cpu_model.aes)
+		{
+			if(cpu_model.type_name.find("Intel") != std::string::npos)
+				asm_type = "intel_avx";
+			else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1)
+				asm_type = "amd_avx";
+		}
+	}
+	return asm_type;
 }
 
-minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero)
+template<size_t N>
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
 {
+	static_assert(N >= 1, "number of threads must be >= 1" );
+
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
-	// Digit order SOFT_AES, NO_PREFETCH, MINER_ALGO
+
+	uint8_t algv;
+	switch(algo)
+	{
+	case cryptonight:
+		algv = 2;
+		break;
+	case cryptonight_lite:
+		algv = 1;
+		break;
+	case cryptonight_monero:
+		algv = 0;
+		break;
+	case cryptonight_heavy:
+		algv = 3;
+		break;
+	case cryptonight_aeon:
+		algv = 4;
+		break;
+	case cryptonight_ipbc:
+		algv = 5;
+		break;
+	case cryptonight_stellite:
+		algv = 6;
+		break;
+	case cryptonight_masari:
+		algv = 7;
+		break;
+	case cryptonight_haven:
+		algv = 8;
+		break;
+	case cryptonight_bittube2:
+		algv = 9;
+		break;
+	case cryptonight_monero_v8:
+		algv = 10;
+		break;
+	default:
+		algv = 2;
+		break;
+	}
 
 	static const cn_hash_fun func_table[] = {
-		/* there will be 8 function entries if `CONF_NO_MONERO` and `CONF_NO_AEON`
-		 * is not defined. If one is defined there will be 4 entries.
-		 */
-#ifndef CONF_NO_MONERO
-		cryptonight_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
-		cryptonight_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
-		cryptonight_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
-		cryptonight_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>
-#endif
-#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
-		// comma will be added only if Monero and Aeon is build
-		,
-#endif
-#ifndef CONF_NO_AEON
-		cryptonight_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
-		cryptonight_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
-		cryptonight_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
-		cryptonight_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>
-#endif
+		Cryptonight_hash<N>::template hash<cryptonight_monero, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_lite, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_masari, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_haven, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, true>
 	};
 
-	std::bitset<3> digit;
-	digit.set(0, !bNoPrefetch);
-	digit.set(1, !bHaveAes);
-
-	// define aeon settings
-#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
-	// ignore 3rd bit if only one currency is active
-	digit.set(2, 0);
-#else
-	digit.set(2, !mineMonero);
-#endif
-
-	return func_table[digit.to_ulong()];
-}
-
-void minethd::work_main()
-{
-	if(affinity >= 0) //-1 means no affinity
-		bindMemoryToNUMANode(affinity);
-
-	order_fix.set_value();
-	std::unique_lock<std::mutex> lck(thd_aff_set);
-	lck.release();
-	std::this_thread::yield();
-
-	cn_hash_fun hash_fun;
-	cryptonight_ctx* ctx;
-	uint64_t iCount = 0;
-	uint64_t* piHashVal;
-	uint32_t* piNonce;
-	job_result result;
+	std::bitset<2> digit;
+	digit.set(0, !bHaveAes);
+	digit.set(1, !bNoPrefetch);
 
-	hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero());
-	ctx = minethd_alloc_ctx();
+	auto selected_function = func_table[ algv << 2 | digit.to_ulong() ];
 
-	piHashVal = (uint64_t*)(result.bResult + 24);
-	piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
-	globalStates::inst().inst().iConsumeCnt++;
-	result.iThreadId = iThreadNo;
 
-	while (bQuit == 0)
+	// check for asm optimized version for cryptonight_v8
+	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
 	{
-		if (oWork.bStall)
-		{
-			/* We are stalled here because the executor didn't find a job for us yet,
-			 * either because of network latency, or a socket problem. Since we are
-			 * raison d'etre of this software it us sensible to just wait until we have something
-			 */
-
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
-				std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-			consume_work();
-			continue;
-		}
-
-		size_t nonce_ctr = 0;
-		constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2
-
-		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
-		memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID));
+		std::string selected_asm = asm_version_str;
+		if(selected_asm == "auto")
+				selected_asm = cpu::getAsmName(N);
 
-		if(oWork.bNiceHash)
-			result.iNonce = *piNonce;
-
-		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+		if(selected_asm != "off")
 		{
-			if ((iCount++ & 0xF) == 0) //Store stats every 16 hashes
+			if(selected_asm == "intel_avx")
 			{
-				uint64_t iStamp = get_timestamp_ms();
-				iHashCount.store(iCount, std::memory_order_relaxed);
-				iTimestamp.store(iStamp, std::memory_order_relaxed);
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				if(N == 1)
+					selected_function = Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
+				else if(N == 2)
+					selected_function = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
 			}
-
-			if((nonce_ctr++ & (nonce_chunk-1)) == 0)
+			// amd_avx supports only 1 hash per thread
+			if(N == 1 && selected_asm == "amd_avx")
 			{
-				globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk);
+				// AMD Ryzen (1xxx and 2xxx series)
+				selected_function = Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
 			}
-
-			*piNonce = ++result.iNonce;
-
-			hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
-
-			if (*piHashVal < oWork.iTarget)
-				executor::inst()->push_event(ex_event(result, oWork.iPoolId));
-
-			std::this_thread::yield();
+			if(asm_version_str == "auto" && (selected_asm == "intel_avx" || selected_asm == "amd_avx")) // auto-detection picked a known asm variant
+				printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str());
+			else if(selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type
+				printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
 		}
-
-		consume_work();
 	}
-
-	cryptonight_free_ctx(ctx);
+	
+	return selected_function;
 }
 
-minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero)
+minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
 {
-	// We have two independent flag bits in the functions
-	// therefore we will build a binary digit and select the
-	// function as a two digit binary
-	// Digit order SOFT_AES, NO_PREFETCH
-
-	static const cn_hash_fun_multi func_table[] = {
-		/* there will be 8*(MAX_N-1) function entries if `CONF_NO_MONERO` and `CONF_NO_AEON`
-		 * is not defined. If one is defined there will be 4*(MAX_N-1) entries.
-		 */
-#ifndef CONF_NO_MONERO
-		cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
-		cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
-		cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
-		cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
-		cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
-		cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
-		cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
-		cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
-		cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
-		cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
-		cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
-		cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
-		cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
-		cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
-		cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
-		cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>
-#endif
-#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
-		// comma will be added only if Monero and Aeon is build
-		,
-#endif
-#ifndef CONF_NO_AEON
-		cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
-		cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
-		cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
-		cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
-		cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
-		cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
-		cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
-		cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
-		cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
-		cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
-		cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
-		cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
-		cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
-		cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
-		cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
-		cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>
-#endif
-	};
-
-	std::bitset<2> digit;
-	digit.set(0, !bNoPrefetch);
-	digit.set(1, !bHaveAes);
-
-	// define aeon settings
-#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
-	// ignore miner algo if only one currency is active
-	size_t miner_algo_base = 0;
-#else
-	size_t miner_algo_base = mineMonero ? 0 : 4*(MAX_N-1);
-#endif
+	return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo);
+}
 
-	N = (N<2) ? 2 : (N>MAX_N) ? MAX_N : N;
-	return func_table[miner_algo_base + 4*(N-2) + digit.to_ulong()];
+void minethd::work_main()
+{
+	multiway_work_main<1u>();
 }
 
 void minethd::double_work_main()
 {
-	multiway_work_main<2>(func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+	multiway_work_main<2u>();
 }
 
 void minethd::triple_work_main()
 {
-	multiway_work_main<3>(func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+	multiway_work_main<3u>();
 }
 
 void minethd::quad_work_main()
 {
-	multiway_work_main<4>(func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+	multiway_work_main<4u>();
 }
 
 void minethd::penta_work_main()
 {
-	multiway_work_main<5>(func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+	multiway_work_main<5u>();
 }
 
 template<size_t N>
@@ -553,8 +663,8 @@ void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce)
 	}
 }
 
-template<size_t N>
-void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
+template<uint32_t N>
+void minethd::multiway_work_main()
 {
 	if(affinity >= 0) //-1 means no affinity
 		bindMemoryToNUMANode(affinity);
@@ -576,6 +686,13 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 	for (size_t i = 0; i < N; i++)
 	{
 		ctx[i] = minethd_alloc_ctx();
+		if(ctx[i] == nullptr)
+		{
+			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
+			for (int j = 0; j < i; j++)
+				cryptonight_free_ctx(ctx[j]);
+			win_exit(1);
+		}
 		piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24);
 		piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr;
 	}
@@ -585,6 +702,12 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 
 	globalStates::inst().iConsumeCnt++;
 
+	// start with root algorithm and switch later if fork version is reached
+	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
+	uint8_t version = 0;
+	size_t lastPoolId = 0;
+
 	while (bQuit == 0)
 	{
 		if (oWork.bStall)
@@ -596,7 +719,7 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
-			consume_work();
+			globalStates::inst().consume_work(oWork, iJobNo);
 			prep_multiway_work<N>(bWorkBlob, piNonce);
 			continue;
 		}
@@ -609,6 +732,24 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 		if(oWork.bNiceHash)
 			iNonce = *piNonce[0];
 
+		uint8_t new_version = oWork.getVersion();
+		if(new_version != version || oWork.iPoolId != lastPoolId)
+		{
+			coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId);
+			if(new_version >= coinDesc.GetMiningForkVersion())
+			{
+				miner_algo = coinDesc.GetMiningAlgo();
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
+			}
+			else
+			{
+				miner_algo = coinDesc.GetMiningAlgoRoot();
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
+			}
+			lastPoolId = oWork.iPoolId;
+			version = new_version;
+		}
+
 		while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 		{
 			if ((iCount++ & 0x7) == 0)  //Store stats every 8*N hashes
@@ -623,10 +764,13 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 			{
 				globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, nonce_chunk);
 				nonce_ctr = nonce_chunk;
+				// check if the job is still valid, there is a small possibility that the job is switched
+				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
+					break;
 			}
 
 			for (size_t i = 0; i < N; i++)
-				*piNonce[i] = ++iNonce;
+				*piNonce[i] = iNonce++;
 
 			hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx);
 
@@ -634,14 +778,17 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 			{
 				if (*piHashVal[i] < oWork.iTarget)
 				{
-					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + 1 + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId));
+					executor::inst()->push_event(
+						ex_event(job_result(oWork.sJobID, iNonce - N + i, bHashOut + 32 * i, iThreadNo, miner_algo),
+						oWork.iPoolId)
+					);
 				}
 			}
 
 			std::this_thread::yield();
 		}
 
-		consume_work();
+		globalStates::inst().consume_work(oWork, iJobNo);
 		prep_multiway_work<N>(bWorkBlob, piNonce);
 	}
 
@@ -650,4 +797,4 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 }
 
 } // namespace cpu
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 0433d0d36..eb77749f6 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "xmrstak/jconf.hpp"
 #include "crypto/cryptonight.h"
 #include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/backend/iBackend.hpp"
@@ -21,21 +22,22 @@ class minethd : public iBackend
 	static std::vector<iBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
-	static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero);
+	static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
 	static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id);
 
 	static cryptonight_ctx* minethd_alloc_ctx();
 
 private:
-	typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**);
-	static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero);
-
-	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
 
 	template<size_t N>
-	void multiway_work_main(cn_hash_fun_multi hash_fun_multi);
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off");
+
+	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
+
+	template<uint32_t N>
+	void multiway_work_main();
 
 	template<size_t N>
 	void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce);
@@ -46,11 +48,8 @@ class minethd : public iBackend
 	void quad_work_main();
 	void penta_work_main();
 
-	void consume_work();
-
 	uint64_t iJobNo;
 
-	static miner_work oGlobalWork;
 	miner_work oWork;
 
 	std::promise<void> order_fix;
@@ -61,7 +60,8 @@ class minethd : public iBackend
 
 	bool bQuit;
 	bool bNoPrefetch;
+	std::string asm_version_str = "off";
 };
 
 } // namespace cpu
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 0ef5ae70d..6b1afa928 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -1,12 +1,213 @@
 #pragma once
+#include <stddef.h>
+#include <inttypes.h>
+#include <type_traits>
+
+enum xmrstak_algo
+{
+	invalid_algo = 0,
+	cryptonight = 1,
+	cryptonight_lite = 2,
+	cryptonight_monero = 3,
+	cryptonight_heavy = 4,
+	cryptonight_aeon = 5,
+	cryptonight_ipbc = 6, // equal to cryptonight_aeon with a small tweak in the miner code
+	cryptonight_stellite = 7, //equal to cryptonight_monero but with one tiny change
+	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
+	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
+	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+	cryptonight_monero_v8 = 11
+};
 
 // define aeon settings
-#define AEON_MEMORY 1048576llu
-#define AEON_MASK 0xFFFF0
-#define AEON_ITER 0x40000
+constexpr size_t CRYPTONIGHT_LITE_MEMORY = 1 * 1024 * 1024;
+constexpr uint32_t CRYPTONIGHT_LITE_MASK = 0xFFFF0;
+constexpr uint32_t CRYPTONIGHT_LITE_ITER = 0x40000;
+
+constexpr size_t CRYPTONIGHT_MEMORY = 2 * 1024 * 1024;
+constexpr uint32_t CRYPTONIGHT_MASK = 0x1FFFF0;
+constexpr uint32_t CRYPTONIGHT_ITER = 0x80000;
+
+constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024;
+constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0;
+constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
+
+constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
+
+template<xmrstak_algo ALGO>
+inline constexpr size_t cn_select_memory() { return 0; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight>() { return CRYPTONIGHT_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_lite>() { return CRYPTONIGHT_LITE_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_monero>() { return CRYPTONIGHT_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_monero_v8>() { return CRYPTONIGHT_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_aeon>() { return CRYPTONIGHT_LITE_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_ipbc>() { return CRYPTONIGHT_LITE_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_stellite>() { return CRYPTONIGHT_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_masari>() { return CRYPTONIGHT_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_haven>() { return CRYPTONIGHT_HEAVY_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MEMORY; }
+
+inline size_t cn_select_memory(xmrstak_algo algo)
+{
+	switch(algo)
+	{
+	case cryptonight_stellite:
+	case cryptonight_monero:
+	case cryptonight_monero_v8:
+	case cryptonight_masari:
+	case cryptonight:
+		return CRYPTONIGHT_MEMORY;
+	case cryptonight_ipbc:
+	case cryptonight_aeon:
+	case cryptonight_lite:
+		return CRYPTONIGHT_LITE_MEMORY;
+	case cryptonight_bittube2:
+	case cryptonight_haven:
+	case cryptonight_heavy:
+		return CRYPTONIGHT_HEAVY_MEMORY;
+	default:
+		return 0;
+	}
+}
+
+template<xmrstak_algo ALGO>
+inline constexpr uint32_t cn_select_mask() { return 0; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight>() { return CRYPTONIGHT_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_lite>() { return CRYPTONIGHT_LITE_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_monero>() { return CRYPTONIGHT_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_monero_v8>() { return CRYPTONIGHT_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_aeon>() { return CRYPTONIGHT_LITE_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_ipbc>() { return CRYPTONIGHT_LITE_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_stellite>() { return CRYPTONIGHT_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_masari>() { return CRYPTONIGHT_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_haven>() { return CRYPTONIGHT_HEAVY_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MASK; }
+
+inline size_t cn_select_mask(xmrstak_algo algo)
+{
+	switch(algo)
+	{
+	case cryptonight_stellite:
+	case cryptonight_monero:
+	case cryptonight_monero_v8:
+	case cryptonight_masari:
+	case cryptonight:
+		return CRYPTONIGHT_MASK;
+	case cryptonight_ipbc:
+	case cryptonight_aeon:
+	case cryptonight_lite:
+		return CRYPTONIGHT_LITE_MASK;
+	case cryptonight_bittube2:
+	case cryptonight_haven:
+	case cryptonight_heavy:
+		return CRYPTONIGHT_HEAVY_MASK;
+	default:
+		return 0;
+	}
+}
+
+template<xmrstak_algo ALGO>
+inline constexpr uint32_t cn_select_iter() { return 0; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight>() { return CRYPTONIGHT_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_lite>() { return CRYPTONIGHT_LITE_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_monero>() { return CRYPTONIGHT_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_monero_v8>() { return CRYPTONIGHT_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_aeon>() { return CRYPTONIGHT_LITE_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_ipbc>() { return CRYPTONIGHT_LITE_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_stellite>() { return CRYPTONIGHT_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_masari>() { return CRYPTONIGHT_MASARI_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_haven>() { return CRYPTONIGHT_HEAVY_ITER; }
 
-// define xmr settings
-#define MONERO_MEMORY 2097152llu
-#define MONERO_MASK 0x1FFFF0
-#define MONERO_ITER 0x80000
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_ITER; }
 
+inline size_t cn_select_iter(xmrstak_algo algo)
+{
+	switch(algo)
+	{
+	case cryptonight_stellite:
+	case cryptonight_monero:
+	case cryptonight_monero_v8:
+	case cryptonight:
+		return CRYPTONIGHT_ITER;
+	case cryptonight_ipbc:
+	case cryptonight_aeon:
+	case cryptonight_lite:
+		return CRYPTONIGHT_LITE_ITER;
+	case cryptonight_bittube2:
+	case cryptonight_haven:
+	case cryptonight_heavy:
+		return CRYPTONIGHT_HEAVY_ITER;
+	case cryptonight_masari:
+		return CRYPTONIGHT_MASARI_ITER;
+	default:
+		return 0;
+	}
+}
diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp
index 78823c53c..4eeed3c4b 100644
--- a/xmrstak/backend/globalStates.cpp
+++ b/xmrstak/backend/globalStates.cpp
@@ -33,24 +33,37 @@
 namespace xmrstak
 {
 
+void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId)
+{
+	jobLock.ReadLock();
+
+	threadWork = oGlobalWork;
+	currentJobId = iGlobalJobNo.load(std::memory_order_relaxed);
+
+	jobLock.UnLock();
+}
 
 void globalStates::switch_work(miner_work& pWork, pool_data& dat)
 {
-	// iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work
-	// faster than threads can consume them. This should never happen in real life.
-	// Pool cant physically send jobs faster than every 250ms or so due to net latency.
+	jobLock.WriteLock();
 
-	while (iConsumeCnt.load(std::memory_order_seq_cst) < iThreadCount)
-		std::this_thread::sleep_for(std::chrono::milliseconds(100));
+	/* This notifies all threads that the job has changed.
+	 * To avoid duplicated shares this must be done before the nonce is exchanged.
+	 */
+	iGlobalJobNo++;
 
 	size_t xid = dat.pool_id;
 	dat.pool_id = pool_id;
 	pool_id = xid;
 
-	dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_seq_cst);
+	/* Maybe a worker thread is updating the nonce while we read it.
+	 * To avoid duplicated share calculations the job ID is checked in the worker thread
+	 * after the nonce is read.
+	 */
+	dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_relaxed);
 	oGlobalWork = pWork;
-	iConsumeCnt.store(0, std::memory_order_seq_cst);
-	iGlobalJobNo++;
+
+	jobLock.UnLock();
 }
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp
index 442be699c..c8d691712 100644
--- a/xmrstak/backend/globalStates.hpp
+++ b/xmrstak/backend/globalStates.hpp
@@ -1,26 +1,16 @@
 #pragma once
 
-#include "miner_work.hpp"
+#include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/misc/environment.hpp"
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/pool_data.hpp"
+#include "xmrstak/cpputil/read_write_lock.h"
 
 #include <atomic>
 
-constexpr static size_t invalid_pool_id = (-1);
-
 namespace xmrstak
 {
 
-struct pool_data
-{
-	uint32_t iSavedNonce;
-	size_t   pool_id;
-
-	pool_data() : iSavedNonce(0), pool_id(invalid_pool_id)
-	{
-	}
-};
-
 struct globalStates
 {
 	static inline globalStates& inst()
@@ -42,6 +32,8 @@ struct globalStates
 			nonce = iGlobalNonce.fetch_add(reserve_count);
 	}
 
+	void consume_work( miner_work& threadWork, uint64_t& currentJobId);
+
 	miner_work oGlobalWork;
 	std::atomic<uint64_t> iGlobalJobNo;
 	std::atomic<uint64_t> iConsumeCnt;
@@ -50,9 +42,11 @@ struct globalStates
 	size_t pool_id = invalid_pool_id;
 
 private:
-	globalStates() : iThreadCount(0)
+	globalStates() : iThreadCount(0), iGlobalJobNo(0), iConsumeCnt(0)
 	{
 	}
+
+	::cpputil::RWLock jobLock;
 };
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp
index 3d2115a19..18411b79c 100644
--- a/xmrstak/backend/iBackend.hpp
+++ b/xmrstak/backend/iBackend.hpp
@@ -20,7 +20,7 @@ namespace xmrstak
 	{
 
 		enum BackendType : uint32_t { UNKNOWN = 0u, CPU = 1u, AMD = 2u, NVIDIA = 3u };
-		
+
 		static const char* getName(const BackendType type)
 		{
 			const char* backendNames[] = {
@@ -47,4 +47,4 @@ namespace xmrstak
 		}
 	};
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp
index 4bfe429f9..b6456f031 100644
--- a/xmrstak/backend/miner_work.hpp
+++ b/xmrstak/backend/miner_work.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "xmrstak/backend/pool_data.hpp"
+
 #include <thread>
 #include <atomic>
 #include <mutex>
@@ -20,7 +22,7 @@ namespace xmrstak
 		bool        bStall;
 		size_t      iPoolId;
 
-		miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(0) { }
+		miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id) { }
 
 		miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
 			uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize),
@@ -74,5 +76,11 @@ namespace xmrstak
 
 			return *this;
 		}
+
+		uint8_t getVersion() const
+		{
+			return bWorkBlob[0];
+		}
+
 	};
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index be7d1ce43..2755e03d2 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -21,7 +21,7 @@ namespace nvidia
 {
 
 class autoAdjust
-{    
+{
 public:
 
 	autoAdjust()
@@ -42,7 +42,7 @@ class autoAdjust
 		// evaluate config parameter for if auto adjustment is needed
 		for(int i = 0; i < deviceCount; i++)
 		{
-	   
+
 			nvid_ctx ctx;
 
 			ctx.device_id = i;
@@ -50,7 +50,7 @@ class autoAdjust
 			ctx.device_blocks = -1;
 			ctx.device_threads = -1;
 
-			// set all evice option those marked as auto (-1) to a valid value
+			// set all device options that are marked as auto (-1) to a valid value
 #ifndef _WIN32
 			ctx.device_bfactor = 0;
 			ctx.device_bsleep = 0;
@@ -96,6 +96,7 @@ class autoAdjust
 					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
 					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
 					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
+					"    \"mem_mode\" : 1,\n" +
 					"  },\n";
 			}
 		}
@@ -109,4 +110,4 @@ class autoAdjust
 };
 
 } // namespace nvidia
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 54791729e..8a5982b57 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * GPU configuration. You should play around with threads and blocks as the fastest settings will vary.
  * index         - GPU index number usually starts from 0.
@@ -15,6 +16,9 @@ R"===(
  *                 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu
  *                 2 = cudaDeviceScheduleYield
  *                 3 = cudaDeviceScheduleBlockingSync (default)
+ * mem_mode      - select the memory access pattern (this option has only a meaning for cryptonight_v8 and monero)
+ *                 0 = 64bit memory loads
+ *                 1 = 256bit memory loads   
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
  * you can try to tweak it from there to get the best performance.
@@ -22,15 +26,17 @@ R"===(
  * A filled out configuration should look like this:
  * "gpu_threads_conf" :
  * [
- *     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0, 
- *       "affine_to_cpu" : false, "sync_mode" : 3,
+ *     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
+ *       "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1
  *     },
  * ],
+ * If you do not wish to mine with your nVidia GPU(s) then use:
+ * "gpu_threads_conf" :
+ * null,
  */
 
 "gpu_threads_conf" :
 [
 GPUCONFIG
 ],
-
 )==="
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index 46c572673..6c443343b 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode;
+	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *memMode;
 	gid = GetObjectMember(oThdConf, "index");
 	blocks = GetObjectMember(oThdConf, "blocks");
 	threads = GetObjectMember(oThdConf, "threads");
@@ -131,9 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	bsleep = GetObjectMember(oThdConf, "bsleep");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	syncMode = GetObjectMember(oThdConf, "sync_mode");
+	memMode = GetObjectMember(oThdConf, "mem_mode");
 
 	if(gid == nullptr || blocks == nullptr || threads == nullptr ||
-		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr)
+		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr ||
+		memMode == nullptr)
 	{
 		return false;
 	}
@@ -158,16 +160,25 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 
 	if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3)
 	{
-		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)");
+		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or not a number. ( range: 0 <= sync_mode < 4.)");
 		return false;
 	}
+
+	if(!memMode->IsNumber() || memMode->GetInt() < 0 || memMode->GetInt() > 1)
+	{
+		printer::inst()->print_msg(L0, "Error NVIDIA: mem_mode out of range or not a number. (range: 0 or 1)");
+		return false;
+	}
+
+
 	cfg.id = gid->GetInt();
 	cfg.blocks = blocks->GetInt();
 	cfg.threads = threads->GetInt();
 	cfg.bfactor = bfactor->GetInt();
 	cfg.bsleep = bsleep->GetInt();
 	cfg.syncMode = syncMode->GetInt();
-
+	cfg.memMode = memMode->GetInt();
+	
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt();
 	else
@@ -235,15 +246,15 @@ bool jconf::parse_config(const char* sFilename)
 
 	if(prv->jsonDoc.HasParseError())
 	{
-		printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s",
-			int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
+		printer::inst()->print_msg(L0, "JSON config parse error in '%s' (offset %llu): %s",
+			sFilename, int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
 		return false;
 	}
 
 
 	if(!prv->jsonDoc.IsObject())
 	{ //This should never happen as we created the root ourselves
-		printer::inst()->print_msg(L0, "Invalid config file. No root?\n");
+		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
 		return false;
 	}
 
@@ -259,13 +270,13 @@ bool jconf::parse_config(const char* sFilename)
 
 		if(prv->configValues[i] == nullptr)
 		{
-			printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Invalid config file '%s'. Missing value \"%s\".", sFilename, oConfigValues[i].sName);
 			return false;
 		}
 
 		if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType))
 		{
-			printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Invalid config file '%s'. Value \"%s\" has unexpected type.", sFilename, oConfigValues[i].sName);
 			return false;
 		}
 	}
@@ -274,4 +285,4 @@ bool jconf::parse_config(const char* sFilename)
 }
 
 } // namespace nvidia
-} // namespace xmrstak
\ No newline at end of file
+} // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index 7f60f1d90..40b72f880 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -29,6 +29,7 @@ class jconf
 		bool bNoPrefetch;
 		int32_t cpu_aff;
 		int syncMode;
+		int memMode;
 
 		long long iCpuAff;
 	};
@@ -49,4 +50,4 @@ class jconf
 };
 
 } // namespace nvidia
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 556459639..6460628de 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -78,16 +78,25 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	ctx.device_bfactor = (int)cfg.bfactor;
 	ctx.device_bsleep = (int)cfg.bsleep;
 	ctx.syncMode = cfg.syncMode;
+	ctx.memMode = cfg.memMode;
 	this->affinity = cfg.cpu_aff;
 
-	std::unique_lock<std::mutex> lck(thd_aff_set);
-	std::future<void> order_guard = order_fix.get_future();
+	std::future<void> numa_guard = numa_promise.get_future();
+	thread_work_guard = thread_work_promise.get_future();
 
 	oWorkThd = std::thread(&minethd::work_main, this);
 
-	order_guard.wait();
+	/* Wait until the gpu memory is initialized and numa cpu memory is pinned.
+	 * The startup time is reduced if the memory is initialized in sequential order
+	 * without concurrent threads (CUDA driver is less occupied).
+	 */
+	numa_guard.wait();
+}
 
-	if(affinity >= 0) //-1 means no affinity
+void minethd::start_mining()
+{
+	thread_work_promise.set_value();
+	if(this->affinity >= 0) //-1 means no affinity
 		if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
 }
@@ -136,6 +145,8 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 {
 	std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
 
+	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+
 	if(!configEditor::file_exist(params::inst().configFileNVIDIA))
 	{
 		autoAdjust adjust;
@@ -154,6 +165,10 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 		std::cout<<"WARNING: NVIDIA no device found"<<std::endl;
 		return pvThreads;
 	}
+	else
+	{
+		std::cout<<"NVIDIA: found "<< deviceCount <<" potential device's"<<std::endl;
+	}
 
 	size_t i, n = jconf::inst()->GetGPUThreadCount();
 	pvThreads->reserve(n);
@@ -166,41 +181,25 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 		if(cfg.cpu_aff >= 0)
 		{
 #if defined(__APPLE__)
-			printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+			printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
 #endif
 
 			printer::inst()->print_msg(L1, "Starting NVIDIA GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff);
 		}
 		else
 			printer::inst()->print_msg(L1, "Starting NVIDIA GPU thread %d, no affinity.", i);
-		
+
 		minethd* thd = new minethd(pWork, i + threadOffset, cfg);
 		pvThreads->push_back(thd);
 
 	}
 
-	return pvThreads;
-}
-
-void minethd::switch_work(miner_work& pWork)
-{
-	// iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work
-	// faster than threads can consume them. This should never happen in real life.
-	// Pool cant physically send jobs faster than every 250ms or so due to net latency.
-
-	while (globalStates::inst().iConsumeCnt.load(std::memory_order_seq_cst) < globalStates::inst().iThreadCount)
-		std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-	globalStates::inst().oGlobalWork = pWork;
-	globalStates::inst().iConsumeCnt.store(0, std::memory_order_seq_cst);
-	globalStates::inst().iGlobalJobNo++;
-}
+	for (i = 0; i < n; i++)
+	{
+		static_cast<minethd*>((*pvThreads)[i])->start_mining();
+	}
 
-void minethd::consume_work()
-{
-	memcpy(&oWork, &globalStates::inst().oGlobalWork, sizeof(miner_work));
-	iJobNo++;
-	globalStates::inst().iConsumeCnt++;
+	return pvThreads;
 }
 
 void minethd::work_main()
@@ -208,26 +207,31 @@ void minethd::work_main()
 	if(affinity >= 0) //-1 means no affinity
 		bindMemoryToNUMANode(affinity);
 
-	order_fix.set_value();
-	std::unique_lock<std::mutex> lck(thd_aff_set);
-	lck.release();
+	if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
+	{
+		printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exiting.\n", (int)iThreadNo);
+		std::exit(0);
+	}
+
+	// numa memory bind and gpu memory is initialized
+	numa_promise.set_value();
+
 	std::this_thread::yield();
+	// wait until all NVIDIA devices are initialized
+	thread_work_guard.wait();
 
 	uint64_t iCount = 0;
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
-	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, ::jconf::inst()->IsCurrencyMonero());
-	uint32_t iNonce;
 
-	globalStates::inst().iConsumeCnt++;
+	// start with root algorithm and switch later if fork version is reached
+	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 
-	if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
-	{
-		printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo);
-		std::exit(0);
-	}
+	uint32_t iNonce;
 
-	bool mineMonero = strcmp_i(::jconf::inst()->GetCurrency(), "monero");
+	uint8_t version = 0;
+	size_t lastPoolId = 0;
 
 	while (bQuit == 0)
 	{
@@ -241,9 +245,26 @@ void minethd::work_main()
 			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
-			consume_work();
+			globalStates::inst().consume_work(oWork, iJobNo);
 			continue;
 		}
+		uint8_t new_version = oWork.getVersion();
+		if(new_version != version || oWork.iPoolId != lastPoolId)
+		{
+			coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId);
+			if(new_version >= coinDesc.GetMiningForkVersion())
+			{
+				miner_algo = coinDesc.GetMiningAlgo();
+				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+			}
+			else
+			{
+				miner_algo = coinDesc.GetMiningAlgoRoot();
+				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+			}
+			lastPoolId = oWork.iPoolId;
+			version = new_version;
+		}
 
 		cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize);
 
@@ -261,16 +282,19 @@ void minethd::work_main()
 			if((round_ctr++ & 0xF) == 0)
 			{
 				globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16);
+				// check if the job is still valid, there is a small possibility that the job is switched
+				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
+					break;
 			}
-			
+
 			uint32_t foundNonce[10];
 			uint32_t foundCount;
 
-			cryptonight_extra_cpu_prepare(&ctx, iNonce);
+			cryptonight_extra_cpu_prepare(&ctx, iNonce, miner_algo);
 
-			cryptonight_core_cpu_hash(&ctx, mineMonero);
+			cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce);
 
-			cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce);
+			cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce, miner_algo);
 
 			for(size_t i = 0; i < foundCount; i++)
 			{
@@ -283,11 +307,11 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
-					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo), oWork.iPoolId));
+					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
-					executor::inst()->push_event(ex_event("NVIDIA Invalid Result", oWork.iPoolId));
+					executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId));
 			}
 
 			iCount += h_per_round;
@@ -300,10 +324,9 @@ void minethd::work_main()
 			std::this_thread::yield();
 		}
 
-		consume_work();
+		globalStates::inst().consume_work(oWork, iJobNo);
 	}
 }
 
 } // namespace xmrstak
-
 } //namespace nvidia
diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp
index d13c8689c..389356842 100644
--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -24,28 +24,29 @@ class minethd : public iBackend
 {
 public:
 
-	static void switch_work(miner_work& pWork);
 	static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
+	void start_mining();
 
 	void work_main();
-	void consume_work();
 
 	static std::atomic<uint64_t> iGlobalJobNo;
 	static std::atomic<uint64_t> iConsumeCnt;
 	static uint64_t iThreadCount;
 	uint64_t iJobNo;
 
-	static miner_work oGlobalWork;
 	miner_work oWork;
 
-	std::promise<void> order_fix;
-	std::mutex thd_aff_set;
+	std::promise<void> numa_promise;
+	std::promise<void> thread_work_promise;
+
+	// block thread until all NVIDIA GPUs are initialized
+	std::future<void> thread_work_guard;
 
 	std::thread oWorkThd;
 	int64_t affinity;
@@ -56,4 +57,4 @@ class minethd : public iBackend
 };
 
 } // namespace nvidia
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index afbdbaf88..8fda8d401 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -3,6 +3,9 @@
 #include <stdint.h>
 #include <string>
 
+#include "xmrstak/jconf.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
+
 typedef struct {
 	int device_id;
 	const char *device_name;
@@ -11,8 +14,9 @@ typedef struct {
 	int device_blocks;
 	int device_threads;
 	int device_bfactor;
-	int device_bsleep;	
+	int device_bsleep;
 	int syncMode;
+	bool memMode;
 
 	uint32_t *d_input;
 	uint32_t inputlen;
@@ -20,6 +24,7 @@ typedef struct {
 	uint32_t *d_result_nonce;
 	uint32_t *d_long_state;
 	uint32_t *d_ctx_state;
+	uint32_t *d_ctx_state2;
 	uint32_t *d_ctx_a;
 	uint32_t *d_ctx_b;
 	uint32_t *d_ctx_key1;
@@ -41,8 +46,8 @@ int cuda_get_devicecount( int* deviceCount);
 int cuda_get_deviceinfo(nvid_ctx *ctx);
 int cryptonight_extra_cpu_init(nvid_ctx *ctx);
 void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
-void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce);
-void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce);
+void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo);
+void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo);
 }
 
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, bool mineMonero);
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
index 340174c89..611fe1c8c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
@@ -95,7 +95,7 @@ __device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_
 	uint32_t left = S->buflen >> 3;
 	uint32_t fill = 64 - left;
 
-	if (left && (((datalen >> 3) & 0x3F) >= fill)) 
+	if (left && (((datalen >> 3) & 0x3F) >= fill))
 	{
 		memcpy((void *) (S->buf + left), (void *) data, fill);
 		S->t[0] += 512;
@@ -106,7 +106,7 @@ __device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_
 		left = 0;
 	}
 
-	while (datalen >= 512) 
+	while (datalen >= 512)
 	{
 		S->t[0] += 512;
 		if (S->t[0] == 0) S->t[1]++;
@@ -115,12 +115,12 @@ __device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_
 		datalen -= 512;
 	}
 
-	if (datalen > 0) 
+	if (datalen > 0)
 	{
 		memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
 		S->buflen = (left << 3) + datalen;
 	}
-	else 
+	else
 	{
 		S->buflen = 0;
 	}
@@ -128,7 +128,7 @@ __device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_
 
 __device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
 {
-	const uint8_t padding[] = 
+	const uint8_t padding[] =
 	{
 		0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
@@ -141,20 +141,20 @@ __device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
 	U32TO8(msglen + 0, hi);
 	U32TO8(msglen + 4, lo);
 
-	if (S->buflen == 440) 
+	if (S->buflen == 440)
 	{
 		S->t[0] -= 8;
 		cn_blake_update(S, &pa, 8);
-	} 
-	else 
+	}
+	else
 	{
-		if (S->buflen < 440) 
+		if (S->buflen < 440)
 		{
 			if (S->buflen == 0) S->nullt = 1;
 			S->t[0] -= 440 - S->buflen;
 			cn_blake_update(S, padding, 440 - S->buflen);
 		}
-		else 
+		else
 		{
 			S->t[0] -= 512 - S->buflen;
 			cn_blake_update(S, padding, 512 - S->buflen);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 15a6f36a7..7742e740e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -5,6 +5,11 @@
 #include <string.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <bitset>
+
+#include "xmrstak/jconf.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+
 
 #ifdef _WIN32
 #include <windows.h>
@@ -65,37 +70,59 @@ typedef uint64_t IndexType;
 typedef int IndexType;
 #endif
 
-__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi )
+__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi ) // returns the low 64 bits of the product; the high 64 bits are written to product_hi
 {
-	*product_hi = __umul64hi( multiplier, multiplicand );
+	product_hi = __umul64hi( multiplier, multiplicand ); // high 64 bits of the 128-bit product
 	return (multiplier * multiplicand );
 }
 
 template< typename T >
 __device__ __forceinline__ T loadGlobal64( T * const addr )
 {
+#if (__CUDA_ARCH__ < 700) // explicit PTX load (.cg cache operator) only when compiling for architectures below 7.0
 	T x;
 	asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) );
 	return x;
+#else // __CUDA_ARCH__ >= 700: plain dereference instead of the inline PTX load
+	return *addr;
+#endif
 }
 
 template< typename T >
 __device__ __forceinline__ T loadGlobal32( T * const addr )
 {
+#if (__CUDA_ARCH__ < 700) // explicit PTX load (.cg cache operator) only when compiling for architectures below 7.0
 	T x;
 	asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) );
 	return x;
+#else // __CUDA_ARCH__ >= 700: plain dereference instead of the inline PTX load
+	return *addr;
+#endif
 }
 
 
 template< typename T >
 __device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
 {
+#if (__CUDA_ARCH__ < 700) // explicit PTX store (.cg cache operator) only when compiling for architectures below 7.0
 	asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) );
+#else // __CUDA_ARCH__ >= 700: plain assignment instead of the inline PTX store
+	*addr = val; // writes val through addr
+#endif
+}
+
+template< typename T > // 64-bit counterpart of storeGlobal32; the "l" asm operands below are 64-bit registers
+__device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
+{
+#if (__CUDA_ARCH__ < 700) // explicit PTX store (.cg cache operator) only when compiling for architectures below 7.0
+	asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) );
+#else // __CUDA_ARCH__ >= 700: plain assignment instead of the inline PTX store
+	*addr = val; // writes val through addr
+#endif
 }
 
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT>
-__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 )
+template<size_t ITERATIONS, uint32_t MEMORY>
+__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
 	__shared__ uint32_t sharedMemory[1024];
 
@@ -105,7 +132,7 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
 	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
 	const int sub = ( threadIdx.x & 7 ) << 2;
 
-	const int batchsize = ITERATIONS >> bfactor;
+	const int batchsize = MEMORY >> bfactor;
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 
@@ -119,18 +146,18 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
 	if( partidx == 0 )
 	{
 		// first round
-		MEMCPY8( text, ctx_state + thread * 50 + sub + 16, 2 );
+		MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 );
 	}
 	else
 	{
 		// load previous text data
-		MEMCPY8( text, &long_state[( (uint64_t) thread << THREAD_SHIFT ) + sub + start - 32], 2 );
+		MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 );
 	}
 	__syncthreads( );
 	for ( int i = start; i < end; i += 32 )
 	{
 		cn_aes_pseudo_round_mut( sharedMemory, text, key );
-		MEMCPY8(&long_state[((uint64_t) thread << THREAD_SHIFT) + (sub + i)], text, 2);
+		MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2);
 	}
 }
 
@@ -145,33 +172,315 @@ __forceinline__ __device__ void unusedVar( const T& )
  * - this method can be used with all compute architectures
  * - for <sm_30 shared memory is needed
  *
+ * group_n - must be a power of 2!
+ *
  * @param ptr pointer to shared memory, size must be `threadIdx.x * sizeof(uint32_t)`
  *            value can be NULL for compute architecture >=sm_30
- * @param sub thread number within the group, range [0;4)
+ * @param sub thread number within the group, range [0;group_n)
  * @param value value to share with other threads within the group
- * @param src thread number within the group from where the data is read, range [0;4)
+ * @param src thread number within the group from where the data is read, range [0;group_n)
  */
+template<size_t group_n> // number of threads that form one shuffle group
 __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src)
 {
 #if( __CUDA_ARCH__ < 300 )
     ptr[sub] = val;
-    return ptr[src&3];
+    return ptr[src & (group_n-1)]; // wraps src into [0;group_n); the mask only works because group_n is a power of 2
 #else
     unusedVar( ptr );
     unusedVar( sub );
 #   if(__CUDACC_VER_MAJOR__ >= 9)
-    return __shfl_sync(0xFFFFFFFF, val, src, 4 );
+    return __shfl_sync(__activemask(), val, src, group_n ); // group_n is the shuffle width; the mask is taken from __activemask() instead of a fixed full-warp mask
 #	else
-	return __shfl( val, src, 4 );
+	return __shfl( val, src, group_n ); // toolkits older than CUDA 9: shuffle without a mask argument, width group_n
 #	endif
 #endif
 }
 
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
+template<size_t group_n> // forwarded unchanged to shuffle<group_n>
+__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2) // builds a 64-bit value out of two 32-bit shuffles of val
+{
+	uint64_t tmp; // filled below as two 32-bit words
+	((uint32_t*)&tmp)[0] = shuffle<group_n>(ptr, sub, val, src); // first 32-bit word: val as held by group thread src
+	((uint32_t*)&tmp)[1] = shuffle<group_n>(ptr, sub, val, src2); // second 32-bit word: val as held by group thread src2
+	return tmp;
+}
+
+struct u64 : public uint2
+{
+	// 64-bit helper stored as the two 32-bit members uint2::x / uint2::y; the pointer casts below reinterpret both as one uint64_t
+	__forceinline__ __device__ u64(){} // leaves x and y uninitialized
+	// build the value from two explicit 32-bit words (x0 -> x, x1 -> y)
+	__forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1)
+	{
+		uint2::x = x0;
+		uint2::y = x1;
+	}
+	// implicit conversion: read x/y back as a single uint64_t
+	__forceinline__ __device__ operator uint64_t() const
+	{
+		return *((uint64_t*)this);
+	}
+	// implicit conversion from uint64_t: overwrite x/y with the 64-bit value
+	__forceinline__ __device__ u64( const uint64_t x0)
+	{
+		((uint64_t*)&this->x)[0] = x0;
+	}
+	// word-wise XOR; note that the result is returned by value (a copy), not by reference
+	__forceinline__ __device__ u64 operator^=(const u64& other)
+	{
+		uint2::x ^= other.x;
+		uint2::y ^= other.y;
+
+		return *this;
+	}
+	// 64-bit addition of both operands (carry between the two words is handled by the uint64_t add)
+	__forceinline__ __device__ u64 operator+(const u64& other) const
+	{
+		u64 tmp;
+		((uint64_t*)&tmp.x)[0] = ((uint64_t*)&(this->x))[0] + ((uint64_t*)&(other.x))[0];
+
+		return tmp;
+	}
+	// in-place 64-bit addition; returns a copy of the updated value
+	__forceinline__ __device__ u64 operator+=(const uint64_t& other)
+	{
+		return ((uint64_t*)&this->x)[0] += other;
+	}
+	// debug helper: prints the 64-bit value when i < 2
+	__forceinline__ __device__ void print(int i) const
+	{
+		if(i<2)
+			printf("gpu: %llu\n", static_cast<unsigned long long>(((uint64_t*)&this->x)[0])); // %llu + cast: `unsigned long` is only 32 bit wide on some hosts, so %lu is not portable for uint64_t
+	}
+};
+
+/** cryptonight with two threads per hash
+ * Threads 2*n and 2*n+1 work on hash n; each of the two holds one 64-bit half of the a/b state (selected by `sub`).
+ * @tparam MEM_MODE if `0` then 64bit memory transfers per thread will be used to store/load data within shared memory
+ *                   else if `1` 256bit operations will be used
+ */
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO, uint32_t MEM_MODE>
+#ifdef XMR_STAK_THREADS
+__launch_bounds__( XMR_STAK_THREADS * 2 )
+#endif
+__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+		uint32_t startNonce, uint32_t * __restrict__ d_input )
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	cn_aes_gpu_init( sharedMemory );
+
+#if( __CUDA_ARCH__ < 300 )
+	extern __shared__ uint64_t externShared[];
+	// 8 x 64bit values
+	volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8);
+    volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8)  + (threadIdx.x & 0xFFFFFFFE);
+#else
+	extern __shared__ uint64_t chunkMem[];
+    volatile uint32_t* sPtr = NULL;
+	// 8 x 64bit values
+	volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8);
+
+#endif
+
+	__syncthreads( );
+
+	const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = tid >> 1; // hash index shared by a pair of threads
+	const uint32_t sub = tid & 1; // position (0 or 1) of this thread within its pair
+
+	if ( thread >= threads )
+		return;
+	// l0: start of this hash's region in d_long_state (MEMORY uint32_t words per hash)
+	uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY];
+
+	uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub]; // this thread's 64-bit half of the 4-word `a` state
+	uint64_t bx0;
+	uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0); // low 32 bits of ax0 as held by pair thread 0
+
+	uint64_t* ptr0;
+	// v8 only: d_ctx_b holds 12 words per hash -> bx0 (words 0-3), bx1 (4-7), division_result (8-9), sqrt_result (10)
+	uint64_t bx1;
+	uint32_t sqrt_result;
+	uint64_t division_result;
+	if(ALGO == cryptonight_monero_v8)
+	{
+		bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub];
+		bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub];
+
+		division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
+		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
+	}
+	else
+		 bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
+
+	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+	const int start = partidx * batchsize;
+	const int end = start + batchsize;
+
+	for(int i = start; i < end; ++i)
+	{
+		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; // 0x1FFFC0 clears the low 6 bits -> 64-byte aligned offset
+		// copy the 64-byte line (8 x uint64_t) into myChunks, half of it per thread
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				myChunks[x + sub] = ptr0[ x + sub ];
+			}
+		}
+		else
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
+
+		uint32_t idx1 = (idx0 & 0x30) >> 3; // 0, 2, 4 or 6: uint64_t index of the addressed 16-byte block within the line
+
+		const u64 cx = myChunks[ idx1 + sub ];
+		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
+
+		u64 cx_aes = ax0 ^ u64(
+			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
+			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+		);
+
+		if(ALGO == cryptonight_monero_v8)
+		{
+			// rotate the three other 16-byte blocks of the line; explicit parentheses because `+` binds tighter than `^`
+			const uint64_t chunk1 = myChunks[ (idx1 ^ 2) + sub ];
+			const uint64_t chunk2 = myChunks[ (idx1 ^ 4) + sub ];
+			const uint64_t chunk3 = myChunks[ (idx1 ^ 6) + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+			__syncwarp();
+#else
+			__syncthreads( );
+#endif
+			myChunks[ (idx1 ^ 2) + sub ] = chunk3 + bx1;
+			myChunks[ (idx1 ^ 4) + sub ] = chunk1 + bx0;
+			myChunks[ (idx1 ^ 6) + sub ] = chunk2 + ax0;
+		}
+
+		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				ptr0[ x + sub ] = myChunks[x + sub];
+			}
+		}
+		else
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
+
+		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
+		idx1 = (idx0 & 0x30) >> 3;
+		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
+
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				myChunks[x + sub] = ptr0[ x + sub ];
+			}
+		}
+		else
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
+
+		if(ALGO != cryptonight_monero_v8)
+			bx0 = cx_aes;
+
+		uint64_t cx_mul; // both 32-bit words of cx_aes as held by pair thread 0
+		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
+		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
+
+		if(ALGO == cryptonight_monero_v8 && sub == 1)
+		{
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			((uint32_t*)&division_result)[1] ^= sqrt_result;
+
+			((uint64_t*)myChunks)[ idx1 ] ^= division_result;
+
+			const uint32_t dd = (static_cast<uint32_t>(cx_mul) + (sqrt_result << 1)) | 0x80000001UL;
+			division_result = fast_div_v2(cx_aes, dd);
+
+			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			sqrt_result = fast_sqrt_v2(cx_mul + division_result);
+		}
+#if (__CUDACC_VER_MAJOR__ >= 9)
+				__syncwarp();
+#else
+				__syncthreads( );
+#endif
+		uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ];
+
+		{
+			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
+			// sub 0 -> hi, sub 1 -> lo
+			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
+			if(ALGO == cryptonight_monero_v8)
+			{
+				const uint64_t chunk1 = myChunks[ (idx1 ^ 2) + sub ] ^ res;
+				uint64_t chunk2 = myChunks[ (idx1 ^ 4) + sub ];
+				res ^= ((uint64_t*)&chunk2)[0];
+				const uint64_t chunk3 = myChunks[ (idx1 ^ 6) + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+				__syncwarp();
+#else
+				__syncthreads( );
+#endif
+				myChunks[ (idx1 ^ 2) + sub ] = chunk3 + bx1;
+				myChunks[ (idx1 ^ 4) + sub ] = chunk1 + bx0;
+				myChunks[ (idx1 ^ 6) + sub ] = chunk2 + ax0;
+			}
+			ax0 += res;
+		}
+		if(ALGO == cryptonight_monero_v8)
+		{
+			bx1 = bx0;
+			bx0 = cx_aes;
+		}
+		myChunks[ idx1 + sub ] = ax0;
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				ptr0[ x + sub ] = myChunks[x + sub];
+			}
+		}
+		else
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
+		ax0 ^= c;
+		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
+	}
+	// the state is written back only when bfactor > 0
+	if ( bfactor > 0 )
+	{
+		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
+		if(ALGO == cryptonight_monero_v8)
+		{
+			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
+			((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
+
+			if(sub == 1)
+			{
+				// must be valid only for `sub == 1`
+				((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0] = division_result;
+				(d_ctx_b + thread * 12 + 4 * 2 + 2)[0] = sqrt_result;
+			}
+		}
+		else
+			((uint64_t*)(d_ctx_b + thread * 4))[sub] = bx0; // 4-word stride: must match the non-v8 load of bx0 at the top of the kernel
+	}
+}
+
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
 #endif
-__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
+__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+		uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
 	__shared__ uint32_t sharedMemory[1024];
 
@@ -180,6 +489,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	__syncthreads( );
 
 	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
+	const uint32_t nonce = startNonce + thread;
 	const int sub = threadIdx.x & 3;
 	const int sub2 = sub & 2;
 
@@ -193,41 +503,109 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		return;
 
 	int i, k;
-        uint32_t j;
+	uint32_t j;
 	const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
-	uint32_t * long_state = &d_long_state[(IndexType) thread << THREAD_SHIFT];
-	uint32_t * ctx_a = d_ctx_a + thread * 4;
-	uint32_t * ctx_b = d_ctx_b + thread * 4;
-	uint32_t a, d[2];
+	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
+	uint32_t a, d[2], idx0;
 	uint32_t t1[2], t2[2], res;
 
-	a = ctx_a[sub];
-	d[1] = ctx_b[sub];
+	uint32_t tweak1_2[2];
+	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+	{
+		uint32_t * state = d_ctx_state + thread * 50;
+		tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
+		tweak1_2[0] ^= state[48];
+		tweak1_2[1] = nonce;
+		tweak1_2[1] ^= state[49];
+	}
+
+	a = (d_ctx_a + thread * 4)[sub];
+	idx0 = shuffle<4>(sPtr,sub, a, 0);
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	{
+		if(partidx != 0)
+		{
+			// state is stored after all ctx_b states
+			idx0 = *(d_ctx_b + threads * 4 + thread);
+		}
+	}
+	d[1] = (d_ctx_b + thread * 4)[sub];
+
 	#pragma unroll 2
 	for ( i = start; i < end; ++i )
 	{
 		#pragma unroll 2
 		for ( int x = 0; x < 2; ++x )
 		{
-			j = ( ( shuffle(sPtr,sub, a, 0) & MASK ) >> 2 ) + sub;
-
-			const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
-			const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1);
-			const uint32_t x_2 = shuffle(sPtr,sub, x_0, sub + 2);
-			const uint32_t x_3 = shuffle(sPtr,sub, x_0, sub + 3);
-			d[x] = a ^
-				t_fn0( x_0 & 0xff ) ^
-				t_fn1( (x_1 >> 8) & 0xff ) ^
-				t_fn2( (x_2 >> 16) & 0xff ) ^
-				t_fn3( ( x_3 >> 24 ) );
-
+			j = ( ( idx0 & MASK ) >> 2 ) + sub;
+
+			if(ALGO == cryptonight_bittube2)
+			{
+				uint32_t k[4];
+				k[0] = ~loadGlobal32<uint32_t>( long_state + j );
+				k[1] = shuffle<4>(sPtr,sub, k[0], sub + 1);
+				k[2] = shuffle<4>(sPtr,sub, k[0], sub + 2);
+				k[3] = shuffle<4>(sPtr,sub, k[0], sub + 3);
+
+				#pragma unroll 4
+				for(int i = 0; i < 4; ++i)
+				{
+					// only calculate the key if all data are up to date
+					if(i == sub)
+					{
+						d[x] = a ^
+							t_fn0( k[0] & 0xff ) ^
+							t_fn1( (k[1] >> 8) & 0xff ) ^
+							t_fn2( (k[2] >> 16) & 0xff ) ^
+							t_fn3( (k[3] >> 24 ) );
+					}
+					// the last shuffle is not needed
+					if(i != 3)
+					{
+						/* avoid negative number for modulo
+						 * load valid key (k) depending on the round
+						 */
+						k[(4 - sub + i)%4] = shuffle<4>(sPtr,sub, k[0] ^ d[x], i);
+					}
+				}
+			}
+			else
+			{
+				const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
+				const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
+				const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
+				const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
+				d[x] = a ^
+					t_fn0( x_0 & 0xff ) ^
+					t_fn1( (x_1 >> 8) & 0xff ) ^
+					t_fn2( (x_2 >> 16) & 0xff ) ^
+					t_fn3( ( x_3 >> 24 ) );
+			}
 
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
-			t1[0] = shuffle(sPtr,sub, d[x], 0);
-			//long_state[j] = d[0] ^ d[1];
-			storeGlobal32( long_state + j, d[0] ^ d[1] );
+			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
+
+			const uint32_t z = d[0] ^ d[1];
+			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+			{
+				const uint32_t table = 0x75310U;
+				if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+				{
+					const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
+					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
+					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+				}
+				else if(ALGO == cryptonight_stellite)
+				{
+					const uint32_t index = ((z >> 27) & 12) | ((z >> 23) & 2);
+					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
+					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+				}
+			}
+			else
+				storeGlobal32( long_state + j, z );
 
 			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
 			j = ( ( *t1 & MASK ) >> 2 ) + sub;
@@ -235,31 +613,72 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			uint32_t yy[2];
 			*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
 			uint32_t zz[2];
-			zz[0] = shuffle(sPtr,sub, yy[0], 0);
-			zz[1] = shuffle(sPtr,sub, yy[1], 0);
+			zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
+			zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
 
-			t1[1] = shuffle(sPtr,sub, d[x], 1);
+			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
 			#pragma unroll
 			for ( k = 0; k < 2; k++ )
-				t2[k] = shuffle(sPtr,sub, a, k + sub2);
+				t2[k] = shuffle<4>(sPtr,sub, a, k + sub2);
 
             *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) );
 
 			res = *( (uint64_t *) t2 )  >> ( sub & 1 ? 32 : 0 );
 
-			storeGlobal32( long_state + j, res );
+			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+			{
+				const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
+				uint32_t long_state_update = sub2 ? tweaked_res : res;
+
+				if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
+				{
+					uint32_t value = shuffle<4>(sPtr,sub, long_state_update, sub & 1) ^ long_state_update;
+					long_state_update = sub >= 2 ? value : long_state_update;
+				}
+
+				storeGlobal32( long_state + j, long_state_update );
+			}
+			else
+				storeGlobal32( long_state + j, res );
+
 			a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
+			idx0 = shuffle<4>(sPtr,sub, a, 0);
+			if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
+			{
+				int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
+				int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+				int64_t q = n / (d | 0x5);
+
+				if(sub&1)
+					storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+
+				idx0 = d ^ q;
+			}
+			else if(ALGO == cryptonight_haven)
+			{
+				int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
+				int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+				int64_t q = n / (d | 0x5);
+
+				if(sub&1)
+					storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+
+				idx0 = (~d) ^ q;
+			}
 		}
 	}
 
 	if ( bfactor > 0 )
 	{
-		ctx_a[sub] = a;
-		ctx_b[sub] = d[1];
+		(d_ctx_a + thread * 4)[sub] = a;
+		(d_ctx_b + thread * 4)[sub] = d[1];
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+			if(sub&1)
+				*(d_ctx_b + threads * 4 + thread) = idx0;
 	}
 }
 
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT>
+template<size_t ITERATIONS, uint32_t MEMORY, xmrstak_algo ALGO>
 __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
 {
 	__shared__ uint32_t sharedMemory[1024];
@@ -268,10 +687,11 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	__syncthreads( );
 
 	int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
-	int sub = ( threadIdx.x & 7 ) << 2;
+	int subv = ( threadIdx.x & 7 );
+	int sub = subv << 2;
 
-	const int batchsize = ITERATIONS >> bfactor;
-	const int start = partidx * batchsize;
+	const int batchsize = MEMORY >> bfactor;
+	const int start = (partidx % (1 << bfactor)) * batchsize;
 	const int end = start + batchsize;
 
 	if ( thread >= threads )
@@ -282,23 +702,39 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 );
 
 	__syncthreads( );
+
+#if( __CUDA_ARCH__ < 300 )
+	extern __shared__ uint32_t shuffleMem[];
+	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8));
+#else
+	volatile uint32_t* sPtr = NULL;
+#endif
+
 	for ( int i = start; i < end; i += 32 )
 	{
-#pragma unroll
+		#pragma unroll
 		for ( int j = 0; j < 4; ++j )
-			text[j] ^= long_state[((IndexType) thread << THREAD_SHIFT) + (sub + i + j)];
+			text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
 
 		cn_aes_pseudo_round_mut( sharedMemory, text, key );
+
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		{
+			#pragma unroll
+			for ( int j = 0; j < 4; ++j )
+				text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
+		}
 	}
 
 	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
 }
 
-template<size_t ITERATIONS, uint32_t MASK, uint32_t THREAD_SHIFT>
-void cryptonight_core_gpu_hash(nvid_ctx* ctx)
+template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, uint32_t MEM_MODE>
+void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 {
 	dim3 grid( ctx->device_blocks );
 	dim3 block( ctx->device_threads );
+	dim3 block2( ctx->device_threads << 2 );
 	dim3 block4( ctx->device_threads << 2 );
 	dim3 block8( ctx->device_threads << 3 );
 
@@ -317,9 +753,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
 
 	for ( int i = 0; i < partcountOneThree; i++ )
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,MEMORY><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
-			ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key1 ));
+			ctx->d_long_state,
+			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+			ctx->d_ctx_key1 ));
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
 	}
@@ -327,47 +765,122 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
 
 	for ( int i = 0; i < partcount; i++ )
 	{
-        CUDA_CHECK_MSG_KERNEL(
-			ctx->device_id,
-			"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-			cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
-				grid,
-				block4,
-				block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-			>>>(
-				ctx->device_blocks*ctx->device_threads,
-				ctx->device_bfactor,
-				i,
-				ctx->d_long_state,
-				ctx->d_ctx_a,
-				ctx->d_ctx_b
-			)
-	    );
+		if(ALGO == cryptonight_monero_v8)
+		{
+			// two threads per block
+			CUDA_CHECK_MSG_KERNEL(
+				ctx->device_id,
+				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO, MEM_MODE><<<
+					grid,
+					block2,
+					sizeof(uint64_t) * block2.x * 8 +
+						// shuffle memory for fermi gpus
+						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+				>>>(
+					ctx->device_blocks*ctx->device_threads,
+					ctx->device_bfactor,
+					i,
+					ctx->d_long_state,
+					ctx->d_ctx_a,
+					ctx->d_ctx_b,
+					ctx->d_ctx_state,
+					nonce,
+					ctx->d_input
+				)
+			);
+		}
+		else
+		{
+			CUDA_CHECK_MSG_KERNEL(
+				ctx->device_id,
+				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+				cryptonight_core_gpu_phase2_quad<ITERATIONS,MEMORY,MASK,ALGO><<<
+					grid,
+					block4,
+					block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+				>>>(
+					ctx->device_blocks*ctx->device_threads,
+					ctx->device_bfactor,
+					i,
+					ctx->d_long_state,
+					ctx->d_ctx_a,
+					ctx->d_ctx_b,
+					ctx->d_ctx_state,
+					nonce,
+					ctx->d_input
+				)
+			);
+		}
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
 	}
 
-	for ( int i = 0; i < partcountOneThree; i++ )
+	int roundsPhase3 = partcountOneThree;
+
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+		// cryptonight_heavy used two full rounds over the scratchpad memory
+		roundsPhase3 *= 2;
+	}
+
+	for ( int i = 0; i < roundsPhase3; i++ )
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<<
+			grid,
+			block8,
+			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+		>>>( ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
 			ctx->d_ctx_state, ctx->d_ctx_key2 ));
 	}
 }
 
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, bool mineMonero)
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
-#ifndef CONF_NO_MONERO
-	if(mineMonero)
-	{
-		cryptonight_core_gpu_hash<MONERO_ITER, MONERO_MASK, 19u>(ctx);
-	}
-#endif
-#ifndef CONF_NO_AEON
-	if(!mineMonero)
-	{
-		cryptonight_core_gpu_hash<AEON_ITER, AEON_MASK, 18u>(ctx);
-	}
-#endif
+	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
+	// Host-side dispatcher: selects the cryptonight_core_gpu_hash instantiation for miner_algo / ctx->memMode and calls it with startNonce.
+	if(miner_algo == invalid_algo) return;
+	// Two entries per algorithm (MEM_MODE 0, then 1). The index computed below relies on this order matching the xmrstak_algo values starting at 1 - TODO confirm against the enum.
+	static const cuda_hash_fn func_table[] = {
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 1>
+	};
+	// Lowest index bit picks the MEM_MODE variant: 1 when ctx->memMode == 1, otherwise 0.
+	std::bitset<1> digit;
+	digit.set(0, ctx->memMode == 1);
+
+	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
+	selected_function(ctx, startNonce);
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index d865e1358..433e175dd 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -28,6 +28,7 @@ uint64_t keccakf_rndc[24] ={
 typedef unsigned char BitSequence;
 typedef unsigned long long DataLength;
 
+#include "xmrstak/backend/cryptonight.hpp"
 #include "cryptonight.hpp"
 #include "cuda_extra.hpp"
 #include "cuda_keccak.hpp"
@@ -36,6 +37,7 @@ typedef unsigned long long DataLength;
 #include "cuda_jh.hpp"
 #include "cuda_skein.hpp"
 #include "cuda_device.hpp"
+#include "cuda_aes.hpp"
 
 __constant__ uint8_t d_sub_byte[16][16] ={
 	{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
@@ -90,10 +92,33 @@ __device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__
 	}
 }
 
-__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
+__device__ __forceinline__ void mix_and_propagate( uint32_t* state ) // in-place mix of state[0..31], handled as 8 groups of 4 words
+{
+	uint32_t tmp0[4]; // copy of the original group 0; needed for the wrap-around after group 0 has been overwritten
+	for(size_t x = 0; x < 4; ++x)
+		tmp0[x] = (state)[x];
+
+	// groups 0..6: XOR each 4-word group with its successor, which has not been modified yet
+	for(size_t t = 0; t < 7; ++t)
+		for(size_t x = 0; x < 4; ++x)
+			(state + 4 * t)[x] = (state + 4 * t)[x] ^ (state + 4 * (t + 1))[x];
+
+	// group 7 wraps around: XOR it with the saved original group 0
+	for(size_t x = 0; x < 4; ++x)
+		(state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x];
+}
+
+template<xmrstak_algo ALGO>
+__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
 {
 	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
+	__shared__ uint32_t sharedMemory[1024];
 
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	{
+		cn_aes_gpu_init( sharedMemory );
+		__syncthreads( );
+	}
 	if ( thread >= threads )
 		return;
 
@@ -113,20 +138,57 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state );
 	cryptonight_aes_set_key( ctx_key1, ctx_state );
 	cryptonight_aes_set_key( ctx_key2, ctx_state + 8 );
+
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
-
-	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+	if(ALGO == cryptonight_monero_v8)
+	{
+		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
+		// bx1
+		XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b );
+		memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 );
+		// division_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 );
+		// sqrt_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 );
+	}
+	else
+		memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+
 	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
+	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
+
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	{
+
+		for(int i=0; i < 16; i++)
+		{
+			for(size_t t = 4; t < 12; ++t)
+			{
+				cn_aes_pseudo_round_mut( sharedMemory, ctx_state + 4u * t, ctx_key1 );
+			}
+			// skip the first 4 * 128bit blocks = 4 * 4 uint32_t values
+			mix_and_propagate(ctx_state + 4 * 4);
+		}
+		// double buffer to move manipulated state into phase1
+		memcpy( d_ctx_state2 + thread * 50, ctx_state, 50 * 4 );
+	}
 }
 
-__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state )
+template<xmrstak_algo ALGO>
+__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
 {
 	const int thread = blockDim.x * blockIdx.x + threadIdx.x;
 
+	__shared__ uint32_t sharedMemory[1024];
+
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	{
+		cn_aes_gpu_init( sharedMemory );
+		__syncthreads( );
+	}
 	if ( thread >= threads )
 		return;
 
@@ -135,10 +197,27 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	uint64_t hash[4];
 	uint32_t state[50];
 
-#pragma unroll
+	#pragma unroll
 	for ( i = 0; i < 50; i++ )
 		state[i] = ctx_state[i];
 
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	{
+		uint32_t key[40];
+
+		// load keys
+		MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
+
+		for(int i=0; i < 16; i++)
+		{
+			for(size_t t = 4; t < 12; ++t)
+			{
+				cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key );
+			}
+			// skip the first 4 * 128bit blocks = 4 * 4 uint32_t values
+			mix_and_propagate(state + 4 * 4);
+		}
+	}
 	cn_keccakf2( (uint64_t *) state );
 
 	switch ( ( (uint8_t *) state )[0] & 0x03 )
@@ -204,25 +283,44 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		break;
 
 	};
-	CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 
-	size_t hashMemSize;
-	if(::jconf::inst()->IsCurrencyMonero())
+	// prefer shared memory over L1 cache
+	CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
+
+	size_t hashMemSize = std::max(
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	);
+
+	size_t wsize = ctx->device_blocks * ctx->device_threads;
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
+	size_t ctx_b_size = 4 * sizeof(uint32_t) * wsize;
+	if(
+		cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+	)
 	{
-		hashMemSize = MONERO_MEMORY;
+		// extend ctx_b to hold the state of idx0
+		ctx_b_size += sizeof(uint32_t) * wsize;
+		// create a double buffer for the state to exchange the mixed state to phase1
+		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
-	else
+	else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+			cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
 	{
-		hashMemSize = AEON_MEMORY;
+		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
+		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
 	}
+	else
+		ctx->d_ctx_state2 = ctx->d_ctx_state;
 
-	size_t wsize = ctx->device_blocks * ctx->device_threads;
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_text, 32 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, 4 * sizeof(uint32_t) * wsize));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size));
+	// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
@@ -233,7 +331,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	return 1;
 }
 
-extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce)
+extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo)
 {
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -241,11 +339,37 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 	dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
 	dim3 block( threadsperblock );
 
-	CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-		ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	if(miner_algo == cryptonight_heavy)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
+	else if(miner_algo == cryptonight_haven)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_haven><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
+	else if(miner_algo == cryptonight_bittube2)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
+	else if(miner_algo == cryptonight_monero_v8) // part of the same chain: a plain 'if' here made the heavy variants also run the fallback kernel below
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
+	else
+	{
+		/* fallback for all other algorithms: d_ctx_state is passed twice because the
+		 * second state is the one used later in phase1, the first is then used in phase3
+		 */
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<invalid_algo><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 }
 
-extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce)
+extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo)
 {
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -256,14 +380,46 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
 
-	CUDA_CHECK_MSG_KERNEL(
-		ctx->device_id,
-		"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-		cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )
-	);
+	if(miner_algo == cryptonight_heavy)
+	{
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
+	else if(miner_algo == cryptonight_haven)
+	{
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_haven><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
+	else if(miner_algo == cryptonight_bittube2)
+	{
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_bittube2><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
+	else
+	{
+		// fallback for all other algorithms
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<invalid_algo><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
 
 	CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
-	CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
+	CUDA_CHECK_MSG(
+		ctx->device_id,
+		"\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**",
+		cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )
+	);
 
 	/* There is only a 32bit limit for the counter on the device side
 	 * therefore this value can be greater than 10, in that case limit rescount
@@ -317,19 +473,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 	if(version < CUDART_VERSION)
 	{
-		printf("Driver does not support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
+		printf("WARNING: Driver supports CUDA %d.%d but this was compiled for CUDA %d.%d API! Update your nVidia driver or compile with older CUDA!\n",
+			version / 1000, (version % 1000 / 10),
+			CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
 		return 1;
 	}
 
 	int GPU_N;
 	if(cuda_get_devicecount(&GPU_N) == 0)
 	{
+		printf("WARNING: CUDA claims zero devices?\n");
 		return 1;
 	}
 
 	if(ctx->device_id >= GPU_N)
 	{
-		printf("Invalid device ID!\n");
+		printf("WARNING: Invalid device ID '%i'!\n", ctx->device_id);
 		return 1;
 	}
 
@@ -350,6 +509,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 	ctx->name = std::string(props.name);
 
+	printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... ",
+		version / 1000, (version % 1000 / 10),
+		CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10,
+		ctx->device_id, gpuArch, ctx->device_name);
+
 	std::vector<int> arch;
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
@@ -358,18 +522,19 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 #undef XMRSTAK_PP_TOSTRING1
 	std::stringstream ss(archStringList);
 
-	//transform string list sperated with `+` into a vector of integers
+	//transform string list separated with `+` into a vector of integers
 	int tmpArch;
 	while ( ss >> tmpArch )
 		arch.push_back( tmpArch );
 
+	#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n"
 	if(gpuArch >= 20 && gpuArch < 30)
 	{
 		// compiled binary must support sm_20 for fermi
 		std::vector<int>::iterator it = std::find(arch.begin(), arch.end(), 20);
 		if(it == arch.end())
 		{
-			printf("WARNING: NVIDIA GPU %d: miner not compiled for the gpu architecture %d.\n", ctx->device_id, gpuArch);
+			printf(MSG_CUDA_NO_ARCH);
 			return 5;
 		}
 	}
@@ -383,21 +548,21 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 *   with a sm_20 only compiled binary
 		 */
 		for(int i = 0; i < arch.size(); ++i)
-			if(minSupportedArch == 0 || (arch[i] >= 30 && arch[i] < minSupportedArch))
+			if(arch[i] >= 30  && (minSupportedArch == 0 || arch[i] < minSupportedArch))
 				minSupportedArch = arch[i];
 		if(minSupportedArch < 30 || gpuArch < minSupportedArch)
 		{
-			printf("WARNING: NVIDIA GPU %d: miner not compiled for the gpu architecture %d.\n", ctx->device_id, gpuArch);
+			printf(MSG_CUDA_NO_ARCH);
 			return 5;
 		}
 	}
 
-	// set all evice option those marked as auto (-1) to a valid value
+	// set all device options that are marked as auto (-1) to a valid value
 	if(ctx->device_blocks == -1)
 	{
 		/* good values based of my experience
-		 *	 - 3 * SMX count >=sm_30
-		 *   - 2 * SMX count for <sm_30
+		 *   - 3 * SMX count for >=sm_30
+		 *   - 2 * SMX count for  <sm_30
 		 */
 		ctx->device_blocks = props.multiProcessorCount *
 			( props.major < 3 ? 2 : 3 );
@@ -414,7 +579,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 */
 		ctx->device_threads = 64;
 		constexpr size_t byteToMiB = 1024u * 1024u;
-		
+
 		// no limit by default 1TiB
 		size_t maxMemUsage = byteToMiB * byteToMiB;
 		if(props.major == 6)
@@ -441,20 +606,27 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			maxMemUsage = size_t(1024u) * byteToMiB;
 		}
 
+		if(props.multiProcessorCount <= 6)
+		{
+			// limit memory usage for low end devices to reduce the number of threads
+			maxMemUsage = size_t(1024u) * byteToMiB;
+		}
+
 		int* tmp;
 		cudaError_t err;
+		#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n"
 		// a device must be selected to get the right memory usage later on
 		err = cudaSetDevice(ctx->device_id);
 		if(err != cudaSuccess)
 		{
-			printf("WARNING: NVIDIA GPU %d: cannot be selected.\n", ctx->device_id);
+			printf(MSG_CUDA_FUNC_FAIL, "cudaSetDevice");
 			return 2;
 		}
 		// trigger that a context on the gpu will be allocated
 		err = cudaMalloc(&tmp, 256);
 		if(err != cudaSuccess)
 		{
-			printf("WARNING: NVIDIA GPU %d: context cannot be created.\n", ctx->device_id);
+			printf(MSG_CUDA_FUNC_FAIL, "cudaMalloc");
 			return 3;
 		}
 
@@ -466,19 +638,14 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		CUDA_CHECK(ctx->device_id, cudaFree(tmp));
 		// delete created context on the gpu
 		CUDA_CHECK(ctx->device_id, cudaDeviceReset());
-		
+
 		ctx->total_device_memory = totalMemory;
 		ctx->free_device_memory = freeMemory;
 
-		size_t hashMemSize;
-		if(::jconf::inst()->IsCurrencyMonero())
-		{
-			hashMemSize = MONERO_MEMORY;
-		}
-		else
-		{
-			hashMemSize = AEON_MEMORY;
-		}
+		size_t hashMemSize = std::max(
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+		);
 
 #ifdef WIN32
 		/* We use in windows bfactor (split slow kernel into smaller parts) to avoid
@@ -492,9 +659,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		size_t usedMem = totalMemory - freeMemory;
 		if(usedMem >= maxMemUsage)
 		{
-			printf("WARNING: NVIDIA GPU %d: already %s MiB memory in use, skip GPU.\n",
-				ctx->device_id,
-				std::to_string(usedMem/byteToMiB).c_str());
+			printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str());
 			return 4;
 		}
 		else
@@ -508,6 +673,13 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		// up to 16kibyte extra memory is used per thread for some kernel (lmem/local memory)
 		// 680bytes are extra meta data memory per hash
 		size_t perThread = hashMemSize + 16192u + 680u;
+		if(
+			cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+			cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+			cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		)
+			perThread += 50 * 4; // state double buffer
+
 		size_t max_intensity = limitedMemory / perThread;
 		ctx->device_threads = max_intensity / ctx->device_blocks;
 		// use only odd number of threads
@@ -519,7 +691,27 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			ctx->device_threads = 64;
 		}
 
+		// check if cryptonight_monero_v8 is selected for the user pool
+		bool useCryptonight_v8 =
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
+		if(useCryptonight_v8 && gpuArch >= 50)
+		{
+			// 4 based on my test maybe it must be adjusted later
+			size_t threads = 4;
+			// 8 is chosen by checking the occupancy calculator
+			size_t blockOptimal = 8 * ctx->device_mpcount;
+
+			if(blockOptimal * threads * hashMemSize < limitedMemory)
+			{
+				ctx->device_threads = threads;
+				ctx->device_blocks = blockOptimal;
+			}
+		}
 	}
+	printf("device init succeeded\n");
 
 	return 0;
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
index 055a8bdfc..4d369f843 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
@@ -36,8 +36,8 @@ __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int
 	{
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} 
-	else 
+	}
+	else
 	{
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
new file mode 100644
index 000000000..796b7adda
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <stdint.h>
+// Reciprocal estimate of the 32-bit value a, consumed by fast_div_v2(). NOTE(review): exact scaling of the returned value is not visible here - confirm against the reference implementation.
+__device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
+{
+	const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23)); // float assembled from raw bits: upper 24 bits of a added to an exponent constant
+	const float a_lo = __uint2float_rn(a & 0xFF); // lowest 8 bits of a, converted to float
+
+	float r;
+	asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi)); // PTX approximate reciprocal of a_hi
+	const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23)); // adds 64 to the exponent field of r (presumably r * 2^64)
+
+	const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f)); // residual a_lo * r + (a_hi * r - 1)
+	return (__float_as_uint(r) << 9) - __float2int_rn(h * r_scaled); // raw bits of r shifted left by 9, corrected by the scaled residual
+}
+// Division of a by b built on get_reciprocal(b): packs q[0] (quotient) into the low and q[1] (remainder) into the high 32 bits of the result. NOTE(review): valid input range for b is not visible here.
+__device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
+{
+	const uint32_t r = get_reciprocal(b);
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a; // a 64-bit wrap-around of this sum is detected below via (k < a)
+
+	uint32_t q[2];
+	q[0] = ((uint32_t*)&k)[1]; // quotient estimate = upper 32 bits of k
+
+	int64_t tmp = a - (uint64_t)(q[0]) * b; // candidate remainder for the estimated quotient
+	((int32_t*)(&tmp))[1] -= (k < a) ? b : 0; // if k wrapped, subtract b from the upper word of tmp
+
+	const bool overshoot = ((int32_t*)(&tmp))[1] < 0; // upper word negative: quotient estimate too large
+	const bool undershoot = tmp >= b; // remainder still >= b: quotient estimate too small
+
+	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); // adjust the quotient by at most one
+	q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U); // matching adjustment of the remainder
+
+	return *((uint64_t*)(q));
+}
+// Square-root based 32-bit estimate derived from the 64-bit value n1 using PTX approximations plus a final +/-1 fix-up. NOTE(review): exact result definition (scaling/rounding) is not visible here - confirm against the reference implementation.
+__device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
+{
+	float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23)); // float assembled from raw bits: upper word of n1 shifted right by 9, added to an exponent constant
+	float x1;
+	asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x)); // x1 = PTX approximate reciprocal square root of x
+	asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x)); // x = PTX approximate square root of x
+
+	// Scale x1 by 2^32 (x1 *= 4294967296.0f) by adding 32 to its exponent field.
+	x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23));
+
+	const uint32_t x0 = __float_as_uint(x) - (158U << 23); // raw bits of x minus an exponent constant
+	const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18);
+	const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1; // correction term computed from the upper word of delta0
+
+	uint32_t result = (x0 << 10) + __float2int_rn(delta);
+	const uint32_t s = result >> 1;
+	const uint32_t b = result & 1;
+
+	const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1;
+	const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0; // -1 when the first check on x2 is positive
+	const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0; // +1 when the second check on x2 is negative
+	result += (overshoot+undershoot); // final adjustment by -1, 0 or +1
+	return result;
+}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
index a37934cb8..d5a98b7da 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
@@ -167,7 +167,7 @@ __device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __re
 	uint32_t y[2*GROESTL_COLS512];
 	uint32_t z[2*GROESTL_COLS512];
 
-	for (i = 0; i < 2*GROESTL_COLS512; i++) 
+	for (i = 0; i < 2*GROESTL_COLS512; i++)
 	{
 		z[i] = m[i];
 		Ptmp[i] = h[i]^m[i];
@@ -227,23 +227,23 @@ __device__ void cn_groestl_outputtransformation(groestlHashState *ctx)
 __device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx,
 	const uint8_t * __restrict__ input, int msglen)
 {
-	for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) 
+	for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512)
 	{
 		cn_groestl_F512(ctx->chaining,(uint32_t*)input);
 		ctx->block_counter1++;
 
-		if (ctx->block_counter1 == 0) 
+		if (ctx->block_counter1 == 0)
 			ctx->block_counter2++;
 	}
 }
 
-__device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx, 
+__device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
 	BitSequence* __restrict__  output)
 {
 	int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8;
 	uint8_t *s = (BitSequence*)ctx->chaining;
 
-	if (ctx->bits_in_last_byte) 
+	if (ctx->bits_in_last_byte)
 	{
 		ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte);
 		ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte);
@@ -254,9 +254,9 @@ __device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
 		ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
 	}
 
-	if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) 
+	if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
 	{
-		while (ctx->buf_ptr < GROESTL_SIZE512) 
+		while (ctx->buf_ptr < GROESTL_SIZE512)
 			ctx->buffer[(int)ctx->buf_ptr++] = 0;
 
 		cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
@@ -300,14 +300,14 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx,
 	int msglen = (int)(databitlen/8);
 	int rem = (int)(databitlen%8);
 
-	if (ctx->buf_ptr) 
+	if (ctx->buf_ptr)
 	{
 		while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen)
 			ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
 
-		if (ctx->buf_ptr < GROESTL_SIZE512) 
+		if (ctx->buf_ptr < GROESTL_SIZE512)
 		{
-			if (rem) 
+			if (rem)
 			{
 				ctx->bits_in_last_byte = rem;
 				ctx->buffer[(int)ctx->buf_ptr++] = input[index];
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
index 679046ecf..284039ff4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
@@ -111,7 +111,7 @@ __device__ void cn_jh_E8(jhHashState *state)
 {
 	uint64_t i,roundnumber,temp0,temp1;
 
-	for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) 
+	for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7)
 	{
 		for (i = 0; i < 2; i++)
 		{
@@ -155,13 +155,13 @@ __device__ void cn_jh_E8(jhHashState *state)
 			JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++) 
+		for (i = 0; i < 2; i++)
 		{
 			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] );
 			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
 		}
 
-		for (i = 1; i < 8; i = i+2) 
+		for (i = 1; i < 8; i = i+2)
 		{
 			temp0 = state->x[i][0];
 			state->x[i][0] = state->x[i][1];
@@ -190,7 +190,7 @@ __device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence
 	state->databitlen += databitlen;
 	index = 0;
 
-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  ) 
+	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  )
 	{
 		if ( (databitlen & 7) == 0 )
 			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3));
@@ -215,7 +215,7 @@ __device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence
 		cn_jh_F8(state);
 	}
 
-	if ( databitlen > 0) 
+	if ( databitlen > 0)
 	{
 		if ((databitlen & 7) == 0)
 			memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
@@ -247,7 +247,7 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re
 		state->buffer[56] = (state->databitlen >> 56) & 0xff;
 		cn_jh_F8(state);
 	}
-	else 
+	else
 	{
 		/*set the rest of the bytes in the buffer to 0*/
 		if ( (state->datasize_in_buffer & 7) == 0)
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
index 041a593b7..fc45db1ae 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
@@ -221,7 +221,7 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co
 		ctx->X[7] = X7 ^ w[7];
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
-	} 
+	}
 	while (--blkCnt);
 
 	ctx->h.T[0] = ts[0];
@@ -239,7 +239,7 @@ __device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __
 
 	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
 
-	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) 
+	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
 	{
 		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 		//p8 = &ctx->b[ctx->h.bCnt];
@@ -258,7 +258,7 @@ __device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __
 
 	memcpy(X,ctx->X,sizeof(X));
 
-	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) 
+	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
 	{
 		((uint64_t *)ctx->b)[0]= (uint64_t)i;
 		Skein_Start_New_Type(ctx,OUT_FINAL);
@@ -275,15 +275,15 @@ __device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const ui
 {
 	size_t n;
 
-	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) 
+	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
 	{
 
-		if (ctx->h.bCnt) 
+		if (ctx->h.bCnt)
 		{
 
 			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
 
-			if (n) 
+			if (n)
 			{
 				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
 				msgByteCnt  -= n;
@@ -295,7 +295,7 @@ __device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const ui
 			ctx->h.bCnt = 0;
 		}
 
-		if (msgByteCnt > SKEIN_512_BLOCK_BYTES) 
+		if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
 		{
 			n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
 			cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
@@ -304,7 +304,7 @@ __device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const ui
 		}
 	}
 
-	if (msgByteCnt) 
+	if (msgByteCnt)
 	{
 		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
@@ -313,11 +313,11 @@ __device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const ui
 
 __device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
 {
-	if ((databitlen & 7) == 0) 
+	if ((databitlen & 7) == 0)
 	{
 		cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3);
 	}
-	else 
+	else
 	{
 
 		size_t bCnt = (databitlen >> 3) + 1;
diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp
index 89cdf97f8..5c7dfe16a 100644
--- a/xmrstak/backend/plugin.hpp
+++ b/xmrstak/backend/plugin.hpp
@@ -27,8 +27,11 @@ namespace xmrstak
 struct plugin
 {
 
-	plugin(const std::string backendName, const std::string libName) : fn_starterBackend(nullptr), m_backendName(backendName)
+	plugin() = default;
+
+	void load(const std::string backendName, const std::string libName)
 	{
+		m_backendName = backendName;
 #ifdef WIN32
 		libBackend = LoadLibrary(TEXT((libName + ".dll").c_str()));
 		if(!libBackend)
@@ -59,15 +62,15 @@ struct plugin
 #endif
 
 #ifdef WIN32
-		fn_starterBackend = (starterBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend");
-		if (!fn_starterBackend)
+		fn_startBackend = (startBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend");
+		if (!fn_startBackend)
 		{
 			std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " <<GetLastError()<< std::endl;
 		}
 #else
 		// reset last error
 		dlerror();
-		fn_starterBackend = (starterBackend_t) dlsym(libBackend, "xmrstak_start_backend");
+		fn_startBackend = (startBackend_t) dlsym(libBackend, "xmrstak_start_backend");
 		const char* dlsym_error = dlerror();
 		if(dlsym_error)
 		{
@@ -78,35 +81,39 @@ struct plugin
 
 	std::vector<iBackend*>* startBackend(uint32_t threadOffset, miner_work& pWork, environment& env)
 	{
-		if(fn_starterBackend == nullptr)
+		if(fn_startBackend == nullptr)
 		{
 			std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
-			std::cerr << "WARNING: " << m_backendName << " Backend disabled"<< std::endl;
 			return pvThreads;
 		}
 
-		return fn_starterBackend(threadOffset, pWork, env);
+		return fn_startBackend(threadOffset, pWork, env);
+	}
+
+	// Close the backend library (if one is loaded) and disable startBackend(); safe to call again after load().
+	void unload()
+	{
+#ifdef WIN32
+		if(libBackend) FreeLibrary(libBackend);
+#else
+		if(libBackend) dlclose(libBackend);
+#endif
+		// forget the closed handle so that a later unload() cannot release it a second time
+		libBackend = nullptr;
+		fn_startBackend = nullptr;
+	}
 
 	std::string m_backendName;
 
-	typedef std::vector<iBackend*>* (*starterBackend_t)(uint32_t threadOffset, miner_work& pWork, environment& env);
+	typedef std::vector<iBackend*>* (*startBackend_t)(uint32_t threadOffset, miner_work& pWork, environment& env);
 
-	starterBackend_t fn_starterBackend;
+	startBackend_t fn_startBackend = nullptr;
 
 #ifdef WIN32
 	HINSTANCE libBackend;
 #else
-	void *libBackend;
-#endif
-
-/* \todo add unload to destructor and change usage of plugin that libs keeped open until the miner endss
-#ifdef WIN32
-	FreeLibrary(libBackend);
-#else
-	dlclose(libBackend);
+	void *libBackend = nullptr;
 #endif
- * */
 };
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/backend/pool_data.hpp b/xmrstak/backend/pool_data.hpp
new file mode 100644
index 000000000..4e92359ec
--- /dev/null
+++ b/xmrstak/backend/pool_data.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+constexpr static size_t invalid_pool_id = (-1); // sentinel: -1 converted to size_t, i.e. the largest size_t value
+
+namespace xmrstak
+{
+
+struct pool_data // a saved nonce plus a pool id (presumably the pool the nonce belongs to - confirm with callers)
+{
+	uint32_t iSavedNonce; // 0 after default construction
+	size_t   pool_id; // invalid_pool_id after default construction
+
+	pool_data() : iSavedNonce(0), pool_id(invalid_pool_id) // default state: nonce 0 and the invalid-pool sentinel
+	{
+	}
+};
+
+} // namespace xmrstak
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index b84b783e2..c57416f2e 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -55,7 +55,7 @@
 #	include "xmrstak/misc/uac.hpp"
 #endif // _WIN32
 
-void do_benchmark();
+int do_benchmark(int block_version, int wait_sec, int work_sec);
 
 void help()
 {
@@ -64,43 +64,56 @@ void help()
 
 	cout<<"Usage: "<<params::inst().binaryName<<" [OPTION]..."<<endl;
 	cout<<" "<<endl;
-	cout<<"  -h, --help            show this help"<<endl;
-	cout<<"  -v, --version         show version number"<<endl;
-	cout<<"  -V, --version-long    show long version number"<<endl;
-	cout<<"  -c, --config FILE     common miner configuration file"<<endl;
+	cout<<"  -h, --help                 show this help"<<endl;
+	cout<<"  --hide                     hide results screen"<<endl;
+	cout<<"  -v, --version              show version number"<<endl;
+	cout<<"  -V, --version-long         show long version number"<<endl;
+	cout<<"  -c, --config FILE          common miner configuration file"<<endl;
+	cout<<"  -C, --poolconf FILE        pool configuration file"<<endl;
 #ifdef _WIN32
-	cout<<"  --noUAC               disable the UAC dialog"<<endl;
-#endif
-#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
-	cout<<"  --currency NAME       currency to mine: monero or aeon"<<endl;
+	cout<<"  --noUAC                    disable the UAC dialog"<<endl;
 #endif
+	cout<<"  --benchmark BLOCKVERSION   ONLY do a benchmark and exit"<<endl;
+	cout<<"  --benchwait WAIT_SEC             ... benchmark wait time"<<endl;
+	cout<<"  --benchwork WORK_SEC             ... benchmark work time"<<endl;
 #ifndef CONF_NO_CPU
-	cout<<"  --noCPU               disable the CPU miner backend"<<endl;
-	cout<<"  --cpu FILE            CPU backend miner config file"<<endl;
+	cout<<"  --noCPU                    disable the CPU miner backend"<<endl;
+	cout<<"  --cpu FILE                 CPU backend miner config file"<<endl;
 #endif
 #ifndef CONF_NO_OPENCL
-	cout<<"  --noAMD               disable the AMD miner backend"<<endl;
-	cout<<"  --amd FILE            AMD backend miner config file"<<endl;
+	cout<<"  --noAMD                    disable the AMD miner backend"<<endl;
+	cout<<"  --noAMDCache               disable the AMD(OpenCL) cache for precompiled binaries"<<endl;
+	cout<<"  --openCLVendor VENDOR      use OpenCL driver of VENDOR and devices [AMD,NVIDIA]"<<endl;
+	cout<<"                             default: AMD"<<endl;
+	cout<<"  --amd FILE                 AMD backend miner config file"<<endl;
 #endif
 #ifndef CONF_NO_CUDA
-	cout<<"  --noNVIDIA            disable the NVIDIA miner backend"<<endl;
-	cout<<"  --nvidia FILE         NVIDIA backend miner config file"<<endl;
+	cout<<"  --noNVIDIA                 disable the NVIDIA miner backend"<<endl;
+	cout<<"  --nvidia FILE              NVIDIA backend miner config file"<<endl;
+#endif
+#ifndef CONF_NO_HTTPD
+	cout<<"  -i --httpd HTTP_PORT       HTTP interface port"<<endl;
 #endif
 	cout<<" "<<endl;
 	cout<<"The following options can be used for automatic start without a guided config,"<<endl;
 	cout<<"If config exists then this pool will be top priority."<<endl;
-	cout<<"  -o, --url URL         pool url and port, e.g. pool.usxmrpool.com:3333"<<endl;
-	cout<<"  -O, --tls-url URL     TLS pool url and port, e.g. pool.usxmrpool.com:10443"<<endl;
-	cout<<"  -u, --user USERNAME   pool user name or wallet address"<<endl;
-	cout<<"  -p, --pass PASSWD     pool password, in the most cases x or empty \"\""<<endl;
-	cout<<"  --use-nicehash        the pool should run in nicehash mode"<<endl;
-	cout<<" \n"<<endl;
+	cout<<"  -o, --url URL              pool url and port, e.g. pool.usxmrpool.com:3333"<<endl;
+	cout<<"  -O, --tls-url URL          TLS pool url and port, e.g. pool.usxmrpool.com:10443"<<endl;
+	cout<<"  -u, --user USERNAME        pool user name or wallet address"<<endl;
+	cout<<"  -r, --rigid RIGID          rig identifier for pool-side statistics (needs pool support)"<<endl;
+	cout<<"  -p, --pass PASSWD          pool password, in the most cases x or empty \"\""<<endl;
+	cout<<"  --use-nicehash             the pool should run in nicehash mode"<<endl;
+	cout<<"  --currency NAME            currency to mine"<<endl;
+	cout<< endl;
 #ifdef _WIN32
 	cout<<"Environment variables:\n"<<endl;
-	cout<<"  XMRSTAK_NOWAIT        disable the dialog `Press any key to exit."<<std::endl;
-	cout<<"                	       for non UAC execution"<<endl;
-	cout<<" \n"<<endl;
+	cout<<"  XMRSTAK_NOWAIT             disable the dialog `Press any key to exit."<<std::endl;
+	cout<<"                	            for non UAC execution"<<endl;
+	cout<< endl;
 #endif
+	std::string algos;
+	jconf::GetAlgoList(algos);
+	cout<< "Supported coin options: " << endl << algos << endl;
 	cout<< "Version: " << get_version_str_short() << endl;
 	cout<<"Brought to by fireice_uk and psychocrypt under GPLv3."<<endl;
 }
@@ -129,10 +142,7 @@ std::string get_multipool_entry(bool& final)
 	std::cout<<std::endl<<"- Next Pool:"<<std::endl<<std::endl;
 
 	std::string pool;
-	if(xmrstak::params::inst().currency == "monero")
-		std::cout<<"- Pool address: e.g. pool.usxmrpool.com:3333"<<std::endl;
-	else
-		std::cout<<"- Pool address: e.g. mine.aeon-pool.com:5555"<<std::endl;
+	std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
 	std::cin >> pool;
 
 	std::string userName;
@@ -144,6 +154,10 @@ std::string get_multipool_entry(bool& final)
 	std::cout<<"- Password (mostly empty or x):"<<std::endl;
 	getline(std::cin, passwd);
 
+	std::string rigid;
+	std::cout<<"- Rig identifier for pool-side statistics (needs pool support). Can be empty:"<<std::endl;
+	getline(std::cin, rigid);
+
 #ifdef CONF_NO_TLS
 	bool tls = false;
 #else
@@ -162,9 +176,9 @@ std::string get_multipool_entry(bool& final)
 
 	final = !read_yes_no("- Do you want to add another pool? (y/n)");
 
-	return "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName +  "\", \"pool_password\" : \"" + 
-		passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + bool_to_str(tls) + 
-		", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
+	return "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid +
+		"\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
+		bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
 }
 
 inline void prompt_once(bool& prompted)
@@ -176,36 +190,33 @@ inline void prompt_once(bool& prompted)
 	}
 }
 
-void do_guided_config()
+void do_guided_pool_config()
 {
 	using namespace xmrstak;
 
 	// load the template of the backend config into a char variable
 	const char *tpl =
-		#include "../config.tpl"
+		#include "../pools.tpl"
 	;
 
 	configEditor configTpl{};
 	configTpl.set(std::string(tpl));
 	bool prompted = false;
-	
+
 	auto& currency = params::inst().currency;
-	if(currency.empty())
+	if(currency.empty() || !jconf::IsOnAlgoList(currency))
 	{
 		prompt_once(prompted);
 
 		std::string tmp;
-#if defined(CONF_NO_AEON)
-		tmp = "monero";
-#elif defined(CONF_NO_MONERO)
-		tmp = "aeon";
-#endif
-		while(tmp != "monero" && tmp != "aeon")
+		while(tmp.empty() || !jconf::IsOnAlgoList(tmp))
 		{
-			std::cout<<"- Currency: 'monero' or 'aeon'"<<std::endl;
+			std::string list;
+			jconf::GetAlgoList(list);
+			std::cout << "- Please enter the currency that you want to mine: "<<std::endl;
+			std::cout << list << std::endl;
 			std::cin >> tmp;
-			std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
-		} 
+		}
 		currency = tmp;
 	}
 
@@ -216,10 +227,7 @@ void do_guided_config()
 		prompt_once(prompted);
 
 		userSetPool = false;
-		if(currency == "monero")
-			std::cout<<"- Pool address: e.g. pool.usxmrpool.com:3333"<<std::endl;
-		else
-			std::cout<<"- Pool address: e.g. mine.aeon-pool.com:5555"<<std::endl;
+		std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
 		std::cin >> pool;
 	}
 
@@ -232,6 +240,7 @@ void do_guided_config()
 		std::cin >> userName;
 	}
 
+	bool stdin_flushed = false;
 	auto& passwd = params::inst().poolPasswd;
 	if(passwd.empty() && !params::inst().userSetPwd)
 	{
@@ -239,10 +248,27 @@ void do_guided_config()
 
 		// clear everything from stdin to allow an empty password
 		std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
+		stdin_flushed = true;
+
 		std::cout<<"- Password (mostly empty or x):"<<std::endl;
 		getline(std::cin, passwd);
 	}
 
+	auto& rigid = params::inst().poolRigid;
+	if(rigid.empty() && !params::inst().userSetRigid)
+	{
+		prompt_once(prompted);
+
+		if(!stdin_flushed)
+		{
+			// clear everything from stdin to allow an empty rigid
+			std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
+		}
+
+		std::cout<<"- Rig identifier for pool-side statistics (needs pool support). Can be empty:"<<std::endl;
+		getline(std::cin, rigid);
+	}
+
 	bool tls;
 #ifdef CONF_NO_TLS
 	tls = false;
@@ -278,7 +304,7 @@ void do_guided_config()
 		std::cout << "Miner will mine mostly at the pool with the highest weight, unless the pool fails." << std::endl;
 		std::cout << "Weight must be an integer larger than 0." << std::endl;
 		std::cout << "- Please enter a weight for this pool: "<<std::endl;
-		
+
 		while(!(std::cin >> pool_weight) || pool_weight <= 0)
 		{
 			std::cin.clear();
@@ -290,9 +316,9 @@ void do_guided_config()
 		pool_weight = 1;
 
 	std::string pool_table;
-	pool_table += "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName +  "\", \"pool_password\" : \"" + 
-		passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + bool_to_str(tls) + 
-		", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
+	pool_table += "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName +  "\", \"rig_id\" : \"" + rigid +
+		"\", \"pool_password\" : \"" +  passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
+		bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
 
 	if(multipool)
 	{
@@ -304,8 +330,50 @@ void do_guided_config()
 		while(!final);
 	}
 
-	configTpl.replace("POOLCONF", pool_table);
 	configTpl.replace("CURRENCY", currency);
+	configTpl.replace("POOLCONF", pool_table);
+	configTpl.write(params::inst().configFilePools);
+	std::cout<<"Pool configuration stored in file '"<<params::inst().configFilePools<<"'"<<std::endl;
+}
+
+void do_guided_config() // interactively fills config.tpl and writes it to params::inst().configFile
+{
+	using namespace xmrstak;
+
+	// config.tpl is textually included here and supplies the initializer expression for tpl
+	const char *tpl =
+		#include "../config.tpl"
+	;
+
+	configEditor configTpl{};
+	configTpl.set(std::string(tpl));
+	bool prompted = false; // passed to prompt_once(), presumably so its one-time output is not repeated
+
+	auto& http_port = params::inst().httpd_port;
+	if(http_port == params::httpd_port_unset) // ask only while the port still carries the "unset" marker
+	{
+#if defined(CONF_NO_HTTPD)
+		http_port = params::httpd_port_disabled; // HTTP support compiled out: store the "disabled" value without asking
+#else
+		prompt_once(prompted);
+
+		std::cout<<"- Do you want to use the HTTP interface?" <<std::endl;
+		std::cout<<"Unlike the screen display, browser interface is not affected by the GPU lag." <<std::endl;
+		std::cout<<"If you don't want to use it, please enter 0, otherwise enter port number that the miner should listen on" <<std::endl;
+
+		int32_t port;
+		while(!(std::cin >> port) || port < 0 || port > 65535) // NOTE(review): looks like this cannot terminate once stdin reaches EOF - confirm
+		{
+			std::cin.clear(); // reset the stream's error state
+			std::cin.ignore(INT_MAX, '\n'); // discard the rest of the rejected input line
+			std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl;
+		}
+
+		http_port = port;
+#endif
+	}
+
+	configTpl.replace("HTTP_PORT", std::to_string(http_port));
 	configTpl.write(params::inst().configFile);
 	std::cout<<"Configuration stored in file '"<<params::inst().configFile<<"'"<<std::endl;
 }
@@ -326,23 +394,30 @@ int main(int argc, char *argv[])
 	using namespace xmrstak;
 
 	std::string pathWithName(argv[0]);
-	std::string seperator("/");
-	auto pos = pathWithName.rfind(seperator);
+	std::string separator("/");
+	auto pos = pathWithName.rfind(separator);
 
 	if(pos == std::string::npos)
 	{
 		// try windows "\"
-		seperator = "\\";
-		pos = pathWithName.rfind(seperator);
+		separator = "\\";
+		pos = pathWithName.rfind(separator);
 	}
 	params::inst().binaryName = std::string(pathWithName, pos + 1, std::string::npos);
 	if(params::inst().binaryName.compare(pathWithName) != 0)
 	{
 		params::inst().executablePrefix = std::string(pathWithName, 0, pos);
-		params::inst().executablePrefix += seperator;
+		params::inst().executablePrefix += separator;
+	}
+
+	params::inst().minerArg0 = argv[0];
+	params::inst().minerArgs.reserve(argc * 16);
+	for(int i = 1; i < argc; i++)
+	{
+		params::inst().minerArgs += " ";
+		params::inst().minerArgs += argv[i];
 	}
 
-	bool uacDialog = true;
 	bool pool_url_set = false;
 	for(size_t i = 1; i < argc-1; i++)
 	{
@@ -380,10 +455,36 @@ int main(int argc, char *argv[])
 		{
 			params::inst().useAMD = false;
 		}
+		else if(opName.compare("--openCLVendor") == 0)
+		{
+			++i;
+			if( i >=argc )
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--openCLVendor' given");
+				win_exit();
+				return 1;
+			}
+			std::string vendor(argv[i]);
+			params::inst().openCLVendor = vendor;
+			if(vendor != "AMD" && vendor != "NVIDIA")
+			{
+				printer::inst()->print_msg(L0, "'--openCLVendor' must be 'AMD' or 'NVIDIA'");
+				win_exit();
+				return 1;
+			}
+		}
+		else if(opName.compare("--noAMDCache") == 0)
+		{
+			params::inst().AMDCache = false;
+		}
 		else if(opName.compare("--noNVIDIA") == 0)
 		{
 			params::inst().useNVIDIA = false;
 		}
+		else if(opName.compare("--hide") == 0)
+		{
+			params::inst().useHide = true;
+		}
 		else if(opName.compare("--cpu") == 0)
 		{
 			++i;
@@ -489,6 +590,26 @@ int main(int argc, char *argv[])
 			params::inst().userSetPwd = true;
 			params::inst().poolPasswd = argv[i];
 		}
+		else if(opName.compare("-r") == 0 || opName.compare("--rigid") == 0)
+		{
+			if(!pool_url_set)
+			{
+				printer::inst()->print_msg(L0, "Pool address has to be set if you want to specify rigid.");
+				win_exit();
+				return 1;
+			}
+
+			++i;
+			if( i >=argc )
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '-r/--rigid' given");
+				win_exit();
+				return 1;
+			}
+
+			params::inst().userSetRigid = true;
+			params::inst().poolRigid = argv[i];
+		}
 		else if(opName.compare("--use-nicehash") == 0)
 		{
 			params::inst().nicehashMode = true;
@@ -504,9 +625,99 @@ int main(int argc, char *argv[])
 			}
 			params::inst().configFile = argv[i];
 		}
+		else if(opName.compare("-C") == 0 || opName.compare("--poolconf") == 0)
+		{
+			++i;
+			if( i >=argc )
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '-C/--poolconf' given");
+				win_exit();
+				return 1;
+			}
+			params::inst().configFilePools = argv[i];
+		}
+		else if(opName.compare("-i") == 0 || opName.compare("--httpd") == 0)
+		{
+			++i;
+			if( i >=argc )
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '-i/--httpd' given");
+				win_exit();
+				return 1;
+			}
+
+			char* endp = nullptr;
+			long int ret = strtol(argv[i], &endp, 10);
+			// strtol() never nulls endp: without digits it stays at argv[i], with trailing junk it stops before '\0'
+			if(endp == argv[i] || *endp != '\0' || ret < 0 || ret > 65535)
+			{
+				printer::inst()->print_msg(L0, "Argument for parameter '-i/--httpd' must be a number between 0 and 65535");
+				win_exit();
+				return 1;
+			}
+
+			params::inst().httpd_port = ret;
+		}
 		else if(opName.compare("--noUAC") == 0)
 		{
-			uacDialog = false;
+			params::inst().allowUAC = false;
+		}
+		else if(opName.compare("--benchmark") == 0)
+		{
+			++i;
+			if( i >= argc )
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--benchmark' given");
+				win_exit();
+				return 1;
+			}
+			char* block_version = nullptr;
+			long int bversion = strtol(argv[i], &block_version, 10);
+			// block_version is strtol's end pointer: a non-numeric argument must not silently become version 0
+			if(block_version == argv[i] || *block_version != '\0' || bversion < 0 || bversion >= 256)
+			{
+				printer::inst()->print_msg(L0, "Benchmark block version must be in the range [0,255]");
+				return 1;
+			}
+			params::inst().benchmark_block_version = bversion;
+		}
+		else if(opName.compare("--benchwait") == 0)
+		{
+			++i;
+			if( i >= argc )
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--benchwait' given");
+				win_exit();
+				return 1;
+			}
+			char* wait_sec = nullptr;
+			long int waitsec = strtol(argv[i], &wait_sec, 10);
+			// wait_sec is strtol's end pointer: require a fully numeric argument; 300 itself is valid, as the message states
+			if(wait_sec == argv[i] || *wait_sec != '\0' || waitsec < 0 || waitsec > 300)
+			{
+				printer::inst()->print_msg(L0, "Benchmark wait seconds must be in the range [0,300]");
+				return 1;
+			}
+			params::inst().benchmark_wait_sec = waitsec;
+		}
+		else if(opName.compare("--benchwork") == 0)
+		{
+			++i;
+			if( i >= argc )
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--benchwork' given");
+				win_exit();
+				return 1;
+			}
+			char* work_sec = nullptr;
+			long int worksec = strtol(argv[i], &work_sec, 10);
+			// work_sec is strtol's end pointer: require a fully numeric argument; 300 itself is valid, as the message states
+			if(work_sec == argv[i] || *work_sec != '\0' || worksec < 10 || worksec > 300)
+			{
+				printer::inst()->print_msg(L0, "Benchmark work seconds must be in the range [10,300]");
+				return 1;
+			}
+			params::inst().benchmark_work_sec = worksec;
+		}
 		else
 		{
@@ -516,52 +727,63 @@ int main(int argc, char *argv[])
 		}
 	}
 
-#ifdef _WIN32
-	if(uacDialog && !IsElevated())
-	{
-		std::string minerArgs;
-		for(int i = 1; i < argc; i++)
-		{
-			minerArgs += " ";
-			minerArgs += argv[i];
-		}
-
-		SelfElevate(argv[0], minerArgs);
-	}
-#endif
-	
 	// check if we need a guided start
 	if(!configEditor::file_exist(params::inst().configFile))
 		do_guided_config();
 
-	if(!jconf::inst()->parse_config(params::inst().configFile.c_str()))
+	if(!configEditor::file_exist(params::inst().configFilePools))
+		do_guided_pool_config();
+
+	if(!jconf::inst()->parse_config(params::inst().configFile.c_str(), params::inst().configFilePools.c_str()))
 	{
 		win_exit();
 		return 1;
 	}
 
+#ifdef _WIN32
+	/* For Windows 7 and 8 request elevation at all times unless we are using slow memory */
+	if(jconf::inst()->GetSlowMemSetting() != jconf::slow_mem_cfg::always_use && !IsWindows10OrNewer())
+	{
+		printer::inst()->print_msg(L0, "Elevating due to Windows 7 or 8. You need Windows 10 to use fast memory without UAC elevation.");
+		RequestElevation();
+	}
+	if(params::inst().useHide)
+	{
+		ShowWindow(GetConsoleWindow(), SW_HIDE);
+	}
+#endif
+
+	if(strlen(jconf::inst()->GetOutputFile()) != 0)
+		printer::inst()->open_logfile(jconf::inst()->GetOutputFile());
+
 	if (!BackendConnector::self_test())
 	{
+		printer::inst()->print_msg(L0, "Self test not passed!");
 		win_exit();
 		return 1;
 	}
 
-#ifndef CONF_NO_HTTPD
-	if(jconf::inst()->GetHttpdPort() != 0)
+	if(jconf::inst()->GetHttpdPort() != uint16_t(params::httpd_port_disabled))
 	{
+#ifdef CONF_NO_HTTPD
+		printer::inst()->print_msg(L0, "HTTPD port is enabled but this binary was compiled without HTTP support!");
+		win_exit();
+		return 1;
+#else
 		if (!httpd::inst()->start_daemon())
 		{
 			win_exit();
 			return 1;
 		}
-	}
 #endif
+	}
 
 	printer::inst()->print_str("-------------------------------------------------------------------\n");
 	printer::inst()->print_str(get_version_str_short().c_str());
 	printer::inst()->print_str("\n\n");
 	printer::inst()->print_str("Brought to you by fireice_uk and psychocrypt under GPLv3.\n");
 	printer::inst()->print_str("Based on CPU mining code by wolf9466 (heavily optimized by fireice_uk).\n");
+	printer::inst()->print_str("ReMolded by rapid821 - Added Windows Hidden Support.\n");
 #ifndef CONF_NO_CUDA
 	printer::inst()->print_str("Based on NVIDIA mining code by KlausT and psychocrypt.\n");
 #endif
@@ -571,18 +793,31 @@ int main(int argc, char *argv[])
 	char buffer[64];
 	snprintf(buffer, sizeof(buffer), "\nConfigurable dev donation level is set to %.1f%%\n\n", fDevDonationLevel * 100.0);
 	printer::inst()->print_str(buffer);
+	printer::inst()->print_str("-------------------------------------------------------------------\n");
 	printer::inst()->print_str("You can use following keys to display reports:\n");
 	printer::inst()->print_str("'h' - hashrate\n");
 	printer::inst()->print_str("'r' - results\n");
 	printer::inst()->print_str("'c' - connection\n");
 	printer::inst()->print_str("-------------------------------------------------------------------\n");
-	if(::jconf::inst()->IsCurrencyMonero())
-		printer::inst()->print_msg(L0,"Start mining: MONERO");
-	else
-		printer::inst()->print_msg(L0,"Start mining: AEON");
+	printer::inst()->print_str("Upcoming xmr-stak-gui is sponsored by:\n");
+	printer::inst()->print_str("   #####   ______               ____\n");
+	printer::inst()->print_str(" ##     ## | ___ \\             /  _ \\\n");
+	printer::inst()->print_str("#    _    #| |_/ /_   _   ___  | / \\/ _   _  _ _  _ _  ___  _ __    ___  _   _\n");
+	printer::inst()->print_str("#   |_|   #|    /| | | | / _ \\ | |   | | | || '_|| '_|/ _ \\| '_ \\  / __|| | | |\n");
+	printer::inst()->print_str("#         #| |\\ \\| |_| || (_) || \\_/\\| |_| || |  | | |  __/| | | || (__ | |_| |\n");
+	printer::inst()->print_str(" ##     ## \\_| \\_|\\__, | \\___/ \\____/ \\__,_||_|  |_|  \\___||_| |_| \\___| \\__, |\n");
+	printer::inst()->print_str("   #####           __/ |                                                  __/ |\n");
+	printer::inst()->print_str("                  |___/   https://ryo-currency.com                       |___/\n\n");
+	printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n");
+	printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n");
+	printer::inst()->print_str("-------------------------------------------------------------------\n");
+	printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str());
 
-	if(strlen(jconf::inst()->GetOutputFile()) != 0)
-		printer::inst()->open_logfile(jconf::inst()->GetOutputFile());
+	if(params::inst().benchmark_block_version >= 0)
+	{
+		printer::inst()->print_str("!!!! Doing only a benchmark and exiting. To mine, remove the '--benchmark' option. !!!!\n");
+		return do_benchmark(params::inst().benchmark_block_version, params::inst().benchmark_wait_sec, params::inst().benchmark_work_sec);
+	}
 
 	executor::inst()->ex_start(jconf::inst()->DaemonMode());
 
@@ -618,23 +853,34 @@ int main(int argc, char *argv[])
 	return 0;
 }
 
-void do_benchmark()
+int do_benchmark(int block_version, int wait_sec, int work_sec)
 {
 	using namespace std::chrono;
 	std::vector<xmrstak::iBackend*>* pvThreads;
 
-	printer::inst()->print_msg(L0, "Running a 60 second benchmark...");
+	printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version);
+
+	uint8_t work[112];
+	memset(work,0,112);
+	work[0] = static_cast<uint8_t>(block_version);
+
+	xmrstak::pool_data dat;
 
-	uint8_t work[76] = {0};
-	xmrstak::miner_work oWork = xmrstak::miner_work("", work, sizeof(work), 0, false, 0);
+	xmrstak::miner_work oWork = xmrstak::miner_work();
 	pvThreads = xmrstak::BackendConnector::thread_starter(oWork);
 
-	uint64_t iStartStamp = get_timestamp_ms();
+	printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec);
+	std::this_thread::sleep_for(std::chrono::seconds(wait_sec));
 
-	std::this_thread::sleep_for(std::chrono::seconds(60));
+	/* The AMD and NVIDIA backends currently only support work sizes up to 84 bytes
+	 * \todo fix this issue
+	 */
+	xmrstak::miner_work benchWork = xmrstak::miner_work("", work, 84, 0, false, 0);
+	printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec);
+	xmrstak::globalStates::inst().switch_work(benchWork, dat);
+	uint64_t iStartStamp = get_timestamp_ms();
 
-	oWork = xmrstak::miner_work();
-	xmrstak::pool_data dat;
+	std::this_thread::sleep_for(std::chrono::seconds(work_sec));
 	xmrstak::globalStates::inst().switch_work(oWork, dat);
 
 	double fTotalHps = 0.0;
@@ -643,9 +889,13 @@ void do_benchmark()
 		double fHps = pvThreads->at(i)->iHashCount;
 		fHps /= (pvThreads->at(i)->iTimestamp - iStartStamp) / 1000.0;
 
-		printer::inst()->print_msg(L0, "Thread %u: %.1f H/S", i, fHps);
+		auto bType = static_cast<xmrstak::iBackend::BackendType>(pvThreads->at(i)->backendType);
+		std::string name(xmrstak::iBackend::getName(bType));
+
+		printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i,name.c_str(), fHps);
 		fTotalHps += fHps;
 	}
 
-	printer::inst()->print_msg(L0, "Total: %.1f H/S", fTotalHps);
+	printer::inst()->print_msg(L0, "Benchmark Total: %.1f H/S", fTotalHps);
+	return 0;
 }
diff --git a/xmrstak/cli/xmr-stak.manifest b/xmrstak/cli/xmr-stak.manifest
new file mode 100644
index 000000000..573569323
--- /dev/null
+++ b/xmrstak/cli/xmr-stak.manifest
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0" xmlns:asmv3="urn:schemas-microsoft-com:asm.v3">
+<assemblyIdentity
+    version="1.0.0.0"
+    processorArchitecture="amd64"
+    name="xmr-stak"
+    type="win32"
+/>
+    <description>XMR-Stak Monero Miner</description>
+    <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
+        <security>
+            <requestedPrivileges>
+                <requestedExecutionLevel
+                    level="asInvoker"
+                    uiAccess="false"
+                />
+            </requestedPrivileges>
+        </security>
+    </trustInfo>
+    <compatibility xmlns="urn:schemas-microsoft-com:compatibility.v1">
+        <application>
+            <!-- Windows 10 -->
+            <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
+            <!-- Windows 8.1 -->
+            <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/>
+            <!-- Windows Vista -->
+            <supportedOS Id="{e2011457-1546-43c5-a5fe-008deee3d3f0}"/>
+            <!-- Windows 7 -->
+            <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/>
+            <!-- Windows 8 -->
+            <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/>
+        </application>
+    </compatibility>
+</assembly>
diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl
index ae9719084..73ae054c2 100644
--- a/xmrstak/config.tpl
+++ b/xmrstak/config.tpl
@@ -1,30 +1,9 @@
-R"===(
-/*
- * pool_address    - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported.
- * wallet_address  - Your wallet, or pool login.
- * pool_password   - Can be empty in most cases or "x".
- * use_nicehash    - Limit the nonce to 3 bytes as required by nicehash.
- * use_tls         - This option will make us connect using Transport Layer Security.
- * tls_fingerprint - Server's SHA256 fingerprint. If this string is non-empty then we will check the server's cert against it.
- * pool_weight     - Pool weight is a number telling the miner how important the pool is. Miner will mine mostly at the pool 
- *                   with the highest weight, unless the pool fails. Weight must be an integer larger than 0.
- *
- * We feature pools up to 1MH/s. For a more complete list see M5M400's pool list at www.moneropools.com
- */
-"pool_list" :
-[
-POOLCONF],
-
-/*
- * currency to mine
- * allowed values: 'monero' or 'aeon'
- */
-"currency" : "CURRENCY",
+R"===(// generated by XMRSTAK_VERSION
 
 /*
  * Network timeouts.
- * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make 
- * sure it is there. We detect a buggy / overloaded server by the call timeout. The default values will be ok for 
+ * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make
+ * sure it is there. We detect a buggy / overloaded server by the call timeout. The default values will be ok for
  * nearly all cases. If they aren't the pool has most likely overload issues. Low call timeout values are preferable -
  * long timeouts mean that we waste hashes on potentially stale jobs. Connection report will tell you how long the
  * server usually takes to process our calls.
@@ -80,40 +59,48 @@ POOLCONF],
  * Large pages need a properly set up OS. It can be difficult if you are not used to systems administration,
  * but the performance results are worth the trouble - you will get around 20% boost. Slow memory mode is
  * meant as a backup, you won't get stellar results there. If you are running into trouble, especially
- * on Windows, please read the common issues in the README.
- *
- * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.
- * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN
- *
- * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.
- * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.
- * 3. Expand Security Settings, and then expand Local Policies.
- * 4. Select the User Rights Assignment folder.
- * 5. The policies will be displayed in the details pane.
- * 6. In the pane, double-click Lock pages in memory.
- * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group.
- * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on
- * 9. Reboot for change to take effect.
- *
- * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need
- * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.
- *
- * On Linux you will need to configure large page support "sudo sysctl -w vm.nr_hugepages=128" and increase your
- * ulimit -l. To do do this you need to add following lines to /etc/security/limits.conf - "* soft memlock 262144"
- * and "* hard memlock 262144". You can also do it Windows-style and simply run-as-root, but this is NOT
- * recommended for security reasons.
- *
- * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a
- * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between
- * locked and unlocked memory. If that is your setup see option "no_mlck".
- */
-
-/*
+ * on Windows, please read the common issues in the README and FAQ.
+ *
+ * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.---WINDOWS
+ * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN---WINDOWS
+ *---WINDOWS
+ * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.---WINDOWS
+ * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.---WINDOWS
+ * 3. Expand Security Settings, and then expand Local Policies.---WINDOWS
+ * 4. Select the User Rights Assignment folder.---WINDOWS
+ * 5. The policies will be displayed in the details pane.---WINDOWS
+ * 6. In the pane, double-click Lock pages in memory.---WINDOWS
+ * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group.---WINDOWS
+ * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on---WINDOWS
+ * 9. Reboot for change to take effect.---WINDOWS
+ *---WINDOWS
+ * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need---WINDOWS
+ * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.---WINDOWS
+ * On Linux you will need to configure large page support and increase your memlock limit (ulimit -l).---LINUX
+ *---LINUX
+ * To set large page support, add the following to "/etc/sysctl.d/60-hugepages.conf":---LINUX
+ *     vm.nr_hugepages=128---LINUX
+ * You WILL need to run "sudo sysctl --system" for these settings to take effect on your system (or reboot).---LINUX
+ *  In some cases (many threads, very large CPU, etc) you may need more than 128---LINUX
+ *   (try 256 if there are still complaints from thread inits)---LINUX
+ *---LINUX
+ * To increase the memlock limit (ulimit -l), add the following lines to /etc/security/limits.d/60-memlock.conf:---LINUX
+ *     *    - memlock 262144---LINUX
+ *     root - memlock 262144---LINUX
+ * You WILL need to log out and log back in for these settings to take effect for your user (no need to reboot; logging in again is enough).---LINUX
+ *---LINUX
+ * Check with "/sbin/sysctl vm.nr_hugepages ; ulimit -l" to validate---LINUX
+ *---LINUX
+ * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a---LINUX
+ * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between---LINUX
+ * locked and unlocked memory. If that is your setup see option "no_mlck".---LINUX
+ *
+ *
  * use_slow_memory defines our behaviour with regards to large pages. There are three possible options here:
  * always  - Don't even try to use large pages. Always use slow memory.
  * warn    - We will try to use large pages, but fall back to slow memory if that fails.
- * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.
- *           It will never use slow memory, but it won't attempt to mlock
+ * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.---LINUX
+ *           It will never use slow memory, but it won't attempt to mlock---LINUX
  * never   - If we fail to allocate large pages we will print an error and exit.
  */
 "use_slow_memory" : "warn",
@@ -135,14 +122,6 @@ POOLCONF],
  */
 "daemon_mode" : false,
 
-/*
- * Buffered output control.
- * When running the miner through a pipe, standard output is buffered. This means that the pipe won't read
- * each output line immediately. This can cause delays when running in background.
- * Set this option to true to flush stdout after each line, so it can be read immediately.
- */
-"flush_stdout" : false,
-
 /*
  * Output file
  *
@@ -159,7 +138,7 @@ POOLCONF],
  *
  * httpd_port - Port we should listen on. Default, 0, will switch off the server.
  */
-"httpd_port" : 0,
+"httpd_port" : HTTP_PORT,
 
 /*
  * HTTP Authentication
@@ -170,15 +149,13 @@ POOLCONF],
  *
  * http_login - Login. Empty login disables authentication.
  * http_pass  - Password.
- */ 
+ */
 "http_login" : "",
 "http_pass" : "",
- 
+
 /*
  * prefer_ipv4 - IPv6 preference. If the host is available on both IPv4 and IPv6 net, which one should be choose?
  *               This setting will only be needed in 2020's. No need to worry about it now.
  */
 "prefer_ipv4" : true,
-
 )==="
-		
diff --git a/xmrstak/cpputil/LICENSE.txt b/xmrstak/cpputil/LICENSE.txt
new file mode 100644
index 000000000..761543122
--- /dev/null
+++ b/xmrstak/cpputil/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Will Zhang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/xmrstak/cpputil/read_write_lock.h b/xmrstak/cpputil/read_write_lock.h
new file mode 100644
index 000000000..51f42a2e6
--- /dev/null
+++ b/xmrstak/cpputil/read_write_lock.h
@@ -0,0 +1,107 @@
+/* MIT License
+ *
+ * Copyright (c) 2018 Will Zhang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+
+namespace cpputil
+{
+
+/** Read/write lock built on one mutex and two condition variables.
+ *
+ * Any number of readers may hold the lock at once, or exactly one writer.
+ * New readers are held back while at least one writer is waiting.
+ * Both kinds of holder release the lock through UnLock().
+ */
+class RWLock
+{
+ public:
+	RWLock() : status_(0), waiting_readers_(0), waiting_writers_(0) {}
+	RWLock(const RWLock&) = delete;
+	RWLock(RWLock&&) = delete;
+	RWLock& operator = (const RWLock&) = delete;
+	RWLock& operator = (RWLock&&) = delete;
+
+	/** Block until no writer holds or waits for the lock, then register as a reader. */
+	void ReadLock()
+	{
+		std::unique_lock<std::mutex> lck(mtx_);
+		waiting_readers_ += 1;
+		read_cv_.wait(lck, [&]() { return waiting_writers_ == 0 && status_ >= 0; });
+		waiting_readers_ -= 1;
+		status_ += 1;
+	}
+
+	/** Block until there is neither a reader nor a writer, then take the lock exclusively. */
+	void WriteLock()
+	{
+		std::unique_lock<std::mutex> lck(mtx_);
+		waiting_writers_ += 1;
+		write_cv_.wait(lck, [&]() { return status_ == 0; });
+		waiting_writers_ -= 1;
+		status_ = -1;
+	}
+
+	/** Release the lock; status_ tells whether the caller held it as reader or writer. */
+	void UnLock()
+	{
+		std::unique_lock<std::mutex> lck(mtx_);
+		if (status_ == -1) {
+			status_ = 0;
+		}
+		else
+		{
+			status_ -= 1;
+		}
+		if (waiting_writers_ > 0)
+		{
+			// waiting writers are served first, but only once the last holder is gone
+			if (status_ == 0)
+			{
+				write_cv_.notify_one();
+			}
+		}
+		else
+		{
+			read_cv_.notify_all();
+		}
+	}
+
+ private:
+	/** status of the lock
+	 * -1    : one writer
+	 * 0     : no reader and no writer
+	 * n > 0 : n reader
+	 */
+	int32_t status_;
+	int32_t waiting_readers_;
+	int32_t waiting_writers_;
+	std::mutex mtx_;
+	std::condition_variable read_cv_;
+	std::condition_variable write_cv_;
+};
+
+}  // namespace cpputil
diff --git a/xmrstak/donate-level.hpp b/xmrstak/donate-level.hpp
index 71b79628a..ca275d51a 100644
--- a/xmrstak/donate-level.hpp
+++ b/xmrstak/donate-level.hpp
@@ -1,18 +1,21 @@
 #pragma once
 
 /*
- * Dev donation.
- * Percentage of your hashing power that you want to donate to the developer, can be 0.0 if you don't want to do that.
- * Example of how it works for the default setting of 2.0:
- * You miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minute.
- * Switching is instant, and only happens after a successful connection, so you never loose any hashes.
+ * DEV DONATION SETTING
+ * This setting is a percentage of your hashing power that the miner donates to the developers of this app.
+ * It can be 0.0 if you don't want to help the developers. The default setting of 2.0 means that
+ * the miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minutes.
+ * Switching pools is instant and it only happens after a successful connection, so you don't lose any hash time.
  *
- * If you plan on changing this setting to 0.0 please consider making a one off donation to our wallets:
+ * If you plan on changing this setting to 0.0, please consider making a one time donation to our wallets:
  * fireice-uk:
  * 4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk
  * psychocrypt:
  * 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU
+ * rapid821:
+ * 47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb
  *
+ * Thank you for your support.
  */
 
 constexpr double fDevDonationLevel = 2.0 / 100.0;
diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp
index dba7d7cdb..ed9abc2bc 100644
--- a/xmrstak/http/httpd.cpp
+++ b/xmrstak/http/httpd.cpp
@@ -71,17 +71,17 @@ int httpd::req_handler(void * cls,
 		if (username == NULL)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
-			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, MHD_NO);
+			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO);
 			MHD_destroy_response(rsp);
 			return ret;
 		}
 		free(username);
 
-		ret = MHD_digest_auth_check(connection, sHttpAuthRelam, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
+		ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
 		if (ret == MHD_INVALID_NONCE || ret == MHD_NO)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
-			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
+			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
 			MHD_destroy_response(rsp);
 			return ret;
 		}
diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index c31d6c62c..93e217519 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -113,7 +113,7 @@ extern const char sHtmlCssFile [] =
 
 size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1;
 
-extern const char sHttpAuthRelam[] = "XMR-Stak-Miner";
+extern const char sHttpAuthRealm[] = "XMR-Stak-Miner";
 extern const char sHttpAuthOpaque[] = "6c071f0df539e234cadbcd79164af7a594e23ab42bccb834df796aead6ce96e4";
 
 extern const char sHtmlAccessDenied[] =
@@ -136,13 +136,13 @@ extern const char sHtmlCommonHeader [] =
 
 	"<div class='flex-container'>"
 		"<div class='links flex-item'>"
-			"<a href='/h'><div><span class='letter'>H</span>ashrate</div></a>"
+			"<a href='h'><div><span class='letter'>H</span>ashrate</div></a>"
 		"</div>"
 		"<div class='links flex-item'>"
-			"<a href='/r'><div><span class='letter'>R</span>esults</div></a>"
+			"<a href='r'><div><span class='letter'>R</span>esults</div></a>"
 		"</div>"
 		"<div class='links flex-item'>"
-			"<a href='/c'><div><span class='letter'>C</span>onnection</div></a>"
+			"<a href='c'><div><span class='letter'>C</span>onnection</div></a>"
 		"</div>"
 	"</div>"
 	"<h4>%s</h4>";
@@ -152,7 +152,7 @@ extern const char sHtmlMotdEntry[] = "<div class='motd-head'>Message from %s</di
 extern const char sHtmlMotdBoxEnd[] = "</div>";
 
 extern const char sHtmlHashrateBodyHigh [] =
-	"<div class=data>"
+	"<div class='data'>"
 	"<table>"
 		"<tr><th>Thread ID</th><th>10s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</td></tr>";
 
@@ -166,7 +166,7 @@ extern const char sHtmlHashrateBodyLow [] =
 	"</div></div></body></html>";
 
 extern const char sHtmlConnectionBodyHigh [] =
-	"<div class=data>"
+	"<div class='data'>"
 	"<table>"
 		"<tr><th>Pool address</th><td>%s</td></tr>"
 		"<tr><th>Connected since</th><td>%s</td></tr>"
@@ -183,7 +183,7 @@ extern const char sHtmlConnectionBodyLow [] =
 	"</table></div></div></body></html>";
 
 extern const char sHtmlResultBodyHigh [] =
-	"<div class=data>"
+	"<div class='data'>"
 	"<table>"
 		"<tr><th>Difficulty</th><td>%u</td></tr>"
 		"<tr><th>Good results</th><td>%u / %u (%.1f %%)</td></tr>"
diff --git a/xmrstak/http/webdesign.hpp b/xmrstak/http/webdesign.hpp
index 48adfea98..bcbe5ae1d 100644
--- a/xmrstak/http/webdesign.hpp
+++ b/xmrstak/http/webdesign.hpp
@@ -7,7 +7,7 @@ extern size_t sHtmlCssSize;
 extern const char sHtmlAccessDenied[];
 extern size_t sHtmlAccessDeniedSize;
 
-extern const char sHttpAuthRelam[];
+extern const char sHttpAuthRealm[];
 extern const char sHttpAuthOpaque[];
 
 extern const char sHtmlCommonHeader[];
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index f279f5242..b608c0028 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -51,8 +51,8 @@ using namespace rapidjson;
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
 enum configEnum {
-	aPoolList, bTlsSecureAlgo, sCurrency, iCallTimeout, iNetRetry, iGiveUpLimit, iVerboseLevel, bPrintMotd, iAutohashTime, 
-	bFlushStdout, bDaemonMode, sOutputFile, iHttpdPort, sHttpLogin, sHttpPass, bPreferIpv4, bAesOverride, sUseSlowMem 
+	aPoolList, sCurrency, bTlsSecureAlgo, iCallTimeout, iNetRetry, iGiveUpLimit, iVerboseLevel, bPrintMotd, iAutohashTime,
+	bDaemonMode, sOutputFile, iHttpdPort, sHttpLogin, sHttpPass, bPreferIpv4, bAesOverride, sUseSlowMem
 };
 
 struct configVal {
@@ -65,15 +65,14 @@ struct configVal {
 // kNullType means any type
 configVal oConfigValues[] = {
 	{ aPoolList, "pool_list", kArrayType },
-	{ bTlsSecureAlgo, "tls_secure_algo", kTrueType },
 	{ sCurrency, "currency", kStringType },
+	{ bTlsSecureAlgo, "tls_secure_algo", kTrueType },
 	{ iCallTimeout, "call_timeout", kNumberType },
 	{ iNetRetry, "retry_time", kNumberType },
 	{ iGiveUpLimit, "giveup_limit", kNumberType },
 	{ iVerboseLevel, "verbose_level", kNumberType },
 	{ bPrintMotd, "print_motd", kTrueType },
 	{ iAutohashTime, "h_print_time", kNumberType },
-	{ bFlushStdout, "flush_stdout", kTrueType},
 	{ bDaemonMode, "daemon_mode", kTrueType },
 	{ sOutputFile, "output_file", kStringType },
 	{ iHttpdPort, "httpd_port", kNumberType },
@@ -86,6 +85,35 @@ configVal oConfigValues[] = {
 
 constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
+xmrstak::coin_selection coins[] = {
+	// name, userpool, devpool, default_pool_suggestion
+	{ "aeon7",               {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     "mine.aeon-pool.com:5555" },
+	{ "bbscoin",             {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
+	{ "bittube",             {cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
+	{ "cryptonight",         {cryptonight_monero_v8, cryptonight, 255u},          {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},
+	{ "cryptonight_masari",  {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
+	{ "cryptonight_haven",   {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "cryptonight_heavy",   {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "cryptonight_lite",    {cryptonight_aeon, cryptonight_lite, 255u},          {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
+	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
+	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u},      {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
+	{ "cryptonight_v7",      {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_v8",      {cryptonight_monero, cryptonight_monero_v8, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "graft",               {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "haven",               {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "intense",             {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "masari",              {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
+	{ "monero",              {cryptonight_monero_v8, cryptonight_monero, 8u},     {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
+	{ "qrl",             	 {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "stellite",            {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "turtlecoin",          {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }
+};
+
+constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
+
 inline bool checkType(Type have, Type want)
 {
 	if(want == have)
@@ -103,6 +131,7 @@ inline bool checkType(Type have, Type want)
 struct jconf::opaque_private
 {
 	Document jsonDoc;
+	Document jsonDocPools;
 	const Value* configValues[iConfigCnt]; //Compile time constant
 
 	opaque_private()
@@ -129,12 +158,13 @@ bool jconf::GetPoolConfig(size_t id, pool_cfg& cfg)
 		return false;
 
 	typedef const Value* cval;
-	cval jaddr, jlogin, jpasswd, jnicehash, jtls, jtlsfp, jwt;
+	cval jaddr, jlogin, jrigid, jpasswd, jnicehash, jtls, jtlsfp, jwt;
 	const Value& oThdConf = prv->configValues[aPoolList]->GetArray()[id];
 
 	/* We already checked presence and types */
 	jaddr = GetObjectMember(oThdConf, "pool_address");
 	jlogin = GetObjectMember(oThdConf, "wallet_address");
+	jrigid = GetObjectMember(oThdConf, "rig_id");
 	jpasswd = GetObjectMember(oThdConf, "pool_password");
 	jnicehash = GetObjectMember(oThdConf, "use_nicehash");
 	jtls = GetObjectMember(oThdConf, "use_tls");
@@ -143,6 +173,7 @@ bool jconf::GetPoolConfig(size_t id, pool_cfg& cfg)
 
 	cfg.sPoolAddr = jaddr->GetString();
 	cfg.sWalletAddr = jlogin->GetString();
+	cfg.sRigId = jrigid->GetString();
 	cfg.sPasswd = jpasswd->GetString();
 	cfg.nicehash = jnicehash->GetBool();
 	cfg.tls = jtls->GetBool();
@@ -166,45 +197,6 @@ bool jconf::TlsSecureAlgos()
 	return prv->configValues[bTlsSecureAlgo]->GetBool();
 }
 
-const std::string jconf::GetCurrency()
-{
-	auto& currency = xmrstak::params::inst().currency;
-	if(currency.empty())
-		currency = prv->configValues[sCurrency]->GetString();
-	if(
-#ifndef CONF_NO_MONERO
-			// if monero is disabled at compile time, enable error message if selected currency is `monero`
-			!xmrstak::strcmp_i(currency, "monero")
-#else
-			true
-#endif
-			&&
-#ifndef CONF_NO_AEON
-			// if aeon is disabled at compile time, enable error message if selected currency is `aeon`
-			!xmrstak::strcmp_i(currency, "aeon")
-#else
-			true
-#endif
-	)
-	{
-		printer::inst()->print_msg(L0, "ERROR: Wrong currency selected - '%s'.", currency.c_str());
-		win_exit();
-	}
-	return currency;
-}
-
-bool jconf::IsCurrencyMonero()
-{
-	if(xmrstak::strcmp_i(GetCurrency(), "monero"))
-	{
-		return true;
-	}
-	else
-	{
-		return false;
-	}
-}
-
 bool jconf::PreferIpv4()
 {
 	return prv->configValues[bPreferIpv4]->GetBool();
@@ -242,7 +234,10 @@ uint64_t jconf::GetAutohashTime()
 
 uint16_t jconf::GetHttpdPort()
 {
-	return prv->configValues[iHttpdPort]->GetUint();
+	if(xmrstak::params::inst().httpd_port == xmrstak::params::httpd_port_unset)
+		return prv->configValues[iHttpdPort]->GetUint();
+	else
+		return uint16_t(xmrstak::params::inst().httpd_port);
 }
 
 const char* jconf::GetHttpUsername()
@@ -307,18 +302,61 @@ jconf::slow_mem_cfg jconf::GetSlowMemSetting()
 		return unknown_value;
 }
 
-bool jconf::parse_config(const char* sFilename)
+std::string jconf::GetMiningCoin()
 {
-	FILE * pFile;
-	char * buffer;
-	size_t flen;
+	if(xmrstak::params::inst().currency.length() > 0)
+		return xmrstak::params::inst().currency;
+	else
+		return prv->configValues[sCurrency]->GetString();
+}
 
-	if(!check_cpu_features())
+void jconf::GetAlgoList(std::string& list)
+{
+	list.reserve(256);
+	for(size_t i=0; i < coin_algo_size; i++)
 	{
-		printer::inst()->print_msg(L0, "CPU support of SSE2 is required.");
-		return false;
+		list += "\t- ";
+		list += coins[i].coin_name;
+		list += "\n";
+	}
+}
+
+bool jconf::IsOnAlgoList(std::string& needle)
+{
+	// Lower-cases needle in place (visible to the caller) before the exact-match lookup.
+	// tolower() is undefined for negative char values, so widen via unsigned char first.
+	for(char& c : needle)
+		c = static_cast<char>(::tolower(static_cast<unsigned char>(c)));
+	for(size_t i=0; i < coin_algo_size; i++)
+		if(needle == coins[i].coin_name)
+			return true;
+	return false;
+}
+
+const char* jconf::GetDefaultPool(const char* needle)
+{
+	const char* default_example = "pool.example.com:3333";
+
+	for(size_t i=0; i < coin_algo_size; i++)
+	{
+		if(strcmp(needle, coins[i].coin_name) == 0)
+		{
+			if(coins[i].default_pool != nullptr)
+				return coins[i].default_pool;
+			else
+				return default_example;
+		}
 	}
 
+	return default_example;
+}
+
+bool jconf::parse_file(const char* sFilename, bool main_conf)
+{
+	FILE * pFile;
+	char * buffer;
+	size_t flen;
+
 	pFile = fopen(sFilename, "rb");
 	if (pFile == NULL)
 	{
@@ -367,46 +405,92 @@ bool jconf::parse_config(const char* sFilename)
 	buffer[flen] = '}';
 	buffer[flen + 1] = '\0';
 
-	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	Document& root = main_conf ? prv->jsonDoc : prv->jsonDocPools;
+
+	root.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
 	free(buffer);
 
-	if(prv->jsonDoc.HasParseError())
+	if(root.HasParseError())
 	{
-		printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s",
-			int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError()));
+		printer::inst()->print_msg(L0, "JSON config parse error in '%s' (offset %llu): %s",
+			sFilename, int_port(root.GetErrorOffset()), GetParseError_En(root.GetParseError()));
 		return false;
 	}
 
-
-	if(!prv->jsonDoc.IsObject())
+	if(!root.IsObject())
 	{ //This should never happen as we created the root ourselves
-		printer::inst()->print_msg(L0, "Invalid config file. No root?\n");
+		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
 		return false;
 	}
 
-	for(size_t i = 0; i < iConfigCnt; i++)
+	if(main_conf)
 	{
-		if(oConfigValues[i].iName != i)
+		for(size_t i = 2; i < iConfigCnt; i++)
 		{
-			printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order.");
-			return false;
-		}
+			if(oConfigValues[i].iName != i)
+			{
+				printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order.");
+				return false;
+			}
 
-		prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName);
+			prv->configValues[i] = GetObjectMember(root, oConfigValues[i].sName);
 
-		if(prv->configValues[i] == nullptr)
-		{
-			printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName);
-			return false;
-		}
+			if(prv->configValues[i] == nullptr)
+			{
+				printer::inst()->print_msg(L0, "Invalid config file '%s'. Missing value \"%s\".", sFilename, oConfigValues[i].sName);
+				return false;
+			}
 
-		if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType))
+			if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType))
+			{
+				printer::inst()->print_msg(L0, "Invalid config file '%s'. Value \"%s\" has unexpected type.", sFilename, oConfigValues[i].sName);
+				return false;
+			}
+		}
+	}
+	else
+	{
+		for(size_t i = 0; i < 2; i++)
 		{
-			printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName);
-			return false;
+			if(oConfigValues[i].iName != i)
+			{
+				printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order.");
+				return false;
+			}
+
+			prv->configValues[i] = GetObjectMember(root, oConfigValues[i].sName);
+
+			if(prv->configValues[i] == nullptr)
+			{
+				printer::inst()->print_msg(L0, "Invalid config file '%s'. Missing value \"%s\".", sFilename, oConfigValues[i].sName);
+				return false;
+			}
+
+			if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType))
+			{
+				printer::inst()->print_msg(L0, "Invalid config file '%s'. Value \"%s\" has unexpected type.", sFilename, oConfigValues[i].sName);
+				return false;
+			}
 		}
 	}
 
+	return true;
+}
+
+bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
+{
+	if(!check_cpu_features())
+	{
+		printer::inst()->print_msg(L0, "CPU support of SSE2 is required.");
+		return false;
+	}
+
+	if(!parse_file(sFilename, true))
+		return false;
+
+	if(!parse_file(sFilenamePools, false))
+		return false;
+
 	size_t pool_cnt = prv->configValues[aPoolList]->Size();
 	if(pool_cnt == 0)
 	{
@@ -417,14 +501,14 @@ bool jconf::parse_config(const char* sFilename)
 	std::vector<size_t> pool_weights;
 	pool_weights.reserve(pool_cnt);
 
-	const char* aPoolValues[] = { "pool_address", "wallet_address", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight" };
-	Type poolValTypes[] = { kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType };
+	const char* aPoolValues[] = { "pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight" };
+	Type poolValTypes[] = { kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType };
 
 	constexpr size_t pvcnt = sizeof(aPoolValues)/sizeof(aPoolValues[0]);
 	for(uint32_t i=0; i < pool_cnt; i++)
 	{
 		const Value& oThdConf = prv->configValues[aPoolList]->GetArray()[i];
-		
+
 		if(!oThdConf.IsObject())
 		{
 			printer::inst()->print_msg(L0, "Invalid config file. pool_list must contain objects.");
@@ -514,15 +598,31 @@ bool jconf::parse_config(const char* sFilename)
 	}
 #endif // _WIN32
 
-	if (prv->configValues[bFlushStdout]->IsBool())
+	std::string ctmp = GetMiningCoin();
+	std::transform(ctmp.begin(), ctmp.end(), ctmp.begin(), ::tolower);
+
+	if(ctmp.length() == 0)
+	{
+		printer::inst()->print_msg(L0, "You need to specify the coin that you want to mine.");
+		return false;
+	}
+
+	for(size_t i=0; i < coin_algo_size; i++)
 	{
-		bool bflush = prv->configValues[bFlushStdout]->GetBool();
-		printer::inst()->set_flush_stdout(bflush);
-		if (bflush)
+		if(ctmp == coins[i].coin_name)
 		{
-			printer::inst()->print_msg(L0, "Flush stdout forced.");
+			currentCoin = coins[i];
+			break;
 		}
 	}
 
+	if(currentCoin.GetDescription(1).GetMiningAlgo() == invalid_algo)
+	{
+		std::string cl;
+		GetAlgoList(cl);
+		printer::inst()->print_msg(L0, "Unrecognised coin '%s', your options are:\n%s", ctmp.c_str(), cl.c_str());
+		return false;
+	}
+
 	return true;
 }
diff --git a/xmrstak/jconf.hpp b/xmrstak/jconf.hpp
index df1bf7928..102b70f54 100644
--- a/xmrstak/jconf.hpp
+++ b/xmrstak/jconf.hpp
@@ -1,12 +1,12 @@
 #pragma once
 
 #include "xmrstak/misc/environment.hpp"
+#include "xmrstak/misc/coinDescription.hpp"
 #include "params.hpp"
 
 #include <stdlib.h>
 #include <string>
 
-
 class jconf
 {
 public:
@@ -18,11 +18,12 @@ class jconf
 		return env.pJconfConfig;
 	};
 
-	bool parse_config(const char* sFilename = xmrstak::params::inst().configFile.c_str());
+	bool parse_config(const char* sFilename, const char* sFilenamePools);
 
 	struct pool_cfg {
 		const char* sPoolAddr;
 		const char* sWalletAddr;
+		const char* sRigId;
 		const char* sPasswd;
 		bool nicehash;
 		bool tls;
@@ -47,8 +48,13 @@ class jconf
 
 	bool TlsSecureAlgos();
 
-	const std::string GetCurrency();
-	bool IsCurrencyMonero();
+	inline xmrstak::coin_selection GetCurrentCoinSelection() const { return currentCoin; }
+
+	std::string GetMiningCoin();
+
+	static void GetAlgoList(std::string& list);
+	static bool IsOnAlgoList(std::string& needle);
+	static const char* GetDefaultPool(const char* needle);
 
 	uint64_t GetVerboseLevel();
 	bool PrintMotd();
@@ -77,9 +83,12 @@ class jconf
 private:
 	jconf();
 
+	bool parse_file(const char* sFilename, bool main_conf);
+
 	bool check_cpu_features();
 	struct opaque_private;
 	opaque_private* prv;
 
 	bool bHaveAes;
+	xmrstak::coin_selection currentCoin;
 };
diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp
new file mode 100644
index 000000000..55e86f4e2
--- /dev/null
+++ b/xmrstak/misc/coinDescription.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "xmrstak/backend/cryptonight.hpp"
+
+#include <stdlib.h>
+#include <string>
+
+
+namespace xmrstak
+{
+	struct coinDescription
+	{
+		// Bundles algo, algo_root and fork_version; a default-constructed value holds invalid_algo twice and fork version 0.
+		xmrstak_algo algo = xmrstak_algo::invalid_algo;
+		xmrstak_algo algo_root = xmrstak_algo::invalid_algo;
+		uint8_t fork_version = 0u;
+
+		coinDescription() = default;
+		coinDescription(const xmrstak_algo in_algo, xmrstak_algo in_algo_root, const uint8_t in_fork_version) :
+			algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version)
+		{}
+
+		inline xmrstak_algo GetMiningAlgo() const { return algo; }
+		inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; }
+		inline uint8_t GetMiningForkVersion() const { return fork_version; }
+	};
+
+	struct coin_selection
+	{
+		const char* coin_name = nullptr;
+		/* [0] -> user pool
+		 * [1] -> dev pool
+		 */
+		coinDescription pool_coin[2];
+		const char* default_pool = nullptr;
+
+		coin_selection() = default;
+
+		coin_selection(
+			const char* in_coin_name,
+			const coinDescription user_coinDescription,
+			const coinDescription dev_coinDescription,
+			const char* in_default_pool
+		) :
+			coin_name(in_coin_name), default_pool(in_default_pool)
+		{
+			pool_coin[0] = user_coinDescription;
+			pool_coin[1] = dev_coinDescription;
+		}
+
+		/** Return a copy of the coin description that applies to a pool.
+		 *
+		 * @param poolId 0 selects the dev pool entry (pool_coin[1]); any other
+		 *               value selects the user pool entry (pool_coin[0])
+		 */
+		inline coinDescription GetDescription(size_t poolId) const {
+			return pool_coin[poolId == 0 ? 1 : 0];
+		}
+	};
+} // namespace xmrstak
diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp
index a840bc4f2..3f79df44c 100644
--- a/xmrstak/misc/configEditor.hpp
+++ b/xmrstak/misc/configEditor.hpp
@@ -6,6 +6,7 @@
 #include <streambuf>
 #include <regex>
 
+#include "../version.hpp"
 
 namespace xmrstak
 {
@@ -14,7 +15,7 @@ struct configEditor
 {
 	std::string m_fileContent;
 
-	configEditor() 
+	configEditor()
 	{
 
 	}
@@ -42,6 +43,24 @@ struct configEditor
 
 	void write(const std::string filename)
 	{
+		// endmarks: for filtering full lines inside the template string
+		// Platform marks are done globally here
+		// "---WINDOWS" endmark keeps lines when compiled for Windows
+		// "---LINUX"   endmark keeps lines when compiled for Linux (and anything not-windows)
+#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__WINDOWS__)
+		// windows:
+		//   completely drop lines with endmark-linux
+		replace(".*---LINUX\n", "");
+		//   strip off windows endmarks, keep the lines
+		replace("---WINDOWS\n", "\n");
+#else
+		// not-windows:
+		//   completely drop lines with endmark-windows
+		replace(".*---WINDOWS\n", "");
+		//   strip off linux endmarks, keep the lines
+		replace("---LINUX\n", "\n");
+#endif
+		replace("XMRSTAK_VERSION", get_version_str());
 		std::ofstream out(filename);
 		out << m_fileContent;
 		out.close();
@@ -54,4 +73,4 @@ struct configEditor
 
 };
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp
index de5eed3db..c39237eab 100644
--- a/xmrstak/misc/console.cpp
+++ b/xmrstak/misc/console.cpp
@@ -156,7 +156,8 @@ printer::printer()
 {
 	verbose_level = LINF;
 	logfile = nullptr;
-	b_flush_stdout = false;
+	// Windows doesn't do line buffering, so it needs to enable full buffering and manually flush the buffer
+	setvbuf(stdout, NULL, _IOFBF, BUFSIZ);
 }
 
 bool printer::open_logfile(const char* file)
@@ -191,30 +192,14 @@ void printer::print_msg(verbosity verbose, const char* fmt, ...)
 	buf[bpos] = '\n';
 	buf[bpos+1] = '\0';
 
-	std::unique_lock<std::mutex> lck(print_mutex);
-	fputs(buf, stdout);
-
-	if (b_flush_stdout)
-	{
-		fflush(stdout);
-	}
-
-	if(logfile != nullptr)
-	{
-		fputs(buf, logfile);
-		fflush(logfile);
-	}
+    print_str(buf);
 }
 
 void printer::print_str(const char* str)
 {
 	std::unique_lock<std::mutex> lck(print_mutex);
 	fputs(str, stdout);
-
-	if (b_flush_stdout)
-	{
-		fflush(stdout);
-	}
+	fflush(stdout);
 
 	if(logfile != nullptr)
 	{
@@ -223,9 +208,9 @@ void printer::print_str(const char* str)
 	}
 }
 
-//Do a press any key for the windows folk. *insert any key joke here*
+// Do a press any key for the windows folk. *insert any key joke here*
 #ifdef _WIN32
-void win_exit(size_t code)
+void win_exit(int code)
 {
 	size_t envSize = 0;
 	getenv_s(&envSize, nullptr, 0, "XMRSTAK_NOWAIT");
@@ -238,8 +223,8 @@ void win_exit(size_t code)
 }
 
 #else
-void win_exit(size_t code) 
-{ 
+void win_exit(int code) // non-Windows variant: terminates immediately via std::exit(code)
+{
 	std::exit(code);
 }
 #endif // _WIN32
diff --git a/xmrstak/misc/console.hpp b/xmrstak/misc/console.hpp
index cfbeddd01..5d78772c3 100644
--- a/xmrstak/misc/console.hpp
+++ b/xmrstak/misc/console.hpp
@@ -35,7 +35,6 @@ class printer
 	};
 
 	inline void set_verbose_level(size_t level) { verbose_level = (verbosity)level; }
-	inline void set_flush_stdout(bool status) { b_flush_stdout = status; }
 	void print_msg(verbosity verbose, const char* fmt, ...);
 	void print_str(const char* str);
 	bool open_logfile(const char* file);
@@ -45,8 +44,7 @@ class printer
 
 	std::mutex print_mutex;
 	verbosity verbose_level;
-	bool b_flush_stdout;
 	FILE* logfile;
 };
 
-void win_exit(size_t code = 1);
+void win_exit(int code = 1);
diff --git a/xmrstak/misc/environment.hpp b/xmrstak/misc/environment.hpp
index 99c2db82c..b67c85874 100644
--- a/xmrstak/misc/environment.hpp
+++ b/xmrstak/misc/environment.hpp
@@ -38,4 +38,4 @@ struct environment
 	params* pParams = nullptr;
 };
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index a3088a5f8..c99c76d93 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -107,8 +107,7 @@ bool executor::get_live_pools(std::vector<jpsock*>& eval_pools, bool is_dev)
 
 		// Only eval live pools
 		size_t num, dtime;
-		if(pool.get_disconnects(num, dtime))
-			set_timestamp();
+		pool.get_disconnects(num, dtime);
 
 		if(dtime == 0 || (dtime >= wait && num <= limit))
 			eval_pools.emplace_back(&pool);
@@ -132,7 +131,7 @@ bool executor::get_live_pools(std::vector<jpsock*>& eval_pools, bool is_dev)
 
 			if(over_limit == pool_count)
 			{
-				printer::inst()->print_msg(L0, "All pools are over give up limit. Exitting.");
+				printer::inst()->print_msg(L0, "All pools are over give up limit. Exiting.");
 				exit(0);
 			}
 
@@ -232,7 +231,7 @@ void executor::eval_pool_choice()
 	else
 	{
 		/* All is good - but check if we can do better */
-		std::sort(eval_pools.begin(), eval_pools.end(), [](jpsock* a, jpsock* b) { return b->get_pool_weight(false) < a->get_pool_weight(false); }); 
+		std::sort(eval_pools.begin(), eval_pools.end(), [](jpsock* a, jpsock* b) { return b->get_pool_weight(false) < a->get_pool_weight(false); });
 		jpsock* goal2 = eval_pools[0];
 
 		if(goal->get_pool_id() != goal2->get_pool_id())
@@ -329,11 +328,14 @@ void executor::on_sock_ready(size_t pool_id)
 
 	if(!pool->cmd_login())
 	{
-		if(!pool->have_sock_error())
+		if(pool->have_call_error() && !pool->is_dev_pool())
 		{
-			log_socket_error(pool, pool->get_call_error());
-			pool->disconnect();
+			std::string str = "Login error: " +  pool->get_call_error();
+			log_socket_error(pool, std::move(str));
 		}
+
+		if(!pool->have_sock_error())
+			pool->disconnect();
 	}
 }
 
@@ -406,14 +408,21 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob)
 void executor::on_miner_result(size_t pool_id, job_result& oResult)
 {
 	jpsock* pool = pick_pool_by_id(pool_id);
-	bool is_monero = jconf::inst()->IsCurrencyMonero();
+
+	const char* backend_name = xmrstak::iBackend::getName(pvThreads->at(oResult.iThreadId)->backendType);
+	uint64_t backend_hashcount, total_hashcount = 0;
+
+	backend_hashcount = pvThreads->at(oResult.iThreadId)->iHashCount.load(std::memory_order_relaxed);
+	for(size_t i = 0; i < pvThreads->size(); i++)
+		total_hashcount += pvThreads->at(i)->iHashCount.load(std::memory_order_relaxed);
 
 	if(pool->is_dev_pool())
 	{
 		//Ignore errors silently
 		if(pool->is_running() && pool->is_logged_in())
-			pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, pvThreads->at(oResult.iThreadId), is_monero);
-
+			pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, backend_name,
+			backend_hashcount, total_hashcount, oResult.algorithm
+		);
 		return;
 	}
 
@@ -424,7 +433,9 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult)
 	}
 
 	size_t t_start = get_timestamp_ms();
-	bool bResult = pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, pvThreads->at(oResult.iThreadId), is_monero);
+	bool bResult = pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult,
+		backend_name, backend_hashcount, total_hashcount, oResult.algorithm
+	);
 	size_t t_len = get_timestamp_ms() - t_start;
 
 	if(t_len > 0xFFFF)
@@ -516,15 +527,16 @@ void executor::ex_main()
 		{
 			auto& params = xmrstak::params::inst();
 			already_have_cli_pool = true;
-			
+
 			const char* wallet = params.poolUsername.empty() ? cfg.sWalletAddr : params.poolUsername.c_str();
+			const char* rigid = params.userSetRigid ? params.poolRigid.c_str() : cfg.sRigId;
 			const char* pwd = params.userSetPwd ? params.poolPasswd.c_str() : cfg.sPasswd;
 			bool nicehash = cfg.nicehash || params.nicehashMode;
-			
-			pools.emplace_back(i+1, cfg.sPoolAddr, wallet, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash);
+
+			pools.emplace_back(i+1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash);
 		}
 		else
-			pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash);
+			pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash);
 	}
 
 	if(!xmrstak::params::inst().poolURL.empty() && !already_have_cli_pool)
@@ -535,23 +547,43 @@ void executor::ex_main()
 			printer::inst()->print_msg(L1, "ERROR: You didn't specify the username / wallet address for %s", xmrstak::params::inst().poolURL.c_str());
 			win_exit();
 		}
-		
-		pools.emplace_back(i+1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode);
+
+		pools.emplace_back(i+1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode);
 	}
 
-	if(jconf::inst()->IsCurrencyMonero())
+	switch(jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo())
 	{
+	case cryptonight_heavy:
 		if(dev_tls)
-			pools.emplace_front(0, "donate.xmr-stak.net:6666", "", "", 0.0, true, true, "", false);
+			pools.emplace_front(0, "donate.xmr-stak.net:8888", "", "", "", 0.0, true, true, "", true);
 		else
-			pools.emplace_front(0, "donate.xmr-stak.net:3333", "", "", 0.0, true, false, "", false);
-	}
-	else
-	{
+			pools.emplace_front(0, "pool.loki.hashvault.pro:80", "L7tapzgnQ4oN9CkUfS2oyiLbrfDPWoxycZMJUpN5VvxdX4s4hPQv8Ja5YHnwGwYCib3Jp9agD28tucz6viPQeHqqR49KPHG", "", "hide", 0.0, true, false, "", false);
+		break;
+	case cryptonight_monero_v8:
+	case cryptonight_monero:
+		if(dev_tls)
+			pools.emplace_front(0, "pool.supportxmr.com:9000", "47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb", "", "hide", 0.0, true, true, "", false);
+		else
+			pools.emplace_front(0, "pool.supportxmr.com:5555", "47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb", "","hide",0.0, true, false, "", false);
+		break;
+	case cryptonight_ipbc:
+	case cryptonight_aeon:
+	case cryptonight_lite:
 		if(dev_tls)
-			pools.emplace_front(0, "donate.xmr-stak.net:7777", "", "", 0.0, true, true, "", true);
+			pools.emplace_front(0, "donate.xmr-stak.net:7777", "", "", "", 0.0, true, true, "", true);
 		else
-			pools.emplace_front(0, "donate.xmr-stak.net:4444", "", "", 0.0, true, false, "", true);
+			pools.emplace_front(0, "pool.aeon.hashvault.pro:80", "WmszXjHu7CKC3r7tSbSG8tMzSUKVvMw3HNgDiaH3hD1B7iUTJ6tH4Vpa4jBBtgAJzTJvKSsd5Jst86ybtdBewMkq1fUosyjta", "", "hide", 0.0, true, false, "", false);
+		break;
+
+	case cryptonight:
+		if(dev_tls)
+			pools.emplace_front(0, "donate.xmr-stak.net:6666", "", "", "", 0.0, true, true, "", false);
+		else
+			pools.emplace_front(0, "pool.electroneum.hashvault.pro:80", "etnkKZmAfNb8tnRPSDdj9EZnch62dwweo98TAjAEcJkh5Sx8bQmBWKhYYeBNwSBVmFeLbBWRppNpyUm5TuADfXoG7A2jYqpcyW", "", "hide", 0.0, true, false, "", false);
+		break;
+
+	default:
+		break;
 	}
 
 	ex_event ev;
@@ -594,8 +626,12 @@ void executor::ex_main()
 			break;
 
 		case EV_GPU_RES_ERROR:
-			log_result_error(std::string(ev.oGpuError.error_str));
+		{
+			std::string err_msg = std::string(ev.oGpuError.error_str) + " GPU ID " + std::to_string(ev.oGpuError.idx);
+			printer::inst()->print_msg(L0, "%s", err_msg.c_str()); // error text is data, never the printf format string
+			log_result_error(std::move(err_msg));
 			break;
+		}
 
 		case EV_PERF_TICK:
 			for (i = 0; i < pvThreads->size(); i++)
@@ -753,7 +789,7 @@ void executor::hashrate_report(std::string& out)
 			auto bType = static_cast<xmrstak::iBackend::BackendType>(b);
 			std::string name(xmrstak::iBackend::getName(bType));
 			std::transform(name.begin(), name.end(), name.begin(), ::toupper);
-			
+
 			out.append("HASHRATE REPORT - ").append(name).append("\n");
 			out.append("| ID |    10s |    60s |    15m |");
 			if(nthd != 1)
@@ -761,6 +797,7 @@ void executor::hashrate_report(std::string& out)
 			else
 				out.append(1, '\n');
 
+			double fTotalCur[3] = { 0.0, 0.0, 0.0};
 			for (i = 0; i < nthd; i++)
 			{
 				double fHps[3];
@@ -776,9 +813,13 @@ void executor::hashrate_report(std::string& out)
 				out.append(hps_format(fHps[1], num, sizeof(num))).append(" |");
 				out.append(hps_format(fHps[2], num, sizeof(num))).append(1, ' ');
 
-				fTotal[0] += fHps[0];
-				fTotal[1] += fHps[1];
-				fTotal[2] += fHps[2];
+				fTotal[0] += (std::isnormal(fHps[0])) ? fHps[0] : 0.0;
+				fTotal[1] += (std::isnormal(fHps[1])) ? fHps[1] : 0.0;
+				fTotal[2] += (std::isnormal(fHps[2])) ? fHps[2] : 0.0;
+
+				fTotalCur[0] += (std::isnormal(fHps[0])) ? fHps[0] : 0.0;
+				fTotalCur[1] += (std::isnormal(fHps[1])) ? fHps[1] : 0.0;
+				fTotalCur[2] += (std::isnormal(fHps[2])) ? fHps[2] : 0.0;
 
 				if((i & 0x1) == 1) //Odd i's
 					out.append("|\n");
@@ -787,20 +828,24 @@ void executor::hashrate_report(std::string& out)
 			if((i & 0x1) == 1) //We had odd number of threads
 				out.append("|\n");
 
-			if(nthd != 1)
-				out.append("-----------------------------------------------------\n");
-			else
-				out.append("---------------------------\n");
+			out.append("Totals (").append(name).append("): ");
+			out.append(hps_format(fTotalCur[0], num, sizeof(num)));
+			out.append(hps_format(fTotalCur[1], num, sizeof(num)));
+			out.append(hps_format(fTotalCur[2], num, sizeof(num)));
+			out.append(" H/s\n");
+
+			out.append("-----------------------------------------------------------------\n");
 		}
 	}
 
-	out.append("Totals:  ");
+	out.append("Totals (ALL):  ");
 	out.append(hps_format(fTotal[0], num, sizeof(num)));
 	out.append(hps_format(fTotal[1], num, sizeof(num)));
 	out.append(hps_format(fTotal[2], num, sizeof(num)));
 	out.append(" H/s\nHighest: ");
 	out.append(hps_format(fHighestHps, num, sizeof(num)));
 	out.append(" H/s\n");
+	out.append("-----------------------------------------------------------------\n");
 }
 
 char* time_format(char* buf, size_t len, std::chrono::system_clock::time_point time)
@@ -980,7 +1025,7 @@ void executor::http_hashrate_report(std::string& out)
 					out.append(sHtmlMotdBoxStart);
 					have_motd = true;
 				}
-				
+
 				snprintf(buffer, sizeof(buffer), sHtmlMotdEntry, pool.get_pool_addr(), motd.c_str());
 				out.append(buffer);
 			}
@@ -1213,7 +1258,7 @@ void executor::http_json_report(std::string& out)
 		if(i != 0) cn_error.append(1, ',');
 
 		snprintf(buffer, sizeof(buffer), sJsonApiConnectionError,
-			int_port(duration_cast<seconds>(vMineResults[i].time.time_since_epoch()).count()),
+			int_port(duration_cast<seconds>(vSocketLog[i].time.time_since_epoch()).count()),
 			vSocketLog[i].msg.c_str());
 		cn_error.append(buffer);
 	}
diff --git a/xmrstak/misc/executor.hpp b/xmrstak/misc/executor.hpp
index c2caa395e..be5ee6c2f 100644
--- a/xmrstak/misc/executor.hpp
+++ b/xmrstak/misc/executor.hpp
@@ -23,7 +23,7 @@ namespace cpu
 class minethd;
 
 } // namespace cpu
-} // namepsace xmrstak
+} // namespace xmrstak
 
 class executor
 {
@@ -54,7 +54,7 @@ class executor
 
 	inline void set_timestamp() { dev_timestamp = get_timestamp(); };
 
-	// In miliseconds, has to divide a second (1000ms) into an integer number
+	// In milliseconds, has to divide a second (1000ms) into an integer number
 	constexpr static size_t iTickTime = 500;
 
 	// Dev donation time period in seconds. 100 minutes by default.
@@ -64,7 +64,7 @@ class executor
 	inline bool is_dev_time()
 	{
 		//Add 2 seconds to compensate for connect
-		constexpr size_t dev_portion = double(iDevDonatePeriod) * fDevDonationLevel + 2;
+		constexpr size_t dev_portion = static_cast<size_t>(double(iDevDonatePeriod) * fDevDonationLevel + 2.);
 
 		if(dev_portion < 12) //No point in bothering with less than 10s
 			return false;
@@ -177,7 +177,6 @@ class executor
 		iPoolCallTimes.clear();
 		tPoolConnTime = std::chrono::system_clock::now();
 		iPoolHashes = 0;
-		iPoolDiff = 0;
 	}
 
 	double fHighestHps = 0.0;
diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp
index 738d28763..47442df09 100644
--- a/xmrstak/misc/telemetry.cpp
+++ b/xmrstak/misc/telemetry.cpp
@@ -36,6 +36,7 @@ telemetry::telemetry(size_t iThd)
 	ppHashCounts = new uint64_t*[iThd];
 	ppTimestamps = new uint64_t*[iThd];
 	iBucketTop = new uint32_t[iThd];
+	mtx = new std::mutex[iThd];
 
 	for (size_t i = 0; i < iThd; i++)
 	{
@@ -47,16 +48,19 @@ telemetry::telemetry(size_t iThd)
 	}
 }
 
-double telemetry::calc_telemetry_data(size_t iLastMilisec, size_t iThread)
+double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 {
-	uint64_t iTimeNow = get_timestamp_ms();
+
 
 	uint64_t iEarliestHashCnt = 0;
 	uint64_t iEarliestStamp = 0;
-	uint64_t iLastestStamp = 0;
-	uint64_t iLastestHashCnt = 0;
+	uint64_t iLatestStamp = 0;
+	uint64_t iLatestHashCnt = 0;
 	bool bHaveFullSet = false;
 
+	std::unique_lock<std::mutex> lk(mtx[iThread]);
+	uint64_t iTimeNow = get_timestamp_ms();
+
 	//Start at 1, buckettop points to next empty
 	for (size_t i = 1; i < iBucketSize; i++)
 	{
@@ -65,13 +69,13 @@ double telemetry::calc_telemetry_data(size_t iLastMilisec, size_t iThread)
 		if (ppTimestamps[iThread][idx] == 0)
 			break; //That means we don't have the data yet
 
-		if (iLastestStamp == 0)
+		if (iLatestStamp == 0)
 		{
-			iLastestStamp = ppTimestamps[iThread][idx];
-			iLastestHashCnt = ppHashCounts[iThread][idx];
+			iLatestStamp = ppTimestamps[iThread][idx];
+			iLatestHashCnt = ppHashCounts[iThread][idx];
 		}
 
-		if (iTimeNow - ppTimestamps[iThread][idx] > iLastMilisec)
+		if (iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec)
 		{
 			bHaveFullSet = true;
 			break; //We are out of the requested time period
@@ -80,17 +84,18 @@ double telemetry::calc_telemetry_data(size_t iLastMilisec, size_t iThread)
 		iEarliestStamp = ppTimestamps[iThread][idx];
 		iEarliestHashCnt = ppHashCounts[iThread][idx];
 	}
+	lk.unlock();
 
-	if (!bHaveFullSet || iEarliestStamp == 0 || iLastestStamp == 0)
+	if (!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0)
 		return nan("");
 
 	//Don't think that can happen, but just in case
-	if (iLastestStamp - iEarliestStamp == 0)
+	if (iLatestStamp - iEarliestStamp == 0)
 		return nan("");
 
 	double fHashes, fTime;
-	fHashes = iLastestHashCnt - iEarliestHashCnt;
-	fTime = iLastestStamp - iEarliestStamp;
+	fHashes = static_cast<double>(iLatestHashCnt - iEarliestHashCnt);
+	fTime = static_cast<double>(iLatestStamp - iEarliestStamp);
 	fTime /= 1000.0;
 
 	return fHashes / fTime;
@@ -98,6 +103,7 @@ double telemetry::calc_telemetry_data(size_t iLastMilisec, size_t iThread)
 
 void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp)
 {
+	std::unique_lock<std::mutex> lk(mtx[iThd]);
 	size_t iTop = iBucketTop[iThd];
 	ppHashCounts[iThd][iTop] = iHashCount;
 	ppTimestamps[iThd][iTop] = iTimestamp;
@@ -105,4 +111,4 @@ void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTime
 	iBucketTop[iThd] = (iTop + 1) & iBucketMask;
 }
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp
index b35bbbfd6..580565de2 100644
--- a/xmrstak/misc/telemetry.hpp
+++ b/xmrstak/misc/telemetry.hpp
@@ -2,6 +2,7 @@
 
 #include <cstdint>
 #include <cstring>
+#include <mutex>
 
 namespace xmrstak
 {
@@ -11,9 +12,10 @@ class telemetry
 public:
 	telemetry(size_t iThd);
 	void push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp);
-	double calc_telemetry_data(size_t iLastMilisec, size_t iThread);
+	double calc_telemetry_data(size_t iLastMillisec, size_t iThread);
 
 private:
+	std::mutex* mtx;
 	constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations
 	constexpr static size_t iBucketMask = iBucketSize - 1;
 	uint32_t* iBucketTop;
@@ -21,4 +23,4 @@ class telemetry
 	uint64_t** ppTimestamps;
 };
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/misc/uac.cpp b/xmrstak/misc/uac.cpp
new file mode 100644
index 000000000..9f338dde0
--- /dev/null
+++ b/xmrstak/misc/uac.cpp
@@ -0,0 +1,81 @@
+#ifdef _WIN32
+#include "xmrstak/misc/console.hpp"
+#include "xmrstak/params.hpp"
+
+#include <string>
+#include <windows.h>
+
+BOOL IsElevated() // nonzero only when the process token query succeeds and reports TokenIsElevated
+{
+	BOOL fRet = FALSE; // remains FALSE when either API call below fails
+	HANDLE hToken = NULL;
+	if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken))
+	{
+		TOKEN_ELEVATION Elevation;
+		DWORD cbSize = sizeof(TOKEN_ELEVATION);
+		if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
+			fRet = Elevation.TokenIsElevated;
+	}
+	if (hToken) // close the token handle only if one was actually opened
+		CloseHandle(hToken);
+	return fRet;
+}
+
+BOOL SelfElevate(const std::string& my_path, const std::string& params)
+{
+	using namespace xmrstak;
+	if (IsElevated())
+		return FALSE; // already elevated: nothing is launched
+	// Describe a "runas" launch of my_path with the given parameter string.
+	SHELLEXECUTEINFO shExecInfo = { 0 };
+	shExecInfo.cbSize = sizeof(SHELLEXECUTEINFO);
+	shExecInfo.fMask = SEE_MASK_NOCLOSEPROCESS;
+	shExecInfo.hwnd = NULL;
+	shExecInfo.lpVerb = "runas";
+	shExecInfo.lpFile = my_path.c_str();
+	shExecInfo.lpParameters = params.c_str();
+	shExecInfo.lpDirectory = NULL;
+	shExecInfo.nShow = SW_SHOW;
+	shExecInfo.hInstApp = NULL;
+	if (params::inst().useHide) { // `params::` names the xmrstak::params class here, not the string argument
+		shExecInfo.nShow = SW_HIDE;
+	}
+
+	if (!ShellExecuteEx(&shExecInfo))
+		return FALSE; // the launch did not succeed
+
+	// Launch succeeded: print a notice, wait at most 1 ms on the new process handle, then exit with code 0.
+	printer::inst()->print_msg(L0, "This window has been opened because xmr-stak needed to run as administrator.  It can be safely closed now.");
+	WaitForSingleObject(shExecInfo.hProcess, 1);
+	std::exit(0);
+	// Not reached: std::exit() does not return.
+	return TRUE;
+}
+
+VOID RequestElevation()
+{
+	if(IsElevated())
+		return;
+	// Not elevated: with allowUAC unset, report the problem and stop instead of relaunching.
+	if(!xmrstak::params::inst().allowUAC)
+	{
+		printer::inst()->print_msg(L0, "The miner needs to run as administrator, but you passed --noUAC option. Please remove it or set use_slow_memory to always.");
+		win_exit();
+		return;
+	}
+	// Relaunch using the stored executable path (minerArg0) and argument string (minerArgs).
+	SelfElevate(xmrstak::params::inst().minerArg0, xmrstak::params::inst().minerArgs);
+}
+
+BOOL IsWindows10OrNewer() // nonzero when VerifyVersionInfo accepts "major >= 10 and minor >= 0"
+{
+    OSVERSIONINFOEX osvi = { 0 };
+    osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+    osvi.dwMajorVersion = 10; // version to compare the running system against: 10.0
+    osvi.dwMinorVersion = 0;
+    DWORDLONG dwlConditionMask = 0;
+    VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
+    VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL);
+    return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask); // NOTE(review): result may depend on the application manifest - confirm
+}
+#endif
diff --git a/xmrstak/misc/uac.hpp b/xmrstak/misc/uac.hpp
index 55c5f1a43..33c79ae28 100644
--- a/xmrstak/misc/uac.hpp
+++ b/xmrstak/misc/uac.hpp
@@ -1,51 +1,10 @@
 #pragma once
 
 #ifdef _WIN32
-#include "xmrstak/misc/console.hpp"
-
 #include <string>
-#include <windows.h>
-
-BOOL IsElevated() 
-{
-	BOOL fRet = FALSE;
-	HANDLE hToken = NULL;
-	if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken)) 
-	{
-		TOKEN_ELEVATION Elevation;
-		DWORD cbSize = sizeof(TOKEN_ELEVATION);
-		if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
-			fRet = Elevation.TokenIsElevated;
-	}
-	if (hToken)
-		CloseHandle(hToken);
-	return fRet;
-}
-
-BOOL SelfElevate(const char* my_path, const std::string& params)
-{
-	if (IsElevated())
-		return FALSE;
-
-	SHELLEXECUTEINFO shExecInfo = { 0 };
-	shExecInfo.cbSize = sizeof(SHELLEXECUTEINFO);
-	shExecInfo.fMask = SEE_MASK_NOCLOSEPROCESS;
-	shExecInfo.hwnd = NULL;
-	shExecInfo.lpVerb = "runas";
-	shExecInfo.lpFile = my_path;
-	shExecInfo.lpParameters = params.c_str();
-	shExecInfo.lpDirectory = NULL;
-	shExecInfo.nShow = SW_SHOW;
-	shExecInfo.hInstApp = NULL;
-
-	if (!ShellExecuteEx(&shExecInfo))
-		return FALSE;
-
-	// Loiter in the background to make scripting easier
-	printer::inst()->print_msg(L0, "This window has been opened because xmr-stak needed to run as administrator.  It can be safely closed now.");
-	WaitForSingleObject(shExecInfo.hProcess, INFINITE);
-	std::exit(0);
 
-	return TRUE;
-}
+BOOL IsElevated();
+BOOL SelfElevate(const std::string& my_path, const std::string& params);
+VOID RequestElevation();
+BOOL IsWindows10OrNewer();
 #endif
diff --git a/xmrstak/misc/utility.cpp b/xmrstak/misc/utility.cpp
index 3b1369a36..5177d14c2 100644
--- a/xmrstak/misc/utility.cpp
+++ b/xmrstak/misc/utility.cpp
@@ -18,4 +18,4 @@ namespace xmrstak
 					}
 				);
 	}
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/misc/utility.hpp b/xmrstak/misc/utility.hpp
index b2e841dff..8f2e99fb8 100644
--- a/xmrstak/misc/utility.hpp
+++ b/xmrstak/misc/utility.hpp
@@ -9,4 +9,4 @@ namespace xmrstak
 	 * @return true if both strings are equal, else false
 	 */
 	bool strcmp_i(const std::string& str1, const std::string& str2);
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 7ee09e712..d20ba082f 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -24,6 +24,7 @@
 #include <stdarg.h>
 #include <assert.h>
 #include <algorithm>
+#include <chrono>
 
 #include "jpsock.hpp"
 #include "socks.hpp"
@@ -42,8 +43,9 @@ struct jpsock::call_rsp
 	uint64_t iCallId;
 	Value* pCallData;
 	std::string sCallErr;
+	uint64_t iMessageId;
 
-	call_rsp(Value* val) : pCallData(val)
+	call_rsp(Value* val) : pCallData(val), iMessageId(0)
 	{
 		bHaveResponse = false;
 		iCallId = 0;
@@ -92,8 +94,8 @@ struct jpsock::opq_json_val
 	opq_json_val(const Value* val) : val(val) {}
 };
 
-jpsock::jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash) :
-	net_addr(sAddr), usr_login(sLogin), usr_pass(sPassword), tls_fp(tls_fp), pool_id(id), pool_weight(pool_weight), pool(dev_pool), nicehash(nicehash),
+jpsock::jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash) :
+	net_addr(sAddr), usr_login(sLogin), usr_rigid(sRigId), usr_pass(sPassword), tls_fp(tls_fp), pool_id(id), pool_weight(pool_weight), pool(dev_pool), nicehash(nicehash),
 	connect_time(0), connect_attempts(0), disconnect_time(0), quiet_close(false)
 {
 	sock_init();
@@ -133,6 +135,7 @@ jpsock::~jpsock()
 
 std::string&& jpsock::get_call_error()
 {
+	call_error = false;
 	return std::move(prv->oCallRsp.sCallErr);
 }
 
@@ -189,16 +192,31 @@ bool jpsock::set_socket_error_strerr(const char* a, int res)
 void jpsock::jpsock_thread()
 {
 	jpsock_thd_main();
+
+	if(!bHaveSocketError)
+		set_socket_error("Socket closed.");
+
 	executor::inst()->push_event(ex_event(std::move(sSocketError), quiet_close, pool_id));
 
-	// If a call is wating, send an error to end it
-	bool bCallWaiting = false;
 	std::unique_lock<std::mutex> mlock(call_mutex);
+	bool bWait = prv->oCallRsp.pCallData != nullptr;
+
+	// If a call is waiting, wait a little bit before blowing it out of the water
+	if(bWait)
+	{
+		mlock.unlock();
+		std::this_thread::sleep_for(std::chrono::milliseconds(500));
+		mlock.lock();
+	}
+
+	// If the call is still there send an error to end it
+	bool bCallWaiting = false;
 	if(prv->oCallRsp.pCallData != nullptr)
 	{
 		prv->oCallRsp.bHaveResponse = true;
 		prv->oCallRsp.iCallId = 0;
 		prv->oCallRsp.pCallData = nullptr;
+		prv->oCallRsp.iMessageId = 0;
 		bCallWaiting = true;
 	}
 	mlock.unlock();
@@ -213,7 +231,7 @@ void jpsock::jpsock_thread()
 	else
 		disconnect_time = 0;
 
-	std::unique_lock<std::mutex>(job_mutex);
+	std::unique_lock<std::mutex> lck(job_mutex);
 	memset(&oCurrentJob, 0, sizeof(oCurrentJob));
 	bRunning = false;
 }
@@ -270,6 +288,7 @@ bool jpsock::process_line(char* line, size_t len)
 	prv->jsonDoc.SetNull();
 	prv->parseAllocator.Clear();
 	prv->callAllocator.Clear();
+	++iMessageCnt;
 
 	/*NULL terminate the line instead of '\n', parsing will add some more NULLs*/
 	line[len-1] = '\0';
@@ -290,6 +309,12 @@ bool jpsock::process_line(char* line, size_t len)
 		if(!mt->IsString())
 			return set_socket_error("PARSE error: Protocol error 1");
 
+		if(strcmp(mt->GetString(), "mining.set_extranonce") == 0)
+		{
+			printer::inst()->print_msg(L0, "Detected buggy NiceHash pool code. Workaround engaged.");
+			return true;
+		}
+
 		if(strcmp(mt->GetString(), "job") != 0)
 			return set_socket_error("PARSE error: Unsupported server method ", mt->GetString());
 
@@ -298,7 +323,7 @@ bool jpsock::process_line(char* line, size_t len)
 			return set_socket_error("PARSE error: Protocol error 2");
 
 		opq_json_val v(mt);
-		return process_pool_job(&v);
+		return process_pool_job(&v, iMessageCnt);
 	}
 	else
 	{
@@ -312,7 +337,7 @@ bool jpsock::process_line(char* line, size_t len)
 		mt = GetObjectMember(prv->jsonDoc, "error");
 
 		const char* sError = nullptr;
-		size_t iErrorLn = 0;
+		size_t iErrorLen = 0;
 		if (mt == nullptr || mt->IsNull())
 		{
 			/* If there was no error we need a result */
@@ -329,7 +354,7 @@ bool jpsock::process_line(char* line, size_t len)
 			if(msg == nullptr || !msg->IsString())
 				return set_socket_error("PARSE error: Protocol error 6");
 
-			iErrorLn = msg->GetStringLength();
+			iErrorLen = msg->GetStringLength();
 			sError = msg->GetString();
 		}
 
@@ -343,11 +368,13 @@ bool jpsock::process_line(char* line, size_t len)
 
 		prv->oCallRsp.bHaveResponse = true;
 		prv->oCallRsp.iCallId = iCallId;
+		prv->oCallRsp.iMessageId = iMessageCnt;
 
 		if(sError != nullptr)
 		{
 			prv->oCallRsp.pCallData = nullptr;
-			prv->oCallRsp.sCallErr.assign(sError, iErrorLn);
+			prv->oCallRsp.sCallErr.assign(sError, iErrorLen);
+			call_error = true;
 		}
 		else
 			prv->oCallRsp.pCallData->CopyFrom(*mt, prv->callAllocator);
@@ -359,8 +386,20 @@ bool jpsock::process_line(char* line, size_t len)
 	}
 }
 
-bool jpsock::process_pool_job(const opq_json_val* params)
+bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t messageId)
 {
+	std::unique_lock<std::mutex> mlock(job_mutex);
+	if(messageId < iLastMessageId)
+	{
+		/* If this job's message id is lower than the id of the last processed
+		 * job message, skip it so that we do not start mining an outdated job.
+		 */
+		return true;
+	}
+	iLastMessageId = messageId;
+
+	mlock.unlock();
+
 	if (!params->val->IsObject())
 		return set_socket_error("PARSE error: Job error 1");
 
@@ -376,18 +415,45 @@ bool jpsock::process_pool_job(const opq_json_val* params)
 		return set_socket_error("PARSE error: Job error 2");
 	}
 
+	if(motd != nullptr && motd->IsString() && (motd->GetStringLength() & 0x01) == 0)
+	{
+		std::unique_lock<std::mutex> lck(motd_mutex);
+		if(motd->GetStringLength() > 0)
+		{
+			pool_motd.resize(motd->GetStringLength()/2 + 1);
+			if(!hex2bin(motd->GetString(), motd->GetStringLength(), (unsigned char*)&pool_motd.front()))
+				pool_motd.clear();
+		}
+		else
+			pool_motd.clear();
+	}
+
 	if (jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >=
 		return set_socket_error("PARSE error: Job error 3");
 
-	uint32_t iWorkLn = blob->GetStringLength() / 2;
-	if (iWorkLn > sizeof(pool_job::bWorkBlob))
-		return set_socket_error("PARSE error: Invalid job legth. Are you sure you are mining the correct coin?");
-
 	pool_job oPoolJob;
-	if (!hex2bin(blob->GetString(), iWorkLn * 2, oPoolJob.bWorkBlob))
+
+	const uint32_t iWorkLen = blob->GetStringLength() / 2;
+	oPoolJob.iWorkLen = iWorkLen;
+
+	if (iWorkLen > sizeof(pool_job::bWorkBlob))
+		return set_socket_error("PARSE error: Invalid job length. Are you sure you are mining the correct coin?");
+
+	if (!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob))
 		return set_socket_error("PARSE error: Job error 4");
 
-	oPoolJob.iWorkLen = iWorkLn;
+	// lock reading of oCurrentJob
+	std::unique_lock<std::mutex> jobIdLock(job_mutex);
+	// compare possible non equal length job id's
+	if(iWorkLen == oCurrentJob.iWorkLen &&
+		memcmp(oPoolJob.bWorkBlob, oCurrentJob.bWorkBlob, iWorkLen) == 0 &&
+		strcmp(jobid->GetString(), oCurrentJob.sJobID) == 0
+	)
+	{
+		return set_socket_error("Duplicate equal job detected! Please contact your pool admin.");
+	}
+	jobIdLock.unlock();
+
 	memset(oPoolJob.sJobID, 0, sizeof(pool_job::sJobID));
 	memcpy(oPoolJob.sJobID, jobid->GetString(), jobid->GetStringLength()); //Bounds checking at proto error 3
 
@@ -400,7 +466,7 @@ bool jpsock::process_pool_job(const opq_json_val* params)
 		if(!hex2bin(sTempStr, 8, (unsigned char*)&iTempInt) || iTempInt == 0)
 			return set_socket_error("PARSE error: Invalid target");
 
-		
+
 		oPoolJob.iTarget = t32_to_t64(iTempInt);
 	}
 	else if(target_slen <= 16)
@@ -414,25 +480,14 @@ bool jpsock::process_pool_job(const opq_json_val* params)
 	else
 		return set_socket_error("PARSE error: Job error 5");
 
-	if(motd != nullptr && motd->IsString() && (motd->GetStringLength() & 0x01) == 0)
-	{
-		std::unique_lock<std::mutex>(motd_mutex);
-		if(motd->GetStringLength() > 0)
-		{
-			pool_motd.resize(motd->GetStringLength()/2 + 1);
-			if(!hex2bin(motd->GetString(), motd->GetStringLength(), (unsigned char*)&pool_motd.front()))
-				pool_motd.clear();
-		}
-		else
-			pool_motd.clear();
-	}
-
 	iJobDiff = t64_to_diff(oPoolJob.iTarget);
 
+	std::unique_lock<std::mutex> lck(job_mutex);
+	oCurrentJob = oPoolJob;
+	lck.unlock();
+	// send event after current job data are updated
 	executor::inst()->push_event(ex_event(oPoolJob, pool_id));
 
-	std::unique_lock<std::mutex>(job_mutex);
-	oCurrentJob = oPoolJob;
 	return true;
 }
 
@@ -440,6 +495,7 @@ bool jpsock::connect(std::string& sConnectError)
 {
 	ext_algo = ext_backend = ext_hashcount = ext_motd = false;
 	bHaveSocketError = false;
+	call_error = false;
 	sSocketError.clear();
 	iJobDiff = 0;
 	connect_attempts++;
@@ -474,7 +530,7 @@ void jpsock::disconnect(bool quiet)
 	quiet_close = false;
 }
 
-bool jpsock::cmd_ret_wait(const char* sPacket, opq_json_val& poResult)
+bool jpsock::cmd_ret_wait(const char* sPacket, opq_json_val& poResult, uint64_t& messageId)
 {
 	//printf("SEND: %s\n", sPacket);
 
@@ -514,8 +570,10 @@ bool jpsock::cmd_ret_wait(const char* sPacket, opq_json_val& poResult)
 	}
 
 	if(bSuccess)
+	{
 		poResult.val = &prv->oCallValue;
-
+		messageId = prv->oCallRsp.iMessageId;
+	}
 	return bSuccess;
 }
 
@@ -523,13 +581,14 @@ bool jpsock::cmd_login()
 {
 	char cmd_buffer[1024];
 
-	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\",\"agent\":\"%s\"},\"id\":1}\n",
-		usr_login.c_str(), usr_pass.c_str(), get_version_str().c_str());
+	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\",\"rigid\":\"%s\",\"agent\":\"%s\"},\"id\":1}\n",
+		usr_login.c_str(), usr_pass.c_str(), usr_rigid.c_str(), get_version_str().c_str());
 
 	opq_json_val oResult(nullptr);
+	uint64_t messageId = 0;
 
 	/*Normal error conditions (failed login etc..) will end here*/
-	if (!cmd_ret_wait(cmd_buffer, oResult))
+	if (!cmd_ret_wait(cmd_buffer, oResult, messageId))
 		return false;
 
 	if (!oResult.val->IsObject())
@@ -565,7 +624,7 @@ bool jpsock::cmd_login()
 		for(size_t i=0; i < ext->Size(); i++)
 		{
 			const Value& jextname = ext->GetArray()[i];
-			
+
 			if(!jextname.IsString())
 				continue;
 
@@ -584,7 +643,7 @@ bool jpsock::cmd_login()
 	}
 
 	opq_json_val v(job);
-	if(!process_pool_job(&v))
+	if(!process_pool_job(&v, messageId))
 	{
 		disconnect();
 		return false;
@@ -596,7 +655,7 @@ bool jpsock::cmd_login()
 	return true;
 }
 
-bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, xmrstak::iBackend* bend, bool algo_full_cn)
+bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo)
 {
 	char cmd_buffer[1024];
 	char sNonce[9];
@@ -604,16 +663,56 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	/*Extensions*/
 	char sAlgo[64] = {0};
 	char sBackend[64] = {0};
-	char sHashcount[64] = {0};
+	char sHashcount[128] = {0};
 
 	if(ext_backend)
-		snprintf(sBackend, sizeof(sBackend), ",\"backend\":\"%s\"", xmrstak::iBackend::getName(bend->backendType));
+		snprintf(sBackend, sizeof(sBackend), ",\"backend\":\"%s\"", backend_name);
 
 	if(ext_hashcount)
-		snprintf(sHashcount, sizeof(sHashcount), ",\"hashcount\":%llu", int_port(bend->iHashCount.load(std::memory_order_relaxed)));
+		snprintf(sHashcount, sizeof(sHashcount), ",\"hashcount\":%llu,\"hashcount_total\":%llu", int_port(backend_hashcount), int_port(total_hashcount));
 
 	if(ext_algo)
-		snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo_full_cn ? "cryptonight" : "cryptonight-lite");
+	{
+		const char* algo_name;
+		switch(algo)
+		{
+		case cryptonight:
+			algo_name = "cryptonight";
+			break;
+		case cryptonight_lite:
+			algo_name = "cryptonight_lite";
+			break;
+		case cryptonight_monero:
+			algo_name = "cryptonight_v7";
+			break;
+		case cryptonight_monero_v8:
+			algo_name = "cryptonight_v8";
+			break;
+		case cryptonight_aeon:
+			algo_name = "cryptonight_lite_v7";
+			break;
+		case cryptonight_stellite:
+			algo_name = "cryptonight_v7_stellite";
+			break;
+		case cryptonight_ipbc:
+			algo_name = "cryptonight_lite_v7_xor";
+			break;
+		case cryptonight_heavy:
+			algo_name = "cryptonight_heavy";
+			break;
+		case cryptonight_haven:
+			algo_name = "cryptonight_haven";
+			break;
+		case cryptonight_masari:
+			algo_name = "cryptonight_masari";
+			break;
+		default:
+			algo_name = "unknown";
+			break;
+		}
+
+		snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo_name);
+	}
 
 	bin2hex((unsigned char*)&iNonce, 4, sNonce);
 	sNonce[8] = '\0';
@@ -624,19 +723,20 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s},\"id\":1}\n",
 		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo);
 
+	uint64_t messageId = 0;
 	opq_json_val oResult(nullptr);
-	return cmd_ret_wait(cmd_buffer, oResult);
+	return cmd_ret_wait(cmd_buffer, oResult, messageId);
 }
 
 void jpsock::save_nonce(uint32_t nonce)
 {
-	std::unique_lock<std::mutex>(job_mutex);
+	std::unique_lock<std::mutex> lck(job_mutex);
 	oCurrentJob.iSavedNonce = nonce;
 }
 
 bool jpsock::get_current_job(pool_job& job)
 {
-	std::unique_lock<std::mutex>(job_mutex);
+	std::unique_lock<std::mutex> lck(job_mutex);
 
 	if(oCurrentJob.iWorkLen == 0)
 		return false;
@@ -647,10 +747,10 @@ bool jpsock::get_current_job(pool_job& job)
 
 bool jpsock::get_pool_motd(std::string& strin)
 {
-	if(!ext_motd) 
+	if(!ext_motd)
 		return false;
 
-	std::unique_lock<std::mutex>(motd_mutex);
+	std::unique_lock<std::mutex> lck(motd_mutex);
 	if(pool_motd.size() > 0)
 	{
 		strin.assign(pool_motd);
diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp
index 9d276b724..ad34f6c86 100644
--- a/xmrstak/net/jpsock.hpp
+++ b/xmrstak/net/jpsock.hpp
@@ -2,6 +2,7 @@
 
 #include "xmrstak/backend/iBackend.hpp"
 #include "msgstruct.hpp"
+#include "xmrstak/jconf.hpp"
 
 #include <mutex>
 #include <atomic>
@@ -14,7 +15,7 @@
 	- Parsing or connection error
 	Those are fatal errors (we drop the connection if we encounter them).
 	After they are constructed from const char* strings from various places.
-	(can be from read-only mem), we passs them in an exectutor message
+	(can be from read-only mem), we pass them in an executor message
 	once the recv thread expires.
 	- Call error
 	This error happens when the "server says no". Usually because the job was
@@ -27,21 +28,21 @@ class base_socket;
 class jpsock
 {
 public:
-	jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash);
+	jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash);
 	~jpsock();
 
 	bool connect(std::string& sConnectError);
 	void disconnect(bool quiet = false);
 
 	bool cmd_login();
-	bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, xmrstak::iBackend* bend, bool algo_full_cn);
+	bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo);
 
 	static bool hex2bin(const char* in, unsigned int len, unsigned char* out);
 	static void bin2hex(const unsigned char* in, unsigned int len, char* out);
 
-	inline double get_pool_weight(bool gross_weight) 
-	{ 
-		double ret = pool_weight; 
+	inline double get_pool_weight(bool gross_weight)
+	{
+		double ret = pool_weight;
 		if(gross_weight && bRunning)
 			ret += 10.0;
 		if(gross_weight && bLoggedIn)
@@ -62,6 +63,7 @@ class jpsock
 	bool get_pool_motd(std::string& strin);
 
 	std::string&& get_call_error();
+	bool have_call_error() { return call_error; }
 	bool have_sock_error() { return bHaveSocketError; }
 
 	inline static uint64_t t32_to_t64(uint32_t t) { return 0xFFFFFFFFFFFFFFFFULL / (0xFFFFFFFFULL / ((uint64_t)t)); }
@@ -82,6 +84,7 @@ class jpsock
 private:
 	std::string net_addr;
 	std::string usr_login;
+	std::string usr_rigid;
 	std::string usr_pass;
 	std::string tls_fp;
 
@@ -105,6 +108,7 @@ class jpsock
 	std::atomic<bool> bRunning;
 	std::atomic<bool> bLoggedIn;
 	std::atomic<bool> quiet_close;
+	std::atomic<bool> call_error{false}; // start cleared so have_call_error() never reads an indeterminate flag
 
 	uint8_t* bJsonRecvMem;
 	uint8_t* bJsonParseMem;
@@ -120,8 +124,8 @@ class jpsock
 	void jpsock_thread();
 	bool jpsock_thd_main();
 	bool process_line(char* line, size_t len);
-	bool process_pool_job(const opq_json_val* params);
-	bool cmd_ret_wait(const char* sPacket, opq_json_val& poResult);
+	bool process_pool_job(const opq_json_val* params, const uint64_t messageId);
+	bool cmd_ret_wait(const char* sPacket, opq_json_val& poResult, uint64_t& messageId);
 
 	char sMinerId[64];
 	std::atomic<uint64_t> iJobDiff;
@@ -138,5 +142,8 @@ class jpsock
 
 	opaque_private* prv;
 	base_socket* sck;
+
+	uint64_t iMessageCnt = 0;
+	uint64_t iLastMessageId = 0;
 };
 
diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp
index a5affc81d..6a05eb9d5 100644
--- a/xmrstak/net/msgstruct.hpp
+++ b/xmrstak/net/msgstruct.hpp
@@ -1,11 +1,13 @@
 #pragma once
 
+#include "xmrstak/backend/cryptonight.hpp"
+
 #include <string>
 #include <string.h>
 #include <assert.h>
 
 // Structures that we use to pass info between threads constructors are here just to make
-// the stack allocation take up less space, heap is a shared resouce that needs locks too of course
+// the stack allocation take up less space, heap is a shared resource that needs locks too of course
 
 struct pool_job
 {
@@ -31,9 +33,11 @@ struct job_result
 	char		sJobID[64];
 	uint32_t	iNonce;
 	uint32_t	iThreadId;
+	xmrstak_algo algorithm = invalid_algo;
 
 	job_result() {}
-	job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId) : iNonce(iNonce), iThreadId(iThreadId)
+	job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, xmrstak_algo algo) :
+		iNonce(iNonce), iThreadId(iThreadId), algorithm(algo)
 	{
 		memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID));
 		memcpy(this->bResult, bResult, sizeof(job_result::bResult));
@@ -66,20 +70,21 @@ struct sock_err
 // Unlike socket errors, GPU errors are read-only strings
 struct gpu_res_err
 {
+	size_t idx; // GPU index
 	const char* error_str;
-	gpu_res_err(const char* error_str) : error_str(error_str) {}
+	gpu_res_err(const char* error_str, size_t idx) : idx(idx), error_str(error_str) {} // initializers listed in member declaration order
 };
 
 enum ex_event_name { EV_INVALID_VAL, EV_SOCK_READY, EV_SOCK_ERROR, EV_GPU_RES_ERROR,
-	EV_POOL_HAVE_JOB, EV_MINER_HAVE_RESULT, EV_PERF_TICK, EV_EVAL_POOL_CHOICE, 
-	EV_USR_HASHRATE, EV_USR_RESULTS, EV_USR_CONNSTAT, EV_HASHRATE_LOOP, 
+	EV_POOL_HAVE_JOB, EV_MINER_HAVE_RESULT, EV_PERF_TICK, EV_EVAL_POOL_CHOICE,
+	EV_USR_HASHRATE, EV_USR_RESULTS, EV_USR_CONNSTAT, EV_HASHRATE_LOOP,
 	EV_HTML_HASHRATE, EV_HTML_RESULTS, EV_HTML_CONNSTAT, EV_HTML_JSON };
 
 /*
    This is how I learned to stop worrying and love c++11 =).
    Ghosts of endless heap allocations have finally been exorcised. Thanks
    to the nifty magic of move semantics, string will only be allocated
-   once on the heap. Considering that it makes a jorney across stack,
+   once on the heap. Considering that it makes a journey across stack,
    heap alloced queue, to another stack before being finally processed
    I think it is kind of nifty, don't you?
    Also note that for non-arg events we only copy two qwords
@@ -99,7 +104,7 @@ struct ex_event
 	};
 
 	ex_event() { iName = EV_INVALID_VAL; iPoolId = 0;}
-	ex_event(const char* gpu_err, size_t id) : iName(EV_GPU_RES_ERROR), iPoolId(id), oGpuError(gpu_err) {}
+	ex_event(const char* gpu_err, size_t gpu_idx, size_t id) : iName(EV_GPU_RES_ERROR), iPoolId(id), oGpuError(gpu_err, gpu_idx) {}
 	ex_event(std::string&& err, bool silent, size_t id) : iName(EV_SOCK_ERROR), iPoolId(id), oSocketError(std::move(err), silent) { }
 	ex_event(job_result dat, size_t id) : iName(EV_MINER_HAVE_RESULT), iPoolId(id), oJobResult(dat) {}
 	ex_event(pool_job dat, size_t id) : iName(EV_POOL_HAVE_JOB), iPoolId(id), oPoolJob(dat) {}
@@ -178,7 +183,7 @@ inline size_t get_timestamp()
 	return time_point_cast<seconds>(steady_clock::now()).time_since_epoch().count();
 };
 
-//Get milisecond timestamp
+//Get millisecond timestamp
 inline size_t get_timestamp_ms()
 {
 	using namespace std::chrono;
diff --git a/xmrstak/net/socket.cpp b/xmrstak/net/socket.cpp
index e19d1d442..6fcb454cd 100644
--- a/xmrstak/net/socket.cpp
+++ b/xmrstak/net/socket.cpp
@@ -48,6 +48,7 @@ bool plain_socket::set_hostname(const char* sAddr)
 	char sAddrMb[256];
 	char *sTmp, *sPort;
 
+	sock_closed = false;
 	size_t ln = strlen(sAddr);
 	if (ln >= sizeof(sAddrMb))
 		return pCallback->set_socket_error("CONNECT error: Pool address overflow.");
@@ -117,11 +118,16 @@ bool plain_socket::set_hostname(const char* sAddr)
 		return pCallback->set_socket_error_strerr("CONNECT error: Socket creation failed ");
 	}
 
+	int flag = 1;
+	/* If it fails, it fails, we won't lose too much sleep over it */
+	setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
+
 	return true;
 }
 
 bool plain_socket::connect()
 {
+	sock_closed = false;
 	int ret = ::connect(hSocket, pSockAddr->ai_addr, (int)pSockAddr->ai_addrlen);
 
 	freeaddrinfo(pAddrRoot);
@@ -135,6 +141,9 @@ bool plain_socket::connect()
 
 int plain_socket::recv(char* buf, unsigned int len)
 {
+	if(sock_closed)
+		return 0;
+
 	int ret = ::recv(hSocket, buf, len, 0);
 
 	if(ret == 0)
@@ -147,7 +156,8 @@ int plain_socket::recv(char* buf, unsigned int len)
 
 bool plain_socket::send(const char* buf)
 {
-	int pos = 0, slen = strlen(buf);
+	size_t pos = 0;
+	size_t slen = strlen(buf);
 	while (pos != slen)
 	{
 		int ret = ::send(hSocket, buf + pos, slen - pos, 0);
@@ -167,6 +177,7 @@ void plain_socket::close(bool free)
 {
 	if(hSocket != INVALID_SOCKET)
 	{
+		sock_closed = true;
 		sock_close(hSocket);
 		hSocket = INVALID_SOCKET;
 	}
@@ -190,7 +201,7 @@ void tls_socket::print_error()
 		if(jconf::inst()->TlsSecureAlgos())
 			pCallback->set_socket_error("Unknown TLS error. Secure TLS maybe unsupported, try setting tls_secure_algo to false.");
 		else
-			pCallback->set_socket_error("Unknown TLS error.");
+			pCallback->set_socket_error("Unknown TLS error. You might be trying to connect to a non-TLS port.");
 	}
 	else
 		pCallback->set_socket_error(buf, len);
@@ -211,12 +222,13 @@ void tls_socket::init_ctx()
 
 	if(jconf::inst()->TlsSecureAlgos())
 	{
-		SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | SSL_OP_NO_TLSv1 | SSL_OP_NO_COMPRESSION);
+		SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | SSL_OP_NO_TLSv1);
 	}
 }
 
 bool tls_socket::set_hostname(const char* sAddr)
 {
+	sock_closed = false;
 	if(ctx == nullptr)
 	{
 		init_ctx();
@@ -233,6 +245,10 @@ bool tls_socket::set_hostname(const char* sAddr)
 		return false;
 	}
 
+	int flag = 1;
+	/* If it fails, it fails, we won't lose too much sleep over it. NOTE(review): runs before the BIO connects - confirm the fd is valid here */
+	setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
+
 	if(BIO_set_conn_hostname(bio, sAddr) != 1)
 	{
 		print_error();
@@ -248,7 +264,7 @@ bool tls_socket::set_hostname(const char* sAddr)
 
 	if(jconf::inst()->TlsSecureAlgos())
 	{
-		if(SSL_set_cipher_list(ssl, "HIGH:!aNULL:!kRSA:!PSK:!SRP:!MD5:!RC4:!SHA1") != 1)
+		if(SSL_set_cipher_list(ssl, "HIGH:!aNULL:!PSK:!SRP:!MD5:!RC4:!SHA1") != 1)
 		{
 			print_error();
 			return false;
@@ -260,6 +276,7 @@ bool tls_socket::set_hostname(const char* sAddr)
 
 bool tls_socket::connect()
 {
+	sock_closed = false;
 	if(BIO_do_connect(bio) != 1)
 	{
 		print_error();
@@ -322,7 +339,7 @@ bool tls_socket::connect()
 	{
 		if(!pCallback->is_dev_pool())
 		{
-			printer::inst()->print_msg(L0, "FINGERPRINT FAILED CHECK [%s] %.*s was given, %s was configured", 
+			printer::inst()->print_msg(L0, "FINGERPRINT FAILED CHECK [%s] %.*s was given, %s was configured",
 				pCallback->get_pool_addr(), (int)b64_len, b64_md, conf_md);
 		}
 
@@ -340,6 +357,9 @@ bool tls_socket::connect()
 
 int tls_socket::recv(char* buf, unsigned int len)
 {
+	if(sock_closed)
+		return 0;
+
 	int ret = BIO_read(bio, buf, len);
 
 	if(ret == 0)
@@ -360,6 +380,7 @@ void tls_socket::close(bool free)
 	if(bio == nullptr || ssl == nullptr)
 		return;
 
+	sock_closed = true;
 	if(!free)
 	{
 		sock_close(BIO_get_fd(bio, nullptr));
diff --git a/xmrstak/net/socket.hpp b/xmrstak/net/socket.hpp
index 192a32c2e..b09142d56 100644
--- a/xmrstak/net/socket.hpp
+++ b/xmrstak/net/socket.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <atomic>
 #include "socks.hpp"
 
 class jpsock;
@@ -12,6 +13,9 @@ class base_socket
 	virtual int recv(char* buf, unsigned int len) = 0;
 	virtual bool send(const char* buf) = 0;
 	virtual void close(bool free) = 0;
+
+protected:
+	std::atomic<bool> sock_closed{false}; // explicit initial state; a default-constructed atomic is indeterminate before C++20
 };
 
 class plain_socket : public base_socket
diff --git a/xmrstak/net/socks.hpp b/xmrstak/net/socks.hpp
index 1d25d3ab7..86749e527 100644
--- a/xmrstak/net/socks.hpp
+++ b/xmrstak/net/socks.hpp
@@ -62,9 +62,8 @@ inline const char* sock_gai_strerror(int err, char* buf, size_t len)
 #include <unistd.h> /* Needed for close() */
 #include <errno.h>
 #include <string.h>
-#if defined(__FreeBSD__)
 #include <netinet/in.h> /* Needed for IPPROTO_TCP */
-#endif
+#include <netinet/tcp.h>
 
 inline void sock_init() {}
 typedef int SOCKET;
diff --git a/xmrstak/params.hpp b/xmrstak/params.hpp
index bc3261274..936b1e9a6 100644
--- a/xmrstak/params.hpp
+++ b/xmrstak/params.hpp
@@ -21,30 +21,54 @@ struct params
 	std::string executablePrefix;
 	std::string binaryName;
 	bool useAMD;
+	bool AMDCache;
 	bool useNVIDIA;
 	bool useCPU;
+	bool useHide;
+	// user selected OpenCL vendor
+	std::string openCLVendor;
 
 	bool poolUseTls = false;
 	std::string poolURL;
 	bool userSetPwd = false;
 	std::string poolPasswd;
+	bool userSetRigid = false;
+	std::string poolRigid;
 	std::string poolUsername;
 	bool nicehashMode = false;
 
+	static constexpr int32_t httpd_port_unset = -1;
+	static constexpr int32_t httpd_port_disabled = 0;
+	int32_t httpd_port = httpd_port_unset;
+
 	std::string currency;
 
 	std::string configFile;
+	std::string configFilePools;
 	std::string configFileAMD;
 	std::string configFileNVIDIA;
 	std::string configFileCPU;
 
+	bool allowUAC = true;
+	std::string minerArg0;
+	std::string minerArgs;
+
+	// block_version >= 0 enable benchmark
+	int benchmark_block_version = -1;
+	int benchmark_wait_sec = 30;
+	int benchmark_work_sec = 60;
+
 	params() :
 		binaryName("xmr-stak"),
 		executablePrefix(""),
 		useAMD(true),
+		AMDCache(true),
 		useNVIDIA(true),
 		useCPU(true),
+		useHide(false),
+		openCLVendor("AMD"),
 		configFile("config.txt"),
+		configFilePools("pools.txt"),
 		configFileAMD("amd.txt"),
 		configFileCPU("cpu.txt"),
 		configFileNVIDIA("nvidia.txt")
@@ -52,4 +76,4 @@ struct params
 
 };
 
-} // namepsace xmrstak
+} // namespace xmrstak
diff --git a/xmrstak/picosha2/picosha2.hpp b/xmrstak/picosha2/picosha2.hpp
new file mode 100644
index 000000000..b9daec65f
--- /dev/null
+++ b/xmrstak/picosha2/picosha2.hpp
@@ -0,0 +1,375 @@
+/*
+The MIT License (MIT)
+
+Copyright (C) 2017 okdshin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef PICOSHA2_H
+#define PICOSHA2_H
+// picosha2:20140213
+
+#ifndef PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR
+#define PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR \
+    1048576  //=1024*1024: default is 1MB memory
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <sstream>
+#include <vector>
+
+namespace picosha2 {
+typedef unsigned long word_t;  // >= 32 bits, possibly 64: results are masked back to 32 bits
+typedef unsigned char byte_t;
+
+static const size_t k_digest_size = 32;  // digest length in bytes (256 bits)
+
+namespace detail {
+inline byte_t mask_8bit(byte_t x) { return x & 0xff; }  // keep the low 8 bits
+
+inline word_t mask_32bit(word_t x) { return x & 0xffffffff; }  // keep the low 32 bits
+
+const word_t add_constant[64] = {  // SHA-256 round constants K[0..63]
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
+
+const word_t initial_message_digest[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372,
+                                          0xa54ff53a, 0x510e527f, 0x9b05688c,
+                                          0x1f83d9ab, 0x5be0cd19};  // SHA-256 initial hash value
+
+inline word_t ch(word_t x, word_t y, word_t z) { return (x & y) ^ ((~x) & z); }  // each bit of x picks y or z
+
+inline word_t maj(word_t x, word_t y, word_t z) {
+    return (x & y) ^ (x & z) ^ (y & z);  // bitwise majority of x, y, z
+}
+
+inline word_t rotr(word_t x, std::size_t n) {
+    assert(n < 32);
+    return mask_32bit((x >> n) | (x << (32 - n)));  // 32-bit rotate right; x must have no bits above bit 31
+}
+
+inline word_t bsig0(word_t x) { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22); }  // big sigma 0
+
+inline word_t bsig1(word_t x) { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25); }  // big sigma 1
+
+inline word_t shr(word_t x, std::size_t n) {
+    assert(n < 32);
+    return x >> n;  // word_t is unsigned, so the shift is zero-filled
+}
+
+inline word_t ssig0(word_t x) { return rotr(x, 7) ^ rotr(x, 18) ^ shr(x, 3); }  // small sigma 0
+
+inline word_t ssig1(word_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ shr(x, 10); }  // small sigma 1
+// Folds the 64-byte block [first, last) into the 8-word state at message_digest, in place.
+template <typename RaIter1, typename RaIter2>
+void hash256_block(RaIter1 message_digest, RaIter2 first, RaIter2 last) {
+    assert(first + 64 == last);  // exactly one 64-byte block
+    static_cast<void>(last);  // keeps last "used" when NDEBUG removes the assert
+    word_t w[64];
+    std::fill(w, w + 64, 0);
+    for (std::size_t i = 0; i < 16; ++i) {  // w[0..15]: the block as big-endian 32-bit words
+        w[i] = (static_cast<word_t>(mask_8bit(*(first + i * 4))) << 24) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 1))) << 16) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 2))) << 8) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 3))));
+    }
+    for (std::size_t i = 16; i < 64; ++i) {  // w[16..63]: derived from earlier schedule words
+        w[i] = mask_32bit(ssig1(w[i - 2]) + w[i - 7] + ssig0(w[i - 15]) +
+                          w[i - 16]);
+    }
+
+    word_t a = *message_digest;  // working variables a..h start from the current state
+    word_t b = *(message_digest + 1);
+    word_t c = *(message_digest + 2);
+    word_t d = *(message_digest + 3);
+    word_t e = *(message_digest + 4);
+    word_t f = *(message_digest + 5);
+    word_t g = *(message_digest + 6);
+    word_t h = *(message_digest + 7);
+
+    for (std::size_t i = 0; i < 64; ++i) {  // 64 rounds, one constant and one schedule word each
+        word_t temp1 = h + bsig1(e) + ch(e, f, g) + add_constant[i] + w[i];
+        word_t temp2 = bsig0(a) + maj(a, b, c);
+        h = g;
+        g = f;
+        f = e;
+        e = mask_32bit(d + temp1);  // masked because the sums may exceed 32 bits in word_t
+        d = c;
+        c = b;
+        b = a;
+        a = mask_32bit(temp1 + temp2);
+    }
+    *message_digest += a;  // add the round output back into the state...
+    *(message_digest + 1) += b;
+    *(message_digest + 2) += c;
+    *(message_digest + 3) += d;
+    *(message_digest + 4) += e;
+    *(message_digest + 5) += f;
+    *(message_digest + 6) += g;
+    *(message_digest + 7) += h;
+    for (std::size_t i = 0; i < 8; ++i) {  // ...then truncate every state word to 32 bits
+        *(message_digest + i) = mask_32bit(*(message_digest + i));
+    }
+}
+
+}  // namespace detail
+
+template <typename InIter>
+void output_hex(InIter first, InIter last, std::ostream& os) {  // writes [first, last) to os as zero-padded hex
+    os.setf(std::ios::hex, std::ios::basefield);
+    while (first != last) {
+        os.width(2);  // width is reset by every insertion, so set it per element
+        os.fill('0');
+        os << static_cast<unsigned int>(*first);  // as a number, not as a character
+        ++first;
+    }
+    os.setf(std::ios::dec, std::ios::basefield);  // NOTE: forces decimal (not the caller's base); fill stays '0'
+}
+
+template <typename InIter>
+void bytes_to_hex_string(InIter first, InIter last, std::string& hex_str) {  // hex_str is overwritten
+    std::ostringstream oss;
+    output_hex(first, last, oss);
+    hex_str.assign(oss.str());
+}
+
+template <typename InContainer>
+void bytes_to_hex_string(const InContainer& bytes, std::string& hex_str) {  // whole-container form
+    bytes_to_hex_string(bytes.begin(), bytes.end(), hex_str);
+}
+
+template <typename InIter>
+std::string bytes_to_hex_string(InIter first, InIter last) {  // returns the hex text instead of assigning it
+    std::string hex_str;
+    bytes_to_hex_string(first, last, hex_str);
+    return hex_str;
+}
+
+template <typename InContainer>
+std::string bytes_to_hex_string(const InContainer& bytes) {  // whole-container form, returning the hex text
+    std::string hex_str;
+    bytes_to_hex_string(bytes, hex_str);
+    return hex_str;
+}
+
+class hash256_one_by_one {  // incremental hasher: process() any number of times, then finish()
+   public:
+    hash256_one_by_one() { init(); }
+
+    void init() {  // reset to the state of an empty message
+        buffer_.clear();
+        std::fill(data_length_digits_, data_length_digits_ + 4, 0);
+        std::copy(detail::initial_message_digest,
+                  detail::initial_message_digest + 8, h_);
+    }
+
+    template <typename RaIter>
+    void process(RaIter first, RaIter last) {  // absorb the bytes in [first, last)
+        add_to_data_length(std::distance(first, last));
+        std::copy(first, last, std::back_inserter(buffer_));
+        std::size_t i = 0;
+        for (; i + 64 <= buffer_.size(); i += 64) {  // hash every complete 64-byte block
+            detail::hash256_block(h_, buffer_.begin() + i,
+                                  buffer_.begin() + i + 64);
+        }
+        buffer_.erase(buffer_.begin(), buffer_.begin() + i);  // keep only the unhashed tail (< 64 bytes)
+    }
+
+    void finish() {  // append padding and the message length, then hash the final block(s)
+        byte_t temp[64];
+        std::fill(temp, temp + 64, 0);
+        std::size_t remains = buffer_.size();
+        std::copy(buffer_.begin(), buffer_.end(), temp);
+        temp[remains] = 0x80;  // padding begins with a single 1 bit
+
+        if (remains > 55) {  // the 8-byte length no longer fits: pad out one extra block
+            std::fill(temp + remains + 1, temp + 64, 0);
+            detail::hash256_block(h_, temp, temp + 64);
+            std::fill(temp, temp + 64 - 4, 0);
+        } else {
+            std::fill(temp + remains + 1, temp + 64 - 4, 0);
+        }
+
+        write_data_bit_length(&(temp[56]));  // bytes 56..63: message length in bits
+        detail::hash256_block(h_, temp, temp + 64);
+    }
+
+    template <typename OutIter>
+    void get_hash_bytes(OutIter first, OutIter last) const {  // emits each state word high byte first
+        for (const word_t* iter = h_; iter != h_ + 8; ++iter) {
+            for (std::size_t i = 0; i < 4 && first != last; ++i) {  // stops early if the output range is short
+                *(first++) = detail::mask_8bit(
+                    static_cast<byte_t>((*iter >> (24 - 8 * i))));
+            }
+        }
+    }
+
+   private:
+    void add_to_data_length(word_t n) {  // add n bytes to the running length
+        word_t carry = 0;
+        data_length_digits_[0] += n;
+        for (std::size_t i = 0; i < 4; ++i) {
+            data_length_digits_[i] += carry;
+            if (data_length_digits_[i] >= 65536u) {  // digit overflowed 16 bits: carry the excess upward
+                carry = data_length_digits_[i] >> 16;
+                data_length_digits_[i] &= 65535u;
+            } else {
+                break;  // no carry out of this digit, so the higher digits are unchanged
+            }
+        }
+    }
+    void write_data_bit_length(byte_t* begin) {  // writes 8 bytes at begin, most significant first
+        word_t data_bit_length_digits[4];
+        std::copy(data_length_digits_, data_length_digits_ + 4,
+                  data_bit_length_digits);
+
+        // convert byte length to bit length (multiply 8 or shift 3 times left)
+        word_t carry = 0;
+        for (std::size_t i = 0; i < 4; ++i) {
+            word_t before_val = data_bit_length_digits[i];
+            data_bit_length_digits[i] <<= 3;
+            data_bit_length_digits[i] |= carry;
+            data_bit_length_digits[i] &= 65535u;
+            carry = (before_val >> (16 - 3)) & 65535u;  // the top 3 bits shifted out of this digit
+        }
+
+        // write data_bit_length
+        for (int i = 3; i >= 0; --i) {  // highest digit first, high byte before low byte
+            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i] >> 8);
+            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i]);
+        }
+    }
+    std::vector<byte_t> buffer_;  // input bytes not yet hashed
+    word_t data_length_digits_[4];  // byte count as a 64-bit integer (4 x 16-bit digits, lowest digit first)
+    word_t h_[8];  // hash state; holds the digest once finish() has run
+};
+
+inline void get_hash_hex_string(const hash256_one_by_one& hasher,
+                                std::string& hex_str) {  // hex text of the hasher's current state words
+    byte_t hash[k_digest_size];
+    hasher.get_hash_bytes(hash, hash + k_digest_size);
+    return bytes_to_hex_string(hash, hash + k_digest_size, hex_str);  // void call; the result lands in hex_str
+}
+
+inline std::string get_hash_hex_string(const hash256_one_by_one& hasher) {  // same, returned by value
+    std::string hex_str;
+    get_hash_hex_string(hasher, hex_str);
+    return hex_str;
+}
+
+namespace impl {
+template <typename RaIter, typename OutIter>
+void hash256_impl(RaIter first, RaIter last, OutIter first2, OutIter last2, int,  // buffer size is unused here
+                  std::random_access_iterator_tag) {
+    hash256_one_by_one hasher;
+    // no init() needed: the constructor already called it
+    hasher.process(first, last);  // the whole range goes through a single process() call
+    hasher.finish();
+    hasher.get_hash_bytes(first2, last2);
+}
+
+template <typename InputIter, typename OutIter>
+void hash256_impl(InputIter first, InputIter last, OutIter first2,
+                  OutIter last2, int buffer_size, std::input_iterator_tag) {  // chunked path
+    std::vector<byte_t> buffer(buffer_size);
+    hash256_one_by_one hasher;
+    // no init() needed: the constructor already called it
+    while (first != last) {
+        int size = buffer_size;
+        for (int i = 0; i != buffer_size; ++i, ++first) {  // fill one chunk of up to buffer_size bytes
+            if (first == last) {  // input ended mid-chunk: only the first i bytes are valid
+                size = i;
+                break;
+            }
+            buffer[i] = *first;
+        }
+        hasher.process(buffer.begin(), buffer.begin() + size);
+    }
+    hasher.finish();
+    hasher.get_hash_bytes(first2, last2);
+}
+}  // namespace impl
+
+template <typename InIter, typename OutIter>
+void hash256(InIter first, InIter last, OutIter first2, OutIter last2,
+             int buffer_size = PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR) {
+    picosha2::impl::hash256_impl(
+        first, last, first2, last2, buffer_size,
+        typename std::iterator_traits<InIter>::iterator_category());  // tag selects the hash256_impl overload
+}
+
+template <typename InIter, typename OutContainer>
+void hash256(InIter first, InIter last, OutContainer& dst) {  // dst is written in place, never resized
+    hash256(first, last, dst.begin(), dst.end());
+}
+
+template <typename InContainer, typename OutIter>
+void hash256(const InContainer& src, OutIter first, OutIter last) {  // whole-container input
+    hash256(src.begin(), src.end(), first, last);
+}
+
+template <typename InContainer, typename OutContainer>
+void hash256(const InContainer& src, OutContainer& dst) {  // container in, container out (dst not resized)
+    hash256(src.begin(), src.end(), dst.begin(), dst.end());
+}
+
+template <typename InIter>
+void hash256_hex_string(InIter first, InIter last, std::string& hex_str) {  // hex_str receives the digest as hex
+    byte_t hashed[k_digest_size];
+    hash256(first, last, hashed, hashed + k_digest_size);
+    std::ostringstream oss;
+    output_hex(hashed, hashed + k_digest_size, oss);
+    hex_str.assign(oss.str());
+}
+
+template <typename InIter>
+std::string hash256_hex_string(InIter first, InIter last) {  // same, returned by value
+    std::string hex_str;
+    hash256_hex_string(first, last, hex_str);
+    return hex_str;
+}
+
+inline void hash256_hex_string(const std::string& src, std::string& hex_str) {  // hashes the bytes of src
+    hash256_hex_string(src.begin(), src.end(), hex_str);
+}
+
+template <typename InContainer>
+void hash256_hex_string(const InContainer& src, std::string& hex_str) {  // whole-container input
+    hash256_hex_string(src.begin(), src.end(), hex_str);
+}
+
+template <typename InContainer>
+std::string hash256_hex_string(const InContainer& src) {  // whole-container input, returned by value
+    return hash256_hex_string(src.begin(), src.end());
+}
+
+}  // namespace picosha2
+
+#endif  // PICOSHA2_H
\ No newline at end of file
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
new file mode 100644
index 000000000..e86e2a537
--- /dev/null
+++ b/xmrstak/pools.tpl
@@ -0,0 +1,53 @@
+R"===(// generated by XMRSTAK_VERSION
+
+/*
+ * pool_address    - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported.
+ * wallet_address  - Your wallet, or pool login.
+ * rig_id          - Rig identifier for pool-side statistics (needs pool support).
+ * pool_password   - Can be empty in most cases or "x".
+ * use_nicehash    - Limit the nonce to 3 bytes as required by nicehash.
+ * use_tls         - This option will make us connect using Transport Layer Security.
+ * tls_fingerprint - Server's SHA256 fingerprint. If this string is non-empty then we will check the server's cert against it.
+ * pool_weight     - Pool weight is a number telling the miner how important the pool is. Miner will mine mostly at the pool
+ *                   with the highest weight, unless the pool fails. Weight must be an integer larger than 0.
+ *
+ * We feature pools up to 1MH/s. For a more complete list see M5M400's pool list at www.moneropools.com
+ */
+
+"pool_list" :
+[
+POOLCONF],
+
+/*
+ * Currency to mine. Supported values:
+ *
+ *    aeon7 (use this for Aeon's new PoW)
+ *    bbscoin (automatic switch with block version 3 to cryptonight_v7)
+ *    bittube (uses cryptonight_bittube2 algorithm)
+ *    graft
+ *    haven (automatic switch with block version 3 to cryptonight_haven)
+ *    intense
+ *    masari
+ *    monero (use this to support Monero's Oct 2018 fork)
+ *    qrl - Quantum Resistant Ledger
+ *    ryo
+ *    turtlecoin
+ *
+ * Native algorithms which do not depend on any block version:
+ *
+ *    # 1MiB scratchpad memory
+ *    cryptonight_lite
+ *    cryptonight_lite_v7
+ *    cryptonight_lite_v7_xor (algorithm used by ipbc)
+ *    # 2MiB scratchpad memory
+ *    cryptonight
+ *    cryptonight_v7
+ *    cryptonight_v8
+ *    # 4MiB scratchpad memory
+ *    cryptonight_bittube2
+ *    cryptonight_haven
+ *    cryptonight_heavy
+ */
+
+"currency" : "CURRENCY",
+)==="
diff --git a/xmrstak/rapidjson/allocators.h b/xmrstak/rapidjson/allocators.h
index 98affe03f..cf5f5366a 100644
--- a/xmrstak/rapidjson/allocators.h
+++ b/xmrstak/rapidjson/allocators.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_ALLOCATORS_H_
@@ -24,10 +24,10 @@ RAPIDJSON_NAMESPACE_BEGIN
 
 /*! \class rapidjson::Allocator
     \brief Concept for allocating, resizing and freeing memory block.
-    
+
     Note that Malloc() and Realloc() are non-static but Free() is static.
-    
-    So if an allocator need to support Free(), it needs to put its pointer in 
+
+    So if an allocator need to support Free(), it needs to put its pointer in
     the header of memory block.
 
 \code
@@ -62,7 +62,7 @@ concept Allocator {
 class CrtAllocator {
 public:
     static const bool kNeedFree = true;
-    void* Malloc(size_t size) { 
+    void* Malloc(size_t size) {
         if (size) //  behavior of malloc(0) is implementation defined.
             return std::malloc(size);
         else
@@ -83,7 +83,7 @@ class CrtAllocator {
 // MemoryPoolAllocator
 
 //! Default memory allocator used by the parser and DOM.
-/*! This allocator allocate memory blocks from pre-allocated memory chunks. 
+/*! This allocator allocate memory blocks from pre-allocated memory chunks.
 
     It does not free memory blocks. And Realloc() only allocate new memory.
 
@@ -107,7 +107,7 @@ class MemoryPoolAllocator {
     /*! \param chunkSize The size of memory chunk. The default is kDefaultChunkSize.
         \param baseAllocator The allocator for allocating memory chunks.
     */
-    MemoryPoolAllocator(size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) : 
+    MemoryPoolAllocator(size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) :
         chunkHead_(0), chunk_capacity_(chunkSize), userBuffer_(0), baseAllocator_(baseAllocator), ownBaseAllocator_(0)
     {
     }
diff --git a/xmrstak/rapidjson/document.h b/xmrstak/rapidjson/document.h
index 895af88e5..48ff019f8 100644
--- a/xmrstak/rapidjson/document.h
+++ b/xmrstak/rapidjson/document.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_DOCUMENT_H_
@@ -75,8 +75,8 @@ class GenericDocument;
     But a compiler (IBM XL C/C++ for AIX) have reported to have problem with that so it moved as a namespace scope struct.
     https://code.google.com/p/rapidjson/issues/detail?id=64
 */
-template <typename Encoding, typename Allocator> 
-struct GenericMember { 
+template <typename Encoding, typename Allocator>
+struct GenericMember {
     GenericValue<Encoding, Allocator> name;     //!< name of member (must be a string)
     GenericValue<Encoding, Allocator> value;    //!< value of member.
 };
@@ -418,7 +418,7 @@ namespace internal {
 template <typename ValueType, typename T>
 struct TypeHelper {};
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, bool> {
     static bool Is(const ValueType& v) { return v.IsBool(); }
     static bool Get(const ValueType& v) { return v.GetBool(); }
@@ -426,7 +426,7 @@ struct TypeHelper<ValueType, bool> {
     static ValueType& Set(ValueType& v, bool data, typename ValueType::AllocatorType&) { return v.SetBool(data); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, int> {
     static bool Is(const ValueType& v) { return v.IsInt(); }
     static int Get(const ValueType& v) { return v.GetInt(); }
@@ -434,7 +434,7 @@ struct TypeHelper<ValueType, int> {
     static ValueType& Set(ValueType& v, int data, typename ValueType::AllocatorType&) { return v.SetInt(data); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, unsigned> {
     static bool Is(const ValueType& v) { return v.IsUint(); }
     static unsigned Get(const ValueType& v) { return v.GetUint(); }
@@ -442,7 +442,7 @@ struct TypeHelper<ValueType, unsigned> {
     static ValueType& Set(ValueType& v, unsigned data, typename ValueType::AllocatorType&) { return v.SetUint(data); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, int64_t> {
     static bool Is(const ValueType& v) { return v.IsInt64(); }
     static int64_t Get(const ValueType& v) { return v.GetInt64(); }
@@ -450,7 +450,7 @@ struct TypeHelper<ValueType, int64_t> {
     static ValueType& Set(ValueType& v, int64_t data, typename ValueType::AllocatorType&) { return v.SetInt64(data); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, uint64_t> {
     static bool Is(const ValueType& v) { return v.IsUint64(); }
     static uint64_t Get(const ValueType& v) { return v.GetUint64(); }
@@ -458,7 +458,7 @@ struct TypeHelper<ValueType, uint64_t> {
     static ValueType& Set(ValueType& v, uint64_t data, typename ValueType::AllocatorType&) { return v.SetUint64(data); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, double> {
     static bool Is(const ValueType& v) { return v.IsDouble(); }
     static double Get(const ValueType& v) { return v.GetDouble(); }
@@ -466,7 +466,7 @@ struct TypeHelper<ValueType, double> {
     static ValueType& Set(ValueType& v, double data, typename ValueType::AllocatorType&) { return v.SetDouble(data); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, float> {
     static bool Is(const ValueType& v) { return v.IsFloat(); }
     static float Get(const ValueType& v) { return v.GetFloat(); }
@@ -474,7 +474,7 @@ struct TypeHelper<ValueType, float> {
     static ValueType& Set(ValueType& v, float data, typename ValueType::AllocatorType&) { return v.SetFloat(data); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, const typename ValueType::Ch*> {
     typedef const typename ValueType::Ch* StringType;
     static bool Is(const ValueType& v) { return v.IsString(); }
@@ -484,7 +484,7 @@ struct TypeHelper<ValueType, const typename ValueType::Ch*> {
 };
 
 #if RAPIDJSON_HAS_STDSTRING
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, std::basic_string<typename ValueType::Ch> > {
     typedef std::basic_string<typename ValueType::Ch> StringType;
     static bool Is(const ValueType& v) { return v.IsString(); }
@@ -493,7 +493,7 @@ struct TypeHelper<ValueType, std::basic_string<typename ValueType::Ch> > {
 };
 #endif
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, typename ValueType::Array> {
     typedef typename ValueType::Array ArrayType;
     static bool Is(const ValueType& v) { return v.IsArray(); }
@@ -502,14 +502,14 @@ struct TypeHelper<ValueType, typename ValueType::Array> {
     static ValueType& Set(ValueType& v, ArrayType data, typename ValueType::AllocatorType&) { return v = data; }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, typename ValueType::ConstArray> {
     typedef typename ValueType::ConstArray ArrayType;
     static bool Is(const ValueType& v) { return v.IsArray(); }
     static ArrayType Get(const ValueType& v) { return v.GetArray(); }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, typename ValueType::Object> {
     typedef typename ValueType::Object ObjectType;
     static bool Is(const ValueType& v) { return v.IsObject(); }
@@ -518,7 +518,7 @@ struct TypeHelper<ValueType, typename ValueType::Object> {
     static ValueType& Set(ValueType& v, ObjectType data, typename ValueType::AllocatorType&) { v = data; }
 };
 
-template<typename ValueType> 
+template<typename ValueType>
 struct TypeHelper<ValueType, typename ValueType::ConstObject> {
     typedef typename ValueType::ConstObject ObjectType;
     static bool Is(const ValueType& v) { return v.IsObject(); }
@@ -544,7 +544,7 @@ template <bool, typename> class GenericObject;
     \tparam Encoding    Encoding of the value. (Even non-string values need to have the same encoding in a document)
     \tparam Allocator   Allocator type for allocating memory of object, array and string.
 */
-template <typename Encoding, typename Allocator = MemoryPoolAllocator<> > 
+template <typename Encoding, typename Allocator = MemoryPoolAllocator<> >
 class GenericValue {
 public:
     //! Name-value pair in an object.
@@ -685,7 +685,7 @@ class GenericValue {
 
     //! Constructor for unsigned value.
     explicit GenericValue(unsigned u) RAPIDJSON_NOEXCEPT : data_() {
-        data_.n.u64 = u; 
+        data_.n.u64 = u;
         data_.f.flags = (u & 0x80000000) ? kNumberUintFlag : (kNumberUintFlag | kIntFlag | kInt64Flag);
     }
 
@@ -907,14 +907,14 @@ class GenericValue {
         switch (GetType()) {
         case kObjectType: // Warning: O(n^2) inner-loop
             if (data_.o.size != rhs.data_.o.size)
-                return false;           
+                return false;
             for (ConstMemberIterator lhsMemberItr = MemberBegin(); lhsMemberItr != MemberEnd(); ++lhsMemberItr) {
                 typename RhsType::ConstMemberIterator rhsMemberItr = rhs.FindMember(lhsMemberItr->name);
                 if (rhsMemberItr == rhs.MemberEnd() || lhsMemberItr->value != rhsMemberItr->value)
                     return false;
             }
             return true;
-            
+
         case kArrayType:
             if (data_.a.size != rhs.data_.a.size)
                 return false;
@@ -1385,7 +1385,7 @@ class GenericValue {
         \note Linear time complexity.
     */
     void RemoveAllMembers() {
-        RAPIDJSON_ASSERT(IsObject()); 
+        RAPIDJSON_ASSERT(IsObject());
         for (MemberIterator m = MemberBegin(); m != MemberEnd(); ++m)
             m->~Member();
         data_.o.size = 0;
@@ -1531,7 +1531,7 @@ class GenericValue {
         \note Linear time complexity.
     */
     void Clear() {
-        RAPIDJSON_ASSERT(IsArray()); 
+        RAPIDJSON_ASSERT(IsArray());
         GenericValue* e = GetElementsPointer();
         for (GenericValue* v = e; v != e + data_.a.size; ++v)
             v->~GenericValue();
@@ -1678,7 +1678,7 @@ class GenericValue {
         RAPIDJSON_ASSERT(last <= End());
         ValueIterator pos = Begin() + (first - Begin());
         for (ValueIterator itr = pos; itr != last; ++itr)
-            itr->~GenericValue();       
+            itr->~GenericValue();
         std::memmove(pos, last, static_cast<size_t>(End() - last) * sizeof(GenericValue));
         data_.a.size -= static_cast<SizeType>(last - first);
         return pos;
@@ -1737,7 +1737,7 @@ class GenericValue {
 
     //! Set this value as a string without copying source string.
     /*! This version has better performance with supplied length, and also support string containing null character.
-        \param s source string pointer. 
+        \param s source string pointer.
         \param length The length of source string, excluding the trailing null terminator.
         \return The value itself for fluent API.
         \post IsString() == true && GetString() == s && GetStringLength() == length
@@ -1754,7 +1754,7 @@ class GenericValue {
 
     //! Set this value as a string by copying from source string.
     /*! This version has better performance with supplied length, and also support string containing null character.
-        \param s source string. 
+        \param s source string.
         \param length The length of source string, excluding the trailing null terminator.
         \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator().
         \return The value itself for fluent API.
@@ -1763,7 +1763,7 @@ class GenericValue {
     GenericValue& SetString(const Ch* s, SizeType length, Allocator& allocator) { this->~GenericValue(); SetStringRaw(StringRef(s, length), allocator); return *this; }
 
     //! Set this value as a string by copying from source string.
-    /*! \param s source string. 
+    /*! \param s source string.
         \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator().
         \return The value itself for fluent API.
         \post IsString() == true && GetString() != s && strcmp(GetString(),s) == 0 && GetStringLength() == length
@@ -1840,10 +1840,10 @@ class GenericValue {
                 if (RAPIDJSON_UNLIKELY(!v->Accept(handler)))
                     return false;
             return handler.EndArray(data_.a.size);
-    
+
         case kStringType:
             return handler.String(GetString(), GetStringLength(), (data_.f.flags & kCopyFlag) != 0);
-    
+
         default:
             RAPIDJSON_ASSERT(GetType() == kNumberType);
             if (IsDouble())         return handler.Double(data_.n.d);
@@ -2060,7 +2060,7 @@ class GenericValue {
 typedef GenericValue<UTF8<> > Value;
 
 ///////////////////////////////////////////////////////////////////////////////
-// GenericDocument 
+// GenericDocument
 
 //! A document for parsing JSON text as DOM.
 /*!
@@ -2092,12 +2092,12 @@ class GenericDocument : public GenericValue<Encoding, Allocator> {
     }
 
     //! Constructor
-    /*! Creates an empty document which type is Null. 
+    /*! Creates an empty document which type is Null.
         \param allocator        Optional allocator for allocating memory.
         \param stackCapacity    Optional initial capacity of stack in bytes.
         \param stackAllocator   Optional allocator for allocating memory for stack.
     */
-    GenericDocument(Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) : 
+    GenericDocument(Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) :
         allocator_(allocator), ownAllocator_(0), stack_(stackAllocator, stackCapacity), parseResult_()
     {
         if (!allocator_)
@@ -2303,7 +2303,7 @@ class GenericDocument : public GenericValue<Encoding, Allocator> {
     GenericDocument& Parse(const Ch* str, size_t length) {
         return Parse<parseFlags, Encoding>(str, length);
     }
-    
+
     GenericDocument& Parse(const Ch* str, size_t length) {
         return Parse<kParseDefaultFlags>(str, length);
     }
@@ -2323,7 +2323,7 @@ class GenericDocument : public GenericValue<Encoding, Allocator> {
     GenericDocument& Parse(const std::basic_string<Ch>& str) {
         return Parse<kParseDefaultFlags>(str);
     }
-#endif // RAPIDJSON_HAS_STDSTRING    
+#endif // RAPIDJSON_HAS_STDSTRING
 
     //!@}
 
@@ -2388,16 +2388,16 @@ class GenericDocument : public GenericValue<Encoding, Allocator> {
     bool Uint64(uint64_t i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; }
     bool Double(double d) { new (stack_.template Push<ValueType>()) ValueType(d); return true; }
 
-    bool RawNumber(const Ch* str, SizeType length, bool copy) { 
-        if (copy) 
+    bool RawNumber(const Ch* str, SizeType length, bool copy) {
+        if (copy)
             new (stack_.template Push<ValueType>()) ValueType(str, length, GetAllocator());
         else
             new (stack_.template Push<ValueType>()) ValueType(str, length);
         return true;
     }
 
-    bool String(const Ch* str, SizeType length, bool copy) { 
-        if (copy) 
+    bool String(const Ch* str, SizeType length, bool copy) {
+        if (copy)
             new (stack_.template Push<ValueType>()) ValueType(str, length, GetAllocator());
         else
             new (stack_.template Push<ValueType>()) ValueType(str, length);
@@ -2405,7 +2405,7 @@ class GenericDocument : public GenericValue<Encoding, Allocator> {
     }
 
     bool StartObject() { new (stack_.template Push<ValueType>()) ValueType(kObjectType); return true; }
-    
+
     bool Key(const Ch* str, SizeType length, bool copy) { return String(str, length, copy); }
 
     bool EndObject(SizeType memberCount) {
@@ -2415,7 +2415,7 @@ class GenericDocument : public GenericValue<Encoding, Allocator> {
     }
 
     bool StartArray() { new (stack_.template Push<ValueType>()) ValueType(kArrayType); return true; }
-    
+
     bool EndArray(SizeType elementCount) {
         ValueType* elements = stack_.template Pop<ValueType>(elementCount);
         stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator());
diff --git a/xmrstak/rapidjson/encodedstream.h b/xmrstak/rapidjson/encodedstream.h
index 145068386..a294617be 100644
--- a/xmrstak/rapidjson/encodedstream.h
+++ b/xmrstak/rapidjson/encodedstream.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_ENCODEDSTREAM_H_
@@ -41,7 +41,7 @@ class EncodedInputStream {
 public:
     typedef typename Encoding::Ch Ch;
 
-    EncodedInputStream(InputByteStream& is) : is_(is) { 
+    EncodedInputStream(InputByteStream& is) : is_(is) {
         current_ = Encoding::TakeBOM(is_);
     }
 
@@ -51,7 +51,7 @@ class EncodedInputStream {
 
     // Not implemented
     void Put(Ch) { RAPIDJSON_ASSERT(false); }
-    void Flush() { RAPIDJSON_ASSERT(false); } 
+    void Flush() { RAPIDJSON_ASSERT(false); }
     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
 
@@ -80,7 +80,7 @@ class EncodedInputStream<UTF8<>, MemoryStream> {
 
     // Not implemented
     void Put(Ch) {}
-    void Flush() {} 
+    void Flush() {}
     Ch* PutBegin() { return 0; }
     size_t PutEnd(Ch*) { return 0; }
 
@@ -102,7 +102,7 @@ class EncodedOutputStream {
 public:
     typedef typename Encoding::Ch Ch;
 
-    EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) { 
+    EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
         if (putBOM)
             Encoding::PutBOM(os_);
     }
@@ -143,7 +143,7 @@ class AutoUTFInputStream {
         \param type UTF encoding type if it is not detected from the stream.
     */
     AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
-        RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);        
+        RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
         DetectType();
         static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
         takeFunc_ = f[type_];
@@ -159,7 +159,7 @@ class AutoUTFInputStream {
 
     // Not implemented
     void Put(Ch) { RAPIDJSON_ASSERT(false); }
-    void Flush() { RAPIDJSON_ASSERT(false); } 
+    void Flush() { RAPIDJSON_ASSERT(false); }
     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
 
@@ -258,7 +258,7 @@ class AutoUTFOutputStream {
     UTFType GetType() const { return type_; }
 
     void Put(Ch c) { putFunc_(*os_, c); }
-    void Flush() { os_->Flush(); } 
+    void Flush() { os_->Flush(); }
 
     // Not implemented
     Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
@@ -271,7 +271,7 @@ class AutoUTFOutputStream {
     AutoUTFOutputStream(const AutoUTFOutputStream&);
     AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
 
-    void PutBOM() { 
+    void PutBOM() {
         typedef void (*PutBOMFunc)(OutputByteStream&);
         static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
         f[type_](*os_);
diff --git a/xmrstak/rapidjson/encodings.h b/xmrstak/rapidjson/encodings.h
index baa7c2b17..9b4da1da4 100644
--- a/xmrstak/rapidjson/encodings.h
+++ b/xmrstak/rapidjson/encodings.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_ENCODINGS_H_
@@ -100,7 +100,7 @@ struct UTF8 {
 
     template<typename OutputStream>
     static void Encode(OutputStream& os, unsigned codepoint) {
-        if (codepoint <= 0x7F) 
+        if (codepoint <= 0x7F)
             os.Put(static_cast<Ch>(codepoint & 0xFF));
         else if (codepoint <= 0x7FF) {
             os.Put(static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
@@ -122,7 +122,7 @@ struct UTF8 {
 
     template<typename OutputStream>
     static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
-        if (codepoint <= 0x7F) 
+        if (codepoint <= 0x7F)
             PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
         else if (codepoint <= 0x7FF) {
             PutUnsafe(os, static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
@@ -276,7 +276,7 @@ struct UTF16 {
     static void Encode(OutputStream& os, unsigned codepoint) {
         RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
         if (codepoint <= 0xFFFF) {
-            RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair 
+            RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
             os.Put(static_cast<typename OutputStream::Ch>(codepoint));
         }
         else {
@@ -292,7 +292,7 @@ struct UTF16 {
     static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
         RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
         if (codepoint <= 0xFFFF) {
-            RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair 
+            RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
             PutUnsafe(os, static_cast<typename OutputStream::Ch>(codepoint));
         }
         else {
@@ -406,7 +406,7 @@ struct UTF16BE : UTF16<CharType> {
 ///////////////////////////////////////////////////////////////////////////////
 // UTF32
 
-//! UTF-32 encoding. 
+//! UTF-32 encoding.
 /*! http://en.wikipedia.org/wiki/UTF-32
     \tparam CharType Type for storing 32-bit UTF-32 data. Default is unsigned. C++11 may use char32_t instead.
     \note implements Encoding concept
@@ -498,7 +498,7 @@ struct UTF32BE : UTF32<CharType> {
     static CharType TakeBOM(InputByteStream& is) {
         RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
         CharType c = Take(is);
-        return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c; 
+        return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
     }
 
     template <typename InputByteStream>
@@ -694,13 +694,13 @@ struct Transcoder<Encoding, Encoding> {
         os.Put(is.Take());  // Just copy one code unit. This semantic is different from primary template class.
         return true;
     }
-    
+
     template<typename InputStream, typename OutputStream>
     RAPIDJSON_FORCEINLINE static bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
         PutUnsafe(os, is.Take());  // Just copy one code unit. This semantic is different from primary template class.
         return true;
     }
-    
+
     template<typename InputStream, typename OutputStream>
     RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
         return Encoding::Validate(is, os);  // source/target encoding are the same
diff --git a/xmrstak/rapidjson/error/en.h b/xmrstak/rapidjson/error/en.h
index 2db838bff..3f87159a0 100644
--- a/xmrstak/rapidjson/error/en.h
+++ b/xmrstak/rapidjson/error/en.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_ERROR_EN_H_
@@ -39,13 +39,13 @@ inline const RAPIDJSON_ERROR_CHARTYPE* GetParseError_En(ParseErrorCode parseErro
 
         case kParseErrorDocumentEmpty:                  return RAPIDJSON_ERROR_STRING("The document is empty.");
         case kParseErrorDocumentRootNotSingular:        return RAPIDJSON_ERROR_STRING("The document root must not be followed by other values.");
-    
+
         case kParseErrorValueInvalid:                   return RAPIDJSON_ERROR_STRING("Invalid value.");
-    
+
         case kParseErrorObjectMissName:                 return RAPIDJSON_ERROR_STRING("Missing a name for object member.");
         case kParseErrorObjectMissColon:                return RAPIDJSON_ERROR_STRING("Missing a colon after a name of object member.");
         case kParseErrorObjectMissCommaOrCurlyBracket:  return RAPIDJSON_ERROR_STRING("Missing a comma or '}' after an object member.");
-    
+
         case kParseErrorArrayMissCommaOrSquareBracket:  return RAPIDJSON_ERROR_STRING("Missing a comma or ']' after an array element.");
 
         case kParseErrorStringUnicodeEscapeInvalidHex:  return RAPIDJSON_ERROR_STRING("Incorrect hex digit after \\u escape in string.");
diff --git a/xmrstak/rapidjson/error/error.h b/xmrstak/rapidjson/error/error.h
index 95cb31a72..e4c571479 100644
--- a/xmrstak/rapidjson/error/error.h
+++ b/xmrstak/rapidjson/error/error.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_ERROR_ERROR_H_
diff --git a/xmrstak/rapidjson/filereadstream.h b/xmrstak/rapidjson/filereadstream.h
index b56ea13b3..793c5dea2 100644
--- a/xmrstak/rapidjson/filereadstream.h
+++ b/xmrstak/rapidjson/filereadstream.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_FILEREADSTREAM_H_
@@ -41,7 +41,7 @@ class FileReadStream {
         \param buffer user-supplied buffer.
         \param bufferSize size of buffer in bytes. Must >=4 bytes.
     */
-    FileReadStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) { 
+    FileReadStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) {
         RAPIDJSON_ASSERT(fp_ != 0);
         RAPIDJSON_ASSERT(bufferSize >= 4);
         Read();
@@ -53,7 +53,7 @@ class FileReadStream {
 
     // Not implemented
     void Put(Ch) { RAPIDJSON_ASSERT(false); }
-    void Flush() { RAPIDJSON_ASSERT(false); } 
+    void Flush() { RAPIDJSON_ASSERT(false); }
     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
 
diff --git a/xmrstak/rapidjson/filewritestream.h b/xmrstak/rapidjson/filewritestream.h
index 6378dd60e..2d3dbab3b 100644
--- a/xmrstak/rapidjson/filewritestream.h
+++ b/xmrstak/rapidjson/filewritestream.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_FILEWRITESTREAM_H_
@@ -33,11 +33,11 @@ class FileWriteStream {
 public:
     typedef char Ch;    //!< Character type. Only support char.
 
-    FileWriteStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferEnd_(buffer + bufferSize), current_(buffer_) { 
+    FileWriteStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferEnd_(buffer + bufferSize), current_(buffer_) {
         RAPIDJSON_ASSERT(fp_ != 0);
     }
 
-    void Put(char c) { 
+    void Put(char c) {
         if (current_ >= bufferEnd_)
             Flush();
 
diff --git a/xmrstak/rapidjson/fwd.h b/xmrstak/rapidjson/fwd.h
index e8104e841..54480df08 100644
--- a/xmrstak/rapidjson/fwd.h
+++ b/xmrstak/rapidjson/fwd.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_FWD_H_
@@ -101,7 +101,7 @@ class PrettyWriter;
 
 // document.h
 
-template <typename Encoding, typename Allocator> 
+template <typename Encoding, typename Allocator>
 struct GenericMember;
 
 template <bool Const, typename Encoding, typename Allocator>
@@ -110,7 +110,7 @@ class GenericMemberIterator;
 template<typename CharType>
 struct GenericStringRef;
 
-template <typename Encoding, typename Allocator> 
+template <typename Encoding, typename Allocator>
 class GenericValue;
 
 typedef GenericValue<UTF8<char>, MemoryPoolAllocator<CrtAllocator> > Value;
diff --git a/xmrstak/rapidjson/internal/biginteger.h b/xmrstak/rapidjson/internal/biginteger.h
index 9d3e88c99..f0959928f 100644
--- a/xmrstak/rapidjson/internal/biginteger.h
+++ b/xmrstak/rapidjson/internal/biginteger.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_BIGINTEGER_H_
@@ -51,7 +51,7 @@ class BigInteger {
         if (length > 0)
             AppendDecimal64(decimals + i, decimals + i + length);
     }
-    
+
     BigInteger& operator=(const BigInteger &rhs)
     {
         if (this != &rhs) {
@@ -60,9 +60,9 @@ class BigInteger {
         }
         return *this;
     }
-    
+
     BigInteger& operator=(uint64_t u) {
-        digits_[0] = u;            
+        digits_[0] = u;
         count_ = 1;
         return *this;
     }
@@ -95,7 +95,7 @@ class BigInteger {
             digits_[i] = MulAdd64(digits_[i], u, k, &hi);
             k = hi;
         }
-        
+
         if (k > 0)
             PushBack(k);
 
@@ -118,7 +118,7 @@ class BigInteger {
             digits_[i] = (p0 & 0xFFFFFFFF) | (p1 << 32);
             k = p1 >> 32;
         }
-        
+
         if (k > 0)
             PushBack(k);
 
diff --git a/xmrstak/rapidjson/internal/diyfp.h b/xmrstak/rapidjson/internal/diyfp.h
index c9fefdc61..71a6d0265 100644
--- a/xmrstak/rapidjson/internal/diyfp.h
+++ b/xmrstak/rapidjson/internal/diyfp.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 // This is a C++ header-only implementation of Grisu2 algorithm from the publication:
@@ -56,7 +56,7 @@ struct DiyFp {
         if (biased_e != 0) {
             f = significand + kDpHiddenBit;
             e = biased_e - kDpExponentBias;
-        } 
+        }
         else {
             f = significand;
             e = kDpMinExponent + 1;
@@ -141,7 +141,7 @@ struct DiyFp {
             double d;
             uint64_t u64;
         }u;
-        const uint64_t be = (e == kDpDenormalExponent && (f & kDpHiddenBit) == 0) ? 0 : 
+        const uint64_t be = (e == kDpDenormalExponent && (f & kDpHiddenBit) == 0) ? 0 :
             static_cast<uint64_t>(e + kDpExponentBias);
         u.u64 = (f & kDpSignificandMask) | (be << kDpSignificandSize);
         return u.d;
@@ -222,7 +222,7 @@ inline DiyFp GetCachedPowerByIndex(size_t index) {
     };
     return DiyFp(kCachedPowers_F[index], kCachedPowers_E[index]);
 }
-    
+
 inline DiyFp GetCachedPower(int e, int* K) {
 
     //int k = static_cast<int>(ceil((-61 - e) * 0.30102999566398114)) + 374;
diff --git a/xmrstak/rapidjson/internal/dtoa.h b/xmrstak/rapidjson/internal/dtoa.h
index 8d6350e62..1edc39bf6 100644
--- a/xmrstak/rapidjson/internal/dtoa.h
+++ b/xmrstak/rapidjson/internal/dtoa.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 // This is a C++ header-only implementation of Grisu2 algorithm from the publication:
diff --git a/xmrstak/rapidjson/internal/ieee754.h b/xmrstak/rapidjson/internal/ieee754.h
index 82bb0b99e..32fdd2186 100644
--- a/xmrstak/rapidjson/internal/ieee754.h
+++ b/xmrstak/rapidjson/internal/ieee754.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_IEEE754_
diff --git a/xmrstak/rapidjson/internal/itoa.h b/xmrstak/rapidjson/internal/itoa.h
index 01a4e7e72..f2304a7fb 100644
--- a/xmrstak/rapidjson/internal/itoa.h
+++ b/xmrstak/rapidjson/internal/itoa.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_ITOA_
@@ -42,7 +42,7 @@ inline char* u32toa(uint32_t value, char* buffer) {
     if (value < 10000) {
         const uint32_t d1 = (value / 100) << 1;
         const uint32_t d2 = (value % 100) << 1;
-        
+
         if (value >= 1000)
             *buffer++ = cDigitsLut[d1];
         if (value >= 100)
@@ -55,13 +55,13 @@ inline char* u32toa(uint32_t value, char* buffer) {
         // value = bbbbcccc
         const uint32_t b = value / 10000;
         const uint32_t c = value % 10000;
-        
+
         const uint32_t d1 = (b / 100) << 1;
         const uint32_t d2 = (b % 100) << 1;
-        
+
         const uint32_t d3 = (c / 100) << 1;
         const uint32_t d4 = (c % 100) << 1;
-        
+
         if (value >= 10000000)
             *buffer++ = cDigitsLut[d1];
         if (value >= 1000000)
@@ -69,7 +69,7 @@ inline char* u32toa(uint32_t value, char* buffer) {
         if (value >= 100000)
             *buffer++ = cDigitsLut[d2];
         *buffer++ = cDigitsLut[d2 + 1];
-        
+
         *buffer++ = cDigitsLut[d3];
         *buffer++ = cDigitsLut[d3 + 1];
         *buffer++ = cDigitsLut[d4];
@@ -77,10 +77,10 @@ inline char* u32toa(uint32_t value, char* buffer) {
     }
     else {
         // value = aabbbbcccc in decimal
-        
+
         const uint32_t a = value / 100000000; // 1 to 42
         value %= 100000000;
-        
+
         if (a >= 10) {
             const unsigned i = a << 1;
             *buffer++ = cDigitsLut[i];
@@ -91,13 +91,13 @@ inline char* u32toa(uint32_t value, char* buffer) {
 
         const uint32_t b = value / 10000; // 0 to 9999
         const uint32_t c = value % 10000; // 0 to 9999
-        
+
         const uint32_t d1 = (b / 100) << 1;
         const uint32_t d2 = (b % 100) << 1;
-        
+
         const uint32_t d3 = (c / 100) << 1;
         const uint32_t d4 = (c % 100) << 1;
-        
+
         *buffer++ = cDigitsLut[d1];
         *buffer++ = cDigitsLut[d1 + 1];
         *buffer++ = cDigitsLut[d2];
@@ -131,13 +131,13 @@ inline char* u64toa(uint64_t value, char* buffer) {
     const uint64_t kTen14 = kTen8 * 1000000;
     const uint64_t kTen15 = kTen8 * 10000000;
     const uint64_t kTen16 = kTen8 * kTen8;
-    
+
     if (value < kTen8) {
         uint32_t v = static_cast<uint32_t>(value);
         if (v < 10000) {
             const uint32_t d1 = (v / 100) << 1;
             const uint32_t d2 = (v % 100) << 1;
-            
+
             if (v >= 1000)
                 *buffer++ = cDigitsLut[d1];
             if (v >= 100)
@@ -150,13 +150,13 @@ inline char* u64toa(uint64_t value, char* buffer) {
             // value = bbbbcccc
             const uint32_t b = v / 10000;
             const uint32_t c = v % 10000;
-            
+
             const uint32_t d1 = (b / 100) << 1;
             const uint32_t d2 = (b % 100) << 1;
-            
+
             const uint32_t d3 = (c / 100) << 1;
             const uint32_t d4 = (c % 100) << 1;
-            
+
             if (value >= 10000000)
                 *buffer++ = cDigitsLut[d1];
             if (value >= 1000000)
@@ -164,7 +164,7 @@ inline char* u64toa(uint64_t value, char* buffer) {
             if (value >= 100000)
                 *buffer++ = cDigitsLut[d2];
             *buffer++ = cDigitsLut[d2 + 1];
-            
+
             *buffer++ = cDigitsLut[d3];
             *buffer++ = cDigitsLut[d3 + 1];
             *buffer++ = cDigitsLut[d4];
@@ -174,22 +174,22 @@ inline char* u64toa(uint64_t value, char* buffer) {
     else if (value < kTen16) {
         const uint32_t v0 = static_cast<uint32_t>(value / kTen8);
         const uint32_t v1 = static_cast<uint32_t>(value % kTen8);
-        
+
         const uint32_t b0 = v0 / 10000;
         const uint32_t c0 = v0 % 10000;
-        
+
         const uint32_t d1 = (b0 / 100) << 1;
         const uint32_t d2 = (b0 % 100) << 1;
-        
+
         const uint32_t d3 = (c0 / 100) << 1;
         const uint32_t d4 = (c0 % 100) << 1;
 
         const uint32_t b1 = v1 / 10000;
         const uint32_t c1 = v1 % 10000;
-        
+
         const uint32_t d5 = (b1 / 100) << 1;
         const uint32_t d6 = (b1 % 100) << 1;
-        
+
         const uint32_t d7 = (c1 / 100) << 1;
         const uint32_t d8 = (c1 % 100) << 1;
 
@@ -209,7 +209,7 @@ inline char* u64toa(uint64_t value, char* buffer) {
             *buffer++ = cDigitsLut[d4];
         if (value >= kTen8)
             *buffer++ = cDigitsLut[d4 + 1];
-        
+
         *buffer++ = cDigitsLut[d5];
         *buffer++ = cDigitsLut[d5 + 1];
         *buffer++ = cDigitsLut[d6];
@@ -222,7 +222,7 @@ inline char* u64toa(uint64_t value, char* buffer) {
     else {
         const uint32_t a = static_cast<uint32_t>(value / kTen16); // 1 to 1844
         value %= kTen16;
-        
+
         if (a < 10)
             *buffer++ = static_cast<char>('0' + static_cast<char>(a));
         else if (a < 100) {
@@ -232,7 +232,7 @@ inline char* u64toa(uint64_t value, char* buffer) {
         }
         else if (a < 1000) {
             *buffer++ = static_cast<char>('0' + static_cast<char>(a / 100));
-            
+
             const uint32_t i = (a % 100) << 1;
             *buffer++ = cDigitsLut[i];
             *buffer++ = cDigitsLut[i + 1];
@@ -245,28 +245,28 @@ inline char* u64toa(uint64_t value, char* buffer) {
             *buffer++ = cDigitsLut[j];
             *buffer++ = cDigitsLut[j + 1];
         }
-        
+
         const uint32_t v0 = static_cast<uint32_t>(value / kTen8);
         const uint32_t v1 = static_cast<uint32_t>(value % kTen8);
-        
+
         const uint32_t b0 = v0 / 10000;
         const uint32_t c0 = v0 % 10000;
-        
+
         const uint32_t d1 = (b0 / 100) << 1;
         const uint32_t d2 = (b0 % 100) << 1;
-        
+
         const uint32_t d3 = (c0 / 100) << 1;
         const uint32_t d4 = (c0 % 100) << 1;
-        
+
         const uint32_t b1 = v1 / 10000;
         const uint32_t c1 = v1 % 10000;
-        
+
         const uint32_t d5 = (b1 / 100) << 1;
         const uint32_t d6 = (b1 % 100) << 1;
-        
+
         const uint32_t d7 = (c1 / 100) << 1;
         const uint32_t d8 = (c1 % 100) << 1;
-        
+
         *buffer++ = cDigitsLut[d1];
         *buffer++ = cDigitsLut[d1 + 1];
         *buffer++ = cDigitsLut[d2];
@@ -284,7 +284,7 @@ inline char* u64toa(uint64_t value, char* buffer) {
         *buffer++ = cDigitsLut[d8];
         *buffer++ = cDigitsLut[d8 + 1];
     }
-    
+
     return buffer;
 }
 
diff --git a/xmrstak/rapidjson/internal/meta.h b/xmrstak/rapidjson/internal/meta.h
index 5a9aaa428..6c74b315c 100644
--- a/xmrstak/rapidjson/internal/meta.h
+++ b/xmrstak/rapidjson/internal/meta.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_INTERNAL_META_H_
diff --git a/xmrstak/rapidjson/internal/pow10.h b/xmrstak/rapidjson/internal/pow10.h
index 02f475d70..b593982ac 100644
--- a/xmrstak/rapidjson/internal/pow10.h
+++ b/xmrstak/rapidjson/internal/pow10.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_POW10_
@@ -27,8 +27,8 @@ namespace internal {
 */
 inline double Pow10(int n) {
     static const double e[] = { // 1e-0...1e308: 309 * 8 bytes = 2472 bytes
-        1e+0,  
-        1e+1,  1e+2,  1e+3,  1e+4,  1e+5,  1e+6,  1e+7,  1e+8,  1e+9,  1e+10, 1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, 
+        1e+0,
+        1e+1,  1e+2,  1e+3,  1e+4,  1e+5,  1e+6,  1e+7,  1e+8,  1e+9,  1e+10, 1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20,
         1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26, 1e+27, 1e+28, 1e+29, 1e+30, 1e+31, 1e+32, 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38, 1e+39, 1e+40,
         1e+41, 1e+42, 1e+43, 1e+44, 1e+45, 1e+46, 1e+47, 1e+48, 1e+49, 1e+50, 1e+51, 1e+52, 1e+53, 1e+54, 1e+55, 1e+56, 1e+57, 1e+58, 1e+59, 1e+60,
         1e+61, 1e+62, 1e+63, 1e+64, 1e+65, 1e+66, 1e+67, 1e+68, 1e+69, 1e+70, 1e+71, 1e+72, 1e+73, 1e+74, 1e+75, 1e+76, 1e+77, 1e+78, 1e+79, 1e+80,
diff --git a/xmrstak/rapidjson/internal/regex.h b/xmrstak/rapidjson/internal/regex.h
index 8530cd771..fd151224c 100644
--- a/xmrstak/rapidjson/internal/regex.h
+++ b/xmrstak/rapidjson/internal/regex.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_INTERNAL_REGEX_H_
@@ -105,9 +105,9 @@ class GenericRegexSearch;
     - \c \\t Tab (U+0009)
     - \c \\v Vertical tab (U+000B)
 
-    \note This is a Thompson NFA engine, implemented with reference to 
-        Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).", 
-        https://swtch.com/~rsc/regexp/regexp1.html 
+    \note This is a Thompson NFA engine, implemented with reference to
+        Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).",
+        https://swtch.com/~rsc/regexp/regexp1.html
 */
 template <typename Encoding, typename Allocator = CrtAllocator>
 class GenericRegex {
@@ -116,8 +116,8 @@ class GenericRegex {
     typedef typename Encoding::Ch Ch;
     template <typename, typename> friend class GenericRegexSearch;
 
-    GenericRegex(const Ch* source, Allocator* allocator = 0) : 
-        states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), 
+    GenericRegex(const Ch* source, Allocator* allocator = 0) :
+        states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
         anchorBegin_(), anchorEnd_()
     {
         GenericStringStream<Encoding> ss(source);
@@ -146,7 +146,7 @@ class GenericRegex {
     static const unsigned kRangeNegationFlag = 0x80000000;
 
     struct Range {
-        unsigned start; // 
+        unsigned start; //
         unsigned end;
         SizeType next;
     };
@@ -391,7 +391,7 @@ class GenericRegex {
                 }
                 return false;
 
-            default: 
+            default:
                 RAPIDJSON_ASSERT(op == kOneOrMore);
                 if (operandStack.GetSize() >= sizeof(Frag)) {
                     Frag e = *operandStack.template Pop<Frag>(1);
@@ -544,7 +544,7 @@ class GenericRegex {
         }
         return false;
     }
-    
+
     SizeType NewRange(unsigned codepoint) {
         Range* r = ranges_.template Push<Range>();
         r->start = r->end = codepoint;
@@ -600,7 +600,7 @@ class GenericRegexSearch {
     typedef typename RegexType::EncodingType Encoding;
     typedef typename Encoding::Ch Ch;
 
-    GenericRegexSearch(const RegexType& regex, Allocator* allocator = 0) : 
+    GenericRegexSearch(const RegexType& regex, Allocator* allocator = 0) :
         regex_(regex), allocator_(allocator), ownAllocator_(0),
         state0_(allocator, 0), state1_(allocator, 0), stateSet_()
     {
@@ -659,7 +659,7 @@ class GenericRegexSearch {
             for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
                 const State& sr = regex_.GetState(*s);
                 if (sr.codepoint == codepoint ||
-                    sr.codepoint == RegexType::kAnyCharacterClass || 
+                    sr.codepoint == RegexType::kAnyCharacterClass ||
                     (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
                 {
                     matched = AddState(*next, sr.out) || matched;
diff --git a/xmrstak/rapidjson/internal/stack.h b/xmrstak/rapidjson/internal/stack.h
index 022c9aab4..5cb9d9c29 100644
--- a/xmrstak/rapidjson/internal/stack.h
+++ b/xmrstak/rapidjson/internal/stack.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_INTERNAL_STACK_H_
@@ -97,7 +97,7 @@ class Stack {
 
     void Clear() { stackTop_ = stack_; }
 
-    void ShrinkToFit() { 
+    void ShrinkToFit() {
         if (Empty()) {
             // If the stack is empty, completely deallocate the memory.
             Allocator::Free(stack_);
@@ -140,7 +140,7 @@ class Stack {
     }
 
     template<typename T>
-    T* Top() { 
+    T* Top() {
         RAPIDJSON_ASSERT(GetSize() >= sizeof(T));
         return reinterpret_cast<T*>(stackTop_ - sizeof(T));
     }
diff --git a/xmrstak/rapidjson/internal/strfunc.h b/xmrstak/rapidjson/internal/strfunc.h
index de41d8f9c..becf6cd73 100644
--- a/xmrstak/rapidjson/internal/strfunc.h
+++ b/xmrstak/rapidjson/internal/strfunc.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_INTERNAL_STRFUNC_H_
@@ -23,7 +23,7 @@ namespace internal {
 //! Custom strlen() which works on different character types.
 /*! \tparam Ch Character type (e.g. char, wchar_t, short)
     \param s Null-terminated input string.
-    \return Number of characters in the string. 
+    \return Number of characters in the string.
     \note This has the same semantics as strlen(), the return value is not number of Unicode codepoints.
 */
 template <typename Ch>
diff --git a/xmrstak/rapidjson/internal/strtod.h b/xmrstak/rapidjson/internal/strtod.h
index 289c413b0..8b010eb16 100644
--- a/xmrstak/rapidjson/internal/strtod.h
+++ b/xmrstak/rapidjson/internal/strtod.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_STRTOD_
@@ -128,14 +128,14 @@ inline bool StrtodFast(double d, int p, double* result) {
 // Compute an approximation and see if it is within 1/2 ULP
 inline bool StrtodDiyFp(const char* decimals, size_t length, size_t decimalPosition, int exp, double* result) {
     uint64_t significand = 0;
-    size_t i = 0;   // 2^64 - 1 = 18446744073709551615, 1844674407370955161 = 0x1999999999999999    
+    size_t i = 0;   // 2^64 - 1 = 18446744073709551615, 1844674407370955161 = 0x1999999999999999
     for (; i < length; i++) {
         if (significand  >  RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) ||
             (significand == RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) && decimals[i] > '5'))
             break;
         significand = significand * 10u + static_cast<unsigned>(decimals[i] - '0');
     }
-    
+
     if (i < length && decimals[i] >= '5') // Rounding
         significand++;
 
@@ -182,7 +182,7 @@ inline bool StrtodDiyFp(const char* decimals, size_t length, size_t decimalPosit
     if (precisionSize + kUlpShift >= 64) {
         unsigned scaleExp = (precisionSize + kUlpShift) - 63;
         v.f >>= scaleExp;
-        v.e += scaleExp; 
+        v.e += scaleExp;
         error = (error >> scaleExp) + 1 + static_cast<int>(kUlp);
         precisionSize -= scaleExp;
     }
diff --git a/xmrstak/rapidjson/istreamwrapper.h b/xmrstak/rapidjson/istreamwrapper.h
index f5fe28977..8007480e5 100644
--- a/xmrstak/rapidjson/istreamwrapper.h
+++ b/xmrstak/rapidjson/istreamwrapper.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_ISTREAMWRAPPER_H_
@@ -45,19 +45,19 @@ RAPIDJSON_NAMESPACE_BEGIN
 
     \tparam StreamType Class derived from \c std::basic_istream.
 */
-   
+
 template <typename StreamType>
 class BasicIStreamWrapper {
 public:
     typedef typename StreamType::char_type Ch;
     BasicIStreamWrapper(StreamType& stream) : stream_(stream), count_(), peekBuffer_() {}
 
-    Ch Peek() const { 
+    Ch Peek() const {
         typename StreamType::int_type c = stream_.peek();
         return RAPIDJSON_LIKELY(c != StreamType::traits_type::eof()) ? static_cast<Ch>(c) : '\0';
     }
 
-    Ch Take() { 
+    Ch Take() {
         typename StreamType::int_type c = stream_.get();
         if (RAPIDJSON_LIKELY(c != StreamType::traits_type::eof())) {
             count_++;
diff --git a/xmrstak/rapidjson/memorybuffer.h b/xmrstak/rapidjson/memorybuffer.h
index 39bee1dec..8e8d6c344 100644
--- a/xmrstak/rapidjson/memorybuffer.h
+++ b/xmrstak/rapidjson/memorybuffer.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_MEMORYBUFFER_H_
@@ -27,7 +27,7 @@ RAPIDJSON_NAMESPACE_BEGIN
     It is similar to FileWriteBuffer but the destination is an in-memory buffer instead of a file.
 
     Differences between MemoryBuffer and StringBuffer:
-    1. StringBuffer has Encoding but MemoryBuffer is only a byte buffer. 
+    1. StringBuffer has Encoding but MemoryBuffer is only a byte buffer.
     2. StringBuffer::GetString() returns a null-terminated string. MemoryBuffer::GetBuffer() returns a buffer without terminator.
 
     \tparam Allocator type for allocating memory buffer.
diff --git a/xmrstak/rapidjson/memorystream.h b/xmrstak/rapidjson/memorystream.h
index 1d71d8a4f..300a6e62b 100644
--- a/xmrstak/rapidjson/memorystream.h
+++ b/xmrstak/rapidjson/memorystream.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_MEMORYSTREAM_H_
diff --git a/xmrstak/rapidjson/msinttypes/inttypes.h b/xmrstak/rapidjson/msinttypes/inttypes.h
index 18111286b..1620402e0 100644
--- a/xmrstak/rapidjson/msinttypes/inttypes.h
+++ b/xmrstak/rapidjson/msinttypes/inttypes.h
@@ -1,37 +1,37 @@
 // ISO C9x  compliant inttypes.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
-// 
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
 //  Copyright (c) 2006-2013 Alexander Chemeris
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
-// 
+//
 //   1. Redistributions of source code must retain the above copyright notice,
 //      this list of conditions and the following disclaimer.
-// 
+//
 //   2. Redistributions in binary form must reproduce the above copyright
 //      notice, this list of conditions and the following disclaimer in the
 //      documentation and/or other materials provided with the distribution.
-// 
+//
 //   3. Neither the name of the product nor the names of its contributors may
 //      be used to endorse or promote products derived from this software
 //      without specific prior written permission.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// 
+//
 ///////////////////////////////////////////////////////////////////////////////
 
-// The above software in this distribution may have been modified by 
-// THL A29 Limited ("Tencent Modifications"). 
+// The above software in this distribution may have been modified by
+// THL A29 Limited ("Tencent Modifications").
 // All Tencent Modifications are Copyright (C) 2015 THL A29 Limited.
 
 #ifndef _MSC_VER // [
diff --git a/xmrstak/rapidjson/msinttypes/stdint.h b/xmrstak/rapidjson/msinttypes/stdint.h
index 3d4477b9a..1c266ecfe 100644
--- a/xmrstak/rapidjson/msinttypes/stdint.h
+++ b/xmrstak/rapidjson/msinttypes/stdint.h
@@ -1,37 +1,37 @@
 // ISO C9x  compliant stdint.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
-// 
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
 //  Copyright (c) 2006-2013 Alexander Chemeris
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
-// 
+//
 //   1. Redistributions of source code must retain the above copyright notice,
 //      this list of conditions and the following disclaimer.
-// 
+//
 //   2. Redistributions in binary form must reproduce the above copyright
 //      notice, this list of conditions and the following disclaimer in the
 //      documentation and/or other materials provided with the distribution.
-// 
+//
 //   3. Neither the name of the product nor the names of its contributors may
 //      be used to endorse or promote products derived from this software
 //      without specific prior written permission.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// 
+//
 ///////////////////////////////////////////////////////////////////////////////
 
-// The above software in this distribution may have been modified by 
-// THL A29 Limited ("Tencent Modifications"). 
+// The above software in this distribution may have been modified by
+// THL A29 Limited ("Tencent Modifications").
 // All Tencent Modifications are Copyright (C) 2015 THL A29 Limited.
 
 #ifndef _MSC_VER // [
diff --git a/xmrstak/rapidjson/ostreamwrapper.h b/xmrstak/rapidjson/ostreamwrapper.h
index 6f4667c08..896e772f7 100644
--- a/xmrstak/rapidjson/ostreamwrapper.h
+++ b/xmrstak/rapidjson/ostreamwrapper.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_OSTREAMWRAPPER_H_
@@ -40,7 +40,7 @@ RAPIDJSON_NAMESPACE_BEGIN
 
     \tparam StreamType Class derived from \c std::basic_ostream.
 */
-   
+
 template <typename StreamType>
 class BasicOStreamWrapper {
 public:
diff --git a/xmrstak/rapidjson/pointer.h b/xmrstak/rapidjson/pointer.h
index 0206ac1c8..0ecf6c4f5 100644
--- a/xmrstak/rapidjson/pointer.h
+++ b/xmrstak/rapidjson/pointer.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_POINTER_H_
@@ -50,16 +50,16 @@ enum PointerParseErrorCode {
 
 //! Represents a JSON Pointer. Use Pointer for UTF8 encoding and default allocator.
 /*!
-    This class implements RFC 6901 "JavaScript Object Notation (JSON) Pointer" 
+    This class implements RFC 6901 "JavaScript Object Notation (JSON) Pointer"
     (https://tools.ietf.org/html/rfc6901).
 
     A JSON pointer is for identifying a specific value in a JSON document
     (GenericDocument). It can simplify coding of DOM tree manipulation, because it
     can access multiple-level depth of DOM tree with single API call.
 
-    After it parses a string representation (e.g. "/foo/0" or URI fragment 
+    After it parses a string representation (e.g. "/foo/0" or URI fragment
     representation (e.g. "#/foo/0") into its internal representation (tokens),
-    it can be used to resolve a specific value in multiple documents, or sub-tree 
+    it can be used to resolve a specific value in multiple documents, or sub-tree
     of documents.
 
     Contrary to GenericValue, Pointer can be copy constructed and copy assigned.
@@ -70,10 +70,10 @@ enum PointerParseErrorCode {
     supplied tokens eliminates these.
 
     GenericPointer depends on GenericDocument and GenericValue.
-    
+
     \tparam ValueType The value type of the DOM tree. E.g. GenericValue<UTF8<> >
     \tparam Allocator The allocator type for allocating memory for internal representation.
-    
+
     \note GenericPointer uses same encoding of ValueType.
     However, Allocator of GenericPointer is independent of Allocator of Value.
 */
@@ -85,7 +85,7 @@ class GenericPointer {
 
     //! A token is the basic units of internal representation.
     /*!
-        A JSON pointer string representation "/foo/123" is parsed to two tokens: 
+        A JSON pointer string representation "/foo/123" is parsed to two tokens:
         "foo" and 123. 123 will be represented in both numeric form and string form.
         They are resolved according to the actual value type (object or array).
 
@@ -93,7 +93,7 @@ class GenericPointer {
         (greater than limits of SizeType), they are only treated as string form
         (i.e. the token's index will be equal to kPointerInvalidIndex).
 
-        This struct is public so that user can create a Pointer without parsing and 
+        This struct is public so that user can create a Pointer without parsing and
         allocation, using a special constructor.
     */
     struct Token {
@@ -337,7 +337,7 @@ class GenericPointer {
 
         for (size_t i = 0; i < tokenCount_; i++) {
             if (tokens_[i].index != rhs.tokens_[i].index ||
-                tokens_[i].length != rhs.tokens_[i].length || 
+                tokens_[i].length != rhs.tokens_[i].length ||
                 (tokens_[i].length != 0 && std::memcmp(tokens_[i].name, rhs.tokens_[i].name, sizeof(Ch)* tokens_[i].length) != 0))
             {
                 return false;
@@ -388,9 +388,9 @@ class GenericPointer {
         If the value is not exist, it creates all parent values and a JSON Null value.
         So it always succeed and return the newly created or existing value.
 
-        Remind that it may change types of parents according to tokens, so it 
-        potentially removes previously stored values. For example, if a document 
-        was an array, and "/foo" is used to create a value, then the document 
+        Remind that it may change types of parents according to tokens, so it
+        potentially removes previously stored values. For example, if a document
+        was an array, and "/foo" is used to create a value, then the document
         will be changed to an object, and all existing array elements are lost.
 
         \param root Root value of a DOM subtree to be resolved. It can be any value other than document root.
@@ -511,7 +511,7 @@ class GenericPointer {
         \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
         \return Pointer to the value if it can be resolved. Otherwise null.
     */
-    const ValueType* Get(const ValueType& root, size_t* unresolvedTokenIndex = 0) const { 
+    const ValueType* Get(const ValueType& root, size_t* unresolvedTokenIndex = 0) const {
         return Get(const_cast<ValueType&>(root), unresolvedTokenIndex);
     }
 
@@ -573,7 +573,7 @@ class GenericPointer {
     ValueType& GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const Ch* defaultValue) const {
         return GetWithDefault(document, defaultValue, document.GetAllocator());
     }
-    
+
 #if RAPIDJSON_HAS_STDSTRING
     //! Query a value in a document with default std::basic_string.
     template <typename stackAllocator>
@@ -810,7 +810,7 @@ class GenericPointer {
 
         // Count number of '/' as tokenCount
         tokenCount_ = 0;
-        for (const Ch* s = source; s != source + length; s++) 
+        for (const Ch* s = source; s != source + length; s++)
             if (*s == '/')
                 tokenCount_++;
 
@@ -867,7 +867,7 @@ class GenericPointer {
                 }
 
                 i++;
-                
+
                 // Escaping "~0" -> '~', "~1" -> '/'
                 if (c == '~') {
                     if (i < length) {
@@ -956,7 +956,7 @@ class GenericPointer {
                     os.Put('~');
                     os.Put('1');
                 }
-                else if (uriFragment && NeedPercentEncode(c)) { 
+                else if (uriFragment && NeedPercentEncode(c)) {
                     // Transcode to UTF8 sequence
                     GenericStringStream<typename ValueType::EncodingType> source(&t->name[j]);
                     PercentEncodeStream<OutputStream> target(os);
@@ -974,7 +974,7 @@ class GenericPointer {
     //! A helper stream for decoding a percent-encoded sequence into code unit.
     /*!
         This stream decodes %XY triplet into code unit (0-255).
-        If it encounters invalid characters, it sets output code unit as 0 and 
+        If it encounters invalid characters, it sets output code unit as 0 and
         mark invalid, and to be checked by IsValid().
     */
     class PercentDecodeStream {
diff --git a/xmrstak/rapidjson/prettywriter.h b/xmrstak/rapidjson/prettywriter.h
index abd964f6f..af99630a9 100644
--- a/xmrstak/rapidjson/prettywriter.h
+++ b/xmrstak/rapidjson/prettywriter.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_PRETTYWRITER_H_
@@ -55,11 +55,11 @@ class PrettyWriter : public Writer<OutputStream, SourceEncoding, TargetEncoding,
         \param allocator User supplied allocator. If it is null, it will create a private one.
         \param levelDepth Initial capacity of stack.
     */
-    explicit PrettyWriter(OutputStream& os, StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) : 
+    explicit PrettyWriter(OutputStream& os, StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) :
         Base(os, allocator, levelDepth), indentChar_(' '), indentCharCount_(4), formatOptions_(kFormatDefault) {}
 
 
-    explicit PrettyWriter(StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) : 
+    explicit PrettyWriter(StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) :
         Base(allocator, levelDepth), indentChar_(' '), indentCharCount_(4) {}
 
 #if RAPIDJSON_HAS_CXX11_RVALUE_REFS
@@ -133,7 +133,7 @@ class PrettyWriter : public Writer<OutputStream, SourceEncoding, TargetEncoding,
         return Key(str.data(), SizeType(str.size()));
     }
 #endif
-	
+
     bool EndObject(SizeType memberCount = 0) {
         (void)memberCount;
         RAPIDJSON_ASSERT(Base::level_stack_.GetSize() >= sizeof(typename Base::Level));
diff --git a/xmrstak/rapidjson/rapidjson.h b/xmrstak/rapidjson/rapidjson.h
index 053b2ce43..11777d262 100644
--- a/xmrstak/rapidjson/rapidjson.h
+++ b/xmrstak/rapidjson/rapidjson.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_RAPIDJSON_H_
@@ -17,7 +17,7 @@
 
 /*!\file rapidjson.h
     \brief common definitions and configuration
-    
+
     \see RAPIDJSON_CONFIG
  */
 
@@ -241,7 +241,7 @@
 #  elif defined(RAPIDJSON_DOXYGEN_RUNNING)
 #    define RAPIDJSON_ENDIAN
 #  else
-#    error Unknown machine endianess detected. User needs to define RAPIDJSON_ENDIAN.   
+#    error Unknown machine endianess detected. User needs to define RAPIDJSON_ENDIAN.
 #  endif
 #endif // RAPIDJSON_ENDIAN
 
@@ -423,7 +423,7 @@ RAPIDJSON_NAMESPACE_END
 #if defined(__GNUC__)
 #define RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE __attribute__((unused))
 #else
-#define RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE 
+#define RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE
 #endif
 #ifndef __clang__
 //!@endcond
@@ -474,7 +474,7 @@ RAPIDJSON_NAMESPACE_END
 
 //!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
 
-#define RAPIDJSON_MULTILINEMACRO_BEGIN do {  
+#define RAPIDJSON_MULTILINEMACRO_BEGIN do {
 #define RAPIDJSON_MULTILINEMACRO_END \
 } while((void)0, 0)
 
@@ -605,7 +605,7 @@ enum Type {
     kFalseType = 1,     //!< false
     kTrueType = 2,      //!< true
     kObjectType = 3,    //!< object
-    kArrayType = 4,     //!< array 
+    kArrayType = 4,     //!< array
     kStringType = 5,    //!< string
     kNumberType = 6     //!< number
 };
diff --git a/xmrstak/rapidjson/reader.h b/xmrstak/rapidjson/reader.h
index 71916c0ae..318fc9a13 100644
--- a/xmrstak/rapidjson/reader.h
+++ b/xmrstak/rapidjson/reader.h
@@ -1065,7 +1065,7 @@ class GenericReader {
         RAPIDJSON_FORCEINLINE Ch Peek() const { return is.Peek(); }
         RAPIDJSON_FORCEINLINE Ch TakePush() { return is.Take(); }
         RAPIDJSON_FORCEINLINE Ch Take() { return is.Take(); }
-		  RAPIDJSON_FORCEINLINE void Push(char) {}
+        RAPIDJSON_FORCEINLINE void Push(char) {}
 
         size_t Tell() { return is.Tell(); }
         size_t Length() { return 0; }
diff --git a/xmrstak/rapidjson/schema.h b/xmrstak/rapidjson/schema.h
index e7af3cf57..aca3561e3 100644
--- a/xmrstak/rapidjson/schema.h
+++ b/xmrstak/rapidjson/schema.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available->
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip-> All rights reserved->
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource->org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied-> See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License->
 
 #ifndef RAPIDJSON_SCHEMA_H_
@@ -174,10 +174,10 @@ class Hasher {
     bool Uint(unsigned u) { Number n; n.u.u = u; n.d = static_cast<double>(u); return WriteNumber(n); }
     bool Int64(int64_t i) { Number n; n.u.i = i; n.d = static_cast<double>(i); return WriteNumber(n); }
     bool Uint64(uint64_t u) { Number n; n.u.u = u; n.d = static_cast<double>(u); return WriteNumber(n); }
-    bool Double(double d) { 
-        Number n; 
+    bool Double(double d) {
+        Number n;
         if (d < 0) n.u.i = static_cast<int64_t>(d);
-        else       n.u.u = static_cast<uint64_t>(d); 
+        else       n.u.u = static_cast<uint64_t>(d);
         n.d = d;
         return WriteNumber(n);
     }
@@ -194,7 +194,7 @@ class Hasher {
 
     bool StartObject() { return true; }
     bool Key(const Ch* str, SizeType len, bool copy) { return String(str, len, copy); }
-    bool EndObject(SizeType memberCount) { 
+    bool EndObject(SizeType memberCount) {
         uint64_t h = Hash(0, kObjectType);
         uint64_t* kv = stack_.template Pop<uint64_t>(memberCount * 2);
         for (SizeType i = 0; i < memberCount; i++)
@@ -202,9 +202,9 @@ class Hasher {
         *stack_.template Push<uint64_t>() = h;
         return true;
     }
-    
+
     bool StartArray() { return true; }
-    bool EndArray(SizeType elementCount) { 
+    bool EndArray(SizeType elementCount) {
         uint64_t h = Hash(0, kArrayType);
         uint64_t* e = stack_.template Pop<uint64_t>(elementCount);
         for (SizeType i = 0; i < elementCount; i++)
@@ -231,9 +231,9 @@ class Hasher {
     };
 
     bool WriteType(Type type) { return WriteBuffer(type, 0, 0); }
-    
+
     bool WriteNumber(const Number& n) { return WriteBuffer(kNumberType, &n, sizeof(n)); }
-    
+
     bool WriteBuffer(Type type, const void* data, size_t len) {
         // FNV-1a from http://isthe.com/chongo/tech/comp/fnv/
         uint64_t h = Hash(RAPIDJSON_UINT64_C2(0x84222325, 0xcbf29ce4), type);
@@ -433,7 +433,7 @@ class Schema {
             if (properties && properties->IsObject())
                 for (ConstMemberIterator itr = properties->MemberBegin(); itr != properties->MemberEnd(); ++itr)
                     AddUniqueElement(allProperties, itr->name);
-            
+
             if (required && required->IsArray())
                 for (ConstValueIterator itr = required->Begin(); itr != required->End(); ++itr)
                     if (itr->IsString())
@@ -660,7 +660,7 @@ class Schema {
             for (SizeType i = allOf_.begin; i < allOf_.begin + allOf_.count; i++)
                 if (!context.validators[i]->IsValid())
                     RAPIDJSON_INVALID_KEYWORD_RETURN(GetAllOfString());
-        
+
         if (anyOf_.schemas) {
             for (SizeType i = anyOf_.begin; i < anyOf_.begin + anyOf_.count; i++)
                 if (context.validators[i]->IsValid())
@@ -688,13 +688,13 @@ class Schema {
         return true;
     }
 
-    bool Null(Context& context) const { 
+    bool Null(Context& context) const {
         if (!(type_ & (1 << kNullSchemaType)))
             RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString());
         return CreateParallelValidator(context);
     }
-    
-    bool Bool(Context& context, bool) const { 
+
+    bool Bool(Context& context, bool) const {
         if (!(type_ & (1 << kBooleanSchemaType)))
             RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString());
         return CreateParallelValidator(context);
@@ -733,13 +733,13 @@ class Schema {
 
         if (!maximum_.IsNull() && !CheckDoubleMaximum(context, d))
             return false;
-        
+
         if (!multipleOf_.IsNull() && !CheckDoubleMultipleOf(context, d))
             return false;
-        
+
         return CreateParallelValidator(context);
     }
-    
+
     bool String(Context& context, const Ch* str, SizeType length, bool) const {
         if (!(type_ & (1 << kStringSchemaType)))
             RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString());
@@ -760,7 +760,7 @@ class Schema {
         return CreateParallelValidator(context);
     }
 
-    bool StartObject(Context& context) const { 
+    bool StartObject(Context& context) const {
         if (!(type_ & (1 << kObjectSchemaType)))
             RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString());
 
@@ -778,7 +778,7 @@ class Schema {
 
         return CreateParallelValidator(context);
     }
-    
+
     bool Key(Context& context, const Ch* str, SizeType len, bool) const {
         if (patternProperties_) {
             context.patternPropertiesSchemaCount = 0;
@@ -854,7 +854,7 @@ class Schema {
         return true;
     }
 
-    bool StartArray(Context& context) const { 
+    bool StartArray(Context& context) const {
         if (!(type_ & (1 << kArraySchemaType)))
             RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString());
 
@@ -864,12 +864,12 @@ class Schema {
         return CreateParallelValidator(context);
     }
 
-    bool EndArray(Context& context, SizeType elementCount) const { 
+    bool EndArray(Context& context, SizeType elementCount) const {
         context.inArray = false;
-        
+
         if (elementCount < minItems_)
             RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinItemsString());
-        
+
         if (elementCount > maxItems_)
             RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaxItemsString());
 
@@ -1055,13 +1055,13 @@ class Schema {
 
             if (anyOf_.schemas)
                 CreateSchemaValidators(context, anyOf_);
-            
+
             if (oneOf_.schemas)
                 CreateSchemaValidators(context, oneOf_);
-            
+
             if (not_)
                 context.validators[notValidatorIndex_] = context.factory.CreateSchemaValidator(*not_);
-            
+
             if (hasSchemaDependencies_) {
                 for (SizeType i = 0; i < propertyCount_; i++)
                     if (properties_[i].dependenciesSchema)
@@ -1082,7 +1082,7 @@ class Schema {
         SizeType len = name.GetStringLength();
         const Ch* str = name.GetString();
         for (SizeType index = 0; index < propertyCount_; index++)
-            if (properties_[index].name.GetStringLength() == len && 
+            if (properties_[index].name.GetStringLength() == len &&
                 (std::memcmp(properties_[index].name.GetString(), str, sizeof(Ch) * len) == 0))
             {
                 *outIndex = index;
@@ -1202,7 +1202,7 @@ class Schema {
 
     struct PatternProperty {
         PatternProperty() : schema(), pattern() {}
-        ~PatternProperty() { 
+        ~PatternProperty() {
             if (pattern) {
                 pattern->~RegexType();
                 AllocatorType::Free(pattern);
@@ -1564,7 +1564,7 @@ template <
     typename OutputHandler = BaseReaderHandler<typename SchemaDocumentType::SchemaType::EncodingType>,
     typename StateAllocator = CrtAllocator>
 class GenericSchemaValidator :
-    public internal::ISchemaStateFactory<typename SchemaDocumentType::SchemaType>, 
+    public internal::ISchemaStateFactory<typename SchemaDocumentType::SchemaType>,
     public internal::ISchemaValidator
 {
 public:
@@ -1582,7 +1582,7 @@ class GenericSchemaValidator :
     */
     GenericSchemaValidator(
         const SchemaDocumentType& schemaDocument,
-        StateAllocator* allocator = 0, 
+        StateAllocator* allocator = 0,
         size_t schemaStackCapacity = kDefaultSchemaStackCapacity,
         size_t documentStackCapacity = kDefaultDocumentStackCapacity)
         :
@@ -1610,7 +1610,7 @@ class GenericSchemaValidator :
     GenericSchemaValidator(
         const SchemaDocumentType& schemaDocument,
         OutputHandler& outputHandler,
-        StateAllocator* allocator = 0, 
+        StateAllocator* allocator = 0,
         size_t schemaStackCapacity = kDefaultSchemaStackCapacity,
         size_t documentStackCapacity = kDefaultDocumentStackCapacity)
         :
@@ -1721,7 +1721,7 @@ RAPIDJSON_MULTILINEMACRO_END
         RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(StartObject, ());
         return valid_ = outputHandler_.StartObject();
     }
-    
+
     bool Key(const Ch* str, SizeType len, bool copy) {
         if (!valid_) return false;
         AppendToken(str, len);
@@ -1729,8 +1729,8 @@ RAPIDJSON_MULTILINEMACRO_END
         RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(Key, (str, len, copy));
         return valid_ = outputHandler_.Key(str, len, copy);
     }
-    
-    bool EndObject(SizeType memberCount) { 
+
+    bool EndObject(SizeType memberCount) {
         if (!valid_) return false;
         RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(EndObject, (memberCount));
         if (!CurrentSchema().EndObject(CurrentContext(), memberCount)) return valid_ = false;
@@ -1742,7 +1742,7 @@ RAPIDJSON_MULTILINEMACRO_END
         RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(StartArray, ());
         return valid_ = outputHandler_.StartArray();
     }
-    
+
     bool EndArray(SizeType elementCount) {
         if (!valid_) return false;
         RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(EndArray, (elementCount));
@@ -1797,7 +1797,7 @@ RAPIDJSON_MULTILINEMACRO_END
     typedef GenericValue<UTF8<>, StateAllocator> HashCodeArray;
     typedef internal::Hasher<EncodingType, StateAllocator> HasherType;
 
-    GenericSchemaValidator( 
+    GenericSchemaValidator(
         const SchemaDocumentType& schemaDocument,
         const SchemaType& root,
 #if RAPIDJSON_SCHEMA_VERBOSE
@@ -1872,7 +1872,7 @@ RAPIDJSON_MULTILINEMACRO_END
 #endif
 
         uint64_t h = CurrentContext().arrayUniqueness ? static_cast<HasherType*>(CurrentContext().hasher)->GetHashCode() : 0;
-        
+
         PopSchema();
 
         if (!schemaStack_.Empty()) {
@@ -1913,7 +1913,7 @@ RAPIDJSON_MULTILINEMACRO_END
     }
 
     RAPIDJSON_FORCEINLINE void PushSchema(const SchemaType& schema) { new (schemaStack_.template Push<Context>()) Context(*this, &schema); }
-    
+
     RAPIDJSON_FORCEINLINE void PopSchema() {
         Context* c = schemaStack_.template Pop<Context>(1);
         if (HashCodeArray* a = static_cast<HashCodeArray*>(c->arrayElementHashCodes)) {
diff --git a/xmrstak/rapidjson/stream.h b/xmrstak/rapidjson/stream.h
index fef82c252..608e9c52a 100644
--- a/xmrstak/rapidjson/stream.h
+++ b/xmrstak/rapidjson/stream.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #include "rapidjson.h"
diff --git a/xmrstak/rapidjson/stringbuffer.h b/xmrstak/rapidjson/stringbuffer.h
index 4e38b82c3..88dc15ad1 100644
--- a/xmrstak/rapidjson/stringbuffer.h
+++ b/xmrstak/rapidjson/stringbuffer.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_STRINGBUFFER_H_
diff --git a/xmrstak/rapidjson/writer.h b/xmrstak/rapidjson/writer.h
index 8f6e174f3..9cfcfdb6b 100644
--- a/xmrstak/rapidjson/writer.h
+++ b/xmrstak/rapidjson/writer.h
@@ -1,5 +1,5 @@
 // Tencent is pleased to support the open source community by making RapidJSON available.
-// 
+//
 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
 //
 // Licensed under the MIT License (the "License"); you may not use this file except
@@ -7,9 +7,9 @@
 //
 // http://opensource.org/licenses/MIT
 //
-// Unless required by applicable law or agreed to in writing, software distributed 
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the 
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
 #ifndef RAPIDJSON_WRITER_H_
@@ -50,7 +50,7 @@ RAPIDJSON_NAMESPACE_BEGIN
 ///////////////////////////////////////////////////////////////////////////////
 // WriteFlag
 
-/*! \def RAPIDJSON_WRITE_DEFAULT_FLAGS 
+/*! \def RAPIDJSON_WRITE_DEFAULT_FLAGS
     \ingroup RAPIDJSON_CONFIG
     \brief User-defined kWriteDefaultFlags definition.
 
@@ -74,7 +74,7 @@ enum WriteFlag {
 
     User may programmatically calls the functions of a writer to generate JSON text.
 
-    On the other side, a writer can also be passed to objects that generates events, 
+    On the other side, a writer can also be passed to objects that generates events,
 
     for example Reader::Parse() and Document::Accept().
 
@@ -97,7 +97,7 @@ class Writer {
         \param levelDepth Initial capacity of stack.
     */
     explicit
-    Writer(OutputStream& os, StackAllocator* stackAllocator = 0, size_t levelDepth = kDefaultLevelDepth) : 
+    Writer(OutputStream& os, StackAllocator* stackAllocator = 0, size_t levelDepth = kDefaultLevelDepth) :
         os_(&os), level_stack_(stackAllocator, levelDepth * sizeof(Level)), maxDecimalPlaces_(kDefaultMaxDecimalPlaces), hasRoot_(false) {}
 
     explicit
@@ -151,7 +151,7 @@ class Writer {
     /*!
         This setting truncates the output with specified number of decimal places.
 
-        For example, 
+        For example,
 
         \code
         writer.SetMaxDecimalPlaces(3);
@@ -407,7 +407,7 @@ class Writer {
                     PutUnsafe(*os_, hexDigits[(trail >> 12) & 15]);
                     PutUnsafe(*os_, hexDigits[(trail >>  8) & 15]);
                     PutUnsafe(*os_, hexDigits[(trail >>  4) & 15]);
-                    PutUnsafe(*os_, hexDigits[(trail      ) & 15]);                    
+                    PutUnsafe(*os_, hexDigits[(trail      ) & 15]);
                 }
             }
             else if ((sizeof(Ch) == 1 || static_cast<unsigned>(c) < 256) && RAPIDJSON_UNLIKELY(escape[static_cast<unsigned char>(c)]))  {
@@ -421,7 +421,7 @@ class Writer {
                     PutUnsafe(*os_, hexDigits[static_cast<unsigned char>(c) & 0xF]);
                 }
             }
-            else if (RAPIDJSON_UNLIKELY(!(writeFlags & kWriteValidateEncodingFlag ? 
+            else if (RAPIDJSON_UNLIKELY(!(writeFlags & kWriteValidateEncodingFlag ?
                 Transcoder<SourceEncoding, TargetEncoding>::Validate(is, *os_) :
                 Transcoder<SourceEncoding, TargetEncoding>::TranscodeUnsafe(is, *os_))))
                 return false;
@@ -453,7 +453,7 @@ class Writer {
         if (RAPIDJSON_LIKELY(level_stack_.GetSize() != 0)) { // this value is not at root
             Level* level = level_stack_.template Top<Level>();
             if (level->valueCount > 0) {
-                if (level->inArray) 
+                if (level->inArray)
                     os_->Put(','); // add comma if it is not the first element in array
                 else  // in object
                     os_->Put((level->valueCount % 2 == 0) ? ',' : ':');
@@ -541,7 +541,7 @@ inline bool Writer<StringBuffer>::WriteDouble(double d) {
         PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'y');
         return true;
     }
-    
+
     char *buffer = os_->Push(25);
     char* end = internal::dtoa(d, buffer, maxDecimalPlaces_);
     os_->Pop(static_cast<size_t>(25 - (end - buffer)));
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 770834ef8..27bc2a59d 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.2.0"
+#define XMR_STAK_VERSION "2.5.1-hide-2.0.0"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"
@@ -32,18 +32,10 @@
 #define OS_TYPE "unk"
 #endif
 
-#if defined(CONF_NO_AEON)
-#define COIN_TYPE "monero"
-#elif defined(CONF_NO_MONERO)
-#define COIN_TYPE "aeon"
-#else
-#define COIN_TYPE "aeon-monero"
-#endif
-
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
 
-#define VERSION_LONG  XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/" COIN_TYPE "/"
+#define VERSION_LONG  XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/"
 #define VERSION_SHORT XMR_STAK_NAME " " XMR_STAK_VERSION " " XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH)
 #define VERSION_HTML "v" XMR_STAK_VERSION "-" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH)