diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ab14e19e3..e31d6f4f41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,11 @@ cmake_minimum_required(VERSION 3.27)
 
 project(placeholder)
 
+option(GPU_PROVER "Enable GPU support" OFF)
+if (GPU_PROVER)
+  add_compile_options(-DGPU_PROVER)
+endif()
+
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
   add_compile_options (-fdiagnostics-color=always)
 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
@@ -26,7 +31,7 @@ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/generated-dummy.cpp)
 add_library(crypto3_precompiled_headers STATIC ${CMAKE_CURRENT_BINARY_DIR}/generated-dummy.cpp)
 set_target_properties(crypto3_precompiled_headers PROPERTIES
     LINKER_LANGUAGE CXX
-    CXX_STANDARD 20
+    CXX_STANDARD 23
     CXX_STANDARD_REQUIRED TRUE
 )
 
diff --git a/crypto3/benchmarks/CMakeLists.txt b/crypto3/benchmarks/CMakeLists.txt
index b3dea658f1..772298f8c8 100644
--- a/crypto3/benchmarks/CMakeLists.txt
+++ b/crypto3/benchmarks/CMakeLists.txt
@@ -40,7 +40,7 @@ macro(define_benchmark benchmark)
 
     set_target_properties(${full_name}
         PROPERTIES
-        CXX_STANDARD 20
+        CXX_STANDARD 23
         CXX_STANDARD_REQUIRED TRUE)
 
     if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
diff --git a/crypto3/libs/algebra/example/CMakeLists.txt b/crypto3/libs/algebra/example/CMakeLists.txt
index c9c0d345c2..5691506a83 100644
--- a/crypto3/libs/algebra/example/CMakeLists.txt
+++ b/crypto3/libs/algebra/example/CMakeLists.txt
@@ -15,7 +15,7 @@ macro(define_algebra_example name)
                           Boost::random
                       )
 
-    set_target_properties(algebra_${name}_example PROPERTIES CXX_STANDARD 20)
+    set_target_properties(algebra_${name}_example PROPERTIES CXX_STANDARD 23)
 endmacro()
 
 set(EXAMPLES_NAMES
diff --git a/crypto3/libs/algebra/test/CMakeLists.txt b/crypto3/libs/algebra/test/CMakeLists.txt
index 2140872daa..1fceeefd07 100644
--- a/crypto3/libs/algebra/test/CMakeLists.txt
+++ b/crypto3/libs/algebra/test/CMakeLists.txt
@@ -23,7 +23,7 @@ macro(define_runtime_algebra_test name)
 
             ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 20
+    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 23
             CXX_STANDARD_REQUIRED TRUE)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -44,7 +44,7 @@ macro(define_compile_time_algebra_test name)
 
             ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(algebra_${name}_compile_test PROPERTIES CXX_STANDARD 20)
+    set_target_properties(algebra_${name}_compile_test PROPERTIES CXX_STANDARD 23)
 endmacro()
 
 set(RUNTIME_TESTS_NAMES
diff --git a/crypto3/libs/blueprint/example/CMakeLists.txt b/crypto3/libs/blueprint/example/CMakeLists.txt
index 0043c270d2..b4f6968eae 100644
--- a/crypto3/libs/blueprint/example/CMakeLists.txt
+++ b/crypto3/libs/blueprint/example/CMakeLists.txt
@@ -21,7 +21,7 @@ macro(define_blueprint_example name)
                           ${CMAKE_WORKSPACE_NAME}::multiprecision
                           ${CMAKE_WORKSPACE_NAME}::zk
     )
-    set_target_properties(${full_example_name} PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED TRUE)
+    set_target_properties(${full_example_name} PROPERTIES CXX_STANDARD 23 CXX_STANDARD_REQUIRED TRUE)
 endmacro()
 
 
diff --git a/crypto3/libs/blueprint/test/CMakeLists.txt b/crypto3/libs/blueprint/test/CMakeLists.txt
index 38db7a296a..a7c1aa85e2 100644
--- a/crypto3/libs/blueprint/test/CMakeLists.txt
+++ b/crypto3/libs/blueprint/test/CMakeLists.txt
@@ -44,7 +44,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
         crypto3::hash
         Boost::unit_test_framework
 )
-set_target_properties(_cm_internal_tests-crypto3-blueprint-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-crypto3-blueprint-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-crypto3-blueprint-test REUSE_FROM crypto3_precompiled_headers)
 
 macro(define_blueprint_test test)
@@ -60,7 +60,7 @@ macro(define_blueprint_test test)
     target_include_directories(${full_test_name} PRIVATE
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 23)
 
     target_compile_definitions(${full_test_name} PRIVATE TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/zkevm/data/")
 
diff --git a/crypto3/libs/blueprint/test/zkevm_bbf/hardhat.cpp b/crypto3/libs/blueprint/test/zkevm_bbf/hardhat.cpp
index 175409e57d..ddb796e2c3 100644
--- a/crypto3/libs/blueprint/test/zkevm_bbf/hardhat.cpp
+++ b/crypto3/libs/blueprint/test/zkevm_bbf/hardhat.cpp
@@ -139,7 +139,7 @@ class zkEVMHardhatTestFixture: public BBFTestFixture {
         // std::cout << std::endl;
 
         // Max_copy, Max_rw, Max_keccak, Max_bytecode
-        result =test_bbf_component<field_type, nil::blueprint::bbf::exponentiation>(
+        result = test_bbf_component<field_type, nil::blueprint::bbf::exponentiation>(
             "exp",
             {}, exp_assignment_input, exp_constraint_input,
             max_exp_rows,
diff --git a/crypto3/libs/containers/example/CMakeLists.txt b/crypto3/libs/containers/example/CMakeLists.txt
index 3c9a21834e..0f329aaccc 100644
--- a/crypto3/libs/containers/example/CMakeLists.txt
+++ b/crypto3/libs/containers/example/CMakeLists.txt
@@ -37,7 +37,7 @@ macro(define_containers_example example)
             ${CMAKE_WORKSPACE_NAME}::algebra
             ${CMAKE_WORKSPACE_NAME}::hash
             Boost::container)
-    set_target_properties(${target_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${target_name} PROPERTIES CXX_STANDARD 23)
 endmacro()
 
 set(EXAMPLES_NAMES
diff --git a/crypto3/libs/containers/test/CMakeLists.txt b/crypto3/libs/containers/test/CMakeLists.txt
index ec5296ad0f..f1db3ba537 100644
--- a/crypto3/libs/containers/test/CMakeLists.txt
+++ b/crypto3/libs/containers/test/CMakeLists.txt
@@ -33,7 +33,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
     Boost::unit_test_framework
     Boost::random
 )
-set_target_properties(_cm_internal_tests-crypto3-containers-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-crypto3-containers-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-crypto3-containers-test REUSE_FROM crypto3_precompiled_headers)
 
 macro(define_storage_test test)
@@ -57,7 +57,7 @@ macro(define_storage_test test)
         Boost::random
     )
 
-    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 23)
 
      if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         target_compile_options(${full_test_name} PRIVATE "-fconstexpr-steps=2147483647" "-ftemplate-backtrace-limit=0")
diff --git a/crypto3/libs/hash/test/CMakeLists.txt b/crypto3/libs/hash/test/CMakeLists.txt
index e67a041c07..5d72a47f86 100644
--- a/crypto3/libs/hash/test/CMakeLists.txt
+++ b/crypto3/libs/hash/test/CMakeLists.txt
@@ -12,7 +12,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
                        ${${CURRENT_PROJECT_NAME}_INTERFACE_LIBRARIES}
                        Boost::unit_test_framework)
 
-set_target_properties(_cm_internal_tests-crypto3-hash-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-crypto3-hash-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-crypto3-hash-test REUSE_FROM crypto3_precompiled_headers)
 
 macro(define_hash_test name)
@@ -27,7 +27,7 @@ macro(define_hash_test name)
                                ${Boost_INCLUDE_DIRS})
 
     set_target_properties(${test_name} PROPERTIES
-                          CXX_STANDARD 20
+                          CXX_STANDARD 23
                           CXX_STANDARD_REQUIRED TRUE)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
diff --git a/crypto3/libs/marshalling/algebra/test/CMakeLists.txt b/crypto3/libs/marshalling/algebra/test/CMakeLists.txt
index a23fdbc245..1e521ef07b 100644
--- a/crypto3/libs/marshalling/algebra/test/CMakeLists.txt
+++ b/crypto3/libs/marshalling/algebra/test/CMakeLists.txt
@@ -32,7 +32,7 @@ macro(define_marshalling_test name)
                                ${Boost_INCLUDE_DIRS})
 
     set_target_properties(${test_name} PROPERTIES
-                          CXX_STANDARD 20
+                          CXX_STANDARD 23
                           CXX_STANDARD_REQUIRED TRUE)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
diff --git a/crypto3/libs/marshalling/core/CMakeLists.txt b/crypto3/libs/marshalling/core/CMakeLists.txt
index bff49113d0..2b80e7fd18 100644
--- a/crypto3/libs/marshalling/core/CMakeLists.txt
+++ b/crypto3/libs/marshalling/core/CMakeLists.txt
@@ -20,7 +20,7 @@ if(CRYPTO3_MARSHALLING_THROWS)
 endif()
 
 if(NOT CMAKE_CXX_STANDARD)
-    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 23)
 endif()
 
 cm_setup_version(VERSION 0.1.0 PREFIX ${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME})
diff --git a/crypto3/libs/marshalling/core/test/CMakeLists.txt b/crypto3/libs/marshalling/core/test/CMakeLists.txt
index f9ebe1188f..1b182d2ae2 100644
--- a/crypto3/libs/marshalling/core/test/CMakeLists.txt
+++ b/crypto3/libs/marshalling/core/test/CMakeLists.txt
@@ -28,7 +28,7 @@ macro(define_marshalling_test name)
                                ${Boost_INCLUDE_DIRS})
 
     set_target_properties(marshalling_core_${name}_test PROPERTIES
-                          CXX_STANDARD 20
+                          CXX_STANDARD 23
                           CXX_STANDARD_REQUIRED TRUE)
 endmacro()
 
diff --git a/crypto3/libs/marshalling/multiprecision/CMakeLists.txt b/crypto3/libs/marshalling/multiprecision/CMakeLists.txt
index d5cd627942..22a7edfb98 100644
--- a/crypto3/libs/marshalling/multiprecision/CMakeLists.txt
+++ b/crypto3/libs/marshalling/multiprecision/CMakeLists.txt
@@ -18,7 +18,7 @@ cm_setup_version(VERSION 0.1.0 PREFIX ${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_
 add_library(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME} INTERFACE)
 add_library(${CMAKE_WORKSPACE_NAME}::${CURRENT_PROJECT_NAME} ALIAS ${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME})
 
-set_target_properties(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
+set_target_properties(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME} PROPERTIES CXX_STANDARD 23)
 
 set_target_properties(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME} PROPERTIES
                       EXPORT_NAME ${CURRENT_PROJECT_NAME})
diff --git a/crypto3/libs/marshalling/multiprecision/test/CMakeLists.txt b/crypto3/libs/marshalling/multiprecision/test/CMakeLists.txt
index 07dffa77af..6227d45df9 100644
--- a/crypto3/libs/marshalling/multiprecision/test/CMakeLists.txt
+++ b/crypto3/libs/marshalling/multiprecision/test/CMakeLists.txt
@@ -39,7 +39,7 @@ macro(define_marshalling_test name)
                                ${Boost_INCLUDE_DIRS})
 
     set_target_properties(marshalling_${name}_test PROPERTIES
-                          CXX_STANDARD 20
+                          CXX_STANDARD 23
                           CXX_STANDARD_REQUIRED TRUE)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
diff --git a/crypto3/libs/marshalling/zk/test/CMakeLists.txt b/crypto3/libs/marshalling/zk/test/CMakeLists.txt
index f54dca72bc..199249e1fa 100644
--- a/crypto3/libs/marshalling/zk/test/CMakeLists.txt
+++ b/crypto3/libs/marshalling/zk/test/CMakeLists.txt
@@ -22,7 +22,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
         ${CMAKE_WORKSPACE_NAME}::marshalling-algebra
         ${CMAKE_WORKSPACE_NAME}::marshalling-core)
 
-set_target_properties(_cm_internal_tests-crypto3-marshalling-zk-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-crypto3-marshalling-zk-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-crypto3-marshalling-zk-test REUSE_FROM crypto3_precompiled_headers)
 
 macro(define_marshalling_test test)
@@ -40,7 +40,7 @@ macro(define_marshalling_test test)
             ${Boost_INCLUDE_DIRS})
 
     set_target_properties(marshalling_zk_${name}_test PROPERTIES
-            CXX_STANDARD 20
+            CXX_STANDARD 23
             CXX_STANDARD_REQUIRED TRUE)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
diff --git a/crypto3/libs/math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp b/crypto3/libs/math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp
index 147ea6cbc4..b56c99b5a1 100644
--- a/crypto3/libs/math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp
+++ b/crypto3/libs/math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp
@@ -883,4 +883,4 @@ struct std::hash<nil::crypto3::math::polynomial_dfs<FieldValueType, Allocator>>
     }
 };
 
-#endif    // CRYPTO3_MATH_POLYNOMIAL_POLYNOM_DFT_HPP
+#endif    // CRYPTO3_MATH_POLYNOMIAL_POLYNOM_DFT_HPP
\ No newline at end of file
diff --git a/crypto3/libs/math/test/CMakeLists.txt b/crypto3/libs/math/test/CMakeLists.txt
index b7e57806d2..825422c637 100644
--- a/crypto3/libs/math/test/CMakeLists.txt
+++ b/crypto3/libs/math/test/CMakeLists.txt
@@ -24,7 +24,7 @@ macro(define_math_test name)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 23)
 
 endmacro()
 
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/big_uint.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/big_uint.hpp
index 0207c4a7c4..9e86bad38d 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/big_uint.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/big_uint.hpp
@@ -42,6 +42,7 @@
 #include "nil/crypto3/multiprecision/detail/endian.hpp"
 #include "nil/crypto3/multiprecision/detail/force_inline.hpp"
 #include "nil/crypto3/multiprecision/unsigned_utils.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
 
 namespace nil::crypto3::multiprecision {
     /**
@@ -155,7 +156,7 @@ namespace nil::crypto3::multiprecision {
             do_assign_integral_unchecked(unsigned_or_throw(a));
             if constexpr (sizeof(T) * CHAR_BIT > Bits) {
                 if (compare(a) != 0) {
-                    throw std::range_error("big_uint: overflow");
+                    NIL_THROW(std::range_error("big_uint: overflow"));
                 }
             }
         }
@@ -177,7 +178,7 @@ namespace nil::crypto3::multiprecision {
             do_assign_unchecked(other);
             if constexpr (Bits2 > Bits) {
                 if (other.compare(*this) != 0) {
-                    throw std::range_error("big_uint: overflow");
+                    NIL_THROW(std::range_error("big_uint: overflow"));
                 }
             }
         }
@@ -253,7 +254,7 @@ namespace nil::crypto3::multiprecision {
                 }
             }
             if (bits > Bits) {
-                throw std::range_error("big_uint: not enough bits to store bytes");
+                NIL_THROW(std::range_error("big_uint: not enough bits to store bytes"));
             }
             return *this;
         }
@@ -315,7 +316,7 @@ namespace nil::crypto3::multiprecision {
                 return decimal_str();
             }
             if (!(flags & std::ios_base::hex)) {
-                throw std::invalid_argument("big_uint: unsupported format flags");
+                NIL_THROW(std::invalid_argument("big_uint: unsupported format flags"));
             }
             auto result = hex_str();
             if (flags & std::ios_base::uppercase) {
@@ -370,7 +371,7 @@ namespace nil::crypto3::multiprecision {
             auto result = to_unsigned_unchecked<T>();
             if constexpr (sizeof(T) * CHAR_BIT < Bits) {
                 if (compare(result) != 0) {
-                    throw std::overflow_error("big_uint: overflow");
+                    NIL_THROW(std::overflow_error("big_uint: overflow"));
                 }
             }
             return result;
@@ -382,7 +383,7 @@ namespace nil::crypto3::multiprecision {
             T result = static_cast<T>(to_unsigned_unchecked<std::make_unsigned_t<T>>());
             if constexpr (sizeof(T) * CHAR_BIT <= Bits) {
                 if (compare(result) != 0) {
-                    throw std::overflow_error("big_uint: overflow");
+                    NIL_THROW(std::overflow_error("big_uint: overflow"));
                 }
             }
             return result;
@@ -749,7 +750,7 @@ namespace nil::crypto3::multiprecision {
             try {
                 return static_cast<detail::largest_big_uint_t<T1, T2>>(result);
             } catch (const std::range_error&) {
-                throw std::overflow_error("big_uint: division overflow");
+                NIL_THROW(std::overflow_error("big_uint: division overflow"));
             }
         }
 
@@ -762,7 +763,7 @@ namespace nil::crypto3::multiprecision {
             try {
                 a = result;
             } catch (const std::range_error&) {
-                throw std::overflow_error("big_uint: division overflow");
+                NIL_THROW(std::overflow_error("big_uint: division overflow"));
             }
             return a;
         }
@@ -848,11 +849,11 @@ namespace nil::crypto3::multiprecision {
             if constexpr (Bits2 > Bits && !std::is_same_v<Op, std::bit_and<>>) {
                 for (; i < os; ++i) {
                     if (po[i] != 0) {
-                        throw std::overflow_error("big_uint: bitwise_op overflow");
+                        NIL_THROW(std::overflow_error("big_uint: bitwise_op overflow"));
                     }
                 }
                 if (normalize()) {
-                    throw std::overflow_error("big_uint: bitwise_op overflow");
+                    NIL_THROW(std::overflow_error("big_uint: bitwise_op overflow"));
                 }
             }
         }
@@ -886,7 +887,7 @@ namespace nil::crypto3::multiprecision {
             limbs()[0] |= l;
             if constexpr (static_limb_count == 1) {
                 if (normalize()) {
-                    throw std::overflow_error("big_uint: or overflow");
+                    NIL_THROW(std::overflow_error("big_uint: or overflow"));
                 }
             }
         }
@@ -895,7 +896,7 @@ namespace nil::crypto3::multiprecision {
             limbs()[0] ^= l;
             if constexpr (static_limb_count == 1) {
                 if (normalize()) {
-                    throw std::overflow_error("big_uint: xor overflow");
+                    NIL_THROW(std::overflow_error("big_uint: xor overflow"));
                 }
             }
         }
@@ -1168,7 +1169,7 @@ namespace nil::crypto3::multiprecision {
             }
 
             if (index == limb_count()) {
-                throw std::invalid_argument("zero has no lsb");
+                NIL_THROW(std::invalid_argument("zero has no lsb"));
             }
 
             //
@@ -1189,7 +1190,7 @@ namespace nil::crypto3::multiprecision {
                 }
             }
             if (limbs()[0] == 0) {
-                throw std::invalid_argument("zero has no msb");
+                NIL_THROW(std::invalid_argument("zero has no msb"));
             }
             return std::bit_width(limbs()[0]) - 1;
         }
@@ -1207,7 +1208,7 @@ namespace nil::crypto3::multiprecision {
 
         constexpr big_uint& bit_set(std::size_t index) {
             if (index >= Bits) {
-                throw std::invalid_argument("fixed precision overflow");
+                NIL_THROW(std::invalid_argument("fixed precision overflow"));
             }
             std::size_t offset = index / limb_bits;
             std::size_t shift = index % limb_bits;
@@ -1218,7 +1219,7 @@ namespace nil::crypto3::multiprecision {
 
         constexpr big_uint& bit_unset(std::size_t index) {
             if (index >= Bits) {
-                throw std::invalid_argument("fixed precision overflow");
+                NIL_THROW(std::invalid_argument("fixed precision overflow"));
             }
             std::size_t offset = index / limb_bits;
             std::size_t shift = index % limb_bits;
@@ -1229,7 +1230,7 @@ namespace nil::crypto3::multiprecision {
 
         constexpr big_uint& bit_flip(std::size_t index) {
             if (index >= Bits) {
-                throw std::invalid_argument("fixed precision overflow");
+                NIL_THROW(std::invalid_argument("fixed precision overflow"));
             }
             std::size_t offset = index / limb_bits;
             std::size_t shift = index % limb_bits;
@@ -1254,7 +1255,7 @@ namespace nil::crypto3::multiprecision {
             limb_type value = static_cast<limb_type>(bits & mask) << shift;
             if (value) {
                 if (limb >= limb_count()) {
-                    throw std::overflow_error("import_bits: overflow");
+                    NIL_THROW(std::overflow_error("import_bits: overflow"));
                 }
                 limbs()[limb] |= value;
             }
@@ -1317,7 +1318,7 @@ namespace nil::crypto3::multiprecision {
             }
 
             if (normalize()) {
-                throw std::overflow_error("import_bits: overflow");
+                NIL_THROW(std::overflow_error("import_bits: overflow"));
             }
         }
 
@@ -1329,7 +1330,7 @@ namespace nil::crypto3::multiprecision {
             if (std::any_of(reinterpret_cast<const unsigned char*>(i) + copy_len,
                             reinterpret_cast<const unsigned char*>(j),
                             [](char c) { return c != 0; })) {
-                throw std::overflow_error("import_bits: overflow");
+                NIL_THROW(std::overflow_error("import_bits: overflow"));
             }
 
             std::memcpy(reinterpret_cast<unsigned char*>(limbs()), i, copy_len);
@@ -1337,7 +1338,7 @@ namespace nil::crypto3::multiprecision {
                         limb_count() * sizeof(limb_type) - copy_len);
 
             if (normalize()) {
-                throw std::overflow_error("import_bits: overflow");
+                NIL_THROW(std::overflow_error("import_bits: overflow"));
             }
         }
 
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/goldilocks.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/goldilocks.hpp
index 01cdbc825f..0b74d6daed 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/goldilocks.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/goldilocks.hpp
@@ -46,7 +46,7 @@ namespace nil::crypto3::multiprecision {
                 if (sum >= goldilocks_modulus) {
                     sum -= goldilocks_modulus;
                 }
-                result = sum;
+                result = static_cast<base_type>(sum);
                 BOOST_ASSERT(result < goldilocks_modulus);
             }
 
@@ -69,8 +69,8 @@ Goldilocks::new(t2)
 
                 */
 
-                std::uint64_t x_lo = input;
-                std::uint64_t x_hi = input >> 64;
+                std::uint64_t x_lo = static_cast<std::uint64_t>(input);
+                std::uint64_t x_hi = static_cast<std::uint64_t>(input >> 64);
                 std::uint64_t x_hi_hi = x_hi >> 32;
                 std::uint64_t x_hi_lo = x_hi & NEG_ORDER;
 
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/montgomery.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/montgomery.hpp
index 5df98fc1db..6d26fea8cc 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/montgomery.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_mod/modular_ops/montgomery.hpp
@@ -23,6 +23,7 @@
 #include "nil/crypto3/multiprecision/detail/big_mod/modular_ops/barrett.hpp"
 #include "nil/crypto3/multiprecision/detail/big_uint/storage.hpp"
 #include "nil/crypto3/multiprecision/detail/integer_ops_base.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
 
 namespace nil::crypto3::multiprecision::detail {
     template<typename T>
@@ -46,7 +47,7 @@ namespace nil::crypto3::multiprecision::detail {
 
         constexpr montgomery_modular_ops(const big_uint_t &m) : barrett_modular_ops<Bits_>(m) {
             if (!modulus_supports_montgomery(m)) {
-                throw std::invalid_argument("module not usable with montgomery");
+                NIL_THROW(std::invalid_argument("module not usable with montgomery"));
             }
 
             m_montgomery_p_dash = monty_inverse(this->mod().limbs()[0]);
@@ -73,7 +74,7 @@ namespace nil::crypto3::multiprecision::detail {
          */
         static constexpr limb_type monty_inverse(const limb_type &a) {
             if (a % 2 == 0) {
-                throw std::invalid_argument("inverse does not exist");
+                NIL_THROW(std::invalid_argument("inverse does not exist"));
             }
             limb_type b = 1;
             limb_type r = 0;
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/arithmetic.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/arithmetic.hpp
index c055349e49..2244917ecb 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/arithmetic.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/arithmetic.hpp
@@ -26,6 +26,7 @@
 #include "nil/crypto3/multiprecision/detail/big_uint/storage.hpp"
 #include "nil/crypto3/multiprecision/type_traits.hpp"
 #include "nil/crypto3/multiprecision/unsigned_utils.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
 
 namespace nil::crypto3::multiprecision {
     template<std::size_t Bits>
@@ -121,7 +122,7 @@ namespace nil::crypto3::multiprecision {
             OverflowPolicy != overflow_policy::throw_exception) {
             if constexpr (OverflowPolicy == overflow_policy::throw_exception) {
                 if (overflow) {
-                    throw std::overflow_error("big_uint: addition overflow");
+                    NIL_THROW(std::overflow_error("big_uint: addition overflow"));
                 }
             } else if constexpr (OverflowPolicy == overflow_policy::debug_assert) {
                 BOOST_ASSERT_MSG(!overflow, "big_uint: addition overflow");
@@ -303,7 +304,7 @@ namespace nil::crypto3::multiprecision {
         constexpr void subtract_overflow() noexcept(OverflowPolicy !=
                                                     overflow_policy::throw_exception) {
             if constexpr (OverflowPolicy == overflow_policy::throw_exception) {
-                throw std::overflow_error("big_uint: subtraction overflow");
+                NIL_THROW(std::overflow_error("big_uint: subtraction overflow"));
             } else if constexpr (OverflowPolicy == overflow_policy::debug_assert) {
                 BOOST_ASSERT_MSG(false, "big_uint: subtraction overflow");
             }
@@ -454,7 +455,7 @@ namespace nil::crypto3::multiprecision {
             OverflowPolicy != overflow_policy::throw_exception) {
             if constexpr (OverflowPolicy == overflow_policy::throw_exception) {
                 if (carry) {
-                    throw std::overflow_error("big_uint: addition overflow");
+                    NIL_THROW(std::overflow_error("big_uint: addition overflow"));
                 }
             } else if constexpr (OverflowPolicy == overflow_policy::debug_assert) {
                 BOOST_ASSERT_MSG(!carry, "big_uint: addition overflow");
@@ -532,7 +533,7 @@ namespace nil::crypto3::multiprecision {
             */
 
             if (y.is_zero()) {
-                throw std::overflow_error("integer division by zero");
+                NIL_THROW(std::overflow_error("integer division by zero"));
             }
 
             const_limb_pointer px = x.limbs();
@@ -746,7 +747,7 @@ namespace nil::crypto3::multiprecision {
             OverflowPolicy != overflow_policy::throw_exception) {
             if constexpr (OverflowPolicy == overflow_policy::throw_exception) {
                 if (overflow) {
-                    throw std::overflow_error("big_uint: multiplication overflow");
+                    NIL_THROW(std::overflow_error("big_uint: multiplication overflow"));
                 }
             } else if constexpr (OverflowPolicy == overflow_policy::debug_assert) {
                 BOOST_ASSERT_MSG(!overflow, "big_uint: multiplication overflow");
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/big_uint_impl.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/big_uint_impl.hpp
new file mode 100644
index 0000000000..4b163285e8
--- /dev/null
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/big_uint_impl.hpp
@@ -0,0 +1,1091 @@
+#pragma once
+
+// IWYU pragma: private; include "nil/crypto3/multiprecision/big_uint.hpp"
+
+#include <algorithm>
+#include <array>
+#include <bit>
+#include <cctype>
+#include <charconv>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <type_traits>
+
+#include <boost/functional/hash.hpp>
+
+#include "nil/crypto3/multiprecision/detail/assert.hpp"
+#include "nil/crypto3/multiprecision/detail/big_uint/arithmetic.hpp"
+#include "nil/crypto3/multiprecision/detail/big_uint/parsing.hpp"  // IWYU pragma: export
+#include "nil/crypto3/multiprecision/detail/big_uint/storage.hpp"
+#include "nil/crypto3/multiprecision/detail/big_uint/type_traits.hpp"  // IWYU pragma: export
+#include "nil/crypto3/multiprecision/detail/config.hpp"
+#include "nil/crypto3/multiprecision/detail/endian.hpp"
+#include "nil/crypto3/multiprecision/detail/type_traits.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
+
+namespace nil::crypto3::multiprecision {
+    /**
+     * @brief Big unsigned integer type
+     *
+     * @tparam Bits Number of bits
+     *
+     * @details
+     * This is a class that represents a big unsigned integer with a fixed size in bits.
+     *
+     * @note
+     * Addition and subtraction operations are optimized, while multiplication and division are not.
+     * Multiplication and division should be used in compile time or in non-performance critical
+     * code.
+     * If you need fast arithmetic, you probably are looking for big_mod, which implements fast
+     * modular arithmetic.
+     */
+    template<std::size_t Bits_>
+    class big_uint {
+      public:
+        static constexpr std::size_t Bits = Bits_;
+        using self_type = big_uint;
+
+        using limb_type = detail::limb_type;
+        using double_limb_type = detail::double_limb_type;
+        using signed_limb_type = detail::signed_limb_type;
+        using signed_double_limb_type = detail::signed_double_limb_type;
+
+        // Storage
+
+        using limb_pointer = detail::limb_pointer;
+        using const_limb_pointer = detail::const_limb_pointer;
+        static constexpr std::size_t limb_bits = detail::limb_bits;
+        static constexpr limb_type max_limb_value = detail::max_limb_value;
+
+        static constexpr std::size_t internal_limb_count =
+            (Bits / limb_bits) + (((Bits % limb_bits) != 0u) ? 1u : 0u);
+        static constexpr limb_type upper_limb_mask =
+            (Bits % limb_bits) ? (limb_type(1) << (Bits % limb_bits)) - 1 : (~limb_type(0u));
+
+        //
+        // Helper functions for getting at our internal data, and manipulating storage:
+        //
+        constexpr std::size_t limbs_count() const noexcept {
+            static_assert(internal_limb_count != 0, "No limbs in storage.");
+            return internal_limb_count;
+        }
+        constexpr limb_pointer limbs() noexcept { return m_data.data(); }
+        constexpr const_limb_pointer limbs() const noexcept { return m_data.data(); }
+        constexpr auto& limbs_array() noexcept { return m_data; }
+        constexpr const auto& limbs_array() const noexcept { return m_data; }
+
+      private:
+        // Zeros out everything after limb[i], replaces resizing.
+        constexpr void zero_after(std::size_t start_index) {
+            auto pr = this->limbs();
+            for (std::size_t i = start_index; i < this->limbs_count(); ++i) {
+                pr[i] = 0;
+            }
+        }
+
+        constexpr std::size_t used_limbs() const noexcept {
+            for (int i = internal_limb_count - 1; i >= 0; --i) {
+                if (limbs()[i] != 0) {
+                    return i + 1;
+                }
+            }
+            return 0;
+        }
+
+        constexpr std::size_t order() const noexcept {
+            for (int i = internal_limb_count - 1; i >= 0; --i) {
+                if (limbs()[i] != 0) {
+                    return i;
+                }
+            }
+            return 0;
+        }
+
+        // Assignment
+
+        template<typename T,
+                 std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T> || std::is_same_v<T, unsigned __int128>, int> = 0>
+        constexpr void do_assign_integral(const T& a) noexcept {
+            if constexpr (sizeof(T) <= sizeof(limb_type)) {
+                this->limbs()[0] = a;
+                this->zero_after(1);
+            } else {
+                static_assert(sizeof(T) % sizeof(limb_type) == 0);
+                constexpr std::size_t n =
+                    std::min(internal_limb_count, sizeof(T) / sizeof(limb_type));
+                auto a_copy = a;
+                for (std::size_t i = 0; i < n; ++i) {
+                    limbs()[i] = a_copy & static_cast<T>(static_cast<limb_type>(-1));
+                    a_copy >>= limb_bits;
+                }
+                zero_after(n);
+            }
+            this->normalize();
+            if constexpr (sizeof(T) * CHAR_BIT > Bits) {
+                NIL_CO3_MP_ASSERT(big_uint<sizeof(T) * CHAR_BIT>(a).compare(*this) == 0);
+            }
+        }
+
+        template<std::size_t Bits2>
+        constexpr void do_assign(const big_uint<Bits2>& other) noexcept {
+            std::size_t count = (std::min)(other.limbs_count(), this->limbs_count());
+            for (std::size_t i = 0; i < count; ++i) {
+                this->limbs()[i] = other.limbs()[i];
+            }
+            // Zero out everything after (std::min)(other.limbs_count(), limbs_count()), so if size
+            // of other was less, we have 0s at the end.
+            this->zero_after((std::min)(other.limbs_count(), this->limbs_count()));
+            this->normalize();
+        }
+
+      public:
+        // TODO(ioxid): this should be private
+        constexpr void normalize() noexcept { limbs()[internal_limb_count - 1] &= upper_limb_mask; }
+
+        constexpr bool has_carry() const noexcept { return m_carry; }
+        constexpr void set_carry(bool carry) noexcept { m_carry = carry; }
+
+        // Constructor
+
+        constexpr big_uint() noexcept {}
+
+        constexpr big_uint(std::string_view str) { *this = str; }
+        constexpr big_uint(const char* str) { *this = str; }
+        constexpr big_uint(const std::string &str) { *this = str; }
+
+        template<class T, std::enable_if_t<std::is_integral_v<T> && std::is_signed_v<T>, int> = 0>
+        constexpr big_uint(T val) noexcept {
+            NIL_CO3_MP_ASSERT_MSG(val >= 0, "big_uint: assignment from negative integer");
+            do_assign_integral(static_cast<std::make_unsigned_t<T>>(val));
+        }
+
+        template<class T, std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, int> = 0>
+        constexpr big_uint(T val) noexcept {
+            do_assign_integral(val);
+        }
+
+        // TODO(ioxid): make this explicit for the case when Bits2 > Bits
+        template<std::size_t Bits2>
+        constexpr big_uint(const big_uint<Bits2>& other) noexcept {
+            do_assign(other);
+            if constexpr (Bits2 > Bits) {
+                NIL_CO3_MP_ASSERT(other.compare(*this) == 0);
+            }
+        }
+
+        template<std::size_t N>
+        constexpr big_uint(const std::array<std::uint8_t, N>& bytes) noexcept {
+            *this = bytes;
+        }
+
+        // Assignment
+
+        constexpr big_uint& operator=(std::string_view str) {
+            *this = detail::parse_int<Bits>(str);
+            return *this;
+        }
+        constexpr big_uint& operator=(const char* str) {
+            *this = detail::parse_int<Bits>(str);
+            return *this;
+        }
+        constexpr big_uint& operator=(const std::string &str) {
+            *this = detail::parse_int<Bits>(str);
+            return *this;
+        }
+
+        template<typename T,
+                 std::enable_if_t<std::is_integral_v<T> && std::is_signed_v<T>, int> = 0>
+        constexpr big_uint& operator=(T val) noexcept {
+            NIL_CO3_MP_ASSERT_MSG(val >= 0, "big_uint: assignment from negative integer");
+            do_assign_integral(static_cast<std::make_unsigned_t<T>>(val));
+            return *this;
+        }
+
+        template<typename T,
+                 std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T> || std::is_same_v<T, unsigned __int128>, int> = 0>
+        constexpr big_uint& operator=(T val) noexcept {
+            do_assign_integral(val);
+            return *this;
+        }
+
+        template<std::size_t Bits2>
+        constexpr big_uint& operator=(const big_uint<Bits2>& other) noexcept {
+            do_assign(other);
+            if constexpr (Bits2 > Bits) {
+                NIL_CO3_MP_ASSERT(other.compare(*this) == 0);
+            }
+            return *this;
+        }
+
+        template<std::size_t N>
+        constexpr big_uint& operator=(const std::array<std::uint8_t, N>& bytes) {
+            std::size_t bits = 0;
+            for (std::size_t i = 0; i < bytes.size(); ++i) {
+                *this <<= 8;
+                if (bits != 0) {
+                    bits += 8;
+                }
+                unsigned b = bytes[i];
+                *this += b;
+                if (bits == 0 && b != 0) {
+                    bits += std::bit_width(b);
+                }
+            }
+            if (bits > Bits) {
+                NIL_THROW(std::invalid_argument("not enough bits"));
+            }
+            return *this;
+        }
+
+        // String conversion
+
+        constexpr std::string str(std::ios_base::fmtflags flags = std::ios_base::hex |
+                                                                  std::ios_base::showbase |
+                                                                  std::ios_base::uppercase) const {
+            if (flags & std::ios_base::dec) {
+                // TODO(ioxid): this is inefficient
+                std::string result;
+                auto copy = *this;
+                while (!copy.is_zero()) {
+                    result += static_cast<char>(static_cast<unsigned int>(copy % 10u) + '0');
+                    copy /= 10u;
+                }
+                std::reverse(result.begin(), result.end());
+                if (result.empty()) {
+                    result += '0';
+                }
+                return result;
+            }
+            if (!(flags & std::ios_base::hex)) {
+                NIL_THROW(std::invalid_argument("big_uint: unsupported format flags"));
+            }
+            std::string result;
+            result.reserve(used_limbs() * limb_bits / 4);
+            bool found_first = false;
+            for (int i = internal_limb_count - 1; i >= 0; --i) {
+                auto limb = limbs()[i];
+                bool should_pad = found_first;
+                found_first = found_first || limb != 0;
+                if (found_first) {
+                    std::size_t len = limb == 0 ? 1 : (std::bit_width(limb) + 3) / 4;
+                    std::size_t padded_len = len;
+                    if (should_pad) {
+                        padded_len = sizeof(limb_type) * 2;
+                    }
+                    for (std::size_t j = 0; j < padded_len - len; ++j) {
+                        result += '0';
+                    }
+                    std::size_t start_offset = result.size();
+                    result.resize(result.size() + len);
+                    auto ec = std::to_chars(result.data() + start_offset,
+                                            result.data() + result.size(), limb, 16)
+                                  .ec;
+                    NIL_CO3_MP_ASSERT(ec == std::errc{});
+                }
+            }
+            if (flags & std::ios_base::uppercase) {
+                for (std::size_t i = 0; i < result.size(); ++i) {
+                    result[i] =
+                        static_cast<char>(std::toupper(static_cast<unsigned char>(result[i])));
+                }
+            }
+            if (result.size() == 0) {
+                result += '0';
+            }
+            if (flags & std::ios_base::showbase) {
+                result = "0x" + result;
+            }
+            return result;
+        }
+
+        template<std::size_t Bits2, std::enable_if_t<(Bits2 < Bits), int> = 0>
+        constexpr big_uint<Bits2> truncate() const noexcept {
+            big_uint<Bits2> result;
+            result.do_assign(*this);
+            return result;
+        }
+
+        // Cast to integral types
+
+        template<typename T, std::enable_if_t<!std::is_same_v<T, bool> && std::is_integral_v<T> &&
+                                                  std::is_unsigned_v<T>,
+                                              int> = 0>
+        explicit constexpr operator T() const {
+            if constexpr (sizeof(T) <= sizeof(limb_type)) {
+                return static_cast<T>(this->limbs()[0]);
+            } else {
+                constexpr std::size_t n =
+                    std::min(sizeof(T) / sizeof(limb_type), internal_limb_count);
+                T result = 0;
+                for (std::size_t i = 0; i < n; ++i) {
+                    result <<= limb_bits;
+                    result |= limbs()[n - i - 1];
+                }
+                return result;
+            }
+        }
+
+        template<typename T,
+                 std::enable_if_t<std::is_integral_v<T> && std::is_signed_v<T>, int> = 0>
+        explicit constexpr operator T() const {
+            return static_cast<T>(static_cast<std::make_unsigned_t<T>>(*this));
+        }
+
+        explicit constexpr operator bool() const { return !is_zero(); }
+
+        // Comparison
+
+        template<std::size_t Bits2>
+        constexpr int compare(const big_uint<Bits2>& b) const noexcept {
+            std::size_t as = used_limbs();
+            std::size_t bs = b.used_limbs();
+            if (as != bs) {
+                return as > bs ? 1 : -1;
+            }
+            auto pa = limbs();
+            auto pb = b.limbs();
+            for (auto i = static_cast<std::ptrdiff_t>(as) - 1; i >= 0; --i) {
+                if (pa[i] != pb[i]) {
+                    return pa[i] > pb[i] ? 1 : -1;
+                }
+            }
+            return 0;
+        }
+
+        template<typename T,
+                 std::enable_if_t<std::is_integral_v<T> && std::is_signed_v<T>, int> = 0>
+        constexpr int compare(const T& b) const noexcept {
+            if (b < 0) {
+                return 1;
+            }
+            return compare(static_cast<std::make_unsigned_t<T>>(b));
+        }
+
+        template<typename T,
+                 std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, int> = 0>
+        constexpr int compare(const T& b) const noexcept {
+            static_assert(sizeof(T) <= sizeof(double_limb_type));
+            std::size_t s = used_limbs();
+            if constexpr (sizeof(T) <= sizeof(limb_type)) {
+                if (s > 1) {
+                    return 1;
+                }
+                auto lmb = this->limbs()[0];
+                return lmb == b ? 0 : lmb > b ? 1 : -1;
+            } else {
+                if (s > 2) {
+                    return 1;
+                }
+                auto dbl = static_cast<double_limb_type>(*this);
+                return dbl == b ? 0 : dbl > b ? 1 : -1;
+            }
+        }
+
+        // Comparison
+
+#define NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR(OP_)                        \
+    template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>    \
+    constexpr bool operator OP_(const T& o) const noexcept {                     \
+        return compare(o) OP_ 0;                                                 \
+    }                                                                            \
+                                                                                 \
+    template<typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>       \
+    friend constexpr bool operator OP_(const T& a, const big_uint& b) noexcept { \
+        return (-(b.compare(a)))OP_ 0;                                           \
+    }
+
+        NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR(<)
+        NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR(<=)
+        NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR(>)
+        NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR(>=)
+        NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR(==)
+        NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR(!=)
+
+#undef NIL_CO3_MP_BIG_UINT_IMPL_COMPARISON_OPERATOR
+
+        // Arithmetic operations
+
+        constexpr void negate() noexcept {
+            if (is_zero()) {
+                return;
+            }
+            complement(*this);
+            ++*this;
+        }
+
+        constexpr auto& operator++() noexcept {
+            if (limbs()[0] < max_limb_value) {
+                ++limbs()[0];
+                if constexpr (Bits < limb_bits) {
+                    normalize();
+                }
+            } else {
+                detail::add(*this, *this, static_cast<limb_type>(1u));
+            }
+            return *this;
+        }
+
+        constexpr auto operator++(int) noexcept {
+            auto copy = *this;
+            ++*this;
+            return copy;
+        }
+
+        NIL_CO3_MP_FORCEINLINE constexpr void decrement() noexcept {}
+
+        constexpr auto operator+() const noexcept { return *this; }
+
+        constexpr auto& operator--() noexcept {
+            if (limbs()[0]) {
+                --limbs()[0];
+            } else {
+                detail::subtract(*this, *this, static_cast<limb_type>(1u));
+            }
+            return *this;
+        }
+        constexpr auto operator--(int) noexcept {
+            auto copy = *this;
+            --*this;
+            return copy;
+        }
+
+        constexpr big_uint operator-() const noexcept {
+            big_uint result = *this;
+            result.negate();
+            return result;
+        }
+
+        // Arithmetic operations
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto operator+(const T& b) const noexcept {
+            detail::largest_big_uint_t<big_uint, T> result;
+            detail::add(result, *this, b);
+            return result;
+        }
+
+        template<typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+        friend constexpr auto operator+(const T& a, const big_uint& b) noexcept {
+            return b + a;
+        }
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto& operator+=(const T& b) noexcept {
+            detail::add(*this, *this, b);
+            return *this;
+        }
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto operator-(const T& b) const noexcept {
+            detail::largest_big_uint_t<big_uint, T> result;
+            detail::subtract(result, *this, b);
+            return result;
+        }
+
+        template<typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+        friend constexpr auto operator-(const T& a, const big_uint& b) noexcept {
+            return (-b) + a;
+        }
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto& operator-=(const T& b) noexcept {
+            detail::subtract(*this, *this, b);
+            return *this;
+        }
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto operator*(const T& b) const noexcept {
+            decltype(auto) b_unsigned = detail::unsigned_or_throw(b);
+            detail::largest_big_uint_t<big_uint, T> result;
+            detail::multiply(result, *this, detail::as_big_uint(b_unsigned));
+            return result;
+        }
+
+        template<typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+        friend constexpr auto operator*(const T& a, const big_uint& b) noexcept {
+            return b * a;
+        }
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto& operator*=(const T& b) noexcept {
+            decltype(auto) b_unsigned = detail::unsigned_or_throw(b);
+            big_uint result;
+            detail::multiply(result, *this, detail::as_big_uint(b_unsigned));
+            *this = result;
+            return *this;
+        }
+
+        template<typename T1, typename T2,
+                 std::enable_if_t<(std::is_same_v<T1, big_uint> && detail::is_integral_v<T2>) ||
+                                      (std::is_integral_v<T1> && std::is_same_v<T2, big_uint>),
+                                  int> = 0>
+        friend constexpr auto operator/(const T1& a, const T2& b) noexcept {
+            decltype(auto) a_unsigned = detail::unsigned_or_throw(a);
+            decltype(auto) b_unsigned = detail::unsigned_or_throw(b);
+            using big_uint_a = std::decay_t<decltype(detail::as_big_uint(a_unsigned))>;
+            big_uint_a result;
+            big_uint_a modulus;
+            detail::divide(&result, detail::as_big_uint(a_unsigned),
+                           detail::as_big_uint(b_unsigned), modulus);
+            return static_cast<detail::largest_big_uint_t<T1, T2>>(result);
+        }
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto& operator/=(const T& b) noexcept {
+            decltype(auto) b_unsigned = detail::unsigned_or_throw(b);
+            big_uint result;
+            big_uint modulus;
+            detail::divide(&result, *this, detail::as_big_uint(b_unsigned), modulus);
+            *this = result;
+            return *this;
+        }
+
+        template<typename T1, typename T2,
+                 std::enable_if_t<(std::is_same_v<T1, big_uint> && detail::is_integral_v<T2>) ||
+                                      (std::is_integral_v<T1> && std::is_same_v<T2, big_uint>),
+                                  int> = 0>
+        friend constexpr auto operator%(const T1& a, const T2& b) {
+            decltype(auto) a_unsigned = detail::unsigned_or_throw(a);
+            decltype(auto) b_unsigned = detail::unsigned_or_throw(b);
+            using big_uint_a = std::decay_t<decltype(detail::as_big_uint(a_unsigned))>;
+            big_uint_a modulus;
+            detail::divide(static_cast<big_uint_a*>(nullptr), detail::as_big_uint(a_unsigned),
+                           detail::as_big_uint(b_unsigned), modulus);
+            return static_cast<detail::largest_big_uint_t<T1, T2>>(modulus);
+        }
+
+        template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>
+        constexpr auto& operator%=(const T& b) {
+            decltype(auto) b_unsigned = detail::unsigned_or_throw(b);
+            big_uint modulus;
+            detail::divide(static_cast<big_uint*>(nullptr), *this, detail::as_big_uint(b_unsigned),
+                           modulus);
+            *this = modulus;
+            return *this;
+        }
+
+#define NIL_CO3_MP_BIG_UINT_BITWISE_OPERATOR_IMPL(OP_, OP_ASSIGN_, METHOD_)             \
+    template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>           \
+    constexpr auto operator OP_(const T& b) const noexcept {                            \
+        detail::largest_big_uint_t<big_uint, T> result = *this;                         \
+        result.METHOD_(detail::as_limb_type_or_big_uint(detail::unsigned_or_throw(b))); \
+        return result;                                                                  \
+    }                                                                                   \
+                                                                                        \
+    template<typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>              \
+    friend constexpr auto operator OP_(const T& a, const big_uint& b) noexcept {        \
+        return b OP_ a;                                                                 \
+    }                                                                                   \
+                                                                                        \
+    template<typename T, std::enable_if_t<detail::is_integral_v<T>, int> = 0>           \
+    constexpr auto& operator OP_ASSIGN_(const T & b) noexcept {                         \
+        METHOD_(detail::as_limb_type_or_big_uint(detail::unsigned_or_throw(b)));        \
+        return *this;                                                                   \
+    }
+
+        NIL_CO3_MP_BIG_UINT_BITWISE_OPERATOR_IMPL(&, &=, bitwise_and)
+        NIL_CO3_MP_BIG_UINT_BITWISE_OPERATOR_IMPL(|, |=, bitwise_or)
+        NIL_CO3_MP_BIG_UINT_BITWISE_OPERATOR_IMPL(^, ^=, bitwise_xor)
+
+#undef NIL_CO3_MP_BIG_UINT_BITWISE_OPERATOR_IMPL
+
+        // Bitwise operations
+
+      private:
+        template<std::size_t Bits2, typename Op>
+        constexpr void bitwise_op(const big_uint<Bits2>& o, Op op) noexcept {
+            //
+            // Both arguments are unsigned types, very simple case handled as a special case.
+            //
+            // First figure out how big the result needs to be and set up some data:
+            //
+            std::size_t rs = limbs_count();
+            std::size_t os = o.limbs_count();
+            auto [m, x] = std::minmax(rs, os);
+            limb_pointer pr = limbs();
+            const_limb_pointer po = o.limbs();
+            for (std::size_t i = rs; i < x; ++i) {
+                pr[i] = 0;
+            }
+
+            for (std::size_t i = 0; i < os; ++i) {
+                pr[i] = op(pr[i], po[i]);
+            }
+            for (std::size_t i = os; i < x; ++i) {
+                pr[i] = op(pr[i], static_cast<limb_type>(0u));
+            }
+            normalize();
+        }
+
+        template<std::size_t Bits2>
+        NIL_CO3_MP_FORCEINLINE constexpr void bitwise_and(const big_uint<Bits2>& o) noexcept {
+            bitwise_op(o, std::bit_and());
+        }
+
+        template<std::size_t Bits2>
+        NIL_CO3_MP_FORCEINLINE constexpr void bitwise_or(const big_uint<Bits2>& o) noexcept {
+            bitwise_op(o, std::bit_or());
+        }
+
+        template<std::size_t Bits2>
+        NIL_CO3_MP_FORCEINLINE constexpr void bitwise_xor(const big_uint<Bits2>& o) noexcept {
+            bitwise_op(o, std::bit_xor());
+        }
+
+        //
+        // Again for operands which are single limbs:
+        //
+
+        NIL_CO3_MP_FORCEINLINE constexpr void bitwise_and(limb_type l) noexcept {
+            limbs()[0] &= l;
+            zero_after(1);
+        }
+
+        NIL_CO3_MP_FORCEINLINE constexpr void bitwise_or(limb_type l) noexcept { limbs()[0] |= l; }
+
+        NIL_CO3_MP_FORCEINLINE constexpr void bitwise_xor(limb_type l) noexcept { limbs()[0] ^= l; }
+
+        NIL_CO3_MP_FORCEINLINE constexpr void complement(const big_uint<Bits>& o) noexcept {
+            std::size_t os = o.limbs_count();
+            for (std::size_t i = 0; i < os; ++i) {
+                limbs()[i] = ~o.limbs()[i];
+            }
+            normalize();
+        }
+
+        // Left shift will throw away upper Bits.
+        // This function must be called only when s % 8 == 0, i.e. we shift bytes.
+        void left_shift_byte(double_limb_type s) noexcept {
+            limb_pointer pr = limbs();
+
+            std::size_t bytes = static_cast<std::size_t>(s / CHAR_BIT);
+            if (s >= Bits) {
+                // Set result to 0.
+                zero_after(0);
+            } else {
+                unsigned char* pc = reinterpret_cast<unsigned char*>(pr);
+                std::memmove(pc + bytes, pc, limbs_count() * sizeof(limb_type) - bytes);
+                std::memset(pc, 0, bytes);
+            }
+        }
+
+        // Left shift will throw away upper Bits.
+        // This function must be called only when s % limb_bits == 0, i.e. we shift limbs, which
+        // are normally 64 bit.
+
+        constexpr void left_shift_limb(double_limb_type s) noexcept {
+            limb_type offset = static_cast<limb_type>(s / limb_bits);
+            NIL_CO3_MP_ASSERT(static_cast<limb_type>(s % limb_bits) == 0);
+
+            limb_pointer pr = limbs();
+
+            if (s >= Bits) {
+                // Set result to 0.
+                zero_after(0);
+            } else {
+                std::size_t i = offset;
+                std::size_t rs = limbs_count() + offset;
+                for (; i < limbs_count(); ++i) {
+                    pr[rs - 1 - i] = pr[limbs_count() - 1 - i];
+                }
+                for (; i < rs; ++i) {
+                    pr[rs - 1 - i] = 0;
+                }
+            }
+        }
+
+        // Left shift will throw away upper Bits.
+
+        constexpr void left_shift_generic(double_limb_type s) noexcept {
+            if (s >= Bits) {
+                // Set result to 0.
+                zero_after(0);
+            } else {
+                limb_type offset = static_cast<limb_type>(s / limb_bits);
+                limb_type shift = static_cast<limb_type>(s % limb_bits);
+
+                limb_pointer pr = limbs();
+                std::size_t i = 0;
+                std::size_t rs = limbs_count();
+                // This code only works when shift is non-zero, otherwise we invoke undefined
+                // behaviour!
+                NIL_CO3_MP_ASSERT(shift);
+                for (; rs - i >= 2 + offset; ++i) {
+                    pr[rs - 1 - i] = pr[rs - 1 - i - offset] << shift;
+                    pr[rs - 1 - i] |= pr[rs - 2 - i - offset] >> (limb_bits - shift);
+                }
+                if (rs - i >= 1 + offset) {
+                    pr[rs - 1 - i] = pr[rs - 1 - i - offset] << shift;
+                    ++i;
+                }
+                for (; i < rs; ++i) {
+                    pr[rs - 1 - i] = 0;
+                }
+            }
+        }
+
+        void right_shift_byte(double_limb_type s) noexcept {
+            limb_type offset = static_cast<limb_type>(s / limb_bits);
+            NIL_CO3_MP_ASSERT((s % CHAR_BIT) == 0);
+            std::size_t ors = limbs_count();
+            std::size_t rs = ors;
+            if (offset >= rs) {
+                zero_after(0);
+                return;
+            }
+            rs -= offset;
+            limb_pointer pr = limbs();
+            unsigned char* pc = reinterpret_cast<unsigned char*>(pr);
+            limb_type shift = static_cast<limb_type>(s / CHAR_BIT);
+            std::memmove(pc, pc + shift, ors * sizeof(pr[0]) - shift);
+            shift = (sizeof(limb_type) - shift % sizeof(limb_type)) * CHAR_BIT;
+            if (shift < limb_bits) {
+                pr[ors - offset - 1] &= (static_cast<limb_type>(1u) << shift) - 1;
+                if (!pr[ors - offset - 1] && (rs > 1)) {
+                    --rs;
+                }
+            }
+            // Set zeros after 'rs', alternative to resizing to size 'rs'.
+            zero_after(rs);
+        }
+
+        constexpr void right_shift_limb(double_limb_type s) noexcept {
+            limb_type offset = static_cast<limb_type>(s / limb_bits);
+            NIL_CO3_MP_ASSERT((s % limb_bits) == 0);
+            std::size_t ors = limbs_count();
+            std::size_t rs = ors;
+            if (offset >= rs) {
+                zero_after(0);
+                return;
+            }
+            rs -= offset;
+            limb_pointer pr = limbs();
+            std::size_t i = 0;
+            for (; i < rs; ++i) {
+                pr[i] = pr[i + offset];
+            }
+            // Set zeros after 'rs', alternative to resizing to size 'rs'.
+            zero_after(rs);
+        }
+
+        constexpr void right_shift_generic(double_limb_type s) noexcept {
+            limb_type offset = static_cast<limb_type>(s / limb_bits);
+            limb_type shift = static_cast<limb_type>(s % limb_bits);
+            std::size_t ors = limbs_count();
+            std::size_t rs = ors;
+
+            if (offset >= rs) {
+                *this = static_cast<limb_type>(0u);
+                return;
+            }
+            rs -= offset;
+            limb_pointer pr = limbs();
+            if ((pr[ors - 1] >> shift) == 0) {
+                if (--rs == 0) {
+                    *this = static_cast<limb_type>(0u);
+                    return;
+                }
+            }
+            std::size_t i = 0;
+
+            // This code only works for non-zero shift, otherwise we invoke undefined behaviour!
+            NIL_CO3_MP_ASSERT(shift);
+            for (; i + offset + 1 < ors; ++i) {
+                pr[i] = pr[i + offset] >> shift;
+                pr[i] |= pr[i + offset + 1] << (limb_bits - shift);
+            }
+            pr[i] = pr[i + offset] >> shift;
+
+            // We cannot resize any more, so we need to set all the limbs to zero.
+            zero_after(rs);
+        }
+
+      public:
+        constexpr auto operator~() const noexcept {
+            big_uint result;
+            result.complement(*this);
+            return result;
+        }
+
+        // Shifting left throws away upper Bits.
+        constexpr big_uint& operator<<=(double_limb_type s) noexcept {
+            if (!s) {
+                return *this;
+            }
+
+#if NIL_CO3_MP_ENDIAN_LITTLE_BYTE && defined(NIL_CO3_MP_USE_LIMB_SHIFT)
+            constexpr limb_type limb_shift_mask = limb_bits - 1;
+            constexpr limb_type byte_shift_mask = CHAR_BIT - 1;
+
+            if ((s & limb_shift_mask) == 0) {
+                left_shift_limb(s);
+            } else if (((s & byte_shift_mask) == 0) && !std::is_constant_evaluated()) {
+                left_shift_byte(s);
+            }
+#elif NIL_CO3_MP_ENDIAN_LITTLE_BYTE
+            constexpr limb_type limb_shift_mask = limb_bits - 1;
+            constexpr limb_type byte_shift_mask = CHAR_BIT - 1;
+
+            if (std::is_constant_evaluated() && ((s & limb_shift_mask) == 0)) {
+                left_shift_limb(s);
+            } else if (((s & byte_shift_mask) == 0) && !std::is_constant_evaluated()) {
+                left_shift_byte(s);
+            }
+#else
+            constexpr limb_type limb_shift_mask = limb_bits - 1;
+
+            if ((s & limb_shift_mask) == 0) {
+                left_shift_limb(s);
+            }
+#endif
+            else {
+                left_shift_generic(s);
+            }
+            normalize();
+            return *this;
+        }
+
+        constexpr big_uint operator<<(double_limb_type s) const noexcept {
+            big_uint result = *this;
+            result <<= s;
+            return result;
+        }
+
+        constexpr big_uint& operator>>=(double_limb_type s) noexcept {
+            if (!s) {
+                return *this;
+            }
+
+#if NIL_CO3_MP_ENDIAN_LITTLE_BYTE && defined(NIL_CO3_MP_USE_LIMB_SHIFT)
+            constexpr limb_type limb_shift_mask = limb_bits - 1;
+            constexpr limb_type byte_shift_mask = CHAR_BIT - 1;
+
+            if ((s & limb_shift_mask) == 0) {
+                right_shift_limb(s);
+            } else if (((s & byte_shift_mask) == 0) && !std::is_constant_evaluated()) {
+                right_shift_byte(s);
+            }
+#elif NIL_CO3_MP_ENDIAN_LITTLE_BYTE
+            constexpr limb_type byte_shift_mask = CHAR_BIT - 1;
+
+            constexpr limb_type limb_shift_mask = limb_bits - 1;
+            if (std::is_constant_evaluated() && ((s & limb_shift_mask) == 0)) {
+                right_shift_limb(s);
+            } else if (((s & byte_shift_mask) == 0) && !std::is_constant_evaluated()) {
+                right_shift_byte(s);
+            }
+#else
+            constexpr limb_type limb_shift_mask = limb_bits - 1;
+
+            if ((s & limb_shift_mask) == 0) {
+                right_shift_limb(s);
+            }
+#endif
+            else {
+                right_shift_generic(s);
+            }
+            return *this;
+        }
+
+        constexpr big_uint operator>>(double_limb_type s) const noexcept {
+            big_uint result = *this;
+            result >>= s;
+            return result;
+        }
+
+        // IO
+
+        friend std::ostream& operator<<(std::ostream& os, const big_uint& value) {
+            os << value.str(os.flags());
+            return os;
+        }
+
+        // Misc ops
+
+        NIL_CO3_MP_FORCEINLINE constexpr bool is_zero() const noexcept {
+            for (std::size_t i = 0; i < limbs_count(); ++i) {
+                if (limbs()[i] != 0) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        constexpr std::size_t lsb() const {
+            //
+            // Find the index of the least significant limb that is non-zero:
+            //
+            std::size_t index = 0;
+            while ((index < limbs_count()) && !limbs()[index]) {
+                ++index;
+            }
+
+            if (index == limbs_count()) {
+                NIL_THROW(std::invalid_argument("zero has no lsb"));
+            }
+
+            //
+            // Find the index of the least significant bit within that limb:
+            //
+            std::size_t result = std::countr_zero(limbs()[index]);
+
+            return result + index * limb_bits;
+        }
+
+        constexpr std::size_t msb() const {
+            //
+            // Find the index of the most significant bit that is non-zero:
+            //
+            for (std::size_t i = limbs_count() - 1; i > 0; --i) {
+                if (limbs()[i] != 0) {
+                    return i * limb_bits + std::bit_width(limbs()[i]) - 1;
+                }
+            }
+            if (limbs()[0] == 0) {
+                NIL_THROW(std::invalid_argument("zero has no msb"));
+            }
+            return std::bit_width(limbs()[0]) - 1;
+        }
+
+        constexpr bool bit_test(std::size_t index) const {
+            if (index >= Bits) {
+                return false;
+                // TODO(ioxid): this throws in multiexp tests
+                // NIL_THROW(std::invalid_argument("fixed precision overflow"));
+            }
+            std::size_t offset = index / limb_bits;
+            std::size_t shift = index % limb_bits;
+            limb_type mask = limb_type(1u) << shift;
+            return static_cast<bool>(limbs()[offset] & mask);
+        }
+
+        constexpr void bit_set(std::size_t index) {
+            if (index >= Bits) {
+                NIL_THROW(std::invalid_argument("fixed precision overflow"));
+            }
+            std::size_t offset = index / limb_bits;
+            std::size_t shift = index % limb_bits;
+            limb_type mask = limb_type(1u) << shift;
+            limbs()[offset] |= mask;
+        }
+
+        constexpr void bit_unset(std::size_t index) {
+            if (index >= Bits) {
+                NIL_THROW(std::invalid_argument("fixed precision overflow"));
+            }
+            std::size_t offset = index / limb_bits;
+            std::size_t shift = index % limb_bits;
+            limb_type mask = limb_type(1u) << shift;
+            limbs()[offset] &= ~mask;
+        }
+
+        constexpr void bit_flip(big_uint<Bits>& val, std::size_t index) {
+            if (index >= Bits) {
+                NIL_THROW(std::invalid_argument("fixed precision overflow"));
+            }
+            std::size_t offset = index / limb_bits;
+            std::size_t shift = index % limb_bits;
+            limb_type mask = limb_type(1u) << shift;
+            val.limbs()[offset] ^= mask;
+        }
+
+      private:
+        // Data
+
+        // m_data[0] contains the lowest bits.
+        std::array<limb_type, internal_limb_count> m_data{0};
+
+        // This is a temporary value which is set when carry has happend during addition.
+        // If this value is true, reduction by modulus must happen next.
+        bool m_carry = false;
+
+        // Friends
+
+        template<std::size_t>
+        friend class big_uint;
+
+        template<std::size_t Bits1, std::size_t Bits2, std::size_t Bits3>
+        friend constexpr void detail::add_constexpr_unsigned(big_uint<Bits1>& result,
+                                                             const big_uint<Bits2>& a,
+                                                             const big_uint<Bits3>& b) noexcept;
+        template<std::size_t Bits1, std::size_t Bits2, std::size_t Bits3>
+        friend constexpr void detail::subtract_constexpr_unsigned(
+            big_uint<Bits1>& result, const big_uint<Bits2>& a, const big_uint<Bits3>& b) noexcept;
+        template<std::size_t Bits1, std::size_t Bits2, std::size_t Bits3>
+        friend constexpr void detail::add_unsigned(big_uint<Bits1>& result,
+                                                   const big_uint<Bits2>& a,
+                                                   const big_uint<Bits3>& b) noexcept;
+        template<std::size_t Bits1, std::size_t Bits2, std::size_t Bits3>
+        friend constexpr void detail::subtract_unsigned(big_uint<Bits1>& result,
+                                                        const big_uint<Bits2>& a,
+                                                        const big_uint<Bits3>& b) noexcept;
+        template<std::size_t Bits1, std::size_t Bits2>
+        friend constexpr void detail::add_unsigned(big_uint<Bits1>& result,
+                                                   const big_uint<Bits2>& a,
+                                                   const limb_type& o) noexcept;
+        template<std::size_t Bits1, std::size_t Bits2>
+        friend constexpr void detail::subtract_unsigned(big_uint<Bits1>& result,
+                                                        const big_uint<Bits2>& a,
+                                                        const limb_type& b) noexcept;
+        template<std::size_t Bits1, std::size_t Bits2>
+        friend constexpr void detail::divide(big_uint<Bits1>* div, const big_uint<Bits1>& x,
+                                             const big_uint<Bits2>& y, big_uint<Bits1>& rem);
+        template<std::size_t Bits1, std::size_t Bits2, typename T>
+        friend constexpr void detail::multiply(big_uint<Bits1>& result, const big_uint<Bits2>& a,
+                                               const T& b) noexcept;
+    };
+
+    // Hash
+
+    template<std::size_t Bits>
+    constexpr std::size_t hash_value(const big_uint<Bits>& val) noexcept {
+        std::size_t result = 0;
+        for (std::size_t i = 0; i < val.limbs_count(); ++i) {
+            boost::hash_combine(result, val.limbs()[i]);
+        }
+        return result;
+    }
+
+    // Misc ops
+
+    template<std::size_t Bits>
+    constexpr std::size_t msb(const big_uint<Bits>& a) {
+        return a.msb();
+    }
+
+    template<std::size_t Bits>
+    constexpr std::size_t lsb(const big_uint<Bits>& a) {
+        return a.lsb();
+    }
+
+    template<std::size_t Bits>
+    constexpr bool bit_test(const big_uint<Bits>& a, std::size_t index) {
+        return a.bit_test(index);
+    }
+
+    template<std::size_t Bits>
+    constexpr bool is_zero(const big_uint<Bits>& a) {
+        return a.is_zero();
+    }
+
+    template<std::size_t Bits1, std::size_t Bits2>
+    constexpr void divide_qr(const big_uint<Bits1>& a, const big_uint<Bits2>& b, big_uint<Bits1>& q,
+                             big_uint<Bits1>& r) {
+        detail::divide(&q, a, b, r);
+    }
+}  // namespace nil::crypto3::multiprecision
+
+template<std::size_t Bits>
+struct std::hash<nil::crypto3::multiprecision::big_uint<Bits>> {
+    std::size_t operator()(const nil::crypto3::multiprecision::big_uint<Bits>& a) const noexcept {
+        return boost::hash<nil::crypto3::multiprecision::big_uint<Bits>>{}(a);
+    }
+};
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/parsing.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/parsing.hpp
index 1e1598ec7c..fbe3d82711 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/parsing.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/big_uint/parsing.hpp
@@ -13,6 +13,8 @@
 #include <stdexcept>
 #include <string_view>
 
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
+
 namespace nil::crypto3::multiprecision {
     template<std::size_t Bits>
     class big_uint;
@@ -35,7 +37,7 @@ namespace nil::crypto3::multiprecision {
         template<std::size_t Bits>
         constexpr big_uint<Bits> parse_int_hex(std::string_view str) {
             if (str.size() < 2 || str[0] != '0' || str[1] != 'x') {
-                throw std::invalid_argument("hex literal should start with 0x");
+                NIL_THROW(std::invalid_argument("hex literal should start with 0x"));
             }
 
             big_uint<Bits> result{0};
@@ -44,7 +46,7 @@ namespace nil::crypto3::multiprecision {
             for (std::size_t i = 2; i < str.size(); ++i) {
                 char c = str[i];
                 if (!is_valid_hex_digit(c)) {
-                    throw std::invalid_argument("non-hex character in literal");
+                    NIL_THROW(std::invalid_argument("non-hex character in literal"));
                 }
                 result <<= 4;
                 if (bits != 0) {
@@ -57,7 +59,7 @@ namespace nil::crypto3::multiprecision {
                 }
             }
             if (bits > Bits) {
-                throw std::range_error("not enough bits to store literal");
+                NIL_THROW(std::range_error("not enough bits to store literal"));
             }
             return result;
         }
@@ -69,7 +71,7 @@ namespace nil::crypto3::multiprecision {
             for (std::size_t i = 0; i < str.size(); ++i) {
                 char c = str[i];
                 if (c < '0' || c > '9') {
-                    throw std::invalid_argument("non decimal character in literal");
+                    NIL_THROW(std::invalid_argument("non decimal character in literal"));
                 }
                 result *= 10u;
                 result += static_cast<unsigned>(c - '0');
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/config.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/config.hpp
index f5a7956ace..94a2aa0e08 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/config.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/config.hpp
@@ -17,3 +17,10 @@
 
 // Disable use of int128
 // #define NIL_CO3_MP_DISABLE_INT128
+
+#if defined(GPU_PROVER)
+  #define NIL_CO3_MP_DISABLE_INT128
+  #define NIL_CO3_MP_DISABLE_INTRINSICS
+  // black magick, i do not remember why i did this
+  struct float128_type {};
+#endif
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/integer_ops_base.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/integer_ops_base.hpp
index e21b943942..7934c10649 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/integer_ops_base.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/integer_ops_base.hpp
@@ -13,13 +13,15 @@
 #include <stdexcept>
 #include <type_traits>
 
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
+
 namespace nil::crypto3::multiprecision {
 
     template<typename T,
              std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, int> = 0>
     constexpr std::size_t lsb(T a) {
         if (a == 0) {
-            throw std::invalid_argument("zero has no lsb");
+            NIL_THROW(std::invalid_argument("zero has no lsb"));
         }
         return std::countr_zero(a);
     }
@@ -28,7 +30,7 @@ namespace nil::crypto3::multiprecision {
              std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, int> = 0>
     constexpr std::size_t msb(T a) {
         if (a == 0) {
-            throw std::invalid_argument("zero has no msb");
+            NIL_THROW(std::invalid_argument("zero has no msb"));
         }
         return std::bit_width(a) - 1;
     }
@@ -48,7 +50,7 @@ namespace nil::crypto3::multiprecision {
              std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, int> = 0>
     constexpr T &bit_set(T &a, std::size_t index) {
         if (index >= sizeof(T) * CHAR_BIT) {
-            throw std::invalid_argument("fixed precision overflow");
+            NIL_THROW(std::invalid_argument("fixed precision overflow"));
         }
         auto mask = static_cast<T>(1u) << index;
         a |= mask;
@@ -59,7 +61,7 @@ namespace nil::crypto3::multiprecision {
              std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, int> = 0>
     constexpr T &bit_unset(T &a, std::size_t index) {
         if (index >= sizeof(T) * CHAR_BIT) {
-            throw std::invalid_argument("fixed precision overflow");
+            NIL_THROW(std::invalid_argument("fixed precision overflow"));
         }
         auto mask = static_cast<T>(1u) << index;
         a &= ~mask;
@@ -70,7 +72,7 @@ namespace nil::crypto3::multiprecision {
              std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, int> = 0>
     constexpr T &bit_flip(T &a, std::size_t index) {
         if (index >= sizeof(T) * CHAR_BIT) {
-            throw std::invalid_argument("fixed precision overflow");
+            NIL_THROW(std::invalid_argument("fixed precision overflow"));
         }
         auto mask = static_cast<T>(1u) << index;
         a ^= mask;
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/throw.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/throw.hpp
new file mode 100644
index 0000000000..6bddf58699
--- /dev/null
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/detail/throw.hpp
@@ -0,0 +1,36 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2025 Dmitrii Tabalin <d.tabalin@nil.foundation>
+//
+// MIT License
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//---------------------------------------------------------------------------//
+
+#pragma once
+
+#include <boost/assert.hpp>
+
+// we cannot throw exceptions on the GPU
+// so we replace throw with assert in that case
+
+#ifdef GPU_PROVER
+#define NIL_THROW(x) BOOST_ASSERT_MSG(false, x.what());
+#else
+#define NIL_THROW(x) throw x;
+#endif
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/inverse.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/inverse.hpp
index afd3b575af..327dc9faa5 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/inverse.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/inverse.hpp
@@ -20,6 +20,7 @@
 #include "nil/crypto3/multiprecision/detail/big_int.hpp"
 #include "nil/crypto3/multiprecision/detail/half_extended_euclidean_algorithm.hpp"
 #include "nil/crypto3/multiprecision/type_traits.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
 
 namespace nil::crypto3::multiprecision {
     template<std::size_t Bits>
@@ -28,7 +29,7 @@ namespace nil::crypto3::multiprecision {
         big_int<Bits> aa = a, mm = m, x, g;
         g = detail::half_extended_euclidean_algorithm(aa, mm, x);
         if (g != 1u) {
-            throw std::invalid_argument("no multiplicative inverse");
+            NIL_THROW(std::invalid_argument("no multiplicative inverse"));
         }
         x %= m;
         if (x.negative()) {
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/jacobi.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/jacobi.hpp
index 6581c316f5..5bca633252 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/jacobi.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/jacobi.hpp
@@ -14,6 +14,7 @@
 #include <stdexcept>
 
 #include "nil/crypto3/multiprecision/big_uint.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
 
 namespace nil::crypto3::multiprecision {
 
@@ -23,7 +24,7 @@ namespace nil::crypto3::multiprecision {
 
         // TODO(ioxid): optimize
         if (n % 2u == 0 || n <= 1) {
-            throw std::invalid_argument("jacobi: second argument must be odd and > 1");
+            NIL_THROW(std::invalid_argument("jacobi: second argument must be odd and > 1"));
         }
 
         big_uint_t x = a, y = n;
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/ressol.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/ressol.hpp
index 6f15600f61..e5ad4d4f56 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/ressol.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/ressol.hpp
@@ -21,6 +21,7 @@
 #include "nil/crypto3/multiprecision/big_uint.hpp"
 #include "nil/crypto3/multiprecision/jacobi.hpp"
 #include "nil/crypto3/multiprecision/pow.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
 
 namespace nil::crypto3::multiprecision {
     template<std::size_t Bits>
@@ -49,7 +50,7 @@ namespace nil::crypto3::multiprecision {
         BOOST_ASSERT(p % 2u != 0u);
 
         if (jacobi(a, p) != 1) {
-            throw std::invalid_argument("Not a quadratic residue");
+            NIL_THROW(std::invalid_argument("Not a quadratic residue"));
         }
 
         // We can use montgomery_big_mod because p is odd
@@ -89,7 +90,7 @@ namespace nil::crypto3::multiprecision {
         big_uint_t z = two;
         while (jacobi(z, p) == 1) {
             if (z.is_zero()) {
-                throw std::invalid_argument("No quadratic nonresidue");
+                NIL_THROW(std::invalid_argument("No quadratic nonresidue"));
             }
             ++z;
         }
@@ -113,7 +114,7 @@ namespace nil::crypto3::multiprecision {
                 if (i >= s) {
                     // TODO(ioxid): when can this happen? (jacobi said that this should
                     // not happen) Martun: the value now has a square root
-                    throw std::invalid_argument("Not a quadratic residue");
+                    NIL_THROW(std::invalid_argument("Not a quadratic residue"));
                 }
             }
 
diff --git a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/unsigned_utils.hpp b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/unsigned_utils.hpp
index 3e00b3a2e3..35884a6bb3 100644
--- a/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/unsigned_utils.hpp
+++ b/crypto3/libs/multiprecision/include/nil/crypto3/multiprecision/unsigned_utils.hpp
@@ -12,6 +12,7 @@
 #include <type_traits>
 
 #include "nil/crypto3/multiprecision/type_traits.hpp"
+#include "nil/crypto3/multiprecision/detail/throw.hpp"
 
 namespace nil::crypto3::multiprecision {
     // unsigned_abs returns the absolute value of a signed type as an unsigned type.
@@ -39,9 +40,13 @@ namespace nil::crypto3::multiprecision {
 
     template<typename T, std::enable_if_t<std::is_signed_v<T>, int> = 0>
     constexpr std::make_unsigned_t<T> unsigned_or_throw(const T& a) {
+        // we are unable to throw exceptions on GPU
+        // so we pray that the value is nonnegative
+        #ifndef GPU_PROVER
         if (a < 0) {
-            throw std::range_error("nonnegative value expected");
+            NIL_THROW(std::range_error("nonnegative value expected"));
         }
+        #endif
         return static_cast<std::make_unsigned_t<T>>(a);
     }
 
diff --git a/crypto3/libs/random/example/CMakeLists.txt b/crypto3/libs/random/example/CMakeLists.txt
index 8a38f3b78b..0dc63b492d 100644
--- a/crypto3/libs/random/example/CMakeLists.txt
+++ b/crypto3/libs/random/example/CMakeLists.txt
@@ -20,7 +20,7 @@ macro(define_random_example name)
             ${CMAKE_WORKSPACE_NAME}::multiprecision
 
             Boost::container)
-    set_target_properties(random_${name}_example PROPERTIES CXX_STANDARD 20)
+    set_target_properties(random_${name}_example PROPERTIES CXX_STANDARD 23)
 endmacro()
 
 set(EXAMPLES_NAMES
diff --git a/crypto3/libs/random/test/CMakeLists.txt b/crypto3/libs/random/test/CMakeLists.txt
index 484d40f82a..a758c724c9 100644
--- a/crypto3/libs/random/test/CMakeLists.txt
+++ b/crypto3/libs/random/test/CMakeLists.txt
@@ -12,7 +12,7 @@ include(CMTest)
 cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
     ${CMAKE_WORKSPACE_NAME}::multiprecision
     Boost::unit_test_framework)
-set_target_properties(_cm_internal_tests-crypto3-random-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-crypto3-random-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-crypto3-random-test REUSE_FROM crypto3_precompiled_headers)
 
 macro(define_random_test test)
@@ -27,7 +27,7 @@ macro(define_random_test test)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 23)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         target_compile_options(${full_test_name} PRIVATE "-fconstexpr-steps=2147483647")
diff --git a/crypto3/libs/transpiler/test/CMakeLists.txt b/crypto3/libs/transpiler/test/CMakeLists.txt
index 6550755099..77a02e9373 100644
--- a/crypto3/libs/transpiler/test/CMakeLists.txt
+++ b/crypto3/libs/transpiler/test/CMakeLists.txt
@@ -18,7 +18,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
                       crypto3::zk
                       Boost::unit_test_framework
 )
-set_target_properties(_cm_internal_tests-crypto3-transpiler-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-crypto3-transpiler-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-crypto3-transpiler-test REUSE_FROM crypto3_precompiled_headers)
 
 add_custom_target(compile_and_run_transpiler_tests)
@@ -39,7 +39,7 @@ macro(define_transpiler_test name)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 23)
 
     get_target_property(target_type Boost::unit_test_framework TYPE)
     if(target_type STREQUAL "SHARED_LIB")
diff --git a/crypto3/libs/zk/test/CMakeLists.txt b/crypto3/libs/zk/test/CMakeLists.txt
index d1b8250b77..23e37c48fa 100644
--- a/crypto3/libs/zk/test/CMakeLists.txt
+++ b/crypto3/libs/zk/test/CMakeLists.txt
@@ -19,7 +19,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
     Boost::unit_test_framework
     Boost::log
     )
-set_target_properties(_cm_internal_tests-crypto3-zk-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-crypto3-zk-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-crypto3-zk-test REUSE_FROM crypto3_precompiled_headers)
 
 if(PROFILING_ENABLED)
@@ -38,7 +38,7 @@ macro(define_zk_test test)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 23)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         target_compile_options(${full_test_name} PRIVATE "-fconstexpr-steps=2147483647")
diff --git a/debug-tools/bin/circgen/CMakeLists.txt b/debug-tools/bin/circgen/CMakeLists.txt
index 003f359d2a..5cfa6dd736 100644
--- a/debug-tools/bin/circgen/CMakeLists.txt
+++ b/debug-tools/bin/circgen/CMakeLists.txt
@@ -16,7 +16,7 @@ add_executable(circgen
 set_target_properties(circgen PROPERTIES
     LINKER_LANGUAGE CXX
     EXPORT_NAME circgen
-    CXX_STANDARD 20
+    CXX_STANDARD 23
     CXX_STANDARD_REQUIRED TRUE)
 
 
diff --git a/debug-tools/bin/excalibur/src/CMakeLists.txt b/debug-tools/bin/excalibur/src/CMakeLists.txt
index 3b54265c50..1f28da1e04 100644
--- a/debug-tools/bin/excalibur/src/CMakeLists.txt
+++ b/debug-tools/bin/excalibur/src/CMakeLists.txt
@@ -59,7 +59,7 @@ add_executable(${C3_TARGET}
 set_target_properties(${C3_TARGET} PROPERTIES
                       LINKER_LANGUAGE CXX
                       EXPORT_NAME ${CMAKE_PROJECT_NAME}
-                      CXX_STANDARD 20
+                      CXX_STANDARD 23
                       CXX_STANDARD_REQUIRED TRUE)
 
 target_link_directories(
diff --git a/find_symbol.sh b/find_symbol.sh
new file mode 100755
index 0000000000..563d0d10a8
--- /dev/null
+++ b/find_symbol.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Symbol to search for
+SYMBOL="$2"
+
+# Check if a directory was provided
+if [ -z "$1" ]; then
+    echo "Usage: $0 /path/to/directory symbol"
+    exit 1
+fi
+
+# Directory to search
+DIR="$1"
+
+# Find all .so and .a files and process them
+find "$DIR" -type f \( -name '*.so*' -o -name '*.a' \) -print0 | while IFS= read -r -d '' file; do
+    # Run nm and search for the symbol
+    if nm -D -U "$file" 2>/dev/null | grep -Fq "$SYMBOL"; then
+        echo "Symbol found in: $file"
+    fi
+done
diff --git a/flake.nix b/flake.nix
index 5c75a0af9f..5ac28442f5 100644
--- a/flake.nix
+++ b/flake.nix
@@ -21,6 +21,9 @@
         pkgs = import nixpkgs {
           inherit system;
           overlays = [ nix-3rdparty.overlays.${system}.default ];
+          config = {
+            allowUnfree = true;
+          };
         };
 
         # For proof-producer, our main target is statically linked binaries,
@@ -75,26 +78,33 @@
 
 
           parallel-crypto3 = (pkgs.callPackage ./parallel-crypto3.nix {
+            stdenv = pkgs.llvmPackages_19.stdenv;
             runTests = false;
             enableDebug = false;
+            enableGPU = false;
           });
           parallel-crypto3-tests = (pkgs.callPackage ./parallel-crypto3.nix {
+            stdenv = pkgs.llvmPackages_19.stdenv;
             runTests = true;
             enableDebug = false;
+            enableGPU = true;
           });
           parallel-crypto3-clang-bench = (pkgs.callPackage ./parallel-crypto3.nix {
             runTests = true;
             enableDebug = false;
             benchmarkTests = true;
+            enableGPU = false;
           });
           parallel-crypto3-debug-tests = (pkgs.callPackage ./parallel-crypto3.nix {
             enableDebug = true;
             runTests = true;
+            enableGPU = false;
           });
           parallel-crypto3-clang-debug = (pkgs.callPackage ./parallel-crypto3.nix {
             stdenv = pkgs.llvmPackages_19.stdenv;
             enableDebug = true;
             runTests = false;
+            enableGPU = false;
           });
 
           proof-producer = (staticPkgs.callPackage ./proof-producer.nix {
@@ -124,9 +134,10 @@
             runTests = true;
             sanitize = true;
             crypto3_tests = true;
-            parallel_crypto3_tets = true;
+            parallel_crypto3_tests = true;
             crypto3_bechmarks = true;
             parallel_crypto3_bechmarks = true;
+            enableGPU = true;
           });
 
           develop-clang = (pkgs.callPackage ./proof-producer.nix {
@@ -136,7 +147,7 @@
             runTests = true;
             sanitize = true;
             crypto3_tests = true;
-            parallel_crypto3_tets = true;
+            parallel_crypto3_tests = true;
             crypto3_bechmarks = true;
             parallel_crypto3_bechmarks = true;
           });
diff --git a/gpu_gpustat_monitor.sh b/gpu_gpustat_monitor.sh
new file mode 100755
index 0000000000..0d1b41d49e
--- /dev/null
+++ b/gpu_gpustat_monitor.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+while true; do
+    gpustat --json >> gpu_utilization_gpustat_log.json
+done
diff --git a/lzpatcher.sh b/lzpatcher.sh
new file mode 100755
index 0000000000..773782bae8
--- /dev/null
+++ b/lzpatcher.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+set -e
+
+usage() {
+    echo "Usage: $0 /path/to/directory"
+    exit 1
+}
+
+if [ -z "$1" ]; then
+    usage
+fi
+
+DIRECTORY="$1"
+
+if [ ! -d "$DIRECTORY" ]; then
+    echo "Error: Directory '$DIRECTORY' does not exist."
+    exit 1
+fi
+
+# Iterate over all files in the directory
+find "$DIRECTORY" -maxdepth 1 -type f | while read -r FILE; do
+    # Check if file is an ELF executable
+    if file "$FILE" | grep -q 'ELF'; then
+        echo "Processing ELF executable: $FILE"
+
+        # Backup the original file
+        cp "$FILE" "$FILE.bak"
+
+        # Get existing RPATH
+        EXISTING_RPATH=$(patchelf --print-rpath "$FILE" || true)
+
+        # Determine the new RPATH
+        if [ -z "$EXISTING_RPATH" ]; then
+            NEW_RPATH="$LIBZ_DIR"
+        else
+            NEW_RPATH="$EXISTING_RPATH:$LIBZ_DIR"
+        fi
+
+        # Modify the RPATH
+        patchelf --set-rpath "$NEW_RPATH" "$FILE"
+
+        echo "Updated RPATH for $FILE to $NEW_RPATH"
+    else
+        echo "Skipping non-ELF file: $FILE"
+    fi
+done
+
+echo "RPATH update complete."
diff --git a/nclang++ b/nclang++
new file mode 100644
index 0000000000..afded054b3
--- /dev/null
+++ b/nclang++
@@ -0,0 +1,275 @@
+#! /nix/store/717iy55ncqs0wmhdkwc5fg2vci5wbmq8-bash-5.2p32/bin/bash
+set -eu -o pipefail +o posix
+shopt -s nullglob
+
+if (( "${NIX_DEBUG:-0}" >= 7 )); then
+    set -x
+fi
+
+path_backup="$PATH"
+
+# That @-vars are substituted separately from bash evaluation makes
+# shellcheck think this, and others like it, are useless conditionals.
+# shellcheck disable=SC2157
+if [[ -n "/nix/store/ph44jcx3ddmlwh394mh1wb7f1qigxqb1-coreutils-9.5" && -n "/nix/store/lvnwdmnjm7nvaq0a3vhvvn46iy4ql7gr-gnugrep-3.11" ]]; then
+    PATH="/nix/store/ph44jcx3ddmlwh394mh1wb7f1qigxqb1-coreutils-9.5/bin:/nix/store/lvnwdmnjm7nvaq0a3vhvvn46iy4ql7gr-gnugrep-3.11/bin"
+fi
+
+source /nix/store/5j2f6adr7awqag8c7cv6q4px0lz477gc-clang-wrapper-19.1.1/nix-support/utils.bash
+
+
+# Parse command line options and set several variables.
+# For instance, figure out if linker flags should be passed.
+# GCC prints annoying warnings when they are not needed.
+dontLink=0
+nonFlagArgs=0
+cc1=0
+# shellcheck disable=SC2193
+[[ "/nix/store/m4yb6xs0g07l0bc3c4i0klgv5lgz7g6s-clang-19.1.1/bin/clang++" = *++ ]] && isCxx=1 || isCxx=0
+cxxInclude=1
+cxxLibrary=1
+cInclude=1
+
+expandResponseParams "$@"
+
+declare -ag positionalArgs=()
+declare -i n=0
+nParams=${#params[@]}
+while (( "$n" < "$nParams" )); do
+    p=${params[n]}
+    p2=${params[n+1]:-} # handle `p` being last one
+    n+=1
+
+    case "$p" in
+        -[cSEM] | -MM) dontLink=1 ;;
+        -cc1) cc1=1 ;;
+        -nostdinc) cInclude=0 cxxInclude=0 ;;
+        -nostdinc++) cxxInclude=0 ;;
+        -nostdlib) cxxLibrary=0 ;;
+        -x*-header) dontLink=1 ;; # both `-x c-header` and `-xc-header` are accepted by clang
+        -xc++*) isCxx=1 ;;        # both `-xc++` and `-x c++` are accepted by clang
+        -x)
+            case "$p2" in
+                *-header) dontLink=1 ;;
+                c++*) isCxx=1 ;;
+            esac
+            ;;
+        --) # Everything else is positional args!
+            # See: https://github.com/llvm/llvm-project/commit/ed1d07282cc9d8e4c25d585e03e5c8a1b6f63a74
+
+            # Any positional arg (i.e. any argument after `--`) will be
+            # interpreted as a "non flag" arg:
+            if [[ -v "params[$n]" ]]; then nonFlagArgs=1; fi
+
+            positionalArgs=("${params[@]:$n}")
+            params=("${params[@]:0:$((n - 1))}")
+            break;
+            ;;
+        -?*) ;;
+        *) nonFlagArgs=1 ;; # Includes a solitary dash (`-`) which signifies standard input; it is not a flag
+    esac
+done
+
+# If we pass a flag like -Wl, then gcc will call the linker unless it
+# can figure out that it has to do something else (e.g., because of a
+# "-c" flag).  So if no non-flag arguments are given, don't pass any
+# linker flags.  This catches cases like "gcc" (should just print
+# "gcc: no input files") and "gcc -v" (should print the version).
+if [ "$nonFlagArgs" = 0 ]; then
+    dontLink=1
+fi
+
+# Arocc does not link
+if [ "" = 1 ]; then
+    dontLink=1
+fi
+
+# Optionally filter out paths not refering to the store.
+if [[ "${NIX_ENFORCE_PURITY:-}" = 1 && -n "$NIX_STORE" ]]; then
+    kept=()
+    nParams=${#params[@]}
+    declare -i n=0
+    while (( "$n" < "$nParams" )); do
+        p=${params[n]}
+        p2=${params[n+1]:-} # handle `p` being last one
+        n+=1
+
+        skipNext=false
+        path=""
+        case "$p" in
+            -[IL]/*) path=${p:2} ;;
+            -[IL] | -isystem) path=$p2 skipNext=true ;;
+        esac
+
+        if [[ -n $path ]] && badPath "$path"; then
+            skip "$path"
+            $skipNext && n+=1
+            continue
+        fi
+
+        kept+=("$p")
+    done
+    # Old bash empty array hack
+    params=(${kept+"${kept[@]}"})
+fi
+
+# Flirting with a layer violation here.
+if [ -z "${NIX_BINTOOLS_WRAPPER_FLAGS_SET_x86_64_unknown_linux_gnu:-}" ]; then
+    source /nix/store/lfabp2rmzyn7ddbhgls0gsjjqckzw3np-binutils-wrapper-2.43.1/nix-support/add-flags.sh
+fi
+
+# Put this one second so libc ldflags take priority.
+if [ -z "${NIX_CC_WRAPPER_FLAGS_SET_x86_64_unknown_linux_gnu:-}" ]; then
+    source /nix/store/5j2f6adr7awqag8c7cv6q4px0lz477gc-clang-wrapper-19.1.1/nix-support/add-flags.sh
+fi
+
+# Clear march/mtune=native -- they bring impurity.
+if [ "$NIX_ENFORCE_NO_NATIVE_x86_64_unknown_linux_gnu" = 1 ]; then
+    kept=()
+    # Old bash empty array hack
+    for p in ${params+"${params[@]}"}; do
+        if [[ "$p" = -m*=native ]]; then
+            skip "$p"
+        else
+            kept+=("$p")
+        fi
+    done
+    # Old bash empty array hack
+    params=(${kept+"${kept[@]}"})
+fi
+
+if [[ "$isCxx" = 1 ]]; then
+    if [[ "$cxxInclude" = 1 ]]; then
+        #
+        # The motivation for this comment is to explain the reason for appending
+        # the C++ stdlib to NIX_CFLAGS_COMPILE, which I initially thought should
+        # change and later realized it shouldn't in:
+        #
+        #   https://github.com/NixOS/nixpkgs/pull/185569#issuecomment-1234959249
+        #
+        # NIX_CFLAGS_COMPILE contains dependencies added using "-isystem", and
+        # NIX_CXXSTDLIB_COMPILE adds the C++ stdlib using "-isystem". Appending
+        # NIX_CXXSTDLIB_COMPILE to NIX_CLAGS_COMPILE emulates this part of the
+        # include lookup order from GCC/Clang:
+        #
+        # > 4. Directories specified with -isystem options are scanned in
+        # >    left-to-right order.
+        # > 5. Standard system directories are scanned.
+        # > 6. Directories specified with -idirafter options are scanned
+        # >    in left-to-right order.
+        #
+        # NIX_CXX_STDLIB_COMPILE acts as the "standard system directories" that
+        # are otherwise missing from CC in nixpkgs, so should be added last.
+        #
+        # This means that the C standard library should never be present inside
+        # NIX_CFLAGS_COMPILE, because it MUST come after the C++ stdlib. It is
+        # added automatically by cc-wrapper later using "-idirafter".
+        #
+        NIX_CFLAGS_COMPILE_x86_64_unknown_linux_gnu+=" $NIX_CXXSTDLIB_COMPILE_x86_64_unknown_linux_gnu"
+    fi
+    if [[ "$cxxLibrary" = 1 ]]; then
+        NIX_CFLAGS_LINK_x86_64_unknown_linux_gnu+=" $NIX_CXXSTDLIB_LINK_x86_64_unknown_linux_gnu"
+    fi
+fi
+
+source /nix/store/5j2f6adr7awqag8c7cv6q4px0lz477gc-clang-wrapper-19.1.1/nix-support/add-hardening.sh
+
+# Add the flags for the C compiler proper.
+extraAfter=(${hardeningCFlagsAfter[@]+"${hardeningCFlagsAfter[@]}"} $NIX_CFLAGS_COMPILE_x86_64_unknown_linux_gnu)
+extraBefore=(${hardeningCFlagsBefore[@]+"${hardeningCFlagsBefore[@]}"} $NIX_CFLAGS_COMPILE_BEFORE_x86_64_unknown_linux_gnu)
+
+# Remove '-fzero-call-used-regs=used-gpr' from extraBefore
+filteredExtraBefore=()
+for arg in "${extraBefore[@]}"; do
+    if [[ "$arg" != "-fzero-call-used-regs=used-gpr" ]]; then
+        filteredExtraBefore+=("$arg")
+    fi
+done
+extraBefore=("${filteredExtraBefore[@]}")
+
+if [ "$dontLink" != 1 ]; then
+    linkType=$(checkLinkType $NIX_LDFLAGS_BEFORE_x86_64_unknown_linux_gnu "${params[@]}" ${NIX_CFLAGS_LINK_x86_64_unknown_linux_gnu:-} $NIX_LDFLAGS_x86_64_unknown_linux_gnu)
+
+    # Add the flags that should only be passed to the compiler when
+    # linking.
+    extraAfter+=($(filterRpathFlags "$linkType" $NIX_CFLAGS_LINK_x86_64_unknown_linux_gnu))
+
+    # Add the flags that should be passed to the linker (and prevent
+    # `ld-wrapper' from adding NIX_LDFLAGS_x86_64_unknown_linux_gnu again).
+    for i in $(filterRpathFlags "$linkType" $NIX_LDFLAGS_BEFORE_x86_64_unknown_linux_gnu); do
+        extraBefore+=("-Wl,$i")
+    done
+    if [[ "$linkType" == dynamic && -n "$NIX_DYNAMIC_LINKER_x86_64_unknown_linux_gnu" ]]; then
+        extraBefore+=("-Wl,-dynamic-linker=$NIX_DYNAMIC_LINKER_x86_64_unknown_linux_gnu")
+    fi
+    for i in $(filterRpathFlags "$linkType" $NIX_LDFLAGS_x86_64_unknown_linux_gnu); do
+        if [ "${i:0:3}" = -L/ ]; then
+            extraAfter+=("$i")
+        else
+            extraAfter+=("-Wl,$i")
+        fi
+    done
+    export NIX_LINK_TYPE_x86_64_unknown_linux_gnu=$linkType
+fi
+
+if [[ -e /nix/store/5j2f6adr7awqag8c7cv6q4px0lz477gc-clang-wrapper-19.1.1/nix-support/add-local-cc-cflags-before.sh ]]; then
+    source /nix/store/5j2f6adr7awqag8c7cv6q4px0lz477gc-clang-wrapper-19.1.1/nix-support/add-local-cc-cflags-before.sh
+fi
+
+# As a very special hack, if the arguments are just `-v', then don't
+# add anything.  This is to prevent `gcc -v' (which normally prints
+# out the version number and returns exit code 0) from printing out
+# `No input files specified' and returning exit code 1.
+if [ "$*" = -v ]; then
+    extraAfter=()
+    extraBefore=()
+fi
+
+# clang's -cc1 mode is not compatible with most options
+# that we would pass. Rather than trying to pass only
+# options that would work, let's just remove all of them.
+if [ "$cc1" = 1 ]; then
+  extraAfter=()
+  extraBefore=()
+fi
+
+# Finally, if we got any positional args, append them to `extraAfter`
+# now:
+if [[ "${#positionalArgs[@]}" -gt 0 ]]; then
+    extraAfter+=(-- "${positionalArgs[@]}")
+fi
+
+# Optionally print debug info.
+if (( "${NIX_DEBUG:-0}" >= 1 )); then
+    # Old bash workaround, see ld-wrapper for explanation.
+    echo "extra flags before to /nix/store/m4yb6xs0g07l0bc3c4i0klgv5lgz7g6s-clang-19.1.1/bin/clang++:" >&2
+    printf "  %q\n" ${extraBefore+"${extraBefore[@]}"}  >&2
+    echo "original flags to /nix/store/m4yb6xs0g07l0bc3c4i0klgv5lgz7g6s-clang-19.1.1/bin/clang++:" >&2
+    printf "  %q\n" ${params+"${params[@]}"} >&2
+    echo "extra flags after to /nix/store/m4yb6xs0g07l0bc3c4i0klgv5lgz7g6s-clang-19.1.1/bin/clang++:" >&2
+    printf "  %q\n" ${extraAfter+"${extraAfter[@]}"} >&2
+fi
+
+PATH="$path_backup"
+# Old bash workaround, see above.
+
+# if a cc-wrapper-hook exists, run it.
+if [[ -e /nix/store/5j2f6adr7awqag8c7cv6q4px0lz477gc-clang-wrapper-19.1.1/nix-support/cc-wrapper-hook ]]; then
+    compiler=/nix/store/m4yb6xs0g07l0bc3c4i0klgv5lgz7g6s-clang-19.1.1/bin/clang++
+    source /nix/store/5j2f6adr7awqag8c7cv6q4px0lz477gc-clang-wrapper-19.1.1/nix-support/cc-wrapper-hook
+fi
+
+if (( "${NIX_CC_USE_RESPONSE_FILE:-1}" >= 1 )); then
+    responseFile=$(mktemp "${TMPDIR:-/tmp}/cc-params.XXXXXX")
+    trap 'rm -f -- "$responseFile"' EXIT
+    printf "%q\n" \
+       ${extraBefore+"${extraBefore[@]}"} \
+       ${params+"${params[@]}"} \
+       ${extraAfter+"${extraAfter[@]}"} > "$responseFile"
+    /nix/store/m4yb6xs0g07l0bc3c4i0klgv5lgz7g6s-clang-19.1.1/bin/clang++ "@$responseFile"
+else
+    exec /nix/store/m4yb6xs0g07l0bc3c4i0klgv5lgz7g6s-clang-19.1.1/bin/clang++ \
+       ${extraBefore+"${extraBefore[@]}"} \
+       ${params+"${params[@]}"} \
+       ${extraAfter+"${extraAfter[@]}"}
+fi
diff --git a/opensycl.nix b/opensycl.nix
new file mode 100644
index 0000000000..11ca5aface
--- /dev/null
+++ b/opensycl.nix
@@ -0,0 +1,99 @@
+# stolen from nixpkgs (/pkgs/development/compilers/opensycl/default.nix
+# we need a custom version because (at the time of writing) the nixpkgs version is broken
+{ lib
+, fetchFromGitHub
+, llvmPackages_19
+, lld_19
+, python3
+, cmake
+, boost
+, libxml2
+, libffi
+, makeWrapper
+, config
+, cudaPackages
+, linuxPackages
+, rocmPackages_5
+, ompSupport ? true
+, openclSupport ? false
+, rocmSupport ? config.rocmSupport
+, cudaSupport
+, autoAddDriverRunpath
+}:
+let
+  inherit (llvmPackages_19) stdenv;
+  # move to newer ROCm version once supported
+  rocmPackages = rocmPackages_5;
+in
+stdenv.mkDerivation rec {
+  pname = "AdaptiveCpp";
+  version = "24.10.0";
+
+  src = fetchFromGitHub {
+    owner = "AdaptiveCpp";
+    repo = "AdaptiveCpp";
+    rev = "v24.10.0";
+    sha256 = "sha256-ZwHDiwv1ybC+2UhiOe2f7fnfqcul+CD9Uta8PT9ICr4=";
+  };
+  # zerocallusedregs is disabled because passing it to gpu compilers confuses them
+  # fortify is disabled because it was also disabled above in flake, idk why
+  hardeningDisable = [ "fortify" "zerocallusedregs" ];
+
+  nativeBuildInputs = [
+    cmake
+    makeWrapper
+  ] ++ lib.optionals cudaSupport [
+    autoAddDriverRunpath
+    linuxPackages.nvidia_x11
+    cudaPackages.cuda_nvcc
+    cudaPackages.cuda_cudart
+    cudaPackages.cudatoolkit
+    cudaPackages.cuda_nvrtc
+    cudaPackages.cuda_cupti
+  ];
+
+  buildInputs = [
+    libxml2
+    libffi
+    boost
+    llvmPackages_19.openmp
+    llvmPackages_19.llvm
+    llvmPackages_19.libclang.dev
+  ] ++ lib.optionals rocmSupport [
+    rocmPackages.clr
+    rocmPackages.rocm-runtime
+  ] ++ lib.optionals cudaSupport [
+    linuxPackages.nvidia_x11
+    cudaPackages.cuda_cudart
+    (lib.getOutput "stubs" cudaPackages.cuda_cudart)
+  ];
+
+  # set the gpu architecture for the cuda backend here
+  NIX_CXXFLAGS_COMPILE = lib.optionalString cudaSupport "--cuda-gpu-arch=sm_89";
+  # opensycl makes use of clangs internal headers. Its cmake does not successfully discover them automatically on nixos, so we supply the path manually
+  cmakeFlags = [
+    "-DCLANG_INCLUDE_PATH=${llvmPackages_19.libclang.dev}/include"
+  ] ++ lib.optionals cudaSupport [
+    "-DCMAKE_CUDA_COMPILER=$(which nvcc)"
+  ] ++ [
+    (lib.cmakeBool "WITH_CPU_BACKEND" ompSupport)
+    (lib.cmakeBool "WITH_CUDA_BACKEND" cudaSupport)
+    (lib.cmakeBool "WITH_ROCM_BACKEND" rocmSupport)
+  ] ++ lib.optionals (lib.versionAtLeast version "24") [
+    (lib.cmakeBool "WITH_OPENCL_BACKEND" openclSupport)
+  ];
+
+  postFixup = ''
+    wrapProgram $out/bin/syclcc-clang \
+      --prefix PATH : ${lib.makeBinPath [ python3 lld_19 ]} \
+  '' + lib.optionalString rocmSupport ''
+    --add-flags "--rocm-device-lib-path=${rocmPackages.rocm-device-libs}/amdgcn/bitcode"
+  '';
+
+  meta = with lib; {
+    homepage = "https://github.com/AdaptiveCpp/AdaptiveCpp";
+    description = "Multi-backend implementation of SYCL for CPUs and GPUs";
+    maintainers = with maintainers; [ yboettcher ];
+    license = licenses.bsd2;
+  };
+}
\ No newline at end of file
diff --git a/parallel-crypto3.nix b/parallel-crypto3.nix
index 0e26f4ae6e..96d9de82a4 100644
--- a/parallel-crypto3.nix
+++ b/parallel-crypto3.nix
@@ -1,4 +1,5 @@
 { lib,
+  pkgs,
   stdenv,
   ninja,
   pkg-config,
@@ -8,7 +9,23 @@
   lldb,
   mold,
   cmake_modules,
+  libgcc,
+  glibc,
+  libffi,
+  libz,
+  libxml2,
+  icu70,
+  ncurses,
+  gcc,
+  xz,
+  libedit,
+  llvm,
+  libcxx,
+  libstdcxx5,
+  llvmPackages_19,
+  opensycl,
   enableDebugging,
+  enableGPU ? false,
   enableDebug ? false,
   runTests ? false,
   sanitize? false,
@@ -16,32 +33,49 @@
   }:
 let
   inherit (lib) optional;
+  opensycl = pkgs.callPackage ./opensycl.nix {
+    inherit (pkgs);
+    cudaSupport = enableGPU;
+  };
+
 in stdenv.mkDerivation {
   name = "Parallel Crypto3";
 
   src = lib.sourceByRegex ./. ["^crypto3(/.*)?$" "^parallel-crypto3(/.*)?$" "CMakeLists.txt"];
-  hardeningDisable = [ "fortify" ];
-
-  nativeBuildInputs = [ cmake ninja pkg-config ] ++
+  hardeningDisable = [ "fortify" "zerocallusedregs" ];
+  nativeBuildInputs = [ cmake ninja pkg-config llvmPackages_19.openmp opensycl  ] ++
                        (lib.optional (!stdenv.isDarwin) gdb) ++
                        (lib.optional (stdenv.isDarwin) lldb);
 
   # enableDebugging will keep debug symbols in boost
   propagatedBuildInputs = [ (if enableDebug then (enableDebugging boost) else boost) ];
 
-  buildInputs = [cmake_modules];
+  buildInputs = [
+    cmake_modules
+    opensycl
+  ] ++ (if enableGPU then [
+    pkgs.cudaPackages.cudatoolkit
+    pkgs.cudaPackages.cuda_cudart
+    pkgs.cudaPackages.cuda_nvcc
+    pkgs.linuxPackages.nvidia_x11
+  ] else []);
+
+  makeWrapperArgs = [
+  # Ensure the real NVIDIA libraries are found first
+    "--prefix LD_LIBRARY_PATH : ${pkgs.linuxPackages.nvidia_x11}/lib"
+  ];
 
   cmakeFlags =
     [
       (if runTests then "-DBUILD_PARALLEL_CRYPTO3_TESTS=TRUE" else "")
       (if sanitize then "-DSANITIZE=ON" else "-DSANITIZE=OFF")
       (if benchmarkTests then "-DENABLE_BENCHMARKS=ON" else "-DENABLE_BENCHMARKS=OFF")
+      (if enableGPU then "-DGPU_PROVER=ON" else "")
       "-DPARALLEL_CRYPTO3_ENABLE=TRUE"
     ];
 
   cmakeBuildType = if enableDebug then "Debug" else "Release";
   doCheck = runTests; # tests are inside parallel-crypto3-tests derivation
-
   checkPhase = ''
     # JUNIT file without explicit file name is generated after the name of the master test suite inside `CMAKE_CURRENT_SOURCE_DIR`
     export BOOST_TEST_LOGGER=JUNIT:HRF
@@ -52,7 +86,14 @@ in stdenv.mkDerivation {
     find .. -type f -name '*_test.xml' -exec cp {} ${placeholder "out"}/test-logs \;
   '';
 
-  shellHook = ''
+  shellHook =
+  (if enableGPU then ''
+    CXX=syclcc-clang; export CXX
+    ACPP_ADAPTIVITY_LEVEL=2; export ACPP_ADAPTIVITY_LEVEL
+  '' else "") +
+  ''
+    rm -rf build
+    eval $configurePhase
     PS1="\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
     echo "Welcome to Parallel Crypto3 development environment!"
   '';
diff --git a/parallel-crypto3/CMakeLists.txt b/parallel-crypto3/CMakeLists.txt
index 481878b48b..9c4d573993 100644
--- a/parallel-crypto3/CMakeLists.txt
+++ b/parallel-crypto3/CMakeLists.txt
@@ -1,17 +1,20 @@
 cmake_minimum_required(VERSION 3.22 FATAL_ERROR)
 project(parallel-crypto3)
 
-option(BUILD_PARALLEL_CRYPTO3_TESTS "Enable tests" FALSE)
+option(BUILD_PARALLEL_CRYPTO3_TESTS "Enable tests" TRUE)
+set(CMAKE_CUDA_COMPILER nvc++)
 
 find_package(CM REQUIRED)
 include(CMConfig)
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
 # The file compile_commands.json is generated in build directory, so LSP could
 # pick it up and guess all include paths, defines and other stuff.
 # If Nix is used, LSP could not guess the locations of implicit include
 # directories, so we need to include them explicitly.
 if(CMAKE_EXPORT_COMPILE_COMMANDS)
-  set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES 
+  set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES
       ${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES})
 endif()
 
@@ -41,7 +44,8 @@ target_link_libraries(${PROJECT_NAME}_all INTERFACE
     actor::containers
     actor::math
     actor::zk
-    actor::core)
+    actor::core
+    )
 
 # Configure package file to be able to import headers
 include(CMakePackageConfigHelpers)
diff --git a/parallel-crypto3/benchmarks/CMakeLists.txt b/parallel-crypto3/benchmarks/CMakeLists.txt
index 6d7ed4fa39..9dcf40980f 100644
--- a/parallel-crypto3/benchmarks/CMakeLists.txt
+++ b/parallel-crypto3/benchmarks/CMakeLists.txt
@@ -34,7 +34,7 @@ macro(define_bench_test name)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 23)
     target_precompile_headers(${test_name} REUSE_FROM crypto3_precompiled_headers)
 endmacro()
 
@@ -46,7 +46,7 @@ cm_test_link_libraries(
     Boost::unit_test_framework
     Boost::timer
 )
-set_target_properties(_cm_internal_tests--parallel-crypto3-benchmarks PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests--parallel-crypto3-benchmarks PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests--parallel-crypto3-benchmarks REUSE_FROM crypto3_precompiled_headers)
 
 set(TESTS_NAMES
diff --git a/parallel-crypto3/libs/parallel-containers/example/CMakeLists.txt b/parallel-crypto3/libs/parallel-containers/example/CMakeLists.txt
index 1c949653b1..e8c4941415 100644
--- a/parallel-crypto3/libs/parallel-containers/example/CMakeLists.txt
+++ b/parallel-crypto3/libs/parallel-containers/example/CMakeLists.txt
@@ -37,7 +37,7 @@ macro(define_containers_example example)
             ${CMAKE_WORKSPACE_NAME}::algebra
             ${CMAKE_WORKSPACE_NAME}::hash)
 
-    set_target_properties(${target_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${target_name} PROPERTIES CXX_STANDARD 23)
 endmacro()
 
 set(EXAMPLES_NAMES
diff --git a/parallel-crypto3/libs/parallel-containers/test/CMakeLists.txt b/parallel-crypto3/libs/parallel-containers/test/CMakeLists.txt
index 259cfda1c4..fd322a2d17 100644
--- a/parallel-crypto3/libs/parallel-containers/test/CMakeLists.txt
+++ b/parallel-crypto3/libs/parallel-containers/test/CMakeLists.txt
@@ -31,7 +31,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
                        Boost::random
                        Boost::unit_test_framework
 )
-set_target_properties(_cm_internal_tests-actor-containers-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-actor-containers-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-actor-containers-test REUSE_FROM crypto3_precompiled_headers)
 
 macro(define_storage_test test)
@@ -62,7 +62,7 @@ macro(define_storage_test test)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${target_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${target_name} PROPERTIES CXX_STANDARD 23)
 
     get_target_property(target_type Boost::unit_test_framework TYPE)
     if(target_type STREQUAL "SHARED_LIB")
diff --git a/parallel-crypto3/libs/parallel-math/CMakeLists.txt b/parallel-crypto3/libs/parallel-math/CMakeLists.txt
index ed1633ca8d..12be7d1a77 100644
--- a/parallel-crypto3/libs/parallel-math/CMakeLists.txt
+++ b/parallel-crypto3/libs/parallel-math/CMakeLists.txt
@@ -38,7 +38,7 @@ target_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME} INTERFACE
 
                       crypto3::algebra
                       crypto3::multiprecision
-
+                      crypto3::random
                       Boost::random
                   )
 
diff --git a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/algorithms/make_evaluation_domain.hpp b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/algorithms/make_evaluation_domain.hpp
index c016ab4105..a7f068f97f 100644
--- a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/algorithms/make_evaluation_domain.hpp
+++ b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/algorithms/make_evaluation_domain.hpp
@@ -120,7 +120,7 @@ namespace nil {
                     return result;
                 }
 
-                if (detail::is_extended_radix2_domain<FieldType>(m)) {
+                /*if (detail::is_extended_radix2_domain<FieldType>(m)) {
                     result_type result;
                     result.reset(new extended_radix2_domain<FieldType, ValueType>(m));
                     return result;
@@ -130,7 +130,7 @@ namespace nil {
                     result_type result;
                     result.reset(new step_radix2_domain<FieldType, ValueType>(m));
                     return result;
-                }
+                }*/
 
                 if (detail::is_basic_radix2_domain<FieldType>(big + rounded_small)) {
                     result_type result;
@@ -138,7 +138,7 @@ namespace nil {
                     return result;
                 }
 
-                if (detail::is_extended_radix2_domain<FieldType>(big + rounded_small)) {
+                /*if (detail::is_extended_radix2_domain<FieldType>(big + rounded_small)) {
                     result_type result;
                     result.reset(new extended_radix2_domain<FieldType, ValueType>(big + rounded_small));
                     return result;
@@ -160,7 +160,7 @@ namespace nil {
                     result_type result;
                     result.reset(new arithmetic_sequence_domain<FieldType, ValueType>(m));
                     return result;
-                }
+                }*/
 
                 return result_type();
             }
diff --git a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/basic_radix2_domain.hpp b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/basic_radix2_domain.hpp
index 0d5843d84b..36a8245e81 100644
--- a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/basic_radix2_domain.hpp
+++ b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/basic_radix2_domain.hpp
@@ -54,7 +54,6 @@ namespace nil {
                 typedef typename FieldType::value_type field_value_type;
                 typedef ValueType value_type;
                 typedef std::pair<std::vector<field_value_type>, std::vector<field_value_type>> cache_type;
-                std::shared_ptr<cache_type> fft_cache;
 
                 void create_fft_cache() {
                     fft_cache = std::make_shared<cache_type>(std::vector<field_value_type>(),
@@ -67,6 +66,7 @@ namespace nil {
                 typedef FieldType field_type;
 
                 field_value_type omega;
+                std::shared_ptr<cache_type> fft_cache;
 
                 basic_radix2_domain(const std::size_t m)
                         : evaluation_domain<FieldType, ValueType>(m),
@@ -85,6 +85,10 @@ namespace nil {
                     create_fft_cache();
                 }
 
+                std::shared_ptr<cache_type> get_fft_cache() override {
+                    return fft_cache;
+                }
+
                 void fft(std::vector<value_type> &a) override {
                     if (a.size() != this->m) {
                         if (a.size() < this->m) {
diff --git a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/evaluation_domain.hpp b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/evaluation_domain.hpp
index ec1e676b7e..953671e8d1 100644
--- a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/evaluation_domain.hpp
+++ b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/domains/evaluation_domain.hpp
@@ -32,6 +32,8 @@
 
 #include <vector>
 
+#include <sycl/sycl.hpp>
+
 #include <nil/crypto3/math/polynomial/polynomial.hpp>
 
 namespace nil {
@@ -49,7 +51,7 @@ namespace nil {
 
             public:
                 typedef FieldType field_type;
-
+                typedef std::pair<std::vector<field_value_type>, std::vector<field_value_type>> fft_cache_type;
                 std::size_t m;
                 std::size_t log2_size;
 
@@ -69,6 +71,8 @@ namespace nil {
                  */
                 virtual ~evaluation_domain() {};
 
+                virtual std::shared_ptr<fft_cache_type> get_fft_cache() = 0;
+
                 /**
                  * Get the unity root.
                  */
diff --git a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp
index 6da143b12a..71154cf4db 100644
--- a/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp
+++ b/parallel-crypto3/libs/parallel-math/include/nil/crypto3/math/polynomial/polynomial_dfs.hpp
@@ -230,6 +230,10 @@ namespace nil {
                     return _d;
                 }
 
+                void set_degree(size_type d) {
+                    _d = d;
+                }
+
                 size_type max_degree() const BOOST_NOEXCEPT {
                     return this->size();
                 }
@@ -862,10 +866,254 @@ namespace nil {
 
                 return dfs_result;
             }
+#ifdef GPU_PROVER
+            template<typename FieldType>
+            sycl::event gpu_fft(
+                typename FieldType::value_type* a,
+                std::size_t n,
+                typename FieldType::value_type* omega_cache,
+
+                sycl::queue& queue,
+                std::vector<sycl::event> a_events,
+                sycl::event cache_event
+            ) {
+                using value_type = typename FieldType::value_type;
+                const std::size_t logn = log2(n);
+
+                // swapping in place (from Storer's book)
+                // We can parallelize this look, since k and rk are pairs, they will never intersect.
+                a_events.push_back(cache_event);
+                auto swap_event = queue.submit([a_events, a, n, logn](sycl::handler &cgh) {
+                    cgh.depends_on(a_events);
+                    cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> idx) {
+                        const std::size_t r_idx = crypto3::math::detail::bitreverse(idx, logn);
+                        if (idx < r_idx) {
+                            std::swap(a[idx], a[r_idx]);
+                        }
+                    });
+                });
+
+                // invariant: m = 2^{s-1}
+                sycl::event last_fft_event = swap_event;
+                for (std::size_t s = 1, m = 1, inc = n / 2; s <= logn; ++s, m <<= 1, inc >>= 1) {
+                    // w_m is 2^s-th root of unity now
+                    // Here we can parallelize on the both loops with 'k' and 'm', because for each value of k and m
+                    // the ranges of array 'a' used do not intersect. Think of these 2 loops as 1.
+                    const size_t count_k = n / (2 * m) + (n % (2 * m) ? 1 : 0);
+                    last_fft_event = queue.submit([count_k, m, inc, a, omega_cache, last_fft_event](sycl::handler &cgh) {
+                        cgh.depends_on(last_fft_event);
+                        cgh.parallel_for(sycl::range<1>(count_k * m), [=](sycl::id<1> index) {
+                            const std::size_t k = (index / m) * m * 2;
+                            const std::size_t j = index % m;
+                            const std::size_t idx = j * inc;
+                            const value_type t = a[k + j + m] * omega_cache[idx];
+                            a[k + j + m] = a[k + j] - t;
+                            a[k + j] += t;
+                        });
+                    });
+                }
+                return last_fft_event;
+            }
 
+            template<typename FieldType>
+            sycl::event gpu_inverse_fft(
+                typename FieldType::value_type* a,
+                const std::size_t n,
+                typename FieldType::value_type* fft_cache,
+
+                sycl::queue& queue,
+                std::vector<sycl::event> a_events,
+                sycl::event cache_event
+            ) {
+                using value_type = typename FieldType::value_type;
+                auto fft_event = gpu_fft<FieldType>(a, n, fft_cache, queue, a_events, cache_event);
+
+                const value_type sconst = value_type(n).inversed();
+                return queue.submit([sconst, fft_event, a, n](sycl::handler &cgh) {
+                    cgh.depends_on(fft_event);
+                    cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> idx) {
+                        a[idx] *= sconst;
+                    });
+                });
+            }
+
+            template<typename FieldType>
+            sycl::event handle_polynomial_resizing(
+                typename FieldType::value_type* buffer,
+                const std::size_t cur_size,
+                const std::size_t new_size,
+                const std::size_t degree,
+                typename FieldType::value_type* current_domain_buf,
+                typename FieldType::value_type* new_domain_buf,
+
+                sycl::queue& queue,
+                sycl::event buffer_event,
+                sycl::event current_domain_event,
+                sycl::event new_domain_event
+            ) {
+                using value_type = typename FieldType::value_type;
+
+                if (cur_size >= new_size) {
+                    std::cout << "cur_size >= new_size" << std::endl;
+                    return sycl::event();
+                }
+
+                if (degree == 0) {
+                    // add zeros to the end of the buffer
+                    auto fill_event = queue.fill(buffer + cur_size, value_type::zero(), new_size - cur_size);
+                    return fill_event;
+                } else {
+                    auto ifft_event = gpu_inverse_fft<FieldType>(
+                        buffer, cur_size, current_domain_buf,
+                        queue, {buffer_event}, current_domain_event
+                    );
+                    auto fill_event = queue.fill(buffer + cur_size, value_type::zero(), new_size - cur_size);
+                    auto fft_event = gpu_fft<FieldType>(
+                        buffer, new_size, new_domain_buf,
+                        queue, {ifft_event, fill_event}, new_domain_event
+                    );
+                    return fft_event;
+                }
+            }
+
+            template<typename FieldType, typename ContainerType>
+            std::size_t create_domain_buffers(
+                const ContainerType& polynomials,
+                std::unordered_map<std::size_t, std::shared_ptr<evaluation_domain<FieldType>>>& domain_cache,
+
+                sycl::queue& queue,
+                std::unordered_map<std::size_t, typename FieldType::value_type*>& domain_buffers,
+                std::unordered_map<std::size_t, sycl::event>& domain_events,
+                typename FieldType::value_type*& max_domain_buf,
+                sycl::event& max_domain_buf_event
+            ) {
+                using value_type = typename FieldType::value_type;
+
+                std::size_t max_domain_size = 0;
+                std::size_t total_degree = 0;
+                std::set<std::size_t> needed_domain_sizes;
+                for (const auto& polynomial : polynomials) {
+                    max_domain_size = std::max(max_domain_size, polynomial.size());
+                    total_degree += polynomial.degree();
+                    needed_domain_sizes.insert(polynomial.size());
+                }
+                max_domain_size = std::max(max_domain_size, detail::power_of_two(total_degree + 1));
+                needed_domain_sizes.insert(max_domain_size);
+
+                for (const std::size_t domain_size : needed_domain_sizes) {
+                    domain_cache[domain_size] = nullptr;
+                }
+                // We cannot use LOW level thread pool here, make_evaluation_domain uses it.
+                parallel_foreach(needed_domain_sizes.begin(), needed_domain_sizes.end(),
+                    [&domain_cache](std::size_t domain_size) {
+                        domain_cache[domain_size] = make_evaluation_domain<FieldType>(domain_size);
+                    }, ThreadPool::PoolLevel::HIGH);
+
+                for (const std::size_t domain_size : needed_domain_sizes) {
+                    auto domain = domain_cache[domain_size];
+                    domain_buffers[domain_size] = sycl::malloc_device<value_type>(
+                        domain_size, queue
+                    );
+                    domain_events[domain_size] = queue.copy<value_type>(
+                        domain->get_fft_cache()->second.data(), domain_buffers[domain_size], domain_size
+                    );
+                }
+                max_domain_buf = sycl::malloc_device<value_type>(
+                    max_domain_size, queue
+                );
+                max_domain_buf_event = queue.copy<value_type>(
+                    domain_cache[max_domain_size]->get_fft_cache()->first.data(), max_domain_buf, max_domain_size
+                );
+
+                return max_domain_size;
+            }
+
+
+            template<typename FieldType>
+            polynomial_dfs<typename FieldType::value_type> polynomial_product(
+                const std::vector<math::polynomial_dfs<typename FieldType::value_type>> &multipliers
+            ) {
+                using value_type = typename FieldType::value_type;
+                using polynomial_type = polynomial_dfs<value_type>;
+
+                if (multipliers.size() == 0) {
+                    throw std::invalid_argument("polynomial_product multipliers.size() == 0");
+                }
+                if (multipliers.size() == 1) {
+                    return multipliers[0];
+                }
+
+                sycl::queue queue(sycl::gpu_selector{});
+
+                value_type* max_domain_buf = nullptr;
+                sycl::event max_domain_buf_event = sycl::event();
+                std::unordered_map<std::size_t, sycl::event> domain_events;
+                std::unordered_map<std::size_t, typename FieldType::value_type*> domain_buffers;
+                std::unordered_map<std::size_t, std::shared_ptr<evaluation_domain<FieldType>>> domain_cache;
+
+                std::size_t max_domain_size = create_domain_buffers<FieldType>(
+                    multipliers, domain_cache, queue,
+                    domain_buffers, domain_events, max_domain_buf, max_domain_buf_event
+                );
+
+                std::vector<value_type*> multipliers_buf(multipliers.size());
+                std::vector<sycl::event> multipliers_events(multipliers.size());
+
+                for (std::size_t i = 0; i < multipliers.size(); ++i) {
+                    multipliers_buf[i] = sycl::malloc_device<value_type>(max_domain_size, queue);
+                    multipliers_events[i] = queue.copy<value_type>(
+                        multipliers[i].data(), multipliers_buf[i], multipliers[i].size()
+                    );
+                }
+
+                // pre-resize the multipliers
+                std::vector<sycl::event> buffer_events(multipliers.size());
+                for (std::size_t i = 0; i < multipliers.size(); ++i) {
+                    buffer_events[i] = handle_polynomial_resizing<FieldType>(
+                        multipliers_buf[i], multipliers[i].size(), max_domain_size, multipliers[i].degree(),
+                        domain_buffers[multipliers[i].size()], max_domain_buf,
+                        queue, multipliers_events[i], domain_events[multipliers[i].size()], max_domain_buf_event
+                    );
+                }
+                for (std::size_t stride = 1; stride < multipliers.size(); stride <<= 1) {
+                    const std::size_t double_stride = stride << 1;
+                    std::size_t max_i = (multipliers.size() - stride) / double_stride;
+                    if ((multipliers.size() - stride) % double_stride != 0) {
+                        max_i++;
+                    }
+                    for (std::size_t i = 0; i < max_i; ++i) {
+                        const std::size_t index1 = i * double_stride;
+                        const std::size_t index2 = index1 + stride;
+                        std::vector<sycl::event> b_events = {buffer_events[index1], buffer_events[index2]};
+                        value_type* first_buf = multipliers_buf[index1];
+                        value_type* second_buf = multipliers_buf[index2];
+                        buffer_events[index1] = queue.submit([b_events, first_buf, second_buf, max_domain_size](sycl::handler &cgh) {
+                            cgh.depends_on(b_events);
+                            cgh.parallel_for(sycl::range<1>(max_domain_size), [=](sycl::id<1> idx) {
+                                first_buf[idx] *= second_buf[idx];
+                            });
+                        });
+                    }
+                }
+                polynomial_type result(max_domain_size - 1, max_domain_size);
+                auto copy_back_event = queue.copy<value_type>(
+                    multipliers_buf[0], result.data(), max_domain_size, buffer_events[0]
+                );
+                copy_back_event.wait();
+                for (std::size_t i = 0; i < multipliers.size(); ++i) {
+                    sycl::free(multipliers_buf[i], queue);
+                }
+                for (auto& domain_buffer : domain_buffers) {
+                    sycl::free(domain_buffer.second, queue);
+                }
+                sycl::free(max_domain_buf, queue);
+                return result;
+            }
+#else
             template<typename FieldType>
             static inline polynomial_dfs<typename FieldType::value_type> polynomial_product(
-                    std::vector<math::polynomial_dfs<typename FieldType::value_type>> multipliers) {
+                std::vector<math::polynomial_dfs<typename FieldType::value_type>> &&multipliers
+            ) {
                 // Pre-create all the domains. We could do this on-the-go, but we want this function to be more
                 // parallelization-friendly. This single-threaded version may look a bit complicated,
                 // but it's now very similar to what we have in parallel code.
@@ -921,14 +1169,11 @@ namespace nil {
                                 domain_cache[current_domain_size],
                                 domain_cache[next_domain_size],
                                 domain_cache[new_domain_size]);
-
-                            // Free the memory we are not going to use anymore.
-                            multipliers[index2] = polynomial_dfs<typename FieldType::value_type>();
                     }, ThreadPool::PoolLevel::HIGH);
                 }
-                return multipliers[0];
+                return std::move(multipliers[0]);
             }
-
+#endif
         }    // namespace math
     }        // namespace crypto3
 }    // namespace nil
@@ -950,4 +1195,4 @@ struct std::hash<nil::crypto3::math::polynomial_dfs<FieldValueType, Allocator>>
     }
 };
 
-#endif    // CRYPTO3_MATH_POLYNOMIAL_POLYNOM_DFT_HPP
+#endif    // CRYPTO3_MATH_POLYNOMIAL_POLYNOM_DFT_HPP
\ No newline at end of file
diff --git a/parallel-crypto3/libs/parallel-math/test/CMakeLists.txt b/parallel-crypto3/libs/parallel-math/test/CMakeLists.txt
index 8339de6ebf..d463a7ee8e 100644
--- a/parallel-crypto3/libs/parallel-math/test/CMakeLists.txt
+++ b/parallel-crypto3/libs/parallel-math/test/CMakeLists.txt
@@ -34,7 +34,7 @@ macro(define_math_test name)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 23)
 
 endmacro()
 
diff --git a/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp b/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp
index 719db76f0a..bbde04f212 100644
--- a/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp
+++ b/parallel-crypto3/libs/parallel-math/test/polynomial_dfs.cpp
@@ -29,6 +29,8 @@
 #include <vector>
 #include <cstdint>
 
+#include <sycl/sycl.hpp>
+
 #include <boost/test/unit_test.hpp>
 #include <boost/test/data/test_case.hpp>
 #include <boost/test/data/monomorphic.hpp>
@@ -40,7 +42,7 @@
 #include <nil/crypto3/math/polynomial/polynomial.hpp>
 #include <nil/crypto3/math/polynomial/polynomial_dfs.hpp>
 #include <nil/crypto3/math/polynomial/shift.hpp>
-#include <nil/actor/core/thread_pool.hpp>
+#include <nil/crypto3/random/algebraic_engine.hpp>
 
 using namespace nil::crypto3::algebra;
 using namespace nil::crypto3::math;
@@ -49,7 +51,7 @@ typedef fields::bls12_fr<381> FieldType;
 
 BOOST_AUTO_TEST_SUITE(polynomial_dfs_from_coefficients_test_suite)
 
-BOOST_AUTO_TEST_CASE(polynomial_dfs_equal_test){
+BOOST_AUTO_TEST_CASE(polynomial_dfs_equal_test) {
     polynomial_dfs<typename FieldType::value_type> a = {
         7,
         {0x35_big_uint255, 0x26D37C08AED60085FDE335498E7DFEE2AFB1463D06E338219CD0E5DDAF27D68F_big_uint255,
@@ -1332,7 +1334,7 @@ BOOST_AUTO_TEST_CASE(polynomial_dfs_zero_one_test) {
     BOOST_CHECK((small_poly - one * small_poly).is_zero());
 }
 
-BOOST_AUTO_TEST_CASE(polynomial_dfs_2_levels_test) {
+BOOST_AUTO_TEST_CASE(polynomial_dfs_2_levels_test, *boost::unit_test::disabled()) {
     size_t size = 131072;
 
     polynomial_dfs<typename FieldType::value_type> poly = {
@@ -1397,19 +1399,33 @@ BOOST_AUTO_TEST_CASE(polynomial_dfs_multiplication_perf_test, *boost::unit_test:
     std::cout << "Multiplication time: " << duration.count() << " microseconds." << std::endl;
 }
 
-BOOST_AUTO_TEST_CASE(polynomial_dfs_resize_perf_test, *boost::unit_test::disabled()) {
-    std::vector<typename FieldType::value_type> values;
-    std::size_t size = 131072 * 16;
-    for (std::size_t i = 0; i < size; i++) {
-        values.push_back(nil::crypto3::algebra::random_element<FieldType>());
+template <typename Field>
+polynomial_dfs<typename Field::value_type>
+        generate_random_polynomial(
+            std::size_t size, nil::crypto3::random::algebraic_engine<Field>& engine) {
+    using value_type = typename Field::value_type;
+    std::vector<value_type> random_field_values(size);
+    for (std::size_t i = 0; i < size; ++i) {
+        random_field_values[i] = engine();
     }
+    return polynomial_dfs<value_type>(size - 1, std::move(random_field_values));
+}
 
-    polynomial_dfs<typename FieldType::value_type> poly = {
-        size - 1, values};
+BOOST_AUTO_TEST_CASE(polynomial_dfs_resize_perf_test, *boost::unit_test::disabled()) {
+    using field_type = nil::crypto3::algebra::fields::bls12_fr<381>;
+    using value_type = typename FieldType::value_type;
+    using polynomial_dfs_type = polynomial_dfs<value_type>;
+    nil::crypto3::random::algebraic_engine<field_type> alg_rnd_engine;
+    std::size_t size = 131072 * 16;
 
+    polynomial_dfs_type poly =
+        generate_random_polynomial<field_type>(
+            size,
+            alg_rnd_engine
+        );
     auto start = std::chrono::high_resolution_clock::now();
     for (std::size_t i = 0; i < 10; ++i) {
-        auto poly2 = poly;
+        polynomial_dfs_type poly2 = poly;
         poly2.resize(8 * size);
         BOOST_CHECK(poly2.size() == 8 * size);
     }
@@ -1446,4 +1462,28 @@ BOOST_AUTO_TEST_CASE(polynomial_dfs_equality_check_perf_test, *boost::unit_test:
     std::cout << "Equality check time: " << duration.count() << " microseconds." << std::endl;
 }
 
+BOOST_AUTO_TEST_CASE(polynomial_product_test) {
+    using field_type = nil::crypto3::algebra::fields::bls12_fr<381>;
+    using value_type = typename field_type::value_type;
+    using polynomial_dfs_type = polynomial_dfs<value_type>;
+
+    nil::crypto3::random::algebraic_engine<FieldType> alg_rnd_engine;
+    std::vector<polynomial_dfs_type> random_polynomials;
+    random_polynomials.reserve(8);
+    std::vector<std::size_t> sizes = {23, 17, 17, 16, 14, 15, 13, 15, 21, 16, 22};
+    for (auto size : sizes) {
+        random_polynomials.emplace_back(
+            generate_random_polynomial<field_type>(
+                1u << size,
+                alg_rnd_engine
+            )
+        );
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    polynomial_product<field_type>(std::move(random_polynomials));
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    std::cout << "Polynomial product time: " << duration.count() << " microseconds." << std::endl;
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/parallel-crypto3/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp b/parallel-crypto3/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp
index 566a76c976..7ccfa5f24a 100644
--- a/parallel-crypto3/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp
+++ b/parallel-crypto3/libs/parallel-zk/include/nil/crypto3/zk/snark/systems/plonk/placeholder/gates_argument.hpp
@@ -73,15 +73,16 @@ namespace nil {
                 struct placeholder_gates_argument<FieldType, ParamsType, 1> {
 
                     typedef typename ParamsType::transcript_hash_type transcript_hash_type;
+                    using value_type = typename FieldType::value_type;
                     using transcript_type = transcript::fiat_shamir_heuristic_sequential<transcript_hash_type>;
-                    using polynomial_dfs_type = math::polynomial_dfs<typename FieldType::value_type>;
-                    using variable_type = plonk_variable<typename FieldType::value_type>;
+                    using polynomial_dfs_type = math::polynomial_dfs<value_type>;
+                    using variable_type = plonk_variable<value_type>;
                     using polynomial_dfs_variable_type = plonk_variable<polynomial_dfs_type>;
 
                     typedef detail::placeholder_policy<FieldType, ParamsType> policy_type;
 
                     constexpr static const std::size_t argument_size = 1;
-
+#ifndef GPU_PROVER
                     static inline void build_variable_value_map(
                         const math::expression<variable_type>& expr,
                         const plonk_polynomial_dfs_table<FieldType>& assignments,
@@ -239,7 +240,188 @@ namespace nil {
                         };
                         return F;
                     }
+#else
+                    static inline std::unordered_map<variable_type, sycl::event> build_variable_value_map(
+                        const math::expression<variable_type>& expr,
+                        const plonk_polynomial_dfs_table<FieldType>& assignments,
+                        std::shared_ptr<math::evaluation_domain<FieldType>> domain,
+                        std::size_t extended_domain_size,
+                        std::unordered_map<variable_type, value_type*>& variable_values_out,
+                        const polynomial_dfs_type &mask_polynomial,
+                        const polynomial_dfs_type &lagrange_0,
+
+                        sycl::queue& queue
+                    ) {
+
+                        std::unordered_map<variable_type, sycl::event> variable_events_set;
+
+                        math::expression_for_each_variable_visitor<variable_type> visitor(
+                            [&variable_set, &variable_values_out](const variable_type& var) {
+                                // Create the structure of the map so we can change the values later.
+                                if (variable_events_set.find(var) == variable_events_set.end()) {
+                                    variable_events_set[var] = sycl::event();
+                                    variable_values_out[var] = sycl::malloc_device<value_type>(extended_domain_size, queue);
+                                }
+                        });
+
+                        visitor.visit(expr);
+
+                        std::shared_ptr<math::evaluation_domain<FieldType>> extended_domain =
+                            math::make_evaluation_domain<FieldType>(extended_domain_size);
+
+                        // TODO: move mask_polynomial/lagrange_0 conversions up the callstack
+                        value_type* mask_polynomial_buf = sycl::malloc_device<value_type>(extended_domain_size, queue);
+                        value_type* lagrange_0_buf = sycl::malloc_device<value_type>(extended_domain_size, queue);
+
+                        auto mask_polynomial_event = queue.copy<value_type>(
+                            mask_polynomial.data(), mask_polynomial_buf,
+                            mask_polynomial.size()
+                        );
+                        auto lagrange_0_event = queue.copy<value_type>(
+                            lagrange_0.data(), lagrange_0_buf,
+                            lagrange_0.size()
+                        );
+                        value_type* mask_lagrange_diff_buf = sycl::malloc_device<value_type>(extended_domain_size, queue);
+                        sycl::event mask_lagrange_diff_event = queue.submit([&](sycl::handler& cgh) {
+                            cgh.depends_on({mask_polynomial_event, lagrange_0_event});
+                            cgh.parallel_for(sycl::range<1>(extended_domain_size), [=](sycl::id<1> idx) {
+                                mask_lagrange_diff_buf[idx] = mask_polynomial_buf[idx] - lagrange_0_buf[idx];
+                            });
+                        });
+
+                        for (const auto& var : variable_set) {
+                            // Convert the variable to polynomial_dfs variable type.
+                            polynomial_dfs_variable_type var_dfs(var.index, var.rotation, var.relative,
+                                static_cast<typename polynomial_dfs_variable_type::column_type>(
+                                    static_cast<std::uint8_t>(var.type)));
+
+                            value_type* assignment = nullptr;
+                            if( var.index == PLONK_SPECIAL_SELECTOR_ALL_USABLE_ROWS_SELECTED && var.type == variable_type::column_type::selector){
+                                assignment = mask_polynomial_buf;
+                                variable_events_set[var] = mask_polynomial_event;
+                            } else if( var.index == PLONK_SPECIAL_SELECTOR_ALL_NON_FIRST_USABLE_ROWS_SELECTED && var.type == variable_type::column_type::selector) {
+                                assignment = mask_lagrange_diff_buf;
+                                variable_events_set[var] = mask_lagrange_diff_event;
+                            } else {
+                                assignment = variable_values_out[var];
+                                variable_events_set[var] = handle_polynomial_resizing<FieldType>(
+                                    assignment, extended_domain_size, max_domain_size,
+                                    assignments.get_variable_value(var_dfs, domain).degree(),
+                                    queue,
+                                );
+                            }
+
+                        }
+
+                        sycl::free(mask_polynomial_buf, queue);
+                        sycl::free(lagrange_0_buf, queue);
+                        sycl::free(mask_lagrange_diff_buf, queue);
+                        return variable_set;
+                    }
+
+                    static inline std::array<polynomial_dfs_type, argument_size> prove_eval(
+                        const typename policy_type::constraint_system_type &constraint_system,
+                        const plonk_polynomial_dfs_table<FieldType> &column_polynomials,
+                        std::shared_ptr<math::evaluation_domain<FieldType>> original_domain,
+                        std::uint32_t max_gates_degree,
+                        const polynomial_dfs_type &mask_polynomial,
+                        const polynomial_dfs_type &lagrange_0,
+                        transcript_type& transcript
+                    ) {
+                        PROFILE_SCOPE("gate_argument_time");
 
+                        // max_gates_degree that comes from the outside does not take into account multiplication
+                        // by selector.
+                        ++max_gates_degree;
+                        typename FieldType::value_type theta = transcript.template challenge<FieldType>();
+
+                        auto value_type_to_polynomial_dfs = [](
+                            const typename variable_type::assignment_type& coeff) {
+                                return polynomial_dfs_type(0, 1, coeff);
+                            };
+
+                        std::vector<std::uint32_t> extended_domain_sizes;
+                        std::vector<std::uint32_t> degree_limits;
+                        std::uint32_t max_degree = std::pow(2, ceil(std::log2(max_gates_degree)));
+                        std::uint32_t max_domain_size = original_domain->m * max_degree;
+
+                        degree_limits.push_back(max_degree);
+                        extended_domain_sizes.push_back(max_domain_size);
+                        degree_limits.push_back(max_degree / 2);
+                        extended_domain_sizes.push_back(max_domain_size / 2);
+
+                        std::vector<math::expression<variable_type>> expressions(extended_domain_sizes.size());
+                        auto theta_acc = FieldType::value_type::one();
+
+                        // Every constraint has variable type 'variable_type', but we want it to use
+                        // 'polynomial_dfs_variable_type' instead. The only difference is the coefficient type
+                        // inside a term. We want the coefficients to be dfs polynomials here.
+                        math::expression_variable_type_converter<variable_type, polynomial_dfs_variable_type> converter(
+                            value_type_to_polynomial_dfs);
+
+                        math::expression_max_degree_visitor<variable_type> visitor;
+
+                        const auto& gates = constraint_system.gates();
+
+                        for (const auto& gate: gates) {
+                            std::vector<math::expression<variable_type>> gate_results(extended_domain_sizes.size());
+                            for (std::size_t constraint_idx = 0; constraint_idx < gate.constraints.size(); ++constraint_idx) {
+                                const auto& constraint = gate.constraints[constraint_idx];
+                                auto next_term = constraint * theta_acc;
+
+                                theta_acc *= theta;
+                                // +1 stands for the selector multiplication.
+                                size_t constraint_degree = visitor.compute_max_degree(constraint) + 1;
+                                for (int i = extended_domain_sizes.size() - 1; i >= 0; --i) {
+                                    // Whatever the degree of term is, add it to the maximal degree expression.
+                                    if (degree_limits[i] >= constraint_degree || i == 0) {
+                                        gate_results[i] += next_term;
+                                        break;
+                                    }
+                                }
+                            }
+                            variable_type selector(gate.selector_index, 0, false, variable_type::column_type::selector);
+                            for (size_t i = 0; i < extended_domain_sizes.size(); ++i) {
+                                gate_results[i] *= selector;
+                                expressions[i] += gate_results[i];
+                            }
+                        }
+
+                        std::array<polynomial_dfs_type, argument_size> F;
+
+                        F[0] = polynomial_dfs_type::zero();
+                        for (std::size_t i = 0; i < extended_domain_sizes.size(); ++i) {
+                            std::unordered_map<variable_type, polynomial_dfs_type> variable_values;
+
+                            build_variable_value_map(
+                                expressions[i], column_polynomials, original_domain,
+                                extended_domain_sizes[i], variable_values,
+                                mask_polynomial, lagrange_0
+                            );
+
+                            polynomial_dfs_type result(extended_domain_sizes[i] - 1, extended_domain_sizes[i]);
+                            wait_for_all(parallel_run_in_chunks<void>(
+                                extended_domain_sizes[i],
+                                [&variable_values, &extended_domain_sizes, &result, &expressions, i]
+                                (std::size_t begin, std::size_t end) {
+                                    for (std::size_t j = begin; j < end; ++j) {
+                                        // Don't use cache here. In practice it's slower to maintain the cache
+                                        // than to re-compute the subexpression value when value type is field element.
+                                        math::expression_evaluator<variable_type> evaluator(
+                                            expressions[i],
+                                            [&assignments=variable_values, j]
+                                                (const variable_type &var) -> const typename FieldType::value_type& {
+                                                    return assignments[var][j];
+                                            });
+                                        result[j] = evaluator.evaluate();
+                                    }
+                            }, ThreadPool::PoolLevel::HIGH));
+
+                            F[0] += result;
+                        };
+                        return F;
+                    }
+#endif
                     static inline std::array<typename FieldType::value_type, argument_size>
                         verify_eval(const std::vector<plonk_gate<FieldType, plonk_constraint<FieldType>>> &gates,
                                     typename policy_type::evaluation_map &evaluations,
diff --git a/parallel-crypto3/libs/parallel-zk/test/CMakeLists.txt b/parallel-crypto3/libs/parallel-zk/test/CMakeLists.txt
index e77dc23406..9cc408c128 100644
--- a/parallel-crypto3/libs/parallel-zk/test/CMakeLists.txt
+++ b/parallel-crypto3/libs/parallel-zk/test/CMakeLists.txt
@@ -19,7 +19,7 @@ cm_test_link_libraries(${CMAKE_WORKSPACE_NAME}_${CURRENT_PROJECT_NAME}
     Boost::unit_test_framework
     Boost::log
     )
-set_target_properties(_cm_internal_tests-actor-zk-test PROPERTIES CXX_STANDARD 20)
+set_target_properties(_cm_internal_tests-actor-zk-test PROPERTIES CXX_STANDARD 23)
 target_precompile_headers(_cm_internal_tests-actor-zk-test REUSE_FROM crypto3_precompiled_headers)
 
 if(PROFILING_ENABLED)
@@ -39,7 +39,7 @@ macro(define_zk_test test)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${full_test_name} PROPERTIES CXX_STANDARD 23)
 
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         target_compile_options(${full_test_name} PRIVATE "-fconstexpr-steps=2147483647")
diff --git a/parallel-crypto3/libs/parallelization-utils/test/CMakeLists.txt b/parallel-crypto3/libs/parallelization-utils/test/CMakeLists.txt
index 4656ac9944..dffafd0e65 100644
--- a/parallel-crypto3/libs/parallelization-utils/test/CMakeLists.txt
+++ b/parallel-crypto3/libs/parallelization-utils/test/CMakeLists.txt
@@ -38,7 +38,7 @@ macro(define_actor_core_test name)
 
                                ${Boost_INCLUDE_DIRS})
 
-    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 20)
+    set_target_properties(${test_name} PROPERTIES CXX_STANDARD 23)
 
     get_target_property(target_type Boost::unit_test_framework TYPE)
     if(target_type STREQUAL "SHARED_LIB")
diff --git a/proof-producer.nix b/proof-producer.nix
index ec8f0c696d..8b894abb6a 100644
--- a/proof-producer.nix
+++ b/proof-producer.nix
@@ -1,4 +1,6 @@
 { lib,
+  pkgs,
+  llvmPackages_19,
   stdenv,
   ninja,
   pkg-config,
@@ -16,13 +18,18 @@
   staticBuild ? true,
   runTests ? false,
   sanitize? false,
-  crypto3_tests? false,
-  parallel_crypto3_tets? false,
-  crypto3_bechmarks? false,
-  parallel_crypto3_bechmarks? false,
+  crypto3_tests ? false,
+  parallel_crypto3_tests ? false,
+  crypto3_bechmarks ? false,
+  parallel_crypto3_bechmarks ? false,
+  enableGPU ? false,
   }:
 let
   inherit (lib) optional;
+  opensycl = pkgs.callPackage ./opensycl.nix {
+    inherit (pkgs);
+    cudaSupport = enableGPU;
+  };
 in stdenv.mkDerivation {
   name = "Proof-producer";
   pname = "proof-producer";
@@ -30,15 +37,22 @@ in stdenv.mkDerivation {
   src = lib.sourceByRegex ./. ["^proof-producer(/.*)?$" "^crypto3(/.*)?$" "^parallel-crypto3(/.*)?$" "CMakeLists.txt"];
   hardeningDisable = [ "fortify" ];
 
-  nativeBuildInputs = [ cmake ninja pkg-config ] ++
+  nativeBuildInputs = [ cmake ninja pkg-config opensycl llvmPackages_19.openmp ] ++
                        (lib.optional (!stdenv.isDarwin) gdb) ++
                        (lib.optional (stdenv.isDarwin) lldb);
 
   # enableDebugging will keep debug symbols in boost
   propagatedBuildInputs = [ (if enableDebug then (enableDebugging boost) else boost) ];
 
-  buildInputs = [cmake_modules gtest protobuf] ++
-                  ( lib.optional (staticBuild) glibc.static );
+  buildInputs =
+  [cmake_modules gtest protobuf]
+  ++ ( lib.optional (staticBuild) glibc.static )
+  ++ (if enableGPU then [
+    pkgs.cudaPackages.cudatoolkit
+    pkgs.cudaPackages.cuda_cudart
+    pkgs.cudaPackages.cuda_nvcc
+    pkgs.linuxPackages.nvidia_x11
+  ] else []);
 
   cmakeFlags =
     [
@@ -48,11 +62,12 @@ in stdenv.mkDerivation {
       (if sanitize then "-DSANITIZE=ON" else "-DSANITIZE=OFF")
       "-DPROOF_PRODUCER_ENABLE=TRUE"
       (if crypto3_tests then "-DBUILD_CRYPTO3_TESTS=TRUE" else "-DBUILD_CRYPTO3_TESTS=False")
-      (if parallel_crypto3_tets then "-DBUILD_PARALLEL_CRYPTO3_TESTS=TRUE" else "")
+      (if parallel_crypto3_tests then "-DBUILD_PARALLEL_CRYPTO3_TESTS=TRUE" else "")
       (if parallel_crypto3_bechmarks then "-DENABLE_BENCHMARKS=ON" else "-DENABLE_BENCHMARKS=OFF")
       (if crypto3_bechmarks then "-DBUILD_CRYPTO3_BENCH_TESTS=ON" else "-DBUILD_CRYPTO3_BENCH_TESTS=OFF")
       (if staticBuild then "-DPROOF_PRODUCER_STATIC_BINARIES=ON" else "-DPROOF_PRODUCER_STATIC_BINARIES=OFF")
       "-G Ninja"
+      (if enableGPU then "-DGPU_PROVER=ON" else "-DGPU_PROVER=OFF")
     ];
 
   cmakeBuildType = if enableDebug then "Debug" else "Release";
@@ -69,7 +84,14 @@ in stdenv.mkDerivation {
     find .. -type f -name '*_benchmark.xml' -exec cp {} ${placeholder "out"}/test-logs \;
   '';
 
-  shellHook = ''
+  shellHook =
+  (if enableGPU then ''
+    CXX=syclcc-clang; export CXX
+    ACPP_ADAPTIVITY_LEVEL=2; export ACPP_ADAPTIVITY_LEVEL
+  '' else "") +
+  ''
+    rm -rf build
+    eval $configurePhase
     PS1="\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
     echo "Welcome to Proof-producer development environment!"
   '';
diff --git a/proof-producer/libs/assigner/CMakeLists.txt b/proof-producer/libs/assigner/CMakeLists.txt
index 5d85ab5d19..2888d38830 100644
--- a/proof-producer/libs/assigner/CMakeLists.txt
+++ b/proof-producer/libs/assigner/CMakeLists.txt
@@ -36,7 +36,7 @@ add_library(proof_generatorAssigner
             ${PROTO_SRC}
             ${PROTO_HASH_HEADER}
 )
-set_target_properties(proof_generatorAssigner PROPERTIES CXX_STANDARD 20)
+set_target_properties(proof_generatorAssigner PROPERTIES CXX_STANDARD 23)
 
 target_include_directories(proof_generatorAssigner
                             PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR}