diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt
index 4899b0b..b1384e6 100644
--- a/mops/CMakeLists.txt
+++ b/mops/CMakeLists.txt
@@ -25,6 +25,10 @@ else()
 endif()
 
 option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" OFF)
+
+option(MOPS_OPENMP "Try to use OpenMP when compiling mops" ON)
+option(MOPS_ARCH_NATIVE "Try to use -march=native when compiling mops (tunes the build to this machine)" ON)
+
 set(LIB_INSTALL_DIR "lib" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install libraries")
 set(BIN_INSTALL_DIR "bin" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install DLL/binaries")
 set(INCLUDE_INSTALL_DIR "include" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install headers")
@@ -39,9 +43,9 @@ option(MOPS_TESTS "build and run mops unit tests" ${MOPS_MAIN_PROJECT})
 # Set a default build type if none was specified
 if (${MOPS_MAIN_PROJECT})
     if("${CMAKE_BUILD_TYPE}" STREQUAL "" AND "${CMAKE_CONFIGURATION_TYPES}" STREQUAL "")
-        message(STATUS "Setting build type to 'relwithdebinfo' as none was specified.")
+        message(STATUS "Setting build type to 'release' as none was specified.")
         set(
-            CMAKE_BUILD_TYPE "relwithdebinfo"
+            CMAKE_BUILD_TYPE "release"
             CACHE STRING
             "Choose the type of build, options are: none(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) debug release relwithdebinfo minsizerel."
             FORCE
@@ -109,6 +113,39 @@ target_include_directories(mops PUBLIC
 )
 
 
+# OpenMP: link it when requested and found; in every other case silence
+# unknown-pragma warnings (presumably raised by `#pragma omp` in the sources)
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-Wunknown-pragmas" COMPILER_SUPPORTS_WPRAGMAS)
+set(mops_silence_unknown_pragmas "${COMPILER_SUPPORTS_WPRAGMAS}")
+if (MOPS_OPENMP)
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+        message(STATUS "OpenMP is enabled")
+        target_link_libraries(mops PUBLIC OpenMP::OpenMP_CXX)
+        set(mops_silence_unknown_pragmas OFF)
+    else()
+        message(WARNING "Could not find OpenMP")
+    endif()
+endif()
+if (mops_silence_unknown_pragmas)
+    string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-pragmas")
+endif()
+unset(mops_silence_unknown_pragmas)
+
+if (MOPS_ARCH_NATIVE)
+    check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+    # Clang on Apple platforms passes this check but the build then fails with
+    # `the clang compiler does not support '-march=native'`; skip it there only
+    if(COMPILER_SUPPORTS_MARCH_NATIVE AND NOT (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
+        message(STATUS "march=native is enabled")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+    else()
+        message(STATUS "march=native is not supported by this compiler")
+    endif()
+endif()
+
+
 if (MOPS_TESTS)
     enable_testing()
     add_subdirectory(tests)
diff --git a/mops/src/opsa/cpu.tpp b/mops/src/opsa/cpu.tpp
index c629453..2860256 100644
--- a/mops/src/opsa/cpu.tpp
+++ b/mops/src/opsa/cpu.tpp
@@ -41,7 +41,6 @@ void mops::outer_product_scatter_add(
 
     for (size_t i=0; i<tensor_a.shape[0]; i++) {
         auto i_output = indexes.data[i];
-        assert(i_output < output.shape[0]);
         for (size_t a_j=0; a_j<tensor_a.shape[1]; a_j++) {
             for (size_t b_j=0; b_j<tensor_b.shape[1]; b_j++) {
                 auto output_index = tensor_b.shape[1] * (tensor_a.shape[1] * i_output + a_j) + b_j;
diff --git a/python/mops/benchmarks/benchmark.py b/python/mops/benchmarks/benchmark.py
index fc545f2..0fe07b1 100644
--- a/python/mops/benchmarks/benchmark.py
+++ b/python/mops/benchmarks/benchmark.py
@@ -1,24 +1,49 @@
-import numpy as np
+import math
 import time
+
 import matplotlib.pyplot as plt
+import numpy as np
 
 
-def benchmark(function, repeats=1000, plot=True):
-
+def benchmark(function, repeats=1000, warmup=10, plot=True):
+    """Return (mean, std) of the wall time, in seconds, of calling `function`."""
+    for _ in range(warmup):  # untimed calls made before measuring starts
+        function()
     timings = []
     for _ in range(repeats):
-        start = time.time()
+        start = time.perf_counter()  # monotonic clock meant for short intervals
         function()
-        end = time.time()
-        timings.append(end-start)
+        end = time.perf_counter()
+        timings.append(end - start)
 
     times_array = np.array(timings)
     mean = np.mean(times_array)
     std = np.std(times_array)
-    if std > 0.1 * mean: print("warning: inconsistent timings")
+    if std > 0.1 * mean:
+        print("warning: inconsistent timings")
 
     if plot:
         plt.plot(np.arange(times_array.shape[0]), times_array, ".")
         plt.savefig("benchmark_plot.pdf")
 
     return mean, std
+
+
+def format_mean_std(mean, std_dev, decimals=2):
+    """Format ``mean ± std_dev`` as ``(m±s)eN`` with a shared power-of-ten exponent.
+
+    The exponent is taken from ``mean`` (0 when ``mean`` is zero or not finite)
+    and both values are printed with ``decimals`` digits after the point.
+    """
+    exponent = 0
+    if mean != 0 and math.isfinite(mean):
+        exponent = math.floor(math.log10(abs(mean)))
+        # rounding can carry the mantissa up to 10.0 (e.g. 9.999 -> "10.00")
+        if abs(round(mean / 10**exponent, decimals)) >= 10:
+            exponent += 1
+
+    # scale both values by the shared exponent, then format them identically
+    formatted_mean = f"{mean / 10**exponent:.{decimals}f}"
+    formatted_std_dev = f"{std_dev / 10**exponent:.{decimals}f}"
+
+    return f"({formatted_mean}±{formatted_std_dev})e{exponent}"
diff --git a/python/mops/benchmarks/hpe.py b/python/mops/benchmarks/hpe.py
index b611e4f..a28f3ff 100644
--- a/python/mops/benchmarks/hpe.py
+++ b/python/mops/benchmarks/hpe.py
@@ -1,7 +1,7 @@
 import numpy as np
-from benchmark import benchmark
-
+from benchmark import benchmark, format_mean_std
 from mops.reference_implementations import homogeneous_polynomial_evaluation as ref_hpe
+
 from mops import homogeneous_polynomial_evaluation as hpe
 
 np.random.seed(0xDEADBEEF)
@@ -13,7 +13,7 @@
 ref_mean, ref_std = benchmark(lambda: ref_hpe(A, C, P))
 mean, std = benchmark(lambda: hpe(A, C, P))
 
-print(f"{ref_mean:.2e}", f"{ref_std:.2e}")
-print(f"{mean:.2e}", f"{std:.2e}")
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
 
-print("Speed-up:", ref_mean/mean)
+print("Speed-up:", ref_mean / mean)
diff --git a/python/mops/benchmarks/opsa.py b/python/mops/benchmarks/opsa.py
new file mode 100644
index 0000000..17a8b26
--- /dev/null
+++ b/python/mops/benchmarks/opsa.py
@@ -0,0 +1,20 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import outer_product_scatter_add as ref_opsa
+
+from mops import outer_product_scatter_add as opsa
+
+np.random.seed(0xDEADBEEF)
+
+A = np.random.rand(1000, 20)
+B = np.random.rand(1000, 5)
+
+indices = np.sort(np.random.randint(10, size=(1000,)))
+n_output = np.max(indices) + 1  # computed once so it is not part of the timed call
+
+ref_mean, ref_std = benchmark(lambda: ref_opsa(A, B, indices, n_output))
+mean, std = benchmark(lambda: opsa(A, B, indices, n_output))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+print("Speed-up:", ref_mean / mean)
diff --git a/python/mops/benchmarks/opsax.py b/python/mops/benchmarks/opsax.py
new file mode 100644
index 0000000..56bf1d9
--- /dev/null
+++ b/python/mops/benchmarks/opsax.py
@@ -0,0 +1,26 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import (
+    outer_product_scatter_add_with_weights as ref_opsax,
+)
+
+from mops import outer_product_scatter_add_with_weights as opsax
+
+np.random.seed(0xDEADBEEF)
+
+# n_O is the one size shared by X's first axis, both index ranges and the
+# last argument of the two calls below
+n_O = 20
+A = np.random.rand(100, 10)
+R = np.random.rand(100, 5)
+X = np.random.rand(n_O, 5)
+
+I = np.random.randint(n_O, size=(100,))
+J = np.random.randint(n_O, size=(100,))
+
+ref_mean, ref_std = benchmark(lambda: ref_opsax(A, R, X, I, J, n_O))
+mean, std = benchmark(lambda: opsax(A, R, X, I, J, n_O))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+print("Speed-up:", ref_mean / mean)
diff --git a/python/mops/benchmarks/sap.py b/python/mops/benchmarks/sap.py
new file mode 100644
index 0000000..c54f03d
--- /dev/null
+++ b/python/mops/benchmarks/sap.py
@@ -0,0 +1,24 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import sparse_accumulation_of_products as ref_sap
+
+from mops import sparse_accumulation_of_products as sap
+
+np.random.seed(0xDEADBEEF)  # fixed seed: every run benchmarks the same data
+
+A = np.random.rand(1000, 20)
+B = np.random.rand(1000, 6)
+C = np.random.rand(100)
+
+P_A = np.random.randint(20, size=(100,))  # in [0, 20); A has 20 columns
+P_B = np.random.randint(6, size=(100,))  # in [0, 6); B has 6 columns
+n_O = 50
+P_O = np.random.randint(n_O, size=(100,))  # in [0, n_O)
+
+ref_mean, ref_std = benchmark(lambda: ref_sap(A, B, C, P_A, P_B, P_O, n_O))
+mean, std = benchmark(lambda: sap(A, B, C, P_A, P_B, P_O, n_O))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+
+print("Speed-up:", ref_mean / mean)  # > 1: mops is faster than the reference
diff --git a/python/mops/benchmarks/sasax.py b/python/mops/benchmarks/sasax.py
new file mode 100644
index 0000000..db416ee
--- /dev/null
+++ b/python/mops/benchmarks/sasax.py
@@ -0,0 +1,31 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import (
+    sparse_accumulation_scatter_add_with_weights as ref_sasax,
+)
+
+from mops import sparse_accumulation_scatter_add_with_weights as sasax
+
+np.random.seed(0xDEADBEEF)  # fixed seed: every run benchmarks the same data
+
+A = np.random.rand(100, 20)
+R = np.random.rand(100, 200)
+X = np.random.rand(25, 13, 200)
+C = np.random.rand(50)
+n_O1 = 25  # same value as X's first axis and the bound used for I and J
+I = np.random.randint(25, size=(100,))
+J = np.random.randint(25, size=(100,))
+n_O2 = 15
+M_1 = np.random.randint(20, size=(50,))  # in [0, 20); A has 20 columns
+M_2 = np.random.randint(13, size=(50,))  # in [0, 13); X's second axis is 13
+M_3 = np.random.randint(n_O2, size=(50,))  # in [0, n_O2)
+
+ref_mean, ref_std = benchmark(
+    lambda: ref_sasax(A, R, X, C, I, J, M_1, M_2, M_3, n_O1, n_O2)
+)
+mean, std = benchmark(lambda: sasax(A, R, X, C, I, J, M_1, M_2, M_3, n_O1, n_O2))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+
+print("Speed-up:", ref_mean / mean)  # > 1: mops is faster than the reference
diff --git a/setup.py b/setup.py
index b612afb..3d2151f 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def run(self):
         os.makedirs(build_dir, exist_ok=True)
 
         cmake_options = [
-            "-DCMAKE_BUILD_TYPE=Debug",
+            "-DCMAKE_BUILD_TYPE=release",
             "-DBUILD_SHARED_LIBS=ON",
             f"-DCMAKE_INSTALL_PREFIX={install_dir}",
         ]
diff --git a/tox.ini b/tox.ini
index e4b28b2..a4f4498 100644
--- a/tox.ini
+++ b/tox.ini
@@ -44,6 +44,7 @@ deps =
     black
     blackdoc
     isort
+    clang-format
 commands =
     isort {[testenv]lint_folders}
     black {[testenv]lint_folders}