diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt
index 4899b0b..b1384e6 100644
--- a/mops/CMakeLists.txt
+++ b/mops/CMakeLists.txt
@@ -25,6 +25,10 @@ else()
 endif()
 
 option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" OFF)
+
+option(MOPS_OPENMP "Try to use OpenMP when compiling mops" ON)
+option(MOPS_ARCH_NATIVE "Try to use -march=native when compiling mops" ON)
+
 set(LIB_INSTALL_DIR "lib" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install libraries")
 set(BIN_INSTALL_DIR "bin" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install DLL/binaries")
 set(INCLUDE_INSTALL_DIR "include" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install headers")
@@ -39,9 +43,9 @@ option(MOPS_TESTS "build and run mops unit tests" ${MOPS_MAIN_PROJECT})
 # Set a default build type if none was specified
 if (${MOPS_MAIN_PROJECT})
     if("${CMAKE_BUILD_TYPE}" STREQUAL "" AND "${CMAKE_CONFIGURATION_TYPES}" STREQUAL "")
-        message(STATUS "Setting build type to 'relwithdebinfo' as none was specified.")
+        message(STATUS "Setting build type to 'release' as none was specified.")
         set(
-            CMAKE_BUILD_TYPE "relwithdebinfo"
+            CMAKE_BUILD_TYPE "release"
             CACHE STRING
             "Choose the type of build, options are: none(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) debug release relwithdebinfo minsizerel."
             FORCE
@@ -109,6 +113,39 @@ target_include_directories(mops PUBLIC
 )
 
+# Handle optimization and OpenMP flags
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-Wunknown-pragmas" COMPILER_SUPPORTS_WPRAGMAS)
+if (MOPS_OPENMP)
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+        message(STATUS "OpenMP is enabled")
+        target_link_libraries(mops PUBLIC OpenMP::OpenMP_CXX)
+    else()
+        message(WARNING "Could not find OpenMP")
+        if(COMPILER_SUPPORTS_WPRAGMAS)
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
+        endif()
+    endif()
+else()
+    if(COMPILER_SUPPORTS_WPRAGMAS)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
+    endif()
+endif()
+
+if (MOPS_ARCH_NATIVE)
+    check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+    # for some reason COMPILER_SUPPORTS_MARCH_NATIVE is true with Apple clang,
+    # but then fails with `the clang compiler does not support '-march=native'`
+    if(COMPILER_SUPPORTS_MARCH_NATIVE AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        message(STATUS "march=native is enabled")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+    else()
+        message(STATUS "march=native is not supported by this compiler")
+    endif()
+endif()
+
+
 if (MOPS_TESTS)
     enable_testing()
     add_subdirectory(tests)
diff --git a/mops/src/opsa/cpu.tpp b/mops/src/opsa/cpu.tpp
index c629453..2860256 100644
--- a/mops/src/opsa/cpu.tpp
+++ b/mops/src/opsa/cpu.tpp
@@ -41,7 +41,6 @@ void mops::outer_product_scatter_add(
     for (size_t i=0; i 0.1 * mean: print("warning: inconsistent timings")
+    if std > 0.1 * mean:
+        print("warning: inconsistent timings")
     if plot:
         plt.plot(np.arange(times_array.shape[0]), times_array, ".")
         plt.savefig("benchmark_plot.pdf")
     return mean, std
+
+
+def format_mean_std(mean, std_dev, decimals=2):
+    # find the exponent
+    if mean != 0:
+        exponent = math.floor(math.log10(abs(mean)))
+    else:
+        exponent = 0
+
+    # scale the mean and standard deviation by the exponent
+    scaled_mean = mean / (10**exponent)
+    scaled_std_dev = std_dev / (10**exponent)
+
+    # format the scaled mean and standard deviation
+    format_string = f"{{:.{decimals}f}}"
+    formatted_mean = format_string.format(scaled_mean)
+    formatted_std_dev = format_string.format(scaled_std_dev)
+    final_string = f"({formatted_mean}±{formatted_std_dev})e{exponent}"
+
+    return final_string
diff --git a/python/mops/benchmarks/hpe.py b/python/mops/benchmarks/hpe.py
index b611e4f..a28f3ff 100644
--- a/python/mops/benchmarks/hpe.py
+++ b/python/mops/benchmarks/hpe.py
@@ -1,7 +1,7 @@
 import numpy as np
-from benchmark import benchmark
-
+from benchmark import benchmark, format_mean_std
 from mops.reference_implementations import homogeneous_polynomial_evaluation as ref_hpe
+
 from mops import homogeneous_polynomial_evaluation as hpe
 
 np.random.seed(0xDEADBEEF)
@@ -13,7 +13,7 @@
 ref_mean, ref_std = benchmark(lambda: ref_hpe(A, C, P))
 mean, std = benchmark(lambda: hpe(A, C, P))
 
-print(f"{ref_mean:.2e}", f"{ref_std:.2e}")
-print(f"{mean:.2e}", f"{std:.2e}")
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
 
-print("Speed-up:", ref_mean/mean)
+print("Speed-up:", ref_mean / mean)
diff --git a/python/mops/benchmarks/opsa.py b/python/mops/benchmarks/opsa.py
new file mode 100644
index 0000000..17a8b26
--- /dev/null
+++ b/python/mops/benchmarks/opsa.py
@@ -0,0 +1,20 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import outer_product_scatter_add as ref_opsa
+
+from mops import outer_product_scatter_add as opsa
+
+np.random.seed(0xDEADBEEF)
+
+A = np.random.rand(1000, 20)
+B = np.random.rand(1000, 5)
+
+indices = np.sort(np.random.randint(10, size=(1000,)))
+
+ref_mean, ref_std = benchmark(lambda: ref_opsa(A, B, indices, np.max(indices) + 1))
+mean, std = benchmark(lambda: opsa(A, B, indices, np.max(indices) + 1))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+
+print("Speed-up:", ref_mean / mean)
diff --git a/python/mops/benchmarks/opsax.py b/python/mops/benchmarks/opsax.py
new file mode 100644
index 0000000..56bf1d9
--- /dev/null
+++ b/python/mops/benchmarks/opsax.py
@@ -0,0 +1,26 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import (
+    outer_product_scatter_add_with_weights as ref_opsax,
+)
+
+from mops import outer_product_scatter_add_with_weights as opsax
+
+np.random.seed(0xDEADBEEF)
+
+
+A = np.random.rand(100, 10)
+R = np.random.rand(100, 5)
+n_O = 20
+X = np.random.rand(n_O, 5)
+
+I = np.random.randint(n_O, size=(100,))
+J = np.random.randint(n_O, size=(100,))
+
+ref_mean, ref_std = benchmark(lambda: ref_opsax(A, R, X, I, J, n_O))
+mean, std = benchmark(lambda: opsax(A, R, X, I, J, n_O))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+
+print("Speed-up:", ref_mean / mean)
diff --git a/python/mops/benchmarks/sap.py b/python/mops/benchmarks/sap.py
new file mode 100644
index 0000000..c54f03d
--- /dev/null
+++ b/python/mops/benchmarks/sap.py
@@ -0,0 +1,24 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import sparse_accumulation_of_products as ref_sap
+
+from mops import sparse_accumulation_of_products as sap
+
+np.random.seed(0xDEADBEEF)
+
+A = np.random.rand(1000, 20)
+B = np.random.rand(1000, 6)
+C = np.random.rand(100)
+
+P_A = np.random.randint(20, size=(100,))
+P_B = np.random.randint(6, size=(100,))
+n_O = 50
+P_O = np.random.randint(n_O, size=(100,))
+
+ref_mean, ref_std = benchmark(lambda: ref_sap(A, B, C, P_A, P_B, P_O, n_O))
+mean, std = benchmark(lambda: sap(A, B, C, P_A, P_B, P_O, n_O))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+
+print("Speed-up:", ref_mean / mean)
diff --git a/python/mops/benchmarks/sasax.py b/python/mops/benchmarks/sasax.py
new file mode 100644
index 0000000..db416ee
--- /dev/null
+++ b/python/mops/benchmarks/sasax.py
@@ -0,0 +1,31 @@
+import numpy as np
+from benchmark import benchmark, format_mean_std
+from mops.reference_implementations import (
+    sparse_accumulation_scatter_add_with_weights as ref_sasax,
+)
+
+from mops import sparse_accumulation_scatter_add_with_weights as sasax
+
+np.random.seed(0xDEADBEEF)
+
+A = np.random.rand(100, 20)
+R = np.random.rand(100, 200)
+X = np.random.rand(25, 13, 200)
+C = np.random.rand(50)
+n_O1 = 25
+I = np.random.randint(25, size=(100,))
+J = np.random.randint(25, size=(100,))
+n_O2 = 15
+M_1 = np.random.randint(20, size=(50,))
+M_2 = np.random.randint(13, size=(50,))
+M_3 = np.random.randint(n_O2, size=(50,))
+
+ref_mean, ref_std = benchmark(
+    lambda: ref_sasax(A, R, X, C, I, J, M_1, M_2, M_3, n_O1, n_O2)
+)
+mean, std = benchmark(lambda: sasax(A, R, X, C, I, J, M_1, M_2, M_3, n_O1, n_O2))
+
+print("Reference implementation:", format_mean_std(ref_mean, ref_std))
+print("Optimized implementation:", format_mean_std(mean, std))
+
+print("Speed-up:", ref_mean / mean)
diff --git a/setup.py b/setup.py
index b612afb..3d2151f 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def run(self):
         os.makedirs(build_dir, exist_ok=True)
 
         cmake_options = [
-            "-DCMAKE_BUILD_TYPE=Debug",
+            "-DCMAKE_BUILD_TYPE=release",
             "-DBUILD_SHARED_LIBS=ON",
             f"-DCMAKE_INSTALL_PREFIX={install_dir}",
         ]
diff --git a/tox.ini b/tox.ini
index e4b28b2..a4f4498 100644
--- a/tox.ini
+++ b/tox.ini
@@ -44,6 +44,7 @@ deps =
     black
     blackdoc
     isort
+    clang-format
 commands =
     isort {[testenv]lint_folders}
     black {[testenv]lint_folders}