From 934f0a296dfc2e1f28e11b522be0c2e42db9bb37 Mon Sep 17 00:00:00 2001 From: Filippo Bigi <98903385+frostedoyster@users.noreply.github.com> Date: Tue, 14 Nov 2023 13:03:29 +0100 Subject: [PATCH 1/2] Add tests (#6) --- .github/workflows/tests.yml | 63 +++++++++++++++++++ .gitignore | 1 + README.md | 1 + mops/CMakeLists.txt | 3 +- python/mops/src/mops/opsa.py | 36 +++++------ .../src/mops/reference_implementations.py | 22 +++++-- python/mops/tests/opsa.py | 35 +++++++---- tox.ini | 20 +++++- 8 files changed, 141 insertions(+), 40 deletions(-) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..bf0b44e --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,63 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + # Run for all PRs + +jobs: + tests: + runs-on: ${{ matrix.os }} + name: Test on ${{ matrix.os }} + strategy: + matrix: + include: + - os: ubuntu-20.04 + - os: macos-11 + steps: + - uses: actions/checkout@v3 + + - name: setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: install tests dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + + - name: run C++ build and tests + run: | + mkdir build + cd build + cmake -DMOPS_TESTS=ON ../mops/ + cmake --build . + ctest + + - name: run Python tests + run: python -m tox + + # check that we can build Python wheels on any Python version + python-build: + runs-on: ubuntu-20.04 + name: check Python build + strategy: + matrix: + python-version: ['3.7', '3.11'] + steps: + - uses: actions/checkout@v3 + + - name: set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: install python dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox wheel + + - name: python build tests + run: tox -e build-python diff --git a/.gitignore b/.gitignore index 9c9d88a..b27fce4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ dist/ .tox/ *.egg-info +__pycache__/ diff --git a/README.md b/README.md index 9e7c3c0..b098f08 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ for e in range(E): #### Math notation $$ O_{i{m_3}k} = \sum_{e \in \{e'|I_{e'}=i\}} R_{ek} \sum_{n \in \{n'|M^3_{n'}=m_3\}} C_n A_{e{M_n^1}} X_{{J_e}{M_n^2}k} $$ + #### Calculation ```python diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index f2c79de..953d924 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -66,7 +66,8 @@ add_library(mops if(CMAKE_CUDA_COMPILER) target_compile_definitions(mops PUBLIC MOPS_CUDA_ENABLED) target_sources(mops - "src/opsa/opsa.cu" + PRIVATE + src/opsa/opsa.cu ) endif() diff --git a/python/mops/src/mops/opsa.py b/python/mops/src/mops/opsa.py index d78e258..810b62f 100644 --- a/python/mops/src/mops/opsa.py +++ b/python/mops/src/mops/opsa.py @@ -2,38 +2,34 @@ from ._c_lib import _get_library from .utils import numpy_to_mops_tensor +from .checks import check_opsa -def outer_product_scatter_add(A, B, indices): - return _outer_product_scatter_add_numpy(A, B, indices) +def outer_product_scatter_add(A, B, P, n_O): + check_opsa(A, B, P, n_O) - -def _outer_product_scatter_add_numpy(A, B, indices): A = np.ascontiguousarray(A) B = np.ascontiguousarray(B) - indices = np.ascontiguousarray(indices) + P = np.ascontiguousarray(P) + # TODO: Include these checks in check_opsa if A.dtype != B.dtype: raise TypeError("A and B must have the same dtype") - if len(A.shape) != 2 or len(B.shape) != 2: raise TypeError("A and B must be 2-dimensional arrays") - - if not np.can_cast(indices, np.int32, "same_kind"): - raise TypeError("indices must be an array of integers") - - indices = indices.astype(np.int32) - - if len(indices.shape) != 1: - raise TypeError("indices must be 1-dimensional arrays") - - if A.shape[0] != B.shape[0] or A.shape[0] != indices.shape[0]: + if not np.can_cast(P, np.int32, "same_kind"): + raise TypeError("P must be an array of integers") + if len(P.shape) != 1: + raise TypeError("P must be 1-dimensional arrays") + if A.shape[0] != B.shape[0] or A.shape[0] != P.shape[0]: raise TypeError( - "A, B and indices must have the same number of elements on the " + "A, B and P must have the same number of elements on the " "first dimension" ) - output = np.zeros((np.max(indices) + 1, A.shape[1] * B.shape[1]), dtype=A.dtype) + P = P.astype(np.int32) + + output = np.zeros((n_O, A.shape[1] * B.shape[1]), dtype=A.dtype) # TODO: 3D arrays lib = _get_library() @@ -42,13 +38,13 @@ def _outer_product_scatter_add_numpy(A, B, indices): elif A.dtype == np.float64: function = lib.mops_outer_product_scatter_add_f64 else: - raise TypeError("only float32 and float64 are supported") + raise TypeError("Unsupported dtype detected. Only float32 and float64 are supported") function( numpy_to_mops_tensor(output), numpy_to_mops_tensor(A), numpy_to_mops_tensor(B), - numpy_to_mops_tensor(indices), + numpy_to_mops_tensor(P), ) return output.reshape(-1, A.shape[1], B.shape[1]) diff --git a/python/mops/src/mops/reference_implementations.py b/python/mops/src/mops/reference_implementations.py index d19617b..05f77bd 100644 --- a/python/mops/src/mops/reference_implementations.py +++ b/python/mops/src/mops/reference_implementations.py @@ -1,8 +1,8 @@ import numpy as np -from checks import check_hpe, check_sap, check_opsa, check_opsax, check_sasax +from .checks import check_hpe, check_sap, check_opsa, check_opsax, check_sasax -def hpe(C, A, P): +def homogeneous_polynomial_evaluation(C, A, P): check_hpe(C, A, P) O = np.zeros((A.shape[0],), dtype=A.dtype) @@ -14,8 +14,10 @@ def hpe(C, A, P): temp *= A[:, P[j, k]] O[:] += temp + return O -def sap(C, A, B, P_A, P_B, P_O, n_O): + +def sparse_accumulation_of_products(C, A, B, P_A, P_B, P_O, n_O): check_sap(C, A, B, P_A, P_B, P_O, n_O) O = np.zeros((A.shape[0], n_O), dtype=A.dtype) @@ -23,8 +25,10 @@ def sap(C, A, B, P_A, P_B, P_O, n_O): for k in range(K): O[:, P_O[k]] += C[k] * A[:, P_A[k]] * A[:, P_B[k]] + return O + -def opsa(A, B, P, n_O): +def outer_product_scatter_add(A, B, P, n_O): check_opsa(A, B, P, n_O) O = np.zeros((n_O, A.shape[1], B.shape[1]), dtype=A.dtype) @@ -32,8 +36,10 @@ def opsa(A, B, P, n_O): for j in range(J): O[P[j], :, :] += A[j, :, None] * B[j, None, :] + return O + -def opsax(A, R, X, I, J, n_O): +def outer_product_scatter_add_with_weights(A, R, X, I, J, n_O): check_opsax(A, R, X, I, J, n_O) O = np.zeros((n_O, A.shape[1], R.shape[1]), dtype=A.dtype) @@ -41,8 +47,10 @@ def opsax(A, R, X, I, J, n_O): for e in range(E): O[I[e], :, :] += A[e, :, None] * R[e, None, :] * X[J[e], None, :] + return O -def sasax(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2): + +def sparse_accumulation_scatter_add_with_weights(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2): check_sasax(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2) O = np.zeros((n_O1, n_O2, A.shape[1]), dtype=A.dtype) @@ -51,3 +59,5 @@ def sasax(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2): for e in range(E): for n in range(N): O[I[e], M_3[n], :] += R[e, :] * C[n] * A[e, M_1[n]] * X[J[e], M_2[n], :] + + return O diff --git a/python/mops/tests/opsa.py b/python/mops/tests/opsa.py index 6ea13cc..bd549dc 100644 --- a/python/mops/tests/opsa.py +++ b/python/mops/tests/opsa.py @@ -1,20 +1,25 @@ import numpy as np +np.random.seed(0xDEADBEEF) -import mops +import pytest +from mops.reference_implementations import outer_product_scatter_add as ref_opsa +from mops import outer_product_scatter_add as opsa -def reference_implementation(A, B, indices): - assert A.shape[0] == B.shape[0] - output = np.zeros((np.max(indices) + 1, A.shape[1], B.shape[1])) - for a, b, i in zip(A, B, indices): - output[i] += np.tensordot(a, b, axes=0) +def test_opsa(): - return output + A = np.random.rand(100, 20) + B = np.random.rand(100, 5) + indices = np.sort(np.random.randint(10, size=(100,))) -def test_opsa_numpy(): - np.random.seed(0xDEADBEEF) + reference = ref_opsa(A, B, indices, np.max(indices)+1) + actual = opsa(A, B, indices, np.max(indices)+1) + assert np.allclose(reference, actual) + + +def test_opsa_no_neighbors(): A = np.random.rand(100, 20) B = np.random.rand(100, 5) @@ -23,9 +28,15 @@ def test_opsa_numpy(): # substitute all 1s by 2s so as to test the no-neighbor case indices[indices == 1] = 2 - reference = reference_implementation(A, B, indices) - actual = mops.outer_product_scatter_add(A, B, indices) + reference = ref_opsa(A, B, indices, np.max(indices)+1) + actual = opsa(A, B, indices, np.max(indices)+1) assert np.allclose(reference, actual) -test_opsa_numpy() +def test_opsa_wrong_type(): + + with pytest.raises( + ValueError, + match="A must be a 2D array in opsa, got a 1D array" + ): + opsa(np.array([1]), 2, 3, 4) diff --git a/tox.ini b/tox.ini index d861d70..bee3f18 100644 --- a/tox.ini +++ b/tox.ini @@ -25,12 +25,30 @@ cmake-options = -DCMAKE_BUILD_TYPE=Debug \ -DMOPS_TESTS=ON - commands = cmake {toxinidir}/mops -B {envdir}/build {[testenv:cxx-tests]cmake-options} cmake --build {envdir}/build --config Debug ctest --test-dir {envdir}/build --build-config Debug --output-on-failure +[testenv:build-python] +# this environement makes sure one can build sdist and wheels for Python +deps = + setuptools + wheel + cmake + twine + build + +allowlist_externals = + bash + +commands = + # check building sdist and wheels from a checkout + python -m build . --outdir dist + twine check dist/*.tar.gz + twine check dist/*.whl + + ; [testenv:torch-cxx-tests] ; package = skip ; passenv = * From 4232a2e7354a030f877aeb690aabf752eb141e2c Mon Sep 17 00:00:00 2001 From: Filippo Bigi <98903385+frostedoyster@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:08:43 +0100 Subject: [PATCH 2/2] Port sparse accumulation (CPU) into mops (#7) --- mops/CMakeLists.txt | 4 + mops/include/mops/sap.h | 65 +++++++++++ mops/include/mops/sap.hpp | 60 ++++++++++ mops/include/mops/tensor.h | 10 ++ mops/src/sap/capi.cpp | 105 ++++++++++++++++++ mops/src/sap/cpu.tpp | 75 +++++++++++++ mops/src/sap/cuda.tpp | 16 +++ mops/src/sap/sap.cpp | 62 +++++++++++ mops/src/sap/sap.cu | 1 + python/mops/src/mops/__init__.py | 1 + python/mops/src/mops/_c_api.py | 59 ++++++++++ python/mops/src/mops/checks.py | 10 +- .../src/mops/reference_implementations.py | 8 +- python/mops/src/mops/sap.py | 42 +++++++ python/mops/src/mops/utils.py | 26 ++++- python/mops/tests/sap.py | 29 +++++ 16 files changed, 560 insertions(+), 13 deletions(-) create mode 100644 mops/include/mops/sap.h create mode 100644 mops/include/mops/sap.hpp create mode 100644 mops/src/sap/capi.cpp create mode 100644 mops/src/sap/cpu.tpp create mode 100644 mops/src/sap/cuda.tpp create mode 100644 mops/src/sap/sap.cpp create mode 100644 mops/src/sap/sap.cu create mode 100644 python/mops/src/mops/sap.py create mode 100644 python/mops/tests/sap.py diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index 953d924..e5c5e4d 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -55,12 +55,16 @@ add_library(mops "src/capi.cpp" "src/opsa/opsa.cpp" "src/opsa/capi.cpp" + "src/sap/sap.cpp" + "src/sap/capi.cpp" "include/mops.hpp" "include/mops.h" "include/mops/capi.hpp" "include/mops/opsa.hpp" "include/mops/opsa.h" + "include/mops/sap.hpp" + "include/mops/sap.h" ) if(CMAKE_CUDA_COMPILER) diff --git a/mops/include/mops/sap.h b/mops/include/mops/sap.h new file mode 100644 index 0000000..3ce2d6c --- /dev/null +++ b/mops/include/mops/sap.h @@ -0,0 +1,65 @@ +#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H +#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H + +#include "mops/exports.h" +#include "mops/tensor.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +/// CPU version of mops::sparse_accumulation_of_products for 32-bit floats +int MOPS_EXPORT mops_sparse_accumulation_of_products_f32( + mops_tensor_2d_f32_t output, + mops_tensor_2d_f32_t tensor_a, + mops_tensor_2d_f32_t tensor_b, + mops_tensor_1d_f32_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +); + + +/// CPU version of mops::sparse_accumulation_of_products for 64-bit floats +int MOPS_EXPORT mops_sparse_accumulation_of_products_f64( + mops_tensor_2d_f64_t output, + mops_tensor_2d_f64_t tensor_a, + mops_tensor_2d_f64_t tensor_b, + mops_tensor_1d_f64_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +); + + +/// CUDA version of mops::sparse_accumulation_of_products for 32-bit floats +int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f32( + mops_tensor_2d_f32_t output, + mops_tensor_2d_f32_t tensor_a, + mops_tensor_2d_f32_t tensor_b, + mops_tensor_1d_f32_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +); + + +/// CUDA version of mops::sparse_accumulation_of_products for 64-bit floats +int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f64( + mops_tensor_2d_f64_t output, + mops_tensor_2d_f64_t tensor_a, + mops_tensor_2d_f64_t tensor_b, + mops_tensor_1d_f64_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +); + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/mops/include/mops/sap.hpp b/mops/include/mops/sap.hpp new file mode 100644 index 0000000..e66dc90 --- /dev/null +++ b/mops/include/mops/sap.hpp @@ -0,0 +1,60 @@ +#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP +#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP + +#include +#include + +#include "mops/exports.h" +#include "mops/tensor.hpp" + +namespace mops { + /// TODO + template + void MOPS_EXPORT sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o + ); + + // these templates will be precompiled and provided in the mops library + extern template void sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o + ); + + extern template void sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o + ); + + namespace cuda { + /// CUDA version of mops::sparse_accumulation_of_products + template + void MOPS_EXPORT sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o + ); + } +} + + +#endif diff --git a/mops/include/mops/tensor.h b/mops/include/mops/tensor.h index c42898c..6e48a9a 100644 --- a/mops/include/mops/tensor.h +++ b/mops/include/mops/tensor.h @@ -18,6 +18,16 @@ struct mops_tensor_2d_f64_t { int64_t shape[2]; }; +struct mops_tensor_1d_f32_t { + float* __restrict__ data; + int64_t shape[1]; +}; + +struct mops_tensor_1d_f64_t { + double* __restrict__ data; + int64_t shape[1]; +}; + struct mops_tensor_1d_i32_t { int32_t* __restrict__ data; int64_t shape[1]; diff --git a/mops/src/sap/capi.cpp b/mops/src/sap/capi.cpp new file mode 100644 index 0000000..646c723 --- /dev/null +++ b/mops/src/sap/capi.cpp @@ -0,0 +1,105 @@ +#include "mops/capi.hpp" + +#include "mops/sap.hpp" +#include "mops/sap.h" + +static size_t checked_cast(int64_t value) { + if (value < 0 || static_cast(value) > static_cast(SIZE_MAX)) { + throw std::runtime_error( + "integer value '" + std::to_string(value) + "' does not fit in size_t" + ); + } + return static_cast(value); +} + + +extern "C" int mops_sparse_accumulation_of_products_f32( + mops_tensor_2d_f32_t output, + mops_tensor_2d_f32_t tensor_a, + mops_tensor_2d_f32_t tensor_b, + mops_tensor_1d_f32_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +) { + MOPS_CATCH_EXCEPTIONS( + mops::sparse_accumulation_of_products( + {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}}, + {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}}, + {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}}, + {tensor_c.data, {checked_cast(tensor_c.shape[0])}}, + {p_a.data, {checked_cast(p_a.shape[0])}}, + {p_b.data, {checked_cast(p_b.shape[0])}}, + {p_o.data, {checked_cast(p_o.shape[0])}} + ); + ); +} + + +extern "C" int mops_sparse_accumulation_of_products_f64( + mops_tensor_2d_f64_t output, + mops_tensor_2d_f64_t tensor_a, + mops_tensor_2d_f64_t tensor_b, + mops_tensor_1d_f64_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +) { + MOPS_CATCH_EXCEPTIONS( + mops::sparse_accumulation_of_products( + {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}}, + {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}}, + {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}}, + {tensor_c.data, {checked_cast(tensor_c.shape[0])}}, + {p_a.data, {checked_cast(p_a.shape[0])}}, + {p_b.data, {checked_cast(p_b.shape[0])}}, + {p_o.data, {checked_cast(p_o.shape[0])}} + ); + ); +} + + +extern "C" int mops_cuda_sparse_accumulation_of_products_f32( + mops_tensor_2d_f32_t output, + mops_tensor_2d_f32_t tensor_a, + mops_tensor_2d_f32_t tensor_b, + mops_tensor_1d_f32_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +) { + MOPS_CATCH_EXCEPTIONS( + mops::cuda::sparse_accumulation_of_products( + {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}}, + {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}}, + {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}}, + {tensor_c.data, {checked_cast(tensor_c.shape[0])}}, + {p_a.data, {checked_cast(p_a.shape[0])}}, + {p_b.data, {checked_cast(p_b.shape[0])}}, + {p_o.data, {checked_cast(p_o.shape[0])}} + ); + ); +} + + +extern "C" int mops_cuda_sparse_accumulation_of_products_f64( + mops_tensor_2d_f64_t output, + mops_tensor_2d_f64_t tensor_a, + mops_tensor_2d_f64_t tensor_b, + mops_tensor_1d_f64_t tensor_c, + mops_tensor_1d_i32_t p_a, + mops_tensor_1d_i32_t p_b, + mops_tensor_1d_i32_t p_o +) { + MOPS_CATCH_EXCEPTIONS( + mops::cuda::sparse_accumulation_of_products( + {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}}, + {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}}, + {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}}, + {tensor_c.data, {checked_cast(tensor_c.shape[0])}}, + {p_a.data, {checked_cast(p_a.shape[0])}}, + {p_b.data, {checked_cast(p_b.shape[0])}}, + {p_o.data, {checked_cast(p_o.shape[0])}} + ); + ); +} diff --git a/mops/src/sap/cpu.tpp b/mops/src/sap/cpu.tpp new file mode 100644 index 0000000..5beb687 --- /dev/null +++ b/mops/src/sap/cpu.tpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +#include "mops/sap.hpp" + +template +void mops::sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o +) { + if (tensor_a.shape[0] != tensor_b.shape[0]) { + throw std::runtime_error( + "A and B tensors must have the same number of elements along the " + "first dimension, got " + std::to_string(tensor_a.shape[0]) + " and " + + std::to_string(tensor_b.shape[0]) + ); + } + + if (tensor_a.shape[0] != output.shape[0]) { + throw std::runtime_error( + "O must contain the same number of elements as the first " + "dimension of A and B , got " + std::to_string(output.shape[0]) + + " and " + std::to_string(tensor_a.shape[0]) + ); + } + + if (tensor_c.shape[0] != p_b.shape[0]) { + throw std::runtime_error( + "the dimension of C must match that of P_B, got " + + std::to_string(tensor_c.shape[0]) + + " for C and " + std::to_string(p_b.shape[0]) + " for P_B" + ); + } + + if (tensor_c.shape[0] != p_o.shape[0]) { + throw std::runtime_error( + "the dimension of C must match that of P_O, got " + + std::to_string(tensor_c.shape[0]) + + " for C and " + std::to_string(p_o.shape[0]) + " for P_O" + ); + } + + // TODO: check sorting (not necessary here, necessary in CUDA implementation)? + + scalar_t* a_ptr = tensor_a.data; + scalar_t* b_ptr = tensor_b.data; + scalar_t* o_ptr = output.data; + scalar_t* c_ptr = tensor_c.data; + int32_t* p_a_ptr = p_a.data; + int32_t* p_b_ptr = p_b.data; + int32_t* p_o_ptr = p_o.data; + + long size_first_dimension = tensor_a.shape[0]; + long size_second_dimension_a = tensor_a.shape[1]; + long size_second_dimension_b = tensor_b.shape[1]; + long size_second_dimension_o = output.shape[1]; + long c_size = tensor_c.shape[0]; + + for (int i = 0; i < size_first_dimension; i++) { + long shift_first_dimension_a = i * size_second_dimension_a; + long shift_first_dimension_b = i * size_second_dimension_b; + long shift_first_dimension_o = i * size_second_dimension_o; + for (int j = 0; j < c_size; j++) { + o_ptr[shift_first_dimension_o + p_o_ptr[j]] += + c_ptr[j] * a_ptr[shift_first_dimension_a + p_a_ptr[j]] * b_ptr[shift_first_dimension_b + p_b_ptr[j]]; + } + } +} diff --git a/mops/src/sap/cuda.tpp b/mops/src/sap/cuda.tpp new file mode 100644 index 0000000..fa721f0 --- /dev/null +++ b/mops/src/sap/cuda.tpp @@ -0,0 +1,16 @@ +#include + +#include "mops/sap.hpp" + +template +void mops::cuda::sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o +) { + throw std::runtime_error("CUDA implementation does not exist yet"); +} diff --git a/mops/src/sap/sap.cpp b/mops/src/sap/sap.cpp new file mode 100644 index 0000000..be7324d --- /dev/null +++ b/mops/src/sap/sap.cpp @@ -0,0 +1,62 @@ +#include "cpu.tpp" + +// explicit instanciations of templates +template void mops::sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o +); + +template void mops::sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o +); + + +#ifdef MOPS_CUDA_ENABLED + #include "cuda.tpp" +#else +template +void mops::cuda::sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o +) { + throw std::runtime_error("MOPS was not compiled with CUDA support"); +} + +#endif + +// explicit instanciations of CUDA templates +template void mops::cuda::sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o +); + +template void mops::cuda::sparse_accumulation_of_products( + Tensor output, + Tensor tensor_a, + Tensor tensor_b, + Tensor tensor_c, + Tensor p_a, + Tensor p_b, + Tensor p_o +); diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu new file mode 100644 index 0000000..8421bac --- /dev/null +++ b/mops/src/sap/sap.cu @@ -0,0 +1 @@ +// todo: cuda device code diff --git a/python/mops/src/mops/__init__.py b/python/mops/src/mops/__init__.py index cec030e..e48f08a 100644 --- a/python/mops/src/mops/__init__.py +++ b/python/mops/src/mops/__init__.py @@ -1 +1,2 @@ from .opsa import outer_product_scatter_add # noqa +from .sap import sparse_accumulation_of_products # noqa diff --git a/python/mops/src/mops/_c_api.py b/python/mops/src/mops/_c_api.py index 115c57a..c791ca3 100644 --- a/python/mops/src/mops/_c_api.py +++ b/python/mops/src/mops/_c_api.py @@ -15,6 +15,20 @@ class mops_tensor_2d_f64_t(ctypes.Structure): ] +class mops_tensor_1d_f32_t(ctypes.Structure): + _fields_ = [ + ("data", ctypes.POINTER(ctypes.c_float)), + ("shape", ctypes.ARRAY(ctypes.c_int64, 1)), + ] + + +class mops_tensor_1d_f64_t(ctypes.Structure): + _fields_ = [ + ("data", ctypes.POINTER(ctypes.c_double)), + ("shape", ctypes.ARRAY(ctypes.c_int64, 1)), + ] + + class mops_tensor_1d_i32_t(ctypes.Structure): _fields_ = [ ("data", ctypes.POINTER(ctypes.c_int32)), @@ -60,3 +74,48 @@ def setup_functions(lib): mops_tensor_1d_i32_t, ] lib.mops_cuda_outer_product_scatter_add_f64.restype = _check_status + + # sparse_accumulation_of_products + lib.mops_sparse_accumulation_of_products_f32.argtypes = [ + mops_tensor_2d_f32_t, + mops_tensor_2d_f32_t, + mops_tensor_2d_f32_t, + mops_tensor_1d_f32_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + ] + lib.mops_sparse_accumulation_of_products_f32.restype = _check_status + + lib.mops_sparse_accumulation_of_products_f64.argtypes = [ + mops_tensor_2d_f64_t, + mops_tensor_2d_f64_t, + mops_tensor_2d_f64_t, + mops_tensor_1d_f64_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + ] + lib.mops_sparse_accumulation_of_products_f64.restype = _check_status + + lib.mops_cuda_sparse_accumulation_of_products_f32.argtypes = [ + mops_tensor_2d_f32_t, + mops_tensor_2d_f32_t, + mops_tensor_2d_f32_t, + mops_tensor_1d_f32_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + ] + lib.mops_cuda_sparse_accumulation_of_products_f32.restype = _check_status + + lib.mops_cuda_sparse_accumulation_of_products_f64.argtypes = [ + mops_tensor_2d_f64_t, + mops_tensor_2d_f64_t, + mops_tensor_2d_f64_t, + mops_tensor_1d_f64_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + mops_tensor_1d_i32_t, + ] + lib.mops_cuda_sparse_accumulation_of_products_f64.restype = _check_status diff --git a/python/mops/src/mops/checks.py b/python/mops/src/mops/checks.py index 6a5ad57..e9da7fa 100644 --- a/python/mops/src/mops/checks.py +++ b/python/mops/src/mops/checks.py @@ -35,22 +35,22 @@ def check_hpe(C, A, P): check_array_dtype(P, np.integer, "hpe", "P") -def check_sap(C, A, B, P_A, P_B, P_O, n_O): +def check_sap(A, B, C, P_A, P_B, P_O, n_O): # Check dimensions - check_number_of_dimensions(C, 1, "sap", "C") check_number_of_dimensions(A, 2, "sap", "A") check_number_of_dimensions(B, 2, "sap", "B") + check_number_of_dimensions(C, 1, "sap", "C") check_number_of_dimensions(P_A, 1, "sap", "P_A") - check_number_of_dimensions(P_B, 2, "sap", "P_B") - check_number_of_dimensions(P_O, 2, "sap", "P_O") + check_number_of_dimensions(P_B, 1, "sap", "P_B") + check_number_of_dimensions(P_O, 1, "sap", "P_O") check_scalar(n_O, "sap", "n_O") # TODO: additional dimension checks (some dimensions sizes of the inputs must match) # Check types - check_array_dtype(C, np.floating, "sap", "C") check_array_dtype(A, np.floating, "sap", "A") check_array_dtype(B, np.floating, "sap", "B") + check_array_dtype(C, np.floating, "sap", "C") check_array_dtype(P_A, np.integer, "sap", "P_A") check_array_dtype(P_B, np.integer, "sap", "P_B") check_array_dtype(P_O, np.integer, "sap", "P_O") diff --git a/python/mops/src/mops/reference_implementations.py b/python/mops/src/mops/reference_implementations.py index 05f77bd..5877be8 100644 --- a/python/mops/src/mops/reference_implementations.py +++ b/python/mops/src/mops/reference_implementations.py @@ -17,13 +17,15 @@ def homogeneous_polynomial_evaluation(C, A, P): return O -def sparse_accumulation_of_products(C, A, B, P_A, P_B, P_O, n_O): - check_sap(C, A, B, P_A, P_B, P_O, n_O) +def sparse_accumulation_of_products(A, B, C, P_A, P_B, P_O, n_O): + check_sap(A, B, C, P_A, P_B, P_O, n_O) O = np.zeros((A.shape[0], n_O), dtype=A.dtype) K = C.shape[0] for k in range(K): - O[:, P_O[k]] += C[k] * A[:, P_A[k]] * A[:, P_B[k]] + O[:, P_O[k]] += C[k] * A[:, P_A[k]] * B[:, P_B[k]] + + return O return O diff --git a/python/mops/src/mops/sap.py b/python/mops/src/mops/sap.py new file mode 100644 index 0000000..338622c --- /dev/null +++ b/python/mops/src/mops/sap.py @@ -0,0 +1,42 @@ +import numpy as np + +from ._c_lib import _get_library +from .utils import numpy_to_mops_tensor +from .checks import check_sap + + +def sparse_accumulation_of_products(A, B, C, P_A, P_B, P_O, n_O): + check_sap(A, B, C, P_A, P_B, P_O, n_O) + + A = np.ascontiguousarray(A) + B = np.ascontiguousarray(B) + C = np.ascontiguousarray(C) + P_A = np.ascontiguousarray(P_A) + P_B = np.ascontiguousarray(P_B) + P_O = np.ascontiguousarray(P_O) + P_A = P_A.astype(np.int32) + P_B = P_B.astype(np.int32) + P_O = P_O.astype(np.int32) + + O = np.zeros((A.shape[0], n_O), dtype=A.dtype) + + lib = _get_library() + + if A.dtype == np.float32: + function = lib.mops_sparse_accumulation_of_products_f32 + elif A.dtype == np.float64: + function = lib.mops_sparse_accumulation_of_products_f64 + else: + raise TypeError("Unsupported dtype detected. Only float32 and float64 are supported") + + function( + numpy_to_mops_tensor(O), + numpy_to_mops_tensor(A), + numpy_to_mops_tensor(B), + numpy_to_mops_tensor(C), + numpy_to_mops_tensor(P_A), + numpy_to_mops_tensor(P_B), + numpy_to_mops_tensor(P_O), + ) + + return O diff --git a/python/mops/src/mops/utils.py b/python/mops/src/mops/utils.py index f0cde91..840ee76 100644 --- a/python/mops/src/mops/utils.py +++ b/python/mops/src/mops/utils.py @@ -2,30 +2,46 @@ import numpy as np -from ._c_api import mops_tensor_1d_i32_t, mops_tensor_2d_f32_t, mops_tensor_2d_f64_t +from ._c_api import ( + mops_tensor_1d_i32_t, + mops_tensor_1d_f32_t, + mops_tensor_1d_f64_t, + mops_tensor_2d_f32_t, + mops_tensor_2d_f64_t +) def numpy_to_mops_tensor(array): assert isinstance(array, np.ndarray) if array.dtype == np.float32: - if len(array.shape) == 2: + if len(array.shape) == 1: + tensor = mops_tensor_1d_f32_t() + tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + tensor.shape[0] = array.shape[0] + return tensor + elif len(array.shape) == 2: tensor = mops_tensor_2d_f32_t() tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) tensor.shape[0] = array.shape[0] tensor.shape[1] = array.shape[1] return tensor else: - raise TypeError("we can only convert 2D arrays of float32") + raise TypeError("we can only convert 1D and 2D arrays of float32") elif array.dtype == np.float64: - if len(array.shape) == 2: + if len(array.shape) == 1: + tensor = mops_tensor_1d_f64_t() + tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) + tensor.shape[0] = array.shape[0] + return tensor + elif len(array.shape) == 2: tensor = mops_tensor_2d_f64_t() tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) tensor.shape[0] = array.shape[0] tensor.shape[1] = array.shape[1] return tensor else: - raise TypeError("we can only convert 2D arrays of float64") + raise TypeError("we can only convert 1D and 2D arrays of float64") elif array.dtype == np.int32: if len(array.shape) == 1: tensor = mops_tensor_1d_i32_t() diff --git a/python/mops/tests/sap.py b/python/mops/tests/sap.py new file mode 100644 index 0000000..3e75a66 --- /dev/null +++ b/python/mops/tests/sap.py @@ -0,0 +1,29 @@ +import numpy as np +np.random.seed(0xDEADBEEF) + +import pytest + +from mops.reference_implementations import sparse_accumulation_of_products as ref_sap +from mops import sparse_accumulation_of_products as sap + + +def test_sap(): + + A = np.random.rand(100, 20) + B = np.random.rand(100, 6) + C = np.random.rand(30) + + P_A = np.random.randint(20, size=(30,)) + P_B = np.random.randint(6, size=(30,)) + n_O = 35 + P_O = np.random.randint(n_O, size=(30,)) + + reference = ref_sap(A, B, C, P_A, P_B, P_O, n_O) + actual = sap(A, B, C, P_A, P_B, P_O, n_O) + assert np.allclose(reference, actual) + + +def test_sap_wrong_type(): + + with pytest.raises(ValueError): + sap(np.array(1), 2, 3, 4, 5, 6, 7)