From 934f0a296dfc2e1f28e11b522be0c2e42db9bb37 Mon Sep 17 00:00:00 2001
From: Filippo Bigi <98903385+frostedoyster@users.noreply.github.com>
Date: Tue, 14 Nov 2023 13:03:29 +0100
Subject: [PATCH 1/2] Add tests (#6)

---
 .github/workflows/tests.yml                   | 63 +++++++++++++++++++
 .gitignore                                    |  1 +
 README.md                                     |  1 +
 mops/CMakeLists.txt                           |  3 +-
 python/mops/src/mops/opsa.py                  | 36 +++++------
 .../src/mops/reference_implementations.py     | 22 +++++--
 python/mops/tests/opsa.py                     | 35 +++++++----
 tox.ini                                       | 20 +++++-
 8 files changed, 141 insertions(+), 40 deletions(-)
 create mode 100644 .github/workflows/tests.yml

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..bf0b44e
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,63 @@
+name: Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    # Run for all PRs
+
+jobs:
+  tests:
+    runs-on: ${{ matrix.os }}
+    name: Test on ${{ matrix.os }}
+    strategy:
+      matrix:
+        include:
+          - os: ubuntu-20.04
+          - os: macos-11
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: install tests dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install tox
+
+      - name: run C++ build and tests
+        run: |
+          mkdir build
+          cd build
+          cmake -DMOPS_TESTS=ON ../mops/
+          cmake --build .
+          ctest
+
+      - name: run Python tests
+        run: python -m tox
+
+  # check that we can build Python wheels on any Python version
+  python-build:
+    runs-on: ubuntu-20.04
+    name: check Python build
+    strategy:
+      matrix:
+        python-version: ['3.7', '3.11']
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: install python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install tox wheel
+
+      - name: python build tests
+        run: tox -e build-python
diff --git a/.gitignore b/.gitignore
index 9c9d88a..b27fce4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ dist/
 .tox/
 
 *.egg-info
+__pycache__/
diff --git a/README.md b/README.md
index 9e7c3c0..b098f08 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,7 @@ for e in range(E):
 #### Math notation
 
 $$ O_{i{m_3}k} = \sum_{e \in \{e'|I_{e'}=i\}} R_{ek} \sum_{n \in \{n'|M^3_{n'}=m_3\}} C_n A_{e{M_n^1}} X_{{J_e}{M_n^2}k} $$
+
 #### Calculation
 
 ```python
diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt
index f2c79de..953d924 100644
--- a/mops/CMakeLists.txt
+++ b/mops/CMakeLists.txt
@@ -66,7 +66,8 @@ add_library(mops
 if(CMAKE_CUDA_COMPILER)
     target_compile_definitions(mops PUBLIC MOPS_CUDA_ENABLED)
     target_sources(mops
-        "src/opsa/opsa.cu"
+        PRIVATE
+            src/opsa/opsa.cu
     )
 endif()
 
diff --git a/python/mops/src/mops/opsa.py b/python/mops/src/mops/opsa.py
index d78e258..810b62f 100644
--- a/python/mops/src/mops/opsa.py
+++ b/python/mops/src/mops/opsa.py
@@ -2,38 +2,34 @@
 
 from ._c_lib import _get_library
 from .utils import numpy_to_mops_tensor
+from .checks import check_opsa
 
 
-def outer_product_scatter_add(A, B, indices):
-    return _outer_product_scatter_add_numpy(A, B, indices)
+def outer_product_scatter_add(A, B, P, n_O):
+    check_opsa(A, B, P, n_O)
 
-
-def _outer_product_scatter_add_numpy(A, B, indices):
     A = np.ascontiguousarray(A)
     B = np.ascontiguousarray(B)
-    indices = np.ascontiguousarray(indices)
+    P = np.ascontiguousarray(P)
 
+    # TODO: Include these checks in check_opsa
     if A.dtype != B.dtype:
         raise TypeError("A and B must have the same dtype")
-
     if len(A.shape) != 2 or len(B.shape) != 2:
         raise TypeError("A and B must be 2-dimensional arrays")
-
-    if not np.can_cast(indices, np.int32, "same_kind"):
-        raise TypeError("indices must be an array of integers")
-
-    indices = indices.astype(np.int32)
-
-    if len(indices.shape) != 1:
-        raise TypeError("indices must be 1-dimensional arrays")
-
-    if A.shape[0] != B.shape[0] or A.shape[0] != indices.shape[0]:
+    if not np.can_cast(P, np.int32, "same_kind"):
+        raise TypeError("P must be an array of integers")
+    if len(P.shape) != 1:
+        raise TypeError("P must be 1-dimensional arrays")
+    if A.shape[0] != B.shape[0] or A.shape[0] != P.shape[0]:
         raise TypeError(
-            "A, B and indices must have the same number of elements on the "
+            "A, B and P must have the same number of elements on the "
             "first dimension"
         )
 
-    output = np.zeros((np.max(indices) + 1, A.shape[1] * B.shape[1]), dtype=A.dtype)
+    P = P.astype(np.int32)
+
+    output = np.zeros((n_O, A.shape[1] * B.shape[1]), dtype=A.dtype)  # TODO: 3D arrays
 
     lib = _get_library()
 
@@ -42,13 +38,13 @@ def _outer_product_scatter_add_numpy(A, B, indices):
     elif A.dtype == np.float64:
         function = lib.mops_outer_product_scatter_add_f64
     else:
-        raise TypeError("only float32 and float64 are supported")
+        raise TypeError("Unsupported dtype detected. Only float32 and float64 are supported")
 
     function(
         numpy_to_mops_tensor(output),
         numpy_to_mops_tensor(A),
         numpy_to_mops_tensor(B),
-        numpy_to_mops_tensor(indices),
+        numpy_to_mops_tensor(P),
     )
 
     return output.reshape(-1, A.shape[1], B.shape[1])
diff --git a/python/mops/src/mops/reference_implementations.py b/python/mops/src/mops/reference_implementations.py
index d19617b..05f77bd 100644
--- a/python/mops/src/mops/reference_implementations.py
+++ b/python/mops/src/mops/reference_implementations.py
@@ -1,8 +1,8 @@
 import numpy as np
-from checks import check_hpe, check_sap, check_opsa, check_opsax, check_sasax
+from .checks import check_hpe, check_sap, check_opsa, check_opsax, check_sasax
 
 
-def hpe(C, A, P):
+def homogeneous_polynomial_evaluation(C, A, P):
     check_hpe(C, A, P)
 
     O = np.zeros((A.shape[0],), dtype=A.dtype)
@@ -14,8 +14,10 @@ def hpe(C, A, P):
             temp *= A[:, P[j, k]]
         O[:] += temp
 
+    return O
 
-def sap(C, A, B, P_A, P_B, P_O, n_O):
+
+def sparse_accumulation_of_products(C, A, B, P_A, P_B, P_O, n_O):
     check_sap(C, A, B, P_A, P_B, P_O, n_O)
 
     O = np.zeros((A.shape[0], n_O), dtype=A.dtype)
@@ -23,8 +25,10 @@ def sap(C, A, B, P_A, P_B, P_O, n_O):
     for k in range(K):
         O[:, P_O[k]] += C[k] * A[:, P_A[k]] * A[:, P_B[k]]
 
+    return O
+
 
-def opsa(A, B, P, n_O):
+def outer_product_scatter_add(A, B, P, n_O):
     check_opsa(A, B, P, n_O)
 
     O = np.zeros((n_O, A.shape[1], B.shape[1]), dtype=A.dtype)
@@ -32,8 +36,10 @@ def opsa(A, B, P, n_O):
     for j in range(J):
         O[P[j], :, :] += A[j, :, None] * B[j, None, :]
 
+    return O
+
 
-def opsax(A, R, X, I, J, n_O):
+def outer_product_scatter_add_with_weights(A, R, X, I, J, n_O):
     check_opsax(A, R, X, I, J, n_O)
 
     O = np.zeros((n_O, A.shape[1], R.shape[1]), dtype=A.dtype)
@@ -41,8 +47,10 @@ def opsax(A, R, X, I, J, n_O):
     for e in range(E):
         O[I[e], :, :] += A[e, :, None] * R[e, None, :] * X[J[e], None, :]
 
+    return O
 
-def sasax(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2):
+
+def sparse_accumulation_scatter_add_with_weights(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2):
     check_sasax(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2)
 
     O = np.zeros((n_O1, n_O2, A.shape[1]), dtype=A.dtype)
@@ -51,3 +59,5 @@ def sasax(C, A, R, X, I, J, M_1, M_2, M_3, n_O1, n_O2):
     for e in range(E):
         for n in range(N):
             O[I[e], M_3[n], :] += R[e, :] * C[n] * A[e, M_1[n]] * X[J[e], M_2[n], :]
+
+    return O
diff --git a/python/mops/tests/opsa.py b/python/mops/tests/opsa.py
index 6ea13cc..bd549dc 100644
--- a/python/mops/tests/opsa.py
+++ b/python/mops/tests/opsa.py
@@ -1,20 +1,25 @@
 import numpy as np
+np.random.seed(0xDEADBEEF)
 
-import mops
+import pytest
 
+from mops.reference_implementations import outer_product_scatter_add as ref_opsa
+from mops import outer_product_scatter_add as opsa
 
-def reference_implementation(A, B, indices):
-    assert A.shape[0] == B.shape[0]
-    output = np.zeros((np.max(indices) + 1, A.shape[1], B.shape[1]))
 
-    for a, b, i in zip(A, B, indices):
-        output[i] += np.tensordot(a, b, axes=0)
+def test_opsa():
 
-    return output
+    A = np.random.rand(100, 20)
+    B = np.random.rand(100, 5)
 
+    indices = np.sort(np.random.randint(10, size=(100,)))
 
-def test_opsa_numpy():
-    np.random.seed(0xDEADBEEF)
+    reference = ref_opsa(A, B, indices, np.max(indices)+1)
+    actual = opsa(A, B, indices, np.max(indices)+1)
+    assert np.allclose(reference, actual)
+
+
+def test_opsa_no_neighbors():
 
     A = np.random.rand(100, 20)
     B = np.random.rand(100, 5)
@@ -23,9 +28,15 @@ def test_opsa_numpy():
     # substitute all 1s by 2s so as to test the no-neighbor case
     indices[indices == 1] = 2
 
-    reference = reference_implementation(A, B, indices)
-    actual = mops.outer_product_scatter_add(A, B, indices)
+    reference = ref_opsa(A, B, indices, np.max(indices)+1)
+    actual = opsa(A, B, indices, np.max(indices)+1)
     assert np.allclose(reference, actual)
 
 
-test_opsa_numpy()
+def test_opsa_wrong_type():
+
+    with pytest.raises(
+        ValueError,
+        match="A must be a 2D array in opsa, got a 1D array"
+    ):
+        opsa(np.array([1]), 2, 3, 4)
diff --git a/tox.ini b/tox.ini
index d861d70..bee3f18 100644
--- a/tox.ini
+++ b/tox.ini
@@ -25,12 +25,30 @@ cmake-options =
     -DCMAKE_BUILD_TYPE=Debug \
     -DMOPS_TESTS=ON
 
-
 commands =
     cmake {toxinidir}/mops -B {envdir}/build {[testenv:cxx-tests]cmake-options}
     cmake --build {envdir}/build --config Debug
     ctest --test-dir {envdir}/build --build-config Debug --output-on-failure
 
+[testenv:build-python]
+# this environement makes sure one can build sdist and wheels for Python
+deps =
+    setuptools
+    wheel
+    cmake
+    twine
+    build
+
+allowlist_externals =
+    bash
+
+commands =
+    # check building sdist and wheels from a checkout
+    python -m build .  --outdir dist
+    twine check dist/*.tar.gz
+    twine check dist/*.whl
+
+
 ; [testenv:torch-cxx-tests]
 ; package = skip
 ; passenv = *

From 4232a2e7354a030f877aeb690aabf752eb141e2c Mon Sep 17 00:00:00 2001
From: Filippo Bigi <98903385+frostedoyster@users.noreply.github.com>
Date: Mon, 20 Nov 2023 14:08:43 +0100
Subject: [PATCH 2/2] Port sparse accumulation (CPU) into mops (#7)

---
 mops/CMakeLists.txt                           |   4 +
 mops/include/mops/sap.h                       |  65 +++++++++++
 mops/include/mops/sap.hpp                     |  60 ++++++++++
 mops/include/mops/tensor.h                    |  10 ++
 mops/src/sap/capi.cpp                         | 105 ++++++++++++++++++
 mops/src/sap/cpu.tpp                          |  75 +++++++++++++
 mops/src/sap/cuda.tpp                         |  16 +++
 mops/src/sap/sap.cpp                          |  62 +++++++++++
 mops/src/sap/sap.cu                           |   1 +
 python/mops/src/mops/__init__.py              |   1 +
 python/mops/src/mops/_c_api.py                |  59 ++++++++++
 python/mops/src/mops/checks.py                |  10 +-
 .../src/mops/reference_implementations.py     |   8 +-
 python/mops/src/mops/sap.py                   |  42 +++++++
 python/mops/src/mops/utils.py                 |  26 ++++-
 python/mops/tests/sap.py                      |  29 +++++
 16 files changed, 560 insertions(+), 13 deletions(-)
 create mode 100644 mops/include/mops/sap.h
 create mode 100644 mops/include/mops/sap.hpp
 create mode 100644 mops/src/sap/capi.cpp
 create mode 100644 mops/src/sap/cpu.tpp
 create mode 100644 mops/src/sap/cuda.tpp
 create mode 100644 mops/src/sap/sap.cpp
 create mode 100644 mops/src/sap/sap.cu
 create mode 100644 python/mops/src/mops/sap.py
 create mode 100644 python/mops/tests/sap.py

diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt
index 953d924..e5c5e4d 100644
--- a/mops/CMakeLists.txt
+++ b/mops/CMakeLists.txt
@@ -55,12 +55,16 @@ add_library(mops
     "src/capi.cpp"
     "src/opsa/opsa.cpp"
     "src/opsa/capi.cpp"
+    "src/sap/sap.cpp"
+    "src/sap/capi.cpp"
 
     "include/mops.hpp"
     "include/mops.h"
     "include/mops/capi.hpp"
     "include/mops/opsa.hpp"
     "include/mops/opsa.h"
+    "include/mops/sap.hpp"
+    "include/mops/sap.h"
 )
 
 if(CMAKE_CUDA_COMPILER)
diff --git a/mops/include/mops/sap.h b/mops/include/mops/sap.h
new file mode 100644
index 0000000..3ce2d6c
--- /dev/null
+++ b/mops/include/mops/sap.h
@@ -0,0 +1,65 @@
+#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H
+#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H
+
+#include "mops/exports.h"
+#include "mops/tensor.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// CPU version of mops::sparse_accumulation_of_products for 32-bit floats
+int MOPS_EXPORT mops_sparse_accumulation_of_products_f32(
+    mops_tensor_2d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_b,
+    mops_tensor_1d_f32_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+);
+
+
+/// CPU version of mops::sparse_accumulation_of_products for 64-bit floats
+int MOPS_EXPORT mops_sparse_accumulation_of_products_f64(
+    mops_tensor_2d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_b,
+    mops_tensor_1d_f64_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+);
+
+
+/// CUDA version of mops::sparse_accumulation_of_products for 32-bit floats
+int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f32(
+    mops_tensor_2d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_b,
+    mops_tensor_1d_f32_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+);
+
+
+/// CUDA version of mops::sparse_accumulation_of_products for 64-bit floats
+int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f64(
+    mops_tensor_2d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_b,
+    mops_tensor_1d_f64_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/mops/include/mops/sap.hpp b/mops/include/mops/sap.hpp
new file mode 100644
index 0000000..e66dc90
--- /dev/null
+++ b/mops/include/mops/sap.hpp
@@ -0,0 +1,60 @@
+#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP
+#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP
+
+#include <cstddef>
+#include <cstdint>
+
+#include "mops/exports.h"
+#include "mops/tensor.hpp"
+
+namespace mops {
+    /// TODO
+    template<typename scalar_t>
+    void MOPS_EXPORT sparse_accumulation_of_products(
+        Tensor<scalar_t, 2> output,
+        Tensor<scalar_t, 2> tensor_a,
+        Tensor<scalar_t, 2> tensor_b,
+        Tensor<scalar_t, 1> tensor_c,
+        Tensor<int32_t, 1> p_a,
+        Tensor<int32_t, 1> p_b,
+        Tensor<int32_t, 1> p_o
+    );
+
+    // these templates will be precompiled and provided in the mops library
+    extern template void sparse_accumulation_of_products(
+        Tensor<float, 2> output,
+        Tensor<float, 2> tensor_a,
+        Tensor<float, 2> tensor_b,
+        Tensor<float, 1> tensor_c,
+        Tensor<int32_t, 1> p_a,
+        Tensor<int32_t, 1> p_b,
+        Tensor<int32_t, 1> p_o
+    );
+
+    extern template void sparse_accumulation_of_products(
+        Tensor<double, 2> output,
+        Tensor<double, 2> tensor_a,
+        Tensor<double, 2> tensor_b,
+        Tensor<double, 1> tensor_c,
+        Tensor<int32_t, 1> p_a,
+        Tensor<int32_t, 1> p_b,
+        Tensor<int32_t, 1> p_o
+    );
+
+    namespace cuda {
+        /// CUDA version of mops::sparse_accumulation_of_products
+        template<typename scalar_t>
+        void MOPS_EXPORT sparse_accumulation_of_products(
+            Tensor<scalar_t, 2> output,
+            Tensor<scalar_t, 2> tensor_a,
+            Tensor<scalar_t, 2> tensor_b,
+            Tensor<scalar_t, 1> tensor_c,
+            Tensor<int32_t, 1> p_a,
+            Tensor<int32_t, 1> p_b,
+            Tensor<int32_t, 1> p_o
+        );
+    }
+}
+
+
+#endif
diff --git a/mops/include/mops/tensor.h b/mops/include/mops/tensor.h
index c42898c..6e48a9a 100644
--- a/mops/include/mops/tensor.h
+++ b/mops/include/mops/tensor.h
@@ -18,6 +18,16 @@ struct mops_tensor_2d_f64_t {
     int64_t shape[2];
 };
 
+struct mops_tensor_1d_f32_t {
+    float* __restrict__ data;
+    int64_t shape[1];
+};
+
+struct mops_tensor_1d_f64_t {
+    double* __restrict__ data;
+    int64_t shape[1];
+};
+
 struct mops_tensor_1d_i32_t {
     int32_t* __restrict__ data;
     int64_t shape[1];
diff --git a/mops/src/sap/capi.cpp b/mops/src/sap/capi.cpp
new file mode 100644
index 0000000..646c723
--- /dev/null
+++ b/mops/src/sap/capi.cpp
@@ -0,0 +1,105 @@
+#include "mops/capi.hpp"
+
+#include "mops/sap.hpp"
+#include "mops/sap.h"
+
+static size_t checked_cast(int64_t value) {
+    if (value < 0 || static_cast<uint64_t>(value) > static_cast<uint64_t>(SIZE_MAX)) {
+        throw std::runtime_error(
+            "integer value '" + std::to_string(value) + "' does not fit in size_t"
+        );
+    }
+    return static_cast<size_t>(value);
+}
+
+
+extern "C" int mops_sparse_accumulation_of_products_f32(
+    mops_tensor_2d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_b,
+    mops_tensor_1d_f32_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+) {
+    MOPS_CATCH_EXCEPTIONS(
+        mops::sparse_accumulation_of_products<float>(
+            {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}},
+            {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}},
+            {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}},
+            {tensor_c.data, {checked_cast(tensor_c.shape[0])}},
+            {p_a.data, {checked_cast(p_a.shape[0])}},
+            {p_b.data, {checked_cast(p_b.shape[0])}},
+            {p_o.data, {checked_cast(p_o.shape[0])}}
+        );
+    );
+}
+
+
+extern "C" int mops_sparse_accumulation_of_products_f64(
+    mops_tensor_2d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_b,
+    mops_tensor_1d_f64_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+) {
+    MOPS_CATCH_EXCEPTIONS(
+        mops::sparse_accumulation_of_products<double>(
+            {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}},
+            {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}},
+            {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}},
+            {tensor_c.data, {checked_cast(tensor_c.shape[0])}},
+            {p_a.data, {checked_cast(p_a.shape[0])}},
+            {p_b.data, {checked_cast(p_b.shape[0])}},
+            {p_o.data, {checked_cast(p_o.shape[0])}}
+        );
+    );
+}
+
+
+extern "C" int mops_cuda_sparse_accumulation_of_products_f32(
+    mops_tensor_2d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_b,
+    mops_tensor_1d_f32_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+) {
+    MOPS_CATCH_EXCEPTIONS(
+        mops::cuda::sparse_accumulation_of_products<float>(
+            {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}},
+            {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}},
+            {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}},
+            {tensor_c.data, {checked_cast(tensor_c.shape[0])}},
+            {p_a.data, {checked_cast(p_a.shape[0])}},
+            {p_b.data, {checked_cast(p_b.shape[0])}},
+            {p_o.data, {checked_cast(p_o.shape[0])}}
+        );
+    );
+}
+
+
+extern "C" int mops_cuda_sparse_accumulation_of_products_f64(
+    mops_tensor_2d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_b,
+    mops_tensor_1d_f64_t tensor_c,
+    mops_tensor_1d_i32_t p_a,
+    mops_tensor_1d_i32_t p_b,
+    mops_tensor_1d_i32_t p_o
+) {
+    MOPS_CATCH_EXCEPTIONS(
+        mops::cuda::sparse_accumulation_of_products<double>(
+            {output.data, {checked_cast(output.shape[0]), checked_cast(output.shape[1])}},
+            {tensor_a.data, {checked_cast(tensor_a.shape[0]), checked_cast(tensor_a.shape[1])}},
+            {tensor_b.data, {checked_cast(tensor_b.shape[0]), checked_cast(tensor_b.shape[1])}},
+            {tensor_c.data, {checked_cast(tensor_c.shape[0])}},
+            {p_a.data, {checked_cast(p_a.shape[0])}},
+            {p_b.data, {checked_cast(p_b.shape[0])}},
+            {p_o.data, {checked_cast(p_o.shape[0])}}
+        );
+    );
+}
diff --git a/mops/src/sap/cpu.tpp b/mops/src/sap/cpu.tpp
new file mode 100644
index 0000000..5beb687
--- /dev/null
+++ b/mops/src/sap/cpu.tpp
@@ -0,0 +1,75 @@
+#include <cassert>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+
+#include "mops/sap.hpp"
+
+template<typename scalar_t>
+void mops::sparse_accumulation_of_products(
+    Tensor<scalar_t, 2> output,
+    Tensor<scalar_t, 2> tensor_a,
+    Tensor<scalar_t, 2> tensor_b,
+    Tensor<scalar_t, 1> tensor_c,
+    Tensor<int32_t, 1> p_a,
+    Tensor<int32_t, 1> p_b,
+    Tensor<int32_t, 1> p_o
+) {
+    if (tensor_a.shape[0] != tensor_b.shape[0]) {
+        throw std::runtime_error(
+            "A and B tensors must have the same number of elements along the "
+            "first dimension, got " + std::to_string(tensor_a.shape[0]) + " and " +
+            std::to_string(tensor_b.shape[0])
+        );
+    }
+
+    if (tensor_a.shape[0] != output.shape[0]) {
+        throw std::runtime_error(
+            "O must contain the same number of elements as the first "
+            "dimension of A and B , got " + std::to_string(output.shape[0]) +
+            " and " + std::to_string(tensor_a.shape[0])
+        );
+    }
+
+    if (tensor_c.shape[0] != p_b.shape[0]) {
+        throw std::runtime_error(
+            "the dimension of C must match that of P_B, got "
+            + std::to_string(tensor_c.shape[0]) +
+            " for C and " + std::to_string(p_b.shape[0]) + " for P_B"
+        );
+    }
+
+    if (tensor_c.shape[0] != p_o.shape[0]) {
+        throw std::runtime_error(
+            "the dimension of C must match that of P_O, got "
+            + std::to_string(tensor_c.shape[0]) +
+            " for C and " + std::to_string(p_o.shape[0]) + " for P_O"
+        );
+    }
+
+    // TODO: check sorting (not necessary here, necessary in CUDA implementation)?
+        
+    scalar_t* a_ptr = tensor_a.data;
+    scalar_t* b_ptr = tensor_b.data;
+    scalar_t* o_ptr = output.data;
+    scalar_t* c_ptr = tensor_c.data;
+    int32_t* p_a_ptr = p_a.data;
+    int32_t* p_b_ptr = p_b.data;
+    int32_t* p_o_ptr = p_o.data;
+
+    long size_first_dimension = tensor_a.shape[0];
+    long size_second_dimension_a = tensor_a.shape[1];
+    long size_second_dimension_b = tensor_b.shape[1];
+    long size_second_dimension_o = output.shape[1];
+    long c_size = tensor_c.shape[0];
+    
+    for (int i = 0; i < size_first_dimension; i++) {
+        long shift_first_dimension_a = i * size_second_dimension_a;
+        long shift_first_dimension_b = i * size_second_dimension_b;
+        long shift_first_dimension_o = i * size_second_dimension_o;
+        for (int j = 0; j < c_size; j++) { 
+            o_ptr[shift_first_dimension_o + p_o_ptr[j]] +=
+            c_ptr[j] * a_ptr[shift_first_dimension_a + p_a_ptr[j]] * b_ptr[shift_first_dimension_b + p_b_ptr[j]];                                             
+        }
+    }
+}
diff --git a/mops/src/sap/cuda.tpp b/mops/src/sap/cuda.tpp
new file mode 100644
index 0000000..fa721f0
--- /dev/null
+++ b/mops/src/sap/cuda.tpp
@@ -0,0 +1,16 @@
+#include <stdexcept>
+
+#include "mops/sap.hpp"
+
+template<typename scalar_t>
+void mops::cuda::sparse_accumulation_of_products(
+    Tensor<scalar_t, 2> output,
+    Tensor<scalar_t, 2> tensor_a,
+    Tensor<scalar_t, 2> tensor_b,
+    Tensor<scalar_t, 1> tensor_c,
+    Tensor<int32_t, 1> p_a,
+    Tensor<int32_t, 1> p_b,
+    Tensor<int32_t, 1> p_o
+) {
+    throw std::runtime_error("CUDA implementation does not exist yet");
+}
diff --git a/mops/src/sap/sap.cpp b/mops/src/sap/sap.cpp
new file mode 100644
index 0000000..be7324d
--- /dev/null
+++ b/mops/src/sap/sap.cpp
@@ -0,0 +1,62 @@
+#include "cpu.tpp"
+
+// explicit instanciations of templates
+template void mops::sparse_accumulation_of_products<float>(
+    Tensor<float, 2> output,
+    Tensor<float, 2> tensor_a,
+    Tensor<float, 2> tensor_b,
+    Tensor<float, 1> tensor_c,
+    Tensor<int32_t, 1> p_a,
+    Tensor<int32_t, 1> p_b,
+    Tensor<int32_t, 1> p_o
+);
+
+template void mops::sparse_accumulation_of_products<double>(
+    Tensor<double, 2> output,
+    Tensor<double, 2> tensor_a,
+    Tensor<double, 2> tensor_b,
+    Tensor<double, 1> tensor_c,
+    Tensor<int32_t, 1> p_a,
+    Tensor<int32_t, 1> p_b,
+    Tensor<int32_t, 1> p_o
+);
+
+
+#ifdef MOPS_CUDA_ENABLED
+    #include "cuda.tpp"
+#else
+template<typename scalar_t>
+void mops::cuda::sparse_accumulation_of_products(
+    Tensor<scalar_t, 2> output,
+    Tensor<scalar_t, 2> tensor_a,
+    Tensor<scalar_t, 2> tensor_b,
+    Tensor<scalar_t, 1> tensor_c,
+    Tensor<int32_t, 1> p_a,
+    Tensor<int32_t, 1> p_b,
+    Tensor<int32_t, 1> p_o
+) {
+    throw std::runtime_error("MOPS was not compiled with CUDA support");
+}
+
+#endif
+
+// explicit instanciations of CUDA templates
+template void mops::cuda::sparse_accumulation_of_products<float>(
+    Tensor<float, 2> output,
+    Tensor<float, 2> tensor_a,
+    Tensor<float, 2> tensor_b,
+    Tensor<float, 1> tensor_c,
+    Tensor<int32_t, 1> p_a,
+    Tensor<int32_t, 1> p_b,
+    Tensor<int32_t, 1> p_o
+);
+
+template void mops::cuda::sparse_accumulation_of_products<double>(
+    Tensor<double, 2> output,
+    Tensor<double, 2> tensor_a,
+    Tensor<double, 2> tensor_b,
+    Tensor<double, 1> tensor_c,
+    Tensor<int32_t, 1> p_a,
+    Tensor<int32_t, 1> p_b,
+    Tensor<int32_t, 1> p_o
+);
diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu
new file mode 100644
index 0000000..8421bac
--- /dev/null
+++ b/mops/src/sap/sap.cu
@@ -0,0 +1 @@
+// todo: cuda device code
diff --git a/python/mops/src/mops/__init__.py b/python/mops/src/mops/__init__.py
index cec030e..e48f08a 100644
--- a/python/mops/src/mops/__init__.py
+++ b/python/mops/src/mops/__init__.py
@@ -1 +1,2 @@
 from .opsa import outer_product_scatter_add  # noqa
+from .sap import sparse_accumulation_of_products  # noqa
diff --git a/python/mops/src/mops/_c_api.py b/python/mops/src/mops/_c_api.py
index 115c57a..c791ca3 100644
--- a/python/mops/src/mops/_c_api.py
+++ b/python/mops/src/mops/_c_api.py
@@ -15,6 +15,20 @@ class mops_tensor_2d_f64_t(ctypes.Structure):
     ]
 
 
+class mops_tensor_1d_f32_t(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.POINTER(ctypes.c_float)),
+        ("shape", ctypes.ARRAY(ctypes.c_int64, 1)),
+    ]
+
+
+class mops_tensor_1d_f64_t(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.POINTER(ctypes.c_double)),
+        ("shape", ctypes.ARRAY(ctypes.c_int64, 1)),
+    ]
+
+
 class mops_tensor_1d_i32_t(ctypes.Structure):
     _fields_ = [
         ("data", ctypes.POINTER(ctypes.c_int32)),
@@ -60,3 +74,48 @@ def setup_functions(lib):
         mops_tensor_1d_i32_t,
     ]
     lib.mops_cuda_outer_product_scatter_add_f64.restype = _check_status
+
+    # sparse_accumulation_of_products
+    lib.mops_sparse_accumulation_of_products_f32.argtypes = [
+        mops_tensor_2d_f32_t,
+        mops_tensor_2d_f32_t,
+        mops_tensor_2d_f32_t,
+        mops_tensor_1d_f32_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+    ]
+    lib.mops_sparse_accumulation_of_products_f32.restype = _check_status
+
+    lib.mops_sparse_accumulation_of_products_f64.argtypes = [
+        mops_tensor_2d_f64_t,
+        mops_tensor_2d_f64_t,
+        mops_tensor_2d_f64_t,
+        mops_tensor_1d_f64_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+    ]
+    lib.mops_sparse_accumulation_of_products_f64.restype = _check_status
+
+    lib.mops_cuda_sparse_accumulation_of_products_f32.argtypes = [
+        mops_tensor_2d_f32_t,
+        mops_tensor_2d_f32_t,
+        mops_tensor_2d_f32_t,
+        mops_tensor_1d_f32_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+    ]
+    lib.mops_cuda_sparse_accumulation_of_products_f32.restype = _check_status
+
+    lib.mops_cuda_sparse_accumulation_of_products_f64.argtypes = [
+        mops_tensor_2d_f64_t,
+        mops_tensor_2d_f64_t,
+        mops_tensor_2d_f64_t,
+        mops_tensor_1d_f64_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+        mops_tensor_1d_i32_t,
+    ]
+    lib.mops_cuda_sparse_accumulation_of_products_f64.restype = _check_status
diff --git a/python/mops/src/mops/checks.py b/python/mops/src/mops/checks.py
index 6a5ad57..e9da7fa 100644
--- a/python/mops/src/mops/checks.py
+++ b/python/mops/src/mops/checks.py
@@ -35,22 +35,22 @@ def check_hpe(C, A, P):
     check_array_dtype(P, np.integer, "hpe", "P")
 
 
-def check_sap(C, A, B, P_A, P_B, P_O, n_O):
+def check_sap(A, B, C, P_A, P_B, P_O, n_O):
 
     # Check dimensions
-    check_number_of_dimensions(C, 1, "sap", "C")
     check_number_of_dimensions(A, 2, "sap", "A")
     check_number_of_dimensions(B, 2, "sap", "B")
+    check_number_of_dimensions(C, 1, "sap", "C")
     check_number_of_dimensions(P_A, 1, "sap", "P_A")
-    check_number_of_dimensions(P_B, 2, "sap", "P_B")
-    check_number_of_dimensions(P_O, 2, "sap", "P_O")
+    check_number_of_dimensions(P_B, 1, "sap", "P_B")
+    check_number_of_dimensions(P_O, 1, "sap", "P_O")
     check_scalar(n_O, "sap", "n_O")
     # TODO: additional dimension checks (some dimensions sizes of the inputs must match)
     
     # Check types
-    check_array_dtype(C, np.floating, "sap", "C")
     check_array_dtype(A, np.floating, "sap", "A")
     check_array_dtype(B, np.floating, "sap", "B")
+    check_array_dtype(C, np.floating, "sap", "C")
     check_array_dtype(P_A, np.integer, "sap", "P_A")
     check_array_dtype(P_B, np.integer, "sap", "P_B")
     check_array_dtype(P_O, np.integer, "sap", "P_O")
diff --git a/python/mops/src/mops/reference_implementations.py b/python/mops/src/mops/reference_implementations.py
index 05f77bd..5877be8 100644
--- a/python/mops/src/mops/reference_implementations.py
+++ b/python/mops/src/mops/reference_implementations.py
@@ -17,13 +17,15 @@ def homogeneous_polynomial_evaluation(C, A, P):
     return O
 
 
-def sparse_accumulation_of_products(C, A, B, P_A, P_B, P_O, n_O):
-    check_sap(C, A, B, P_A, P_B, P_O, n_O)
+def sparse_accumulation_of_products(A, B, C, P_A, P_B, P_O, n_O):
+    check_sap(A, B, C, P_A, P_B, P_O, n_O)
 
     O = np.zeros((A.shape[0], n_O), dtype=A.dtype)
     K = C.shape[0]
     for k in range(K):
-        O[:, P_O[k]] += C[k] * A[:, P_A[k]] * A[:, P_B[k]]
+        O[:, P_O[k]] += C[k] * A[:, P_A[k]] * B[:, P_B[k]]
+
+    return O
 
     return O
 
diff --git a/python/mops/src/mops/sap.py b/python/mops/src/mops/sap.py
new file mode 100644
index 0000000..338622c
--- /dev/null
+++ b/python/mops/src/mops/sap.py
@@ -0,0 +1,42 @@
+import numpy as np
+
+from ._c_lib import _get_library
+from .utils import numpy_to_mops_tensor
+from .checks import check_sap
+
+
+def sparse_accumulation_of_products(A, B, C, P_A, P_B, P_O, n_O):
+    check_sap(A, B, C, P_A, P_B, P_O, n_O)
+
+    A = np.ascontiguousarray(A)
+    B = np.ascontiguousarray(B)
+    C = np.ascontiguousarray(C)
+    P_A = np.ascontiguousarray(P_A)
+    P_B = np.ascontiguousarray(P_B)
+    P_O = np.ascontiguousarray(P_O)
+    P_A = P_A.astype(np.int32)
+    P_B = P_B.astype(np.int32)
+    P_O = P_O.astype(np.int32)
+
+    O = np.zeros((A.shape[0], n_O), dtype=A.dtype)
+
+    lib = _get_library()
+
+    if A.dtype == np.float32:
+        function = lib.mops_sparse_accumulation_of_products_f32
+    elif A.dtype == np.float64:
+        function = lib.mops_sparse_accumulation_of_products_f64
+    else:
+        raise TypeError("Unsupported dtype detected. Only float32 and float64 are supported")
+
+    function(
+        numpy_to_mops_tensor(O),
+        numpy_to_mops_tensor(A),
+        numpy_to_mops_tensor(B),
+        numpy_to_mops_tensor(C),
+        numpy_to_mops_tensor(P_A),
+        numpy_to_mops_tensor(P_B),
+        numpy_to_mops_tensor(P_O),
+    )
+
+    return O
diff --git a/python/mops/src/mops/utils.py b/python/mops/src/mops/utils.py
index f0cde91..840ee76 100644
--- a/python/mops/src/mops/utils.py
+++ b/python/mops/src/mops/utils.py
@@ -2,30 +2,46 @@
 
 import numpy as np
 
-from ._c_api import mops_tensor_1d_i32_t, mops_tensor_2d_f32_t, mops_tensor_2d_f64_t
+from ._c_api import (
+    mops_tensor_1d_i32_t,
+    mops_tensor_1d_f32_t, 
+    mops_tensor_1d_f64_t,
+    mops_tensor_2d_f32_t,
+    mops_tensor_2d_f64_t
+)
 
 
 def numpy_to_mops_tensor(array):
     assert isinstance(array, np.ndarray)
 
     if array.dtype == np.float32:
-        if len(array.shape) == 2:
+        if len(array.shape) == 1:
+            tensor = mops_tensor_1d_f32_t()
+            tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+            tensor.shape[0] = array.shape[0]
+            return tensor
+        elif len(array.shape) == 2:
             tensor = mops_tensor_2d_f32_t()
             tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
             tensor.shape[0] = array.shape[0]
             tensor.shape[1] = array.shape[1]
             return tensor
         else:
-            raise TypeError("we can only convert 2D arrays of float32")
+            raise TypeError("we can only convert 1D and 2D arrays of float32")
     elif array.dtype == np.float64:
-        if len(array.shape) == 2:
+        if len(array.shape) == 1:
+            tensor = mops_tensor_1d_f64_t()
+            tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
+            tensor.shape[0] = array.shape[0]
+            return tensor
+        elif len(array.shape) == 2:
             tensor = mops_tensor_2d_f64_t()
             tensor.data = array.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
             tensor.shape[0] = array.shape[0]
             tensor.shape[1] = array.shape[1]
             return tensor
         else:
-            raise TypeError("we can only convert 2D arrays of float64")
+            raise TypeError("we can only convert 1D and 2D arrays of float64")
     elif array.dtype == np.int32:
         if len(array.shape) == 1:
             tensor = mops_tensor_1d_i32_t()
diff --git a/python/mops/tests/sap.py b/python/mops/tests/sap.py
new file mode 100644
index 0000000..3e75a66
--- /dev/null
+++ b/python/mops/tests/sap.py
@@ -0,0 +1,29 @@
+import numpy as np
+np.random.seed(0xDEADBEEF)
+
+import pytest
+
+from mops.reference_implementations import sparse_accumulation_of_products as ref_sap
+from mops import sparse_accumulation_of_products as sap
+
+
+def test_sap():
+
+    A = np.random.rand(100, 20)
+    B = np.random.rand(100, 6)
+    C = np.random.rand(30)
+
+    P_A = np.random.randint(20, size=(30,))
+    P_B = np.random.randint(6, size=(30,))
+    n_O = 35
+    P_O = np.random.randint(n_O, size=(30,))
+
+    reference = ref_sap(A, B, C, P_A, P_B, P_O, n_O)
+    actual = sap(A, B, C, P_A, P_B, P_O, n_O)
+    assert np.allclose(reference, actual)
+
+
+def test_sap_wrong_type():
+
+    with pytest.raises(ValueError):
+        sap(np.array(1), 2, 3, 4, 5, 6, 7)