Merge branch 'master' into port-polyeval-cpu
frostedoyster authored Nov 22, 2023
2 parents ca7d0a5 + 65dd323 commit ef39249
Showing 43 changed files with 1,489 additions and 519 deletions.
18 changes: 7 additions & 11 deletions .github/workflows/tests.yml
@@ -28,16 +28,10 @@ jobs:
python -m pip install --upgrade pip
python -m pip install tox
- name: run C++ build and tests
run: |
mkdir build
cd build
cmake -DMOPS_TESTS=ON ../mops/
cmake --build .
ctest
- name: run Python tests
run: python -m tox
- name: run tests
run: tox
env:
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu

# check that we can build Python wheels on any Python version
python-build:
@@ -57,7 +51,9 @@ jobs:
- name: install python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install tox wheel
python -m pip install tox
- name: python build tests
run: tox -e build-python
env:
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
14 changes: 7 additions & 7 deletions README.md
@@ -30,7 +30,7 @@ $$ O_i = \sum_{j=1}^J C_j \prod_{k=1}^K A_{iP_{jk}} $$

#### Inputs

- $A$ is a 2D tensor of floats, of size $I \times N_{A,2}$. It contains the factors in the monomials that make up the polynomial.

- $C$ is a vector of float multipliers of size $J$. They represent the coefficients of each monomial in the polynomial, so that $J$ is the number of monomials in the polynomial.
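
For concreteness, here is a minimal NumPy sketch of the operation defined by the formula at the top of this hunk. This is an editorial addition, not part of the README diff: the function name is illustrative, and it assumes $P$ is the $J \times K$ integer index tensor implied by $A_{iP_{jk}}$, with 0-based indices.

```python
import numpy as np

def polynomial_evaluation_ref(A, C, P):
    # O[i] = sum_j C[j] * prod_k A[i, P[j, k]]   (0-based indices assumed)
    O = np.zeros(A.shape[0])
    for j in range(C.shape[0]):
        O += C[j] * np.prod(A[:, P[j]], axis=1)
    return O
```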

@@ -78,7 +78,7 @@ $$ O_{ikl} = \sum_{j=1}^J A_{jk} B_{jl} \delta_{iP_j} \hspace{1cm} \mathrm{or} \

- $P$ is a large vector of integers (of size $J$) which maps the dimension $j$ of $A$ and $B$ into the dimension $i$ of $O$. In other words, it contains the position within $O$ where each $AB$ product needs to be summed.

- $n_O$ is the size of the output array along its first dimension. It must be greater than or equal to the largest element in $P$ plus one.
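
As an editorial aid, a minimal NumPy sketch consistent with the formula above; the function name, 0-based indexing, and shapes are assumptions rather than the library API:

```python
import numpy as np

def outer_product_scatter_add_ref(A, B, P, n_O):
    # O[P[j], :, :] accumulates the outer product of A[j, :] and B[j, :]
    J, K = A.shape
    L = B.shape[1]
    O = np.zeros((n_O, K, L))
    for j in range(J):
        O[P[j]] += np.outer(A[j], B[j])
    return O
```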

#### Output

@@ -97,13 +97,13 @@ for j in range(J):

#### Math notation

-$$ O_{imk} = \sum_{e \in \{e'|I_{e'}=i\}} A_{em} R_{ek} X_{{J_e}k} $$
+$$ O_{ikl} = \sum_{j \in \{j'|P_{j'}=i\}} A_{jk} B_{jl} W_{{PW_j}l} $$

#### Calculation

```python
-for e in range(E):
-    O[I[e], :, :] += A[e, :, None] * R[e, None, :] * X[J[e], None, :]
+for j in range(J):
+    O[PO[j], :, :] += A[j, :, None] * B[j, None, :] * W[PW[j], None, :]
```
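
The snippet above omits the setup; a self-contained version with illustrative (assumed) shapes could look like the following editorial sketch:

```python
import numpy as np

J, K, L, n_O, n_W = 10, 4, 5, 3, 6   # illustrative sizes, not from the README
A = np.random.rand(J, K)
B = np.random.rand(J, L)
W = np.random.rand(n_W, L)
PO = np.random.randint(n_O, size=J)  # output row for each j
PW = np.random.randint(n_W, size=J)  # weight row for each j

O = np.zeros((n_O, K, L))
for j in range(J):
    O[PO[j], :, :] += A[j, :, None] * B[j, None, :] * W[PW[j], None, :]
```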

### 5. Sparse Accumulation Scatter-Add with Weights
@@ -115,7 +115,7 @@ $$ O_{i{m_3}k} = \sum_{e \in \{e'|I_{e'}=i\}} R_{ek} \sum_{n \in \{n'|M^3_{n'}=m
#### Calculation

```python
-for e in range(E):
+for j in range(J):
     for n in range(N):
-        O[I[e], M_3[n], :] += R[e, :] * C[n] * A[e, M_1[n]] * X[J[e], M_2[n], :]
+        O[PO1[e], PO2[n], :] += A[e, PA[n]] * B[e, :] * C[n] * W[PW1[e], PW2[n], :]
```
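
Again as a self-contained editorial sketch with assumed shapes; since the loop body added in this diff indexes with `e`, the sketch keeps `e` as the outer loop variable:

```python
import numpy as np

# Illustrative sizes only, not from the README
E, N, K = 8, 12, 5
n_A, n_O1, n_O2, n_W1, n_W2 = 4, 3, 6, 7, 9
A = np.random.rand(E, n_A)
B = np.random.rand(E, K)
C = np.random.rand(N)
W = np.random.rand(n_W1, n_W2, K)
PA = np.random.randint(n_A, size=N)
PO1 = np.random.randint(n_O1, size=E)
PO2 = np.random.randint(n_O2, size=N)
PW1 = np.random.randint(n_W1, size=E)
PW2 = np.random.randint(n_W2, size=N)

O = np.zeros((n_O1, n_O2, K))
for e in range(E):
    for n in range(N):
        O[PO1[e], PO2[n], :] += A[e, PA[n]] * B[e, :] * C[n] * W[PW1[e], PW2[n], :]
```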
117 changes: 117 additions & 0 deletions mops-torch/CMakeLists.txt
@@ -0,0 +1,117 @@
cmake_minimum_required(VERSION 3.16)

if (POLICY CMP0077)
    # use variables to set OPTIONS
    cmake_policy(SET CMP0077 NEW)
endif()


file(READ ${CMAKE_CURRENT_SOURCE_DIR}/VERSION MOPS_TORCH_VERSION)
string(STRIP ${MOPS_TORCH_VERSION} MOPS_TORCH_VERSION)
string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" MOPS_TORCH_VERSION_MAJOR "${MOPS_TORCH_VERSION}")
string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" MOPS_TORCH_VERSION_MINOR "${MOPS_TORCH_VERSION}")
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" MOPS_TORCH_VERSION_PATCH "${MOPS_TORCH_VERSION}")

project(mops VERSION ${MOPS_TORCH_VERSION} LANGUAGES CXX)

include(CheckLanguage)
check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
    enable_language(CUDA)
    set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE)
else()
    message(STATUS "Could not find a CUDA compiler")
endif()

set(LIB_INSTALL_DIR "lib" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install libraries")
set(BIN_INSTALL_DIR "bin" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install DLL/binaries")
set(INCLUDE_INSTALL_DIR "include" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install headers")

if (${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
    set(MOPS_TORCH_MAIN_PROJECT ON)
else()
    set(MOPS_TORCH_MAIN_PROJECT OFF)
endif()

# Set a default build type if none was specified
if (${MOPS_TORCH_MAIN_PROJECT})
    if("${CMAKE_BUILD_TYPE}" STREQUAL "" AND "${CMAKE_CONFIGURATION_TYPES}" STREQUAL "")
        message(STATUS "Setting build type to 'relwithdebinfo' as none was specified.")
        set(
            CMAKE_BUILD_TYPE "relwithdebinfo"
            CACHE STRING
            "Choose the type of build, options are: none(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) debug release relwithdebinfo minsizerel."
            FORCE
        )
        set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS release debug relwithdebinfo minsizerel none)
    endif()
endif()

find_package(Torch 1.11 REQUIRED)

set(BUILD_SHARED_LIBS OFF)
add_subdirectory(mops EXCLUDE_FROM_ALL)
set_target_properties(mops PROPERTIES POSITION_INDEPENDENT_CODE ON)

add_library(mops_torch SHARED
    "src/register.cpp"
    "src/opsa.cpp"

    "include/mops/torch.hpp"
    "include/mops/torch/opsa.hpp"
)

if(CMAKE_CUDA_COMPILER)
    target_compile_definitions(mops_torch PUBLIC MOPS_CUDA_ENABLED)
endif()

target_compile_features(mops_torch PUBLIC cxx_std_17)
target_link_libraries(mops_torch PRIVATE mops)
target_link_libraries(mops_torch PUBLIC torch)

# Create a header defining MOPS_TORCH_EXPORT for exported classes/functions
set_target_properties(mops PROPERTIES
    # hide non-exported symbols by default
    CXX_VISIBILITY_PRESET hidden
)

target_include_directories(mops_torch PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
    $<INSTALL_INTERFACE:include>
)

#------------------------------------------------------------------------------#
# Installation configuration
#------------------------------------------------------------------------------#
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
    mops_torch-config-version.cmake
    VERSION ${MOPS_TORCH_VERSION}
    COMPATIBILITY SameMinorVersion
)

install(TARGETS mops_torch
    EXPORT mops_torch-targets
    LIBRARY DESTINATION ${LIB_INSTALL_DIR}
    ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
    RUNTIME DESTINATION ${BIN_INSTALL_DIR}
)

install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION ${INCLUDE_INSTALL_DIR})

# Install files to find mops in CMake projects
configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/mops_torch-config.in.cmake
    ${CMAKE_CURRENT_BINARY_DIR}/mops_torch-config.cmake
    @ONLY
)
install(EXPORT mops_torch-targets
    DESTINATION ${LIB_INSTALL_DIR}/cmake/mops_torch
)
install(FILES
    ${PROJECT_BINARY_DIR}/mops_torch-config-version.cmake
    ${PROJECT_BINARY_DIR}/mops_torch-config.cmake
    DESTINATION ${LIB_INSTALL_DIR}/cmake/mops_torch
)
1 change: 1 addition & 0 deletions mops-torch/VERSION
@@ -0,0 +1 @@
0.1.0
1 change: 1 addition & 0 deletions mops-torch/cmake/mops_torch-config.in.cmake
@@ -0,0 +1 @@
include(${CMAKE_CURRENT_LIST_DIR}/mops_torch-targets.cmake)
6 changes: 6 additions & 0 deletions mops-torch/include/mops/torch.hpp
@@ -0,0 +1,6 @@
#ifndef MOPS_TORCH_H
#define MOPS_TORCH_H

#include "torch/opsa.hpp" // IWYU pragma: export

#endif
36 changes: 36 additions & 0 deletions mops-torch/include/mops/torch/opsa.hpp
@@ -0,0 +1,36 @@
#ifndef MOPS_TORCH_OPSA_H
#define MOPS_TORCH_OPSA_H

#include <torch/script.h>

#include <mops.hpp>

namespace mops_torch {

/// TODO
torch::Tensor outer_product_scatter_add(
    torch::Tensor A,
    torch::Tensor B,
    torch::Tensor indices_output,
    int64_t output_size
);

class OuterProductScatterAdd: public torch::autograd::Function<mops_torch::OuterProductScatterAdd> {
public:
    static std::vector<torch::Tensor> forward(
        torch::autograd::AutogradContext *ctx,
        torch::Tensor A,
        torch::Tensor B,
        torch::Tensor indices_output,
        int64_t output_size
    );

    static std::vector<torch::Tensor> backward(
        torch::autograd::AutogradContext *ctx,
        std::vector<torch::Tensor> grad_outputs
    );
};

}

#endif
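
For orientation, the class declared above plays the same role as a `torch.autograd.Function` subclass on the Python side. A rough pure-PyTorch analogue of the same forward/backward pair (an editorial sketch, not code from this commit) would be:

```python
import torch

class OuterProductScatterAddRef(torch.autograd.Function):
    """Pure-PyTorch analogue of the C++ autograd::Function declared above."""

    @staticmethod
    def forward(ctx, A, B, indices_output, output_size):
        # Scatter-add each row-wise outer product A[j] x B[j] into output[indices_output[j]]
        output = torch.zeros(output_size, A.shape[1], B.shape[1],
                             dtype=A.dtype, device=A.device)
        output.index_add_(0, indices_output.to(torch.int64),
                          A[:, :, None] * B[:, None, :])
        ctx.save_for_backward(A, B, indices_output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        A, B, indices_output = ctx.saved_tensors
        grad_slices = grad_output[indices_output.to(torch.int64)]  # shape (J, K, L)
        grad_A = torch.einsum("jkl,jl->jk", grad_slices, B)
        grad_B = torch.einsum("jkl,jk->jl", grad_slices, A)
        return grad_A, grad_B, None, None
```

The backward contracts the relevant slices of `grad_output` with `B` and `A` respectively, which is the computation the C++ backward below delegates to `mops::outer_product_scatter_add_vjp`.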
1 change: 1 addition & 0 deletions mops-torch/mops
142 changes: 142 additions & 0 deletions mops-torch/src/opsa.cpp
@@ -0,0 +1,142 @@
#include "mops/torch/opsa.hpp"

using namespace mops_torch;


torch::Tensor mops_torch::outer_product_scatter_add(
    torch::Tensor A,
    torch::Tensor B,
    torch::Tensor indices_output,
    int64_t output_size
) {
    return OuterProductScatterAdd::apply(A, B, indices_output, output_size)[0];
}

template <typename scalar_t>
static mops::Tensor<scalar_t, 1> torch_to_mops_1d(torch::Tensor tensor) {
    assert(tensor.sizes().size() == 1);
    return {
        tensor.data_ptr<scalar_t>(),
        {static_cast<size_t>(tensor.size(0))},
    };
}

template <typename scalar_t>
static mops::Tensor<scalar_t, 2> torch_to_mops_2d(torch::Tensor tensor) {
    assert(tensor.sizes().size() == 2);
    return {
        tensor.data_ptr<scalar_t>(),
        {static_cast<size_t>(tensor.size(0)), static_cast<size_t>(tensor.size(1))},
    };
}

std::vector<torch::Tensor> OuterProductScatterAdd::forward(
    torch::autograd::AutogradContext *ctx,
    torch::Tensor A,
    torch::Tensor B,
    torch::Tensor indices_output,
    int64_t output_size
) {
    if (A.sizes().size() != 2 || B.sizes().size() != 2) {
        C10_THROW_ERROR(ValueError, "`A` and `B` must be 2-D tensor");
    }

    if (indices_output.sizes().size() != 1) {
        C10_THROW_ERROR(ValueError, "`indices_output` must be a 1-D tensor");
    }

    if (indices_output.scalar_type() != torch::kInt32) {
        C10_THROW_ERROR(ValueError, "`indices_output` must be a tensor of 32-bit integers");
    }

    if (A.device() != B.device() || A.device() != indices_output.device()) {
        C10_THROW_ERROR(ValueError,
            "all tensors must be on the same device, got " + A.device().str() +
            ", " + B.device().str() + ", and " + indices_output.device().str()
        );
    }

    if (A.scalar_type() != B.scalar_type()) {
        C10_THROW_ERROR(ValueError,
            std::string("`A` and `B` must have the same dtype, got ") +
            torch::toString(A.scalar_type()) + " and " + torch::toString(B.scalar_type())
        );
    }

    torch::Tensor output;
    if (A.device().is_cpu()) {
        output = torch::zeros({output_size, A.size(1), B.size(1)},
            torch::TensorOptions().dtype(A.scalar_type()).device(A.device())
        );

        assert(output.is_contiguous());

        AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add", [&](){
            mops::outer_product_scatter_add<scalar_t>(
                torch_to_mops_2d<scalar_t>(output.reshape({-1, output.size(1) * output.size(2)})),
                torch_to_mops_2d<scalar_t>(A),
                torch_to_mops_2d<scalar_t>(B),
                torch_to_mops_1d<int32_t>(indices_output)
            );
        });
    } else {
        C10_THROW_ERROR(ValueError,
            "outer_product_scatter_add is not implemented for device " + A.device().str()
        );
    }

    if (A.requires_grad() || B.requires_grad()) {
        ctx->save_for_backward({A, B, indices_output});
    }

    return {output};
}

std::vector<torch::Tensor> OuterProductScatterAdd::backward(
    torch::autograd::AutogradContext *ctx,
    std::vector<torch::Tensor> grad_outputs
) {
    auto saved_variables = ctx->get_saved_variables();
    auto A = saved_variables[0];
    auto B = saved_variables[1];
    auto indices_output = saved_variables[2];

    auto grad_output = grad_outputs[0];
    if (!grad_output.is_contiguous()) {
        throw std::runtime_error("expected contiguous grad_output");
    }

    auto grad_A = torch::Tensor();
    auto grad_B = torch::Tensor();

    if (A.device().is_cpu()) {
        AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&](){
            auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
            if (A.requires_grad()) {
                grad_A = torch::zeros_like(A);
                mops_grad_A = torch_to_mops_2d<scalar_t>(grad_A);
            }

            auto mops_grad_B = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
            if (B.requires_grad()) {
                grad_B = torch::zeros_like(B);
                mops_grad_B = torch_to_mops_2d<scalar_t>(grad_B);
            }

            mops::outer_product_scatter_add_vjp<scalar_t>(
                mops_grad_A,
                mops_grad_B,
                torch_to_mops_2d<scalar_t>(grad_output.reshape({-1, grad_output.size(1) * grad_output.size(2)})),
                torch_to_mops_2d<scalar_t>(A),
                torch_to_mops_2d<scalar_t>(B),
                torch_to_mops_1d<int32_t>(indices_output)
            );
        });
    } else {
        C10_THROW_ERROR(ValueError,
            "outer_product_scatter_add is not implemented for device " + A.device().str()
        );
    }

    return {grad_A, grad_B, torch::Tensor(), torch::Tensor()};
}
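
A quick way to exercise the forward and backward passes once the extension is built is sketched below. This is a hypothetical usage example: `register.cpp` is not shown in this diff, so the shared-library name and the `torch.ops.mops.outer_product_scatter_add` operator path are assumptions, not something this commit confirms.

```python
import torch

# Assumed library name and operator path (see note above).
torch.ops.load_library("libmops_torch.so")
opsa = torch.ops.mops.outer_product_scatter_add

J, K, L, output_size = 100, 8, 13, 20
A = torch.rand(J, K, dtype=torch.float64, requires_grad=True)
B = torch.rand(J, L, dtype=torch.float64, requires_grad=True)
indices = torch.randint(output_size, (J,), dtype=torch.int32)

# Pure-PyTorch reference: scatter-add each row-wise outer product into the output.
reference = torch.zeros(output_size, K, L, dtype=torch.float64)
reference.index_add_(0, indices.to(torch.int64), (A[:, :, None] * B[:, None, :]).detach())

output = opsa(A, B, indices, output_size)
assert torch.allclose(output, reference)

# The custom backward should agree with finite differences on both inputs.
torch.autograd.gradcheck(lambda a, b: opsa(a, b, indices, output_size), (A, B))
```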