Merge branch 'master' into port-polyeval-cpu
frostedoyster authored Nov 22, 2023
2 parents ca7d0a5 + 65dd323 commit ef39249
Showing 43 changed files with 1,489 additions and 519 deletions.
18 changes: 7 additions & 11 deletions .github/workflows/tests.yml
@@ -28,16 +28,10 @@ jobs:
python -m pip install --upgrade pip
python -m pip install tox
- name: run C++ build and tests
run: |
mkdir build
cd build
cmake -DMOPS_TESTS=ON ../mops/
cmake --build .
ctest
- name: run Python tests
run: python -m tox
- name: run tests
run: tox
env:
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu

# check that we can build Python wheels on any Python version
python-build:
@@ -57,7 +51,9 @@ jobs:
- name: install python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install tox wheel
python -m pip install tox
- name: python build tests
run: tox -e build-python
env:
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
14 changes: 7 additions & 7 deletions README.md
@@ -30,7 +30,7 @@ $$ O_i = \sum_{j=1}^J C_j \prod_{k=1}^K A_{iP_{jk}} $$

#### Inputs

- $A$ is a 2D tensor of floats, of size $I \times N_{A,2}$. It contains the factors in the monomials that make up the polynomial.

- $C$ is a vector of float multipliers of size $J$. They represent the coefficients of each monomial in the polynomial, so that $J$ is the number of monomials in the polynomial.
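
For concreteness, here is a minimal NumPy sketch of the operation defined by the formula at the top of this hunk. This is an editorial addition, not part of the README diff: the function name is illustrative, and it assumes $P$ is the $J \times K$ integer index tensor implied by $A_{iP_{jk}}$, with 0-based indices.

```python
import numpy as np

def polynomial_evaluation_ref(A, C, P):
    # O[i] = sum_j C[j] * prod_k A[i, P[j, k]]   (0-based indices assumed)
    O = np.zeros(A.shape[0])
    for j in range(C.shape[0]):
        O += C[j] * np.prod(A[:, P[j]], axis=1)
    return O
```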

@@ -78,7 +78,7 @@ $$ O_{ikl} = \sum_{j=1}^J A_{jk} B_{jl} \delta_{iP_j} \hspace{1cm} \mathrm{or} \

- $P$ is a large vector of integers (of size $J$) which maps the dimension $j$ of $A$ and $B$ into the dimension $i$ of $O$. In other words, it contains the position within $O$ where each $AB$ product needs to be summed.

- $n_O$ is the size of the output array along its first dimension. It must be greater than or equal to the largest element in $P$ plus one.
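
As an editorial aid, a minimal NumPy sketch consistent with the formula above; the function name, 0-based indexing, and shapes are assumptions rather than the library API:

```python
import numpy as np

def outer_product_scatter_add_ref(A, B, P, n_O):
    # O[P[j], :, :] accumulates the outer product of A[j, :] and B[j, :]
    J, K = A.shape
    L = B.shape[1]
    O = np.zeros((n_O, K, L))
    for j in range(J):
        O[P[j]] += np.outer(A[j], B[j])
    return O
```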

#### Output

@@ -97,13 +97,13 @@ for j in range(J):

#### Math notation

-$$ O_{imk} = \sum_{e \in \{e'|I_{e'}=i\}} A_{em} R_{ek} X_{{J_e}k} $$
+$$ O_{ikl} = \sum_{j \in \{j'|P_{j'}=i\}} A_{jk} B_{jl} W_{{PW_j}l} $$

#### Calculation

```python
-for e in range(E):
-    O[I[e], :, :] += A[e, :, None] * R[e, None, :] * X[J[e], None, :]
+for j in range(J):
+    O[PO[j], :, :] += A[j, :, None] * B[j, None, :] * W[PW[j], None, :]
```
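
The snippet above omits the setup; a self-contained version with illustrative (assumed) shapes could look like the following editorial sketch:

```python
import numpy as np

J, K, L, n_O, n_W = 10, 4, 5, 3, 6   # illustrative sizes, not from the README
A = np.random.rand(J, K)
B = np.random.rand(J, L)
W = np.random.rand(n_W, L)
PO = np.random.randint(n_O, size=J)  # output row for each j
PW = np.random.randint(n_W, size=J)  # weight row for each j

O = np.zeros((n_O, K, L))
for j in range(J):
    O[PO[j], :, :] += A[j, :, None] * B[j, None, :] * W[PW[j], None, :]
```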

### 5. Sparse Accumulation Scatter-Add with Weights
@@ -115,7 +115,7 @@ $$ O_{i{m_3}k} = \sum_{e \in \{e'|I_{e'}=i\}} R_{ek} \sum_{n \in \{n'|M^3_{n'}=m
#### Calculation

```python
-for e in range(E):
+for j in range(J):
     for n in range(N):
-        O[I[e], M_3[n], :] += R[e, :] * C[n] * A[e, M_1[n]] * X[J[e], M_2[n], :]
+        O[PO1[e], PO2[n], :] += A[e, PA[n]] * B[e, :] * C[n] * W[PW1[e], PW2[n], :]
```
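
Again as a self-contained editorial sketch with assumed shapes; since the loop body added in this diff indexes with `e`, the sketch keeps `e` as the outer loop variable:

```python
import numpy as np

# Illustrative sizes only, not from the README
E, N, K = 8, 12, 5
n_A, n_O1, n_O2, n_W1, n_W2 = 4, 3, 6, 7, 9
A = np.random.rand(E, n_A)
B = np.random.rand(E, K)
C = np.random.rand(N)
W = np.random.rand(n_W1, n_W2, K)
PA = np.random.randint(n_A, size=N)
PO1 = np.random.randint(n_O1, size=E)
PO2 = np.random.randint(n_O2, size=N)
PW1 = np.random.randint(n_W1, size=E)
PW2 = np.random.randint(n_W2, size=N)

O = np.zeros((n_O1, n_O2, K))
for e in range(E):
    for n in range(N):
        O[PO1[e], PO2[n], :] += A[e, PA[n]] * B[e, :] * C[n] * W[PW1[e], PW2[n], :]
```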
117 changes: 117 additions & 0 deletions mops-torch/CMakeLists.txt
@@ -0,0 +1,117 @@
cmake_minimum_required(VERSION 3.16)

if (POLICY CMP0077)
    # use variables to set OPTIONS
    cmake_policy(SET CMP0077 NEW)
endif()


file(READ ${CMAKE_CURRENT_SOURCE_DIR}/VERSION MOPS_TORCH_VERSION)
string(STRIP ${MOPS_TORCH_VERSION} MOPS_TORCH_VERSION)
string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" MOPS_TORCH_VERSION_MAJOR "${MOPS_TORCH_VERSION}")
string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" MOPS_TORCH_VERSION_MINOR "${MOPS_TORCH_VERSION}")
string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" MOPS_TORCH_VERSION_PATCH "${MOPS_TORCH_VERSION}")

project(mops VERSION ${MOPS_TORCH_VERSION} LANGUAGES CXX)

include(CheckLanguage)
check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
    enable_language(CUDA)
    set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE)
else()
    message(STATUS "Could not find a CUDA compiler")
endif()

set(LIB_INSTALL_DIR "lib" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install libraries")
set(BIN_INSTALL_DIR "bin" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install DLL/binaries")
set(INCLUDE_INSTALL_DIR "include" CACHE PATH "Path relative to CMAKE_INSTALL_PREFIX where to install headers")

if (${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
    set(MOPS_TORCH_MAIN_PROJECT ON)
else()
    set(MOPS_TORCH_MAIN_PROJECT OFF)
endif()

# Set a default build type if none was specified
if (${MOPS_TORCH_MAIN_PROJECT})
    if("${CMAKE_BUILD_TYPE}" STREQUAL "" AND "${CMAKE_CONFIGURATION_TYPES}" STREQUAL "")
        message(STATUS "Setting build type to 'relwithdebinfo' as none was specified.")
        set(
            CMAKE_BUILD_TYPE "relwithdebinfo"
            CACHE STRING
            "Choose the type of build, options are: none(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) debug release relwithdebinfo minsizerel."
            FORCE
        )
        set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS release debug relwithdebinfo minsizerel none)
    endif()
endif()

find_package(Torch 1.11 REQUIRED)

set(BUILD_SHARED_LIBS OFF)
add_subdirectory(mops EXCLUDE_FROM_ALL)
set_target_properties(mops PROPERTIES POSITION_INDEPENDENT_CODE ON)

add_library(mops_torch SHARED
    "src/register.cpp"
    "src/opsa.cpp"

    "include/mops/torch.hpp"
    "include/mops/torch/opsa.hpp"
)

if(CMAKE_CUDA_COMPILER)
    target_compile_definitions(mops_torch PUBLIC MOPS_CUDA_ENABLED)
endif()

target_compile_features(mops_torch PUBLIC cxx_std_17)
target_link_libraries(mops_torch PRIVATE mops)
target_link_libraries(mops_torch PUBLIC torch)

# Create a header defining MOPS_TORCH_EXPORT for exported classes/functions
set_target_properties(mops PROPERTIES
    # hide non-exported symbols by default
    CXX_VISIBILITY_PRESET hidden
)

target_include_directories(mops_torch PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
    $<INSTALL_INTERFACE:include>
)

#------------------------------------------------------------------------------#
# Installation configuration
#------------------------------------------------------------------------------#
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
    mops_torch-config-version.cmake
    VERSION ${MOPS_TORCH_VERSION}
    COMPATIBILITY SameMinorVersion
)

install(TARGETS mops_torch
    EXPORT mops_torch-targets
    LIBRARY DESTINATION ${LIB_INSTALL_DIR}
    ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
    RUNTIME DESTINATION ${BIN_INSTALL_DIR}
)

install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION ${INCLUDE_INSTALL_DIR})

# Install files to find mops in CMake projects
configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/mops_torch-config.in.cmake
    ${CMAKE_CURRENT_BINARY_DIR}/mops_torch-config.cmake
    @ONLY
)
install(EXPORT mops_torch-targets
    DESTINATION ${LIB_INSTALL_DIR}/cmake/mops_torch
)
install(FILES
    ${PROJECT_BINARY_DIR}/mops_torch-config-version.cmake
    ${PROJECT_BINARY_DIR}/mops_torch-config.cmake
    DESTINATION ${LIB_INSTALL_DIR}/cmake/mops_torch
)
1 change: 1 addition & 0 deletions mops-torch/VERSION
@@ -0,0 +1 @@
0.1.0
1 change: 1 addition & 0 deletions mops-torch/cmake/mops_torch-config.in.cmake
@@ -0,0 +1 @@
include(${CMAKE_CURRENT_LIST_DIR}/mops_torch-targets.cmake)
6 changes: 6 additions & 0 deletions mops-torch/include/mops/torch.hpp
@@ -0,0 +1,6 @@
#ifndef MOPS_TORCH_H
#define MOPS_TORCH_H

#include "torch/opsa.hpp" // IWYU pragma: export

#endif
36 changes: 36 additions & 0 deletions mops-torch/include/mops/torch/opsa.hpp
@@ -0,0 +1,36 @@
#ifndef MOPS_TORCH_OPSA_H
#define MOPS_TORCH_OPSA_H

#include <torch/script.h>

#include <mops.hpp>

namespace mops_torch {

/// TODO
torch::Tensor outer_product_scatter_add(
    torch::Tensor A,
    torch::Tensor B,
    torch::Tensor indices_output,
    int64_t output_size
);

class OuterProductScatterAdd: public torch::autograd::Function<mops_torch::OuterProductScatterAdd> {
public:
    static std::vector<torch::Tensor> forward(
        torch::autograd::AutogradContext *ctx,
        torch::Tensor A,
        torch::Tensor B,
        torch::Tensor indices_output,
        int64_t output_size
    );

    static std::vector<torch::Tensor> backward(
        torch::autograd::AutogradContext *ctx,
        std::vector<torch::Tensor> grad_outputs
    );
};

}

#endif
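
For orientation, the class declared above plays the same role as a `torch.autograd.Function` subclass on the Python side. A rough pure-PyTorch analogue of the same forward/backward pair (an editorial sketch, not code from this commit) would be:

```python
import torch

class OuterProductScatterAddRef(torch.autograd.Function):
    """Pure-PyTorch analogue of the C++ autograd::Function declared above."""

    @staticmethod
    def forward(ctx, A, B, indices_output, output_size):
        # Scatter-add each row-wise outer product A[j] x B[j] into output[indices_output[j]]
        output = torch.zeros(output_size, A.shape[1], B.shape[1],
                             dtype=A.dtype, device=A.device)
        output.index_add_(0, indices_output.to(torch.int64),
                          A[:, :, None] * B[:, None, :])
        ctx.save_for_backward(A, B, indices_output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        A, B, indices_output = ctx.saved_tensors
        grad_slices = grad_output[indices_output.to(torch.int64)]  # shape (J, K, L)
        grad_A = torch.einsum("jkl,jl->jk", grad_slices, B)
        grad_B = torch.einsum("jkl,jk->jl", grad_slices, A)
        return grad_A, grad_B, None, None
```

The backward contracts the relevant slices of `grad_output` with `B` and `A` respectively, which is the computation the C++ backward below delegates to `mops::outer_product_scatter_add_vjp`.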
1 change: 1 addition & 0 deletions mops-torch/mops
142 changes: 142 additions & 0 deletions mops-torch/src/opsa.cpp
@@ -0,0 +1,142 @@
#include "mops/torch/opsa.hpp"

using namespace mops_torch;


torch::Tensor mops_torch::outer_product_scatter_add(
    torch::Tensor A,
    torch::Tensor B,
    torch::Tensor indices_output,
    int64_t output_size
) {
    return OuterProductScatterAdd::apply(A, B, indices_output, output_size)[0];
}

template <typename scalar_t>
static mops::Tensor<scalar_t, 1> torch_to_mops_1d(torch::Tensor tensor) {
    assert(tensor.sizes().size() == 1);
    return {
        tensor.data_ptr<scalar_t>(),
        {static_cast<size_t>(tensor.size(0))},
    };
}

template <typename scalar_t>
static mops::Tensor<scalar_t, 2> torch_to_mops_2d(torch::Tensor tensor) {
    assert(tensor.sizes().size() == 2);
    return {
        tensor.data_ptr<scalar_t>(),
        {static_cast<size_t>(tensor.size(0)), static_cast<size_t>(tensor.size(1))},
    };
}

std::vector<torch::Tensor> OuterProductScatterAdd::forward(
    torch::autograd::AutogradContext *ctx,
    torch::Tensor A,
    torch::Tensor B,
    torch::Tensor indices_output,
    int64_t output_size
) {
    if (A.sizes().size() != 2 || B.sizes().size() != 2) {
        C10_THROW_ERROR(ValueError, "`A` and `B` must be 2-D tensor");
    }

    if (indices_output.sizes().size() != 1) {
        C10_THROW_ERROR(ValueError, "`indices_output` must be a 1-D tensor");
    }

    if (indices_output.scalar_type() != torch::kInt32) {
        C10_THROW_ERROR(ValueError, "`indices_output` must be a tensor of 32-bit integers");
    }

    if (A.device() != B.device() || A.device() != indices_output.device()) {
        C10_THROW_ERROR(ValueError,
            "all tensors must be on the same device, got " + A.device().str() +
            ", " + B.device().str() + ", and " + indices_output.device().str()
        );
    }

    if (A.scalar_type() != B.scalar_type()) {
        C10_THROW_ERROR(ValueError,
            std::string("`A` and `B` must have the same dtype, got ") +
            torch::toString(A.scalar_type()) + " and " + torch::toString(B.scalar_type())
        );
    }

    torch::Tensor output;
    if (A.device().is_cpu()) {
        output = torch::zeros({output_size, A.size(1), B.size(1)},
            torch::TensorOptions().dtype(A.scalar_type()).device(A.device())
        );

        assert(output.is_contiguous());

        AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add", [&](){
            mops::outer_product_scatter_add<scalar_t>(
                torch_to_mops_2d<scalar_t>(output.reshape({-1, output.size(1) * output.size(2)})),
                torch_to_mops_2d<scalar_t>(A),
                torch_to_mops_2d<scalar_t>(B),
                torch_to_mops_1d<int32_t>(indices_output)
            );
        });
    } else {
        C10_THROW_ERROR(ValueError,
            "outer_product_scatter_add is not implemented for device " + A.device().str()
        );
    }

    if (A.requires_grad() || B.requires_grad()) {
        ctx->save_for_backward({A, B, indices_output});
    }

    return {output};
}

std::vector<torch::Tensor> OuterProductScatterAdd::backward(
    torch::autograd::AutogradContext *ctx,
    std::vector<torch::Tensor> grad_outputs
) {
    auto saved_variables = ctx->get_saved_variables();
    auto A = saved_variables[0];
    auto B = saved_variables[1];
    auto indices_output = saved_variables[2];

    auto grad_output = grad_outputs[0];
    if (!grad_output.is_contiguous()) {
        throw std::runtime_error("expected contiguous grad_output");
    }

    auto grad_A = torch::Tensor();
    auto grad_B = torch::Tensor();

    if (A.device().is_cpu()) {
        AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&](){
            auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
            if (A.requires_grad()) {
                grad_A = torch::zeros_like(A);
                mops_grad_A = torch_to_mops_2d<scalar_t>(grad_A);
            }

            auto mops_grad_B = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
            if (B.requires_grad()) {
                grad_B = torch::zeros_like(B);
                mops_grad_B = torch_to_mops_2d<scalar_t>(grad_B);
            }

            mops::outer_product_scatter_add_vjp<scalar_t>(
                mops_grad_A,
                mops_grad_B,
                torch_to_mops_2d<scalar_t>(grad_output.reshape({-1, grad_output.size(1) * grad_output.size(2)})),
                torch_to_mops_2d<scalar_t>(A),
                torch_to_mops_2d<scalar_t>(B),
                torch_to_mops_1d<int32_t>(indices_output)
            );
        });
    } else {
        C10_THROW_ERROR(ValueError,
            "outer_product_scatter_add is not implemented for device " + A.device().str()
        );
    }

    return {grad_A, grad_B, torch::Tensor(), torch::Tensor()};
}
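
A quick way to exercise the forward and backward passes once the extension is built is sketched below. This is a hypothetical usage example: `register.cpp` is not shown in this diff, so the shared-library name and the `torch.ops.mops.outer_product_scatter_add` operator path are assumptions, not something this commit confirms.

```python
import torch

# Assumed library name and operator path (see note above).
torch.ops.load_library("libmops_torch.so")
opsa = torch.ops.mops.outer_product_scatter_add

J, K, L, output_size = 100, 8, 13, 20
A = torch.rand(J, K, dtype=torch.float64, requires_grad=True)
B = torch.rand(J, L, dtype=torch.float64, requires_grad=True)
indices = torch.randint(output_size, (J,), dtype=torch.int32)

# Pure-PyTorch reference: scatter-add each row-wise outer product into the output.
reference = torch.zeros(output_size, K, L, dtype=torch.float64)
reference.index_add_(0, indices.to(torch.int64), (A[:, :, None] * B[:, None, :]).detach())

output = opsa(A, B, indices, output_size)
assert torch.allclose(output, reference)

# The custom backward should agree with finite differences on both inputs.
torch.autograd.gradcheck(lambda a, b: opsa(a, b, indices, output_size), (A, B))
```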