Skip to content

Commit

Permalink
Make a scaffold for opsax and sasax (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
frostedoyster authored Nov 22, 2023
1 parent 9303ba0 commit 2ea1745
Show file tree
Hide file tree
Showing 28 changed files with 928 additions and 29 deletions.
8 changes: 8 additions & 0 deletions mops/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ add_library(mops
"src/sap/capi.cpp"
"src/hpe/hpe.cpp"
"src/hpe/capi.cpp"
"src/opsax/opsax.cpp"
"src/opsax/capi.cpp"
"src/sasax/sasax.cpp"
"src/sasax/capi.cpp"

"include/mops.hpp"
"include/mops.h"
Expand All @@ -69,6 +73,10 @@ add_library(mops
"include/mops/sap.h"
"include/mops/hpe.hpp"
"include/mops/hpe.h"
"include/mops/opsax.hpp"
"include/mops/opsax.h"
"include/mops/sasax.hpp"
"include/mops/sasax.h"
)

if(CMAKE_CUDA_COMPILER)
Expand Down
4 changes: 4 additions & 0 deletions mops/include/mops.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
#define MOPS_H

#include "mops/exports.h" // IWYU pragma: export
#include "mops/hpe.h" // IWYU pragma: export
#include "mops/opsa.h" // IWYU pragma: export
#include "mops/sap.h" // IWYU pragma: export
#include "mops/opsax.h" // IWYU pragma: export
#include "mops/sasax.h" // IWYU pragma: export

#ifdef __cplusplus
extern "C" {
Expand Down
12 changes: 8 additions & 4 deletions mops/include/mops.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
#ifndef MOPS_HPP
#define MOPS_HPP

#include "mops/exports.h" // IWYU pragma: export

#include "mops/capi.hpp" // IWYU pragma: export
#include "mops/opsa.hpp" // IWYU pragma: export
#include "mops/exports.h" // IWYU pragma: export

#include "mops/capi.hpp" // IWYU pragma: export
#include "mops/hpe.hpp" // IWYU pragma: export
#include "mops/opsa.hpp" // IWYU pragma: export
#include "mops/sap.hpp" // IWYU pragma: export
#include "mops/opsax.hpp" // IWYU pragma: export
#include "mops/sasax.hpp" // IWYU pragma: export


#endif
4 changes: 2 additions & 2 deletions mops/include/mops/hpe.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_H
#define MOPS_OUTER_PRODUCT_SCATTER_ADD_H
#ifndef MOPS_HPE_H
#define MOPS_HPE_H

#include "mops/exports.h"
#include "mops/tensor.h"
Expand Down
4 changes: 2 additions & 2 deletions mops/include/mops/hpe.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
#define MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
#ifndef MOPS_HPE_HPP
#define MOPS_HPE_HPP

#include <cstddef>
#include <cstdint>
Expand Down
4 changes: 2 additions & 2 deletions mops/include/mops/opsa.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_H
#define MOPS_OUTER_PRODUCT_SCATTER_ADD_H
#ifndef MOPS_OPSA_H
#define MOPS_OPSA_H

#include "mops/exports.h"
#include "mops/tensor.h"
Expand Down
4 changes: 2 additions & 2 deletions mops/include/mops/opsa.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
#define MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
#ifndef MOPS_OPSA_HPP
#define MOPS_OPSA_HPP

#include <cstddef>
#include <cstdint>
Expand Down
61 changes: 61 additions & 0 deletions mops/include/mops/opsax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#ifndef MOPS_OPSAX_H
#define MOPS_OPSAX_H

#include "mops/exports.h"
#include "mops/tensor.h"


#ifdef __cplusplus
extern "C" {
#endif

/// CPU version of mops::outer_product_scatter_add_with_weights for 32-bit floats.
///
/// Arguments are C tensor descriptors passed by value: a 3D `output`, three
/// 2D float tensors (`tensor_a`, `tensor_r`, `tensor_x`) and two 1D 32-bit
/// integer tensors (`tensor_i`, `tensor_j`).
///
/// NOTE(review): the `int` return value is presumably a status code
/// (0 on success) -- confirm against the implementation.
int MOPS_EXPORT mops_outer_product_scatter_add_with_weights_f32(
mops_tensor_3d_f32_t output,
mops_tensor_2d_f32_t tensor_a,
mops_tensor_2d_f32_t tensor_r,
mops_tensor_2d_f32_t tensor_x,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j
);


/// CPU version of mops::outer_product_scatter_add_with_weights for 64-bit floats.
///
/// Arguments are C tensor descriptors passed by value: a 3D `output`, three
/// 2D double tensors (`tensor_a`, `tensor_r`, `tensor_x`) and two 1D 32-bit
/// integer tensors (`tensor_i`, `tensor_j`).
///
/// NOTE(review): the `int` return value is presumably a status code
/// (0 on success) -- confirm against the implementation.
int MOPS_EXPORT mops_outer_product_scatter_add_with_weights_f64(
mops_tensor_3d_f64_t output,
mops_tensor_2d_f64_t tensor_a,
mops_tensor_2d_f64_t tensor_r,
mops_tensor_2d_f64_t tensor_x,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j
);


/// CUDA version of mops::outer_product_scatter_add_with_weights for 32-bit floats.
///
/// Same parameter list as the CPU `_f32` entry point: a 3D `output`, three
/// 2D float tensors (`tensor_a`, `tensor_r`, `tensor_x`) and two 1D 32-bit
/// integer tensors (`tensor_i`, `tensor_j`).
///
/// NOTE(review): presumably the `data` pointers must refer to device memory
/// and the `int` return is a status code -- confirm against the
/// implementation.
int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f32(
mops_tensor_3d_f32_t output,
mops_tensor_2d_f32_t tensor_a,
mops_tensor_2d_f32_t tensor_r,
mops_tensor_2d_f32_t tensor_x,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j
);


/// CUDA version of mops::outer_product_scatter_add_with_weights for 64-bit floats.
///
/// Same parameter list as the CPU `_f64` entry point: a 3D `output`, three
/// 2D double tensors (`tensor_a`, `tensor_r`, `tensor_x`) and two 1D 32-bit
/// integer tensors (`tensor_i`, `tensor_j`).
///
/// NOTE(review): presumably the `data` pointers must refer to device memory
/// and the `int` return is a status code -- confirm against the
/// implementation.
int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f64(
mops_tensor_3d_f64_t output,
mops_tensor_2d_f64_t tensor_a,
mops_tensor_2d_f64_t tensor_r,
mops_tensor_2d_f64_t tensor_x,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j
);


#ifdef __cplusplus
}
#endif


#endif
56 changes: 56 additions & 0 deletions mops/include/mops/opsax.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#ifndef MOPS_OPSAX_HPP
#define MOPS_OPSAX_HPP

#include <cstddef>
#include <cstdint>

#include "mops/exports.h"
#include "mops/tensor.hpp"

namespace mops {
/// CPU declaration of the "outer product scatter-add with weights" operation.
///
/// TODO: describe the mathematical operation once the implementation is
/// finalized; this header only fixes the signature.
///
/// @tparam scalar_t element type of the floating-point tensors
/// @param output   3D tensor of `scalar_t`
/// @param tensor_a 2D tensor of `scalar_t`
/// @param tensor_r 2D tensor of `scalar_t`
/// @param tensor_x 2D tensor of `scalar_t`
/// @param i        1D tensor of 32-bit integers
/// @param j        1D tensor of 32-bit integers
template<typename scalar_t>
void MOPS_EXPORT outer_product_scatter_add_with_weights(
Tensor<scalar_t, 3> output,
Tensor<scalar_t, 2> tensor_a,
Tensor<scalar_t, 2> tensor_r,
Tensor<scalar_t, 2> tensor_x,
Tensor<int32_t, 1> i,
Tensor<int32_t, 1> j
);

// these templates will be precompiled and provided in the mops library
// (explicit instantiation declarations: including this header does not
// instantiate the float/double versions again in the including file)
extern template void outer_product_scatter_add_with_weights(
Tensor<float, 3> output,
Tensor<float, 2> tensor_a,
Tensor<float, 2> tensor_r,
Tensor<float, 2> tensor_x,
Tensor<int32_t, 1> i,
Tensor<int32_t, 1> j
);

extern template void outer_product_scatter_add_with_weights(
Tensor<double, 3> output,
Tensor<double, 2> tensor_a,
Tensor<double, 2> tensor_r,
Tensor<double, 2> tensor_x,
Tensor<int32_t, 1> i,
Tensor<int32_t, 1> j
);

namespace cuda {
/// CUDA version of mops::outer_product_scatter_add_with_weights
///
/// Same parameter list as the CPU declaration above. No explicit
/// instantiation declarations are provided for it in this header.
template<typename scalar_t>
void MOPS_EXPORT outer_product_scatter_add_with_weights(
Tensor<scalar_t, 3> output,
Tensor<scalar_t, 2> tensor_a,
Tensor<scalar_t, 2> tensor_r,
Tensor<scalar_t, 2> tensor_x,
Tensor<int32_t, 1> i,
Tensor<int32_t, 1> j
);
}
}


#endif
4 changes: 2 additions & 2 deletions mops/include/mops/sap.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H
#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H
#ifndef MOPS_SAP_H
#define MOPS_SAP_H

#include "mops/exports.h"
#include "mops/tensor.h"
Expand Down
4 changes: 2 additions & 2 deletions mops/include/mops/sap.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP
#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP
#ifndef MOPS_SAP_HPP
#define MOPS_SAP_HPP

#include <cstddef>
#include <cstdint>
Expand Down
77 changes: 77 additions & 0 deletions mops/include/mops/sasax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#ifndef MOPS_SASAX_H
#define MOPS_SASAX_H

#include "mops/exports.h"
#include "mops/tensor.h"


#ifdef __cplusplus
extern "C" {
#endif

/// CPU version of mops::sparse_accumulation_scatter_add_with_weights for
/// 32-bit floats.
///
/// Arguments are C tensor descriptors passed by value: a 3D `output`, 2D
/// float tensors `tensor_a` and `tensor_r`, a 3D float tensor `tensor_x`, a
/// 1D float tensor `tensor_c`, and five 1D 32-bit integer tensors
/// (`tensor_i`, `tensor_j`, `tensor_m_1`, `tensor_m_2`, `tensor_m_3`).
///
/// NOTE(review): the `int` return value is presumably a status code
/// (0 on success) -- confirm against the implementation.
int MOPS_EXPORT mops_sparse_accumulation_scatter_add_with_weights_f32(
mops_tensor_3d_f32_t output,
mops_tensor_2d_f32_t tensor_a,
mops_tensor_2d_f32_t tensor_r,
mops_tensor_3d_f32_t tensor_x,
mops_tensor_1d_f32_t tensor_c,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j,
mops_tensor_1d_i32_t tensor_m_1,
mops_tensor_1d_i32_t tensor_m_2,
mops_tensor_1d_i32_t tensor_m_3
);


/// CPU version of mops::sparse_accumulation_scatter_add_with_weights for
/// 64-bit floats.
///
/// Arguments are C tensor descriptors passed by value: a 3D `output`, 2D
/// double tensors `tensor_a` and `tensor_r`, a 3D double tensor `tensor_x`,
/// a 1D double tensor `tensor_c`, and five 1D 32-bit integer tensors
/// (`tensor_i`, `tensor_j`, `tensor_m_1`, `tensor_m_2`, `tensor_m_3`).
///
/// NOTE(review): the `int` return value is presumably a status code
/// (0 on success) -- confirm against the implementation.
int MOPS_EXPORT mops_sparse_accumulation_scatter_add_with_weights_f64(
mops_tensor_3d_f64_t output,
mops_tensor_2d_f64_t tensor_a,
mops_tensor_2d_f64_t tensor_r,
mops_tensor_3d_f64_t tensor_x,
mops_tensor_1d_f64_t tensor_c,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j,
mops_tensor_1d_i32_t tensor_m_1,
mops_tensor_1d_i32_t tensor_m_2,
mops_tensor_1d_i32_t tensor_m_3
);


/// CUDA version of mops::sparse_accumulation_scatter_add_with_weights for
/// 32-bit floats.
///
/// Same parameter list as the CPU `_f32` entry point: a 3D `output`, 2D
/// float tensors `tensor_a` and `tensor_r`, a 3D float tensor `tensor_x`, a
/// 1D float tensor `tensor_c`, and five 1D 32-bit integer tensors.
///
/// NOTE(review): presumably the `data` pointers must refer to device memory
/// and the `int` return is a status code -- confirm against the
/// implementation.
int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f32(
mops_tensor_3d_f32_t output,
mops_tensor_2d_f32_t tensor_a,
mops_tensor_2d_f32_t tensor_r,
mops_tensor_3d_f32_t tensor_x,
mops_tensor_1d_f32_t tensor_c,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j,
mops_tensor_1d_i32_t tensor_m_1,
mops_tensor_1d_i32_t tensor_m_2,
mops_tensor_1d_i32_t tensor_m_3
);


/// CUDA version of mops::sparse_accumulation_scatter_add_with_weights for
/// 64-bit floats.
///
/// Same parameter list as the CPU `_f64` entry point: a 3D `output`, 2D
/// double tensors `tensor_a` and `tensor_r`, a 3D double tensor `tensor_x`,
/// a 1D double tensor `tensor_c`, and five 1D 32-bit integer tensors.
///
/// NOTE(review): presumably the `data` pointers must refer to device memory
/// and the `int` return is a status code -- confirm against the
/// implementation.
int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f64(
mops_tensor_3d_f64_t output,
mops_tensor_2d_f64_t tensor_a,
mops_tensor_2d_f64_t tensor_r,
mops_tensor_3d_f64_t tensor_x,
mops_tensor_1d_f64_t tensor_c,
mops_tensor_1d_i32_t tensor_i,
mops_tensor_1d_i32_t tensor_j,
mops_tensor_1d_i32_t tensor_m_1,
mops_tensor_1d_i32_t tensor_m_2,
mops_tensor_1d_i32_t tensor_m_3
);


#ifdef __cplusplus
}
#endif


#endif
72 changes: 72 additions & 0 deletions mops/include/mops/sasax.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#ifndef MOPS_SASAX_HPP
#define MOPS_SASAX_HPP

#include <cstddef>
#include <cstdint>

#include "mops/exports.h"
#include "mops/tensor.hpp"

namespace mops {
    /// CPU declaration of the "sparse accumulation scatter-add with weights"
    /// operation.
    ///
    /// TODO: describe the mathematical operation once the implementation is
    /// finalized; this header only fixes the signature.
    ///
    /// Index tensors use the fixed-width `int32_t` element type, matching the
    /// `mops_tensor_1d_i32_t` descriptors of the C API and the declarations
    /// in `mops/opsax.hpp`.
    ///
    /// @tparam scalar_t  element type of the floating-point tensors
    /// @param output     3D tensor of `scalar_t`
    /// @param tensor_a   2D tensor of `scalar_t`
    /// @param tensor_r   2D tensor of `scalar_t`
    /// @param tensor_x   3D tensor of `scalar_t`
    /// @param tensor_c   1D tensor of `scalar_t`
    /// @param tensor_i   1D tensor of 32-bit integers
    /// @param tensor_j   1D tensor of 32-bit integers
    /// @param tensor_m_1 1D tensor of 32-bit integers
    /// @param tensor_m_2 1D tensor of 32-bit integers
    /// @param tensor_m_3 1D tensor of 32-bit integers
    template<typename scalar_t>
    void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights(
        Tensor<scalar_t, 3> output,
        Tensor<scalar_t, 2> tensor_a,
        Tensor<scalar_t, 2> tensor_r,
        Tensor<scalar_t, 3> tensor_x,
        Tensor<scalar_t, 1> tensor_c,
        Tensor<int32_t, 1> tensor_i,
        Tensor<int32_t, 1> tensor_j,
        Tensor<int32_t, 1> tensor_m_1,
        Tensor<int32_t, 1> tensor_m_2,
        Tensor<int32_t, 1> tensor_m_3
    );

    // these templates will be precompiled and provided in the mops library
    // (explicit instantiation declarations: including this header does not
    // instantiate the float/double versions again in the including file)
    extern template void sparse_accumulation_scatter_add_with_weights(
        Tensor<float, 3> output,
        Tensor<float, 2> tensor_a,
        Tensor<float, 2> tensor_r,
        Tensor<float, 3> tensor_x,
        Tensor<float, 1> tensor_c,
        Tensor<int32_t, 1> tensor_i,
        Tensor<int32_t, 1> tensor_j,
        Tensor<int32_t, 1> tensor_m_1,
        Tensor<int32_t, 1> tensor_m_2,
        Tensor<int32_t, 1> tensor_m_3
    );

    extern template void sparse_accumulation_scatter_add_with_weights(
        Tensor<double, 3> output,
        Tensor<double, 2> tensor_a,
        Tensor<double, 2> tensor_r,
        Tensor<double, 3> tensor_x,
        Tensor<double, 1> tensor_c,
        Tensor<int32_t, 1> tensor_i,
        Tensor<int32_t, 1> tensor_j,
        Tensor<int32_t, 1> tensor_m_1,
        Tensor<int32_t, 1> tensor_m_2,
        Tensor<int32_t, 1> tensor_m_3
    );

    namespace cuda {
        /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights
        ///
        /// Same parameter list as the CPU declaration above. No explicit
        /// instantiation declarations are provided for it in this header.
        template<typename scalar_t>
        void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights(
            Tensor<scalar_t, 3> output,
            Tensor<scalar_t, 2> tensor_a,
            Tensor<scalar_t, 2> tensor_r,
            Tensor<scalar_t, 3> tensor_x,
            Tensor<scalar_t, 1> tensor_c,
            Tensor<int32_t, 1> tensor_i,
            Tensor<int32_t, 1> tensor_j,
            Tensor<int32_t, 1> tensor_m_1,
            Tensor<int32_t, 1> tensor_m_2,
            Tensor<int32_t, 1> tensor_m_3
        );
    }
}


#endif
10 changes: 10 additions & 0 deletions mops/include/mops/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
extern "C" {
#endif

/// Descriptor for a 3-dimensional tensor of 32-bit floats in the C API.
/// NOTE(review): element layout (presumably contiguous row-major) and
/// ownership of `data` are not specified here -- confirm and document.
struct mops_tensor_3d_f32_t {
/// Pointer to the tensor elements; `__restrict__` (compiler extension)
/// promises the compiler that this memory is not accessed through another
/// pointer.
float* __restrict__ data;
/// Extent of the tensor along each of its three dimensions.
int64_t shape[3];
};

/// Descriptor for a 3-dimensional tensor of 64-bit floats in the C API.
/// NOTE(review): element layout (presumably contiguous row-major) and
/// ownership of `data` are not specified here -- confirm and document.
struct mops_tensor_3d_f64_t {
/// Pointer to the tensor elements; `__restrict__` (compiler extension)
/// promises the compiler that this memory is not accessed through another
/// pointer.
double* __restrict__ data;
/// Extent of the tensor along each of its three dimensions.
int64_t shape[3];
};

struct mops_tensor_2d_f32_t {
float* __restrict__ data;
int64_t shape[2];
Expand Down
Loading

0 comments on commit 2ea1745

Please sign in to comment.