Skip to content

Commit

Permalink
try memory allocation suggestion
Browse files Browse the repository at this point in the history
  • Loading branch information
PFLeget committed Nov 19, 2024
1 parent 6132de0 commit bbb9d22
Showing 1 changed file with 26 additions and 3 deletions.
29 changes: 26 additions & 3 deletions src/basic_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,28 @@

namespace py = pybind11;

// A custom deleter that finds the original address of the memory allocation directly
// before the stored pointer and frees that using delete []
template <typename T>
struct AlignedDeleter {
void operator()(T* p) const { delete [] ((char**)p)[-1]; }
};

template <typename T>
std::shared_ptr<T> allocateAlignedMemory(int n)
{
// This bit is based on the answers here:
// http://stackoverflow.com/questions/227897/how-to-allocate-aligned-memory-only-using-the-standard-library/227900
// The point of this is to get the _data pointer aligned to a 16 byte (128 bit) boundary.
// Arrays that are so aligned can use SSE operations and so can be much faster than
// non-aligned memroy. FFTW in particular is faster if it gets aligned data.
char* mem = new char[n * sizeof(T) + sizeof(char*) + 15];
T* data = reinterpret_cast<T*>( (uintptr_t)(mem + sizeof(char*) + 15) & ~(size_t) 0x0F );
((char**)data)[-1] = mem;
std::shared_ptr<T> owner(data, AlignedDeleter<T>());
return owner;
}


template<class T, ssize_t N>
auto _solve_direct_cpp_impl(
Expand All @@ -26,9 +48,10 @@ auto _solve_direct_cpp_impl(
// Allocate and initialize container array for the A transpose A matrix.
// For some reason it is faster to allocate a numpy array and then map
// it to Eigen than allocating an Eigen array directly
py::array_t<T, py::array::f_style> ATA(std::vector<ssize_t>({A_shape[1]*n, A_shape[1]*n}));
py::buffer_info ATA_buffer = ATA.request();
T __restrict * ATA_data = static_cast<T *>(ATA_buffer.ptr);
// py::array_t<T, py::array::f_style> ATA(std::vector<ssize_t>({A_shape[1]*n, A_shape[1]*n}));
// py::buffer_info ATA_buffer = ATA.request();
// T __restrict * ATA_data = static_cast<T *>(ATA_buffer.ptr);
auto ATA_data = allocateAlignedMemory<T>(A_shape[1]*N * A_shape[1]*N);
// set it all to zero
memset(ATA_data, 0, A_shape[1]*n*A_shape[1]*n*sizeof(T));
Eigen::Map< Eigen::Matrix<T,
Expand Down

0 comments on commit bbb9d22

Please sign in to comment.