Skip to content

Commit

Permalink
reduce repeated code in MCF
Browse files Browse the repository at this point in the history
  • Loading branch information
Ahdhn committed Aug 25, 2024
1 parent 577015d commit b9a020b
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 106 deletions.
8 changes: 2 additions & 6 deletions apps/MCF/mcf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,7 @@ TEST(App, MCF)
mcf_cg<dataT>(rx);

// RXMesh cusolver Impl
mcf_cusolver_chol<dataT>(rx);

// RXMesh cusolver Impl with our CUDA_ND reorder
mcf_cusolver_chol_cudaND<dataT>(rx);
mcf_cusolver_chol<dataT>(rx, PermuteMethod::CUSTOM);
}

int main(int argc, char** argv)
Expand Down Expand Up @@ -107,8 +104,7 @@ int main(int argc, char** argv)
atoi(get_cmd_option(argv, argv + argc, "-device_id"));
}
if (cmd_option_exists(argv, argc + argv, "-nd_level")) {
Arg.nd_level =
atoi(get_cmd_option(argv, argv + argc, "-nd_level"));
Arg.nd_level = atoi(get_cmd_option(argv, argv + argc, "-nd_level"));
}
}

Expand Down
126 changes: 26 additions & 100 deletions apps/MCF/mcf_cusolver_chol.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ __global__ static void mcf_A_setup(
}

template <typename T>
void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx)
void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx,
rxmesh::PermuteMethod permute_method)
{
using namespace rxmesh;
constexpr uint32_t blockThreads = 256;
Expand Down Expand Up @@ -164,7 +165,6 @@ void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx)
launch_box_B.smem_bytes_dyn>>>(
rx.get_context(), *coords, B_mat, Arg.use_uniform_laplace);

CUDA_ERROR(cudaDeviceSynchronize());

// A and X set up
LaunchBox<blockThreads> launch_box_A_X;
Expand Down Expand Up @@ -193,112 +193,37 @@ void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx)
// A_mat.solve(B_mat, *X_mat, Solver::QR, PermuteMethod::NSTDIS);
// A_mat.solve(B_mat, *X_mat, Solver::CHOL, PermuteMethod::NSTDIS);

// Solving using CHOL
A_mat.pre_solve(Solver::CHOL, PermuteMethod::NSTDIS);
A_mat.solve(B_mat, *X_mat);


// move the results to the host
// if we use LU, the data will be on the host and we should not move the
// device to the host
X_mat->move(rxmesh::DEVICE, rxmesh::HOST);

// copy the results to attributes
coords->from_matrix(X_mat.get());

#if USE_POLYSCOPE
rx.get_polyscope_mesh()->updateVertexPositions(*coords);
polyscope::show();
#endif

B_mat.release();
X_mat->release();
A_mat.release();
}

template <typename T>
void mcf_cusolver_chol_cudaND(rxmesh::RXMeshStatic& rx)
{
using namespace rxmesh;
constexpr uint32_t blockThreads = 256;

uint32_t num_vertices = rx.get_num_vertices();

auto coords = rx.get_input_vertex_coordinates();

SparseMatrix<float> A_mat(rx);
DenseMatrix<float> B_mat(rx, num_vertices, 3);

std::shared_ptr<DenseMatrix<float>> X_mat = coords->to_matrix();

RXMESH_INFO("use_uniform_laplace: {}, time_step: {}",
Arg.use_uniform_laplace,
Arg.time_step);

// B set up
LaunchBox<blockThreads> launch_box_B;
rx.prepare_launch_box({Op::VV},
launch_box_B,
(void*)mcf_B_setup<float, blockThreads>,
!Arg.use_uniform_laplace);

mcf_B_setup<float, blockThreads><<<launch_box_B.blocks,
launch_box_B.num_threads,
launch_box_B.smem_bytes_dyn>>>(
rx.get_context(), *coords, B_mat, Arg.use_uniform_laplace);

CUDA_ERROR(cudaDeviceSynchronize());

// A and X set up
LaunchBox<blockThreads> launch_box_A_X;
rx.prepare_launch_box({Op::VV},
launch_box_A_X,
(void*)mcf_A_setup<float, blockThreads>,
!Arg.use_uniform_laplace);

mcf_A_setup<float, blockThreads>
<<<launch_box_A_X.blocks,
launch_box_A_X.num_threads,
launch_box_A_X.smem_bytes_dyn>>>(rx.get_context(),
*coords,
A_mat,
Arg.use_uniform_laplace,
Arg.time_step);
// Pre-Solves
uint32_t* h_reorder_array;

if (permute_method != PermuteMethod::CUSTOM) {
A_mat.pre_solve(Solver::CHOL, PermuteMethod::NSTDIS);
} else {
// Compute CUDA_ND reorder

// To Use LU, we have to move the data to the host
// A_mat.move(DEVICE, HOST);
// B_mat.move(DEVICE, HOST);
// X_mat->move(DEVICE, HOST);
// A_mat.solve(B_mat, *X_mat, Solver::LU, PermuteMethod::NSTDIS);
CUDA_ERROR(cudaMallocManaged(&h_reorder_array,
sizeof(uint32_t) * rx.get_num_vertices()));
CUDA_ERROR(cudaMemset(
h_reorder_array, 0, sizeof(uint32_t) * rx.get_num_vertices()));

// Solving using QR or CHOL
// A_mat.solve(B_mat, *X_mat, Solver::QR, PermuteMethod::NSTDIS);
// A_mat.solve(B_mat, *X_mat, Solver::CHOL, PermuteMethod::NSTDIS);
cuda_nd_reorder(rx, h_reorder_array, Arg.nd_level);

// Compute CUDA_ND reorder
uint32_t* h_reorder_array;
int* h_reorder_array_int_cpy;
CUDA_ERROR(cudaMallocManaged(&h_reorder_array,
sizeof(uint32_t) * rx.get_num_vertices()));
CUDA_ERROR(cudaMemset(
h_reorder_array, 0, sizeof(uint32_t) * rx.get_num_vertices()));

cuda_nd_reorder(rx, h_reorder_array, Arg.nd_level);
if (arr_check_uint32_to_int_cast(h_reorder_array, rx.get_num_vertices())) {
h_reorder_array_int_cpy = reinterpret_cast<int*>(h_reorder_array);
} else {
RXMESH_ERROR(
"Error: Overflow in casting reorder array uint32_t to int");
int* h_reorder_array_int_cpy;
if (arr_check_uint32_to_int_cast(h_reorder_array,
rx.get_num_vertices())) {
h_reorder_array_int_cpy = reinterpret_cast<int*>(h_reorder_array);
} else {
RXMESH_ERROR(
"Error: Overflow in casting reorder array uint32_t to int");
}
// Solving using CHOL
A_mat.pre_solve(
Solver::CHOL, PermuteMethod::CUSTOM, h_reorder_array_int_cpy);
}


// Solving using CHOL
A_mat.pre_solve(
Solver::CHOL, PermuteMethod::CUSTOM, h_reorder_array_int_cpy);
// Solve
A_mat.solve(B_mat, *X_mat);


// move the results to the host
// if we use LU, the data will be on the host and we should not move the
// device to the host
Expand All @@ -315,4 +240,5 @@ void mcf_cusolver_chol_cudaND(rxmesh::RXMeshStatic& rx)
B_mat.release();
X_mat->release();
A_mat.release();
GPU_FREE(h_reorder_array);
}

0 comments on commit b9a020b

Please sign in to comment.