reduce repeated code in MCF

owensgroup · Aug 25, 2024 · b9a020b · b9a020b
1 parent 577015d
commit b9a020b
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 106 deletions.
diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu
@@ -44,10 +44,7 @@ TEST(App, MCF)
     mcf_cg<dataT>(rx);
 
     // RXMesh cusolver Impl
-    mcf_cusolver_chol<dataT>(rx);
-
-    // RXMesh cusolver Impl with our CUDA_ND reorder
-    mcf_cusolver_chol_cudaND<dataT>(rx);
+    mcf_cusolver_chol<dataT>(rx, PermuteMethod::CUSTOM);
 }
 
 int main(int argc, char** argv)
@@ -107,8 +104,7 @@ int main(int argc, char** argv)
                 atoi(get_cmd_option(argv, argv + argc, "-device_id"));
         }
         if (cmd_option_exists(argv, argc + argv, "-nd_level")) {
-            Arg.nd_level =
-                atoi(get_cmd_option(argv, argv + argc, "-nd_level"));
+            Arg.nd_level = atoi(get_cmd_option(argv, argv + argc, "-nd_level"));
         }
     }
 

diff --git a/apps/MCF/mcf_cusolver_chol.cuh b/apps/MCF/mcf_cusolver_chol.cuh
@@ -134,7 +134,8 @@ __global__ static void mcf_A_setup(
 }
 
 template <typename T>
-void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx)
+void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx,
+                       rxmesh::PermuteMethod permute_method)
 {
     using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
@@ -164,7 +165,6 @@ void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx)
                                        launch_box_B.smem_bytes_dyn>>>(
         rx.get_context(), *coords, B_mat, Arg.use_uniform_laplace);
 
-    CUDA_ERROR(cudaDeviceSynchronize());
 
     // A and X set up
     LaunchBox<blockThreads> launch_box_A_X;
@@ -193,112 +193,37 @@ void mcf_cusolver_chol(rxmesh::RXMeshStatic& rx)
     // A_mat.solve(B_mat, *X_mat, Solver::QR, PermuteMethod::NSTDIS);
     // A_mat.solve(B_mat, *X_mat, Solver::CHOL, PermuteMethod::NSTDIS);
 
-    // Solving using CHOL
-    A_mat.pre_solve(Solver::CHOL, PermuteMethod::NSTDIS);
-    A_mat.solve(B_mat, *X_mat);
-
-
-    // move the results to the host
-    // if we use LU, the data will be on the host and we should not move the
-    // device to the host
-    X_mat->move(rxmesh::DEVICE, rxmesh::HOST);
-
-    // copy the results to attributes
-    coords->from_matrix(X_mat.get());
-
-#if USE_POLYSCOPE
-    rx.get_polyscope_mesh()->updateVertexPositions(*coords);
-    polyscope::show();
-#endif
-
-    B_mat.release();
-    X_mat->release();
-    A_mat.release();
-}
-
-template <typename T>
-void mcf_cusolver_chol_cudaND(rxmesh::RXMeshStatic& rx)
-{
-    using namespace rxmesh;
-    constexpr uint32_t blockThreads = 256;
-
-    uint32_t num_vertices = rx.get_num_vertices();
-
-    auto coords = rx.get_input_vertex_coordinates();
-
-    SparseMatrix<float> A_mat(rx);
-    DenseMatrix<float>  B_mat(rx, num_vertices, 3);
-
-    std::shared_ptr<DenseMatrix<float>> X_mat = coords->to_matrix();
-
-    RXMESH_INFO("use_uniform_laplace: {}, time_step: {}",
-                Arg.use_uniform_laplace,
-                Arg.time_step);
-
-    // B set up
-    LaunchBox<blockThreads> launch_box_B;
-    rx.prepare_launch_box({Op::VV},
-                          launch_box_B,
-                          (void*)mcf_B_setup<float, blockThreads>,
-                          !Arg.use_uniform_laplace);
-
-    mcf_B_setup<float, blockThreads><<<launch_box_B.blocks,
-                                       launch_box_B.num_threads,
-                                       launch_box_B.smem_bytes_dyn>>>(
-        rx.get_context(), *coords, B_mat, Arg.use_uniform_laplace);
-
-    CUDA_ERROR(cudaDeviceSynchronize());
-
-    // A and X set up
-    LaunchBox<blockThreads> launch_box_A_X;
-    rx.prepare_launch_box({Op::VV},
-                          launch_box_A_X,
-                          (void*)mcf_A_setup<float, blockThreads>,
-                          !Arg.use_uniform_laplace);
-
-    mcf_A_setup<float, blockThreads>
-        <<<launch_box_A_X.blocks,
-           launch_box_A_X.num_threads,
-           launch_box_A_X.smem_bytes_dyn>>>(rx.get_context(),
-                                            *coords,
-                                            A_mat,
-                                            Arg.use_uniform_laplace,
-                                            Arg.time_step);
+    // Pre-Solves
+    uint32_t* h_reorder_array;
 
+    if (permute_method != PermuteMethod::CUSTOM) {
+        A_mat.pre_solve(Solver::CHOL, PermuteMethod::NSTDIS);
+    } else {
+        // Compute CUDA_ND reorder
 
-    // To Use LU, we have to move the data to the host
-    // A_mat.move(DEVICE, HOST);
-    // B_mat.move(DEVICE, HOST);
-    // X_mat->move(DEVICE, HOST);
-    // A_mat.solve(B_mat, *X_mat, Solver::LU, PermuteMethod::NSTDIS);
+        CUDA_ERROR(cudaMallocManaged(&h_reorder_array,
+                                     sizeof(uint32_t) * rx.get_num_vertices()));
+        CUDA_ERROR(cudaMemset(
+            h_reorder_array, 0, sizeof(uint32_t) * rx.get_num_vertices()));
 
-    // Solving using QR or CHOL
-    // A_mat.solve(B_mat, *X_mat, Solver::QR, PermuteMethod::NSTDIS);
-    // A_mat.solve(B_mat, *X_mat, Solver::CHOL, PermuteMethod::NSTDIS);
+        cuda_nd_reorder(rx, h_reorder_array, Arg.nd_level);
 
-    // Compute CUDA_ND reorder
-    uint32_t* h_reorder_array;
-    int*      h_reorder_array_int_cpy;
-    CUDA_ERROR(cudaMallocManaged(&h_reorder_array,
-                                 sizeof(uint32_t) * rx.get_num_vertices()));
-    CUDA_ERROR(cudaMemset(
-        h_reorder_array, 0, sizeof(uint32_t) * rx.get_num_vertices()));
-
-    cuda_nd_reorder(rx, h_reorder_array, Arg.nd_level);
-    if (arr_check_uint32_to_int_cast(h_reorder_array, rx.get_num_vertices())) {
-        h_reorder_array_int_cpy = reinterpret_cast<int*>(h_reorder_array);
-    } else {
-        RXMESH_ERROR(
-            "Error: Overflow in casting reorder array uint32_t to int");
+        int* h_reorder_array_int_cpy;
+        if (arr_check_uint32_to_int_cast(h_reorder_array,
+                                         rx.get_num_vertices())) {
+            h_reorder_array_int_cpy = reinterpret_cast<int*>(h_reorder_array);
+        } else {
+            RXMESH_ERROR(
+                "Error: Overflow in casting reorder array uint32_t to int");
+        }
+        // Solving using CHOL
+        A_mat.pre_solve(
+            Solver::CHOL, PermuteMethod::CUSTOM, h_reorder_array_int_cpy);
     }
 
-
-    // Solving using CHOL
-    A_mat.pre_solve(
-        Solver::CHOL, PermuteMethod::CUSTOM, h_reorder_array_int_cpy);
+    // Solve
     A_mat.solve(B_mat, *X_mat);
 
-
     // move the results to the host
     // if we use LU, the data will be on the host and we should not move the
     // device to the host
@@ -315,4 +240,5 @@ void mcf_cusolver_chol_cudaND(rxmesh::RXMeshStatic& rx)
     B_mat.release();
     X_mat->release();
     A_mat.release();
+    GPU_FREE(h_reorder_array);
 }