From 2c4fa9c06b15b1329b360f02284956029b6e58b1 Mon Sep 17 00:00:00 2001
From: "Chance.H" <chancewithchance@outlook.com>
Date: Thu, 26 Sep 2024 09:29:49 +0800
Subject: [PATCH] fix: fix bug in using cuda and make program faster

Squashed commit of the following:

commit 1c36cbb0a779ff807f46f91482724b1f68420966
Author: Chance.H <chancewithchance@outlook.com>
Date:   Thu Sep 26 09:27:52 2024 +0800

    docs: update README and workflow

commit 39808b8297851e66223bc1d6f686c3020ddcc8e1
Author: Chance.H <chancewithchance@outlook.com>
Date:   Thu Sep 26 09:10:22 2024 +0800

    fix: unnecessarily accelerated matrix addition

commit 2ea89ae89b189121f6b81c863fe3f9edc01d40f5
Author: Chance.H <chancewithchance@outlook.com>
Date:   Thu Sep 26 09:05:33 2024 +0800

    fix: unnecessarily accelerated matrix addition

commit 97f6a6f4ae77dbe51939b891ecf0a3bdaa792d20
Author: Chance.H <chancewithchance@outlook.com>
Date:   Thu Sep 26 01:19:34 2024 +0800

    feat: add cusparse_multiply, 14 FPS

commit 306e74ddd91e8f6677154acfcc4b958c4fe74149
Author: Chance.H <chancewithchance@outlook.com>
Date:   Wed Sep 25 23:06:01 2024 +0800

    fix: fix bug of shader and make program faster

commit 343401c5af82ad407ded0b097da150083a1b122b
Author: Chance.H <chancewithchance@outlook.com>
Date:   Wed Sep 25 20:52:18 2024 +0800

    fix: use cuda successfully

commit f19e676fe15caeaf2110b7bd4359437e516aba47
Author: Chance.H <chancewithchance@outlook.com>
Date:   Tue Sep 24 23:54:14 2024 +0800

    to: use cuda, still run with bug
---
 .github/workflows/workflow.yml                |  18 +-
 CMakePresets.json                             |  12 +-
 README.md                                     |  49 +-
 cmake/3rd.cmake                               |  12 +
 current_state.txt                             |  76 ++
 system_matrix.txt                             | 772 ++++++++++++++++++
 test/system_test/MassSpring3D/CMakeLists.txt  |  42 +-
 .../MassSpring3D/cusparse_cholesky_solver.cpp | 400 +++++++++
 .../MassSpring3D/cusparse_wrapper.cpp         | 202 +++++
 .../MassSpring3D/dtkPhysMassSpringSolver.cpp  | 268 +++---
 .../include/cusparse_cholesky_solver.h        |  30 +
 .../MassSpring3D/include/cusparse_wrapper.h   | 102 +++
 .../MassSpring3D/include/device_buffer.h      |  70 ++
 .../include/dtkPhysMassSpringSolver.h         | 137 +++-
 test/system_test/MassSpring3D/include/macro.h |  13 +
 .../system_test/MassSpring3D/include/step.cuh |  19 +-
 test/system_test/MassSpring3D/main.cpp        |  19 +-
 .../MassSpring3D/shaders/basic.vshader        |   2 +-
 .../MassSpring3D/shaders/phong.fshader        |   2 +-
 test/system_test/MassSpring3D/step.cu         |  95 +--
 20 files changed, 2040 insertions(+), 300 deletions(-)
 create mode 100644 current_state.txt
 create mode 100644 system_matrix.txt
 create mode 100644 test/system_test/MassSpring3D/cusparse_cholesky_solver.cpp
 create mode 100644 test/system_test/MassSpring3D/cusparse_wrapper.cpp
 create mode 100644 test/system_test/MassSpring3D/include/cusparse_cholesky_solver.h
 create mode 100644 test/system_test/MassSpring3D/include/cusparse_wrapper.h
 create mode 100644 test/system_test/MassSpring3D/include/device_buffer.h
 create mode 100644 test/system_test/MassSpring3D/include/macro.h

diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
index f594c2f..0b9a345 100644
--- a/.github/workflows/workflow.yml
+++ b/.github/workflows/workflow.yml
@@ -43,12 +43,28 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
+      - name: Check for CUDA
+        id: check_cuda
+        run: |
+          if command -v nvcc &> /dev/null; then
+            echo "CUDA is installed."
+            echo "has_cuda=true" >> $GITHUB_ENV
+          else
+            echo "CUDA is not installed."
+            echo "has_cuda=false" >> $GITHUB_ENV
+          fi
+
       - name: Install dependencies
         run: |
           sudo apt update
           sudo apt install --fix-missing -y doxygen graphviz clang-format clang-tidy cppcheck lcov
           sudo apt install --fix-missing -y gcc g++ libspdlog-dev libcgal-dev freeglut3-dev libboost-all-dev libvtk9-dev qtbase5-dev xorg-dev libglu1-mesa-dev libglm-dev libglfw3-dev libglew-dev
           sudo apt-get install --no-install-recommends -y xvfb ffmpeg
+          
+          # Optionally install CUDA dependencies if CUDA is available
+          if [ "$has_cuda" = "true" ]; then
+            sudo apt install --fix-missing -y cuda-toolkit
+          fi
 
       - name: Build
         run: |
@@ -68,7 +84,7 @@ jobs:
             ffmpeg -y -f x11grab -video_size 800x600 -i :99 -t 5 -r 10 ./build/bin/demo2d_$i.gif &
             FFmpeg_PID=$!
             # Run the demo2d program in the background
-            ./build/bin/demo2d --instruction $i &
+            ./build/bin/demo2d --instruction $i --edge_num 33 &
             Demo2d_PID=$!
             # Wait for 5 seconds
             sleep 5
diff --git a/CMakePresets.json b/CMakePresets.json
index 8102f43..1df6a7f 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -15,16 +15,14 @@
         "CMAKE_C_STANDARD": "17",
         "CMAKE_C_STANDARD_REQUIRED": "ON",
         "CMAKE_CXX_EXTENSIONS": "OFF",
-        "CMAKE_CXX_STANDARD": "20",
+        "CMAKE_CXX_STANDARD": "17",
         "CMAKE_CXX_STANDARD_REQUIRED": "ON"
       }
     },
     {
       "name": "configurePresets_base",
       "hidden": true,
-      "inherits": [
-        "std"
-      ],
+      "inherits": ["std"],
       "displayName": "configurePresets_base",
       "description": "base configurePresets",
       "binaryDir": "${sourceDir}/build",
@@ -54,11 +52,9 @@
     {
       "name": "build",
       "hidden": false,
-      "inherits": [
-        "configurePresets_base"
-      ],
+      "inherits": ["configurePresets_base"],
       "displayName": "build",
       "description": "build"
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/README.md b/README.md
index 2b2ddcf..cc52a4e 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,10 @@ dtk Render by Opengl.
 
 [ The Visualization ToolKit (VTK)](https://vtk.org/Wiki/VTK) is an open source, freely available software system for 3D computer graphics, image processing, and visualization used by thousands of researchers and developers around the world. VTK consists of a C++ class library, and several interpreted interface layers including Tcl/Tk, Java, and Python. Professional support and products for VTK are provided by [Kitware, Inc.](http://www.kitware.com/) VTK supports a wide variety of visualization algorithms including scalar, vector, tensor, texture, and volumetric methods; and advanced modeling techniques such as implicit modelling, polygon reduction, mesh smoothing, cutting, contouring, and Delaunay triangulation. In addition, dozens of imaging algorithms have been directly integrated to allow the user to mix 2D imaging / 3D graphics algorithms and data.
 
+## [Cuda](https://developer.nvidia.com/cuda-zone) \(Optional\)
+
+CUDA® is a parallel computing platform and programming model developed by NVIDIA for general computing on graphical processing units (GPUs). With CUDA, developers are able to dramatically speed up computing applications by harnessing the power of GPUs.
+
 ## How to build
 
 dtk is build by CMake. so you can build easily. You could build this project either in windows or linux. [tutorial on cmake build](https://preshing.com/20170511/how-to-build-a-cmake-based-project/)
@@ -178,46 +182,7 @@ $ cmake --build "DESTINATION_SHARED_DIR" --config Release --target install
 
 ## Demo with dtk
 
-### Rigid body Simulation Demo
-
- A physical simulation demo for 2D rigid body in real time. It deals with the collision of the rigid body which has no deformation by SAT and AABB methods.
-
-<div align="center">
-  <img src="./demo/img/dtkRigidBodySimulation.gif" width = "50%", height = "50%"/>
-</div>
-
-### Finite Element Method Simulation
-
-A finite element method physical simulation for 2D hyperelasticity deformation meterial body in real time.
-
-<div align="center">
-  <img src="./demo/img/dtkFemSimulation.gif" width = "50%", height = "50%"/>
-</div>
-
-### SPH Methods Simulation
-
-A physical simulation demo for 2D fluid in real time. It deals with the fluid body by a series of SPH methods, which include WCSPH, PCISPH and DFSPH.
-
-<div align = center>
-<img src="./demo/img/WCSPH.gif" width = "30%", height = "30%" /><img src="./demo/img/PCISPH.gif" width = "30%", height = "30%"/><img src="./demo/img/DFSPH.gif" width = "30%", height = "30%"/>
-</div>
-
-The SPH method is WCSPH, PCISPH and DFSPH from left to right.
-
-### Guidewire Simulation
-
-A blood flow induced physical simulation of guidewire shape for virtual vascular intervention training system in real time. Virtual vascular intervention training system, which is a low cost, safe and effective solution, is able to provide an immersive virtual training environment for trainees.
-
-<div align="center">
-  <img src="./demo/img/guideWire.jpg" width = "50%", height = "50%" />
-</div>
-### Finite Element Method Simulation in 3D
-
-<div align="center">
-  <img src="./demo/img/3dFEM.gif" width = "50%", height = "50%" />
-</div>
-
-## Demo2d
+### Demo2d
 
 <div align="center">
   <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 10px; max-width: 800px; margin: auto;">
@@ -231,7 +196,9 @@ A blood flow induced physical simulation of guidewire shape for virtual vascular
   </div>
 </div>
 
-## MassSpring3D
+### MassSpring3D
+
+CUDA can be used to accelerate the simulation of mass spring system. 
 
 <div align="center">
   <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 10px; max-width: 800px; margin: auto;">
diff --git a/cmake/3rd.cmake b/cmake/3rd.cmake
index 19ca1c9..33ac300 100644
--- a/cmake/3rd.cmake
+++ b/cmake/3rd.cmake
@@ -219,4 +219,16 @@ find_package(glfw3 REQUIRED)
 if (NOT glfw3_FOUND)
     message(FATAL_ERROR "glfw3 not found.\n"
             "Following https://www.glfw.org to install.")
+endif ()
+
+find_package(GLEW REQUIRED)
+if (NOT GLEW_FOUND)
+    message(FATAL_ERROR "GLEW not found.\n"
+            "Following http://glew.sourceforge.net to install.")
+endif ()
+
+find_package(CUDAToolkit 11.7 REQUIRED)
+if (NOT CUDAToolkit_FOUND)
+    message(FATAL_ERROR "CUDAToolkit not found.\n"
+            "Following https://developer.nvidia.com/cuda-downloads to install.")
 endif ()
\ No newline at end of file
diff --git a/current_state.txt b/current_state.txt
new file mode 100644
index 0000000..0b24cc0
--- /dev/null
+++ b/current_state.txt
@@ -0,0 +1,76 @@
+default constructor:0x5555556396c0
+    -1.00297
+     1.00297
+  -0.0006272
+    -0.50019
+     1.00355
+  -0.0006272
+-2.03748e-09
+     1.00354
+  -0.0006272
+     0.50019
+     1.00355
+  -0.0006272
+     1.00297
+     1.00297
+  -0.0006272
+    -1.00355
+     0.50019
+  -0.0006272
+   -0.500268
+    0.500268
+  -0.0006272
+-9.07818e-09
+     0.50028
+  -0.0006272
+    0.500268
+    0.500268
+  -0.0006272
+     1.00355
+     0.50019
+  -0.0006272
+    -1.00354
+ 3.49283e-09
+  -0.0006272
+    -0.50028
+           0
+  -0.0006272
+ 6.65299e-09
+-6.56942e-09
+  -0.0006272
+     0.50028
+-7.09866e-10
+  -0.0006272
+     1.00354
+-4.36441e-10
+  -0.0006272
+    -1.00355
+    -0.50019
+  -0.0006272
+   -0.500268
+   -0.500268
+  -0.0006272
+-1.56147e-09
+    -0.50028
+  -0.0006272
+    0.500268
+   -0.500268
+  -0.0006272
+     1.00355
+    -0.50019
+  -0.0006272
+    -1.00297
+    -1.00297
+  -0.0006272
+    -0.50019
+    -1.00355
+  -0.0006272
+  2.9097e-09
+    -1.00354
+  -0.0006272
+     0.50019
+    -1.00355
+  -0.0006272
+     1.00297
+    -1.00297
+  -0.0006272
\ No newline at end of file
diff --git a/system_matrix.txt b/system_matrix.txt
new file mode 100644
index 0000000..92c26a2
--- /dev/null
+++ b/system_matrix.txt
@@ -0,0 +1,772 @@
+default constructor:0x55555563a6c0
+Element at (0, 0) = 0.00232
+Element at (0, 1) = -6.4e-05
+Element at (0, 2) = -6.4e-05
+Element at (0, 3) = -6.4e-05
+Element at (0, 14) = -6.4e-05
+Element at (0, 21) = -6.4e-05
+Element at (1, 0) = -6.4e-05
+Element at (1, 1) = 0.002384
+Element at (1, 2) = -6.4e-05
+Element at (1, 3) = -6.4e-05
+Element at (1, 14) = -1.13687e-13
+Element at (1, 15) = -6.4e-05
+Element at (1, 17) = -6.4e-05
+Element at (1, 21) = -6.4e-05
+Element at (2, 0) = -6.4e-05
+Element at (2, 1) = -6.4e-05
+Element at (2, 2) = 0.002384
+Element at (2, 3) = -6.4e-05
+Element at (2, 14) = -6.4e-05
+Element at (2, 15) = -6.4e-05
+Element at (2, 17) = -6.4e-05
+Element at (2, 21) = 0
+Element at (3, 0) = -6.4e-05
+Element at (3, 1) = -6.4e-05
+Element at (3, 2) = -6.4e-05
+Element at (3, 3) = 0.002512
+Element at (3, 14) = -6.4e-05
+Element at (3, 15) = -6.4e-05
+Element at (3, 16) = -6.4e-05
+Element at (3, 17) = -6.4e-05
+Element at (3, 21) = -6.4e-05
+Element at (4, 4) = 0.00232
+Element at (4, 5) = -6.4e-05
+Element at (4, 6) = -6.4e-05
+Element at (4, 7) = -6.4e-05
+Element at (4, 12) = -6.4e-05
+Element at (4, 23) = -6.4e-05
+Element at (5, 4) = -6.4e-05
+Element at (5, 5) = 0.002384
+Element at (5, 6) = -6.4e-05
+Element at (5, 7) = -6.4e-05
+Element at (5, 12) = -1.13687e-13
+Element at (5, 13) = -6.4e-05
+Element at (5, 18) = -6.4e-05
+Element at (5, 23) = -6.4e-05
+Element at (6, 4) = -6.4e-05
+Element at (6, 5) = -6.4e-05
+Element at (6, 6) = 0.002384
+Element at (6, 7) = -6.4e-05
+Element at (6, 12) = -6.4e-05
+Element at (6, 13) = -6.4e-05
+Element at (6, 18) = -6.4e-05
+Element at (6, 23) = 0
+Element at (7, 4) = -6.4e-05
+Element at (7, 5) = -6.4e-05
+Element at (7, 6) = -6.4e-05
+Element at (7, 7) = 0.002512
+Element at (7, 12) = -6.4e-05
+Element at (7, 13) = -6.4e-05
+Element at (7, 16) = -6.4e-05
+Element at (7, 18) = -6.4e-05
+Element at (7, 23) = -6.4e-05
+Element at (8, 8) = 0.00232
+Element at (8, 9) = -6.4e-05
+Element at (8, 10) = -6.4e-05
+Element at (8, 11) = -6.4e-05
+Element at (8, 12) = -6.4e-05
+Element at (8, 14) = -6.4e-05
+Element at (9, 8) = -6.4e-05
+Element at (9, 9) = 0.002384
+Element at (9, 10) = -6.4e-05
+Element at (9, 11) = -6.4e-05
+Element at (9, 12) = -1.13687e-13
+Element at (9, 13) = -6.4e-05
+Element at (9, 14) = -6.4e-05
+Element at (9, 15) = -6.4e-05
+Element at (10, 8) = -6.4e-05
+Element at (10, 9) = -6.4e-05
+Element at (10, 10) = 0.002384
+Element at (10, 11) = -6.4e-05
+Element at (10, 12) = -6.4e-05
+Element at (10, 13) = -6.4e-05
+Element at (10, 14) = 0
+Element at (10, 15) = -6.4e-05
+Element at (11, 8) = -6.4e-05
+Element at (11, 9) = -6.4e-05
+Element at (11, 10) = -6.4e-05
+Element at (11, 11) = 0.002512
+Element at (11, 12) = -6.4e-05
+Element at (11, 13) = -6.4e-05
+Element at (11, 14) = -6.4e-05
+Element at (11, 15) = -6.4e-05
+Element at (11, 16) = -6.4e-05
+Element at (12, 4) = -6.4e-05
+Element at (12, 5) = -1.13687e-13
+Element at (12, 6) = -6.4e-05
+Element at (12, 7) = -6.4e-05
+Element at (12, 8) = -6.4e-05
+Element at (12, 9) = -1.13687e-13
+Element at (12, 10) = -6.4e-05
+Element at (12, 11) = -6.4e-05
+Element at (12, 12) = 0.002512
+Element at (12, 13) = -6.4e-05
+Element at (12, 14) = 2.27374e-13
+Element at (12, 15) = 0
+Element at (12, 16) = -6.4e-05
+Element at (12, 18) = 0
+Element at (12, 23) = 2.27374e-13
+Element at (13, 5) = -6.4e-05
+Element at (13, 6) = -6.4e-05
+Element at (13, 7) = -6.4e-05
+Element at (13, 9) = -6.4e-05
+Element at (13, 10) = -6.4e-05
+Element at (13, 11) = -6.4e-05
+Element at (13, 12) = -6.4e-05
+Element at (13, 13) = 0.00264
+Element at (13, 14) = -2.27374e-13
+Element at (13, 15) = -6.4e-05
+Element at (13, 16) = -6.4e-05
+Element at (13, 18) = -6.4e-05
+Element at (13, 23) = -2.27374e-13
+Element at (14, 0) = -6.4e-05
+Element at (14, 1) = -1.13687e-13
+Element at (14, 2) = -6.4e-05
+Element at (14, 3) = -6.4e-05
+Element at (14, 8) = -6.4e-05
+Element at (14, 9) = -6.4e-05
+Element at (14, 10) = 0
+Element at (14, 11) = -6.4e-05
+Element at (14, 12) = 2.27374e-13
+Element at (14, 13) = -2.27374e-13
+Element at (14, 14) = 0.002512
+Element at (14, 15) = -6.4e-05
+Element at (14, 16) = -6.4e-05
+Element at (14, 17) = -2.27374e-13
+Element at (14, 18) = 0
+Element at (14, 21) = 2.27374e-13
+Element at (14, 23) = 0
+Element at (15, 1) = -6.4e-05
+Element at (15, 2) = -6.4e-05
+Element at (15, 3) = -6.4e-05
+Element at (15, 9) = -6.4e-05
+Element at (15, 10) = -6.4e-05
+Element at (15, 11) = -6.4e-05
+Element at (15, 12) = 0
+Element at (15, 13) = -6.4e-05
+Element at (15, 14) = -6.4e-05
+Element at (15, 15) = 0.00264
+Element at (15, 16) = -6.4e-05
+Element at (15, 17) = -6.4e-05
+Element at (15, 18) = 1.13687e-13
+Element at (15, 21) = 0
+Element at (15, 23) = -7.10543e-15
+Element at (16, 3) = -6.4e-05
+Element at (16, 7) = -6.4e-05
+Element at (16, 11) = -6.4e-05
+Element at (16, 12) = -6.4e-05
+Element at (16, 13) = -6.4e-05
+Element at (16, 14) = -6.4e-05
+Element at (16, 15) = -6.4e-05
+Element at (16, 16) = 0.002768
+Element at (16, 17) = -6.4e-05
+Element at (16, 18) = -6.4e-05
+Element at (16, 21) = -6.4e-05
+Element at (16, 22) = -6.4e-05
+Element at (16, 23) = -6.4e-05
+Element at (17, 1) = -6.4e-05
+Element at (17, 2) = -6.4e-05
+Element at (17, 3) = -6.4e-05
+Element at (17, 14) = -2.27374e-13
+Element at (17, 15) = -6.4e-05
+Element at (17, 16) = -6.4e-05
+Element at (17, 17) = 0.00264
+Element at (17, 18) = -6.4e-05
+Element at (17, 20) = -6.4e-05
+Element at (17, 21) = -6.4e-05
+Element at (17, 22) = -6.4e-05
+Element at (17, 23) = 1.13687e-13
+Element at (17, 24) = -6.4e-05
+Element at (18, 5) = -6.4e-05
+Element at (18, 6) = -6.4e-05
+Element at (18, 7) = -6.4e-05
+Element at (18, 12) = 0
+Element at (18, 13) = -6.4e-05
+Element at (18, 14) = 0
+Element at (18, 15) = 1.13687e-13
+Element at (18, 16) = -6.4e-05
+Element at (18, 17) = -6.4e-05
+Element at (18, 18) = 0.00264
+Element at (18, 20) = -6.4e-05
+Element at (18, 21) = 2.27374e-13
+Element at (18, 22) = -6.4e-05
+Element at (18, 23) = -6.4e-05
+Element at (18, 24) = -6.4e-05
+Element at (19, 19) = 0.00232
+Element at (19, 20) = -6.4e-05
+Element at (19, 21) = -6.4e-05
+Element at (19, 22) = -6.4e-05
+Element at (19, 23) = -6.4e-05
+Element at (19, 24) = -6.4e-05
+Element at (20, 17) = -6.4e-05
+Element at (20, 18) = -6.4e-05
+Element at (20, 19) = -6.4e-05
+Element at (20, 20) = 0.002384
+Element at (20, 21) = -6.4e-05
+Element at (20, 22) = -6.4e-05
+Element at (20, 23) = 0
+Element at (20, 24) = -6.4e-05
+Element at (21, 0) = -6.4e-05
+Element at (21, 1) = -6.4e-05
+Element at (21, 2) = 0
+Element at (21, 3) = -6.4e-05
+Element at (21, 14) = 2.27374e-13
+Element at (21, 15) = 0
+Element at (21, 16) = -6.4e-05
+Element at (21, 17) = -6.4e-05
+Element at (21, 18) = 2.27374e-13
+Element at (21, 19) = -6.4e-05
+Element at (21, 20) = -6.4e-05
+Element at (21, 21) = 0.002512
+Element at (21, 22) = -6.4e-05
+Element at (21, 23) = -2.27374e-13
+Element at (21, 24) = 0
+Element at (22, 16) = -6.4e-05
+Element at (22, 17) = -6.4e-05
+Element at (22, 18) = -6.4e-05
+Element at (22, 19) = -6.4e-05
+Element at (22, 20) = -6.4e-05
+Element at (22, 21) = -6.4e-05
+Element at (22, 22) = 0.002512
+Element at (22, 23) = -6.4e-05
+Element at (22, 24) = -6.4e-05
+Element at (23, 4) = -6.4e-05
+Element at (23, 5) = -6.4e-05
+Element at (23, 6) = 0
+Element at (23, 7) = -6.4e-05
+Element at (23, 12) = 2.27374e-13
+Element at (23, 13) = -2.27374e-13
+Element at (23, 14) = 0
+Element at (23, 15) = -7.10543e-15
+Element at (23, 16) = -6.4e-05
+Element at (23, 17) = 1.13687e-13
+Element at (23, 18) = -6.4e-05
+Element at (23, 19) = -6.4e-05
+Element at (23, 20) = 0
+Element at (23, 21) = -2.27374e-13
+Element at (23, 22) = -6.4e-05
+Element at (23, 23) = 0.002512
+Element at (23, 24) = -6.4e-05
+Element at (24, 17) = -6.4e-05
+Element at (24, 18) = -6.4e-05
+Element at (24, 19) = -6.4e-05
+Element at (24, 20) = -6.4e-05
+Element at (24, 21) = 0
+Element at (24, 22) = -6.4e-05
+Element at (24, 23) = -6.4e-05
+Element at (24, 24) = 0.002384
+Element at (25, 25) = 0.00232
+Element at (25, 26) = -6.4e-05
+Element at (25, 27) = -6.4e-05
+Element at (25, 28) = -6.4e-05
+Element at (25, 39) = -6.4e-05
+Element at (25, 46) = -6.4e-05
+Element at (26, 25) = -6.4e-05
+Element at (26, 26) = 0.002384
+Element at (26, 27) = -6.4e-05
+Element at (26, 28) = -6.4e-05
+Element at (26, 39) = -1.13687e-13
+Element at (26, 40) = -6.4e-05
+Element at (26, 42) = -6.4e-05
+Element at (26, 46) = -6.4e-05
+Element at (27, 25) = -6.4e-05
+Element at (27, 26) = -6.4e-05
+Element at (27, 27) = 0.002384
+Element at (27, 28) = -6.4e-05
+Element at (27, 39) = -6.4e-05
+Element at (27, 40) = -6.4e-05
+Element at (27, 42) = -6.4e-05
+Element at (27, 46) = 0
+Element at (28, 25) = -6.4e-05
+Element at (28, 26) = -6.4e-05
+Element at (28, 27) = -6.4e-05
+Element at (28, 28) = 0.002512
+Element at (28, 39) = -6.4e-05
+Element at (28, 40) = -6.4e-05
+Element at (28, 41) = -6.4e-05
+Element at (28, 42) = -6.4e-05
+Element at (28, 46) = -6.4e-05
+Element at (29, 29) = 0.00232
+Element at (29, 30) = -6.4e-05
+Element at (29, 31) = -6.4e-05
+Element at (29, 32) = -6.4e-05
+Element at (29, 37) = -6.4e-05
+Element at (29, 48) = -6.4e-05
+Element at (30, 29) = -6.4e-05
+Element at (30, 30) = 0.002384
+Element at (30, 31) = -6.4e-05
+Element at (30, 32) = -6.4e-05
+Element at (30, 37) = -1.13687e-13
+Element at (30, 38) = -6.4e-05
+Element at (30, 43) = -6.4e-05
+Element at (30, 48) = -6.4e-05
+Element at (31, 29) = -6.4e-05
+Element at (31, 30) = -6.4e-05
+Element at (31, 31) = 0.002384
+Element at (31, 32) = -6.4e-05
+Element at (31, 37) = -6.4e-05
+Element at (31, 38) = -6.4e-05
+Element at (31, 43) = -6.4e-05
+Element at (31, 48) = 0
+Element at (32, 29) = -6.4e-05
+Element at (32, 30) = -6.4e-05
+Element at (32, 31) = -6.4e-05
+Element at (32, 32) = 0.002512
+Element at (32, 37) = -6.4e-05
+Element at (32, 38) = -6.4e-05
+Element at (32, 41) = -6.4e-05
+Element at (32, 43) = -6.4e-05
+Element at (32, 48) = -6.4e-05
+Element at (33, 33) = 0.00232
+Element at (33, 34) = -6.4e-05
+Element at (33, 35) = -6.4e-05
+Element at (33, 36) = -6.4e-05
+Element at (33, 37) = -6.4e-05
+Element at (33, 39) = -6.4e-05
+Element at (34, 33) = -6.4e-05
+Element at (34, 34) = 0.002384
+Element at (34, 35) = -6.4e-05
+Element at (34, 36) = -6.4e-05
+Element at (34, 37) = -1.13687e-13
+Element at (34, 38) = -6.4e-05
+Element at (34, 39) = -6.4e-05
+Element at (34, 40) = -6.4e-05
+Element at (35, 33) = -6.4e-05
+Element at (35, 34) = -6.4e-05
+Element at (35, 35) = 0.002384
+Element at (35, 36) = -6.4e-05
+Element at (35, 37) = -6.4e-05
+Element at (35, 38) = -6.4e-05
+Element at (35, 39) = 0
+Element at (35, 40) = -6.4e-05
+Element at (36, 33) = -6.4e-05
+Element at (36, 34) = -6.4e-05
+Element at (36, 35) = -6.4e-05
+Element at (36, 36) = 0.002512
+Element at (36, 37) = -6.4e-05
+Element at (36, 38) = -6.4e-05
+Element at (36, 39) = -6.4e-05
+Element at (36, 40) = -6.4e-05
+Element at (36, 41) = -6.4e-05
+Element at (37, 29) = -6.4e-05
+Element at (37, 30) = -1.13687e-13
+Element at (37, 31) = -6.4e-05
+Element at (37, 32) = -6.4e-05
+Element at (37, 33) = -6.4e-05
+Element at (37, 34) = -1.13687e-13
+Element at (37, 35) = -6.4e-05
+Element at (37, 36) = -6.4e-05
+Element at (37, 37) = 0.002512
+Element at (37, 38) = -6.4e-05
+Element at (37, 39) = 2.27374e-13
+Element at (37, 40) = 0
+Element at (37, 41) = -6.4e-05
+Element at (37, 43) = 0
+Element at (37, 48) = 2.27374e-13
+Element at (38, 30) = -6.4e-05
+Element at (38, 31) = -6.4e-05
+Element at (38, 32) = -6.4e-05
+Element at (38, 34) = -6.4e-05
+Element at (38, 35) = -6.4e-05
+Element at (38, 36) = -6.4e-05
+Element at (38, 37) = -6.4e-05
+Element at (38, 38) = 0.00264
+Element at (38, 39) = -2.27374e-13
+Element at (38, 40) = -6.4e-05
+Element at (38, 41) = -6.4e-05
+Element at (38, 43) = -6.4e-05
+Element at (38, 48) = -2.27374e-13
+Element at (39, 25) = -6.4e-05
+Element at (39, 26) = -1.13687e-13
+Element at (39, 27) = -6.4e-05
+Element at (39, 28) = -6.4e-05
+Element at (39, 33) = -6.4e-05
+Element at (39, 34) = -6.4e-05
+Element at (39, 35) = 0
+Element at (39, 36) = -6.4e-05
+Element at (39, 37) = 2.27374e-13
+Element at (39, 38) = -2.27374e-13
+Element at (39, 39) = 0.002512
+Element at (39, 40) = -6.4e-05
+Element at (39, 41) = -6.4e-05
+Element at (39, 42) = -2.27374e-13
+Element at (39, 43) = 0
+Element at (39, 46) = 2.27374e-13
+Element at (39, 48) = 0
+Element at (40, 26) = -6.4e-05
+Element at (40, 27) = -6.4e-05
+Element at (40, 28) = -6.4e-05
+Element at (40, 34) = -6.4e-05
+Element at (40, 35) = -6.4e-05
+Element at (40, 36) = -6.4e-05
+Element at (40, 37) = 0
+Element at (40, 38) = -6.4e-05
+Element at (40, 39) = -6.4e-05
+Element at (40, 40) = 0.00264
+Element at (40, 41) = -6.4e-05
+Element at (40, 42) = -6.4e-05
+Element at (40, 43) = 1.13687e-13
+Element at (40, 46) = 0
+Element at (40, 48) = -7.10543e-15
+Element at (41, 28) = -6.4e-05
+Element at (41, 32) = -6.4e-05
+Element at (41, 36) = -6.4e-05
+Element at (41, 37) = -6.4e-05
+Element at (41, 38) = -6.4e-05
+Element at (41, 39) = -6.4e-05
+Element at (41, 40) = -6.4e-05
+Element at (41, 41) = 0.002768
+Element at (41, 42) = -6.4e-05
+Element at (41, 43) = -6.4e-05
+Element at (41, 46) = -6.4e-05
+Element at (41, 47) = -6.4e-05
+Element at (41, 48) = -6.4e-05
+Element at (42, 26) = -6.4e-05
+Element at (42, 27) = -6.4e-05
+Element at (42, 28) = -6.4e-05
+Element at (42, 39) = -2.27374e-13
+Element at (42, 40) = -6.4e-05
+Element at (42, 41) = -6.4e-05
+Element at (42, 42) = 0.00264
+Element at (42, 43) = -6.4e-05
+Element at (42, 45) = -6.4e-05
+Element at (42, 46) = -6.4e-05
+Element at (42, 47) = -6.4e-05
+Element at (42, 48) = 1.13687e-13
+Element at (42, 49) = -6.4e-05
+Element at (43, 30) = -6.4e-05
+Element at (43, 31) = -6.4e-05
+Element at (43, 32) = -6.4e-05
+Element at (43, 37) = 0
+Element at (43, 38) = -6.4e-05
+Element at (43, 39) = 0
+Element at (43, 40) = 1.13687e-13
+Element at (43, 41) = -6.4e-05
+Element at (43, 42) = -6.4e-05
+Element at (43, 43) = 0.00264
+Element at (43, 45) = -6.4e-05
+Element at (43, 46) = 2.27374e-13
+Element at (43, 47) = -6.4e-05
+Element at (43, 48) = -6.4e-05
+Element at (43, 49) = -6.4e-05
+Element at (44, 44) = 0.00232
+Element at (44, 45) = -6.4e-05
+Element at (44, 46) = -6.4e-05
+Element at (44, 47) = -6.4e-05
+Element at (44, 48) = -6.4e-05
+Element at (44, 49) = -6.4e-05
+Element at (45, 42) = -6.4e-05
+Element at (45, 43) = -6.4e-05
+Element at (45, 44) = -6.4e-05
+Element at (45, 45) = 0.002384
+Element at (45, 46) = -6.4e-05
+Element at (45, 47) = -6.4e-05
+Element at (45, 48) = 0
+Element at (45, 49) = -6.4e-05
+Element at (46, 25) = -6.4e-05
+Element at (46, 26) = -6.4e-05
+Element at (46, 27) = 0
+Element at (46, 28) = -6.4e-05
+Element at (46, 39) = 2.27374e-13
+Element at (46, 40) = 0
+Element at (46, 41) = -6.4e-05
+Element at (46, 42) = -6.4e-05
+Element at (46, 43) = 2.27374e-13
+Element at (46, 44) = -6.4e-05
+Element at (46, 45) = -6.4e-05
+Element at (46, 46) = 0.002512
+Element at (46, 47) = -6.4e-05
+Element at (46, 48) = -2.27374e-13
+Element at (46, 49) = 0
+Element at (47, 41) = -6.4e-05
+Element at (47, 42) = -6.4e-05
+Element at (47, 43) = -6.4e-05
+Element at (47, 44) = -6.4e-05
+Element at (47, 45) = -6.4e-05
+Element at (47, 46) = -6.4e-05
+Element at (47, 47) = 0.002512
+Element at (47, 48) = -6.4e-05
+Element at (47, 49) = -6.4e-05
+Element at (48, 29) = -6.4e-05
+Element at (48, 30) = -6.4e-05
+Element at (48, 31) = 0
+Element at (48, 32) = -6.4e-05
+Element at (48, 37) = 2.27374e-13
+Element at (48, 38) = -2.27374e-13
+Element at (48, 39) = 0
+Element at (48, 40) = -7.10543e-15
+Element at (48, 41) = -6.4e-05
+Element at (48, 42) = 1.13687e-13
+Element at (48, 43) = -6.4e-05
+Element at (48, 44) = -6.4e-05
+Element at (48, 45) = 0
+Element at (48, 46) = -2.27374e-13
+Element at (48, 47) = -6.4e-05
+Element at (48, 48) = 0.002512
+Element at (48, 49) = -6.4e-05
+Element at (49, 42) = -6.4e-05
+Element at (49, 43) = -6.4e-05
+Element at (49, 44) = -6.4e-05
+Element at (49, 45) = -6.4e-05
+Element at (49, 46) = 0
+Element at (49, 47) = -6.4e-05
+Element at (49, 48) = -6.4e-05
+Element at (49, 49) = 0.002384
+Element at (50, 50) = 0.00232
+Element at (50, 51) = -6.4e-05
+Element at (50, 52) = -6.4e-05
+Element at (50, 53) = -6.4e-05
+Element at (50, 64) = -6.4e-05
+Element at (50, 71) = -6.4e-05
+Element at (51, 50) = -6.4e-05
+Element at (51, 51) = 0.002384
+Element at (51, 52) = -6.4e-05
+Element at (51, 53) = -6.4e-05
+Element at (51, 64) = -1.13687e-13
+Element at (51, 65) = -6.4e-05
+Element at (51, 67) = -6.4e-05
+Element at (51, 71) = -6.4e-05
+Element at (52, 50) = -6.4e-05
+Element at (52, 51) = -6.4e-05
+Element at (52, 52) = 0.002384
+Element at (52, 53) = -6.4e-05
+Element at (52, 64) = -6.4e-05
+Element at (52, 65) = -6.4e-05
+Element at (52, 67) = -6.4e-05
+Element at (52, 71) = 0
+Element at (53, 50) = -6.4e-05
+Element at (53, 51) = -6.4e-05
+Element at (53, 52) = -6.4e-05
+Element at (53, 53) = 0.002512
+Element at (53, 64) = -6.4e-05
+Element at (53, 65) = -6.4e-05
+Element at (53, 66) = -6.4e-05
+Element at (53, 67) = -6.4e-05
+Element at (53, 71) = -6.4e-05
+Element at (54, 54) = 0.00232
+Element at (54, 55) = -6.4e-05
+Element at (54, 56) = -6.4e-05
+Element at (54, 57) = -6.4e-05
+Element at (54, 62) = -6.4e-05
+Element at (54, 73) = -6.4e-05
+Element at (55, 54) = -6.4e-05
+Element at (55, 55) = 0.002384
+Element at (55, 56) = -6.4e-05
+Element at (55, 57) = -6.4e-05
+Element at (55, 62) = -1.13687e-13
+Element at (55, 63) = -6.4e-05
+Element at (55, 68) = -6.4e-05
+Element at (55, 73) = -6.4e-05
+Element at (56, 54) = -6.4e-05
+Element at (56, 55) = -6.4e-05
+Element at (56, 56) = 0.002384
+Element at (56, 57) = -6.4e-05
+Element at (56, 62) = -6.4e-05
+Element at (56, 63) = -6.4e-05
+Element at (56, 68) = -6.4e-05
+Element at (56, 73) = 0
+Element at (57, 54) = -6.4e-05
+Element at (57, 55) = -6.4e-05
+Element at (57, 56) = -6.4e-05
+Element at (57, 57) = 0.002512
+Element at (57, 62) = -6.4e-05
+Element at (57, 63) = -6.4e-05
+Element at (57, 66) = -6.4e-05
+Element at (57, 68) = -6.4e-05
+Element at (57, 73) = -6.4e-05
+Element at (58, 58) = 0.00232
+Element at (58, 59) = -6.4e-05
+Element at (58, 60) = -6.4e-05
+Element at (58, 61) = -6.4e-05
+Element at (58, 62) = -6.4e-05
+Element at (58, 64) = -6.4e-05
+Element at (59, 58) = -6.4e-05
+Element at (59, 59) = 0.002384
+Element at (59, 60) = -6.4e-05
+Element at (59, 61) = -6.4e-05
+Element at (59, 62) = -1.13687e-13
+Element at (59, 63) = -6.4e-05
+Element at (59, 64) = -6.4e-05
+Element at (59, 65) = -6.4e-05
+Element at (60, 58) = -6.4e-05
+Element at (60, 59) = -6.4e-05
+Element at (60, 60) = 0.002384
+Element at (60, 61) = -6.4e-05
+Element at (60, 62) = -6.4e-05
+Element at (60, 63) = -6.4e-05
+Element at (60, 64) = 0
+Element at (60, 65) = -6.4e-05
+Element at (61, 58) = -6.4e-05
+Element at (61, 59) = -6.4e-05
+Element at (61, 60) = -6.4e-05
+Element at (61, 61) = 0.002512
+Element at (61, 62) = -6.4e-05
+Element at (61, 63) = -6.4e-05
+Element at (61, 64) = -6.4e-05
+Element at (61, 65) = -6.4e-05
+Element at (61, 66) = -6.4e-05
+Element at (62, 54) = -6.4e-05
+Element at (62, 55) = -1.13687e-13
+Element at (62, 56) = -6.4e-05
+Element at (62, 57) = -6.4e-05
+Element at (62, 58) = -6.4e-05
+Element at (62, 59) = -1.13687e-13
+Element at (62, 60) = -6.4e-05
+Element at (62, 61) = -6.4e-05
+Element at (62, 62) = 0.002512
+Element at (62, 63) = -6.4e-05
+Element at (62, 64) = 2.27374e-13
+Element at (62, 65) = 0
+Element at (62, 66) = -6.4e-05
+Element at (62, 68) = 0
+Element at (62, 73) = 2.27374e-13
+Element at (63, 55) = -6.4e-05
+Element at (63, 56) = -6.4e-05
+Element at (63, 57) = -6.4e-05
+Element at (63, 59) = -6.4e-05
+Element at (63, 60) = -6.4e-05
+Element at (63, 61) = -6.4e-05
+Element at (63, 62) = -6.4e-05
+Element at (63, 63) = 0.00264
+Element at (63, 64) = -2.27374e-13
+Element at (63, 65) = -6.4e-05
+Element at (63, 66) = -6.4e-05
+Element at (63, 68) = -6.4e-05
+Element at (63, 73) = -2.27374e-13
+Element at (64, 50) = -6.4e-05
+Element at (64, 51) = -1.13687e-13
+Element at (64, 52) = -6.4e-05
+Element at (64, 53) = -6.4e-05
+Element at (64, 58) = -6.4e-05
+Element at (64, 59) = -6.4e-05
+Element at (64, 60) = 0
+Element at (64, 61) = -6.4e-05
+Element at (64, 62) = 2.27374e-13
+Element at (64, 63) = -2.27374e-13
+Element at (64, 64) = 0.002512
+Element at (64, 65) = -6.4e-05
+Element at (64, 66) = -6.4e-05
+Element at (64, 67) = -2.27374e-13
+Element at (64, 68) = 0
+Element at (64, 71) = 2.27374e-13
+Element at (64, 73) = 0
+Element at (65, 51) = -6.4e-05
+Element at (65, 52) = -6.4e-05
+Element at (65, 53) = -6.4e-05
+Element at (65, 59) = -6.4e-05
+Element at (65, 60) = -6.4e-05
+Element at (65, 61) = -6.4e-05
+Element at (65, 62) = 0
+Element at (65, 63) = -6.4e-05
+Element at (65, 64) = -6.4e-05
+Element at (65, 65) = 0.00264
+Element at (65, 66) = -6.4e-05
+Element at (65, 67) = -6.4e-05
+Element at (65, 68) = 1.13687e-13
+Element at (65, 71) = 0
+Element at (65, 73) = -7.10543e-15
+Element at (66, 53) = -6.4e-05
+Element at (66, 57) = -6.4e-05
+Element at (66, 61) = -6.4e-05
+Element at (66, 62) = -6.4e-05
+Element at (66, 63) = -6.4e-05
+Element at (66, 64) = -6.4e-05
+Element at (66, 65) = -6.4e-05
+Element at (66, 66) = 0.002768
+Element at (66, 67) = -6.4e-05
+Element at (66, 68) = -6.4e-05
+Element at (66, 71) = -6.4e-05
+Element at (66, 72) = -6.4e-05
+Element at (66, 73) = -6.4e-05
+Element at (67, 51) = -6.4e-05
+Element at (67, 52) = -6.4e-05
+Element at (67, 53) = -6.4e-05
+Element at (67, 64) = -2.27374e-13
+Element at (67, 65) = -6.4e-05
+Element at (67, 66) = -6.4e-05
+Element at (67, 67) = 0.00264
+Element at (67, 68) = -6.4e-05
+Element at (67, 70) = -6.4e-05
+Element at (67, 71) = -6.4e-05
+Element at (67, 72) = -6.4e-05
+Element at (67, 73) = 1.13687e-13
+Element at (67, 74) = -6.4e-05
+Element at (68, 55) = -6.4e-05
+Element at (68, 56) = -6.4e-05
+Element at (68, 57) = -6.4e-05
+Element at (68, 62) = 0
+Element at (68, 63) = -6.4e-05
+Element at (68, 64) = 0
+Element at (68, 65) = 1.13687e-13
+Element at (68, 66) = -6.4e-05
+Element at (68, 67) = -6.4e-05
+Element at (68, 68) = 0.00264
+Element at (68, 70) = -6.4e-05
+Element at (68, 71) = 2.27374e-13
+Element at (68, 72) = -6.4e-05
+Element at (68, 73) = -6.4e-05
+Element at (68, 74) = -6.4e-05
+Element at (69, 69) = 0.00232
+Element at (69, 70) = -6.4e-05
+Element at (69, 71) = -6.4e-05
+Element at (69, 72) = -6.4e-05
+Element at (69, 73) = -6.4e-05
+Element at (69, 74) = -6.4e-05
+Element at (70, 67) = -6.4e-05
+Element at (70, 68) = -6.4e-05
+Element at (70, 69) = -6.4e-05
+Element at (70, 70) = 0.002384
+Element at (70, 71) = -6.4e-05
+Element at (70, 72) = -6.4e-05
+Element at (70, 73) = 0
+Element at (70, 74) = -6.4e-05
+Element at (71, 50) = -6.4e-05
+Element at (71, 51) = -6.4e-05
+Element at (71, 52) = 0
+Element at (71, 53) = -6.4e-05
+Element at (71, 64) = 2.27374e-13
+Element at (71, 65) = 0
+Element at (71, 66) = -6.4e-05
+Element at (71, 67) = -6.4e-05
+Element at (71, 68) = 2.27374e-13
+Element at (71, 69) = -6.4e-05
+Element at (71, 70) = -6.4e-05
+Element at (71, 71) = 0.002512
+Element at (71, 72) = -6.4e-05
+Element at (71, 73) = -2.27374e-13
+Element at (71, 74) = 0
+Element at (72, 66) = -6.4e-05
+Element at (72, 67) = -6.4e-05
+Element at (72, 68) = -6.4e-05
+Element at (72, 69) = -6.4e-05
+Element at (72, 70) = -6.4e-05
+Element at (72, 71) = -6.4e-05
+Element at (72, 72) = 0.002512
+Element at (72, 73) = -6.4e-05
+Element at (72, 74) = -6.4e-05
+Element at (73, 54) = -6.4e-05
+Element at (73, 55) = -6.4e-05
+Element at (73, 56) = 0
+Element at (73, 57) = -6.4e-05
+Element at (73, 62) = 2.27374e-13
+Element at (73, 63) = -2.27374e-13
+Element at (73, 64) = 0
+Element at (73, 65) = -7.10543e-15
+Element at (73, 66) = -6.4e-05
+Element at (73, 67) = 1.13687e-13
+Element at (73, 68) = -6.4e-05
+Element at (73, 69) = -6.4e-05
+Element at (73, 70) = 0
+Element at (73, 71) = -2.27374e-13
+Element at (73, 72) = -6.4e-05
+Element at (73, 73) = 0.002512
+Element at (73, 74) = -6.4e-05
+Element at (74, 67) = -6.4e-05
+Element at (74, 68) = -6.4e-05
+Element at (74, 69) = -6.4e-05
+Element at (74, 70) = -6.4e-05
+Element at (74, 71) = 0
+Element at (74, 72) = -6.4e-05
+Element at (74, 73) = -6.4e-05
+Element at (74, 74) = 0.002384
\ No newline at end of file
diff --git a/test/system_test/MassSpring3D/CMakeLists.txt b/test/system_test/MassSpring3D/CMakeLists.txt
index 9d10299..99e6980 100644
--- a/test/system_test/MassSpring3D/CMakeLists.txt
+++ b/test/system_test/MassSpring3D/CMakeLists.txt
@@ -2,26 +2,42 @@
 # (https://github.com/Simple-XX/SimplePhysicsEngine).
 #
 # CMakeLists.txt for Simple-XX/SimplePhysicsEngine.
+cmake_minimum_required (VERSION 3.8)
 
 project(MassSpring3D)
 
-add_executable(${PROJECT_NAME}
-        main.cpp
-        ClothSimulation.cpp
-        Shader.cpp
-        Renderer.cpp
-        dtkPhysMassSpringSolver.cpp
-        step.cu
+file(GLOB cpu_source_files
+        "main.cpp"
+        "ClothSimulation.cpp"
+        "Shader.cpp"
+        "Renderer.cpp"
+        "dtkPhysMassSpringSolver.cpp"
 )
-
-target_include_directories(${PROJECT_NAME} PRIVATE
-        include
+file(GLOB cuda_source_files
+        "*.cu"
+        "cusparse_cholesky_solver.cpp"
+        "cusparse_wrapper.cpp"
 )
 
-# find_package(CUDA REQUIRED)
-find_package(CUDAToolkit)
 
-target_link_libraries(${PROJECT_NAME} GLEW::GLEW CUDA::cudart CUDA::cusparse)
+if(CUDAToolkit_FOUND)
+        message(STATUS "CUDAToolkit found, compiling with CUDA support")
+        enable_language(CUDA)
+        
+        add_executable(${PROJECT_NAME} ${cpu_source_files} ${cuda_source_files})
+
+        # Set CUDA properties for the target
+        target_include_directories(${PROJECT_NAME} PRIVATE include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+        add_definitions(-DDTK_CUDA)
+
+        target_link_libraries(${PROJECT_NAME} GLEW::GLEW ${CUDA_cudart_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_cusolver_LIBRARY})
+else()
+        message(STATUS "CUDAToolkit not found, compiling without CUDA support")
+        add_executable(${PROJECT_NAME} ${cpu_source_files})
+
+        target_include_directories(${PROJECT_NAME} PRIVATE include)
+        target_link_libraries(${PROJECT_NAME} GLEW::GLEW)
+endif()
 
 if (APPLE)
         target_compile_definitions(${PROJECT_NAME} PRIVATE
diff --git a/test/system_test/MassSpring3D/cusparse_cholesky_solver.cpp b/test/system_test/MassSpring3D/cusparse_cholesky_solver.cpp
new file mode 100644
index 0000000..8c004a4
--- /dev/null
+++ b/test/system_test/MassSpring3D/cusparse_cholesky_solver.cpp
@@ -0,0 +1,400 @@
+#include "cusparse_cholesky_solver.h"
+
+#include <vector>
+#include <algorithm>
+#include <numeric>
+
+#include <cuda_runtime.h>
+#include <cusparse.h>
+#include <cusolverSp.h>
+#include <cusolverSp_LOWLEVEL_PREVIEW.h>
+
+#include "device_buffer.h"
+#include "cusparse_wrapper.h"
+
+struct CusparseHandle
+{
+	CusparseHandle() { init(); }
+	~CusparseHandle() { destroy(); }
+	void init() { cusparseCreate(&handle); }
+	void destroy() { cusparseDestroy(handle); }
+	operator cusparseHandle_t() const { return handle; }
+	CusparseHandle(const CusparseHandle&) = delete;
+	CusparseHandle& operator=(const CusparseHandle&) = delete;
+	cusparseHandle_t handle;
+};
+
+struct CusolverHandle
+{
+	CusolverHandle() { init(); }
+	~CusolverHandle() { destroy(); }
+	void init() { cusolverSpCreate(&handle); }
+	void destroy() { cusolverSpDestroy(handle); }
+	operator cusolverSpHandle_t() const { return handle; }
+	CusolverHandle(const CusolverHandle&) = delete;
+	CusolverHandle& operator=(const CusolverHandle&) = delete;
+	cusolverSpHandle_t handle;
+};
+
+struct CusparseMatDescriptor
+{
+	CusparseMatDescriptor() { init(); }
+	~CusparseMatDescriptor() { destroy(); }
+
+	void init()
+	{
+		cusparseCreateMatDescr(&desc);
+		cusparseSetMatType(desc, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatIndexBase(desc, CUSPARSE_INDEX_BASE_ZERO);
+		cusparseSetMatDiagType(desc, CUSPARSE_DIAG_TYPE_NON_UNIT);
+	}
+
+	void destroy() { cusparseDestroyMatDescr(desc); }
+	operator cusparseMatDescr_t() const { return desc; }
+	CusparseMatDescriptor(const CusparseMatDescriptor&) = delete;
+	CusparseMatDescriptor& operator=(const CusparseMatDescriptor&) = delete;
+	cusparseMatDescr_t desc;
+};
+
+template <typename T>
+class SparseSquareMatrixCSR
+{
+
+public:
+
+	SparseSquareMatrixCSR() : size_(0), nnz_(0) {}
+
+	void resize(int size)
+	{
+		size_ = size;
+		rowPtr_.allocate(size + 1);
+	}
+
+	void resizeNonZeros(int nnz)
+	{
+		nnz_ = nnz;
+		values_.allocate(nnz);
+		colInd_.allocate(nnz);
+	}
+
+	void upload(const T* values = nullptr, const int* rowPtr = nullptr, const int* colInd = nullptr)
+	{
+		if (values)
+			values_.upload(values);
+		if (rowPtr)
+			rowPtr_.upload(rowPtr);
+		if (colInd)
+			colInd_.upload(colInd);
+	}
+
+	T* val() { return values_.data; }
+	int* rowPtr() { return rowPtr_.data; }
+	int* colInd() { return colInd_.data; }
+
+	const T* val() const { return values_.data; }
+	const int* rowPtr() const { return rowPtr_.data; }
+	const int* colInd() const { return colInd_.data; }
+
+	int size() const { return size_; }
+	int nnz() const { return nnz_; }
+
+	cusparseMatDescr_t desc() const { return desc_; }
+
+private:
+
+	DeviceBuffer<T> values_;
+	DeviceBuffer<int> rowPtr_;
+	DeviceBuffer<int> colInd_;
+	int size_, nnz_;
+	CusparseMatDescriptor desc_;
+};
+
+template <typename T>
+class SparseCholesky
+{
+
+public:
+
+	void init(cusolverSpHandle_t handle)
+	{
+		handle_ = handle;
+
+		// create info
+		cusolverSpCreateCsrcholInfo(&info_);
+	}
+
+	void allocateBuffer(const SparseSquareMatrixCSR<T>& A)
+	{
+		size_t internalData, workSpace;
+		cusolverSpXcsrcholBufferInfo(handle_, A.size(), A.nnz(), A.desc(),
+			A.val(), A.rowPtr(), A.colInd(), info_, &internalData, &workSpace);
+		buffer_.allocate(workSpace);
+	}
+
+	bool hasZeroPivot(int* position = nullptr) const
+	{
+		const T tol = static_cast<T>(1e-14);
+		int singularity = -1;
+		cusolverSpXcsrcholZeroPivot(handle_, info_, tol, &singularity);
+		if (position)
+			*position = singularity;
+		return singularity >= 0;
+	}
+
+	bool analyze(const SparseSquareMatrixCSR<T>& A)
+	{
+		cusolverSpXcsrcholAnalysis(handle_, A.size(), A.nnz(), A.desc(), A.rowPtr(), A.colInd(), info_);
+		allocateBuffer(A);
+		return true;
+	}
+
+	bool factorize(SparseSquareMatrixCSR<T>& A)
+	{
+		cusolverSpXcsrcholFactor(handle_, A.size(), A.nnz(), A.desc(),
+			A.val(), A.rowPtr(), A.colInd(), info_, buffer_.data);
+
+		return !hasZeroPivot();
+	}
+
+	void solve(int size, const T* b, T* x)
+	{
+		cusolverSpXcsrcholSolve(handle_, size, b, x, info_, buffer_.data);
+	}
+
+	void destroy()
+	{
+		cusolverSpDestroyCsrcholInfo(info_);
+	}
+
+	~SparseCholesky() { destroy(); }
+
+private:
+
+	cusolverSpHandle_t handle_;
+	csrcholInfo_t info_;
+	DeviceBuffer<unsigned char> buffer_;
+};
+
+class Twist
+{
+
+public:
+
+	Twist()
+	{
+		cusolverSpCreate(&handle_);
+		cusparseCreateMatDescr(&desc_);
+		cusparseSetMatType(desc_, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatIndexBase(desc_, CUSPARSE_INDEX_BASE_ZERO);
+	}
+
+	~Twist()
+	{
+		cusolverSpDestroy(handle_);
+		cusparseDestroyMatDescr(desc_);
+	}
+
+	void allocateBuffer(int n, int nnz, const int* csrRowPtr, const int* csrColInd, const int* P)
+	{
+		size_t bufSize;
+		cusolverSpXcsrperm_bufferSizeHost(handle_, n, n, nnz, desc_,
+			csrRowPtr, csrColInd, P, P, &bufSize);
+		buffer_.resize(bufSize);
+	}
+
+	void compute(int n, int nnz, const int* csrRowPtr, const int* csrColInd, std::vector<int>& P,
+		std::vector<int>& permRowPtr, std::vector<int>& permColInd, std::vector<int>& map)
+	{
+		const int* p = P.data();
+
+		allocateBuffer(n, nnz, csrRowPtr, csrColInd, p);
+
+		permRowPtr.resize(n + 1);
+		permColInd.resize(nnz);
+		map.resize(nnz);
+
+		std::copy(csrRowPtr, csrRowPtr + (n + 1), std::begin(permRowPtr));
+		std::copy(csrColInd, csrColInd + nnz, std::begin(permColInd));
+		std::iota(std::begin(map), std::end(map), 0);
+
+		cusolverSpXcsrpermHost(handle_, n, n, nnz, desc_,
+			permRowPtr.data(), permColInd.data(), p, p, map.data(), buffer_.data());
+	}
+
+	void operator()(int n, int nnz, const int* csrRowPtr, const int* csrColInd, std::vector<int>& P,
+		std::vector<int>& permRowPtr, std::vector<int>& permColInd, std::vector<int>& map)
+	{
+		compute(n, nnz, csrRowPtr, csrColInd, P, permRowPtr, permColInd, map);
+	}
+
+private:
+
+	cusolverSpHandle_t handle_;
+	cusparseMatDescr_t desc_;
+	std::vector<unsigned char> buffer_;
+};
+
+template <typename T>
+class CuSparseCholeskySolverImpl : public CuSparseCholeskySolver<T>
+{
+
+public:
+
+	using Info = typename CuSparseCholeskySolver<T>::Info;
+
+	CuSparseCholeskySolverImpl(int size)
+	{
+		init();
+
+		if (size > 0)
+			resize(size);
+	}
+
+	void init()
+	{
+		cholesky.init(cusolver);
+		doOrdering = false;
+		information = Info::SUCCESS;
+	}
+
+	void resize(int size) override
+	{
+		Acsr.resize(size);
+		d_b.allocate(size);
+		d_x.allocate(size);
+		d_y.allocate(size);
+	}
+
+	void setPermutaion(int size, const int* P) override
+	{
+		h_P.resize(size);
+		h_PT.resize(size);
+
+		for (int i = 0; i < size; i++)
+		{
+			h_P[i] = P[i];
+			h_PT[P[i]] = i;
+		}
+
+		d_P.assign(size, h_P.data());
+		d_PT.assign(size, h_PT.data());
+		d_z.allocate(size);
+
+		doOrdering = true;
+	}
+
+	void analyze(int nnz, const int* csrRowPtr, const int* csrColInd) override
+	{
+		// if permutation is set, apply permutation to A (P*A*PT)
+		if (doOrdering)
+		{
+			twist(Acsr.size(), nnz, csrRowPtr, csrColInd, h_P, permRowPtr, permColInd, h_map);
+			d_map.assign(nnz, h_map.data());
+			csrRowPtr = permRowPtr.data();
+			csrColInd = permColInd.data();
+		}
+
+		// copy input data to device memory
+		Acsr.resizeNonZeros(nnz);
+		Acsr.upload(nullptr, csrRowPtr, csrColInd);
+
+		cholesky.analyze(Acsr);
+	}
+
+	void factorize(const T* A) override
+	{
+		if (doOrdering)
+		{
+			d_values.assign(Acsr.nnz(), A);
+			permute(Acsr.nnz(), d_values.data, Acsr.val(), d_map.data);
+		}
+		else
+		{
+			Acsr.upload(A);
+		}
+
+		// M = L * LT
+		if (!cholesky.factorize(Acsr))
+			information = Info::NUMERICAL_ISSUE;
+	}
+
+	void solve(const T* b, T* x) override
+	{
+		d_b.upload(b);
+
+		if (doOrdering)
+		{
+			// y = P * b
+			permute(Acsr.size(), d_b.data, d_y.data, d_P.data);
+
+			// solve A * z = y
+			cholesky.solve(Acsr.size(), d_y.data, d_z.data);
+
+			// x = PT * z
+			permute(Acsr.size(), d_z.data, d_x.data, d_PT.data);
+		}
+		else
+		{
+			// solve A * x = b
+			cholesky.solve(Acsr.size(), d_b.data, d_x.data);
+		}
+
+		d_x.download(x);
+	}
+
+	Info info() const override
+	{
+		return information;
+	}
+
+	void permute(int size, const T* src, T* dst, const int* P)
+	{
+		cusparseXgthr(cusparse, size, src, dst, P, CUSPARSE_INDEX_BASE_ZERO);
+	}
+
+	void destroy()
+	{
+	}
+
+	~CuSparseCholeskySolverImpl()
+	{
+		destroy();
+	}
+
+private:
+
+	SparseSquareMatrixCSR<T> Acsr;
+
+	DeviceBuffer<T> d_b;
+	DeviceBuffer<T> d_x;
+	DeviceBuffer<T> d_y;
+	DeviceBuffer<T> d_z;
+	DeviceBuffer<T> d_values;
+	DeviceBuffer<int> d_P, d_PT, d_map;
+
+	CusparseHandle cusparse;
+	CusolverHandle cusolver;
+
+	SparseCholesky<T> cholesky;
+
+	Twist twist;
+	std::vector<int> h_P, h_PT, h_map, permRowPtr, permColInd;
+	bool doOrdering;
+
+	Info information;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T>
+typename CuSparseCholeskySolver<T>::Ptr CuSparseCholeskySolver<T>::create(int size)
+{
+	return std::make_unique<CuSparseCholeskySolverImpl<T>>(size);
+}
+
+template<typename T>
+CuSparseCholeskySolver<T>::~CuSparseCholeskySolver()
+{
+}
+
+template class CuSparseCholeskySolver<double>;
+template class CuSparseCholeskySolver<float>;
diff --git a/test/system_test/MassSpring3D/cusparse_wrapper.cpp b/test/system_test/MassSpring3D/cusparse_wrapper.cpp
new file mode 100644
index 0000000..ecfc257
--- /dev/null
+++ b/test/system_test/MassSpring3D/cusparse_wrapper.cpp
@@ -0,0 +1,202 @@
+#include "cusparse_wrapper.h"
+
+// @Chance.H TODO: warning: ‘cusparseStatus_t cusparseDgthr(cusparseHandle_t, int, const double*, double*, const int*, cusparseIndexBase_t)’ is deprecated: please use cusparseGather instead [-Wdeprecated-declarations]
+/* Description: Gather of non-zero elements from dense vector y into
+   sparse vector x. */
+cusparseStatus_t CUSPARSEAPI cusparseXgthr(cusparseHandle_t handle,
+										   int nnz,
+										   const float* y,
+										   float* xVal,
+										   const int* xInd,
+										   cusparseIndexBase_t idxBase)
+{
+	return
+		cusparseSgthr(handle,
+					  nnz,
+					  y,
+					  xVal,
+					  xInd,
+					  idxBase);
+}
+
+/*
+ * Low level API for GPU Cholesky
+ *
+ */
+cusparseStatus_t CUSPARSEAPI cusparseXgthr(cusparseHandle_t handle,
+										   int nnz,
+										   const double* y,
+										   double* xVal,
+										   const int* xInd,
+										   cusparseIndexBase_t idxBase)
+{
+	return
+		cusparseDgthr(handle,
+					  nnz,
+					  y,
+					  xVal,
+					  xInd,
+					  idxBase);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholBufferInfo(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const float* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	size_t* internalDataInBytes,
+	size_t* workspaceInBytes)
+{
+	return
+		cusolverSpScsrcholBufferInfo(
+			handle,
+			n,
+			nnzA,
+			descrA,
+			csrValA,
+			csrRowPtrA,
+			csrColIndA,
+			info,
+			internalDataInBytes,
+			workspaceInBytes);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholBufferInfo(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const double* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	size_t* internalDataInBytes,
+	size_t* workspaceInBytes)
+{
+	return
+		cusolverSpDcsrcholBufferInfo(
+			handle,
+			n,
+			nnzA,
+			descrA,
+			csrValA,
+			csrRowPtrA,
+			csrColIndA,
+			info,
+			internalDataInBytes,
+			workspaceInBytes);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholFactor(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const float* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	void* pBuffer)
+{
+	return
+		cusolverSpScsrcholFactor(handle,
+								 n,
+								 nnzA,
+								 descrA,
+								 csrValA,
+								 csrRowPtrA,
+								 csrColIndA,
+								 info,
+								 pBuffer);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholFactor(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const double* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	void* pBuffer)
+{
+	return
+		cusolverSpDcsrcholFactor(handle,
+								 n,
+								 nnzA,
+								 descrA,
+								 csrValA,
+								 csrRowPtrA,
+								 csrColIndA,
+								 info,
+								 pBuffer);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholZeroPivot(
+	cusolverSpHandle_t handle,
+	csrcholInfo_t info,
+	float tol,
+	int* position)
+{
+	return
+		cusolverSpScsrcholZeroPivot(
+			handle,
+			info,
+			tol,
+			position);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholZeroPivot(
+	cusolverSpHandle_t handle,
+	csrcholInfo_t info,
+	double tol,
+	int* position)
+{
+	return
+		cusolverSpDcsrcholZeroPivot(
+			handle,
+			info,
+			tol,
+			position);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholSolve(
+	cusolverSpHandle_t handle,
+	int n,
+	const float* b,
+	float* x,
+	csrcholInfo_t info,
+	void* pBuffer)
+{
+	return
+		cusolverSpScsrcholSolve(
+			handle,
+			n,
+			b,
+			x,
+			info,
+			pBuffer);
+}
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholSolve(
+	cusolverSpHandle_t handle,
+	int n,
+	const double* b,
+	double* x,
+	csrcholInfo_t info,
+	void* pBuffer)
+{
+	return
+		cusolverSpDcsrcholSolve(
+			handle,
+			n,
+			b,
+			x,
+			info,
+			pBuffer);
+}
diff --git a/test/system_test/MassSpring3D/dtkPhysMassSpringSolver.cpp b/test/system_test/MassSpring3D/dtkPhysMassSpringSolver.cpp
index b7be9a5..76ea2f2 100644
--- a/test/system_test/MassSpring3D/dtkPhysMassSpringSolver.cpp
+++ b/test/system_test/MassSpring3D/dtkPhysMassSpringSolver.cpp
@@ -1,5 +1,4 @@
 #include "dtkPhysMassSpringSolver.h"
-#include "step.cuh"
 
 dtk::dtkPhysMassSpringSolver::dtkPhysMassSpringSolver() {
 }
@@ -65,10 +64,51 @@ dtk::dtkPhysMassSpringSolver::dtkPhysMassSpringSolver(const dtk::dtkPhysMassSpri
     _J.setFromTriplets(JTriplets.begin(), JTriplets.end());
 
     // pre-factor 
-    double h2 = _system->GetTimeStep() * _system->GetTimeStep();
-    SparseMatrix A = _M + h2 * _L;
-    _system_matrix.compute(A);
+    _h2 = _system->GetTimeStep() * _system->GetTimeStep();
+    SparseMatrixCSR A = _M + _h2 * _L;
+
+    // external force (gravity)
+    const dtk::dtkDouble3& fext = _system->GetDefaultGravityAccel();
+    Vector3f gravity(fext.x, fext.y, fext.z);
+    _fext_force = gravity.replicate(_system->GetNumberOfMassPoints(), 1);
+
+#ifdef DTK_CUDA
+    const int n = static_cast<int>(A.rows());
+    const int m = static_cast<int>(A.cols());
+    const int nnz = static_cast<int>(A.nonZeros());
+
+    std::cout << "=== Matrix : " << std::endl;
+    std::cout << "Size       : " << n << " x " << m << std::endl;
+    std::cout << "Non-zeros  : " << nnz << std::endl;
+
+    SparseMatrixCSR Acsr = A; // solver supports CSR format
+    // printSparseMatrix(Acsr);
+    _system_matrix = CuSparseCholeskySolver<float>::create(n);
+
+    bool doOrdering = true;
+    if (doOrdering)
+    {
+        // compute permutation
+        PermutationMatrix P;
+        Ordering ordering;
+        ordering(Acsr.selfadjointView<Eigen::Upper>(), P);
+
+        // set permutation to solver
+        _system_matrix->setPermutaion(n, P.indices().data());
+    }
+
+    _system_matrix->analyze(nnz, Acsr.outerIndexPtr(), Acsr.innerIndexPtr());
+
+    _system_matrix->factorize(Acsr.valuePtr());
 
+    if (_system_matrix->info() != CuSparseCholeskySolver<float>::SUCCESS)
+    {
+        std::cerr << "Factorize failed." << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+#else
+    _system_matrix.compute(A);  // Cholesky 分解
+#endif
 
     int num_springs = _system->GetNumberOfSprings();
     _rest_lengths.resize(num_springs);
@@ -82,6 +122,22 @@ dtk::dtkPhysMassSpringSolver::dtkPhysMassSpringSolver(const dtk::dtkPhysMassSpri
         _spring_indices[2 * i + 1] = spring->GetSecondVertex()->GetPointID();
     }
 
+#ifdef DTK_CUDA
+    // Initialize CUDA and allocate memory on GPU
+    cudaMalloc(&d_current_state, _current_state.size() * sizeof(float));
+    cudaMalloc(&d_spring_directions, _spring_directions.size() * sizeof(float));
+    cudaMalloc(&d_rest_lengths, _rest_lengths.size() * sizeof(float));
+    cudaMalloc(&d_spring_indices, _spring_indices.size() * sizeof(int));
+    cudaMalloc(&d_fext_force, _fext_force.size() * sizeof(float));
+    cudaMalloc(&d_b, _fext_force.size() * sizeof(float));
+    cudaMalloc(&d_J_spring_directions, _fext_force.size() * sizeof(float));
+
+    // Copy initial data from CPU to GPU
+    cudaMemcpy(d_current_state, _current_state.data(), _current_state.size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_rest_lengths, _rest_lengths.data(), _rest_lengths.size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_spring_indices, _spring_indices.data(), _spring_indices.size() * sizeof(int), cudaMemcpyHostToDevice);
+#endif
+
     // std::cout << "M: " << std::endl;
     // printSparseMatrix(_M);
     // std::cout << "L: " << std::endl;
@@ -96,159 +152,77 @@ void dtk::dtkPhysMassSpringSolver::solve(unsigned int iter_num) {
     float damping_factor = _system->GetDefaultPointDamp();
 
     // update inertial term
+#ifdef DTK_CUDA
+    VectorXf v_term = (damping_factor + 1) * (_current_state)-damping_factor * _prev_state;
+    _inertial_term = cusparse_multiply(_M, v_term);
+#else
     _inertial_term = _M * ((damping_factor + 1) * (_current_state)-damping_factor * _prev_state);
+#endif
+
     _prev_state = _current_state;
 
+#ifdef DTK_CUDA
+    cudaMemcpy(d_inertial_term, _inertial_term.data(), _inertial_term.size() * sizeof(float), cudaMemcpyHostToDevice);
+#endif
+
     // perform steps
-    bool use_cuda = true;
-    for (unsigned int i = 0; i < iter_num; i++) {
-        step(use_cuda);
-    }
+    for (unsigned int i = 0; i < iter_num; i++)
+        step();
 }
 
-extern "C" void run_local_step_kernel(const float* current_state, float* spring_directions, const float* rest_lengths, const int* spring_indices, int num_springs);
-
-void dtk::dtkPhysMassSpringSolver::step(bool use_cuda) {
-    if (use_cuda) {
-        // ***** CUDA Setup ***** //
-        cusparseHandle_t handle;
-        cusparseCreate(&handle);
-
-        int num_springs = _system->GetNumberOfSprings();
-        int num_points = _system->GetNumberOfMassPoints();
-        float h2 = _system->GetTimeStep() * _system->GetTimeStep();
-
-        // 分配 GPU 内存
-        float* d_current_state, * d_spring_directions, * d_inertial_term, * d_fext_force, * d_b;
-        float* d_rest_lengths;
-        int* d_spring_indices;
-
-        cudaMalloc(&d_current_state, _current_state.size() * sizeof(float));
-        cudaMalloc(&d_spring_directions, _spring_directions.size() * sizeof(float));
-        cudaMalloc(&d_inertial_term, _inertial_term.size() * sizeof(float));
-        cudaMalloc(&d_fext_force, num_points * 3 * sizeof(float)); // Assuming 3D points
-        cudaMalloc(&d_b, _inertial_term.size() * sizeof(float));
-        cudaMalloc(&d_rest_lengths, num_springs * sizeof(float));
-        cudaMalloc(&d_spring_indices, 2 * num_springs * sizeof(int)); // 每个弹簧有两个顶点
-
-        // 将数据从 CPU 拷贝到 GPU
-        cudaMemcpy(d_current_state, _current_state.data(), _current_state.size() * sizeof(float), cudaMemcpyHostToDevice);
-        cudaMemcpy(d_rest_lengths, _rest_lengths.data(), num_springs * sizeof(float), cudaMemcpyHostToDevice);
-        cudaMemcpy(d_spring_indices, _spring_indices.data(), 2 * num_springs * sizeof(int), cudaMemcpyHostToDevice);
-
-        // 调用 CUDA 函数执行 local step
-        run_local_step_kernel(d_current_state, d_spring_directions, d_rest_lengths, d_spring_indices, num_springs);
-
-        // 拷贝结果回 CPU
-        cudaMemcpy(_spring_directions.data(), d_spring_directions, _spring_directions.size() * sizeof(float), cudaMemcpyDeviceToHost);
-
-        // ***** 外力计算 ***** //
-        dtk::dtkDouble3 fext = _system->GetDefaultGravityAccel();
-        Vector3f fext_vector(fext.x, fext.y, fext.z);
-        VectorXf fext_force = fext_vector.replicate(num_points, 1);
-        cudaMemcpy(d_fext_force, fext_force.data(), num_points * 3 * sizeof(float), cudaMemcpyHostToDevice);
-
-        // ***** Right Hand Side (RHS) 的计算 ***** //
-        VectorXf rhs = _inertial_term + h2 * _J * _spring_directions + h2 * fext_force;
-        cudaMemcpy(d_b, rhs.data(), rhs.size() * sizeof(float), cudaMemcpyHostToDevice);
-
-        // ***** cuSPARSE 矩阵求解 ***** //
-        Eigen::SparseMatrix<float> system_matrix_eigen = _system_matrix.matrixL();  // 获取分解的下三角矩阵
-        int nnz = system_matrix_eigen.nonZeros();
-        int rows = system_matrix_eigen.rows();
-        int cols = system_matrix_eigen.cols();
-
-        // 生成 CSR 格式的数据
-        std::vector<int> csrRowPtr(rows + 1), csrColInd(nnz);
-        std::vector<float> csrVal(nnz);
-        int idx = 0;
-        for (int k = 0; k < system_matrix_eigen.outerSize(); ++k) {
-            csrRowPtr[k] = idx;
-            for (Eigen::SparseMatrix<float>::InnerIterator it(system_matrix_eigen, k); it; ++it) {
-                csrVal[idx] = it.value();
-                csrColInd[idx] = it.col();
-                idx++;
-            }
-        }
-        csrRowPtr[rows] = nnz;
-
-        // 在 GPU 上分配 CSR 矩阵
-        int* d_csrRowPtr, * d_csrColInd;
-        float* d_csrVal;
-        cudaMalloc(&d_csrRowPtr, (rows + 1) * sizeof(int));
-        cudaMalloc(&d_csrColInd, nnz * sizeof(int));
-        cudaMalloc(&d_csrVal, nnz * sizeof(float));
-        cudaMemcpy(d_csrRowPtr, csrRowPtr.data(), (rows + 1) * sizeof(int), cudaMemcpyHostToDevice);
-        cudaMemcpy(d_csrColInd, csrColInd.data(), nnz * sizeof(int), cudaMemcpyHostToDevice);
-        cudaMemcpy(d_csrVal, csrVal.data(), nnz * sizeof(float), cudaMemcpyHostToDevice);
-
-        // 使用 cuSPARSE 进行求解
-        float* d_x;
-        cudaMalloc(&d_x, rhs.size() * sizeof(float));
-
-        cusparseMatDescr_t descr;
-        cusparseCreateMatDescr(&descr);
-
-        // 定义 alpha 和 beta
-        float alpha = 1.0f;
-        float beta = 0.0f;
-
-        // 使用 cuSPARSE API 进行矩阵-向量乘法
-        cusparseSpMatDescr_t matA;
-        cusparseDnVecDescr_t vecX, vecY;
-        cusparseCreateCsr(&matA, rows, cols, nnz, d_csrRowPtr, d_csrColInd, d_csrVal,
-                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-                          CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
-        cusparseCreateDnVec(&vecX, rhs.size(), d_b, CUDA_R_32F);
-        cusparseCreateDnVec(&vecY, rhs.size(), d_x, CUDA_R_32F);
-
-        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
-                     CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, nullptr);
-
-        // 将解拷贝回 CPU
-        cudaMemcpy(_current_state.data(), d_x, rhs.size() * sizeof(float), cudaMemcpyDeviceToHost);
-
-        // 释放 GPU 资源
-        cudaFree(d_current_state);
-        cudaFree(d_spring_directions);
-        cudaFree(d_inertial_term);
-        cudaFree(d_fext_force);
-        cudaFree(d_b);
-        cudaFree(d_x);
-        cudaFree(d_csrRowPtr);
-        cudaFree(d_csrColInd);
-        cudaFree(d_csrVal);
-        cudaFree(d_rest_lengths);
-        cudaFree(d_spring_indices);
-        cusparseDestroyMatDescr(descr);
-        cusparseDestroy(handle);
+void dtk::dtkPhysMassSpringSolver::step() {
+    // local step
+#ifdef DTK_CUDA
+    int num_springs = _system->GetNumberOfSprings();
+    run_local_step_with_cuda(d_current_state, d_spring_directions, d_spring_indices, d_rest_lengths, num_springs);
+    cudaMemcpy(_spring_directions.data(), d_spring_directions, _spring_directions.size() * sizeof(float), cudaMemcpyDeviceToHost);
+#else
+    for (dtk::dtkID id = 0; id < _system->GetNumberOfSprings(); id++) {
+        dtk::dtkPhysSpring* spring = _system->GetSpring(id);
+        dtk::dtkID id1 = spring->GetFirstVertex()->GetPointID();
+        dtk::dtkID id2 = spring->GetSecondVertex()->GetPointID();
+        double rest_length = spring->GetRestLength();
+        Vector3f p12(
+            _current_state[3 * id1 + 0] - _current_state[3 * id2 + 0],
+            _current_state[3 * id1 + 1] - _current_state[3 * id2 + 1],
+            _current_state[3 * id1 + 2] - _current_state[3 * id2 + 2]
+        );
+
+        p12.normalize();
+        _spring_directions[3 * id + 0] = rest_length * p12[0];
+        _spring_directions[3 * id + 1] = rest_length * p12[1];
+        _spring_directions[3 * id + 2] = rest_length * p12[2];
     }
-    else {
-        // 原有 CPU 代码
-        for (dtk::dtkID id = 0; id < _system->GetNumberOfSprings(); id++) {
-            dtk::dtkPhysSpring* spring = _system->GetSpring(id);
-            dtk::dtkID id1 = spring->GetFirstVertex()->GetPointID();
-            dtk::dtkID id2 = spring->GetSecondVertex()->GetPointID();
-            double rest_length = spring->GetRestLength();
-            Vector3f p12(
-                _current_state[3 * id1 + 0] - _current_state[3 * id2 + 0],
-                _current_state[3 * id1 + 1] - _current_state[3 * id2 + 1],
-                _current_state[3 * id1 + 2] - _current_state[3 * id2 + 2]
-            );
+#endif
+    // global step
+#ifdef DTK_CUDA
+    // step(1)
 
-            p12.normalize();
-            _spring_directions[3 * id + 0] = rest_length * p12[0];
-            _spring_directions[3 * id + 1] = rest_length * p12[1];
-            _spring_directions[3 * id + 2] = rest_length * p12[2];
-        }
+    // auto start = std::chrono::high_resolution_clock::now();
 
-        float h2 = _system->GetTimeStep() * _system->GetTimeStep();
-        dtk::dtkDouble3 fext = _system->GetDefaultGravityAccel();
-        VectorXf fext_force = VectorXf(Vector3f(fext.x, fext.y, fext.z).replicate(_system->GetNumberOfMassPoints(), 1));
+    VectorXf _J_spring_directions = cusparse_multiply(_J, _spring_directions);
+    VectorXf b = _inertial_term + _h2 * (_J_spring_directions + _fext_force);
 
-        VectorXf b = _inertial_term + h2 * _J * _spring_directions + h2 * fext_force;
-        _current_state = _system_matrix.solve(b);
-    }
+    // auto end = std::chrono::high_resolution_clock::now();
+    // std::chrono::duration<double> elapsed = end - start;
+    // std::cout << "step(1) time: " << elapsed.count() << " seconds" << std::endl;
+
+    // step(2)
+
+    // start = std::chrono::high_resolution_clock::now();
+
+    VectorR xhatGPU(b.size());
+    _system_matrix->solve(b.data(), xhatGPU.data());
+    _current_state = Eigen::Map<VectorXf>(xhatGPU.data(), xhatGPU.size());
+    cudaMemcpy(d_current_state, _current_state.data(), _current_state.size() * sizeof(float), cudaMemcpyHostToDevice);
+
+    // end = std::chrono::high_resolution_clock::now();
+    // elapsed = end - start;
+    // std::cout << "step(2) time: " << elapsed.count() << " seconds" << std::endl;
+#else
+    VectorXf b = _inertial_term + _h2 * (_J * _spring_directions + _fext_force);
+    _current_state = _system_matrix.solve(b);
+#endif
 }
 
 
diff --git a/test/system_test/MassSpring3D/include/cusparse_cholesky_solver.h b/test/system_test/MassSpring3D/include/cusparse_cholesky_solver.h
new file mode 100644
index 0000000..88a784c
--- /dev/null
+++ b/test/system_test/MassSpring3D/include/cusparse_cholesky_solver.h
@@ -0,0 +1,30 @@
+#ifndef __CUSPARSE_CHOLESKY_SOLVER_H__
+#define __CUSPARSE_CHOLESKY_SOLVER_H__
+
+#include <memory>
+
+template <typename T>
+class CuSparseCholeskySolver
+{
+public:
+
+	enum Info
+	{
+		SUCCESS,
+		NUMERICAL_ISSUE
+	};
+
+	using Ptr = std::unique_ptr<CuSparseCholeskySolver>;
+
+	static Ptr create(int size = 0);
+	virtual void resize(int size) = 0;
+	virtual void analyze(int nnz, const int* csrRowPtr, const int* csrColInd) = 0;
+	virtual void factorize(const T* A) = 0;
+	virtual void solve(const T* b, T* x) = 0;
+	virtual void setPermutaion(int size, const int* P) = 0;
+	virtual Info info() const = 0;
+
+	virtual ~CuSparseCholeskySolver();
+};
+
+#endif // !__CUSPARSE_CHOLESKY_SOLVER_H__
diff --git a/test/system_test/MassSpring3D/include/cusparse_wrapper.h b/test/system_test/MassSpring3D/include/cusparse_wrapper.h
new file mode 100644
index 0000000..7811dba
--- /dev/null
+++ b/test/system_test/MassSpring3D/include/cusparse_wrapper.h
@@ -0,0 +1,102 @@
+#ifndef __CUSPARSE_WRAPPER_H__
+#define __CUSPARSE_WRAPPER_H__
+
+#include <cusparse.h>
+#include <cusolverSp.h>
+#include <cusolverSp_LOWLEVEL_PREVIEW.h>
+
+/* Description: Gather of non-zero elements from dense vector y into
+   sparse vector x. */
+cusparseStatus_t CUSPARSEAPI cusparseXgthr(cusparseHandle_t handle,
+										   int nnz,
+										   const float* y,
+										   float* xVal,
+										   const int* xInd,
+										   cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseXgthr(cusparseHandle_t handle,
+										   int nnz,
+										   const double* y,
+										   double* xVal,
+										   const int* xInd,
+										   cusparseIndexBase_t idxBase);
+
+/*
+ * Low level API for GPU Cholesky
+ *
+ */
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholBufferInfo(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const float* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	size_t* internalDataInBytes,
+	size_t* workspaceInBytes);
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholBufferInfo(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const double* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	size_t* internalDataInBytes,
+	size_t* workspaceInBytes);
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholFactor(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const float* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	void* pBuffer);
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholFactor(
+	cusolverSpHandle_t handle,
+	int n,
+	int nnzA,
+	const cusparseMatDescr_t descrA,
+	const double* csrValA,
+	const int* csrRowPtrA,
+	const int* csrColIndA,
+	csrcholInfo_t info,
+	void* pBuffer);
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholZeroPivot(
+	cusolverSpHandle_t handle,
+	csrcholInfo_t info,
+	float tol,
+	int* position);
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholZeroPivot(
+	cusolverSpHandle_t handle,
+	csrcholInfo_t info,
+	double tol,
+	int* position);
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholSolve(
+	cusolverSpHandle_t handle,
+	int n,
+	const float* b,
+	float* x,
+	csrcholInfo_t info,
+	void* pBuffer);
+
+cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholSolve(
+	cusolverSpHandle_t handle,
+	int n,
+	const double* b,
+	double* x,
+	csrcholInfo_t info,
+	void* pBuffer);
+
+#endif // !__CUSPARSE_WRAPPER_H__
diff --git a/test/system_test/MassSpring3D/include/device_buffer.h b/test/system_test/MassSpring3D/include/device_buffer.h
new file mode 100644
index 0000000..04ae588
--- /dev/null
+++ b/test/system_test/MassSpring3D/include/device_buffer.h
@@ -0,0 +1,70 @@
+#ifndef __DEVICE_BUFFER_H__
+#define __DEVICE_BUFFER_H__
+
+#include <cuda_runtime.h>
+
+#include "macro.h"
+
+template <typename T>
+struct DeviceBuffer
+{
+	DeviceBuffer() : data(nullptr), size(0) {}
+	DeviceBuffer(size_t size) : data(nullptr), size(0) { allocate(size); }
+	~DeviceBuffer() { destroy(); }
+
+	void allocate(size_t _size)
+	{
+		if (data && size >= _size)
+			return;
+
+		destroy();
+		CUDA_CHECK(cudaMalloc(&data, sizeof(T) * _size));
+		size = _size;
+	}
+
+	void destroy()
+	{
+		if (data)
+			CUDA_CHECK(cudaFree(data));
+		data = nullptr;
+		size = 0;
+	}
+
+	void upload(const T* h_data)
+	{
+		CUDA_CHECK(cudaMemcpy(data, h_data, sizeof(T) * size, cudaMemcpyHostToDevice));
+	}
+
+	void download(T* h_data)
+	{
+		CUDA_CHECK(cudaMemcpy(h_data, data, sizeof(T) * size, cudaMemcpyDeviceToHost));
+	}
+
+	void copyTo(DeviceBuffer& rhs) const
+	{
+		CUDA_CHECK(cudaMemcpy(rhs.data, data, sizeof(T) * size, cudaMemcpyDeviceToDevice));
+	}
+
+	void copyTo(T* rhs) const
+	{
+		CUDA_CHECK(cudaMemcpy(rhs, data, sizeof(T) * size, cudaMemcpyDeviceToDevice));
+	}
+
+	void fillZero()
+	{
+		CUDA_CHECK(cudaMemset(data, 0, sizeof(T) * size));
+	}
+
+	bool empty() const { return !(data && size > 0); }
+
+	void assign(size_t size, const T* h_data)
+	{
+		allocate(size);
+		upload(h_data);
+	}
+
+	T* data;
+	size_t size;
+};
+
+#endif // !__DEVICE_BUFFER_H__
diff --git a/test/system_test/MassSpring3D/include/dtkPhysMassSpringSolver.h b/test/system_test/MassSpring3D/include/dtkPhysMassSpringSolver.h
index 22087db..c0630dd 100644
--- a/test/system_test/MassSpring3D/include/dtkPhysMassSpringSolver.h
+++ b/test/system_test/MassSpring3D/include/dtkPhysMassSpringSolver.h
@@ -7,20 +7,29 @@
 #include <Eigen/Sparse>
 #include <random>
 
+#ifdef DTK_CUDA
 #include <cuda_runtime.h>
-#include <cusparse.h>
+#include <cusparse_v2.h>
+#include "cusparse_cholesky_solver.h"
+#include "step.cuh"
+#endif
 
 namespace dtk {
     class dtkPhysMassSpringSolver {
     private:
         typedef Eigen::Vector3f Vector3f;
         typedef Eigen::VectorXf VectorXf;
-        typedef Eigen::SparseMatrix<float> SparseMatrix;
-        typedef Eigen::SimplicialLLT<Eigen::SparseMatrix<float> > Cholesky;
+        typedef Eigen::SparseMatrix<float, Eigen::StorageOptions::RowMajor> SparseMatrixCSR;
+        typedef Eigen::SimplicialLLT<SparseMatrixCSR> Cholesky;
         typedef Eigen::Map<Eigen::VectorXf> Map;
         typedef std::pair<unsigned int, unsigned int> Edge;
         typedef Eigen::Triplet<float> Triplet;
         typedef std::vector<Triplet> TripletList;
+#ifdef DTK_CUDA
+        typedef Eigen::Matrix<float, Eigen::Dynamic, 1> VectorR;
+        typedef Eigen::AMDOrdering<SparseMatrixCSR::StorageIndex> Ordering;
+        typedef Ordering::PermutationType PermutationMatrix;
+#endif
     public:
         typedef std::shared_ptr<dtkPhysMassSpringSolver> Ptr;
 
@@ -37,41 +46,145 @@ namespace dtk {
             Hang
         };
 
-
         void satisfy(ClothDropType type = Sphere);
         VectorXf getCurrentState() const { return _current_state; };
 
-        static void printSparseMatrix(const SparseMatrix& matrix) {
+#ifdef DTK_CUDA
+        static VectorXf cusparse_multiply(const SparseMatrixCSR& J, const VectorXf& x) {
+            int rows = J.rows();
+            int cols = J.cols();
+            int nnz = J.nonZeros();
+
+            // 在 GPU 上分配内存
+            float* d_x, * d_y;
+            int* d_csrRowPtr, * d_csrColInd;
+            float* d_csrVal;
+
+            cudaMalloc(&d_x, cols * sizeof(float));
+            cudaMalloc(&d_y, rows * sizeof(float));
+            cudaMalloc(&d_csrRowPtr, (rows + 1) * sizeof(int));
+            cudaMalloc(&d_csrColInd, nnz * sizeof(int));
+            cudaMalloc(&d_csrVal, nnz * sizeof(float));
+
+            cudaMemcpy(d_x, x.data(), cols * sizeof(float), cudaMemcpyHostToDevice);
+            cudaMemcpy(d_csrRowPtr, J.outerIndexPtr(), (rows + 1) * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpy(d_csrColInd, J.innerIndexPtr(), nnz * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpy(d_csrVal, J.valuePtr(), nnz * sizeof(float), cudaMemcpyHostToDevice);
+
+            cusparseHandle_t handle;
+            cusparseCreate(&handle);
+
+            cusparseSpMatDescr_t matA;
+            cusparseDnVecDescr_t vecX, vecY;
+            cusparseCreateCsr(&matA, rows, cols, nnz, d_csrRowPtr, d_csrColInd, d_csrVal,
+                              CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                              CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
+            cusparseCreateDnVec(&vecX, cols, d_x, CUDA_R_32F);
+            cusparseCreateDnVec(&vecY, rows, d_y, CUDA_R_32F);
+
+            float alpha = 1.0f;
+            float beta = 0.0f;
+
+            size_t bufferSize = 0;
+            void* d_buffer = nullptr;
+
+            cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
+                                    CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
+
+            if (bufferSize > 0) {
+                cudaMalloc(&d_buffer, bufferSize);
+            }
+
+            cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
+                         CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, d_buffer);
+
+            VectorXf y(rows);
+            cudaMemcpy(y.data(), d_y, rows * sizeof(float), cudaMemcpyDeviceToHost);
+
+            cudaFree(d_x);
+            cudaFree(d_y);
+            cudaFree(d_csrRowPtr);
+            cudaFree(d_csrColInd);
+            cudaFree(d_csrVal);
+            cudaFree(d_buffer);
+            cusparseDestroyDnVec(vecX);
+            cusparseDestroyDnVec(vecY);
+            cusparseDestroySpMat(matA);
+            cusparseDestroy(handle);
+
+            return std::move(y);
+        }
+#endif
+        static inline void printSparseMatrix(const SparseMatrixCSR& matrix) {
             for (int k = 0; k < matrix.outerSize(); ++k) {
-                for (SparseMatrix::InnerIterator it(matrix, k); it; ++it) {
+                for (SparseMatrixCSR::InnerIterator it(matrix, k); it; ++it) {
                     std::cout << "Element at (" << it.row() << ", " << it.col() << ") = " << it.value() << std::endl;
                 }
             }
         }
+        static inline void printCholesky(const Cholesky& matrix) {
+            printSparseMatrix(choleskyToSparseMatrix(matrix));
+        }
+        static inline SparseMatrixCSR choleskyToSparseMatrix(const Cholesky& cholesky) {
+            SparseMatrixCSR L = cholesky.matrixL();
+            SparseMatrixCSR fullMatrix = L * L.transpose();
+            return std::move(fullMatrix);
+        }
+        static inline void printVectorXf(const VectorXf& vec) {
+            std::cout << vec << std::endl;
+        }
     private:
         dtkPhysMassSpringSolver();
         dtkPhysMassSpringSolver(const dtkPhysMassSpring::Ptr& massSpring);
 
-        void step(bool use_cuda = false);
-        Cholesky _system_matrix;
+        void step();
         dtkPhysMassSpring::Ptr _system;
+#ifdef DTK_CUDA
+        CuSparseCholeskySolver<float>::Ptr _system_matrix;
+#else
+        Cholesky _system_matrix;
+#endif
 
         // M, L, J matrices
-        SparseMatrix _M;/**< 质量稀疏矩阵  */
-        SparseMatrix _L;/**< Laplace稀疏矩阵，表示弹簧质点系统中质点与质点之间的连接关系，以及连接弹簧的刚度 */
-        SparseMatrix _J;/**< Jacobian稀疏矩阵，表示弹簧质点系统中每个质点与弹簧的连接关系，以及连接弹簧的刚度 */
+        SparseMatrixCSR _M;/**< 质量稀疏矩阵  */
+        SparseMatrixCSR _L;/**< Laplace稀疏矩阵，表示弹簧质点系统中质点与质点之间的连接关系，以及连接弹簧的刚度 */
+        SparseMatrixCSR _J;/**< Jacobian稀疏矩阵，表示弹簧质点系统中每个质点与弹簧的连接关系，以及连接弹簧的刚度 */
 
         VectorXf _initial_state;    // q(0)
         VectorXf _current_state;  // q(n)
         VectorXf _prev_state;  // q(n-1)
         VectorXf _spring_directions; // d, spring directions
         VectorXf _inertial_term; /**< 惯性项 = M * y, y = (a + 1) * q(n) - a * q(n - 1), a = damp_factor */
+        VectorXf _fext_force; /**< 外力项 */
+        double _h2; /**< 时间步长的平方 */
 
         std::vector<float> _rest_lengths;  // 存储每个弹簧的初始长度
         std::vector<int> _spring_indices;  // 存储每个弹簧的两个顶点的索引（大小为 2 * num_springs）
 
-
+#ifdef DTK_CUDA
+        float* d_current_state;
+        float* d_spring_directions;
+        int* d_spring_indices;
+        float* d_rest_lengths;
+        float* d_inertial_term;
+        float* d_fext_force;
+        float* d_J_spring_directions;
+        float* d_b;
+#endif
         float _time_step;
+    public:
+#ifdef DTK_CUDA
+        ~dtkPhysMassSpringSolver() {
+            cudaFree(d_current_state);
+            cudaFree(d_spring_directions);
+            cudaFree(d_spring_indices);
+            cudaFree(d_rest_lengths);
+            cudaFree(d_inertial_term);
+            cudaFree(d_fext_force);
+            cudaFree(d_b);
+            cudaFree(d_J_spring_directions);
+        }
+#endif
     };
 }
 #endif // SIMPLEPHYSICSENGINE_DTKPHYSMASSSPRINGSOLVER_H
\ No newline at end of file
diff --git a/test/system_test/MassSpring3D/include/macro.h b/test/system_test/MassSpring3D/include/macro.h
new file mode 100644
index 0000000..2a3410a
--- /dev/null
+++ b/test/system_test/MassSpring3D/include/macro.h
@@ -0,0 +1,13 @@
+#ifndef __MACRO_H__
+#define __MACRO_H__
+
+#include <cstdio>
+
+#define CUDA_CHECK(err) \
+do {\
+	if (err != cudaSuccess) { \
+		printf("[CUDA Error] %s (code: %d) at %s:%d\n", cudaGetErrorString(err), err, __FILE__, __LINE__); \
+	} \
+} while (0)
+
+#endif // !__MACRO_H__
diff --git a/test/system_test/MassSpring3D/include/step.cuh b/test/system_test/MassSpring3D/include/step.cuh
index bfcd356..fe73f74 100644
--- a/test/system_test/MassSpring3D/include/step.cuh
+++ b/test/system_test/MassSpring3D/include/step.cuh
@@ -1,9 +1,10 @@
-#ifndef STEP_CUH
-#define STEP_CUH
-#include <cuda_runtime.h>
-#include <stdio.h>
-
-extern "C" void run_local_step_kernel(const float* current_state, float* spring_directions,
-                                      const float* rest_lengths, const int* spring_indices,
-                                      int num_springs);
-#endif
\ No newline at end of file
+#ifndef __LOCALSTEP_CUH__
+#define __LOCALSTEP_CUH__
+void run_local_step_with_cuda(
+    const float* d_current_state, 
+    float* d_spring_directions,                            
+    const int* d_spring_indices, 
+    const float* d_rest_lengths,                                       
+    int num_springs
+);
+#endif // __LOCALSTEP_CUH__
\ No newline at end of file
diff --git a/test/system_test/MassSpring3D/main.cpp b/test/system_test/MassSpring3D/main.cpp
index abd67a3..d43a385 100644
--- a/test/system_test/MassSpring3D/main.cpp
+++ b/test/system_test/MassSpring3D/main.cpp
@@ -28,8 +28,6 @@
 #include "Shader.h"
 #include "Renderer.h"
 
-static auto last_clock = std::chrono::high_resolution_clock::now();
-
 // The Width of the screen
 const unsigned int WINDOW_WIDTH = 800;
 // The height of the screen
@@ -39,7 +37,7 @@ Scene* current_scene = nullptr;
 double fps = 0.0;
 double total_time = 0.0;
 int total_frames = 0;
-auto last_time = std::chrono::high_resolution_clock::now();
+double dt = 0.08;
 
 char INSTRUCTION = '0';
 int I1_EDGE_NUM = 33;
@@ -75,17 +73,19 @@ static void draw_text(int x, int y, const char* format, ...) {
 }
 
 void display() {
+    if (current_scene == nullptr || current_scene->IsPause()) {
+        return;
+    }
     glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
     glMatrixMode(GL_MODELVIEW);
     glLoadIdentity();
     glTranslatef(0.0f, -8.0f, -25.0f);
 
+    auto last_time = std::chrono::high_resolution_clock::now();
+    current_scene->Update(dt);
+    current_scene->Render();
     auto now = std::chrono::high_resolution_clock::now();
-    auto dt = std::chrono::duration_cast<std::chrono::duration<double>>(
-        now - last_time)
-        .count();
-    last_time = now;
-
+    dt = std::chrono::duration_cast<std::chrono::duration<double>>(now - last_time).count();
     total_time += dt;
     total_frames++;
     last_time = now;
@@ -105,9 +105,6 @@ void display() {
     else
         draw_text(5, h - 20, "dt: %.2f ms", dt * 1000);
 
-    current_scene->Update(dt);
-    current_scene->Render();
-
     glutSwapBuffers();
     checkGlErrors();
 
diff --git a/test/system_test/MassSpring3D/shaders/basic.vshader b/test/system_test/MassSpring3D/shaders/basic.vshader
index 85c7bd3..c872dca 100644
--- a/test/system_test/MassSpring3D/shaders/basic.vshader
+++ b/test/system_test/MassSpring3D/shaders/basic.vshader
@@ -1,7 +1,7 @@
 #version 330 core
 
 uniform mat4 uModelViewMatrix;
-// uniform mat4 uNormalMatrix;
+uniform mat4 uNormalMatrix;
 uniform mat4 uProjectionMatrix;
 
 layout(location=0) in vec3 aPosition;
diff --git a/test/system_test/MassSpring3D/shaders/phong.fshader b/test/system_test/MassSpring3D/shaders/phong.fshader
index 17be2e7..183cf34 100644
--- a/test/system_test/MassSpring3D/shaders/phong.fshader
+++ b/test/system_test/MassSpring3D/shaders/phong.fshader
@@ -15,7 +15,7 @@ void main(){
 	}
 
     vec3 toLight = normalize(-uLight);
-    float diffuse = max(0, dot(toLight, normal));
+    float diffuse = max(0.0, dot(toLight, normal));
 
     vec3 color = diffuse * albedo + uAmbient * albedo;
     fragColor = vec4(color, 1.0);
diff --git a/test/system_test/MassSpring3D/step.cu b/test/system_test/MassSpring3D/step.cu
index 53e9012..04e19d4 100644
--- a/test/system_test/MassSpring3D/step.cu
+++ b/test/system_test/MassSpring3D/step.cu
@@ -1,62 +1,45 @@
-#include "step.cuh"
-
-// CUDA 核函数的声明
-__global__ void local_step_kernel(const float* __restrict__ current_state, float* __restrict__ spring_directions, 
-                                  const float* __restrict__ rest_lengths, const int* __restrict__ spring_indices, 
-                                  int num_springs);
+#include <cuda_runtime.h>
+#include <cmath>
+
+extern "C" __global__ void update_d_spring_directions(const float* current_state, float* d_spring_directions,
+                                                    const int* spring_indices, const float* rest_lengths,
+                                                    int num_springs) {
+    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (id < num_springs) {
+        int id1 = spring_indices[2 * id];
+        int id2 = spring_indices[2 * id + 1];
+        
+        float3 p12 = make_float3(
+            current_state[3 * id1] - current_state[3 * id2],
+            current_state[3 * id1 + 1] - current_state[3 * id2 + 1],
+            current_state[3 * id1 + 2] - current_state[3 * id2 + 2]
+        );
+
+        float length = sqrtf(p12.x * p12.x + p12.y * p12.y + p12.z * p12.z);
+        if (length > 0) {
+            p12.x /= length; p12.y /= length; p12.z /= length; // Normalize
+        }
+
+        d_spring_directions[3 * id] = rest_lengths[id] * p12.x;
+        d_spring_directions[3 * id + 1] = rest_lengths[id] * p12.y;
+        d_spring_directions[3 * id + 2] = rest_lengths[id] * p12.z;
+    }
+}
 
-// CUDA 调用函数
-extern "C" void run_local_step_kernel(const float* current_state, float* spring_directions, 
-                                      const float* rest_lengths, const int* spring_indices, 
-                                      int num_springs) 
-{
-    // 计算 CUDA 核函数需要的线程数和块数
+void run_local_step_with_cuda(
+    const float* d_current_state, 
+    float* d_spring_directions,                            
+    const int* d_spring_indices, 
+    const float* d_rest_lengths,                                       
+    int num_springs
+){
     int blockSize = 256;
     int numBlocks = (num_springs + blockSize - 1) / blockSize;
 
-    // 调用 CUDA 核函数
-    local_step_kernel<<<numBlocks, blockSize>>>(current_state, spring_directions, rest_lengths, spring_indices, num_springs);
+    update_d_spring_directions<<<numBlocks, blockSize>>>(
+        d_current_state, d_spring_directions, d_spring_indices, d_rest_lengths, num_springs
+    );
 
-    // 检查 CUDA 错误
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA kernel failed: %s\n", cudaGetErrorString(err));
-    }
-
-    // 同步 CUDA 设备
+    // Wait for all threads to complete
     cudaDeviceSynchronize();
-}
-
-// CUDA 核函数的定义
-__global__ void local_step_kernel(const float* __restrict__ current_state, float* __restrict__ spring_directions, 
-                                  const float* __restrict__ rest_lengths, const int* __restrict__ spring_indices, 
-                                  int num_springs) 
-{
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= num_springs) return;  // 如果 idx 超过弹簧数量则返回
-
-    // 获取当前弹簧的两个点的索引
-    int id1 = spring_indices[2 * idx];   // 第一个顶点的ID
-    int id2 = spring_indices[2 * idx + 1]; // 第二个顶点的ID
-
-    // 计算两个点之间的向量 p12
-    float p12_x = current_state[3 * id1 + 0] - current_state[3 * id2 + 0];
-    float p12_y = current_state[3 * id1 + 1] - current_state[3 * id2 + 1];
-    float p12_z = current_state[3 * id1 + 2] - current_state[3 * id2 + 2];
-
-    // 计算 p12 的长度（归一化向量）
-    float length = sqrtf(p12_x * p12_x + p12_y * p12_y + p12_z * p12_z);
-
-    if (length > 1e-6) {
-        // 归一化向量
-        p12_x /= length;
-        p12_y /= length;
-        p12_z /= length;
-
-        // 使用 rest_length 计算方向
-        float rest_length = rest_lengths[idx];
-        spring_directions[3 * idx + 0] = rest_length * p12_x;
-        spring_directions[3 * idx + 1] = rest_length * p12_y;
-        spring_directions[3 * idx + 2] = rest_length * p12_z;
-    }
-}
+}
\ No newline at end of file