Add CUSOLVERRF.jl integration for GPU-accelerated sparse LU factorization #673

Merged
Changes from all commits (26 commits)
1cdb1be
Add CUSOLVERRF.jl integration for GPU-accelerated sparse LU factoriza…
claude Jul 28, 2025
e40ad85
Update Project.toml
ChrisRackauckas Jul 28, 2025
6c1633d
Add CUSOLVERRF tests to GPU test suite
claude Jul 28, 2025
0e2e254
Add CUSOLVERRF documentation
ChrisRackauckas Aug 5, 2025
e61de2f
Update GPU sparse solver docs to mention both CUDSS and CUSOLVERRF
ChrisRackauckas Aug 5, 2025
eee3ff4
Fix CUDSS documentation to correctly describe LUFactorization usage
ChrisRackauckas Aug 5, 2025
cc7911b
Update Project.toml
ChrisRackauckas Aug 5, 2025
235e333
Update factorization.jl
ChrisRackauckas Aug 5, 2025
f784d42
Update extension_algs.jl
ChrisRackauckas Aug 5, 2025
0ac5d28
Update solvers.md
ChrisRackauckas Aug 5, 2025
d7f1f8c
Update Project.toml
ChrisRackauckas Aug 5, 2025
0a075fe
Update src/extension_algs.jl
ChrisRackauckas Aug 5, 2025
1c1e917
Update src/extension_algs.jl
ChrisRackauckas Aug 5, 2025
b92906c
Update Project.toml
ChrisRackauckas Aug 5, 2025
e88bad8
Update Project.toml
ChrisRackauckas Aug 5, 2025
82fbc55
Update Project.toml
ChrisRackauckas Aug 5, 2025
7a8dac7
Update ext/LinearSolveCUSOLVERRFExt.jl
ChrisRackauckas Aug 5, 2025
288d382
Update ext/LinearSolveCUSOLVERRFExt.jl
ChrisRackauckas Aug 5, 2025
62bc9ae
Update ext/LinearSolveCUSOLVERRFExt.jl
ChrisRackauckas Aug 5, 2025
d559e8b
Update ext/LinearSolveCUSOLVERRFExt.jl
ChrisRackauckas Aug 5, 2025
5175137
Update ext/LinearSolveCUSOLVERRFExt.jl
ChrisRackauckas Aug 6, 2025
f1f3bb8
Update test/gpu/cusolverrf.jl
ChrisRackauckas Aug 6, 2025
6db7c55
Update ext/LinearSolveCUSOLVERRFExt.jl
ChrisRackauckas Aug 6, 2025
b8ca961
Update ext/LinearSolveCUSOLVERRFExt.jl
ChrisRackauckas Aug 6, 2025
6a96db1
Update test/gpu/cusolverrf.jl
ChrisRackauckas Aug 6, 2025
b4bd9ed
Update resolve.jl
ChrisRackauckas Aug 6, 2025
3 changes: 3 additions & 0 deletions Project.toml
@@ -33,6 +33,7 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
@@ -54,6 +55,7 @@ LinearSolveBandedMatricesExt = "BandedMatrices"
LinearSolveBlockDiagonalsExt = "BlockDiagonals"
LinearSolveCUDAExt = "CUDA"
LinearSolveCUDSSExt = "CUDSS"
LinearSolveCUSOLVERRFExt = ["CUSOLVERRF", "SparseArrays"]
LinearSolveEnzymeExt = "EnzymeCore"
LinearSolveFastAlmostBandedMatricesExt = "FastAlmostBandedMatrices"
LinearSolveFastLapackInterfaceExt = "FastLapackInterface"
@@ -77,6 +79,7 @@ BlockDiagonals = "0.1.42, 0.2"
blis_jll = "0.9.0"
CUDA = "5"
CUDSS = "0.1, 0.2, 0.3, 0.4"
CUSOLVERRF = "0.2.6"
ChainRulesCore = "1.22"
ConcreteStructs = "0.2.3"
DocStringExtensions = "0.9.3"
20 changes: 19 additions & 1 deletion docs/src/solvers/solvers.md
@@ -43,6 +43,14 @@ For sparse LU-factorizations, use `KLUFactorization` if there is less structure
to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
Pardiso.jl's methods are also known to be very efficient sparse linear solvers.

For GPU-accelerated sparse LU-factorizations, there are two high-performance options.
When using `CuSparseMatrixCSR` arrays with CUDSS.jl loaded, `LUFactorization()` will
automatically use NVIDIA's cuDSS library. Alternatively, `CUSOLVERRFFactorization`
provides access to NVIDIA's cusolverRF library. Both offer significant performance
improvements for sparse systems on CUDA-capable GPUs and are particularly effective
for large sparse matrices that benefit from GPU parallelization. CUDSS is the better
fit for `Float32` problems, while `CUSOLVERRFFactorization` supports only `Float64`.
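
As a rough sketch of both paths (assuming a functional CUDA GPU and that CUDA.jl,
CUDSS.jl, and CUSOLVERRF.jl are installed; sizes here are illustrative):

```julia
using LinearSolve, SparseArrays, LinearAlgebra
using CUDA, CUDA.CUSPARSE
using CUDSS       # with CUDSS loaded, LUFactorization dispatches to cuDSS
using CUSOLVERRF  # provides CUSOLVERRFFactorization

A = sprand(1000, 1000, 0.01) + I  # sparse, nonsingular test matrix
b = rand(1000)
prob = LinearProblem(CuSparseMatrixCSR(A), CuArray(b))

sol_cudss = solve(prob, LUFactorization())          # cuDSS path
sol_rf = solve(prob, CUSOLVERRFFactorization())     # cusolverRF path
```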

While these sparse factorizations are based on implementations in other languages,
and therefore constrained to standard number types (`Float64`, `Float32` and
their complex counterparts), `SparspakFactorization` is able to handle general
@@ -219,7 +227,7 @@ LinearSolve.PardisoJL

### CUDA.jl

-Note that `CuArrays` are supported by `GenericFactorization` in the normal way.
+Note that `CuArrays` are supported by `GenericFactorization` in the "normal" way.
The following are non-standard GPU factorization routines.

!!! note
@@ -230,6 +238,16 @@ The following are non-standard GPU factorization routines.
```@docs
CudaOffloadFactorization
```

### CUSOLVERRF.jl

!!! note

Using this solver requires adding the package CUSOLVERRF.jl, i.e. `using CUSOLVERRF`

```@docs
CUSOLVERRFFactorization
```

### IterativeSolvers.jl

!!! note
24 changes: 24 additions & 0 deletions docs/src/tutorials/gpu.md
@@ -121,6 +121,30 @@ sol = LS.solve(prob, LS.LUFactorization())

For now, CUDSS only supports CuSparseMatrixCSR type matrices.

For high-performance sparse LU factorization on GPUs, you can also use CUSOLVERRF.jl:

```julia
using CUSOLVERRF
sol = LS.solve(prob, LS.CUSOLVERRFFactorization())
```

CUSOLVERRF provides access to NVIDIA's cusolverRF library, which offers significant
performance improvements for sparse LU factorization on GPUs. It supports both
`:RF` (default) and `:KLU` symbolic factorization methods, and can reuse symbolic
factorization for matrices with the same sparsity pattern:

```julia
# Use KLU for symbolic factorization
sol = LS.solve(prob, LS.CUSOLVERRFFactorization(symbolic = :KLU))

# Reuse symbolic factorization for better performance
sol = LS.solve(prob, LS.CUSOLVERRFFactorization(reuse_symbolic = true))
```
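
When solving repeatedly, the caching interface can take advantage of
`reuse_symbolic`. A sketch using LinearSolve's standard `init`/`solve!` API,
where `A2` is a hypothetical matrix sharing `A`'s sparsity pattern:

```julia
cache = LS.init(prob, LS.CUSOLVERRFFactorization(reuse_symbolic = true))
sol1 = LS.solve!(cache)

# Replacing A with same-pattern values reuses the symbolic analysis;
# only the numeric refactorization is redone on the GPU
cache.A = A2
sol2 = LS.solve!(cache)
```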

!!! note

CUSOLVERRF only supports `Float64` element types with `Int32` indices.
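
If your matrix or right-hand side uses other types, a conversion along these
lines (a sketch; `A` and `b` are placeholders for your data) produces a
compatible problem:

```julia
using SparseArrays
A64 = SparseMatrixCSC{Float64, Int32}(A)  # Float64 values, Int32 indices
b64 = Float64.(b)
prob64 = LS.LinearProblem(A64, b64)
```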

Note that `KrylovJL` methods also work with sparse GPU arrays:

```julia
89 changes: 89 additions & 0 deletions ext/LinearSolveCUSOLVERRFExt.jl
@@ -0,0 +1,89 @@
module LinearSolveCUSOLVERRFExt

using LinearSolve: LinearSolve, @get_cacheval, pattern_changed, OperatorAssumptions
using CUSOLVERRF: CUSOLVERRF, RFLU, CUDA
using SparseArrays: SparseArrays, SparseMatrixCSC, nnz
using CUSOLVERRF.CUDA.CUSPARSE: CuSparseMatrixCSR
using LinearAlgebra: LinearAlgebra, Adjoint, ldiv!, lu!
using SciMLBase: SciMLBase, LinearProblem, ReturnCode

function LinearSolve.init_cacheval(alg::LinearSolve.CUSOLVERRFFactorization,
A, b, u, Pl, Pr,
maxiters::Int, abstol, reltol,
verbose::Bool, assumptions::OperatorAssumptions)
nothing
end

function LinearSolve.init_cacheval(alg::LinearSolve.CUSOLVERRFFactorization,
A::Union{CuSparseMatrixCSR{Float64, Int32}, SparseMatrixCSC{Float64, <:Integer}},
b, u, Pl, Pr,
maxiters::Int, abstol, reltol,
verbose::Bool, assumptions::OperatorAssumptions)
# Create initial factorization with appropriate options
nrhs = b isa AbstractMatrix ? size(b, 2) : 1
symbolic = alg.symbolic
# Convert to CuSparseMatrixCSR if needed
A_gpu = A isa CuSparseMatrixCSR ? A : CuSparseMatrixCSR(A)
RFLU(A_gpu; nrhs=nrhs, symbolic=symbolic)
end

function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::LinearSolve.CUSOLVERRFFactorization; kwargs...)
A = cache.A

# Convert to appropriate GPU format if needed
if A isa SparseMatrixCSC
A_gpu = CuSparseMatrixCSR(A)
elseif A isa CuSparseMatrixCSR
A_gpu = A
else
error("CUSOLVERRFFactorization only supports SparseMatrixCSC or CuSparseMatrixCSR matrices")
end

if cache.isfresh
cacheval = @get_cacheval(cache, :CUSOLVERRFFactorization)
if cacheval === nothing
# Create new factorization
nrhs = cache.b isa AbstractMatrix ? size(cache.b, 2) : 1
fact = RFLU(A_gpu; nrhs=nrhs, symbolic=alg.symbolic)
else
# Reuse symbolic factorization if pattern hasn't changed
if alg.reuse_symbolic && !pattern_changed(cacheval, A_gpu)
fact = cacheval
lu!(fact, A_gpu)
else
# Create new factorization if pattern changed
nrhs = cache.b isa AbstractMatrix ? size(cache.b, 2) : 1
fact = RFLU(A_gpu; nrhs=nrhs, symbolic=alg.symbolic)
end
end
cache.cacheval = fact
cache.isfresh = false
end

F = @get_cacheval(cache, :CUSOLVERRFFactorization)

# Ensure b and u are on GPU
b_gpu = cache.b isa CUDA.CuArray ? cache.b : CUDA.CuArray(cache.b)
u_gpu = cache.u isa CUDA.CuArray ? cache.u : CUDA.CuArray(cache.u)

# Solve
copyto!(u_gpu, b_gpu)
ldiv!(F, u_gpu)

# Copy back to CPU if needed
if !(cache.u isa CUDA.CuArray)
copyto!(cache.u, u_gpu)
end

SciMLBase.build_linear_solution(alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
end

# Helper function for pattern checking
function LinearSolve.pattern_changed(rf::RFLU, A::CuSparseMatrixCSR)
# For CUSOLVERRF, we need to check if the sparsity pattern has changed
# This is a simplified check; a stricter version would also compare the structure itself
size(rf) != size(A) || nnz(rf.M) != nnz(A)
end
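
# A stricter variant (a sketch, not part of this extension) could compare the
# full CSR structure; `rowPtr`/`colVal` are the structure fields of
# CuSparseMatrixCSR, and `rf.M` is the matrix stored in the RFLU object:
function pattern_changed_strict(rf::RFLU, A::CuSparseMatrixCSR)
    size(rf) != size(A) || nnz(rf.M) != nnz(A) ||
        rf.M.rowPtr != A.rowPtr || rf.M.colVal != A.colVal
end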


end
5 changes: 3 additions & 2 deletions src/LinearSolve.jl
@@ -211,7 +211,7 @@ for alg in (:LUFactorization, :FastLUFactorization, :SVDFactorization,
:RFLUFactorization, :UMFPACKFactorization, :KLUFactorization, :SparspakFactorization,
:DiagonalFactorization, :CholeskyFactorization, :BunchKaufmanFactorization,
:CHOLMODFactorization, :LDLtFactorization, :AppleAccelerateLUFactorization,
-:MKLLUFactorization, :MetalLUFactorization)
+:MKLLUFactorization, :MetalLUFactorization, :CUSOLVERRFFactorization)
@eval needs_square_A(::$(alg)) = true
end

@@ -240,7 +240,8 @@ export LUFactorization, SVDFactorization, QRFactorization, GenericFactorization,
NormalCholeskyFactorization, NormalBunchKaufmanFactorization,
UMFPACKFactorization, KLUFactorization, FastLUFactorization, FastQRFactorization,
SparspakFactorization, DiagonalFactorization, CholeskyFactorization,
-BunchKaufmanFactorization, CHOLMODFactorization, LDLtFactorization
+BunchKaufmanFactorization, CHOLMODFactorization, LDLtFactorization,
+CUSOLVERRFFactorization

export LinearSolveFunction, DirectLdiv!

33 changes: 33 additions & 0 deletions src/extension_algs.jl
@@ -441,3 +441,36 @@ to avoid allocations and automatically offloads to the GPU.
struct MetalLUFactorization <: AbstractFactorization end

struct BLISLUFactorization <: AbstractFactorization end

"""
`CUSOLVERRFFactorization(; symbolic = :RF, reuse_symbolic = true)`

A GPU-accelerated sparse LU factorization using NVIDIA's cusolverRF library.
This solver is specifically designed for sparse matrices on CUDA GPUs and
provides high-performance factorization and solve capabilities.

## Keyword Arguments

- `symbolic`: The symbolic factorization method to use. Options are:
- `:RF` (default): Use cusolverRF's built-in symbolic analysis
- `:KLU`: Use KLU for symbolic analysis
- `reuse_symbolic`: Whether to reuse the symbolic factorization when the
sparsity pattern doesn't change (default: `true`)

!!! note
This solver requires CUSOLVERRF.jl to be loaded and only supports
`Float64` element types with `Int32` indices.
"""
struct CUSOLVERRFFactorization <: AbstractSparseFactorization
symbolic::Symbol
reuse_symbolic::Bool

function CUSOLVERRFFactorization(; symbolic::Symbol = :RF, reuse_symbolic::Bool = true)
ext = Base.get_extension(@__MODULE__, :LinearSolveCUSOLVERRFExt)
if ext === nothing
error("CUSOLVERRFFactorization requires that CUSOLVERRF.jl is loaded, i.e. `using CUSOLVERRF`")
else
return new{}(symbolic, reuse_symbolic)
end
end
end
2 changes: 2 additions & 0 deletions test/gpu/Project.toml
@@ -2,7 +2,9 @@
BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
7 changes: 7 additions & 0 deletions test/gpu/cuda.jl
@@ -106,3 +106,10 @@ end
prob = LinearProblem(A_gpu_csr, b_gpu)
sol = solve(prob)
end

# Include CUSOLVERRF tests if available
if Base.find_package("CUSOLVERRF") !== nothing
@testset "CUSOLVERRF" begin
include("cusolverrf.jl")
end
end
67 changes: 67 additions & 0 deletions test/gpu/cusolverrf.jl
@@ -0,0 +1,67 @@
using LinearSolve
using CUSOLVERRF
using CUDA
using SparseArrays
using LinearAlgebra
using Test

@testset "CUSOLVERRFFactorization" begin
# Skip tests if CUDA is not available
if !CUDA.functional()
@info "CUDA not available, skipping CUSOLVERRF tests"
return
end

# Test with a small sparse matrix
n = 100
A = sprand(n, n, 0.1) + I
b = rand(n)

# Test with CPU sparse matrix (should auto-convert to GPU)
@testset "CPU Sparse Matrix" begin
prob = LinearProblem(A, b)

# Test with default symbolic (:RF)
sol = solve(prob, CUSOLVERRFFactorization())
@test norm(A * sol.u - b) / norm(b) < 1e-10

# Test with KLU symbolic
sol_klu = solve(prob, CUSOLVERRFFactorization(symbolic = :KLU))
@test norm(A * sol_klu.u - b) / norm(b) < 1e-10
end

# Test with GPU sparse matrix
@testset "GPU Sparse Matrix" begin
A_gpu = CUDA.CUSPARSE.CuSparseMatrixCSR(A)
b_gpu = CuArray(b)

prob_gpu = LinearProblem(A_gpu, b_gpu)
sol_gpu = solve(prob_gpu, CUSOLVERRFFactorization())

# Check residual on GPU
res_gpu = A_gpu * sol_gpu.u - b_gpu
@test norm(res_gpu) / norm(b_gpu) < 1e-10
end

# Test matrix update where the values change (the perturbation may also add entries)
@testset "Matrix Update" begin
# Create a perturbed matrix with different values
A2 = A + 0.1 * sprand(n, n, 0.01)
b2 = rand(n)

prob2 = LinearProblem(A2, b2)
sol2 = solve(prob2, CUSOLVERRFFactorization(reuse_symbolic = true))
@test norm(A2 * sol2.u - b2) / norm(b2) < 1e-10
end

# Test error handling for unsupported types
@testset "Error Handling" begin
# Test with Float32 (not supported)
A_f32 = Float32.(A)
b_f32 = Float32.(b)
prob_f32 = LinearProblem(A_f32, b_f32)

# This should error since CUSOLVERRF only supports Float64
@test_throws Exception solve(prob_f32, CUSOLVERRFFactorization())
end
end
1 change: 1 addition & 0 deletions test/resolve.jl
@@ -11,6 +11,7 @@ for alg in vcat(InteractiveUtils.subtypes(AbstractDenseFactorization),
if !(alg in [
DiagonalFactorization,
CudaOffloadFactorization,
CUSOLVERRFFactorization,
AppleAccelerateLUFactorization,
MetalLUFactorization
]) &&