diff --git a/Project.toml b/Project.toml
index 69369115b..2a29bbaae 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,6 +33,7 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
+CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
 FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
@@ -54,6 +55,7 @@ LinearSolveBandedMatricesExt = "BandedMatrices"
 LinearSolveBlockDiagonalsExt = "BlockDiagonals"
 LinearSolveCUDAExt = "CUDA"
 LinearSolveCUDSSExt = "CUDSS"
+LinearSolveCUSOLVERRFExt = ["CUSOLVERRF", "SparseArrays"]
 LinearSolveEnzymeExt = "EnzymeCore"
 LinearSolveFastAlmostBandedMatricesExt = "FastAlmostBandedMatrices"
 LinearSolveFastLapackInterfaceExt = "FastLapackInterface"
@@ -77,6 +79,7 @@ BlockDiagonals = "0.1.42, 0.2"
 blis_jll = "0.9.0"
 CUDA = "5"
 CUDSS = "0.1, 0.2, 0.3, 0.4"
+CUSOLVERRF = "0.2.6"
 ChainRulesCore = "1.22"
 ConcreteStructs = "0.2.3"
 DocStringExtensions = "0.9.3"
diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md
index 02825b3fa..7bcb11a56 100644
--- a/docs/src/solvers/solvers.md
+++ b/docs/src/solvers/solvers.md
@@ -43,6 +43,28 @@ For sparse LU-factorizations, `KLUFactorization` if there is less structure
 to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
 Pardiso.jl's methods are also known to be very efficient sparse linear solvers.
 
+For GPU-accelerated sparse LU-factorizations, there are two high-performance options.
+When using `CuSparseMatrixCSR` arrays with CUDSS.jl loaded, `LUFactorization()` will
+automatically use NVIDIA's cuDSS library. Alternatively, `CUSOLVERRFFactorization`
+provides access to NVIDIA's cusolverRF library. Both offer significant speedups on
+CUDA-capable GPUs for large sparse matrices that benefit from GPU parallelization.
+Prefer cuDSS for `Float32` problems; `CUSOLVERRFFactorization` supports only `Float64`.
+
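+A minimal sketch of selecting each backend (assuming a functional CUDA GPU, with
+`A` a `SparseMatrixCSC{Float64}` and `b` a `Vector{Float64}`):
+
+```julia
+using LinearSolve, CUDA, CUDSS, CUSOLVERRF
+using CUDA.CUSPARSE: CuSparseMatrixCSR
+
+A_gpu = CuSparseMatrixCSR(A)
+b_gpu = CuArray(b)
+prob = LinearProblem(A_gpu, b_gpu)
+
+sol = solve(prob, LUFactorization())          # dispatches to cuDSS
+sol = solve(prob, CUSOLVERRFFactorization())  # uses cusolverRF
+```
+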
 While these sparse factorizations are based on implementations in other languages,
 and therefore constrained to standard number types (`Float64`, `Float32` and their
 complex counterparts), `SparspakFactorization` is able to handle general
@@ -219,7 +241,7 @@ LinearSolve.PardisoJL
 ```
 
 ### CUDA.jl
 
-Note that `CuArrays` are supported by `GenericFactorization` in the “normal” way.
+Note that `CuArrays` are supported by `GenericFactorization` in the "normal" way.
 The following are non-standard GPU factorization routines.
 
 !!! note
@@ -230,6 +252,16 @@ The following are non-standard GPU factorization routines.
 ```@docs
 CudaOffloadFactorization
 ```
 
+### CUSOLVERRF.jl
+
+!!! note
+
+    Using this solver requires adding the package CUSOLVERRF.jl, i.e. `using CUSOLVERRF`
+
+```@docs
+CUSOLVERRFFactorization
+```
+
 ### IterativeSolvers.jl
 
 !!! note
diff --git a/docs/src/tutorials/gpu.md b/docs/src/tutorials/gpu.md
index 9e11a3f3e..ee737668c 100644
--- a/docs/src/tutorials/gpu.md
+++ b/docs/src/tutorials/gpu.md
@@ -121,6 +121,43 @@ sol = LS.solve(prob, LS.LUFactorization())
 ```
 
 For now, CUDSS only supports CuSparseMatrixCSR type matrices.
+
+For high-performance sparse LU factorization on GPUs, you can also use CUSOLVERRF.jl:
+
+```julia
+using CUSOLVERRF
+sol = LS.solve(prob, LS.CUSOLVERRFFactorization())
+```
+
+CUSOLVERRF provides access to NVIDIA's cusolverRF library, which offers significant
+performance improvements for sparse LU factorization on GPUs. It supports both
+`:RF` (default) and `:KLU` symbolic factorization methods, and can reuse the
+symbolic factorization for matrices with the same sparsity pattern:
+
+```julia
+# Use KLU for the symbolic factorization
+sol = LS.solve(prob, LS.CUSOLVERRFFactorization(symbolic = :KLU))
+
+# Reuse the symbolic factorization for better performance
+sol = LS.solve(prob, LS.CUSOLVERRFFactorization(reuse_symbolic = true))
+```
+
+!!! note
+
+    CUSOLVERRF only supports `Float64` element types with `Int32` indices.
+
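+To benefit from `reuse_symbolic` across repeated solves, keep the factorization
+alive via the caching interface. A minimal sketch (assuming `A2` has the same
+sparsity pattern as the original matrix):
+
+```julia
+cache = LS.init(prob, LS.CUSOLVERRFFactorization(reuse_symbolic = true))
+sol = LS.solve!(cache)
+
+cache.A = A2  # new values, same sparsity pattern
+sol = LS.solve!(cache)  # numerical refactorization only; symbolic analysis is reused
+```
+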
 Note that `KrylovJL` methods also work with sparse GPU arrays:
 
 ```julia
diff --git a/ext/LinearSolveCUSOLVERRFExt.jl b/ext/LinearSolveCUSOLVERRFExt.jl
new file mode 100644
index 000000000..18c71d410
--- /dev/null
+++ b/ext/LinearSolveCUSOLVERRFExt.jl
@@ -0,0 +1,86 @@
+module LinearSolveCUSOLVERRFExt
+
+using LinearSolve: LinearSolve, @get_cacheval, pattern_changed, OperatorAssumptions
+using CUSOLVERRF: CUSOLVERRF, RFLU, CUDA
+using SparseArrays: SparseArrays, SparseMatrixCSC, nnz
+using CUSOLVERRF.CUDA.CUSPARSE: CuSparseMatrixCSR
+using LinearAlgebra: LinearAlgebra, Adjoint, ldiv!, lu!
+using SciMLBase: SciMLBase, LinearProblem, ReturnCode
+
+function LinearSolve.init_cacheval(alg::LinearSolve.CUSOLVERRFFactorization,
+        A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol,
+        verbose::Bool, assumptions::OperatorAssumptions)
+    nothing
+end
+
+function LinearSolve.init_cacheval(alg::LinearSolve.CUSOLVERRFFactorization,
+        A::Union{CuSparseMatrixCSR{Float64, Int32}, SparseMatrixCSC{Float64, <:Integer}},
+        b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol,
+        verbose::Bool, assumptions::OperatorAssumptions)
+    # Create the initial factorization, converting to the GPU format if needed
+    nrhs = b isa AbstractMatrix ? size(b, 2) : 1
+    A_gpu = A isa CuSparseMatrixCSR ? A : CuSparseMatrixCSR(A)
+    RFLU(A_gpu; nrhs = nrhs, symbolic = alg.symbolic)
+end
+
+function SciMLBase.solve!(cache::LinearSolve.LinearCache,
+        alg::LinearSolve.CUSOLVERRFFactorization; kwargs...)
+    A = cache.A
+
+    # Convert to the GPU sparse format if needed
+    if A isa SparseMatrixCSC
+        A_gpu = CuSparseMatrixCSR(A)
+    elseif A isa CuSparseMatrixCSR
+        A_gpu = A
+    else
+        error("CUSOLVERRFFactorization only supports SparseMatrixCSC or CuSparseMatrixCSR matrices")
+    end
+
+    if cache.isfresh
+        cacheval = @get_cacheval(cache, :CUSOLVERRFFactorization)
+        if cacheval === nothing
+            # No existing factorization: create one
+            nrhs = cache.b isa AbstractMatrix ? size(cache.b, 2) : 1
+            fact = RFLU(A_gpu; nrhs = nrhs, symbolic = alg.symbolic)
+        elseif alg.reuse_symbolic && !pattern_changed(cacheval, A_gpu)
+            # Same sparsity pattern: reuse the symbolic factorization and
+            # only redo the numerical factorization
+            fact = cacheval
+            lu!(fact, A_gpu)
+        else
+            # The pattern changed (or reuse is disabled): factorize from scratch
+            nrhs = cache.b isa AbstractMatrix ? size(cache.b, 2) : 1
+            fact = RFLU(A_gpu; nrhs = nrhs, symbolic = alg.symbolic)
+        end
+        cache.cacheval = fact
+        cache.isfresh = false
+    end
+
+    F = @get_cacheval(cache, :CUSOLVERRFFactorization)
+
+    # Ensure b and u live on the GPU
+    b_gpu = cache.b isa CUDA.CuArray ? cache.b : CUDA.CuArray(cache.b)
+    u_gpu = cache.u isa CUDA.CuArray ? cache.u : CUDA.CuArray(cache.u)
+
+    # Solve in place: ldiv! overwrites u_gpu with the solution
+    copyto!(u_gpu, b_gpu)
+    ldiv!(F, u_gpu)
+
+    # Copy back to the CPU if needed
+    if !(cache.u isa CUDA.CuArray)
+        copyto!(cache.u, u_gpu)
+    end
+
+    SciMLBase.build_linear_solution(alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
+end
+
+# Helper for deciding whether the symbolic factorization can be reused.
+# This is a simplified check based on the size and number of nonzeros;
+# a full check would compare the structure vectors themselves.
+function LinearSolve.pattern_changed(rf::RFLU, A::CuSparseMatrixCSR)
+    size(rf) != size(A) || nnz(rf.M) != nnz(A)
+end
+
+end
diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl
index d21103de0..bf24093b4 100644
--- a/src/LinearSolve.jl
+++ b/src/LinearSolve.jl
@@ -211,7 +211,7 @@ for alg in (:LUFactorization, :FastLUFactorization, :SVDFactorization,
             :RFLUFactorization, :UMFPACKFactorization, :KLUFactorization,
             :SparspakFactorization, :DiagonalFactorization, :CholeskyFactorization,
             :BunchKaufmanFactorization, :CHOLMODFactorization, :LDLtFactorization,
             :AppleAccelerateLUFactorization,
-            :MKLLUFactorization, :MetalLUFactorization)
+            :MKLLUFactorization, :MetalLUFactorization, :CUSOLVERRFFactorization)
     @eval needs_square_A(::$(alg)) = true
 end
@@ -240,7 +240,8 @@ export LUFactorization, SVDFactorization, QRFactorization, GenericFactorization,
        NormalCholeskyFactorization, NormalBunchKaufmanFactorization,
        UMFPACKFactorization, KLUFactorization, FastLUFactorization, FastQRFactorization,
        SparspakFactorization, DiagonalFactorization, CholeskyFactorization,
-       BunchKaufmanFactorization, CHOLMODFactorization, LDLtFactorization
+       BunchKaufmanFactorization, CHOLMODFactorization, LDLtFactorization,
+       CUSOLVERRFFactorization
 
 export LinearSolveFunction, DirectLdiv!
diff --git a/src/extension_algs.jl b/src/extension_algs.jl
index 952057a15..29d1be8fa 100644
--- a/src/extension_algs.jl
+++ b/src/extension_algs.jl
@@ -441,3 +441,47 @@ to avoid allocations and automatically offloads to the GPU.
 struct MetalLUFactorization <: AbstractFactorization end
 
 struct BLISLUFactorization <: AbstractFactorization end
+
+"""
+`CUSOLVERRFFactorization(; symbolic = :RF, reuse_symbolic = true)`
+
+A GPU-accelerated sparse LU factorization using NVIDIA's cusolverRF library.
+This solver is specifically designed for sparse matrices on CUDA GPUs and
+provides high-performance factorization and solve capabilities.
+
+## Keyword Arguments
+
+  - `symbolic`: The symbolic factorization method to use. Options are:
+      - `:RF` (default): Use cusolverRF's built-in symbolic analysis
+      - `:KLU`: Use KLU for the symbolic analysis
+  - `reuse_symbolic`: Whether to reuse the symbolic factorization when the
+    sparsity pattern doesn't change (default: `true`)
+
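+A hypothetical usage sketch (assuming a functional CUDA GPU, that CUSOLVERRF.jl
+is loaded, and that `A` is a `SparseMatrixCSC{Float64}`):
+
+```julia
+using LinearSolve, CUSOLVERRF
+
+prob = LinearProblem(A, b)
+sol = solve(prob, CUSOLVERRFFactorization(symbolic = :KLU, reuse_symbolic = true))
+```
+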
`using CUSOLVERRF`") + else + return new{}(symbolic, reuse_symbolic) + end + end +end diff --git a/test/gpu/Project.toml b/test/gpu/Project.toml index 4c304c0f2..914357037 100644 --- a/test/gpu/Project.toml +++ b/test/gpu/Project.toml @@ -2,7 +2,9 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e" +CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/gpu/cuda.jl b/test/gpu/cuda.jl index 8e0b5a3ab..74ee1ab8f 100644 --- a/test/gpu/cuda.jl +++ b/test/gpu/cuda.jl @@ -106,3 +106,10 @@ end prob = LinearProblem(A_gpu_csr, b_gpu) sol = solve(prob) end + +# Include CUSOLVERRF tests if available +if Base.find_package("CUSOLVERRF") !== nothing + @testset "CUSOLVERRF" begin + include("cusolverrf.jl") + end +end diff --git a/test/gpu/cusolverrf.jl b/test/gpu/cusolverrf.jl new file mode 100644 index 000000000..f5c774487 --- /dev/null +++ b/test/gpu/cusolverrf.jl @@ -0,0 +1,67 @@ +using LinearSolve +using CUSOLVERRF +using CUDA +using SparseArrays +using LinearAlgebra +using Test + +@testset "CUSOLVERRFFactorization" begin + # Skip tests if CUDA is not available + if !CUDA.functional() + @info "CUDA not available, skipping CUSOLVERRF tests" + return + end + + # Test with a small sparse matrix + n = 100 + A = sprand(n, n, 0.1) + I + b = rand(n) + + # Test with CPU sparse matrix (should auto-convert to GPU) + @testset "CPU Sparse Matrix" begin + prob = LinearProblem(A, b) + + # Test with default symbolic (:RF) + sol = solve(prob, CUSOLVERRFFactorization()) + @test norm(A * sol.u - b) / norm(b) < 1e-10 + + # Test with KLU symbolic + sol_klu = solve(prob, CUSOLVERRFFactorization(symbolic = :KLU)) + @test norm(A * sol_klu.u - b) / norm(b) < 1e-10 + end + + # Test with GPU sparse matrix + @testset "GPU Sparse Matrix" begin + A_gpu = CUDA.CUSPARSE.CuSparseMatrixCSR(A) + b_gpu = CuArray(b) + + prob_gpu = LinearProblem(A_gpu, b_gpu) + sol_gpu = solve(prob_gpu, CUSOLVERRFFactorization()) + + # Check residual on GPU + res_gpu = A_gpu * sol_gpu.u - b_gpu + @test norm(res_gpu) / norm(b_gpu) < 1e-10 + end + + # Test matrix update with same sparsity pattern + @testset "Matrix Update" begin + # Create a new matrix with same pattern but different values + A2 = A + 0.1 * sprand(n, n, 0.01) + b2 = rand(n) + + prob2 = LinearProblem(A2, b2) + sol2 = solve(prob2, CUSOLVERRFFactorization(reuse_symbolic = true)) + @test norm(A2 * sol2.u - b2) / norm(b2) < 1e-10 + end + + # Test error handling for unsupported types + @testset "Error Handling" begin + # Test with Float32 (not supported) + A_f32 = Float32.(A) + b_f32 = Float32.(b) + prob_f32 = LinearProblem(A_f32, b_f32) + + # This should error since CUSOLVERRF only supports Float64 + @test_throws Exception solve(prob_f32, CUSOLVERRFFactorization()) + end +end \ No newline at end of file diff --git a/test/resolve.jl b/test/resolve.jl index 2ea69a0c9..9efcc6921 100644 --- a/test/resolve.jl +++ b/test/resolve.jl @@ -11,6 +11,7 @@ for alg in vcat(InteractiveUtils.subtypes(AbstractDenseFactorization), if !(alg in [ DiagonalFactorization, CudaOffloadFactorization, + CUSOLVERRFFactorization, AppleAccelerateLUFactorization, MetalLUFactorization ]) &&