From 21ac3aa52d3af18e97f584ade3f2854b0223f1bb Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Wed, 31 Jul 2024 22:36:26 +0000 Subject: [PATCH] wip --- pkgs/development/cuda-modules/aliases.nix | 2 +- .../cuda-modules/packages/README.md | 29 +++ .../cuda-modules/packages/backendStdenv.nix | 8 +- .../packages/backendStdenvWithMoldLinker.nix | 31 +++ .../cuda-modules/packages/cutlass.nix | 186 ++++++++++++++++++ .../cuda-modules/packages/matx.nix | 118 +++++++++++ 6 files changed, 367 insertions(+), 7 deletions(-) create mode 100644 pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix create mode 100644 pkgs/development/cuda-modules/packages/cutlass.nix create mode 100644 pkgs/development/cuda-modules/packages/matx.nix diff --git a/pkgs/development/cuda-modules/aliases.nix b/pkgs/development/cuda-modules/aliases.nix index 998ef385106a45f..6ccb7f2be4dad8d 100644 --- a/pkgs/development/cuda-modules/aliases.nix +++ b/pkgs/development/cuda-modules/aliases.nix @@ -1,7 +1,7 @@ # Packages which have been deprecated or removed from cudaPackages final: let - inherit (final.lib.attrsets) mapAttrs; + inherit (builtins) mapAttrs; inherit (final.lib.trivial) warn; mkRenamed = diff --git a/pkgs/development/cuda-modules/packages/README.md b/pkgs/development/cuda-modules/packages/README.md index e0ce580de34868b..07912db18f40ff8 100644 --- a/pkgs/development/cuda-modules/packages/README.md +++ b/pkgs/development/cuda-modules/packages/README.md @@ -1,3 +1,32 @@ # packages Packages which are not created by the manifest builder. + +## backendStdenv + +We have our own `stdenv` for two reasons: + +1. To ensure we use a compatible version of `gcc` with `nvcc`, and +2. To use `mold` as our default linker. + +Elaborating on the second point: a non-trivial amount of time is spent *linking* when compiling CUDA code. The `mold` linker is much, *much* faster than the linkers GCC or LLVM use by default. 
As an example, on my machine with an i9-13900K and 96 GB of DDR5-6800 RAM, building CUTLASS 3.5.0 targeting `sm_75`: + + + +```console +$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithDefaultLinker \ + |& tee /dev/stderr \ + | awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }' +... +8.39248 +``` + +```console +$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithMoldLinker \ + |& tee /dev/stderr \ + | awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }' +... +5.93172 +``` + +TODO: Next build uses unity. \ No newline at end of file diff --git a/pkgs/development/cuda-modules/packages/backendStdenv.nix b/pkgs/development/cuda-modules/packages/backendStdenv.nix index 7d25241f4dc2d15..fe24dafc4c48f22 100644 --- a/pkgs/development/cuda-modules/packages/backendStdenv.nix +++ b/pkgs/development/cuda-modules/packages/backendStdenv.nix @@ -16,13 +16,9 @@ # Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context let gccMajorVersion = data.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion; + # TODO(@connorbaker): Get numbers for why we should use stdenvAdapters.useMoldLinker. cudaStdenv = stdenvAdapters.useLibsFrom stdenv pkgs."gcc${gccMajorVersion}Stdenv"; - passthruExtra = { - # cudaPackages.backendStdenv.nixpkgsCompatibleLibstdcxx has been removed, - # if you need it you're likely doing something wrong. There has been a - # warning here for a month or so. Now we can no longer return any - # meaningful value in its place and drop the attribute entirely. 
- }; + passthruExtra.withMoldLinker = stdenvAdapters.useMoldLinker cudaStdenv; assertCondition = true; in diff --git a/pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix b/pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix new file mode 100644 index 000000000000000..d90314a67114b13 --- /dev/null +++ b/pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix @@ -0,0 +1,31 @@ +{ + cudaVersion, + data, + lib, + pkgs, + stdenv, + stdenvAdapters, +}: +# Variant of cudaPackages.backendStdenv that links with mold (stdenvAdapters.useMoldLinker). +# This is what nvcc uses as a backend, +# and it has to be an officially supported one (e.g. gcc11 for cuda11). +# +# It, however, propagates current stdenv's libstdc++ to avoid "GLIBCXX_* not found errors" +# when linked with other C++ libraries. +# E.g. for cudaPackages_11_8 we use gcc11 with gcc12's libstdc++ +# Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context +let + gccMajorVersion = data.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion; + cudaStdenv = stdenvAdapters.useMoldLinker (stdenvAdapters.useLibsFrom stdenv pkgs."gcc${gccMajorVersion}Stdenv"); + passthruExtra = { + # cudaPackages.backendStdenv.nixpkgsCompatibleLibstdcxx has been removed, + # if you need it you're likely doing something wrong. There has been a + # warning here for a month or so. Now we can no longer return any + # meaningful value in its place and drop the attribute entirely. 
+ }; + assertCondition = true; +in + +# TODO: Consider testing whether we in fact use the newer libstdc++ + +lib.extendDerivation assertCondition passthruExtra cudaStdenv diff --git a/pkgs/development/cuda-modules/packages/cutlass.nix b/pkgs/development/cuda-modules/packages/cutlass.nix new file mode 100644 index 000000000000000..dcc5d285a7ba6a2 --- /dev/null +++ b/pkgs/development/cuda-modules/packages/cutlass.nix @@ -0,0 +1,186 @@ +{ + addDriverRunpath, + autoAddDriverRunpath, + backendStdenv, + cmake, + cuda_cudart, + cuda_nvcc, + cuda_nvrtc, + cudaMajorMinorVersion, + cudaOlder, + cudnn, + cutlass, + fetchFromGitHub, + fetchpatch2, + flags, + gtest, + lib, + libcublas, + libcurand, + ninja, + python3, + # Options + enableF16C ? false, + enableTools ? false, + # passthru.updateScript + gitUpdater, +}: +let + inherit (lib.lists) optionals; + inherit (lib.strings) cmakeBool cmakeFeature optionalString; +in +# TODO: This can also be packaged for Python! +backendStdenv.mkDerivation (finalAttrs: { + name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}"; + pname = "cutlass"; + version = "3.5.0"; + + src = fetchFromGitHub { + owner = "NVIDIA"; + repo = "cutlass"; + rev = "refs/tags/v${finalAttrs.version}"; + hash = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4="; + }; + + strictDeps = true; + + # TODO: As a header-only library, we should make sure we have an `include` directory or similar which is not a + # superset of the `out` (`bin`) or `dev` outputs (which is what the multiple-outputs setup hook does by default). + outputs = [ "out" ]; + + nativeBuildInputs = [ + autoAddDriverRunpath + cuda_nvcc + cmake + ninja + python3 + ]; + + patches = + # Enable use of the system googletest. + # NOTE: Revisit these when 3.5.1 releases, as it may be included. 
+ [ + (fetchpatch2 { + name = "support-system-googletest.patch"; + url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/60923063b39c22982e56c304e132e59b358f4814.patch"; + hash = "sha256-q44SVpC7i91onsBDYKKBob5YqTPmXHRFgImnKpeVKpQ="; + }) + (fetchpatch2 { + name = "create-working-directory-for-tests-explicitly.patch"; + url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/1093c4f90df6083146681e3c39b0c53611978c5f.patch"; + hash = "sha256-AYJsr+y5ntSWuV7+b/NRRY+fXE5SFN28BdZZqWXoElc="; + }) + ]; + + postPatch = + # Prepend some commands to the CUDA.cmake file so it can find the CUDA libraries using CMake's FindCUDAToolkit + # module. These target names are used throughout the project; I (@connorbaker) did not choose them. + '' + mv ./CUDA.cmake ./_CUDA_Append.cmake + cat > ./_CUDA_Prepend.cmake <<'EOF' + find_package(CUDAToolkit REQUIRED) + foreach(_target cudart cuda_driver nvrtc) + if (NOT TARGET CUDA::''${_target}) + message(FATAL_ERROR "''${_target} Not Found") + endif() + message(STATUS "''${_target} library: ''${CUDA_''${_target}_LIBRARY}") + add_library(''${_target} ALIAS CUDA::''${_target}) + endforeach() + EOF + cat ./_CUDA_Prepend.cmake ./_CUDA_Append.cmake > ./CUDA.cmake + ''; + + enableParallelBuilding = true; + + buildInputs = + [ + cuda_cudart + cuda_nvrtc + libcurand + ] + ++ optionals enableTools [ + cudnn + libcublas + ]; + + cmakeFlags = [ + (cmakeFeature "CUTLASS_NVCC_ARCHS" flags.cmakeCudaArchitecturesString) + (cmakeBool "CUTLASS_ENABLE_EXAMPLES" false) + + # Tests. + (cmakeBool "CUTLASS_ENABLE_TESTS" finalAttrs.doCheck) + (cmakeBool "CUTLASS_ENABLE_GTEST_UNIT_TESTS" finalAttrs.doCheck) + (cmakeBool "CUTLASS_USE_SYSTEM_GOOGLETEST" true) + + # NOTE: Both CUDNN and CUBLAS can be used by the examples and the profiler. Since they are large dependencies, they + # are disabled by default. 
+ (cmakeBool "CUTLASS_ENABLE_TOOLS" enableTools) + (cmakeBool "CUTLASS_ENABLE_CUBLAS" enableTools) + (cmakeBool "CUTLASS_ENABLE_CUDNN" enableTools) + + # NOTE: Requires x86_64 and hardware support. + (cmakeBool "CUTLASS_ENABLE_F16C" enableF16C) + + # TODO: Unity builds are supposed to reduce build time, but this seems to just reduce the number of tasks + # generated? + # NOTE: Good explanation of unity builds: + # https://www.methodpark.de/blog/how-to-speed-up-clang-tidy-with-unity-builds. + (cmakeBool "CUTLASS_UNITY_BUILD_ENABLED" false) + + # NOTE: Can change the size of the executables + (cmakeBool "CUTLASS_NVCC_EMBED_CUBIN" true) + (cmakeBool "CUTLASS_NVCC_EMBED_PTX" true) + ]; + + doCheck = false; + + checkInputs = [ gtest ]; + + # NOTE: Because the test cases immediately create and try to run the binaries, we don't have an opportunity + # to patch them with autoAddDriverRunpath. To get around this, we add the driver runpath to the environment. + preCheck = optionalString finalAttrs.doCheck '' + export LD_LIBRARY_PATH="$(readlink -mnv "${addDriverRunpath.driverLink}/lib")" + ''; + + # This is *not* a derivation you want to build on a small machine. + requiredSystemFeatures = optionals finalAttrs.doCheck [ + "big-parallel" + "cuda" + ]; + + passthru = + let + cutlassShowLinkTimes = cutlass.overrideAttrs (prevAttrs: { + # Set RULE_LAUNCH_LINK so CMake adds a message to the linker command to show the link times. + postPatch = + prevAttrs.postPatch + + '' + sed -i '1i set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "''${CMAKE_COMMAND} -E time")' ./CUDA.cmake + ''; + }); + in + { + updateScript = gitUpdater { + inherit (finalAttrs) pname version; + rev-prefix = "v"; + }; + # TODO: These can be removed. 
+ tests.withGpu = cutlass.overrideAttrs { doCheck = true; }; + showLinkTimesWithDefaultLinker = cutlassShowLinkTimes; + showLinkTimesWithMoldLinker = cutlassShowLinkTimes.override { + backendStdenv = backendStdenv.withMoldLinker; + }; + }; + + meta = with lib; { + description = "CUDA Templates for Linear Algebra Subroutines"; + homepage = "https://github.com/NVIDIA/cutlass"; + license = licenses.asl20; + broken = cudaOlder "11.4"; + platforms = [ + "aarch64-linux" + "x86_64-linux" + ]; + maintainers = with maintainers; [ connorbaker ] ++ teams.cuda.members; + }; +}) diff --git a/pkgs/development/cuda-modules/packages/matx.nix b/pkgs/development/cuda-modules/packages/matx.nix new file mode 100644 index 000000000000000..5e66dbc4acd550e --- /dev/null +++ b/pkgs/development/cuda-modules/packages/matx.nix @@ -0,0 +1,118 @@ +# NOTE(review): This note originally referred to NCCL and says dependencies from the cudaPackages +# package set are not passed in by name; this file does take them by name (cuda_cccl, cuda_cudart, +# cuda_nvcc, ...), so confirm whether the evaluation-error concern applies to MatX. 
+{ + autoAddDriverRunpath, + cudaAtLeast, + cmake, + ninja, + flags, + cutlass, + cudaMajorMinorVersion, + cudaOlder, + backendStdenv, + cuda_cccl, + cuda_cudart, + cuda_nvcc, + cuquantum, + cudatoolkit, + libcutensor, + fetchFromGitHub, + rapids-cmake, + lib, + python3Packages, + which, + # passthru.updateScript + gitUpdater, +}: +let + inherit (lib.lists) optionals; + inherit (lib.strings) cmakeFeature cmakeBool; + + pythonDeps = with python3Packages; [ + pybind11 + numpy + cupy + ]; +in +backendStdenv.mkDerivation (finalAttrs: { + name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}"; + pname = "MatX"; + version = "0.8.0"; + + src = fetchFromGitHub { + owner = "NVIDIA"; + repo = "MatX"; + rev = "refs/tags/v${finalAttrs.version}"; + hash = "sha256-7ygdfo27tXz0f6jz6RwDCGwSQun3HqZJ9o6w2vKGM3s="; + }; + + strictDeps = true; + + nativeBuildInputs = [ + rapids-cmake + which + autoAddDriverRunpath + ] ++ pythonDeps ++ [ cuda_nvcc ]; + + postPatch = + # Remove the vendored rapids-cmake + '' + rm -rf cmake/rapids-cmake + '' + # Copy required components and update permissions + + rapids-cmake.passthru.utilities.copyToCmakeDir + # Remove the GetPyBind11.cmake file + + '' + substituteInPlace CMakeLists.txt \ + --replace-fail \ + "include(cmake/GetPyBind11.cmake)" \ + "" + ''; + + buildInputs = [ + cuda_cudart + cuda_cccl + libcutensor + cutlass + cuquantum # cutensorNet + ]; + + # TODO: This should be handled by setup hooks in rapids-cmake. + cmakeFlags = rapids-cmake.passthru.data.cmakeFlags ++ [ + (cmakeBool "MATX_BUILD_EXAMPLES" false) + (cmakeBool "MATX_BUILD_TESTS" false) + (cmakeBool "MATX_BUILD_BENCHMARKS" false) + (cmakeBool "MATX_NVTX_FLAGS" true) + (cmakeBool "MATX_BUILD_DOCS" false) + (cmakeBool "MATX_BUILD_32_BIT" false) + (cmakeBool "MATX_MULTI_GPU" false) # Requires Nvshmem? + (cmakeBool "MATX_EN_VISUALIZATION" false) # TODO: Revisit + (cmakeBool "MATX_EN_CUTLASS" false) # TODO: CUTLASS support is removed in main? 
+ (cmakeBool "MATX_EN_CUTENSOR" true) + (cmakeBool "MATX_EN_FILEIO" true) + (cmakeBool "MATX_EN_NVPL" false) # TODO: Revisit for ARM support + (cmakeBool "MATX_DISABLE_CUB_CACHE" true) + ]; + + propagatedBuildInputs = pythonDeps; + + enableParallelBuilding = true; + + passthru.updateScript = gitUpdater { + inherit (finalAttrs) pname version; + rev-prefix = "v"; + }; + + meta = with lib; { + description = "An efficient C++17 GPU numerical computing library with Python-like syntax"; + homepage = "https://nvidia.github.io/MatX"; + broken = cudaOlder "11.4"; + license = licenses.bsd3; + platforms = [ + "aarch64-linux" + "x86_64-linux" + ]; + maintainers = with maintainers; [ connorbaker ] ++ teams.cuda.members; + }; +})