-
-
Notifications
You must be signed in to change notification settings - Fork 14.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
da17174
commit fed9d55
Showing
6 changed files
with
367 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,32 @@ | ||
# packages | ||
|
||
Packages which are not created by the manifest builder. | ||
|
||
## backendStdenv | ||
|
||
We have our own `stdenv` for two reasons: | ||
|
||
1. To ensure we use a compatible version of `gcc` with `nvcc`, and | ||
2. To use `mold` as our default linker. | ||
|
||
Elaborating on the second point: a non-trivial amount of time is spent *linking* when compiling CUDA code. The `mold` linker is much, *much* faster than the linkers GCC or LLVM use by default. As an example, on my machine with an i9-13900K and 96 GB of DDR5-6800 RAM, building CUTLASS 3.5.0 targeting `sm_75` gives the following total link times (in seconds, summed over all link steps), first with the default linker and then with `mold`: | ||
|
||
|
||
|
||
```console | ||
$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithDefaultLinker \ | ||
|& tee /dev/stderr \ | ||
| awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }' | ||
... | ||
8.39248 | ||
``` | ||
|
||
```console | ||
$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithMoldLinker \ | ||
|& tee /dev/stderr \ | ||
| awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }' | ||
... | ||
5.93172 | ||
``` | ||
|
||
TODO: Next build uses unity. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
31 changes: 31 additions & 0 deletions
31
pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
{ | ||
cudaVersion, | ||
data, | ||
lib, | ||
pkgs, | ||
stdenv, | ||
stdenvAdapters, | ||
}: | ||
# Exposed as cudaPackages.backendStdenv. | ||
# This is what nvcc uses as a backend, | ||
# and it has to be an officially supported one (e.g. gcc11 for cuda11). | ||
# | ||
# It, however, propagates the current stdenv's libstdc++ to avoid "GLIBCXX_* not found" errors | ||
# when linked with other C++ libraries. | ||
# E.g. for cudaPackages_11_8 we use gcc11 with gcc12's libstdc++ | ||
# Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context | ||
let | ||
gccMajorVersion = data.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion; | ||
cudaStdenv = stdenvAdapters.useMoldLinker (stdenvAdapters.useLibsFrom stdenv pkgs."gcc${gccMajorVersion}Stdenv"); | ||
passthruExtra = { | ||
# cudaPackages.backendStdenv.nixpkgsCompatibleLibstdcxx has been removed; | ||
# if you need it, you're likely doing something wrong. There has been a | ||
# warning here for a month or so. Now we can no longer return any | ||
# meaningful value in its place and drop the attribute entirely. | ||
}; | ||
assertCondition = true; | ||
in | ||
|
||
# TODO: Consider testing whether we in fact use the newer libstdc++ | ||
|
||
lib.extendDerivation assertCondition passthruExtra cudaStdenv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
{ | ||
addDriverRunpath, | ||
autoAddDriverRunpath, | ||
backendStdenv, | ||
cmake, | ||
cuda_cudart, | ||
cuda_nvcc, | ||
cuda_nvrtc, | ||
cudaMajorMinorVersion, | ||
cudaOlder, | ||
cudnn, | ||
cutlass, | ||
fetchFromGitHub, | ||
fetchpatch2, | ||
flags, | ||
gtest, | ||
lib, | ||
libcublas, | ||
libcurand, | ||
ninja, | ||
python3, | ||
# Options | ||
enableF16C ? false, | ||
enableTools ? false, | ||
# passthru.updateScript | ||
gitUpdater, | ||
}: | ||
let | ||
inherit (lib.lists) optionals; | ||
inherit (lib.strings) cmakeBool cmakeFeature optionalString; | ||
in | ||
# TODO: This can also be packaged for Python! | ||
backendStdenv.mkDerivation (finalAttrs: { | ||
name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}"; | ||
pname = "cutlass"; | ||
version = "3.5.0"; | ||
|
||
src = fetchFromGitHub { | ||
owner = "NVIDIA"; | ||
repo = "cutlass"; | ||
rev = "refs/tags/v${finalAttrs.version}"; | ||
hash = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4="; | ||
}; | ||
|
||
strictDeps = true; | ||
|
||
# TODO: As a header-only library, we should make sure we have an `include` directory or similar which is not a | ||
# superset of the `out` (`bin`) or `dev` outputs (which is what the multiple-outputs setup hook does by default). | ||
outputs = [ "out" ]; | ||
|
||
nativeBuildInputs = [ | ||
autoAddDriverRunpath | ||
cuda_nvcc | ||
cmake | ||
ninja | ||
python3 | ||
]; | ||
|
||
patches = | ||
# Enable use of the system googletest. | ||
# NOTE: Revisit these when 3.5.1 releases, as it may be included. | ||
[ | ||
(fetchpatch2 { | ||
name = "support-system-googletest.patch"; | ||
url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/60923063b39c22982e56c304e132e59b358f4814.patch"; | ||
hash = "sha256-q44SVpC7i91onsBDYKKBob5YqTPmXHRFgImnKpeVKpQ="; | ||
}) | ||
(fetchpatch2 { | ||
name = "create-working-directory-for-tests-explicitly.patch"; | ||
url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/1093c4f90df6083146681e3c39b0c53611978c5f.patch"; | ||
hash = "sha256-AYJsr+y5ntSWuV7+b/NRRY+fXE5SFN28BdZZqWXoElc="; | ||
}) | ||
]; | ||
|
||
postPatch = | ||
# Prepend some commands to the CUDA.cmake file so it can find the CUDA libraries using CMake's FindCUDAToolkit | ||
# module. These target names are used throughout the project; I (@connorbaker) did not choose them. | ||
'' | ||
mv ./CUDA.cmake ./_CUDA_Append.cmake | ||
cat > ./_CUDA_Prepend.cmake <<'EOF' | ||
find_package(CUDAToolkit REQUIRED) | ||
foreach(_target cudart cuda_driver nvrtc) | ||
if (NOT TARGET CUDA::''${_target}) | ||
message(FATAL_ERROR "''${_target} Not Found") | ||
endif() | ||
message(STATUS "''${_target} library: ''${CUDA_''${_target}_LIBRARY}") | ||
add_library(''${_target} ALIAS CUDA::''${_target}) | ||
endforeach() | ||
EOF | ||
cat ./_CUDA_Prepend.cmake ./_CUDA_Append.cmake > ./CUDA.cmake | ||
''; | ||
|
||
enableParallelBuilding = true; | ||
|
||
buildInputs = | ||
[ | ||
cuda_cudart | ||
cuda_nvrtc | ||
libcurand | ||
] | ||
++ optionals enableTools [ | ||
cudnn | ||
libcublas | ||
]; | ||
|
||
cmakeFlags = [ | ||
(cmakeFeature "CUTLASS_NVCC_ARCHS" flags.cmakeCudaArchitecturesString) | ||
(cmakeBool "CUTLASS_ENABLE_EXAMPLES" false) | ||
|
||
# Tests. | ||
(cmakeBool "CUTLASS_ENABLE_TESTS" finalAttrs.doCheck) | ||
(cmakeBool "CUTLASS_ENABLE_GTEST_UNIT_TESTS" finalAttrs.doCheck) | ||
(cmakeBool "CUTLASS_USE_SYSTEM_GOOGLETEST" true) | ||
|
||
# NOTE: Both CUDNN and CUBLAS can be used by the examples and the profiler. Since they are large dependencies, they | ||
# are disabled by default. | ||
(cmakeBool "CUTLASS_ENABLE_TOOLS" enableTools) | ||
(cmakeBool "CUTLASS_ENABLE_CUBLAS" enableTools) | ||
(cmakeBool "CUTLASS_ENABLE_CUDNN" enableTools) | ||
|
||
# NOTE: Requires x86_64 and hardware support. | ||
(cmakeBool "CUTLASS_ENABLE_F16C" enableF16C) | ||
|
||
# TODO: Unity builds are supposed to reduce build time, but this seems to just reduce the number of tasks | ||
# generated? | ||
# NOTE: Good explanation of unity builds: | ||
# https://www.methodpark.de/blog/how-to-speed-up-clang-tidy-with-unity-builds. | ||
(cmakeBool "CUTLASS_UNITY_BUILD_ENABLED" false) | ||
|
||
# NOTE: Can change the size of the executables | ||
(cmakeBool "CUTLASS_NVCC_EMBED_CUBIN" true) | ||
(cmakeBool "CUTLASS_NVCC_EMBED_PTX" true) | ||
]; | ||
|
||
doCheck = false; | ||
|
||
checkInputs = [ gtest ]; | ||
|
||
# NOTE: Because the test cases immediately create and try to run the binaries, we don't have an opportunity | ||
# to patch them with autoAddDriverRunpath. To get around this, we add the driver runpath to the environment. | ||
preCheck = optionalString finalAttrs.doCheck '' | ||
export LD_LIBRARY_PATH="$(readlink -mnv "${addDriverRunpath.driverLink}/lib")" | ||
''; | ||
|
||
# This is *not* a derivation you want to build on a small machine. | ||
requiredSystemFeatures = optionals finalAttrs.doCheck [ | ||
"big-parallel" | ||
"cuda" | ||
]; | ||
|
||
passthru = | ||
let | ||
cutlassShowLinkTimes = cutlass.overrideAttrs (prevAttrs: { | ||
# Set RULE_LAUNCH_LINK so CMake adds a message to the linker command to show the link times. | ||
postPatch = | ||
prevAttrs.postPatch | ||
+ '' | ||
sed -i '1i set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "''${CMAKE_COMMAND} -E time")' ./CUDA.cmake | ||
''; | ||
}); | ||
in | ||
{ | ||
updateScript = gitUpdater { | ||
inherit (finalAttrs) pname version; | ||
rev-prefix = "v"; | ||
}; | ||
# TODO: These can be removed. | ||
tests.withGpu = cutlass.overrideAttrs { doCheck = true; }; | ||
showLinkTimesWithDefaultLinker = cutlassShowLinkTimes; | ||
showLinkTimesWithMoldLinker = cutlassShowLinkTimes.override { | ||
backendStdenv = backendStdenv.withMoldLinker; | ||
}; | ||
}; | ||
|
||
meta = with lib; { | ||
description = "CUDA Templates for Linear Algebra Subroutines"; | ||
homepage = "https://github.com/NVIDIA/cutlass"; | ||
license = licenses.asl20; | ||
broken = cudaOlder "11.4"; | ||
platforms = [ | ||
"aarch64-linux" | ||
"x86_64-linux" | ||
]; | ||
maintainers = with maintainers; [ connorbaker ] ++ teams.cuda.members; | ||
}; | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
# NOTE: Though NCCL is called within the cudaPackages package set, we avoid passing in | ||
# the names of dependencies from that package set directly to avoid evaluation errors | ||
# in the case redistributable packages are not available. | ||
{ | ||
autoAddDriverRunpath, | ||
cudaAtLeast, | ||
cmake, | ||
ninja, | ||
flags, | ||
cutlass, | ||
cudaMajorMinorVersion, | ||
cudaOlder, | ||
backendStdenv, | ||
cuda_cccl, | ||
cuda_cudart, | ||
cuda_nvcc, | ||
cuquantum, | ||
cudatoolkit, | ||
libcutensor, | ||
fetchFromGitHub, | ||
rapids-cmake, | ||
lib, | ||
python3Packages, | ||
which, | ||
# passthru.updateScript | ||
gitUpdater, | ||
}: | ||
let | ||
inherit (lib.lists) optionals; | ||
inherit (lib.strings) cmakeFeature cmakeBool; | ||
|
||
pythonDeps = with python3Packages; [ | ||
pybind11 | ||
numpy | ||
cupy | ||
]; | ||
in | ||
backendStdenv.mkDerivation (finalAttrs: { | ||
name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}"; | ||
pname = "MatX"; | ||
version = "0.8.0"; | ||
|
||
src = fetchFromGitHub { | ||
owner = "NVIDIA"; | ||
repo = "MatX"; | ||
rev = "refs/tags/v${finalAttrs.version}"; | ||
hash = "sha256-7ygdfo27tXz0f6jz6RwDCGwSQun3HqZJ9o6w2vKGM3s="; | ||
}; | ||
|
||
strictDeps = true; | ||
|
||
nativeBuildInputs = [ | ||
rapids-cmake | ||
which | ||
autoAddDriverRunpath | ||
] ++ pythonDeps ++ [ cuda_nvcc ]; | ||
|
||
postPatch = | ||
# Remove the vendored rapids-cmake | ||
'' | ||
rm -rf cmake/rapids-cmake | ||
'' | ||
# Copy required components and update permissions | ||
+ rapids-cmake.passthru.utilities.copyToCmakeDir | ||
# Remove the GetPyBind11.cmake file | ||
+ '' | ||
substituteInPlace CMakeLists.txt \ | ||
--replace-fail \ | ||
"include(cmake/GetPyBind11.cmake)" \ | ||
"" | ||
''; | ||
|
||
buildInputs = [ | ||
cuda_cudart | ||
cuda_cccl | ||
libcutensor | ||
cutlass | ||
cuquantum # cutensorNet | ||
]; | ||
|
||
# TODO: This should be handled by setup hooks in rapids-cmake. | ||
cmakeFlags = rapids-cmake.passthru.data.cmakeFlags ++ [ | ||
(cmakeBool "MATX_BUILD_EXAMPLES" false) | ||
(cmakeBool "MATX_BUILD_TESTS" false) | ||
(cmakeBool "MATX_BUILD_BENCHMARKS" false) | ||
(cmakeBool "MATX_NVTX_FLAGS" true) | ||
(cmakeBool "MATX_BUILD_DOCS" false) | ||
(cmakeBool "MATX_BUILD_32_BIT" false) | ||
(cmakeBool "MATX_MULTI_GPU" false) # Requires Nvshmem? | ||
(cmakeBool "MATX_EN_VISUALIZATION" false) # TODO: Revisit | ||
(cmakeBool "MATX_EN_CUTLASS" false) # TODO: CUTLASS support is removed in main? | ||
(cmakeBool "MATX_EN_CUTENSOR" true) | ||
(cmakeBool "MATX_EN_FILEIO" true) | ||
(cmakeBool "MATX_EN_NVPL" false) # TODO: Revisit for ARM support | ||
(cmakeBool "MATX_DISABLE_CUB_CACHE" true) | ||
]; | ||
|
||
propagatedBuildInputs = pythonDeps; | ||
|
||
enableParallelBuilding = true; | ||
|
||
passthru.updateScript = gitUpdater { | ||
inherit (finalAttrs) pname version; | ||
rev-prefix = "v"; | ||
}; | ||
|
||
meta = with lib; { | ||
description = "An efficient C++17 GPU numerical computing library with Python-like syntax"; | ||
homepage = "https://nvidia.github.io/MatX"; | ||
broken = cudaOlder "11.4"; | ||
license = licenses.bsd3; | ||
platforms = [ | ||
"aarch64-linux" | ||
"x86_64-linux" | ||
]; | ||
maintainers = with maintainers; [ connorbaker ] ++ teams.cuda.members; | ||
}; | ||
}) |