Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Connor Baker authored and ConnorBaker committed Sep 13, 2024
1 parent da17174 commit fed9d55
Show file tree
Hide file tree
Showing 6 changed files with 367 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pkgs/development/cuda-modules/aliases.nix
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Packages which have been deprecated or removed from cudaPackages
final:
let
inherit (final.lib.attrsets) mapAttrs;
inherit (builtins) mapAttrs;
inherit (final.lib.trivial) warn;

mkRenamed =
Expand Down
29 changes: 29 additions & 0 deletions pkgs/development/cuda-modules/packages/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
# packages

Packages which are not created by the manifest builder.

## backendStdenv

We have our own `stdenv` for two reasons:

1. To ensure we use a compatible version of `gcc` with `nvcc`, and
2. To use `mold` as our default linker.

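Elaborating on the second point: a non-trivial amount of time is spent *linking* when compiling CUDA code. The `mold` linker is much, *much* faster than the linkers GCC or LLVM use by default. As an example, on my machine with an i9-13900K and 96 GB of DDR5-6800 RAM, building CUTLASS 3.5.0 targeting `sm_75` spends about 8.4 seconds in total on linking with the default linker (first listing below) versus about 5.9 seconds with `mold` (second listing), roughly 29% less: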



```console
$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithDefaultLinker \
|& tee /dev/stderr \
| awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }'
...
8.39248
```

```console
$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithMoldLinker \
|& tee /dev/stderr \
| awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }'
...
5.93172
```

TODO: Repeat these measurements with a unity build of CUTLASS.
8 changes: 2 additions & 6 deletions pkgs/development/cuda-modules/packages/backendStdenv.nix
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,9 @@
# Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context
let
gccMajorVersion = data.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion;
# TODO(@connorbaker): Get numbers for why we should use stdenvAdapters.useMoldLinker.
cudaStdenv = stdenvAdapters.useLibsFrom stdenv pkgs."gcc${gccMajorVersion}Stdenv";
passthruExtra = {
# cudaPackages.backendStdenv.nixpkgsCompatibleLibstdcxx has been removed,
# if you need it you're likely doing something wrong. There has been a
# warning here for a month or so. Now we can no longer return any
# meaningful value in its place and drop the attribute entirely.
};
passthruExtra.withMoldLinker = stdenvAdapters.useMoldLinker cudaStdenv;
assertCondition = true;
in

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
  cudaVersion,
  data,
  lib,
  pkgs,
  stdenv,
  stdenvAdapters,
}:
# Exposed as cudaPackages.backendStdenv.
#
# nvcc needs a host compiler it officially supports (e.g. gcc11 for cuda11), so the compiler comes from
# the matching `gcc<N>Stdenv`. The libraries, however, are taken from the current stdenv so that its
# libstdc++ is propagated; otherwise linking against other C++ libraries fails with
# "GLIBCXX_* not found" errors. E.g. cudaPackages_11_8 uses gcc11 together with gcc12's libstdc++.
# Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context.
#
# The resulting stdenv is additionally wrapped so that it links with mold.
let
  inherit (stdenvAdapters) useLibsFrom useMoldLinker;

  # GCC major version recorded for this CUDA release in the nvcc compatibility table.
  nvccGccMajor = data.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion;

  # Stdenv providing the nvcc-compatible compiler.
  nvccHostStdenv = pkgs."gcc${nvccGccMajor}Stdenv";

  # Compiler from `nvccHostStdenv`, libraries from the current `stdenv`.
  mixedStdenv = useLibsFrom stdenv nvccHostStdenv;
in

# TODO: Consider testing whether we in fact use the newer libstdc++

# No assertion is attached (the condition is always true) and no extra passthru attributes are added:
# cudaPackages.backendStdenv.nixpkgsCompatibleLibstdcxx has been removed, if you need it you're likely
# doing something wrong. There has been a warning here for a month or so. Now we can no longer return
# any meaningful value in its place and drop the attribute entirely.
lib.extendDerivation true { } (useMoldLinker mixedStdenv)
186 changes: 186 additions & 0 deletions pkgs/development/cuda-modules/packages/cutlass.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# CUTLASS: NVIDIA's CUDA Templates for Linear Algebra Subroutines.
#
# Built with the CUDA backend stdenv. Tests are disabled by default; `passthru.tests.withGpu` enables
# them. `passthru.showLinkTimesWith*Linker` are variants which print per-link timings.
{
  addDriverRunpath,
  autoAddDriverRunpath,
  backendStdenv,
  cmake,
  cuda_cudart,
  cuda_nvcc,
  cuda_nvrtc,
  cudaMajorMinorVersion,
  cudaOlder,
  cudnn,
  cutlass,
  fetchFromGitHub,
  fetchpatch2,
  flags,
  gtest,
  lib,
  libcublas,
  libcurand,
  ninja,
  python3,
  # Options
  enableF16C ? false,
  enableTools ? false,
  # passthru.updateScript
  gitUpdater,
}:
let
  inherit (lib.lists) optionals;
  inherit (lib.strings) cmakeBool cmakeFeature optionalString;
in
# TODO: This can also be packaged for Python!
backendStdenv.mkDerivation (finalAttrs: {
  name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}";
  pname = "cutlass";
  version = "3.5.0";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    rev = "refs/tags/v${finalAttrs.version}";
    hash = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
  };

  strictDeps = true;

  # TODO: As a header-only library, we should make sure we have an `include` directory or similar which is not a
  # superset of the `out` (`bin`) or `dev` outputs (which is what the multiple-outputs setup hook does by default).
  outputs = [ "out" ];

  nativeBuildInputs = [
    autoAddDriverRunpath
    cuda_nvcc
    cmake
    ninja
    python3
  ];

  patches =
    # Enable use of the system googletest.
    # NOTE: Revisit these when 3.5.1 releases, as it may be included.
    [
      (fetchpatch2 {
        name = "support-system-googletest.patch";
        url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/60923063b39c22982e56c304e132e59b358f4814.patch";
        hash = "sha256-q44SVpC7i91onsBDYKKBob5YqTPmXHRFgImnKpeVKpQ=";
      })
      (fetchpatch2 {
        name = "create-working-directory-for-tests-explicitly.patch";
        url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/1093c4f90df6083146681e3c39b0c53611978c5f.patch";
        hash = "sha256-AYJsr+y5ntSWuV7+b/NRRY+fXE5SFN28BdZZqWXoElc=";
      })
    ];

  postPatch =
    # Prepend some commands to the CUDA.cmake file so it can find the CUDA libraries using CMake's FindCUDAToolkit
    # module. These target names are used throughout the project; I (@connorbaker) did not choose them.
    # The heredoc delimiter is quoted so the shell does not expand the CMake `${...}` references; `''${` is the Nix
    # escape which keeps them literal in this string.
    ''
      mv ./CUDA.cmake ./_CUDA_Append.cmake
      cat > ./_CUDA_Prepend.cmake <<'EOF'
      find_package(CUDAToolkit REQUIRED)
      foreach(_target cudart cuda_driver nvrtc)
        if (NOT TARGET CUDA::''${_target})
          message(FATAL_ERROR "''${_target} Not Found")
        endif()
        message(STATUS "''${_target} library: ''${CUDA_''${_target}_LIBRARY}")
        add_library(''${_target} ALIAS CUDA::''${_target})
      endforeach()
      EOF
      cat ./_CUDA_Prepend.cmake ./_CUDA_Append.cmake > ./CUDA.cmake
    '';

  enableParallelBuilding = true;

  buildInputs =
    [
      cuda_cudart
      cuda_nvrtc
      libcurand
    ]
    ++ optionals enableTools [
      cudnn
      libcublas
    ];

  cmakeFlags = [
    (cmakeFeature "CUTLASS_NVCC_ARCHS" flags.cmakeCudaArchitecturesString)
    (cmakeBool "CUTLASS_ENABLE_EXAMPLES" false)

    # Tests.
    (cmakeBool "CUTLASS_ENABLE_TESTS" finalAttrs.doCheck)
    (cmakeBool "CUTLASS_ENABLE_GTEST_UNIT_TESTS" finalAttrs.doCheck)
    (cmakeBool "CUTLASS_USE_SYSTEM_GOOGLETEST" true)

    # NOTE: Both CUDNN and CUBLAS can be used by the examples and the profiler. Since they are large dependencies, they
    # are disabled by default.
    (cmakeBool "CUTLASS_ENABLE_TOOLS" enableTools)
    (cmakeBool "CUTLASS_ENABLE_CUBLAS" enableTools)
    (cmakeBool "CUTLASS_ENABLE_CUDNN" enableTools)

    # NOTE: Requires x86_64 and hardware support.
    (cmakeBool "CUTLASS_ENABLE_F16C" enableF16C)

    # TODO: Unity builds are supposed to reduce build time, but this seems to just reduce the number of tasks
    # generated?
    # NOTE: Good explanation of unity builds:
    # https://www.methodpark.de/blog/how-to-speed-up-clang-tidy-with-unity-builds.
    (cmakeBool "CUTLASS_UNITY_BUILD_ENABLED" false)

    # NOTE: Can change the size of the executables
    (cmakeBool "CUTLASS_NVCC_EMBED_CUBIN" true)
    (cmakeBool "CUTLASS_NVCC_EMBED_PTX" true)
  ];

  # Tests are off by default; the test-related CMake flags, `preCheck` and `requiredSystemFeatures` all follow
  # `finalAttrs.doCheck`, so overriding `doCheck` alone is enough to enable them (see `passthru.tests.withGpu`).
  doCheck = false;

  checkInputs = [ gtest ];

  # NOTE: Because the test cases immediately create and try to run the binaries, we don't have an opportunity
  # to patch them with autoAddDriverRunpath. To get around this, we add the driver runpath to the environment.
  preCheck = optionalString finalAttrs.doCheck ''
    export LD_LIBRARY_PATH="$(readlink -mnv "${addDriverRunpath.driverLink}/lib")"
  '';

  # This is *not* a derivation you want to build on a small machine.
  requiredSystemFeatures = optionals finalAttrs.doCheck [
    "big-parallel"
    "cuda"
  ];

  passthru =
    let
      # Variant of this package which reports how long each link step takes.
      cutlassShowLinkTimes = cutlass.overrideAttrs (prevAttrs: {
        # Set RULE_LAUNCH_LINK so CMake adds a message to the linker command to show the link times.
        postPatch =
          prevAttrs.postPatch
          + ''
            sed -i '1i set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "''${CMAKE_COMMAND} -E time")' ./CUDA.cmake
          '';
      });
    in
    {
      updateScript = gitUpdater {
        inherit (finalAttrs) pname version;
        rev-prefix = "v";
      };
      # TODO: These can be removed.
      tests.withGpu = cutlass.overrideAttrs { doCheck = true; };
      showLinkTimesWithDefaultLinker = cutlassShowLinkTimes;
      showLinkTimesWithMoldLinker = cutlassShowLinkTimes.override {
        backendStdenv = backendStdenv.withMoldLinker;
      };
    };

  # Attribute paths are spelled out instead of using nested `with lib; ... with maintainers;`, so that a
  # maintainer handle can never shadow `teams`.
  meta = {
    description = "CUDA Templates for Linear Algebra Subroutines";
    homepage = "https://github.com/NVIDIA/cutlass";
    # CUTLASS is distributed under the BSD 3-Clause license (see LICENSE.txt upstream), not Apache-2.0.
    license = lib.licenses.bsd3;
    broken = cudaOlder "11.4";
    platforms = [
      "aarch64-linux"
      "x86_64-linux"
    ];
    maintainers = [ lib.maintainers.connorbaker ] ++ lib.teams.cuda.members;
  };
})
118 changes: 118 additions & 0 deletions pkgs/development/cuda-modules/packages/matx.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# MatX: NVIDIA's C++17 GPU numerical computing library.
#
# Built with the CUDA backend stdenv against members of the cudaPackages package set, which are passed in
# directly as arguments below.
#
# NOTE(review): `cudaAtLeast`, `cmake`, `ninja`, `flags` and `cudatoolkit` are accepted but not referenced in
# this expression. They are kept so existing callers and overrides continue to work. In particular,
# `cmakeFlags` is set while `cmake`/`ninja` are absent from `nativeBuildInputs` -- presumably `rapids-cmake`
# propagates them; confirm, and otherwise add them to `nativeBuildInputs`.
{
  autoAddDriverRunpath,
  cudaAtLeast,
  cmake,
  ninja,
  flags,
  cutlass,
  cudaMajorMinorVersion,
  cudaOlder,
  backendStdenv,
  cuda_cccl,
  cuda_cudart,
  cuda_nvcc,
  cuquantum,
  cudatoolkit,
  libcutensor,
  fetchFromGitHub,
  rapids-cmake,
  lib,
  python3Packages,
  which,
  # passthru.updateScript
  gitUpdater,
}:
let
  inherit (lib.strings) cmakeBool;

  # Python packages needed both at build time and by dependents (see `nativeBuildInputs` and
  # `propagatedBuildInputs`).
  pythonDeps = with python3Packages; [
    pybind11
    numpy
    cupy
  ];
in
backendStdenv.mkDerivation (finalAttrs: {
  name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}";
  pname = "MatX";
  version = "0.8.0";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "MatX";
    rev = "refs/tags/v${finalAttrs.version}";
    hash = "sha256-7ygdfo27tXz0f6jz6RwDCGwSQun3HqZJ9o6w2vKGM3s=";
  };

  strictDeps = true;

  nativeBuildInputs = [
    rapids-cmake
    which
    autoAddDriverRunpath
  ] ++ pythonDeps ++ [ cuda_nvcc ];

  postPatch =
    # Remove the vendored rapids-cmake
    ''
      rm -rf cmake/rapids-cmake
    ''
    # Copy required components and update permissions
    + rapids-cmake.passthru.utilities.copyToCmakeDir
    # Drop the include of GetPyBind11.cmake from CMakeLists.txt; pybind11 is supplied via `pythonDeps`.
    + ''
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          "include(cmake/GetPyBind11.cmake)" \
          ""
    '';

  buildInputs = [
    cuda_cudart
    cuda_cccl
    libcutensor
    cutlass
    cuquantum # cutensorNet
  ];

  # TODO: This should be handled by setup hooks in rapids-cmake.
  cmakeFlags = rapids-cmake.passthru.data.cmakeFlags ++ [
    (cmakeBool "MATX_BUILD_EXAMPLES" false)
    (cmakeBool "MATX_BUILD_TESTS" false)
    (cmakeBool "MATX_BUILD_BENCHMARKS" false)
    (cmakeBool "MATX_NVTX_FLAGS" true)
    (cmakeBool "MATX_BUILD_DOCS" false)
    (cmakeBool "MATX_BUILD_32_BIT" false)
    (cmakeBool "MATX_MULTI_GPU" false) # Requires Nvshmem?
    (cmakeBool "MATX_EN_VISUALIZATION" false) # TODO: Revisit
    (cmakeBool "MATX_EN_CUTLASS" false) # TODO: CUTLASS support is removed in main?
    (cmakeBool "MATX_EN_CUTENSOR" true)
    (cmakeBool "MATX_EN_FILEIO" true)
    (cmakeBool "MATX_EN_NVPL" false) # TODO: Revisit for ARM support
    (cmakeBool "MATX_DISABLE_CUB_CACHE" true)
  ];

  propagatedBuildInputs = pythonDeps;

  enableParallelBuilding = true;

  passthru.updateScript = gitUpdater {
    inherit (finalAttrs) pname version;
    rev-prefix = "v";
  };

  # Attribute paths are spelled out instead of using nested `with lib; ... with maintainers;`, so that a
  # maintainer handle can never shadow `teams`.
  meta = {
    description = "An efficient C++17 GPU numerical computing library with Python-like syntax";
    homepage = "https://nvidia.github.io/MatX";
    broken = cudaOlder "11.4";
    license = lib.licenses.bsd3;
    platforms = [
      "aarch64-linux"
      "x86_64-linux"
    ];
    maintainers = [ lib.maintainers.connorbaker ] ++ lib.teams.cuda.members;
  };
})

0 comments on commit fed9d55

Please sign in to comment.