From 21ac3aa52d3af18e97f584ade3f2854b0223f1bb Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 31 Jul 2024 22:36:26 +0000
Subject: [PATCH] wip

---
 pkgs/development/cuda-modules/aliases.nix     |   2 +-
 .../cuda-modules/packages/README.md           |  29 +++
 .../cuda-modules/packages/backendStdenv.nix   |   8 +-
 .../packages/backendStdenvWithMoldLinker.nix  |  31 +++
 .../cuda-modules/packages/cutlass.nix         | 186 ++++++++++++++++++
 .../cuda-modules/packages/matx.nix            | 118 +++++++++++
 6 files changed, 367 insertions(+), 7 deletions(-)
 create mode 100644 pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix
 create mode 100644 pkgs/development/cuda-modules/packages/cutlass.nix
 create mode 100644 pkgs/development/cuda-modules/packages/matx.nix

diff --git a/pkgs/development/cuda-modules/aliases.nix b/pkgs/development/cuda-modules/aliases.nix
index 998ef385106a45f..6ccb7f2be4dad8d 100644
--- a/pkgs/development/cuda-modules/aliases.nix
+++ b/pkgs/development/cuda-modules/aliases.nix
@@ -1,7 +1,7 @@
 # Packages which have been deprecated or removed from cudaPackages
 final:
 let
-  inherit (final.lib.attrsets) mapAttrs;
+  inherit (builtins) mapAttrs;
   inherit (final.lib.trivial) warn;
 
   mkRenamed =
diff --git a/pkgs/development/cuda-modules/packages/README.md b/pkgs/development/cuda-modules/packages/README.md
index e0ce580de34868b..07912db18f40ff8 100644
--- a/pkgs/development/cuda-modules/packages/README.md
+++ b/pkgs/development/cuda-modules/packages/README.md
@@ -1,3 +1,32 @@
 # packages
 
 Packages which are not created by the manifest builder.
+
+## backendStdenv
+
+We have our own `stdenv` for two reasons:
+
+1. To ensure we use a compatible version of `gcc` with `nvcc`, and
+2. To provide a variant which uses `mold` as the linker (`backendStdenv.withMoldLinker`).
+
+Elaborating on the second point: a non-trivial amount of time is spent *linking* when compiling CUDA code. The `mold` linker is much, *much* faster than the linkers GCC or LLVM use by default. As an example, on my machine with an i9-13900K and 96 GB of DDR5-6800 RAM, here is the total time (in seconds) spent linking while building CUTLASS 3.5.0 targeting `sm_75`, first with the default linker and then with `mold`:
+
+
+
+```console
+$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithDefaultLinker \
+  |& tee /dev/stderr \
+  | awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }'
+...
+8.39248
+```
+
+```console
+$ nix build -L --impure --builders '' .#cudaPackages.cutlass.showLinkTimesWithMoldLinker \
+  |& tee /dev/stderr \
+  | awk '{ if ($2 " " $3 " " $4 == "Elapsed time (seconds):") sum += $5; } END { print sum; }'
+...
+5.93172
+```
+
+TODO: Repeat the measurement with unity builds enabled (`CUTLASS_UNITY_BUILD_ENABLED`).
\ No newline at end of file
diff --git a/pkgs/development/cuda-modules/packages/backendStdenv.nix b/pkgs/development/cuda-modules/packages/backendStdenv.nix
index 7d25241f4dc2d15..fe24dafc4c48f22 100644
--- a/pkgs/development/cuda-modules/packages/backendStdenv.nix
+++ b/pkgs/development/cuda-modules/packages/backendStdenv.nix
@@ -16,13 +16,9 @@
 # Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context
 let
   gccMajorVersion = data.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion;
+  # TODO(@connorbaker): Get numbers for why we should use stdenvAdapters.useMoldLinker.
   cudaStdenv = stdenvAdapters.useLibsFrom stdenv pkgs."gcc${gccMajorVersion}Stdenv";
-  passthruExtra = {
-    # cudaPackages.backendStdenv.nixpkgsCompatibleLibstdcxx has been removed,
-    # if you need it you're likely doing something wrong. There has been a
-    # warning here for a month or so. Now we can no longer return any
-    # meaningful value in its place and drop the attribute entirely.
-  };
+  passthruExtra.withMoldLinker = stdenvAdapters.useMoldLinker cudaStdenv;
   assertCondition = true;
 in
 
diff --git a/pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix b/pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix
new file mode 100644
index 000000000000000..d90314a67114b13
--- /dev/null
+++ b/pkgs/development/cuda-modules/packages/backendStdenvWithMoldLinker.nix
@@ -0,0 +1,31 @@
+{
+  cudaVersion,
+  data,
+  lib,
+  pkgs,
+  stdenv,
+  stdenvAdapters,
+}:
+# Variant of cudaPackages.backendStdenv which additionally uses the mold linker (stdenvAdapters.useMoldLinker).
+# This is what nvcc uses as a backend,
+# and it has to be an officially supported one (e.g. gcc11 for cuda11).
+#
+# It, however, propagates current stdenv's libstdc++ to avoid "GLIBCXX_* not found errors"
+# when linked with other C++ libraries.
+# E.g. for cudaPackages_11_8 we use gcc11 with gcc12's libstdc++
+# Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context
+let
+  gccMajorVersion = data.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion;
+  cudaStdenv = stdenvAdapters.useMoldLinker (stdenvAdapters.useLibsFrom stdenv pkgs."gcc${gccMajorVersion}Stdenv");
+  passthruExtra = {
+    # cudaPackages.backendStdenv.nixpkgsCompatibleLibstdcxx has been removed,
+    # if you need it you're likely doing something wrong. There has been a
+    # warning here for a month or so. Now we can no longer return any
+    # meaningful value in its place and drop the attribute entirely.
+  };
+  assertCondition = true;
+in
+
+# TODO: Consider testing whether we in fact use the newer libstdc++
+
+lib.extendDerivation assertCondition passthruExtra cudaStdenv
diff --git a/pkgs/development/cuda-modules/packages/cutlass.nix b/pkgs/development/cuda-modules/packages/cutlass.nix
new file mode 100644
index 000000000000000..dcc5d285a7ba6a2
--- /dev/null
+++ b/pkgs/development/cuda-modules/packages/cutlass.nix
@@ -0,0 +1,186 @@
+{
+  addDriverRunpath,
+  autoAddDriverRunpath,
+  backendStdenv,
+  cmake,
+  cuda_cudart,
+  cuda_nvcc,
+  cuda_nvrtc,
+  cudaMajorMinorVersion,
+  cudaOlder,
+  cudnn,
+  cutlass,
+  fetchFromGitHub,
+  fetchpatch2,
+  flags,
+  gtest,
+  lib,
+  libcublas,
+  libcurand,
+  ninja,
+  python3,
+  # Options
+  enableF16C ? false,
+  enableTools ? false,
+  # passthru.updateScript
+  gitUpdater,
+}:
+let
+  inherit (lib.lists) optionals;
+  inherit (lib.strings) cmakeBool cmakeFeature optionalString;
+in
+# TODO: This can also be packaged for Python!
+backendStdenv.mkDerivation (finalAttrs: {
+  name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}";
+  pname = "cutlass";
+  version = "3.5.0";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "cutlass";
+    rev = "refs/tags/v${finalAttrs.version}";
+    hash = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
+  };
+
+  strictDeps = true;
+
+  # TODO: As a header-only library, we should make sure we have an `include` directory or similar which is not a
+  # superset of the `out` (`bin`) or `dev` outputs (which is what the multiple-outputs setup hook does by default).
+  outputs = [ "out" ];
+
+  nativeBuildInputs = [
+    autoAddDriverRunpath
+    cuda_nvcc
+    cmake
+    ninja
+    python3
+  ];
+
+  patches =
+    # Enable use of the system googletest.
+    # NOTE: Revisit these when 3.5.1 releases, as they may already be included upstream.
+    [
+      (fetchpatch2 {
+        name = "support-system-googletest.patch";
+        url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/60923063b39c22982e56c304e132e59b358f4814.patch";
+        hash = "sha256-q44SVpC7i91onsBDYKKBob5YqTPmXHRFgImnKpeVKpQ=";
+      })
+      (fetchpatch2 {
+        name = "create-working-directory-for-tests-explicitly.patch";
+        url = "https://github.com/NVIDIA/cutlass/pull/1469/commits/1093c4f90df6083146681e3c39b0c53611978c5f.patch";
+        hash = "sha256-AYJsr+y5ntSWuV7+b/NRRY+fXE5SFN28BdZZqWXoElc=";
+      })
+    ];
+
+  postPatch =
+    # Prepend some commands to the CUDA.cmake file so it can find the CUDA libraries using CMake's FindCUDAToolkit
+    # module. These target names are used throughout the project; I (@connorbaker) did not choose them.
+    ''
+      mv ./CUDA.cmake ./_CUDA_Append.cmake
+      cat > ./_CUDA_Prepend.cmake <<'EOF'
+      find_package(CUDAToolkit REQUIRED)
+      foreach(_target cudart cuda_driver nvrtc)
+        if (NOT TARGET CUDA::''${_target})
+          message(FATAL_ERROR "''${_target} Not Found")
+        endif()
+        message(STATUS "''${_target} library: ''${CUDA_''${_target}_LIBRARY}")
+        add_library(''${_target} ALIAS CUDA::''${_target})
+      endforeach()
+      EOF
+      cat ./_CUDA_Prepend.cmake ./_CUDA_Append.cmake > ./CUDA.cmake
+    '';
+
+  enableParallelBuilding = true;
+
+  buildInputs =
+    [
+      cuda_cudart
+      cuda_nvrtc
+      libcurand
+    ]
+    ++ optionals enableTools [
+      cudnn
+      libcublas
+    ];
+
+  cmakeFlags = [
+    (cmakeFeature "CUTLASS_NVCC_ARCHS" flags.cmakeCudaArchitecturesString)
+    (cmakeBool "CUTLASS_ENABLE_EXAMPLES" false)
+
+    # Tests.
+    (cmakeBool "CUTLASS_ENABLE_TESTS" finalAttrs.doCheck)
+    (cmakeBool "CUTLASS_ENABLE_GTEST_UNIT_TESTS" finalAttrs.doCheck)
+    (cmakeBool "CUTLASS_USE_SYSTEM_GOOGLETEST" true)
+
+    # NOTE: Both CUDNN and CUBLAS can be used by the examples and the profiler. Since they are large dependencies, they
+    #       are disabled by default.
+    (cmakeBool "CUTLASS_ENABLE_TOOLS" enableTools)
+    (cmakeBool "CUTLASS_ENABLE_CUBLAS" enableTools)
+    (cmakeBool "CUTLASS_ENABLE_CUDNN" enableTools)
+
+    # NOTE: Requires x86_64 and hardware support.
+    (cmakeBool "CUTLASS_ENABLE_F16C" enableF16C)
+
+    # TODO: Unity builds are supposed to reduce build time, but this seems to just reduce the number of tasks
+    # generated?
+    # NOTE: Good explanation of unity builds:
+    #       https://www.methodpark.de/blog/how-to-speed-up-clang-tidy-with-unity-builds.
+    (cmakeBool "CUTLASS_UNITY_BUILD_ENABLED" false)
+
+    # NOTE: Can change the size of the executables
+    (cmakeBool "CUTLASS_NVCC_EMBED_CUBIN" true)
+    (cmakeBool "CUTLASS_NVCC_EMBED_PTX" true)
+  ];
+
+  doCheck = false;
+
+  checkInputs = [ gtest ];
+
+  # NOTE: Because the test cases immediately create and try to run the binaries, we don't have an opportunity
+  # to patch them with autoAddDriverRunpath. To get around this, we add the driver runpath to the environment.
+  preCheck = optionalString finalAttrs.doCheck ''
+    export LD_LIBRARY_PATH="$(readlink -mnv "${addDriverRunpath.driverLink}/lib")"
+  '';
+
+  # This is *not* a derivation you want to build on a small machine.
+  requiredSystemFeatures = optionals finalAttrs.doCheck [
+    "big-parallel"
+    "cuda"
+  ];
+
+  passthru =
+    let
+      cutlassShowLinkTimes = cutlass.overrideAttrs (prevAttrs: {
+        # Set RULE_LAUNCH_LINK so CMake adds a message to the linker command to show the link times.
+        postPatch =
+          prevAttrs.postPatch
+          + ''
+            sed -i '1i set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "''${CMAKE_COMMAND} -E time")' ./CUDA.cmake
+          '';
+      });
+    in
+    {
+      updateScript = gitUpdater {
+        inherit (finalAttrs) pname version;
+        rev-prefix = "v";
+      };
+      # TODO: These can be removed.
+      tests.withGpu = cutlass.overrideAttrs { doCheck = true; };
+      showLinkTimesWithDefaultLinker = cutlassShowLinkTimes;
+      showLinkTimesWithMoldLinker = cutlassShowLinkTimes.override {
+        backendStdenv = backendStdenv.withMoldLinker;
+      };
+    };
+
+  meta = with lib; {
+    description = "CUDA Templates for Linear Algebra Subroutines";
+    homepage = "https://github.com/NVIDIA/cutlass";
+    license = licenses.asl20;
+    broken = cudaOlder "11.4";
+    platforms = [
+      "aarch64-linux"
+      "x86_64-linux"
+    ];
+    maintainers = with maintainers; [ connorbaker ] ++ teams.cuda.members;
+  };
+})
diff --git a/pkgs/development/cuda-modules/packages/matx.nix b/pkgs/development/cuda-modules/packages/matx.nix
new file mode 100644
index 000000000000000..5e66dbc4acd550e
--- /dev/null
+++ b/pkgs/development/cuda-modules/packages/matx.nix
@@ -0,0 +1,118 @@
+# NOTE(review): This note mentions NCCL, but this file packages MatX. It says we avoid taking dependencies from the
+# cudaPackages package set directly (to avoid evaluation errors in the case redistributable packages are not
+# available), yet cuda_cccl, cuda_cudart, cuda_nvcc, etc. are taken directly below -- confirm or remove.
+{
+  autoAddDriverRunpath,
+  cudaAtLeast,
+  cmake,
+  ninja,
+  flags,
+  cutlass,
+  cudaMajorMinorVersion,
+  cudaOlder,
+  backendStdenv,
+  cuda_cccl,
+  cuda_cudart,
+  cuda_nvcc,
+  cuquantum,
+  cudatoolkit,
+  libcutensor,
+  fetchFromGitHub,
+  rapids-cmake,
+  lib,
+  python3Packages,
+  which,
+  # passthru.updateScript
+  gitUpdater,
+}:
+let
+  inherit (lib.lists) optionals;
+  inherit (lib.strings) cmakeFeature cmakeBool;
+
+  pythonDeps = with python3Packages; [
+    pybind11
+    numpy
+    cupy
+  ];
+in
+backendStdenv.mkDerivation (finalAttrs: {
+  name = "cuda${cudaMajorMinorVersion}-${finalAttrs.pname}-${finalAttrs.version}";
+  pname = "MatX";
+  version = "0.8.0";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "MatX";
+    rev = "refs/tags/v${finalAttrs.version}";
+    hash = "sha256-7ygdfo27tXz0f6jz6RwDCGwSQun3HqZJ9o6w2vKGM3s=";
+  };
+
+  strictDeps = true;
+
+  nativeBuildInputs = [
+    rapids-cmake
+    which
+    autoAddDriverRunpath
+  ] ++ pythonDeps ++ [ cuda_nvcc ];
+
+  postPatch =
+    # Remove the vendored rapids-cmake
+    ''
+      rm -rf cmake/rapids-cmake
+    ''
+    # Copy required components and update permissions
+    + rapids-cmake.passthru.utilities.copyToCmakeDir
+    # Remove the GetPyBind11.cmake file
+    + ''
+      substituteInPlace CMakeLists.txt \
+        --replace-fail \
+          "include(cmake/GetPyBind11.cmake)" \
+          ""
+    '';
+
+  buildInputs = [
+    cuda_cudart
+    cuda_cccl
+    libcutensor
+    cutlass
+    cuquantum # cutensorNet
+  ];
+
+  # TODO: This should be handled by setup hooks in rapids-cmake.
+  cmakeFlags = rapids-cmake.passthru.data.cmakeFlags ++ [
+    (cmakeBool "MATX_BUILD_EXAMPLES" false)
+    (cmakeBool "MATX_BUILD_TESTS" false)
+    (cmakeBool "MATX_BUILD_BENCHMARKS" false)
+    (cmakeBool "MATX_NVTX_FLAGS" true)
+    (cmakeBool "MATX_BUILD_DOCS" false)
+    (cmakeBool "MATX_BUILD_32_BIT" false)
+    (cmakeBool "MATX_MULTI_GPU" false) # Requires Nvshmem?
+    (cmakeBool "MATX_EN_VISUALIZATION" false) # TODO: Revisit
+    (cmakeBool "MATX_EN_CUTLASS" false) # TODO: CUTLASS support is removed in main?
+    (cmakeBool "MATX_EN_CUTENSOR" true)
+    (cmakeBool "MATX_EN_FILEIO" true)
+    (cmakeBool "MATX_EN_NVPL" false) # TODO: Revisit for ARM support
+    (cmakeBool "MATX_DISABLE_CUB_CACHE" true)
+  ];
+
+  propagatedBuildInputs = pythonDeps;
+
+  enableParallelBuilding = true;
+
+  passthru.updateScript = gitUpdater {
+    inherit (finalAttrs) pname version;
+    rev-prefix = "v";
+  };
+
+  meta = with lib; {
+    description = "An efficient C++17 GPU numerical computing library with Python-like syntax";
+    homepage = "https://nvidia.github.io/MatX";
+    broken = cudaOlder "11.4";
+    license = licenses.bsd3;
+    platforms = [
+      "aarch64-linux"
+      "x86_64-linux"
+    ];
+    maintainers = with maintainers; [ connorbaker ] ++ teams.cuda.members;
+  };
+})