tensorrt-llm 0.12.0.dev2024073000, triton 2.46.0 #52

Open · wants to merge 35 commits into base: main

Commits (35)
d12ae7b
tensorrt-llm: 0.9.0 -> 0.10.0, triton: 2.42.0 -> 2.44.0
yorickvP Jun 12, 2024
c3eb6f3
update cognix
yorickvP Jul 4, 2024
cec6d28
update triton_templates to trtllm-0.10
yorickvP Jul 4, 2024
0626e44
don't tokenize in postprocessing (see #27)
yorickvP Jul 4, 2024
4cd5533
instantiate triton_model_repo with the default config
yorickvP Jul 4, 2024
25cb314
Adjust decoding_mode after testing, omit missing optional params
yorickvP Jul 5, 2024
a19b28b
ignore empty SSEs, remove decoding_mode, just omit topk instead
yorickvP Jul 10, 2024
d86927d
bump pget==0.8.2, cog==0.10.0-alpha16
yorickvP Jul 10, 2024
098f7e7
tensorrt-llm: decrease closure size by cleaning up kernels
yorickvP Jul 11, 2024
cd3ce0e
bump cognix to exclude `train: null`
yorickvP Jul 17, 2024
e5ac906
tensorrt-llm: 0.10.0 -> 0.11.0
yorickvP Jul 19, 2024
57e97ee
update triton_templates
yorickvP Jul 19, 2024
7129024
update triton_model_repo
yorickvP Jul 19, 2024
8a2170f
Add a $TRTLLM_PYTHON environment with pydantic 2
yorickvP Jul 20, 2024
494d476
use pytriton 0.5.8
technillogue Jul 20, 2024
5c3eded
add nvidia-modelopt setuptools dependency
yorickvP Jul 21, 2024
dfd19b4
datasets: 2.14.4 -> 2.20.0
yorickvP Jul 21, 2024
d32b60c
update cognix
yorickvP Jul 24, 2024
a2fc518
tensorrt-llm: 0.11.0 -> 0.12.0.dev2024072300
yorickvP Jul 24, 2024
8cc2174
update triton_model_repo, triton_templates
yorickvP Jul 24, 2024
92000d5
don't add 2 copies of tensorrt-llm in builder
yorickvP Jul 24, 2024
dc78a43
tensorrt-llm: 0.12.0.dev2024072300 -> 0.12.0.dev2024072301
yorickvP Jul 25, 2024
5fb8fb7
github action: limit to 12 cores
yorickvP Jul 25, 2024
6888d8f
remove rogue .gitkeep files
yorickvP Jul 25, 2024
49ccca6
tensorrt-cu12: 10.2.0 -> 10.2.0.post1
yorickvP Jul 26, 2024
73567de
Push builder to triton-builder-h100
yorickvP Aug 2, 2024
d2adf71
fix: work around replicate unique model check
yorickvP Aug 2, 2024
2805da2
build tensorrt-llm from source
yorickvP Jul 31, 2024
d10ce53
trtllm: fix patch location to work in `nix develop`
yorickvP Aug 1, 2024
95e44d8
Add support for building torch & trtllm(python) from source
yorickvP Aug 7, 2024
75c6f70
tensorrt-llm: 0.12.0.dev2024072301 -> 0.12.0.dev2024073000
yorickvP Aug 7, 2024
b25f096
update triton_model_repo
yorickvP Aug 7, 2024
56557e8
Update triton_model_repo python and configs, remove bls
yorickvP Aug 7, 2024
4152b5a
update triton_model_repo and default config
yorickvP Aug 7, 2024
67438c8
tensorrt-llm: always autoPatchelf $out
yorickvP Aug 8, 2024
4 changes: 4 additions & 0 deletions .github/workflows/nix.yml
@@ -11,6 +11,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: DeterminateSystems/nix-installer-action@v10
with:
extra-conf: cores = 12
- name: Authenticate to Google Cloud Platform
uses: google-github-actions/auth@v2
with:
@@ -37,6 +39,8 @@
run: |
nix build --accept-flake-config ".#cog-triton-builder" -o cog-triton-builder
./cog-triton-builder push r8.im/replicate-internal/triton-builder
nix build --accept-flake-config ".#cog-triton-builder-h100" -o cog-triton-builder-h100
./cog-triton-builder-h100 push r8.im/replicate-internal/triton-builder-h100
- name: Build cog-triton-runner-80
env:
COG_TOKEN: ${{ secrets.COG_TOKEN }}
3 changes: 3 additions & 0 deletions cog-trt-llm/trt_llm_builder.py
@@ -111,6 +111,9 @@ def _assemble_subprocess_cmd(self, executable, args, script=None):
elif executable == "trtllm-build":
cmd = [executable]

if "TRTLLM_PYTHON" in os.environ:
cmd[0] = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", cmd[0])

for k, v in args.items():
cmd += ["--" + str(k)]
cmd += [str(v)] if v else []
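For context, a minimal runnable sketch of what this prefixing does. The environment value and the argument dict below are hypothetical; only the `TRTLLM_PYTHON` handling mirrors the hunk above:

```python
import os

def assemble_cmd(executable, args):
    # Mirrors the hunk above: when TRTLLM_PYTHON points at the pydantic-2
    # environment, resolve the executable from that env's bin/ directory.
    cmd = [executable]
    if "TRTLLM_PYTHON" in os.environ:
        cmd[0] = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", cmd[0])
    for k, v in args.items():
        cmd += ["--" + str(k)]
        cmd += [str(v)] if v else []
    return cmd

# Hypothetical invocation for the demo; a real value would be a Nix store path.
os.environ["TRTLLM_PYTHON"] = "/opt/trtllm-env"
print(assemble_cmd("trtllm-build", {"checkpoint_dir": "/weights", "gemm_plugin": "float16"}))
# ['/opt/trtllm-env/bin/trtllm-build', '--checkpoint_dir', '/weights', '--gemm_plugin', 'float16']
```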
3 changes: 2 additions & 1 deletion configs/example_official_model_config.yaml
@@ -28,12 +28,13 @@ instantiate:
max_queue_delay_microseconds: 100
max_attention_window_size: 4096
kv_cache_free_gpu_mem_fraction: 0.95
max_queue_size: 0


postprocessing:
args:
tokenizer_dir: /src/triton_model_repo/tensorrt_llm/1/
tokenizer_type: llama
tokenizer_type: auto
triton_max_batch_size: 64
postprocessing_instance_count: 64

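The switch from `llama` to `auto` stops hard-coding a tokenizer family. A rough sketch of the difference, assuming the standard `transformers` API; the actual selection logic lives in the postprocessing model, not in this config:

```python
from transformers import AutoTokenizer, LlamaTokenizer

tokenizer_dir = "/src/triton_model_repo/tensorrt_llm/1/"  # from the config above

# tokenizer_type: llama -- hard-wired to one tokenizer class:
# tok = LlamaTokenizer.from_pretrained(tokenizer_dir)

# tokenizer_type: auto -- let transformers pick the right class from the
# tokenizer files shipped alongside the engine, so non-Llama models work too.
tok = AutoTokenizer.from_pretrained(tokenizer_dir)
print(type(tok).__name__)
```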
167 changes: 107 additions & 60 deletions default.nix
@@ -2,81 +2,122 @@
let
deps = config.deps;
python3 = config.python-env.deps.python;
cudaPackages = pkgs.cudaPackages_12_1;
inherit (config.cognix) cudaPackages;
site = python3.sitePackages;
pythonDrvs = config.python-env.pip.drvs;
inherit (pkgs) lib;
cfg = config.cog-triton; # defined in interface.nix
trtllm-env = config.python-env.public.extendModules {
modules = [{
_file = ./.;
pip.rootDependencies = lib.mkOverride 49 { tensorrt-llm = true; hf-transfer = true; };
pip.drvs.pydantic = let mkMoreForce = lib.mkOverride 49; in {
version = mkMoreForce "2.8.2";
mkDerivation.src = mkMoreForce (pkgs.fetchurl {
sha256 = "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8";
url = "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl";
});
};
}];
};
trtllm-pythonDrvs = trtllm-env.config.pip.drvs;
toCudaCapability = cmakeArch: {
"70-real" = "7.0";
"80-real" = "8.0";
"86-real" = "8.6";
"89-real" = "8.9";
"90-real" = "9.0";
}.${cmakeArch};
in
{
imports = [ ./interface.nix ];
cog.build = {
python_version = "3.10";
cog_version = "0.10.0-alpha16";
cog_version = "0.10.0-alpha18";
cuda = "12.1"; # todo: 12.2
gpu = true;
# inspiration: echo tensorrt_llm==0.8.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
# inspiration: echo tensorrt_llm==0.10.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
python_packages = [
"--extra-index-url"
"https://pypi.nvidia.com"
"tensorrt_llm==0.9.0"
"torch==2.2.2"
"tensorrt==9.3.0.post12.dev1"
"tensorrt-bindings==9.3.0.post12.dev1"
"tensorrt-libs==9.3.0.post12.dev1"
"nvidia-pytriton==0.5.2" # corresponds to 2.42.0
"httpx"
"nvidia-cublas-cu12<12.2"
"nvidia-cuda-nvrtc-cu12<12.2"
"nvidia-cuda-runtime-cu12<12.2"
"tensorrt_llm==0.12.0.dev2024073000"
"tensorrt-cu12==10.2.0.post1"
"torch==2.3.1"
"nvidia-pytriton==0.5.8" # corresponds to 2.46.0
"omegaconf"
"hf-transfer"
"tokenizers"
"tokenizers>=0.19.0"
];
# don't ask why it needs ssh
system_packages = [ "pget" "openssh" "openmpi" ];
};
# patch in cuda packages from nixpkgs
cognix.merge-native = {
cudnn = "force";
cublas = true;
};
python-env.pip = {
uv.enable = true;
# todo: add some constraints to match cudaPackages
constraintsList = [
"nvidia-cudnn-cu12<9"
"datasets>2.15.0" # picks older fsspec but newer datasets
"mpi4py<4" # recent release with breaking changes
];
# HACK: cog requires pydantic <2, but we do need the extra deps pydantic2 brings in
overridesList = [
"tokenizers==0.19.0"
"transformers==4.40.0"
"pydantic>=2.0"
];
drvs.pydantic = {
version = lib.mkForce "1.10.17";
mkDerivation.src = pkgs.fetchurl {
sha256 ="371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e";
url = "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
};
};
};
cognix.includeNix = true;
cognix.nix.extraOptions = ''
extra-trusted-public-keys = replicate-1:rbU0MI8kgUmqLINtKfXoDkrl9NxXQMw6//+LHHDYflk=
extra-substituters = https://storage.googleapis.com/replicate-nix-cache-dev/
'';
python-env.pip.drvs = {

torch.public = lib.mkIf cfg.torchSourceBuild
(lib.mkForce config.deps.minimal-torch);
tensorrt-llm.public = lib.mkIf cfg.trtllmSourceBuild
(lib.mkForce config.deps.tensorrt-llm.override {
withPython = true;
});

nvidia-modelopt.mkDerivation.propagatedBuildInputs = [
pythonDrvs.setuptools.public
];
# tensorrt likes doing a pip invocation from its setup.py
# circumvent by manually depending on tensorrt_libs, tensorrt_bindings
# and setting this env variable
tensorrt.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
# TODO remove upon next rebuild:
tensorrt.mkDerivation.propagatedBuildInputs = with pythonDrvs; [
tensorrt-libs.public
tensorrt-bindings.public
tensorrt-cu12.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
tensorrt-cu12.mkDerivation.buildInputs = [ python3.pkgs.pip ];
tensorrt-cu12-bindings.mkDerivation.propagatedBuildInputs = [
pythonDrvs.tensorrt-cu12-libs.public
];
tensorrt-bindings.mkDerivation.propagatedBuildInputs = [ pythonDrvs.tensorrt-libs.public ];
tensorrt-libs.mkDerivation.postFixup = ''
# fixes tensorrt-llm build
tensorrt-cu12-libs.mkDerivation.postFixup = ''
pushd $out/${site}/tensorrt_libs
ln -s libnvinfer.so.9 libnvinfer.so
ln -s libnvonnxparser.so.9 libnvonnxparser.so
ln -s libnvinfer.so.10 libnvinfer.so
ln -s libnvonnxparser.so.10 libnvonnxparser.so
popd
'';
tensorrt-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
tensorrt-cu12-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
tensorrt-llm = {
mkDerivation.buildInputs = [ cudaPackages.nccl ];
mkDerivation.propagatedBuildInputs = with pythonDrvs; [
tensorrt-libs.public # libnvinfer, onnxparse
tensorrt-cu12-libs.public # libnvinfer, onnxparse
];
env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
env.autoPatchelfIgnoreMissingDeps = ["libcuda.so.1"];
env.autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ];
mkDerivation.postInstall = ''
pushd $out/${site}/tensorrt_llm/bin
patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so{.10,} executorWorker
popd
'';
};
# has some binaries that want cudart
tritonclient.mkDerivation.postInstall = "rm -r $out/bin";
@@ -96,23 +137,10 @@ in
done
popd
'';
# patch in cuda packages from nixpkgs
nvidia-cublas-cu12.mkDerivation.postInstall = ''
pushd $out/${python3.sitePackages}/nvidia/cublas/lib
for f in ./*.so.12; do
chmod +w "$f"
rm $f
ln -s ${cudaPackages.libcublas.lib}/lib/$f ./$f
done
popd
'';
nvidia-cudnn-cu12.mkDerivation.postInstall = ''
pushd $out/${python3.sitePackages}/nvidia/cudnn/lib
for f in ./*.so.8; do
chmod +w "$f"
rm $f
ln -s ${cudaPackages.cudnn.lib}/lib/$f ./$f
done
mpi4py.mkDerivation.nativeBuildInputs = [ pkgs.removeReferencesTo ];
mpi4py.mkDerivation.postInstall = ''
pushd $out/${site}/mpi4py
remove-references-to -t ${pkgs.openmpi.dev} mpi.cfg MPI.*.so
popd
'';
};
@@ -131,27 +159,46 @@ in
deps.tensorrt-src = pkgs.fetchFromGitHub {
owner = "NVIDIA";
repo = "TensorRT";
rev = "6d1397ed4bb65933d02725623c122a157544a729"; # release/9.3 branch
hash = "sha256-XWFyMD7jjvgIihlqCJNyH5iSa1vZCDhv1maLJqMM3UE=";
rev = "v10.2.0";
hash = "sha256-Euo9VD4VTpx8XJV97IMETTAx/YkPGXiNdA39Wjp3UMU=";
};
# todo: replace with lockfile
deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec {
pname = "pybind11-stubgen";
version = "2.5";
src = pkgs.fetchPypi {
inherit pname version;
hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ=";
# make a python3 environment with all the pkgs from lock.json *and* nixpkgs.python
# mainly used to build torch, which additionally requires astunparse
deps.python3-with-nixpkgs = python3.override {
packageOverrides = pyself: pysuper: (lib.mapAttrs (_: v: v.public.out) trtllm-pythonDrvs) // {
# todo: replace with lockfile?
pybind11-stubgen = pyself.buildPythonPackage rec {
pname = "pybind11-stubgen";
version = "2.5";
src = pyself.fetchPypi {
inherit pname version;
hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ=";
};
};
# prevent infinite loop, don't override torch itself
inherit (pysuper) torch;
};
};
deps.tensorrt-llm = pkgs.callPackage ./nix/tensorrt-llm.nix {
inherit python3 cudaPackages pythonDrvs;
# TODO: turn into config option
inherit python3 cudaPackages;
pythonDrvs = config.deps.trtllm-env.config.pip.drvs;
withPython = false;
inherit (cfg) architectures;
inherit (deps) pybind11-stubgen tensorrt-src;
inherit (deps.python3-with-nixpkgs.pkgs) pybind11-stubgen;
inherit (deps) tensorrt-src;
};
deps.trtllm-env = trtllm-env;
deps.trtllm-backend = pkgs.callPackage ./nix/trtllm-backend.nix {
inherit python3 cudaPackages pythonDrvs;
inherit (deps) tensorrt-llm tensorrt-src;
};
deps.minimal-torch = pkgs.callPackage ./nix/torch.nix {
python3 = deps.python3-with-nixpkgs;
# todo: match/modify config.cognix.cudaPackages
cudaPackages = (pkgs.extend (self: super: {
config = super.config // {
cudaCapabilities = map toCudaCapability cfg.architectures;
};
})).cudaPackages_12_1;
};
}
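default.nix now pins pydantic 1.10.17 in the main env while `trtllm-env` force-overrides it to 2.8.2. A small sanity-check sketch, assuming `TRTLLM_PYTHON` is exported as in the flake.nix hunk below; the expected versions come from this diff:

```python
import os
import subprocess

import pydantic

# The cog runner process itself should see the pinned pydantic 1.x ...
print("runner pydantic:", pydantic.VERSION)  # expected: 1.10.17

# ... while the interpreter in $TRTLLM_PYTHON carries pydantic 2.x for
# tensorrt-llm. The path is set via cognix.environment.TRTLLM_PYTHON.
trtllm_python = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", "python3")
out = subprocess.run(
    [trtllm_python, "-c", "import pydantic; print(pydantic.VERSION)"],
    capture_output=True, text=True, check=True,
)
print("trtllm env pydantic:", out.stdout.strip())  # expected: 2.8.2
```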
24 changes: 12 additions & 12 deletions flake.lock

Some generated files are not rendered by default.

20 changes: 16 additions & 4 deletions flake.nix
@@ -4,7 +4,7 @@
extra-substituters = "https://storage.googleapis.com/replicate-nix-cache-dev/";
};
inputs = {
cognix.url = "github:datakami/cognix/24.03";
cognix.url = "github:datakami/cognix/24.07";
};

outputs = { self, cognix }@inputs: (cognix.lib.cognixFlake inputs {}) // {
@@ -26,23 +26,35 @@

cog-triton.architectures = architectures;
# don't need this file in a runner
python-env.pip.drvs.tensorrt-libs.mkDerivation.postInstall = lib.mkAfter ''
python-env.pip.drvs.tensorrt-cu12-libs.mkDerivation.postInstall = lib.mkAfter ''
rm $out/lib/python*/site-packages/tensorrt_libs/libnvinfer_builder_resource*
'';
});
makeBuilder = name: callCognix ( { config, lib, ... }: {
makeBuilder = name: callCognix ( { config, lib, pkgs, ... }: {
inherit name;
# only grab deps of tensorrt-llm, omegaconf, hf-transfer
cognix.python_root_packages = [ "tensorrt-llm" "omegaconf" "hf-transfer" ];
cognix.python_root_packages = [ "omegaconf" "hf-transfer" "transformers" "torch" ];

cog-triton.architectures = [ "80-real" "86-real" "90-real" ];

# override cog.yaml:
cog.concurrency.max = lib.mkForce 1;
cognix.rootPath = lib.mkForce "${./cog-trt-llm}";
# this just needs the examples/ dir
cognix.environment.TRTLLM_DIR = config.deps.tensorrt-llm.examples;
# HACK: cog needs pydantic v1, but trt-llm needs pydantic v2
cognix.environment.TRTLLM_PYTHON = config.deps.trtllm-env.config.public.pyEnv;
});
in {
cog-triton-builder = makeBuilder "cog-triton-builder";
# we want to push the model to triton-builder-h100 as well
# as cog-triton-builder, but replicate doesn't let us.
# so let's add some data to fool it
cog-triton-builder-h100 = ((makeBuilder "cog-triton-builder-h100").extendModules {
modules = [{
cognix.environment.TRTLLM_BUILDER_VARIANT = "h100";
}];
}).config.public;
cog-triton-runner-80 = makeRunner "cog-triton-runner-80" ["80-real"] {};
cog-triton-runner-86 = makeRunner "cog-triton-runner-86" ["86-real"] {};
cog-triton-runner-90 = makeRunner "cog-triton-runner-90" ["90-real"] {};
2 changes: 2 additions & 0 deletions interface.nix
@@ -9,5 +9,7 @@
# https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
# 80: A100, 86: A5000, A40, A800, 89: L40, 90: H100
};
torchSourceBuild = mkEnableOption "Build Torch from source to be smaller";
trtllmSourceBuild = mkEnableOption "Build trtllm python from source to be smaller";
};
}