tensorrt-llm 0.12.0.dev2024073000, triton 2.46.0 #52

Open · wants to merge 35 commits into base: main

Commits (35)
d12ae7b
tensorrt-llm: 0.9.0 -> 0.10.0, triton: 2.42.0 -> 2.44.0
yorickvP Jun 12, 2024
c3eb6f3
update cognix
yorickvP Jul 4, 2024
cec6d28
update triton_templates to trtllm-0.10
yorickvP Jul 4, 2024
0626e44
don't tokenize in postprocessing (see #27)
yorickvP Jul 4, 2024
4cd5533
instantiate triton_model_repo with the default config
yorickvP Jul 4, 2024
25cb314
Adjust decoding_mode after testing, omit missing optional params
yorickvP Jul 5, 2024
a19b28b
ignore empty SSEs, remove decoding_mode, just omit topk instead
yorickvP Jul 10, 2024
d86927d
bump pget==0.8.2, cog==0.10.0-alpha16
yorickvP Jul 10, 2024
098f7e7
tensorrt-llm: decrease closure size by cleaning up kernels
yorickvP Jul 11, 2024
cd3ce0e
bump cognix to exclude `train: null`
yorickvP Jul 17, 2024
e5ac906
tensorrt-llm: 0.10.0 -> 0.11.0
yorickvP Jul 19, 2024
57e97ee
update triton_templates
yorickvP Jul 19, 2024
7129024
update triton_model_repo
yorickvP Jul 19, 2024
8a2170f
Add a $TRTLLM_PYTHON environment with pydantic 2
yorickvP Jul 20, 2024
494d476
use pytriton 0.5.8
technillogue Jul 20, 2024
5c3eded
add nvidia-modelopt setuptools dependency
yorickvP Jul 21, 2024
dfd19b4
datasets: 2.14.4 -> 2.20.0
yorickvP Jul 21, 2024
d32b60c
update cognix
yorickvP Jul 24, 2024
a2fc518
tensorrt-llm: 0.11.0 -> 0.12.0.dev2024072300
yorickvP Jul 24, 2024
8cc2174
update triton_model_repo, triton_templates
yorickvP Jul 24, 2024
92000d5
don't add 2 copies of tensorrt-llm in builder
yorickvP Jul 24, 2024
dc78a43
tensorrt-llm: 0.12.0.dev2024072300 -> 0.12.0.dev2024072301
yorickvP Jul 25, 2024
5fb8fb7
github action: limit to 12 cores
yorickvP Jul 25, 2024
6888d8f
remove rogue .gitkeep files
yorickvP Jul 25, 2024
49ccca6
tensorrt-cu12: 10.2.0 -> 10.2.0.post1
yorickvP Jul 26, 2024
73567de
Push builder to triton-builder-h100
yorickvP Aug 2, 2024
d2adf71
fix: work around replicate unique model check
yorickvP Aug 2, 2024
2805da2
build tensorrt-llm from source
yorickvP Jul 31, 2024
d10ce53
trtllm: fix patch location to work in `nix develop`
yorickvP Aug 1, 2024
95e44d8
Add support for building torch & trtllm(python) from source
yorickvP Aug 7, 2024
75c6f70
tensorrt-llm: 0.12.0.dev2024072301 -> 0.12.0.dev2024073000
yorickvP Aug 7, 2024
b25f096
update triton_model_repo
yorickvP Aug 7, 2024
56557e8
Update triton_model_repo python and configs, remove bls
yorickvP Aug 7, 2024
4152b5a
update triton_model_repo and default config
yorickvP Aug 7, 2024
67438c8
tensorrt-llm: always autoPatchelf $out
yorickvP Aug 8, 2024
4 changes: 4 additions & 0 deletions .github/workflows/nix.yml
@@ -11,6 +11,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: DeterminateSystems/nix-installer-action@v10
with:
extra-conf: cores = 12
- name: Authenticate to Google Cloud Platform
uses: google-github-actions/auth@v2
with:
@@ -37,6 +39,8 @@
run: |
nix build --accept-flake-config ".#cog-triton-builder" -o cog-triton-builder
./cog-triton-builder push r8.im/replicate-internal/triton-builder
nix build --accept-flake-config ".#cog-triton-builder-h100" -o cog-triton-builder-h100
./cog-triton-builder-h100 push r8.im/replicate-internal/triton-builder-h100
- name: Build cog-triton-runner-80
env:
COG_TOKEN: ${{ secrets.COG_TOKEN }}
3 changes: 3 additions & 0 deletions cog-trt-llm/trt_llm_builder.py
@@ -111,6 +111,9 @@ def _assemble_subprocess_cmd(self, executable, args, script=None):
elif executable == "trtllm-build":
cmd = [executable]

if "TRTLLM_PYTHON" in os.environ:
cmd[0] = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", cmd[0])

for k, v in args.items():
cmd += ["--" + str(k)]
cmd += [str(v)] if v else []
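For context, a minimal runnable sketch of what this prefixing does. The environment value and the argument dict below are hypothetical; only the `TRTLLM_PYTHON` handling mirrors the hunk above:

```python
import os

def assemble_cmd(executable, args):
    # Mirrors the hunk above: when TRTLLM_PYTHON points at the pydantic-2
    # environment, resolve the executable from that env's bin/ directory.
    cmd = [executable]
    if "TRTLLM_PYTHON" in os.environ:
        cmd[0] = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", cmd[0])
    for k, v in args.items():
        cmd += ["--" + str(k)]
        cmd += [str(v)] if v else []
    return cmd

# Hypothetical invocation for the demo; a real value would be a Nix store path.
os.environ["TRTLLM_PYTHON"] = "/opt/trtllm-env"
print(assemble_cmd("trtllm-build", {"checkpoint_dir": "/weights", "gemm_plugin": "float16"}))
# ['/opt/trtllm-env/bin/trtllm-build', '--checkpoint_dir', '/weights', '--gemm_plugin', 'float16']
```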
3 changes: 2 additions & 1 deletion configs/example_official_model_config.yaml
@@ -28,12 +28,13 @@ instantiate:
max_queue_delay_microseconds: 100
max_attention_window_size: 4096
kv_cache_free_gpu_mem_fraction: 0.95
max_queue_size: 0


postprocessing:
args:
tokenizer_dir: /src/triton_model_repo/tensorrt_llm/1/
tokenizer_type: llama
tokenizer_type: auto
triton_max_batch_size: 64
postprocessing_instance_count: 64

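The switch from `llama` to `auto` stops hard-coding a tokenizer family. A rough sketch of the difference, assuming the standard `transformers` API; the actual selection logic lives in the postprocessing model, not in this config:

```python
from transformers import AutoTokenizer, LlamaTokenizer

tokenizer_dir = "/src/triton_model_repo/tensorrt_llm/1/"  # from the config above

# tokenizer_type: llama -- hard-wired to one tokenizer class:
# tok = LlamaTokenizer.from_pretrained(tokenizer_dir)

# tokenizer_type: auto -- let transformers pick the right class from the
# tokenizer files shipped alongside the engine, so non-Llama models work too.
tok = AutoTokenizer.from_pretrained(tokenizer_dir)
print(type(tok).__name__)
```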
167 changes: 107 additions & 60 deletions default.nix
@@ -2,81 +2,122 @@
let
deps = config.deps;
python3 = config.python-env.deps.python;
cudaPackages = pkgs.cudaPackages_12_1;
inherit (config.cognix) cudaPackages;
site = python3.sitePackages;
pythonDrvs = config.python-env.pip.drvs;
inherit (pkgs) lib;
cfg = config.cog-triton; # defined in interface.nix
trtllm-env = config.python-env.public.extendModules {
modules = [{
_file = ./.;
pip.rootDependencies = lib.mkOverride 49 { tensorrt-llm = true; hf-transfer = true; };
pip.drvs.pydantic = let mkMoreForce = lib.mkOverride 49; in {
version = mkMoreForce "2.8.2";
mkDerivation.src = mkMoreForce (pkgs.fetchurl {
sha256 = "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8";
url = "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl";
});
};
}];
};
trtllm-pythonDrvs = trtllm-env.config.pip.drvs;
toCudaCapability = cmakeArch: {
"70-real" = "7.0";
"80-real" = "8.0";
"86-real" = "8.6";
"89-real" = "8.9";
"90-real" = "9.0";
}.${cmakeArch};
in
{
imports = [ ./interface.nix ];
cog.build = {
python_version = "3.10";
cog_version = "0.10.0-alpha16";
cog_version = "0.10.0-alpha18";
cuda = "12.1"; # todo: 12.2
gpu = true;
# inspiration: echo tensorrt_llm==0.8.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
# inspiration: echo tensorrt_llm==0.10.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
python_packages = [
"--extra-index-url"
"https://pypi.nvidia.com"
"tensorrt_llm==0.9.0"
"torch==2.2.2"
"tensorrt==9.3.0.post12.dev1"
"tensorrt-bindings==9.3.0.post12.dev1"
"tensorrt-libs==9.3.0.post12.dev1"
"nvidia-pytriton==0.5.2" # corresponds to 2.42.0
"httpx"
"nvidia-cublas-cu12<12.2"
"nvidia-cuda-nvrtc-cu12<12.2"
"nvidia-cuda-runtime-cu12<12.2"
"tensorrt_llm==0.12.0.dev2024073000"
"tensorrt-cu12==10.2.0.post1"
"torch==2.3.1"
"nvidia-pytriton==0.5.8" # corresponds to 2.46.0
"omegaconf"
"hf-transfer"
"tokenizers"
"tokenizers>=0.19.0"
];
# don't ask why it needs ssh
system_packages = [ "pget" "openssh" "openmpi" ];
};
# patch in cuda packages from nixpkgs
cognix.merge-native = {
cudnn = "force";
cublas = true;
};
python-env.pip = {
uv.enable = true;
# todo: add some constraints to match cudaPackages
constraintsList = [
"nvidia-cudnn-cu12<9"
"datasets>2.15.0" # picks older fsspec but newer datasets
"mpi4py<4" # recent release with breaking changes
];
# HACK: cog requires pydantic <2, but we do need the extra deps pydantic2 brings in
overridesList = [
"tokenizers==0.19.0"
"transformers==4.40.0"
"pydantic>=2.0"
];
drvs.pydantic = {
version = lib.mkForce "1.10.17";
mkDerivation.src = pkgs.fetchurl {
sha256 ="371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e";
url = "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
};
};
};
cognix.includeNix = true;
cognix.nix.extraOptions = ''
extra-trusted-public-keys = replicate-1:rbU0MI8kgUmqLINtKfXoDkrl9NxXQMw6//+LHHDYflk=
extra-substituters = https://storage.googleapis.com/replicate-nix-cache-dev/
'';
python-env.pip.drvs = {

torch.public = lib.mkIf cfg.torchSourceBuild
(lib.mkForce config.deps.minimal-torch);
tensorrt-llm.public = lib.mkIf cfg.trtllmSourceBuild
(lib.mkForce config.deps.tensorrt-llm.override {
withPython = true;
});

nvidia-modelopt.mkDerivation.propagatedBuildInputs = [
pythonDrvs.setuptools.public
];
# tensorrt likes doing a pip invocation from its setup.py
# circumvent by manually depending on tensorrt_libs, tensorrt_bindings
# and setting this env variable
tensorrt.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
# TODO remove upon next rebuild:
tensorrt.mkDerivation.propagatedBuildInputs = with pythonDrvs; [
tensorrt-libs.public
tensorrt-bindings.public
tensorrt-cu12.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
tensorrt-cu12.mkDerivation.buildInputs = [ python3.pkgs.pip ];
tensorrt-cu12-bindings.mkDerivation.propagatedBuildInputs = [
pythonDrvs.tensorrt-cu12-libs.public
];
tensorrt-bindings.mkDerivation.propagatedBuildInputs = [ pythonDrvs.tensorrt-libs.public ];
tensorrt-libs.mkDerivation.postFixup = ''
# fixes tensorrt-llm build
tensorrt-cu12-libs.mkDerivation.postFixup = ''
pushd $out/${site}/tensorrt_libs
ln -s libnvinfer.so.9 libnvinfer.so
ln -s libnvonnxparser.so.9 libnvonnxparser.so
ln -s libnvinfer.so.10 libnvinfer.so
ln -s libnvonnxparser.so.10 libnvonnxparser.so
popd
'';
tensorrt-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
tensorrt-cu12-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
tensorrt-llm = {
mkDerivation.buildInputs = [ cudaPackages.nccl ];
mkDerivation.propagatedBuildInputs = with pythonDrvs; [
tensorrt-libs.public # libnvinfer, onnxparse
tensorrt-cu12-libs.public # libnvinfer, onnxparse
];
env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
env.autoPatchelfIgnoreMissingDeps = ["libcuda.so.1"];
env.autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ];
mkDerivation.postInstall = ''
pushd $out/${site}/tensorrt_llm/bin
patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so{.10,} executorWorker
popd
'';
};
# has some binaries that want cudart
tritonclient.mkDerivation.postInstall = "rm -r $out/bin";
@@ -96,23 +137,10 @@ in
done
popd
'';
# patch in cuda packages from nixpkgs
nvidia-cublas-cu12.mkDerivation.postInstall = ''
pushd $out/${python3.sitePackages}/nvidia/cublas/lib
for f in ./*.so.12; do
chmod +w "$f"
rm $f
ln -s ${cudaPackages.libcublas.lib}/lib/$f ./$f
done
popd
'';
nvidia-cudnn-cu12.mkDerivation.postInstall = ''
pushd $out/${python3.sitePackages}/nvidia/cudnn/lib
for f in ./*.so.8; do
chmod +w "$f"
rm $f
ln -s ${cudaPackages.cudnn.lib}/lib/$f ./$f
done
mpi4py.mkDerivation.nativeBuildInputs = [ pkgs.removeReferencesTo ];
mpi4py.mkDerivation.postInstall = ''
pushd $out/${site}/mpi4py
remove-references-to -t ${pkgs.openmpi.dev} mpi.cfg MPI.*.so
popd
'';
};
@@ -131,27 +159,46 @@ in
deps.tensorrt-src = pkgs.fetchFromGitHub {
owner = "NVIDIA";
repo = "TensorRT";
rev = "6d1397ed4bb65933d02725623c122a157544a729"; # release/9.3 branch
hash = "sha256-XWFyMD7jjvgIihlqCJNyH5iSa1vZCDhv1maLJqMM3UE=";
rev = "v10.2.0";
hash = "sha256-Euo9VD4VTpx8XJV97IMETTAx/YkPGXiNdA39Wjp3UMU=";
};
# todo: replace with lockfile
deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec {
pname = "pybind11-stubgen";
version = "2.5";
src = pkgs.fetchPypi {
inherit pname version;
hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ=";
# make a python3 environment with all the pkgs from lock.json *and* nixpkgs.python
# mainly used to build torch, which additionally requires astunparse
deps.python3-with-nixpkgs = python3.override {
packageOverrides = pyself: pysuper: (lib.mapAttrs (_: v: v.public.out) trtllm-pythonDrvs) // {
# todo: replace with lockfile?
pybind11-stubgen = pyself.buildPythonPackage rec {
pname = "pybind11-stubgen";
version = "2.5";
src = pyself.fetchPypi {
inherit pname version;
hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ=";
};
};
# prevent infinite loop, don't override torch itself
inherit (pysuper) torch;
};
};
deps.tensorrt-llm = pkgs.callPackage ./nix/tensorrt-llm.nix {
inherit python3 cudaPackages pythonDrvs;
# TODO: turn into config option
inherit python3 cudaPackages;
pythonDrvs = config.deps.trtllm-env.config.pip.drvs;
withPython = false;
inherit (cfg) architectures;
inherit (deps) pybind11-stubgen tensorrt-src;
inherit (deps.python3-with-nixpkgs.pkgs) pybind11-stubgen;
inherit (deps) tensorrt-src;
};
deps.trtllm-env = trtllm-env;
deps.trtllm-backend = pkgs.callPackage ./nix/trtllm-backend.nix {
inherit python3 cudaPackages pythonDrvs;
inherit (deps) tensorrt-llm tensorrt-src;
};
deps.minimal-torch = pkgs.callPackage ./nix/torch.nix {
python3 = deps.python3-with-nixpkgs;
# todo: match/modify config.cognix.cudaPackages
cudaPackages = (pkgs.extend (self: super: {
config = super.config // {
cudaCapabilities = map toCudaCapability cfg.architectures;
};
})).cudaPackages_12_1;
};
}
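default.nix now pins pydantic 1.10.17 in the main env while `trtllm-env` force-overrides it to 2.8.2. A small sanity-check sketch, assuming `TRTLLM_PYTHON` is exported as in the flake.nix hunk below; the expected versions come from this diff:

```python
import os
import subprocess

import pydantic

# The cog runner process itself should see the pinned pydantic 1.x ...
print("runner pydantic:", pydantic.VERSION)  # expected: 1.10.17

# ... while the interpreter in $TRTLLM_PYTHON carries pydantic 2.x for
# tensorrt-llm. The path is set via cognix.environment.TRTLLM_PYTHON.
trtllm_python = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", "python3")
out = subprocess.run(
    [trtllm_python, "-c", "import pydantic; print(pydantic.VERSION)"],
    capture_output=True, text=True, check=True,
)
print("trtllm env pydantic:", out.stdout.strip())  # expected: 2.8.2
```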
24 changes: 12 additions & 12 deletions flake.lock

Some generated files are not rendered by default.

20 changes: 16 additions & 4 deletions flake.nix
@@ -4,7 +4,7 @@
extra-substituters = "https://storage.googleapis.com/replicate-nix-cache-dev/";
};
inputs = {
cognix.url = "github:datakami/cognix/24.03";
cognix.url = "github:datakami/cognix/24.07";
};

outputs = { self, cognix }@inputs: (cognix.lib.cognixFlake inputs {}) // {
@@ -26,23 +26,35 @@

cog-triton.architectures = architectures;
# don't need this file in a runner
python-env.pip.drvs.tensorrt-libs.mkDerivation.postInstall = lib.mkAfter ''
python-env.pip.drvs.tensorrt-cu12-libs.mkDerivation.postInstall = lib.mkAfter ''
rm $out/lib/python*/site-packages/tensorrt_libs/libnvinfer_builder_resource*
'';
});
makeBuilder = name: callCognix ( { config, lib, ... }: {
makeBuilder = name: callCognix ( { config, lib, pkgs, ... }: {
inherit name;
# only grab deps of tensorrt-llm, omegaconf, hf-transfer
cognix.python_root_packages = [ "tensorrt-llm" "omegaconf" "hf-transfer" ];
cognix.python_root_packages = [ "omegaconf" "hf-transfer" "transformers" "torch" ];

cog-triton.architectures = [ "80-real" "86-real" "90-real" ];

# override cog.yaml:
cog.concurrency.max = lib.mkForce 1;
cognix.rootPath = lib.mkForce "${./cog-trt-llm}";
# this just needs the examples/ dir
cognix.environment.TRTLLM_DIR = config.deps.tensorrt-llm.examples;
# HACK: cog needs pydantic v1, but trt-llm needs pydantic v2
cognix.environment.TRTLLM_PYTHON = config.deps.trtllm-env.config.public.pyEnv;
});
in {
cog-triton-builder = makeBuilder "cog-triton-builder";
# we want to push the model to triton-builder-h100 as well
# as cog-triton-builder, but replicate doesn't let us.
# so let's add some data to fool it
cog-triton-builder-h100 = ((makeBuilder "cog-triton-builder-h100").extendModules {
modules = [{
cognix.environment.TRTLLM_BUILDER_VARIANT = "h100";
}];
}).config.public;
cog-triton-runner-80 = makeRunner "cog-triton-runner-80" ["80-real"] {};
cog-triton-runner-86 = makeRunner "cog-triton-runner-86" ["86-real"] {};
cog-triton-runner-90 = makeRunner "cog-triton-runner-90" ["90-real"] {};
2 changes: 2 additions & 0 deletions interface.nix
@@ -9,5 +9,7 @@
# https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
# 80: A100, 86: A5000, A40, A800, 89: L40, 90: H100
};
torchSourceBuild = mkEnableOption "Build Torch from source to be smaller";
trtllmSourceBuild = mkEnableOption "Build trtllm python from source to be smaller";
};
}