From d12ae7b72f28b634a683eab69e38c5e9e40be487 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 12 Jun 2024 22:02:33 +0200 Subject: [PATCH 01/35] tensorrt-llm: 0.9.0 -> 0.10.0, triton: 2.42.0 -> 2.44.0 --- default.nix | 54 ++++----- flake.nix | 2 +- lock.json | 266 +++++++++++++++++++++++------------------ nix/tensorrt-llm.nix | 35 ++++-- nix/trtllm-backend.nix | 17 +-- 5 files changed, 213 insertions(+), 161 deletions(-) diff --git a/default.nix b/default.nix index c3a72ee..7609e59 100644 --- a/default.nix +++ b/default.nix @@ -15,36 +15,28 @@ in cog_version = "0.10.0-alpha16"; cuda = "12.1"; # todo: 12.2 gpu = true; - # inspiration: echo tensorrt_llm==0.8.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line + # inspiration: echo tensorrt_llm==0.10.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line python_packages = [ "--extra-index-url" "https://pypi.nvidia.com" - "tensorrt_llm==0.9.0" + "tensorrt_llm==0.10.0" "torch==2.2.2" - "tensorrt==9.3.0.post12.dev1" - "tensorrt-bindings==9.3.0.post12.dev1" - "tensorrt-libs==9.3.0.post12.dev1" - "nvidia-pytriton==0.5.2" # corresponds to 2.42.0 - "httpx" - "nvidia-cublas-cu12<12.2" - "nvidia-cuda-nvrtc-cu12<12.2" - "nvidia-cuda-runtime-cu12<12.2" + "nvidia-pytriton==0.5.6" # corresponds to 2.44.0 "omegaconf" "hf-transfer" - "tokenizers" + "tokenizers>=0.19.0" ]; # don't ask why it needs ssh system_packages = [ "pget" "openssh" "openmpi" ]; }; python-env.pip = { uv.enable = true; - # todo: add some constraints to match cudaPackages constraintsList = [ - "nvidia-cudnn-cu12<9" + # "nvidia-cudnn-cu12==${cudaPackages.cudnn.version}" + "nvidia-cublas-cu12==${cudaPackages.libcublas.version}" ]; overridesList = [ - "tokenizers==0.19.0" - "transformers==4.40.0" + "pydantic==1.10.16" ]; }; cognix.includeNix = true; @@ -56,27 +48,31 @@ in # tensorrt likes doing a pip invocation from it's setup.py # circumvent by manually depending on tensorrt_libs, tensorrt_bindings # and setting this env variable - tensorrt.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true; - # TODO remove upon next rebuild: - tensorrt.mkDerivation.propagatedBuildInputs = with pythonDrvs; [ - tensorrt-libs.public - tensorrt-bindings.public + tensorrt-cu12.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true; + tensorrt-cu12.mkDerivation.buildInputs = [ python3.pkgs.pip ]; + tensorrt-cu12-bindings.mkDerivation.propagatedBuildInputs = [ + pythonDrvs.tensorrt-cu12-libs.public ]; - tensorrt-bindings.mkDerivation.propagatedBuildInputs = [ pythonDrvs.tensorrt-libs.public ]; - tensorrt-libs.mkDerivation.postFixup = '' + # fixes tensorrt-llm build + tensorrt-cu12-libs.mkDerivation.postFixup = '' pushd $out/${site}/tensorrt_libs - ln -s libnvinfer.so.9 libnvinfer.so - ln -s libnvonnxparser.so.9 libnvonnxparser.so + ln -s libnvinfer.so.10 libnvinfer.so + ln -s libnvonnxparser.so.10 libnvonnxparser.so popd ''; - tensorrt-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ]; + tensorrt-cu12-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ]; tensorrt-llm = { mkDerivation.buildInputs = [ cudaPackages.nccl ]; mkDerivation.propagatedBuildInputs = with pythonDrvs; [ - tensorrt-libs.public # libnvinfer, onnxparse + tensorrt-cu12-libs.public # libnvinfer, onnxparse ]; env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ]; - env.autoPatchelfIgnoreMissingDeps = ["libcuda.so.1"]; + env.autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ]; + mkDerivation.postInstall = '' + 
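+        # note (added comment): the {.10,} brace expansion hands patchelf both sonames, i.e.
+        # --replace-needed libnvinfer_plugin_tensorrt_llm.so.10 libnvinfer_plugin_tensorrt_llm.so,
+        # so executorWorker links against the unversioned plugin library instead of the .so.10 name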
pushd $out/${site}/tensorrt_llm/bin + patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so{.10,} executorWorker + popd + ''; }; # has some binaries that want cudart tritonclient.mkDerivation.postInstall = "rm -r $out/bin"; @@ -131,8 +127,8 @@ in deps.tensorrt-src = pkgs.fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT"; - rev = "6d1397ed4bb65933d02725623c122a157544a729"; # release/9.3 branch - hash = "sha256-XWFyMD7jjvgIihlqCJNyH5iSa1vZCDhv1maLJqMM3UE="; + rev = "v10.0.1"; + hash = "sha256-lSEw0GM0eW2BHNBq/wTQA8v3aNueE3FT+k9F5nH1OgA="; }; # todo: replace with lockfile deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec { diff --git a/flake.nix b/flake.nix index 6743fca..6ed8aca 100644 --- a/flake.nix +++ b/flake.nix @@ -26,7 +26,7 @@ cog-triton.architectures = architectures; # don't need this file in a runner - python-env.pip.drvs.tensorrt-libs.mkDerivation.postInstall = lib.mkAfter '' + python-env.pip.drvs.tensorrt-cu12-libs.mkDerivation.postInstall = lib.mkAfter '' rm $out/lib/python*/site-packages/tensorrt_libs/libnvinfer_builder_resource* ''; }); diff --git a/lock.json b/lock.json index 7207c4c..4acd4a4 100644 --- a/lock.json +++ b/lock.json @@ -2,10 +2,10 @@ "fetchPipMetadata": { "sources": { "accelerate": { - "sha256": "c7bb817eb974bba0ff3ea1ba0f24d55afb86d50e3d4fe98d6922dc69cf2ccff1", + "sha256": "71fcf4be00872194071de561634268b71417d7f5b16b178e2fa76b6f117c52b0", "type": "url", - "url": "https://files.pythonhosted.org/packages/f7/fc/c55e5a2da345c9a24aa2e1e0f60eb2ca290b6a41be82da03a6d4baec4f99/accelerate-0.25.0-py3-none-any.whl", - "version": "0.25.0" + "url": "https://files.pythonhosted.org/packages/e4/74/564f621699b049b0358f7ad83d7437f8219a5d6efb69bbfcca328b60152f/accelerate-0.32.1-py3-none-any.whl", + "version": "0.32.1" }, "aiohttp": { "sha256": "c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72", @@ -73,6 +73,12 @@ "url": "https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl", "version": "8.1.7" }, + "cloudpickle": { + "sha256": "246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7", + "type": "url", + "url": "https://files.pythonhosted.org/packages/96/43/dae06432d0c4b1dc9e9149ad37b4ca8384cf6eb7700cd9215b177b914f0a/cloudpickle-3.0.0-py3-none-any.whl", + "version": "3.0.0" + }, "cog": { "sha256": "0f658f2da28e37da8040d073af4f4e7a91b567a8d169f077d5afddc33793a62f", "type": "url", @@ -104,10 +110,10 @@ "version": "2.14.4" }, "diffusers": { - "sha256": "ca258d8141a9faa85b3ce60805fc4898c91d0e73fd5b1576413dfe3b8502a8ec", + "sha256": "d5e9bb13c8097b4eed10df23d1294d2e5a418f53e3f89c7ef228b5b982970428", "type": "url", - "url": "https://files.pythonhosted.org/packages/13/43/d4ae69ba5f503d58c7aef13f0f93d9c84694652dc2a16f8ea3d8246ebe95/diffusers-0.15.0-py3-none-any.whl", - "version": "0.15.0" + "url": "https://files.pythonhosted.org/packages/ee/22/2e6e90c87e718e63b1a860cb627bcf27ac4998edb5f190561b5c6cde6c62/diffusers-0.29.2-py3-none-any.whl", + "version": "0.29.2" }, "dill": { "sha256": "76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e", @@ -139,12 +145,6 @@ "url": "https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl", "version": "3.15.4" }, - "flatbuffers": { - "sha256": "8dbdec58f935f3765e4f7f3cf635ac3a77f83568138d6a2311f524ec96364812", - "type": "url", - "url": 
"https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl", - "version": "24.3.25" - }, "frozenlist": { "sha256": "a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a", "type": "url", @@ -277,12 +277,24 @@ "url": "https://files.pythonhosted.org/packages/e7/9c/eef7c591e6dc952f3636cfe0df712c0f9916cedf317810a3bb53ccb65cdd/lark-1.1.9-py3-none-any.whl", "version": "1.1.9" }, + "markdown-it-py": { + "sha256": "355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", + "type": "url", + "url": "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", + "version": "3.0.0" + }, "markupsafe": { "sha256": "2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", "type": "url", "url": "https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "2.1.5" }, + "mdurl": { + "sha256": "84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", + "type": "url", + "url": "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", + "version": "0.1.2" + }, "mpi4py": { "sha256": "c8fa625e0f92b082ef955bfb52f19fa6691d29273d7d71135d295aa143dee6cb", "type": "url", @@ -331,12 +343,6 @@ "url": "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "1.26.4" }, - "nvidia-ammo": { - "sha256": "ed6b0aa3748e735923ce3825c0044a130400fcd040a2bb54580e4bcd7ef605d3", - "type": "url", - "url": "https://pypi.nvidia.com/nvidia-ammo/nvidia_ammo-0.7.4-cp310-cp310-linux_x86_64.whl", - "version": "0.7.4" - }, "nvidia-cublas-cu12": { "sha256": "ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728", "type": "url", @@ -391,6 +397,12 @@ "url": "https://pypi.nvidia.com/nvidia-cusparse-cu12/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", "version": "12.1.0.106" }, + "nvidia-modelopt": { + "sha256": "f56f04280fef23727a49decf13ff8269c9cc47b95fc304fcefed79fbe8e6ef5f", + "type": "url", + "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.11.2-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "0.11.2" + }, "nvidia-nccl-cu12": { "sha256": "802756f02c43c0613dc83f48a76f702462b0f1f618411768748bba9c805fce19", "type": "url", @@ -410,10 +422,10 @@ "version": "12.1.105" }, "nvidia-pytriton": { - "sha256": "810531f752f7bdc4308b8821056ce2d5a456e6cb62966f2e07f65cff0053e42a", + "sha256": "6403e65c2bbab0ab2fe2b737ad612e2b88f3edf20d41aadd1d544ffb309a701c", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.2-py3-none-manylinux_2_35_x86_64.whl", - "version": "0.5.2" + "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.6-py3-none-manylinux_2_35_x86_64.whl", + "version": "0.5.6" }, "omegaconf": { "sha256": "7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", @@ -427,23 +439,11 @@ "url": "https://files.pythonhosted.org/packages/c6/7e/5031717c0636e6074764a2f61a459a3ecd46c20d8b83a1f1cd2513a76160/onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "1.16.1" }, - "onnx-graphsurgeon": { - "sha256": "10c130d6129fdeee02945f8103b5b112e6fd4d9b356e2dd3e80f53e0ebee7b5c", - "type": "url", - "url": 
"https://pypi.nvidia.com/onnx-graphsurgeon/onnx_graphsurgeon-0.5.2-py2.py3-none-any.whl", - "version": "0.5.2" - }, - "onnxruntime": { - "sha256": "ef2b1fc269cabd27f129fb9058917d6fdc89b188c49ed8700f300b945c81f889", - "type": "url", - "url": "https://files.pythonhosted.org/packages/7a/cf/6aa8c56fd63f53c2c485921e411269c7b501a2b4e634bd02f226ab2d5d8e/onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.16.3" - }, "optimum": { - "sha256": "1354dd1081179b7c490d135c7f380cee672125e17c0bfef143e616c5b756b1db", + "sha256": "8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8", "type": "url", - "url": "https://files.pythonhosted.org/packages/13/6d/6b03ffb8df1ab2b43d461f7cace2af5f20092f0767f53a3e9331df00e8a2/optimum-1.21.1-py3-none-any.whl", - "version": "1.21.1" + "url": "https://files.pythonhosted.org/packages/fa/e4/f832e42a1eb9d5ac4fa6379295e05aebeae507d171babc1786bfa0210299/optimum-1.21.2-py3-none-any.whl", + "version": "1.21.2" }, "packaging": { "sha256": "5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", @@ -494,10 +494,16 @@ "version": "16.1.0" }, "pydantic": { - "sha256": "371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e", + "sha256": "4660dd697de1ae2d4305a85161312611f64d5360663a9ba026cd6ad9e3fe14c3", + "type": "url", + "url": "https://files.pythonhosted.org/packages/ae/d8/3ffbdeccf252d56c8e0b6f1f30798d3aa0ad5afaa541908207881855beeb/pydantic-1.10.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.10.16" + }, + "pygments": { + "sha256": "b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", "type": "url", - "url": "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.10.17" + "url": "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", + "version": "2.18.0" }, "pynvml": { "sha256": "5cce014ac01b098d08f06178f86c37be409b80b2e903a5a03ce15eed60f55e25", @@ -542,10 +548,10 @@ "version": "6.0.1" }, "pyzmq": { - "sha256": "7e0113d70b095339e99bb522fe7294f5ae6a7f3b2b8f52f659469a74b5cc7661", + "sha256": "ba6e5e6588e49139a0979d03a7deb9c734bde647b9a8808f26acf9c547cab1bf", "type": "url", - "url": "https://files.pythonhosted.org/packages/b7/ac/18b75626cede66295a27e94d7cfe301d2d35120b200a6a46f205a171a20e/pyzmq-23.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "23.2.1" + "url": "https://files.pythonhosted.org/packages/40/4f/088d0fe18b188a0754483b7d632a97ef608dce80c2648219d071c9f1715c/pyzmq-26.0.3-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "26.0.3" }, "regex": { "sha256": "1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5", @@ -559,6 +565,12 @@ "url": "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", "version": "2.32.3" }, + "rich": { + "sha256": "4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222", + "type": "url", + "url": "https://files.pythonhosted.org/packages/87/67/a37f6214d0e9fe57f6ae54b2956d550ca8365857f42a1ce0392bb21d9410/rich-13.7.1-py3-none-any.whl", + "version": "13.7.1" + }, "safetensors": { "sha256": "d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376", "type": "url", @@ -578,16 +590,22 @@ "version": "0.2.0" }, "setuptools": { - "sha256": 
"b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05", + "sha256": "fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc", "type": "url", - "url": "https://files.pythonhosted.org/packages/42/54/2a8ecfcc9a714a6fbf86559a4b0f50b126a4ac4269ea8134f2c75c3e73de/setuptools-70.2.0-py3-none-any.whl", - "version": "70.2.0" + "url": "https://files.pythonhosted.org/packages/ef/15/88e46eb9387e905704b69849618e699dc2f54407d8953cc4ec4b8b46528d/setuptools-70.3.0-py3-none-any.whl", + "version": "70.3.0" }, "sh": { - "sha256": "e4045b6c732d9ce75d571c79f5ac2234edd9ae4f5fa9d59b09705082bdca18c7", + "sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d", + "type": "url", + "url": "https://files.pythonhosted.org/packages/15/c2/79f9dea6fc544c0eb79ed5018a38860c52d597c4be66c2cf2029bea5b3fd/sh-2.0.7-py3-none-any.whl", + "version": "2.0.7" + }, + "shellingham": { + "sha256": "7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", "type": "url", - "url": "https://files.pythonhosted.org/packages/b7/09/89c28aaf2a49f226fef8587c90c6386bd2cc03a0295bc4ff7fc6ee43c01d/sh-1.14.3.tar.gz", - "version": "1.14.3" + "url": "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", + "version": "1.5.4" }, "six": { "sha256": "8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254", @@ -620,40 +638,46 @@ "version": "24.2.0" }, "sympy": { - "sha256": "9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515", + "sha256": "6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92", "type": "url", - "url": "https://files.pythonhosted.org/packages/61/53/e18c8c97d0b2724d85c9830477e3ebea3acf1dcdc6deb344d5d9c93a9946/sympy-1.12.1-py3-none-any.whl", - "version": "1.12.1" + "url": "https://files.pythonhosted.org/packages/62/74/7e6c65ee89ff43942bffffdbb238634f16967bf327aee3c76efcf6e49587/sympy-1.13.0-py3-none-any.whl", + "version": "1.13.0" }, "tensorrt": { - "sha256": "24aea5376cb8440afe2b0a22ee83f9748e586aa27303d4f80091ad48a56552a4", + "sha256": "7e9c8666f5bee86771451f007e25f81d65a411a26e6ea0b41faa5ec83ab863af", + "type": "url", + "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.0.1.tar.gz", + "version": "10.0.1" + }, + "tensorrt-cu12": { + "sha256": "9663446e2872113d619ad5010766cccc1f023d693cb43c3f8f2496563028badc", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt/tensorrt-9.3.0.post12.dev1.tar.gz", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.2.0.post1.tar.gz", + "version": "10.2.0.post1" }, - "tensorrt-bindings": { - "sha256": "c1619e4a9b23b077717af7635489cd1a12a8b4d97477088fc3c5d3a81e36bf65", + "tensorrt-cu12-bindings": { + "sha256": "3248e7951d1f2fa8884759b19456ab7d08a3f75bd6b8e5d58e5cc18788c02171", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-bindings/tensorrt_bindings-9.3.0.post12.dev1-cp310-none-manylinux_2_17_x86_64.whl", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.2.0.post1-cp310-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0.post1" }, - "tensorrt-libs": { - "sha256": "ab0b6ee6cd41503273d44892cb92b92c75d046a5e468b73884978f59cca4b8d9", + "tensorrt-cu12-libs": { + "sha256": "a42f7ecb1659fac27cf68996df0984e68018be61bd8bbd95f51619f9c4e9cf31", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-libs/tensorrt_libs-9.3.0.post12.dev1-py2.py3-none-manylinux_2_17_x86_64.whl", - "version": "9.3.0.post12.dev1" + 
"url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.2.0.post1-py2.py3-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0.post1" }, "tensorrt-llm": { - "sha256": "2f60b6f8d0afee5f52a5160a44815b0af3e9cd4c46b53cc7a252377ed6cec670", + "sha256": "c7975326fa10b56079e0febf7c52a65ccf5b37760cd1c79d5aa3e8c7d85ce69c", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.9.0-cp310-cp310-linux_x86_64.whl", - "version": "0.9.0" + "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.10.0-cp310-cp310-linux_x86_64.whl", + "version": "0.10.0" }, "tokenizers": { - "sha256": "06a56acdfe6c5d51c03ebfc6838f727fcf231c035b94f2460cca68947f6799dc", + "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e", "type": "url", - "url": "https://files.pythonhosted.org/packages/11/f9/8c77a471469ea7d1b52f2a25607385109c954d6444a9b0df19796beba461/tokenizers-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.19.0" + "url": "https://files.pythonhosted.org/packages/40/4f/eb78de4af3b17b589f43a369cbf0c3a7173f25c3d2cd93068852c07689aa/tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.19.1" }, "tomli": { "sha256": "939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", @@ -674,10 +698,10 @@ "version": "4.66.4" }, "transformers": { - "sha256": "92797ec3368ed4476a053529a4039a12ad09167d9e371981dda4afb4bdf590ac", + "sha256": "71cb94301ec211a2e1d4b8c8d18dcfaa902dfa00a089dceca167a8aa265d6f2d", "type": "url", - "url": "https://files.pythonhosted.org/packages/09/c8/844d5518a6aeb4ffdc0cf0cae65ae13dbe5838306728c5c640b5a6e2a0c9/transformers-4.40.0-py3-none-any.whl", - "version": "4.40.0" + "url": "https://files.pythonhosted.org/packages/05/23/ba02efa28518557e0cfe0ce5c1170000dd7501ed02ac865fc90cbe3daa93/transformers-4.40.2-py3-none-any.whl", + "version": "4.40.2" }, "triton": { "sha256": "a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5", @@ -691,6 +715,12 @@ "url": "https://pypi.nvidia.com/tritonclient/tritonclient-2.47.0-py3-none-manylinux1_x86_64.whl", "version": "2.47.0" }, + "typer": { + "sha256": "070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914", + "type": "url", + "url": "https://files.pythonhosted.org/packages/20/b5/11cf2e34fbb11b937e006286ab5b8cfd334fde1c8fa4dd7f491226931180/typer-0.12.3-py3-none-any.whl", + "version": "0.12.3" + }, "typing-extensions": { "sha256": "04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", "type": "url", @@ -698,10 +728,10 @@ "version": "4.12.2" }, "typing-inspect": { - "sha256": "3b98390df4d999a28cf5b35d8b333425af5da2ece8a4ea9e98f71e7591347b4f", + "sha256": "9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f", "type": "url", - "url": "https://files.pythonhosted.org/packages/42/1c/66402db44184904a2f14722d317a4da0b5c8c78acfc3faf74362566635c5/typing_inspect-0.6.0-py3-none-any.whl", - "version": "0.6.0" + "url": "https://files.pythonhosted.org/packages/65/f3/107a22063bf27bdccf2024833d3445f4eea42b2e598abfbd46f6a63b6cb0/typing_inspect-0.9.0-py3-none-any.whl", + "version": "0.9.0" }, "tzdata": { "sha256": "9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", @@ -791,8 +821,7 @@ "psutil", "pyyaml", "safetensors", - "torch", - "transformers" + "torch" ], "aiohttp": [ "aiosignal", @@ -823,6 +852,7 @@ "certifi": [], "charset-normalizer": [], "click": [], + "cloudpickle": [], "cog": [ "attrs", "fastapi", @@ -852,7 +882,6 @@ "pyyaml", "requests", "tqdm", - "transformers", "xxhash" ], 
"diffusers": [ @@ -863,7 +892,7 @@ "pillow", "regex", "requests", - "transformers" + "safetensors" ], "dill": [], "evaluate": [ @@ -877,7 +906,6 @@ "pandas", "requests", "tqdm", - "transformers", "xxhash" ], "exceptiongroup": [], @@ -886,7 +914,6 @@ "starlette" ], "filelock": [], - "flatbuffers": [], "frozenlist": [], "fsspec": [ "aiohttp" @@ -949,7 +976,11 @@ "markupsafe" ], "lark": [], + "markdown-it-py": [ + "mdurl" + ], "markupsafe": [], + "mdurl": [], "mpi4py": [], "mpmath": [], "multidict": [], @@ -960,18 +991,6 @@ "networkx": [], "ninja": [], "numpy": [], - "nvidia-ammo": [ - "networkx", - "ninja", - "numpy", - "onnx", - "onnx-graphsurgeon", - "onnxruntime", - "scipy", - "torch", - "tqdm", - "transformers" - ], "nvidia-cublas-cu12": [], "nvidia-cuda-cupti-cu12": [], "nvidia-cuda-nvrtc-cu12": [], @@ -989,15 +1008,28 @@ "nvidia-cusparse-cu12": [ "nvidia-nvjitlink-cu12" ], + "nvidia-modelopt": [ + "cloudpickle", + "ninja", + "numpy", + "packaging", + "pydantic", + "rich", + "scipy", + "tqdm" + ], "nvidia-nccl-cu12": [], "nvidia-nvjitlink-cu12": [], "nvidia-nvtx-cu12": [], "nvidia-pytriton": [ + "grpcio", + "importlib-metadata", "numpy", "protobuf", "pyzmq", "sh", "tritonclient", + "typer", "typing-inspect", "wrapt" ], @@ -1009,18 +1041,6 @@ "numpy", "protobuf" ], - "onnx-graphsurgeon": [ - "numpy", - "onnx" - ], - "onnxruntime": [ - "coloredlogs", - "flatbuffers", - "numpy", - "packaging", - "protobuf", - "sympy" - ], "optimum": [ "coloredlogs", "datasets", @@ -1049,6 +1069,7 @@ "pydantic": [ "typing-extensions" ], + "pygments": [], "pynvml": [], "pyproject-hooks": [], "python-dateutil": [ @@ -1066,6 +1087,10 @@ "idna", "urllib3" ], + "rich": [ + "markdown-it-py", + "pygments" + ], "safetensors": [], "scipy": [ "numpy" @@ -1073,6 +1098,7 @@ "sentencepiece": [], "setuptools": [], "sh": [], + "shellingham": [], "six": [], "sniffio": [], "starlette": [ @@ -1084,14 +1110,15 @@ "mpmath" ], "tensorrt": [ - "tensorrt-bindings", - "tensorrt-libs" + "tensorrt-cu12" ], - "tensorrt-bindings": [], - "tensorrt-libs": [ - "nvidia-cublas-cu12", - "nvidia-cuda-runtime-cu12", - "nvidia-cudnn-cu12" + "tensorrt-cu12": [ + "tensorrt-cu12-bindings", + "tensorrt-cu12-libs" + ], + "tensorrt-cu12-bindings": [], + "tensorrt-cu12-libs": [ + "nvidia-cuda-runtime-cu12" ], "tensorrt-llm": [ "accelerate", @@ -1106,17 +1133,16 @@ "mpi4py", "mpmath", "numpy", - "nvidia-ammo", - "nvidia-cudnn-cu12", + "nvidia-modelopt", "onnx", "optimum", "pandas", "polygraphy", "psutil", "pulp", + "pydantic", "pynvml", "sentencepiece", - "setuptools", "strenum", "tensorrt", "torch", @@ -1153,10 +1179,13 @@ "huggingface-hub", "numpy", "packaging", + "protobuf", + "pydantic", "pyyaml", "regex", "requests", "safetensors", + "sentencepiece", "tokenizers", "tqdm" ], @@ -1165,7 +1194,6 @@ ], "tritonclient": [ "aiohttp", - "cuda-python", "geventhttpclient", "grpcio", "numpy", @@ -1174,6 +1202,12 @@ "python-rapidjson", "urllib3" ], + "typer": [ + "click", + "rich", + "shellingham", + "typing-extensions" + ], "typing-extensions": [], "typing-inspect": [ "mypy-extensions", @@ -1214,5 +1248,5 @@ } } }, - "invalidationHash": "aea5c24536de46921b0505e9f29e379558d83bbd76f08cf2f49f8ffe84243032" + "invalidationHash": "aedf040e5687ab8badc94e4500a11b3037a51c13346051fbcf5f441fd85fcfbb" } \ No newline at end of file diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index 20f901d..cb315ca 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -17,14 +17,14 @@ }: stdenv.mkDerivation (o: { pname = "tensorrt_llm"; - version = "0.9.0"; + 
version = "0.10.0"; src = fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT-LLM"; rev = "v${o.version}"; fetchSubmodules = true; fetchLFS = true; # libtensorrt_llm_batch_manager_static.a - hash = "sha256-BGU56yI6yuTGHYhq5I3xYhrsKI8O4ykhDFeRP/JGCRo="; + hash = "sha256-eOAixXzOQRaySbUtpeAF9qMFOzwe1rosC0GOgy8CakU="; }; outputs = if withPython then @@ -54,6 +54,10 @@ stdenv.mkDerivation (o: { # torch hates the split cuda, so only do it without torch cudaPackages.cuda_cudart cudaPackages.cuda_nvcc.dev + cudaPackages.cuda_nvrtc.dev + cudaPackages.cuda_nvrtc.lib + cudaPackages.cuda_nvml_dev.lib + cudaPackages.cuda_nvml_dev.dev cudaPackages.cuda_cccl cudaPackages.libcublas.lib cudaPackages.libcublas.dev @@ -85,8 +89,8 @@ stdenv.mkDerivation (o: { pynvml # >=11.5.0 sentencepiece # >=0.1.99 tensorrt # ==9.2.0.post12.dev5 - tensorrt-bindings # missed transitive dep - tensorrt-libs + tensorrt-cu12-bindings # missed transitive dep + tensorrt-cu12-libs torch # <=2.2.0a nvidia-ammo # ~=0.7.0; platform_machine=="x86_64" transformers # ==4.36.1 @@ -109,11 +113,16 @@ stdenv.mkDerivation (o: { "-DBUILD_PYBIND=${if withPython then "ON" else "OFF"}" # needs BUILD_PYT "-DBUILD_TESTS=OFF" # needs nvonnxparser.h # believe it or not, this is the actual binary distribution channel for tensorrt: - "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-libs.public}/${python3.sitePackages}/tensorrt_libs" + "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-cu12-libs.public}/${python3.sitePackages}/tensorrt_libs" "-DTRT_INCLUDE_DIR=${tensorrt-src}/include" "-DCMAKE_CUDA_ARCHITECTURES=${builtins.concatStringsSep ";" architectures}" # "-DFAST_BUILD=ON" ]; + # include cstdint in cpp/tensorrt_llm/common/mpiUtils.h after pragma once + postPatch = '' + sed -i 's/#include /#include \n#include /' /build/source/cpp/include/tensorrt_llm/common/mpiUtils.h + sed -i 's/#pragma once/#pragma once\n#include /' /build/source/cpp/tensorrt_llm/kernels/lruKernel.h + ''; postBuild = lib.optionalString withPython '' pushd ../../ chmod -R +w . @@ -137,17 +146,29 @@ stdenv.mkDerivation (o: { mkdir -p $out ${rsync}/bin/rsync -a --exclude "tensorrt_llm/kernels" $src/cpp $out/ chmod -R u+w $out/cpp + ${rsync}/bin/rsync -a $src/cpp/tensorrt_llm/kernels $out/cpp/tensorrt_llm/ + chmod -R u+w $out/cpp mkdir -p $out/cpp/build/tensorrt_llm/plugins pushd tensorrt_llm cp ./libtensorrt_llm.so $out/cpp/build/tensorrt_llm/ + cp -r ./executor_worker $out/cpp/build/tensorrt_llm/ + chmod -R u+w $out/cpp/build/tensorrt_llm/executor_worker patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib $out/cpp/build/tensorrt_llm/libtensorrt_llm.so cp ./plugins/libnvinfer_plugin_tensorrt_llm.so* $out/cpp/build/tensorrt_llm/plugins/ - for f in $out/cpp/build/tensorrt_llm/plugins/*.so*; do + mkdir -p $out/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/ + cp -r /build/source/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper $_ + for f in $out/cpp/build/tensorrt_llm/plugins/*.so* $out/cpp/build/tensorrt_llm/executor_worker/executorWorker; do if [ ! 
-L "$f" ]; then - new_path=$(patchelf --print-rpath "$f" | sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#') + new_path=$(patchelf --print-rpath "$f" | + sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#g' | + sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../../tensorrt_llm#g' + ) patchelf --set-rpath "$new_path" "$f" fi done + new_path=$(patchelf --print-rpath $out/cpp/build/tensorrt_llm/libtensorrt_llm.so | + sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../tensorrt_llm#') + patchelf --set-rpath "$new_path" $out/cpp/build/tensorrt_llm/libtensorrt_llm.so popd '' + (lib.optionalString withPython '' diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index 4dab9b2..b52d81c 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -28,12 +28,11 @@ let rev = "a06e9a1157d6b5b9b34b6d05a07bb84d517f17c9"; hash = "sha256-Ju2zV/jHUuciTs6GbkqcPG8U0y2lkIWSdAsX78DrpV4="; }; - # todo: update with trt-llm 0.9? deps.triton_repo_core = fetchFromGitHub { owner = "triton-inference-server"; repo = "core"; - rev = "5d4a99c285c729a349265ce8dd7a4535e59d29b1"; - hash = "sha256-WP8bwplo98GmNulX+QA+IrQEc2+GMcTjV53K438vX1g="; + rev = "434e50313b80fdc7ef295fcb3baeeacf65b295e4"; + hash = "sha256-kfDXQEYuMze4E53OHHJ1YjQHnNtAEt4lzNK27K6ttVE="; }; deps.googletest = fetchFromGitHub { owner = "google"; @@ -43,18 +42,18 @@ let }; inherit (python3) sitePackages; - trt_lib_dir = "${pythonDrvs.tensorrt-libs.public}/${sitePackages}/tensorrt_libs"; + trt_lib_dir = "${pythonDrvs.tensorrt-cu12-libs.public}/${sitePackages}/tensorrt_libs"; # this package wants gcc12 oldGccStdenv = stdenvAdapters.useLibsFrom stdenv gcc12Stdenv; in oldGccStdenv.mkDerivation rec { pname = "tensorrtllm_backend"; - version = "0.9.0"; + version = "0.10.0"; src = fetchFromGitHub { owner = "triton-inference-server"; repo = "tensorrtllm_backend"; rev = "v${version}"; - hash = "sha256-aNjVYu7sDrIj/lse/wS3vYaR/vmjtZfxzBWYi3z3KqQ="; + hash = "sha256-6df9MbHPqBVxpdkTcEzf99OCPtgFrK0jjDJfvE/guyA="; }; nativeBuildInputs = [ cmake @@ -70,6 +69,8 @@ oldGccStdenv.mkDerivation rec { cudaPackages.cuda_cccl cudaPackages.libcublas.lib cudaPackages.libcublas.dev + cudaPackages.cuda_nvml_dev.lib + cudaPackages.cuda_nvml_dev.dev ]; sourceRoot = "source/inflight_batcher_llm"; cmakeFlags = [ @@ -84,7 +85,7 @@ oldGccStdenv.mkDerivation rec { ]; postInstall = '' mkdir -p $out/backends/tensorrtllm - cp libtriton_*.so triton_tensorrtllm_worker $out/backends/tensorrtllm + cp libtriton_*.so trtllmExecutorWorker $out/backends/tensorrtllm rm -r /build/source/inflight_batcher_llm/build/_deps/repo-core-build rm -r /build/source/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so ''; @@ -94,7 +95,7 @@ oldGccStdenv.mkDerivation rec { --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' patchelf $out/backends/tensorrtllm/libtriton_tensorrtllm_common.so \ --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' - patchelf $out/backends/tensorrtllm/triton_tensorrtllm_worker \ + patchelf $out/backends/tensorrtllm/trtllmExecutorWorker \ --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' ''; } From c3eb6f3c1f5ed66b85c7e64c007377d1bb535928 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 4 Jul 2024 14:08:51 +0200 Subject: [PATCH 02/35] update 
cognix --- flake.lock | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/flake.lock b/flake.lock index 644260d..769d11c 100644 --- a/flake.lock +++ b/flake.lock @@ -12,16 +12,15 @@ "rust-overlay": "rust-overlay" }, "locked": { - "lastModified": 1721227860, - "narHash": "sha256-Ufbkk0FaMViyFoRogCq+5iEWs8pCGR0Xkc+v7i83Uw0=", + "lastModified": 1720087249, + "narHash": "sha256-xKIs2n8Ux7Y+BgaBfZekeZD9v70Gvas4oNQVBsctQYw=", "owner": "datakami", "repo": "cognix", - "rev": "47db7ff3e82bc73bd067c89997bb8006da08a148", + "rev": "2f11a38c8d6bd9ba2c8ea4970cd93c0e334f7189", "type": "github" }, "original": { "owner": "datakami", - "ref": "24.03", "repo": "cognix", "type": "github" } @@ -33,15 +32,15 @@ "pyproject-nix": "pyproject-nix" }, "locked": { - "lastModified": 1710167744, - "narHash": "sha256-z78iB1ckRQuJluM82iCuQNjN5hqsNpd1om0q75ncza4=", - "owner": "yorickvp", + "lastModified": 1719513340, + "narHash": "sha256-on3zRua52KZ8G5kBOXMQOzrsA07ywVMNdcIWJEeotfo=", + "owner": "nix-community", "repo": "dream2nix", - "rev": "3bfbbbb19471b60cf1bb7f7c476588a36ac3fb04", + "rev": "4d441820e0d0916c97d7af6c4d4f6843d676e242", "type": "github" }, "original": { - "owner": "yorickvp", + "owner": "nix-community", "repo": "dream2nix", "type": "github" } @@ -84,11 +83,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1709780214, - "narHash": "sha256-p4iDKdveHMhfGAlpxmkCtfQO3WRzmlD11aIcThwPqhk=", + "lastModified": 1719436386, + "narHash": "sha256-NBGYaic5FLRg8AWSj6yr4g2IlMPUxNCVjRK6+RNuQBc=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "f945939fd679284d736112d3d5410eb867f3b31c", + "rev": "c66e984bda09e7230ea7b364e677c5ba4f0d36d0", "type": "github" }, "original": { From cec6d28f0b981a91a21f1926853202ade5ded078 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 4 Jul 2024 14:54:41 +0200 Subject: [PATCH 03/35] update triton_templates to trtllm-0.10 --- triton_templates/ensemble/config.pbtxt | 6 +- triton_templates/postprocessing/1/model.py | 10 +- triton_templates/postprocessing/config.pbtxt | 2 +- triton_templates/preprocessing/1/model.py | 14 - triton_templates/tensorrt_llm/1/model.py | 581 +++++++++++++++++++ triton_templates/tensorrt_llm/config.pbtxt | 101 +++- 6 files changed, 683 insertions(+), 31 deletions(-) create mode 100644 triton_templates/tensorrt_llm/1/model.py diff --git a/triton_templates/ensemble/config.pbtxt b/triton_templates/ensemble/config.pbtxt index 0e2627b..bb521d3 100644 --- a/triton_templates/ensemble/config.pbtxt +++ b/triton_templates/ensemble/config.pbtxt @@ -173,8 +173,8 @@ input [ ] output [ { - name: "output_ids" - data_type: TYPE_INT32 + name: "text_output" + data_type: TYPE_STRING dims: [ -1 ] }, { @@ -421,7 +421,7 @@ ensemble_scheduling { } output_map { key: "OUTPUT" - value: "output_ids" + value: "text_output" } output_map { key: "OUT_OUTPUT_LOG_PROBS" diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py index 5d5663b..02aafad 100644 --- a/triton_templates/postprocessing/1/model.py +++ b/triton_templates/postprocessing/1/model.py @@ -129,19 +129,13 @@ def execute(self, requests): # tokens_batch = tokens_batch.T # Postprocessing output data. - # outputs = self._postprocessing(tokens_batch, sequence_lengths) + outputs = self._postprocessing(tokens_batch, sequence_lengths) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. 
- output_tensor = pb_utils.Tensor( 'OUTPUT', - tokens_batch - ) - - # output_tensor = pb_utils.Tensor( - # 'OUTPUT', - # np.array(outputs).astype(self.output_dtype)) + np.array(outputs).astype(self.output_dtype)) outputs = [] outputs.append(output_tensor) diff --git a/triton_templates/postprocessing/config.pbtxt b/triton_templates/postprocessing/config.pbtxt index 67b8b8a..60d0290 100644 --- a/triton_templates/postprocessing/config.pbtxt +++ b/triton_templates/postprocessing/config.pbtxt @@ -66,7 +66,7 @@ input [ output [ { name: "OUTPUT" - data_type: TYPE_INT32 + data_type: TYPE_STRING dims: [ -1 ] }, { diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py index 0f561f7..62ab243 100644 --- a/triton_templates/preprocessing/1/model.py +++ b/triton_templates/preprocessing/1/model.py @@ -268,11 +268,6 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): flat_ids = [] offsets = [] - arbitrary_start_sequence_token = "!" - arbitrary_start_sequence_id = self.tokenizer.encode( - "!", add_special_tokens=False - )[0] - for word_list in word_lists: item_flat_ids = [] item_offsets = [] @@ -281,16 +276,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if isinstance(word, bytes): word = word.decode() - word = arbitrary_start_sequence_token + word ids = self.tokenizer.encode(word, add_special_tokens=False) - if ids[0] != arbitrary_start_sequence_id: - raise ValueError( - f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence." - "We then strip the corresponding first token from the stop sequence IDs." - "However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggestions there is a problem with the tokenizer that you are using." - ) - else: - ids = ids[1:] if len(ids) == 0: continue diff --git a/triton_templates/tensorrt_llm/1/model.py b/triton_templates/tensorrt_llm/1/model.py new file mode 100644 index 0000000..5b3eda3 --- /dev/null +++ b/triton_templates/tensorrt_llm/1/model.py @@ -0,0 +1,581 @@ +import datetime +import json +import os +import time +from threading import Lock, Thread + +import numpy as np +import triton_python_backend_utils as pb_utils +from torch import from_numpy + +import tensorrt_llm.bindings.executor as trtllm + + +def get_input_tensor_by_name(request, name): + tensor = pb_utils.get_input_tensor_by_name(request, name) + if tensor is None: + return None + return tensor.as_numpy() + + +def get_input_scalar_by_name(request, name): + tensor = get_input_tensor_by_name(request, name) + if tensor is None: + return None + if tensor.size != 1: + raise pb_utils.TritonModelException( + f"Expected a single value for {name}") + return tensor.item() + + +def read_parameter_as_type(value, name, pytype=str): + if value == "": + return None + if value.startswith("${") and value.endswith("}"): + return None + if pytype is bool: + return value.lower() in ["1", "true"] + try: + result = pytype(value) + return result + except: + pb_utils.Logger.log_warning( + f"Could not read parameter '{name}' with value '{value}', will use default." 
+ ) + return None + + +def get_parameter(model_config, name, pytype=str): + if name not in model_config['parameters']: + return None + return read_parameter_as_type( + model_config['parameters'][name]['string_value'], name, pytype) + + +def convert_word_list(word_list): + if word_list is None: + return None + word_list = word_list.tolist() + if len(word_list) == 0 or len(word_list[0]) != 2: + raise pb_utils.TritonModelException(f"Invalid format for word list.") + words, indices = word_list[0] + result = [] + current_index = 0 + for i in indices: + if i == -1: + continue + if i > len(words): + raise pb_utils.TritonModelException( + f"Invalid format for word list.") + current_word = [] + while current_index < i: + current_word.append(words[current_index]) + current_index += 1 + result.append(current_word) + return result + + +def parse_medusa_choices(medusa_choices): + if medusa_choices is None: + return None + try: + result = json.loads( + "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]") + assert isinstance(result, list) and len(result) > 0 + assert all([isinstance(x, list) for x in result]) + assert all([isinstance(y, int) for x in result for y in x]) + except Exception: + raise pb_utils.TritonModelException( + "Invalid format for medusa_choices") + return result + + +def get_sampling_config_from_request(request): + kwargs = {} + kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1 + kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k') + kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p') + kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[ + 'top_p'] <= 0 else kwargs['top_p'] + kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed') + kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature') + kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length') + kwargs['repetition_penalty'] = get_input_scalar_by_name( + request, 'repetition_penalty') + kwargs['presence_penalty'] = get_input_scalar_by_name( + request, 'presence_penalty') + kwargs['frequency_penalty'] = get_input_scalar_by_name( + request, 'frequency_penalty') + kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty') + kwargs['top_p_min'] = get_input_scalar_by_name(request, + 'runtime_top_p_min') + kwargs['top_p_reset_ids'] = get_input_scalar_by_name( + request, 'runtime_top_p_reset_ids') + kwargs['top_p_decay'] = get_input_scalar_by_name(request, + 'runtime_top_p_decay') + kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name( + request, 'beam_search_diversity_rate') + kwargs['early_stopping'] = get_input_scalar_by_name( + request, 'early_stopping') + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.SamplingConfig(**kwargs) + + +def get_output_config_from_request(request, exclude_input_from_output): + kwargs = {} + kwargs["return_log_probs"] = get_input_scalar_by_name( + request, 'return_log_probs') + kwargs["return_context_logits"] = get_input_scalar_by_name( + request, 'return_context_logits') + kwargs["return_generation_logits"] = get_input_scalar_by_name( + request, 'return_generation_logits') + kwargs["exclude_input_from_output"] = exclude_input_from_output + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.OutputConfig(**kwargs) + + +def get_speculative_decoding_config_from_request(request): + kwargs = {} + draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids') + if draft_input_ids is not None: + 
kwargs['tokens'] = draft_input_ids.tolist() + draft_logits = get_input_tensor_by_name(request, 'draft_logits') + if draft_logits is not None: + kwargs['logits'] = from_numpy(draft_logits) + kwargs['acceptance_threshold'] = get_input_scalar_by_name( + request, 'draft_acceptance_threshold') + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.SpeculativeDecodingConfig(**kwargs) + return None + + +def get_prompt_tuning_config_from_request(request): + # prompt_vocab_size is unused by executor. + kwargs = {} + prompt_embedding_table = get_input_tensor_by_name( + request, 'prompt_embedding_table') + if prompt_embedding_table is not None: + kwargs["embedding_table"] = from_numpy(prompt_embedding_table) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.PromptTuningConfig(**kwargs) + return None + + +def get_lora_config_from_request(request): + kwargs = {} + kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id') + lora_weights = get_input_tensor_by_name(request, 'lora_weights') + if lora_weights is not None: + kwargs["weights"] = from_numpy(lora_weights) + lora_config = get_input_tensor_by_name(request, 'lora_config') + if lora_config is not None: + kwargs["config"] = from_numpy(lora_config) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.LoraConfig(**kwargs) + return None + + +def convert_request(request, exclude_input_from_output, decoupled): + inputs = {} + input_token_ids = get_input_tensor_by_name(request, 'input_ids') + if input_token_ids is None: + raise pb_utils.TritonModelException( + "A value is required for input_ids") + input_token_ids = input_token_ids.tolist() + if len(input_token_ids) == 0: + raise pb_utils.TritonModelException(f"Invalid format for input_ids") + inputs['input_token_ids'] = input_token_ids[0] + # input_lengths is not not used by executor. 
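+    # request_output_len is Triton's name for the executor's max_new_tokens;
+    # it is the one generation-length input that must be present (checked below).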
+ inputs['max_new_tokens'] = get_input_scalar_by_name( + request, 'request_output_len') + if inputs['max_new_tokens'] is None: + raise pb_utils.TritonModelException( + "A value is required for request_output_len") + inputs['streaming'] = get_input_scalar_by_name(request, 'streaming') + if inputs['streaming'] and not decoupled: + raise pb_utils.TritonModelException( + "Streaming is only supported in decoupled mode.") + inputs['end_id'] = get_input_scalar_by_name(request, 'end_id') + inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id') + inputs['stop_words'] = convert_word_list( + get_input_tensor_by_name(request, 'stop_words_list')) + inputs['bad_words'] = convert_word_list( + get_input_tensor_by_name(request, 'bad_words_list')) + embedding_bias = get_input_tensor_by_name(request, 'embedding_bias') + if embedding_bias is not None and embedding_bias.size != 0: + inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze() + + sampling_config = get_sampling_config_from_request(request) + output_config = get_output_config_from_request(request, + exclude_input_from_output) + speculative_decoding_config = get_speculative_decoding_config_from_request( + request) + prompt_tuning_config = get_prompt_tuning_config_from_request(request) + lora_config = get_lora_config_from_request(request) + + return trtllm.Request( + **inputs, + sampling_config=sampling_config, + output_config=output_config, + speculative_decoding_config=speculative_decoding_config, + prompt_tuning_config=prompt_tuning_config, + lora_config=lora_config, + ) + + +def convert_response(response): + if response.has_error(): + return pb_utils.InferenceResponse(output_tensors=[], + error=pb_utils.TritonError( + response.error_msg)), True + result = response.result + beam_lengths = np.expand_dims( + np.array([len(beam) for beam in result.output_token_ids], np.int32), 0) + max_beam_length = max([len(beam) for beam in result.output_token_ids]) + output_ids = np.full((1, len(result.output_token_ids), max_beam_length), + -1, np.int32) + for idx, beam in enumerate(result.output_token_ids): + output_ids[0, idx, :len(beam)] = beam + output_tensors = [ + pb_utils.Tensor("output_ids", output_ids), + pb_utils.Tensor("sequence_length", beam_lengths), + ] + output_tensors.append( + pb_utils.Tensor( + "cum_log_probs", + np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) + if result.cum_log_probs is not None else np.zeros( + (1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "output_log_probs", + np.expand_dims(np.array(result.log_probs, np.float32), 0) if + result.log_probs is not None else np.zeros((1, 1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "context_logits", + np.expand_dims(np.array(result.context_logits, np.float32), 0) + if result.context_logits is not None else np.zeros( + (1, 1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "generation_logits", + np.expand_dims(np.array(result.generation_logits, np.float32), 0) + if result.generation_logits is not None else np.zeros( + (1, 1, 1, 1), np.float32))) + return pb_utils.InferenceResponse(output_tensors), result.is_final + + +def convert_scheduler_policy(batch_scheduler_policy: str): + if batch_scheduler_policy.lower() == "max_utilization": + return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION + elif batch_scheduler_policy.lower() == "guaranteed_no_evict": + return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT + raise pb_utils.TritonModelException( + f"batch_scheduler_policy value of '{batch_scheduler_policy}' 
is not supported." + ) + + +def convert_batching_type(gpt_model_type: str): + if gpt_model_type is None: + return None + if gpt_model_type.lower( + ) == "inflight_fused_batching" or gpt_model_type.lower( + ) == "inflight_batching": + return trtllm.BatchingType.INFLIGHT + elif gpt_model_type.lower() == "v1": + return trtllm.BatchingType.STATIC + raise pb_utils.TritonModelException( + f"gpt_model_type value of '{gpt_model_type}' is not supported.") + + +def convert_decoding_mode(decoding_mode: str): + if decoding_mode is None: + return None + elif decoding_mode == "none": + return trtllm.DecodingMode.NONE + elif decoding_mode == "top_k": + return trtllm.DecodingMode.TOP_K + elif decoding_mode == "top_p": + return trtllm.DecodingMode.TOP_P + elif decoding_mode == "top_k_top_p": + return trtllm.DecodingMode.TOP_K_TOP_P + elif decoding_mode == "beam_search": + return trtllm.DecodingMode.BEAM_SEARCH + elif decoding_mode == "medusa": + return trtllm.DecodingMode.MEDUSA + raise pb_utils.TritonModelException( + f"decoding_mode value of '{decoding_mode}' is not supported.") + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def get_scheduler_config(self, model_config): + batch_scheduler_policy = get_parameter(model_config, + "batch_scheduler_policy") + if batch_scheduler_policy is None: + return trtllm.SchedulerConfig() + return trtllm.SchedulerConfig( + convert_scheduler_policy(batch_scheduler_policy)) + + def get_kv_cache_config(self, model_config): + kwargs = { + "enable_block_reuse": + get_parameter(model_config, "enable_kv_cache_reuse", bool), + "max_tokens": + get_parameter(model_config, "max_tokens_in_paged_kv_cache", int), + "sink_token_length": + get_parameter(model_config, "sink_token_length", int), + "max_attention_window": + get_parameter(model_config, "max_attention_window_size", int), + "free_gpu_memory_fraction": + get_parameter(model_config, "kv_cache_free_gpu_mem_fraction", + float), + "host_cache_size": + get_parameter(model_config, "kv_cache_host_memory_bytes", int), + "onboard_blocks": + get_parameter(model_config, "kv_cache_onboard_blocks", bool), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.KvCacheConfig(**kwargs) + + def get_parallel_config(self, model_config): + kwargs = {} + gpu_device_ids = get_parameter(model_config, "gpu_device_ids") + if gpu_device_ids: + kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] + self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", + "0") == "1" + if self.use_orchestrator_mode: + kwargs[ + "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR + worker_path = get_parameter(model_config, "worker_path") + if worker_path is not None: + raise pb_utils.TritonModelException( + "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecuutorWorker executable." 
+ ) + executor_worker_path = get_parameter(model_config, + "executor_worker_path") + kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( + True, executor_worker_path) + if len(kwargs) > 0: + return trtllm.ParallelConfig(**kwargs) + return None + + def get_peft_cache_config(self, model_config): + kwargs = { + "optimal_adapter_size": + get_parameter(model_config, "lora_cache_optimal_adapter_size", + int), + "max_adapter_size": + get_parameter(model_config, "lora_cache_max_adapter_size", int), + "device_cache_percent": + get_parameter(model_config, "lora_cache_gpu_memory_fraction", + float), + "host_cache_size": + get_parameter(model_config, "lora_cache_host_memory_bytes", int), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.PeftCacheConfig(**kwargs) + + def get_executor_config(self, model_config): + kwargs = { + "max_beam_width": + get_parameter(model_config, "max_beam_width", int), + "scheduler_config": + self.get_scheduler_config(model_config), + "kv_cache_config": + self.get_kv_cache_config(model_config), + "enable_chunked_context": + get_parameter(model_config, "enable_chunked_context", bool), + "normalize_log_probs": + get_parameter(model_config, "normalize_log_probs", bool), + "batching_type": + convert_batching_type(get_parameter(model_config, + "gpt_model_type")), + "parallel_config": + self.get_parallel_config(model_config), + "peft_cache_config": + self.get_peft_cache_config(model_config), + "medusa_choices": + parse_medusa_choices(get_parameter(model_config, + "medusa_choices")), + "decoding_mode": + convert_decoding_mode(get_parameter(model_config, + "decoding_mode")), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.ExecutorConfig(**kwargs) + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + model_config = json.loads(args['model_config']) + gpt_model_path = get_parameter(model_config, "gpt_model_path") + if get_parameter(model_config, "enable_trt_overlap", bool): + raise pb_utils.TritonModelException( + f"enable_trt_overlap=true is not supported.") + self.exclude_input_from_output = get_parameter( + model_config, "exclude_input_in_output", bool) + executor_config = self.get_executor_config(model_config) + self.executor = trtllm.Executor(gpt_model_path, + trtllm.ModelType.DECODER_ONLY, + executor_config) + self.decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config) + self.cancellation_check_period_ms = get_parameter( + model_config, "cancellation_check_period_ms", int) or 100 + + if not self.decoupled: + raise pb_utils.TritonModelException( + "Please enable decoupled transaction policy in the model configuration to serve this model" + ) + + self.triton_id_to_req_id = {} + self.req_id_to_response_sender = {} + self.lock = Lock() + self.running = False + self.awaiter_thread = Thread(target=self.awaiter_loop) + self.cancellation_thread = Thread(target=self.cancellation_loop) + if self.executor.can_enqueue_requests(): + self.running = True + self.awaiter_thread.start() + self.cancellation_thread.start() + else: + # In leader mode, worker ranks will wait here until leader is done. + self.executor.shutdown() + + def handle_stop_request(self, triton_id, response_sender): + if triton_id is None or triton_id == "": + response_sender.send( + pb_utils.InferenceResponse(error=pb_utils.TritonError( + "A request id must be provided for request cancellation")), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + return + + if triton_id in self.triton_id_to_req_id: + req_id = self.triton_id_to_req_id[triton_id] + self.executor.cancel_request(req_id) + + response_sender.send( + pb_utils.InferenceResponse(), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + if not self.executor.can_enqueue_requests(): + return + + # Convert to executor requests. 
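+        # Each incoming request is either a cancellation ("stop") answered immediately,
+        # or converted to a trtllm.Request and enqueued in one batch under the lock below.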
+ triton_requests = [] + executor_requests = [] + for request in requests: + response_sender = request.get_response_sender() + if get_input_scalar_by_name(request, 'stop'): + self.handle_stop_request(request.request_id(), response_sender) + else: + try: + converted = convert_request(request, + self.exclude_input_from_output, + self.decoupled) + except Exception as e: + response_sender.send( + pb_utils.InferenceResponse(error=pb_utils.TritonError( + f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" + )), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + else: + triton_requests.append(request) + executor_requests.append(converted) + + with self.lock: + request_ids = self.executor.enqueue_requests(executor_requests) + for req_id, request in zip(request_ids, triton_requests): + triton_id = request.request_id() + self.req_id_to_response_sender[ + req_id] = triton_id, request.get_response_sender() + self.triton_id_to_req_id[triton_id] = req_id + return None + + def awaiter_loop(self): + """Gets responses from executor and returns the results.""" + while self.running: + for response in self.executor.await_responses( + timeout=datetime.timedelta(milliseconds=1)): + req_id = response.request_id + with self.lock: + if req_id not in self.req_id_to_response_sender: + continue + triton_id, response_sender = self.req_id_to_response_sender[ + req_id] + + triton_response, is_final = convert_response(response) + response_sender.send( + triton_response, + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + if is_final else 0) + + if is_final: + with self.lock: + del self.triton_id_to_req_id[triton_id] + del self.req_id_to_response_sender[req_id] + # Remove local reference so response_sender can be cleaned properly. + del response_sender + # TODO: Read stats: https://jirasw.nvidia.com/browse/TRTLLM-563 + + def cancellation_loop(self): + """Checks if any pending requests have been cancelled.""" + while self.running: + time.sleep(self.cancellation_check_period_ms / 1000.0) + with self.lock: + for req_id, (triton_id, response_sender + ) in self.req_id_to_response_sender.items(): + if response_sender.is_cancelled(): + self.executor.cancel_request(req_id) + # Remove local reference so response_sender can be cleaned properly. + del response_sender + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + if self.executor.can_enqueue_requests(): + self.running = False + self.awaiter_thread.join() + self.cancellation_thread.join() + self.executor.shutdown() diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt index 71d2b98..64c5d0e 100644 --- a/triton_templates/tensorrt_llm/config.pbtxt +++ b/triton_templates/tensorrt_llm/config.pbtxt @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
name: "tensorrt_llm" -backend: "tensorrtllm" +backend: "${triton_backend}" max_batch_size: ${triton_max_batch_size} model_transaction_policy { @@ -69,6 +69,13 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "end_id" data_type: TYPE_INT32 @@ -132,6 +139,27 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "len_penalty" data_type: TYPE_FP32 @@ -139,6 +167,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "repetition_penalty" data_type: TYPE_FP32 @@ -153,6 +188,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "presence_penalty" data_type: TYPE_FP32 @@ -338,6 +380,12 @@ parameters: { string_value: "${max_attention_window_size}" } } +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" + } +} parameters: { key: "batch_scheduler_policy" value: { @@ -351,17 +399,54 @@ parameters: { } } parameters: { - key: "enable_trt_overlap" + key: "kv_cache_host_memory_bytes" value: { - string_value: "${enable_trt_overlap}" + string_value: "${kv_cache_host_memory_bytes}" } } +parameters: { + key: "kv_cache_onboard_blocks" + value: { + string_value: "${kv_cache_onboard_blocks}" + } +} +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } parameters: { key: "exclude_input_in_output" value: { string_value: "${exclude_input_in_output}" } } +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} parameters: { key: "enable_kv_cache_reuse" value: { @@ -417,9 +502,9 @@ parameters: { } } parameters: { - key: "worker_path" + key: "executor_worker_path" value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker" + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" } } parameters: { @@ -428,3 +513,9 @@ parameters: { string_value: "${medusa_choices}" } } +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} From 0626e445642bb215b90eb9febd40fe9f65fda074 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 4 Jul 2024 15:00:32 +0200 Subject: [PATCH 04/35] don't tokenize in postprocessing (see #27) --- triton_templates/ensemble/config.pbtxt | 6 +-- triton_templates/postprocessing/1/model.py | 52 ++++++++++---------- triton_templates/postprocessing/config.pbtxt | 2 +- 3 files changed, 
30 insertions(+), 30 deletions(-) diff --git a/triton_templates/ensemble/config.pbtxt b/triton_templates/ensemble/config.pbtxt index bb521d3..0e2627b 100644 --- a/triton_templates/ensemble/config.pbtxt +++ b/triton_templates/ensemble/config.pbtxt @@ -173,8 +173,8 @@ input [ ] output [ { - name: "text_output" - data_type: TYPE_STRING + name: "output_ids" + data_type: TYPE_INT32 dims: [ -1 ] }, { @@ -421,7 +421,7 @@ ensemble_scheduling { } output_map { key: "OUTPUT" - value: "text_output" + value: "output_ids" } output_map { key: "OUT_OUTPUT_LOG_PROBS" diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py index 02aafad..ac42a0d 100644 --- a/triton_templates/postprocessing/1/model.py +++ b/triton_templates/postprocessing/1/model.py @@ -28,7 +28,7 @@ import numpy as np import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer +# from transformers import AutoTokenizer class TritonPythonModel: @@ -53,19 +53,19 @@ def initialize(self, args): """ # Parse model configs model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - self.skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens', - {'string_value': "true"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - self.tokenizer.pad_token = self.tokenizer.eos_token + # tokenizer_dir = model_config['parameters']['tokenizer_dir'][ + # 'string_value'] + # self.skip_special_tokens = model_config['parameters'].get( + # 'skip_special_tokens', + # {'string_value': "true"})['string_value'].lower() in [ + # 'true', '1', 't', 'y', 'yes' + # ] + + # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + # legacy=False, + # padding_side='left', + # trust_remote_code=True) + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( @@ -129,13 +129,13 @@ def execute(self, requests): # tokens_batch = tokens_batch.T # Postprocessing output data. - outputs = self._postprocessing(tokens_batch, sequence_lengths) + # outputs = self._postprocessing(tokens_batch, sequence_lengths) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. 
output_tensor = pb_utils.Tensor( 'OUTPUT', - np.array(outputs).astype(self.output_dtype)) + tokens_batch) outputs = [] outputs.append(output_tensor) @@ -201,13 +201,13 @@ def finalize(self): """ print('Cleaning up...') - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], - skip_special_tokens=self.skip_special_tokens) - outputs.append(output.encode('utf8')) - return outputs + # def _postprocessing(self, tokens_batch, sequence_lengths): + # outputs = [] + # for batch_idx, beam_tokens in enumerate(tokens_batch): + # for beam_idx, tokens in enumerate(beam_tokens): + # seq_len = sequence_lengths[batch_idx][beam_idx] + # output = self.tokenizer.decode( + # tokens[:seq_len], + # skip_special_tokens=self.skip_special_tokens) + # outputs.append(output.encode('utf8')) + # return outputs diff --git a/triton_templates/postprocessing/config.pbtxt b/triton_templates/postprocessing/config.pbtxt index 60d0290..67b8b8a 100644 --- a/triton_templates/postprocessing/config.pbtxt +++ b/triton_templates/postprocessing/config.pbtxt @@ -66,7 +66,7 @@ input [ output [ { name: "OUTPUT" - data_type: TYPE_STRING + data_type: TYPE_INT32 dims: [ -1 ] }, { From 4cd55334526db9d731bd55ab6eb1fe83f3379943 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 4 Jul 2024 15:11:01 +0200 Subject: [PATCH 05/35] instantiate triton_model_repo with the default config --- triton_model_repo/ensemble/config.pbtxt | 1 - triton_model_repo/postprocessing/1/model.py | 56 +- triton_model_repo/postprocessing/config.pbtxt | 1 - triton_model_repo/preprocessing/1/model.py | 14 - triton_model_repo/preprocessing/config.pbtxt | 1 - triton_model_repo/tensorrt_llm/1/.gitkeep | 0 triton_model_repo/tensorrt_llm/1/model.py | 581 ++++++++++++++++++ triton_model_repo/tensorrt_llm/config.pbtxt | 108 +++- .../tensorrt_llm_bls/config.pbtxt | 1 - 9 files changed, 705 insertions(+), 58 deletions(-) create mode 100644 triton_model_repo/tensorrt_llm/1/.gitkeep create mode 100644 triton_model_repo/tensorrt_llm/1/model.py diff --git a/triton_model_repo/ensemble/config.pbtxt b/triton_model_repo/ensemble/config.pbtxt index 40d291d..6d54df6 100644 --- a/triton_model_repo/ensemble/config.pbtxt +++ b/triton_model_repo/ensemble/config.pbtxt @@ -442,4 +442,3 @@ ensemble_scheduling { } ] } - diff --git a/triton_model_repo/postprocessing/1/model.py b/triton_model_repo/postprocessing/1/model.py index 5d5663b..ac42a0d 100644 --- a/triton_model_repo/postprocessing/1/model.py +++ b/triton_model_repo/postprocessing/1/model.py @@ -28,7 +28,7 @@ import numpy as np import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer +# from transformers import AutoTokenizer class TritonPythonModel: @@ -53,19 +53,19 @@ def initialize(self, args): """ # Parse model configs model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - self.skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens', - {'string_value': "true"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - self.tokenizer.pad_token = self.tokenizer.eos_token + # tokenizer_dir = model_config['parameters']['tokenizer_dir'][ + # 
'string_value'] + # self.skip_special_tokens = model_config['parameters'].get( + # 'skip_special_tokens', + # {'string_value': "true"})['string_value'].lower() in [ + # 'true', '1', 't', 'y', 'yes' + # ] + + # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + # legacy=False, + # padding_side='left', + # trust_remote_code=True) + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( @@ -133,15 +133,9 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( 'OUTPUT', - tokens_batch - ) - - # output_tensor = pb_utils.Tensor( - # 'OUTPUT', - # np.array(outputs).astype(self.output_dtype)) + tokens_batch) outputs = [] outputs.append(output_tensor) @@ -207,13 +201,13 @@ def finalize(self): """ print('Cleaning up...') - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], - skip_special_tokens=self.skip_special_tokens) - outputs.append(output.encode('utf8')) - return outputs + # def _postprocessing(self, tokens_batch, sequence_lengths): + # outputs = [] + # for batch_idx, beam_tokens in enumerate(tokens_batch): + # for beam_idx, tokens in enumerate(beam_tokens): + # seq_len = sequence_lengths[batch_idx][beam_idx] + # output = self.tokenizer.decode( + # tokens[:seq_len], + # skip_special_tokens=self.skip_special_tokens) + # outputs.append(output.encode('utf8')) + # return outputs diff --git a/triton_model_repo/postprocessing/config.pbtxt b/triton_model_repo/postprocessing/config.pbtxt index 5e2e37a..df87aeb 100644 --- a/triton_model_repo/postprocessing/config.pbtxt +++ b/triton_model_repo/postprocessing/config.pbtxt @@ -111,4 +111,3 @@ instance_group [ kind: KIND_CPU } ] - diff --git a/triton_model_repo/preprocessing/1/model.py b/triton_model_repo/preprocessing/1/model.py index a109775..62ab243 100644 --- a/triton_model_repo/preprocessing/1/model.py +++ b/triton_model_repo/preprocessing/1/model.py @@ -268,11 +268,6 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): flat_ids = [] offsets = [] - arbitrary_start_sequence_token = "!" - arbitrary_start_sequence_id = self.tokenizer.encode( - "!", add_special_tokens=False - )[0] - for word_list in word_lists: item_flat_ids = [] item_offsets = [] @@ -281,16 +276,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if isinstance(word, bytes): word = word.decode() - word = arbitrary_start_sequence_token + word ids = self.tokenizer.encode(word, add_special_tokens=False) - if ids[0] != arbitrary_start_sequence_id: - raise ValueError( - f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence. " - "We then strip the corresponding first token from the stop sequence IDs. " - f"However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggests there is a problem with the tokenizer that you are using." 
- ) - else: - ids = ids[1:] if len(ids) == 0: continue diff --git a/triton_model_repo/preprocessing/config.pbtxt b/triton_model_repo/preprocessing/config.pbtxt index b0fa8f2..e76fec5 100644 --- a/triton_model_repo/preprocessing/config.pbtxt +++ b/triton_model_repo/preprocessing/config.pbtxt @@ -138,4 +138,3 @@ instance_group [ kind: KIND_CPU } ] - diff --git a/triton_model_repo/tensorrt_llm/1/.gitkeep b/triton_model_repo/tensorrt_llm/1/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/triton_model_repo/tensorrt_llm/1/model.py b/triton_model_repo/tensorrt_llm/1/model.py new file mode 100644 index 0000000..5b3eda3 --- /dev/null +++ b/triton_model_repo/tensorrt_llm/1/model.py @@ -0,0 +1,581 @@ +import datetime +import json +import os +import time +from threading import Lock, Thread + +import numpy as np +import triton_python_backend_utils as pb_utils +from torch import from_numpy + +import tensorrt_llm.bindings.executor as trtllm + + +def get_input_tensor_by_name(request, name): + tensor = pb_utils.get_input_tensor_by_name(request, name) + if tensor is None: + return None + return tensor.as_numpy() + + +def get_input_scalar_by_name(request, name): + tensor = get_input_tensor_by_name(request, name) + if tensor is None: + return None + if tensor.size != 1: + raise pb_utils.TritonModelException( + f"Expected a single value for {name}") + return tensor.item() + + +def read_parameter_as_type(value, name, pytype=str): + if value == "": + return None + if value.startswith("${") and value.endswith("}"): + return None + if pytype is bool: + return value.lower() in ["1", "true"] + try: + result = pytype(value) + return result + except: + pb_utils.Logger.log_warning( + f"Could not read parameter '{name}' with value '{value}', will use default." + ) + return None + + +def get_parameter(model_config, name, pytype=str): + if name not in model_config['parameters']: + return None + return read_parameter_as_type( + model_config['parameters'][name]['string_value'], name, pytype) + + +def convert_word_list(word_list): + if word_list is None: + return None + word_list = word_list.tolist() + if len(word_list) == 0 or len(word_list[0]) != 2: + raise pb_utils.TritonModelException(f"Invalid format for word list.") + words, indices = word_list[0] + result = [] + current_index = 0 + for i in indices: + if i == -1: + continue + if i > len(words): + raise pb_utils.TritonModelException( + f"Invalid format for word list.") + current_word = [] + while current_index < i: + current_word.append(words[current_index]) + current_index += 1 + result.append(current_word) + return result + + +def parse_medusa_choices(medusa_choices): + if medusa_choices is None: + return None + try: + result = json.loads( + "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]") + assert isinstance(result, list) and len(result) > 0 + assert all([isinstance(x, list) for x in result]) + assert all([isinstance(y, int) for x in result for y in x]) + except Exception: + raise pb_utils.TritonModelException( + "Invalid format for medusa_choices") + return result + + +def get_sampling_config_from_request(request): + kwargs = {} + kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1 + kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k') + kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p') + kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[ + 'top_p'] <= 0 else kwargs['top_p'] + kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed') + 
kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature') + kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length') + kwargs['repetition_penalty'] = get_input_scalar_by_name( + request, 'repetition_penalty') + kwargs['presence_penalty'] = get_input_scalar_by_name( + request, 'presence_penalty') + kwargs['frequency_penalty'] = get_input_scalar_by_name( + request, 'frequency_penalty') + kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty') + kwargs['top_p_min'] = get_input_scalar_by_name(request, + 'runtime_top_p_min') + kwargs['top_p_reset_ids'] = get_input_scalar_by_name( + request, 'runtime_top_p_reset_ids') + kwargs['top_p_decay'] = get_input_scalar_by_name(request, + 'runtime_top_p_decay') + kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name( + request, 'beam_search_diversity_rate') + kwargs['early_stopping'] = get_input_scalar_by_name( + request, 'early_stopping') + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.SamplingConfig(**kwargs) + + +def get_output_config_from_request(request, exclude_input_from_output): + kwargs = {} + kwargs["return_log_probs"] = get_input_scalar_by_name( + request, 'return_log_probs') + kwargs["return_context_logits"] = get_input_scalar_by_name( + request, 'return_context_logits') + kwargs["return_generation_logits"] = get_input_scalar_by_name( + request, 'return_generation_logits') + kwargs["exclude_input_from_output"] = exclude_input_from_output + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.OutputConfig(**kwargs) + + +def get_speculative_decoding_config_from_request(request): + kwargs = {} + draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids') + if draft_input_ids is not None: + kwargs['tokens'] = draft_input_ids.tolist() + draft_logits = get_input_tensor_by_name(request, 'draft_logits') + if draft_logits is not None: + kwargs['logits'] = from_numpy(draft_logits) + kwargs['acceptance_threshold'] = get_input_scalar_by_name( + request, 'draft_acceptance_threshold') + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.SpeculativeDecodingConfig(**kwargs) + return None + + +def get_prompt_tuning_config_from_request(request): + # prompt_vocab_size is unused by executor. 
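+    # A PromptTuningConfig is only built when the request actually carries a
+    # prompt_embedding_table tensor; otherwise this returns None and the
+    # field is simply omitted from the executor request.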
+ kwargs = {} + prompt_embedding_table = get_input_tensor_by_name( + request, 'prompt_embedding_table') + if prompt_embedding_table is not None: + kwargs["embedding_table"] = from_numpy(prompt_embedding_table) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.PromptTuningConfig(**kwargs) + return None + + +def get_lora_config_from_request(request): + kwargs = {} + kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id') + lora_weights = get_input_tensor_by_name(request, 'lora_weights') + if lora_weights is not None: + kwargs["weights"] = from_numpy(lora_weights) + lora_config = get_input_tensor_by_name(request, 'lora_config') + if lora_config is not None: + kwargs["config"] = from_numpy(lora_config) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.LoraConfig(**kwargs) + return None + + +def convert_request(request, exclude_input_from_output, decoupled): + inputs = {} + input_token_ids = get_input_tensor_by_name(request, 'input_ids') + if input_token_ids is None: + raise pb_utils.TritonModelException( + "A value is required for input_ids") + input_token_ids = input_token_ids.tolist() + if len(input_token_ids) == 0: + raise pb_utils.TritonModelException(f"Invalid format for input_ids") + inputs['input_token_ids'] = input_token_ids[0] + # input_lengths is not not used by executor. + inputs['max_new_tokens'] = get_input_scalar_by_name( + request, 'request_output_len') + if inputs['max_new_tokens'] is None: + raise pb_utils.TritonModelException( + "A value is required for request_output_len") + inputs['streaming'] = get_input_scalar_by_name(request, 'streaming') + if inputs['streaming'] and not decoupled: + raise pb_utils.TritonModelException( + "Streaming is only supported in decoupled mode.") + inputs['end_id'] = get_input_scalar_by_name(request, 'end_id') + inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id') + inputs['stop_words'] = convert_word_list( + get_input_tensor_by_name(request, 'stop_words_list')) + inputs['bad_words'] = convert_word_list( + get_input_tensor_by_name(request, 'bad_words_list')) + embedding_bias = get_input_tensor_by_name(request, 'embedding_bias') + if embedding_bias is not None and embedding_bias.size != 0: + inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze() + + sampling_config = get_sampling_config_from_request(request) + output_config = get_output_config_from_request(request, + exclude_input_from_output) + speculative_decoding_config = get_speculative_decoding_config_from_request( + request) + prompt_tuning_config = get_prompt_tuning_config_from_request(request) + lora_config = get_lora_config_from_request(request) + + return trtllm.Request( + **inputs, + sampling_config=sampling_config, + output_config=output_config, + speculative_decoding_config=speculative_decoding_config, + prompt_tuning_config=prompt_tuning_config, + lora_config=lora_config, + ) + + +def convert_response(response): + if response.has_error(): + return pb_utils.InferenceResponse(output_tensors=[], + error=pb_utils.TritonError( + response.error_msg)), True + result = response.result + beam_lengths = np.expand_dims( + np.array([len(beam) for beam in result.output_token_ids], np.int32), 0) + max_beam_length = max([len(beam) for beam in result.output_token_ids]) + output_ids = np.full((1, len(result.output_token_ids), max_beam_length), + -1, np.int32) + for idx, beam in enumerate(result.output_token_ids): + output_ids[0, idx, :len(beam)] = beam + output_tensors 
= [ + pb_utils.Tensor("output_ids", output_ids), + pb_utils.Tensor("sequence_length", beam_lengths), + ] + output_tensors.append( + pb_utils.Tensor( + "cum_log_probs", + np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) + if result.cum_log_probs is not None else np.zeros( + (1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "output_log_probs", + np.expand_dims(np.array(result.log_probs, np.float32), 0) if + result.log_probs is not None else np.zeros((1, 1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "context_logits", + np.expand_dims(np.array(result.context_logits, np.float32), 0) + if result.context_logits is not None else np.zeros( + (1, 1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "generation_logits", + np.expand_dims(np.array(result.generation_logits, np.float32), 0) + if result.generation_logits is not None else np.zeros( + (1, 1, 1, 1), np.float32))) + return pb_utils.InferenceResponse(output_tensors), result.is_final + + +def convert_scheduler_policy(batch_scheduler_policy: str): + if batch_scheduler_policy.lower() == "max_utilization": + return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION + elif batch_scheduler_policy.lower() == "guaranteed_no_evict": + return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT + raise pb_utils.TritonModelException( + f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported." + ) + + +def convert_batching_type(gpt_model_type: str): + if gpt_model_type is None: + return None + if gpt_model_type.lower( + ) == "inflight_fused_batching" or gpt_model_type.lower( + ) == "inflight_batching": + return trtllm.BatchingType.INFLIGHT + elif gpt_model_type.lower() == "v1": + return trtllm.BatchingType.STATIC + raise pb_utils.TritonModelException( + f"gpt_model_type value of '{gpt_model_type}' is not supported.") + + +def convert_decoding_mode(decoding_mode: str): + if decoding_mode is None: + return None + elif decoding_mode == "none": + return trtllm.DecodingMode.NONE + elif decoding_mode == "top_k": + return trtllm.DecodingMode.TOP_K + elif decoding_mode == "top_p": + return trtllm.DecodingMode.TOP_P + elif decoding_mode == "top_k_top_p": + return trtllm.DecodingMode.TOP_K_TOP_P + elif decoding_mode == "beam_search": + return trtllm.DecodingMode.BEAM_SEARCH + elif decoding_mode == "medusa": + return trtllm.DecodingMode.MEDUSA + raise pb_utils.TritonModelException( + f"decoding_mode value of '{decoding_mode}' is not supported.") + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. 
+ """ + + def get_scheduler_config(self, model_config): + batch_scheduler_policy = get_parameter(model_config, + "batch_scheduler_policy") + if batch_scheduler_policy is None: + return trtllm.SchedulerConfig() + return trtllm.SchedulerConfig( + convert_scheduler_policy(batch_scheduler_policy)) + + def get_kv_cache_config(self, model_config): + kwargs = { + "enable_block_reuse": + get_parameter(model_config, "enable_kv_cache_reuse", bool), + "max_tokens": + get_parameter(model_config, "max_tokens_in_paged_kv_cache", int), + "sink_token_length": + get_parameter(model_config, "sink_token_length", int), + "max_attention_window": + get_parameter(model_config, "max_attention_window_size", int), + "free_gpu_memory_fraction": + get_parameter(model_config, "kv_cache_free_gpu_mem_fraction", + float), + "host_cache_size": + get_parameter(model_config, "kv_cache_host_memory_bytes", int), + "onboard_blocks": + get_parameter(model_config, "kv_cache_onboard_blocks", bool), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.KvCacheConfig(**kwargs) + + def get_parallel_config(self, model_config): + kwargs = {} + gpu_device_ids = get_parameter(model_config, "gpu_device_ids") + if gpu_device_ids: + kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] + self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", + "0") == "1" + if self.use_orchestrator_mode: + kwargs[ + "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR + worker_path = get_parameter(model_config, "worker_path") + if worker_path is not None: + raise pb_utils.TritonModelException( + "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecuutorWorker executable." 
+ ) + executor_worker_path = get_parameter(model_config, + "executor_worker_path") + kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( + True, executor_worker_path) + if len(kwargs) > 0: + return trtllm.ParallelConfig(**kwargs) + return None + + def get_peft_cache_config(self, model_config): + kwargs = { + "optimal_adapter_size": + get_parameter(model_config, "lora_cache_optimal_adapter_size", + int), + "max_adapter_size": + get_parameter(model_config, "lora_cache_max_adapter_size", int), + "device_cache_percent": + get_parameter(model_config, "lora_cache_gpu_memory_fraction", + float), + "host_cache_size": + get_parameter(model_config, "lora_cache_host_memory_bytes", int), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.PeftCacheConfig(**kwargs) + + def get_executor_config(self, model_config): + kwargs = { + "max_beam_width": + get_parameter(model_config, "max_beam_width", int), + "scheduler_config": + self.get_scheduler_config(model_config), + "kv_cache_config": + self.get_kv_cache_config(model_config), + "enable_chunked_context": + get_parameter(model_config, "enable_chunked_context", bool), + "normalize_log_probs": + get_parameter(model_config, "normalize_log_probs", bool), + "batching_type": + convert_batching_type(get_parameter(model_config, + "gpt_model_type")), + "parallel_config": + self.get_parallel_config(model_config), + "peft_cache_config": + self.get_peft_cache_config(model_config), + "medusa_choices": + parse_medusa_choices(get_parameter(model_config, + "medusa_choices")), + "decoding_mode": + convert_decoding_mode(get_parameter(model_config, + "decoding_mode")), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.ExecutorConfig(**kwargs) + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + model_config = json.loads(args['model_config']) + gpt_model_path = get_parameter(model_config, "gpt_model_path") + if get_parameter(model_config, "enable_trt_overlap", bool): + raise pb_utils.TritonModelException( + f"enable_trt_overlap=true is not supported.") + self.exclude_input_from_output = get_parameter( + model_config, "exclude_input_in_output", bool) + executor_config = self.get_executor_config(model_config) + self.executor = trtllm.Executor(gpt_model_path, + trtllm.ModelType.DECODER_ONLY, + executor_config) + self.decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config) + self.cancellation_check_period_ms = get_parameter( + model_config, "cancellation_check_period_ms", int) or 100 + + if not self.decoupled: + raise pb_utils.TritonModelException( + "Please enable decoupled transaction policy in the model configuration to serve this model" + ) + + self.triton_id_to_req_id = {} + self.req_id_to_response_sender = {} + self.lock = Lock() + self.running = False + self.awaiter_thread = Thread(target=self.awaiter_loop) + self.cancellation_thread = Thread(target=self.cancellation_loop) + if self.executor.can_enqueue_requests(): + self.running = True + self.awaiter_thread.start() + self.cancellation_thread.start() + else: + # In leader mode, worker ranks will wait here until leader is done. + self.executor.shutdown() + + def handle_stop_request(self, triton_id, response_sender): + if triton_id is None or triton_id == "": + response_sender.send( + pb_utils.InferenceResponse(error=pb_utils.TritonError( + "A request id must be provided for request cancellation")), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + return + + if triton_id in self.triton_id_to_req_id: + req_id = self.triton_id_to_req_id[triton_id] + self.executor.cancel_request(req_id) + + response_sender.send( + pb_utils.InferenceResponse(), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + if not self.executor.can_enqueue_requests(): + return + + # Convert to executor requests. 
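+        # Requests are enqueued as a single batch under self.lock so that the
+        # executor request ids and the Triton request ids stay in sync in the
+        # two lookup maps consumed by awaiter_loop and cancellation_loop.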
+ triton_requests = [] + executor_requests = [] + for request in requests: + response_sender = request.get_response_sender() + if get_input_scalar_by_name(request, 'stop'): + self.handle_stop_request(request.request_id(), response_sender) + else: + try: + converted = convert_request(request, + self.exclude_input_from_output, + self.decoupled) + except Exception as e: + response_sender.send( + pb_utils.InferenceResponse(error=pb_utils.TritonError( + f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" + )), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + else: + triton_requests.append(request) + executor_requests.append(converted) + + with self.lock: + request_ids = self.executor.enqueue_requests(executor_requests) + for req_id, request in zip(request_ids, triton_requests): + triton_id = request.request_id() + self.req_id_to_response_sender[ + req_id] = triton_id, request.get_response_sender() + self.triton_id_to_req_id[triton_id] = req_id + return None + + def awaiter_loop(self): + """Gets responses from executor and returns the results.""" + while self.running: + for response in self.executor.await_responses( + timeout=datetime.timedelta(milliseconds=1)): + req_id = response.request_id + with self.lock: + if req_id not in self.req_id_to_response_sender: + continue + triton_id, response_sender = self.req_id_to_response_sender[ + req_id] + + triton_response, is_final = convert_response(response) + response_sender.send( + triton_response, + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + if is_final else 0) + + if is_final: + with self.lock: + del self.triton_id_to_req_id[triton_id] + del self.req_id_to_response_sender[req_id] + # Remove local reference so response_sender can be cleaned properly. + del response_sender + # TODO: Read stats: https://jirasw.nvidia.com/browse/TRTLLM-563 + + def cancellation_loop(self): + """Checks if any pending requests have been cancelled.""" + while self.running: + time.sleep(self.cancellation_check_period_ms / 1000.0) + with self.lock: + for req_id, (triton_id, response_sender + ) in self.req_id_to_response_sender.items(): + if response_sender.is_cancelled(): + self.executor.cancel_request(req_id) + # Remove local reference so response_sender can be cleaned properly. + del response_sender + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + if self.executor.can_enqueue_requests(): + self.running = False + self.awaiter_thread.join() + self.cancellation_thread.join() + self.executor.shutdown() diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt index 911fdeb..2d1f071 100644 --- a/triton_model_repo/tensorrt_llm/config.pbtxt +++ b/triton_model_repo/tensorrt_llm/config.pbtxt @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
name: "tensorrt_llm" -backend: "tensorrtllm" +backend: "${triton_backend}" max_batch_size: 64 model_transaction_policy { @@ -69,6 +69,13 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "end_id" data_type: TYPE_INT32 @@ -132,6 +139,27 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "len_penalty" data_type: TYPE_FP32 @@ -139,6 +167,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "repetition_penalty" data_type: TYPE_FP32 @@ -153,6 +188,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "presence_penalty" data_type: TYPE_FP32 @@ -323,7 +365,7 @@ parameters: { parameters: { key: "gpt_model_path" value: { - string_value: "/src/triton_model_repo/tensorrt_llm/1" + string_value: "/src/triton_model_repo/tensorrt_llm/1/" } } parameters: { @@ -335,7 +377,13 @@ parameters: { parameters: { key: "max_attention_window_size" value: { - string_value: "${max_attention_window_size}" + string_value: "4096" + } +} +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" } } parameters: { @@ -347,21 +395,58 @@ parameters: { parameters: { key: "kv_cache_free_gpu_mem_fraction" value: { - string_value: "${kv_cache_free_gpu_mem_fraction}" + string_value: "0.95" + } +} +parameters: { + key: "kv_cache_host_memory_bytes" + value: { + string_value: "${kv_cache_host_memory_bytes}" } } parameters: { - key: "enable_trt_overlap" + key: "kv_cache_onboard_blocks" value: { - string_value: "${enable_trt_overlap}" + string_value: "${kv_cache_onboard_blocks}" } } +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } parameters: { key: "exclude_input_in_output" value: { string_value: "${exclude_input_in_output}" } } +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} parameters: { key: "enable_kv_cache_reuse" value: { @@ -417,9 +502,9 @@ parameters: { } } parameters: { - key: "worker_path" + key: "executor_worker_path" value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker" + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" } } parameters: { @@ -428,4 +513,9 @@ parameters: { string_value: "${medusa_choices}" } } - +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} diff --git 
a/triton_model_repo/tensorrt_llm_bls/config.pbtxt b/triton_model_repo/tensorrt_llm_bls/config.pbtxt index 17989a9..e8c80e8 100644 --- a/triton_model_repo/tensorrt_llm_bls/config.pbtxt +++ b/triton_model_repo/tensorrt_llm_bls/config.pbtxt @@ -245,4 +245,3 @@ instance_group [ kind : KIND_CPU } ] - From 25cb314411e0c1ab8c0a4f12481735dd4826671e Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Fri, 5 Jul 2024 18:23:59 +0200 Subject: [PATCH 06/35] Adjust decoding_mode after testing, omit missing optional params Remove the tensorrt_llm python script, since it confuses `maybe_download_tarball_with_pget` --- predict.py | 25 +- triton_model_repo/tensorrt_llm/1/model.py | 581 -------------------- triton_model_repo/tensorrt_llm/config.pbtxt | 2 +- triton_templates/tensorrt_llm/1/model.py | 581 -------------------- triton_templates/tensorrt_llm/config.pbtxt | 2 +- 5 files changed, 21 insertions(+), 1170 deletions(-) delete mode 100644 triton_model_repo/tensorrt_llm/1/model.py delete mode 100644 triton_templates/tensorrt_llm/1/model.py diff --git a/predict.py b/predict.py index 9273485..c52310b 100644 --- a/predict.py +++ b/predict.py @@ -446,10 +446,19 @@ def _process_args( pad_id = self.pad_id end_id = self.end_id - if top_k < 0: - top_k = 0 - if min_tokens < 0: - min_tokens = 0 + decoding_mode = "top_k_top_p" + + if top_k <= 0: + top_k = None + decoding_mode = "top_p" + + if top_p == 0.0: + if decoding_mode == "top_p": + raise UserError( + "E1105 InvalidArgumentTopKTopP: Can't set both top_k and top_p to 0" + ) + decoding_mode = "top_k" + top_p = None if not seed: seed = int(np.random.randint(0, 100000)) @@ -459,7 +468,10 @@ def _process_args( max_tokens = min(max_tokens, token_budget) min_tokens = min(min_tokens, token_budget) - args = { + if min_tokens <= 0: + min_tokens = None + + args = {k: v for k, v in { "text_input": prompt, "max_tokens": max_tokens, "min_length": min_tokens, @@ -473,7 +485,8 @@ def _process_args( "random_seed": seed, "pad_id": pad_id, "end_id": end_id, - } + "decoding_mode": decoding_mode, + }.items() if v is not None} return args diff --git a/triton_model_repo/tensorrt_llm/1/model.py b/triton_model_repo/tensorrt_llm/1/model.py deleted file mode 100644 index 5b3eda3..0000000 --- a/triton_model_repo/tensorrt_llm/1/model.py +++ /dev/null @@ -1,581 +0,0 @@ -import datetime -import json -import os -import time -from threading import Lock, Thread - -import numpy as np -import triton_python_backend_utils as pb_utils -from torch import from_numpy - -import tensorrt_llm.bindings.executor as trtllm - - -def get_input_tensor_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is None: - return None - return tensor.as_numpy() - - -def get_input_scalar_by_name(request, name): - tensor = get_input_tensor_by_name(request, name) - if tensor is None: - return None - if tensor.size != 1: - raise pb_utils.TritonModelException( - f"Expected a single value for {name}") - return tensor.item() - - -def read_parameter_as_type(value, name, pytype=str): - if value == "": - return None - if value.startswith("${") and value.endswith("}"): - return None - if pytype is bool: - return value.lower() in ["1", "true"] - try: - result = pytype(value) - return result - except: - pb_utils.Logger.log_warning( - f"Could not read parameter '{name}' with value '{value}', will use default." 
- ) - return None - - -def get_parameter(model_config, name, pytype=str): - if name not in model_config['parameters']: - return None - return read_parameter_as_type( - model_config['parameters'][name]['string_value'], name, pytype) - - -def convert_word_list(word_list): - if word_list is None: - return None - word_list = word_list.tolist() - if len(word_list) == 0 or len(word_list[0]) != 2: - raise pb_utils.TritonModelException(f"Invalid format for word list.") - words, indices = word_list[0] - result = [] - current_index = 0 - for i in indices: - if i == -1: - continue - if i > len(words): - raise pb_utils.TritonModelException( - f"Invalid format for word list.") - current_word = [] - while current_index < i: - current_word.append(words[current_index]) - current_index += 1 - result.append(current_word) - return result - - -def parse_medusa_choices(medusa_choices): - if medusa_choices is None: - return None - try: - result = json.loads( - "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]") - assert isinstance(result, list) and len(result) > 0 - assert all([isinstance(x, list) for x in result]) - assert all([isinstance(y, int) for x in result for y in x]) - except Exception: - raise pb_utils.TritonModelException( - "Invalid format for medusa_choices") - return result - - -def get_sampling_config_from_request(request): - kwargs = {} - kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1 - kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k') - kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p') - kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[ - 'top_p'] <= 0 else kwargs['top_p'] - kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed') - kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature') - kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length') - kwargs['repetition_penalty'] = get_input_scalar_by_name( - request, 'repetition_penalty') - kwargs['presence_penalty'] = get_input_scalar_by_name( - request, 'presence_penalty') - kwargs['frequency_penalty'] = get_input_scalar_by_name( - request, 'frequency_penalty') - kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty') - kwargs['top_p_min'] = get_input_scalar_by_name(request, - 'runtime_top_p_min') - kwargs['top_p_reset_ids'] = get_input_scalar_by_name( - request, 'runtime_top_p_reset_ids') - kwargs['top_p_decay'] = get_input_scalar_by_name(request, - 'runtime_top_p_decay') - kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name( - request, 'beam_search_diversity_rate') - kwargs['early_stopping'] = get_input_scalar_by_name( - request, 'early_stopping') - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.SamplingConfig(**kwargs) - - -def get_output_config_from_request(request, exclude_input_from_output): - kwargs = {} - kwargs["return_log_probs"] = get_input_scalar_by_name( - request, 'return_log_probs') - kwargs["return_context_logits"] = get_input_scalar_by_name( - request, 'return_context_logits') - kwargs["return_generation_logits"] = get_input_scalar_by_name( - request, 'return_generation_logits') - kwargs["exclude_input_from_output"] = exclude_input_from_output - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.OutputConfig(**kwargs) - - -def get_speculative_decoding_config_from_request(request): - kwargs = {} - draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids') - if draft_input_ids is not None: - 
kwargs['tokens'] = draft_input_ids.tolist() - draft_logits = get_input_tensor_by_name(request, 'draft_logits') - if draft_logits is not None: - kwargs['logits'] = from_numpy(draft_logits) - kwargs['acceptance_threshold'] = get_input_scalar_by_name( - request, 'draft_acceptance_threshold') - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.SpeculativeDecodingConfig(**kwargs) - return None - - -def get_prompt_tuning_config_from_request(request): - # prompt_vocab_size is unused by executor. - kwargs = {} - prompt_embedding_table = get_input_tensor_by_name( - request, 'prompt_embedding_table') - if prompt_embedding_table is not None: - kwargs["embedding_table"] = from_numpy(prompt_embedding_table) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.PromptTuningConfig(**kwargs) - return None - - -def get_lora_config_from_request(request): - kwargs = {} - kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id') - lora_weights = get_input_tensor_by_name(request, 'lora_weights') - if lora_weights is not None: - kwargs["weights"] = from_numpy(lora_weights) - lora_config = get_input_tensor_by_name(request, 'lora_config') - if lora_config is not None: - kwargs["config"] = from_numpy(lora_config) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.LoraConfig(**kwargs) - return None - - -def convert_request(request, exclude_input_from_output, decoupled): - inputs = {} - input_token_ids = get_input_tensor_by_name(request, 'input_ids') - if input_token_ids is None: - raise pb_utils.TritonModelException( - "A value is required for input_ids") - input_token_ids = input_token_ids.tolist() - if len(input_token_ids) == 0: - raise pb_utils.TritonModelException(f"Invalid format for input_ids") - inputs['input_token_ids'] = input_token_ids[0] - # input_lengths is not not used by executor. 
- inputs['max_new_tokens'] = get_input_scalar_by_name( - request, 'request_output_len') - if inputs['max_new_tokens'] is None: - raise pb_utils.TritonModelException( - "A value is required for request_output_len") - inputs['streaming'] = get_input_scalar_by_name(request, 'streaming') - if inputs['streaming'] and not decoupled: - raise pb_utils.TritonModelException( - "Streaming is only supported in decoupled mode.") - inputs['end_id'] = get_input_scalar_by_name(request, 'end_id') - inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id') - inputs['stop_words'] = convert_word_list( - get_input_tensor_by_name(request, 'stop_words_list')) - inputs['bad_words'] = convert_word_list( - get_input_tensor_by_name(request, 'bad_words_list')) - embedding_bias = get_input_tensor_by_name(request, 'embedding_bias') - if embedding_bias is not None and embedding_bias.size != 0: - inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze() - - sampling_config = get_sampling_config_from_request(request) - output_config = get_output_config_from_request(request, - exclude_input_from_output) - speculative_decoding_config = get_speculative_decoding_config_from_request( - request) - prompt_tuning_config = get_prompt_tuning_config_from_request(request) - lora_config = get_lora_config_from_request(request) - - return trtllm.Request( - **inputs, - sampling_config=sampling_config, - output_config=output_config, - speculative_decoding_config=speculative_decoding_config, - prompt_tuning_config=prompt_tuning_config, - lora_config=lora_config, - ) - - -def convert_response(response): - if response.has_error(): - return pb_utils.InferenceResponse(output_tensors=[], - error=pb_utils.TritonError( - response.error_msg)), True - result = response.result - beam_lengths = np.expand_dims( - np.array([len(beam) for beam in result.output_token_ids], np.int32), 0) - max_beam_length = max([len(beam) for beam in result.output_token_ids]) - output_ids = np.full((1, len(result.output_token_ids), max_beam_length), - -1, np.int32) - for idx, beam in enumerate(result.output_token_ids): - output_ids[0, idx, :len(beam)] = beam - output_tensors = [ - pb_utils.Tensor("output_ids", output_ids), - pb_utils.Tensor("sequence_length", beam_lengths), - ] - output_tensors.append( - pb_utils.Tensor( - "cum_log_probs", - np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) - if result.cum_log_probs is not None else np.zeros( - (1, 1), np.float32))) - output_tensors.append( - pb_utils.Tensor( - "output_log_probs", - np.expand_dims(np.array(result.log_probs, np.float32), 0) if - result.log_probs is not None else np.zeros((1, 1, 1), np.float32))) - output_tensors.append( - pb_utils.Tensor( - "context_logits", - np.expand_dims(np.array(result.context_logits, np.float32), 0) - if result.context_logits is not None else np.zeros( - (1, 1, 1), np.float32))) - output_tensors.append( - pb_utils.Tensor( - "generation_logits", - np.expand_dims(np.array(result.generation_logits, np.float32), 0) - if result.generation_logits is not None else np.zeros( - (1, 1, 1, 1), np.float32))) - return pb_utils.InferenceResponse(output_tensors), result.is_final - - -def convert_scheduler_policy(batch_scheduler_policy: str): - if batch_scheduler_policy.lower() == "max_utilization": - return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION - elif batch_scheduler_policy.lower() == "guaranteed_no_evict": - return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT - raise pb_utils.TritonModelException( - f"batch_scheduler_policy value of '{batch_scheduler_policy}' 
is not supported." - ) - - -def convert_batching_type(gpt_model_type: str): - if gpt_model_type is None: - return None - if gpt_model_type.lower( - ) == "inflight_fused_batching" or gpt_model_type.lower( - ) == "inflight_batching": - return trtllm.BatchingType.INFLIGHT - elif gpt_model_type.lower() == "v1": - return trtllm.BatchingType.STATIC - raise pb_utils.TritonModelException( - f"gpt_model_type value of '{gpt_model_type}' is not supported.") - - -def convert_decoding_mode(decoding_mode: str): - if decoding_mode is None: - return None - elif decoding_mode == "none": - return trtllm.DecodingMode.NONE - elif decoding_mode == "top_k": - return trtllm.DecodingMode.TOP_K - elif decoding_mode == "top_p": - return trtllm.DecodingMode.TOP_P - elif decoding_mode == "top_k_top_p": - return trtllm.DecodingMode.TOP_K_TOP_P - elif decoding_mode == "beam_search": - return trtllm.DecodingMode.BEAM_SEARCH - elif decoding_mode == "medusa": - return trtllm.DecodingMode.MEDUSA - raise pb_utils.TritonModelException( - f"decoding_mode value of '{decoding_mode}' is not supported.") - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def get_scheduler_config(self, model_config): - batch_scheduler_policy = get_parameter(model_config, - "batch_scheduler_policy") - if batch_scheduler_policy is None: - return trtllm.SchedulerConfig() - return trtllm.SchedulerConfig( - convert_scheduler_policy(batch_scheduler_policy)) - - def get_kv_cache_config(self, model_config): - kwargs = { - "enable_block_reuse": - get_parameter(model_config, "enable_kv_cache_reuse", bool), - "max_tokens": - get_parameter(model_config, "max_tokens_in_paged_kv_cache", int), - "sink_token_length": - get_parameter(model_config, "sink_token_length", int), - "max_attention_window": - get_parameter(model_config, "max_attention_window_size", int), - "free_gpu_memory_fraction": - get_parameter(model_config, "kv_cache_free_gpu_mem_fraction", - float), - "host_cache_size": - get_parameter(model_config, "kv_cache_host_memory_bytes", int), - "onboard_blocks": - get_parameter(model_config, "kv_cache_onboard_blocks", bool), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.KvCacheConfig(**kwargs) - - def get_parallel_config(self, model_config): - kwargs = {} - gpu_device_ids = get_parameter(model_config, "gpu_device_ids") - if gpu_device_ids: - kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] - self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", - "0") == "1" - if self.use_orchestrator_mode: - kwargs[ - "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR - worker_path = get_parameter(model_config, "worker_path") - if worker_path is not None: - raise pb_utils.TritonModelException( - "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecuutorWorker executable." 
- ) - executor_worker_path = get_parameter(model_config, - "executor_worker_path") - kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( - True, executor_worker_path) - if len(kwargs) > 0: - return trtllm.ParallelConfig(**kwargs) - return None - - def get_peft_cache_config(self, model_config): - kwargs = { - "optimal_adapter_size": - get_parameter(model_config, "lora_cache_optimal_adapter_size", - int), - "max_adapter_size": - get_parameter(model_config, "lora_cache_max_adapter_size", int), - "device_cache_percent": - get_parameter(model_config, "lora_cache_gpu_memory_fraction", - float), - "host_cache_size": - get_parameter(model_config, "lora_cache_host_memory_bytes", int), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.PeftCacheConfig(**kwargs) - - def get_executor_config(self, model_config): - kwargs = { - "max_beam_width": - get_parameter(model_config, "max_beam_width", int), - "scheduler_config": - self.get_scheduler_config(model_config), - "kv_cache_config": - self.get_kv_cache_config(model_config), - "enable_chunked_context": - get_parameter(model_config, "enable_chunked_context", bool), - "normalize_log_probs": - get_parameter(model_config, "normalize_log_probs", bool), - "batching_type": - convert_batching_type(get_parameter(model_config, - "gpt_model_type")), - "parallel_config": - self.get_parallel_config(model_config), - "peft_cache_config": - self.get_peft_cache_config(model_config), - "medusa_choices": - parse_medusa_choices(get_parameter(model_config, - "medusa_choices")), - "decoding_mode": - convert_decoding_mode(get_parameter(model_config, - "decoding_mode")), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.ExecutorConfig(**kwargs) - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args['model_config']) - gpt_model_path = get_parameter(model_config, "gpt_model_path") - if get_parameter(model_config, "enable_trt_overlap", bool): - raise pb_utils.TritonModelException( - f"enable_trt_overlap=true is not supported.") - self.exclude_input_from_output = get_parameter( - model_config, "exclude_input_in_output", bool) - executor_config = self.get_executor_config(model_config) - self.executor = trtllm.Executor(gpt_model_path, - trtllm.ModelType.DECODER_ONLY, - executor_config) - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - self.cancellation_check_period_ms = get_parameter( - model_config, "cancellation_check_period_ms", int) or 100 - - if not self.decoupled: - raise pb_utils.TritonModelException( - "Please enable decoupled transaction policy in the model configuration to serve this model" - ) - - self.triton_id_to_req_id = {} - self.req_id_to_response_sender = {} - self.lock = Lock() - self.running = False - self.awaiter_thread = Thread(target=self.awaiter_loop) - self.cancellation_thread = Thread(target=self.cancellation_loop) - if self.executor.can_enqueue_requests(): - self.running = True - self.awaiter_thread.start() - self.cancellation_thread.start() - else: - # In leader mode, worker ranks will wait here until leader is done. - self.executor.shutdown() - - def handle_stop_request(self, triton_id, response_sender): - if triton_id is None or triton_id == "": - response_sender.send( - pb_utils.InferenceResponse(error=pb_utils.TritonError( - "A request id must be provided for request cancellation")), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - return - - if triton_id in self.triton_id_to_req_id: - req_id = self.triton_id_to_req_id[triton_id] - self.executor.cancel_request(req_id) - - response_sender.send( - pb_utils.InferenceResponse(), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - if not self.executor.can_enqueue_requests(): - return - - # Convert to executor requests. 
- triton_requests = [] - executor_requests = [] - for request in requests: - response_sender = request.get_response_sender() - if get_input_scalar_by_name(request, 'stop'): - self.handle_stop_request(request.request_id(), response_sender) - else: - try: - converted = convert_request(request, - self.exclude_input_from_output, - self.decoupled) - except Exception as e: - response_sender.send( - pb_utils.InferenceResponse(error=pb_utils.TritonError( - f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" - )), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - triton_requests.append(request) - executor_requests.append(converted) - - with self.lock: - request_ids = self.executor.enqueue_requests(executor_requests) - for req_id, request in zip(request_ids, triton_requests): - triton_id = request.request_id() - self.req_id_to_response_sender[ - req_id] = triton_id, request.get_response_sender() - self.triton_id_to_req_id[triton_id] = req_id - return None - - def awaiter_loop(self): - """Gets responses from executor and returns the results.""" - while self.running: - for response in self.executor.await_responses( - timeout=datetime.timedelta(milliseconds=1)): - req_id = response.request_id - with self.lock: - if req_id not in self.req_id_to_response_sender: - continue - triton_id, response_sender = self.req_id_to_response_sender[ - req_id] - - triton_response, is_final = convert_response(response) - response_sender.send( - triton_response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - if is_final else 0) - - if is_final: - with self.lock: - del self.triton_id_to_req_id[triton_id] - del self.req_id_to_response_sender[req_id] - # Remove local reference so response_sender can be cleaned properly. - del response_sender - # TODO: Read stats: https://jirasw.nvidia.com/browse/TRTLLM-563 - - def cancellation_loop(self): - """Checks if any pending requests have been cancelled.""" - while self.running: - time.sleep(self.cancellation_check_period_ms / 1000.0) - with self.lock: - for req_id, (triton_id, response_sender - ) in self.req_id_to_response_sender.items(): - if response_sender.is_cancelled(): - self.executor.cancel_request(req_id) - # Remove local reference so response_sender can be cleaned properly. - del response_sender - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - if self.executor.can_enqueue_requests(): - self.running = False - self.awaiter_thread.join() - self.cancellation_thread.join() - self.executor.shutdown() diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt index 2d1f071..14aab33 100644 --- a/triton_model_repo/tensorrt_llm/config.pbtxt +++ b/triton_model_repo/tensorrt_llm/config.pbtxt @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
name: "tensorrt_llm" -backend: "${triton_backend}" +backend: "tensorrtllm" max_batch_size: 64 model_transaction_policy { diff --git a/triton_templates/tensorrt_llm/1/model.py b/triton_templates/tensorrt_llm/1/model.py deleted file mode 100644 index 5b3eda3..0000000 --- a/triton_templates/tensorrt_llm/1/model.py +++ /dev/null @@ -1,581 +0,0 @@ -import datetime -import json -import os -import time -from threading import Lock, Thread - -import numpy as np -import triton_python_backend_utils as pb_utils -from torch import from_numpy - -import tensorrt_llm.bindings.executor as trtllm - - -def get_input_tensor_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is None: - return None - return tensor.as_numpy() - - -def get_input_scalar_by_name(request, name): - tensor = get_input_tensor_by_name(request, name) - if tensor is None: - return None - if tensor.size != 1: - raise pb_utils.TritonModelException( - f"Expected a single value for {name}") - return tensor.item() - - -def read_parameter_as_type(value, name, pytype=str): - if value == "": - return None - if value.startswith("${") and value.endswith("}"): - return None - if pytype is bool: - return value.lower() in ["1", "true"] - try: - result = pytype(value) - return result - except: - pb_utils.Logger.log_warning( - f"Could not read parameter '{name}' with value '{value}', will use default." - ) - return None - - -def get_parameter(model_config, name, pytype=str): - if name not in model_config['parameters']: - return None - return read_parameter_as_type( - model_config['parameters'][name]['string_value'], name, pytype) - - -def convert_word_list(word_list): - if word_list is None: - return None - word_list = word_list.tolist() - if len(word_list) == 0 or len(word_list[0]) != 2: - raise pb_utils.TritonModelException(f"Invalid format for word list.") - words, indices = word_list[0] - result = [] - current_index = 0 - for i in indices: - if i == -1: - continue - if i > len(words): - raise pb_utils.TritonModelException( - f"Invalid format for word list.") - current_word = [] - while current_index < i: - current_word.append(words[current_index]) - current_index += 1 - result.append(current_word) - return result - - -def parse_medusa_choices(medusa_choices): - if medusa_choices is None: - return None - try: - result = json.loads( - "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]") - assert isinstance(result, list) and len(result) > 0 - assert all([isinstance(x, list) for x in result]) - assert all([isinstance(y, int) for x in result for y in x]) - except Exception: - raise pb_utils.TritonModelException( - "Invalid format for medusa_choices") - return result - - -def get_sampling_config_from_request(request): - kwargs = {} - kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1 - kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k') - kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p') - kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[ - 'top_p'] <= 0 else kwargs['top_p'] - kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed') - kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature') - kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length') - kwargs['repetition_penalty'] = get_input_scalar_by_name( - request, 'repetition_penalty') - kwargs['presence_penalty'] = get_input_scalar_by_name( - request, 'presence_penalty') - kwargs['frequency_penalty'] = get_input_scalar_by_name( - 
request, 'frequency_penalty') - kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty') - kwargs['top_p_min'] = get_input_scalar_by_name(request, - 'runtime_top_p_min') - kwargs['top_p_reset_ids'] = get_input_scalar_by_name( - request, 'runtime_top_p_reset_ids') - kwargs['top_p_decay'] = get_input_scalar_by_name(request, - 'runtime_top_p_decay') - kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name( - request, 'beam_search_diversity_rate') - kwargs['early_stopping'] = get_input_scalar_by_name( - request, 'early_stopping') - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.SamplingConfig(**kwargs) - - -def get_output_config_from_request(request, exclude_input_from_output): - kwargs = {} - kwargs["return_log_probs"] = get_input_scalar_by_name( - request, 'return_log_probs') - kwargs["return_context_logits"] = get_input_scalar_by_name( - request, 'return_context_logits') - kwargs["return_generation_logits"] = get_input_scalar_by_name( - request, 'return_generation_logits') - kwargs["exclude_input_from_output"] = exclude_input_from_output - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.OutputConfig(**kwargs) - - -def get_speculative_decoding_config_from_request(request): - kwargs = {} - draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids') - if draft_input_ids is not None: - kwargs['tokens'] = draft_input_ids.tolist() - draft_logits = get_input_tensor_by_name(request, 'draft_logits') - if draft_logits is not None: - kwargs['logits'] = from_numpy(draft_logits) - kwargs['acceptance_threshold'] = get_input_scalar_by_name( - request, 'draft_acceptance_threshold') - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.SpeculativeDecodingConfig(**kwargs) - return None - - -def get_prompt_tuning_config_from_request(request): - # prompt_vocab_size is unused by executor. - kwargs = {} - prompt_embedding_table = get_input_tensor_by_name( - request, 'prompt_embedding_table') - if prompt_embedding_table is not None: - kwargs["embedding_table"] = from_numpy(prompt_embedding_table) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.PromptTuningConfig(**kwargs) - return None - - -def get_lora_config_from_request(request): - kwargs = {} - kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id') - lora_weights = get_input_tensor_by_name(request, 'lora_weights') - if lora_weights is not None: - kwargs["weights"] = from_numpy(lora_weights) - lora_config = get_input_tensor_by_name(request, 'lora_config') - if lora_config is not None: - kwargs["config"] = from_numpy(lora_config) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.LoraConfig(**kwargs) - return None - - -def convert_request(request, exclude_input_from_output, decoupled): - inputs = {} - input_token_ids = get_input_tensor_by_name(request, 'input_ids') - if input_token_ids is None: - raise pb_utils.TritonModelException( - "A value is required for input_ids") - input_token_ids = input_token_ids.tolist() - if len(input_token_ids) == 0: - raise pb_utils.TritonModelException(f"Invalid format for input_ids") - inputs['input_token_ids'] = input_token_ids[0] - # input_lengths is not not used by executor. 
- inputs['max_new_tokens'] = get_input_scalar_by_name( - request, 'request_output_len') - if inputs['max_new_tokens'] is None: - raise pb_utils.TritonModelException( - "A value is required for request_output_len") - inputs['streaming'] = get_input_scalar_by_name(request, 'streaming') - if inputs['streaming'] and not decoupled: - raise pb_utils.TritonModelException( - "Streaming is only supported in decoupled mode.") - inputs['end_id'] = get_input_scalar_by_name(request, 'end_id') - inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id') - inputs['stop_words'] = convert_word_list( - get_input_tensor_by_name(request, 'stop_words_list')) - inputs['bad_words'] = convert_word_list( - get_input_tensor_by_name(request, 'bad_words_list')) - embedding_bias = get_input_tensor_by_name(request, 'embedding_bias') - if embedding_bias is not None and embedding_bias.size != 0: - inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze() - - sampling_config = get_sampling_config_from_request(request) - output_config = get_output_config_from_request(request, - exclude_input_from_output) - speculative_decoding_config = get_speculative_decoding_config_from_request( - request) - prompt_tuning_config = get_prompt_tuning_config_from_request(request) - lora_config = get_lora_config_from_request(request) - - return trtllm.Request( - **inputs, - sampling_config=sampling_config, - output_config=output_config, - speculative_decoding_config=speculative_decoding_config, - prompt_tuning_config=prompt_tuning_config, - lora_config=lora_config, - ) - - -def convert_response(response): - if response.has_error(): - return pb_utils.InferenceResponse(output_tensors=[], - error=pb_utils.TritonError( - response.error_msg)), True - result = response.result - beam_lengths = np.expand_dims( - np.array([len(beam) for beam in result.output_token_ids], np.int32), 0) - max_beam_length = max([len(beam) for beam in result.output_token_ids]) - output_ids = np.full((1, len(result.output_token_ids), max_beam_length), - -1, np.int32) - for idx, beam in enumerate(result.output_token_ids): - output_ids[0, idx, :len(beam)] = beam - output_tensors = [ - pb_utils.Tensor("output_ids", output_ids), - pb_utils.Tensor("sequence_length", beam_lengths), - ] - output_tensors.append( - pb_utils.Tensor( - "cum_log_probs", - np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) - if result.cum_log_probs is not None else np.zeros( - (1, 1), np.float32))) - output_tensors.append( - pb_utils.Tensor( - "output_log_probs", - np.expand_dims(np.array(result.log_probs, np.float32), 0) if - result.log_probs is not None else np.zeros((1, 1, 1), np.float32))) - output_tensors.append( - pb_utils.Tensor( - "context_logits", - np.expand_dims(np.array(result.context_logits, np.float32), 0) - if result.context_logits is not None else np.zeros( - (1, 1, 1), np.float32))) - output_tensors.append( - pb_utils.Tensor( - "generation_logits", - np.expand_dims(np.array(result.generation_logits, np.float32), 0) - if result.generation_logits is not None else np.zeros( - (1, 1, 1, 1), np.float32))) - return pb_utils.InferenceResponse(output_tensors), result.is_final - - -def convert_scheduler_policy(batch_scheduler_policy: str): - if batch_scheduler_policy.lower() == "max_utilization": - return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION - elif batch_scheduler_policy.lower() == "guaranteed_no_evict": - return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT - raise pb_utils.TritonModelException( - f"batch_scheduler_policy value of '{batch_scheduler_policy}' 
is not supported." - ) - - -def convert_batching_type(gpt_model_type: str): - if gpt_model_type is None: - return None - if gpt_model_type.lower( - ) == "inflight_fused_batching" or gpt_model_type.lower( - ) == "inflight_batching": - return trtllm.BatchingType.INFLIGHT - elif gpt_model_type.lower() == "v1": - return trtllm.BatchingType.STATIC - raise pb_utils.TritonModelException( - f"gpt_model_type value of '{gpt_model_type}' is not supported.") - - -def convert_decoding_mode(decoding_mode: str): - if decoding_mode is None: - return None - elif decoding_mode == "none": - return trtllm.DecodingMode.NONE - elif decoding_mode == "top_k": - return trtllm.DecodingMode.TOP_K - elif decoding_mode == "top_p": - return trtllm.DecodingMode.TOP_P - elif decoding_mode == "top_k_top_p": - return trtllm.DecodingMode.TOP_K_TOP_P - elif decoding_mode == "beam_search": - return trtllm.DecodingMode.BEAM_SEARCH - elif decoding_mode == "medusa": - return trtllm.DecodingMode.MEDUSA - raise pb_utils.TritonModelException( - f"decoding_mode value of '{decoding_mode}' is not supported.") - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def get_scheduler_config(self, model_config): - batch_scheduler_policy = get_parameter(model_config, - "batch_scheduler_policy") - if batch_scheduler_policy is None: - return trtllm.SchedulerConfig() - return trtllm.SchedulerConfig( - convert_scheduler_policy(batch_scheduler_policy)) - - def get_kv_cache_config(self, model_config): - kwargs = { - "enable_block_reuse": - get_parameter(model_config, "enable_kv_cache_reuse", bool), - "max_tokens": - get_parameter(model_config, "max_tokens_in_paged_kv_cache", int), - "sink_token_length": - get_parameter(model_config, "sink_token_length", int), - "max_attention_window": - get_parameter(model_config, "max_attention_window_size", int), - "free_gpu_memory_fraction": - get_parameter(model_config, "kv_cache_free_gpu_mem_fraction", - float), - "host_cache_size": - get_parameter(model_config, "kv_cache_host_memory_bytes", int), - "onboard_blocks": - get_parameter(model_config, "kv_cache_onboard_blocks", bool), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.KvCacheConfig(**kwargs) - - def get_parallel_config(self, model_config): - kwargs = {} - gpu_device_ids = get_parameter(model_config, "gpu_device_ids") - if gpu_device_ids: - kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] - self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", - "0") == "1" - if self.use_orchestrator_mode: - kwargs[ - "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR - worker_path = get_parameter(model_config, "worker_path") - if worker_path is not None: - raise pb_utils.TritonModelException( - "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecuutorWorker executable." 
- ) - executor_worker_path = get_parameter(model_config, - "executor_worker_path") - kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( - True, executor_worker_path) - if len(kwargs) > 0: - return trtllm.ParallelConfig(**kwargs) - return None - - def get_peft_cache_config(self, model_config): - kwargs = { - "optimal_adapter_size": - get_parameter(model_config, "lora_cache_optimal_adapter_size", - int), - "max_adapter_size": - get_parameter(model_config, "lora_cache_max_adapter_size", int), - "device_cache_percent": - get_parameter(model_config, "lora_cache_gpu_memory_fraction", - float), - "host_cache_size": - get_parameter(model_config, "lora_cache_host_memory_bytes", int), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.PeftCacheConfig(**kwargs) - - def get_executor_config(self, model_config): - kwargs = { - "max_beam_width": - get_parameter(model_config, "max_beam_width", int), - "scheduler_config": - self.get_scheduler_config(model_config), - "kv_cache_config": - self.get_kv_cache_config(model_config), - "enable_chunked_context": - get_parameter(model_config, "enable_chunked_context", bool), - "normalize_log_probs": - get_parameter(model_config, "normalize_log_probs", bool), - "batching_type": - convert_batching_type(get_parameter(model_config, - "gpt_model_type")), - "parallel_config": - self.get_parallel_config(model_config), - "peft_cache_config": - self.get_peft_cache_config(model_config), - "medusa_choices": - parse_medusa_choices(get_parameter(model_config, - "medusa_choices")), - "decoding_mode": - convert_decoding_mode(get_parameter(model_config, - "decoding_mode")), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.ExecutorConfig(**kwargs) - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args['model_config']) - gpt_model_path = get_parameter(model_config, "gpt_model_path") - if get_parameter(model_config, "enable_trt_overlap", bool): - raise pb_utils.TritonModelException( - f"enable_trt_overlap=true is not supported.") - self.exclude_input_from_output = get_parameter( - model_config, "exclude_input_in_output", bool) - executor_config = self.get_executor_config(model_config) - self.executor = trtllm.Executor(gpt_model_path, - trtllm.ModelType.DECODER_ONLY, - executor_config) - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - self.cancellation_check_period_ms = get_parameter( - model_config, "cancellation_check_period_ms", int) or 100 - - if not self.decoupled: - raise pb_utils.TritonModelException( - "Please enable decoupled transaction policy in the model configuration to serve this model" - ) - - self.triton_id_to_req_id = {} - self.req_id_to_response_sender = {} - self.lock = Lock() - self.running = False - self.awaiter_thread = Thread(target=self.awaiter_loop) - self.cancellation_thread = Thread(target=self.cancellation_loop) - if self.executor.can_enqueue_requests(): - self.running = True - self.awaiter_thread.start() - self.cancellation_thread.start() - else: - # In leader mode, worker ranks will wait here until leader is done. - self.executor.shutdown() - - def handle_stop_request(self, triton_id, response_sender): - if triton_id is None or triton_id == "": - response_sender.send( - pb_utils.InferenceResponse(error=pb_utils.TritonError( - "A request id must be provided for request cancellation")), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - return - - if triton_id in self.triton_id_to_req_id: - req_id = self.triton_id_to_req_id[triton_id] - self.executor.cancel_request(req_id) - - response_sender.send( - pb_utils.InferenceResponse(), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - if not self.executor.can_enqueue_requests(): - return - - # Convert to executor requests. 
- triton_requests = [] - executor_requests = [] - for request in requests: - response_sender = request.get_response_sender() - if get_input_scalar_by_name(request, 'stop'): - self.handle_stop_request(request.request_id(), response_sender) - else: - try: - converted = convert_request(request, - self.exclude_input_from_output, - self.decoupled) - except Exception as e: - response_sender.send( - pb_utils.InferenceResponse(error=pb_utils.TritonError( - f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" - )), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - triton_requests.append(request) - executor_requests.append(converted) - - with self.lock: - request_ids = self.executor.enqueue_requests(executor_requests) - for req_id, request in zip(request_ids, triton_requests): - triton_id = request.request_id() - self.req_id_to_response_sender[ - req_id] = triton_id, request.get_response_sender() - self.triton_id_to_req_id[triton_id] = req_id - return None - - def awaiter_loop(self): - """Gets responses from executor and returns the results.""" - while self.running: - for response in self.executor.await_responses( - timeout=datetime.timedelta(milliseconds=1)): - req_id = response.request_id - with self.lock: - if req_id not in self.req_id_to_response_sender: - continue - triton_id, response_sender = self.req_id_to_response_sender[ - req_id] - - triton_response, is_final = convert_response(response) - response_sender.send( - triton_response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - if is_final else 0) - - if is_final: - with self.lock: - del self.triton_id_to_req_id[triton_id] - del self.req_id_to_response_sender[req_id] - # Remove local reference so response_sender can be cleaned properly. - del response_sender - # TODO: Read stats: https://jirasw.nvidia.com/browse/TRTLLM-563 - - def cancellation_loop(self): - """Checks if any pending requests have been cancelled.""" - while self.running: - time.sleep(self.cancellation_check_period_ms / 1000.0) - with self.lock: - for req_id, (triton_id, response_sender - ) in self.req_id_to_response_sender.items(): - if response_sender.is_cancelled(): - self.executor.cancel_request(req_id) - # Remove local reference so response_sender can be cleaned properly. - del response_sender - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - if self.executor.can_enqueue_requests(): - self.running = False - self.awaiter_thread.join() - self.cancellation_thread.join() - self.executor.shutdown() diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt index 64c5d0e..1c34f77 100644 --- a/triton_templates/tensorrt_llm/config.pbtxt +++ b/triton_templates/tensorrt_llm/config.pbtxt @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
name: "tensorrt_llm" -backend: "${triton_backend}" +backend: "tensorrtllm" max_batch_size: ${triton_max_batch_size} model_transaction_policy { From a19b28b692bb64080df16b74ec0b9b87a57f7513 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 10 Jul 2024 16:39:06 +0200 Subject: [PATCH 07/35] ignore empty SSE's, remove decoding_mode, just omit topk instead --- predict.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/predict.py b/predict.py index c52310b..e363113 100644 --- a/predict.py +++ b/predict.py @@ -371,6 +371,9 @@ async def predict( f"E2104 TritonMalformedEvent: Triton returned malformed event (no output_ids or error key): {event_data}" ) + if token == []: + continue + n_tokens += 1 if n_tokens == 1: first_token_time = time.time() @@ -446,18 +449,12 @@ def _process_args( pad_id = self.pad_id end_id = self.end_id - decoding_mode = "top_k_top_p" - if top_k <= 0: + # workaround, unneccesary with with trtllm > 0.10.0 top_k = None - decoding_mode = "top_p" - if top_p == 0.0: - if decoding_mode == "top_p": - raise UserError( - "E1105 InvalidArgumentTopKTopP: Can't set both top_k and top_p to 0" - ) - decoding_mode = "top_k" + if top_p <= 0.0: + # workaround, unneccesary with with trtllm > 0.10.0 top_p = None if not seed: @@ -469,6 +466,7 @@ def _process_args( min_tokens = min(min_tokens, token_budget) if min_tokens <= 0: + # workaround, unneccesary with with trtllm > 0.10.0 min_tokens = None args = {k: v for k, v in { @@ -485,7 +483,6 @@ def _process_args( "random_seed": seed, "pad_id": pad_id, "end_id": end_id, - "decoding_mode": decoding_mode, }.items() if v is not None} return args From d86927d526fe650eb3517d1d7699470e734e7728 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 10 Jul 2024 17:05:19 +0200 Subject: [PATCH 08/35] bump pget==0.8.2, cog==0.10.0-alpha16 --- default.nix | 1 + flake.lock | 6 +++--- lock.json | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/default.nix b/default.nix index 7609e59..63dabf2 100644 --- a/default.nix +++ b/default.nix @@ -20,6 +20,7 @@ in "--extra-index-url" "https://pypi.nvidia.com" "tensorrt_llm==0.10.0" + "tensorrt-cu12==10.1.0" "torch==2.2.2" "nvidia-pytriton==0.5.6" # corresponds to 2.44.0 "omegaconf" diff --git a/flake.lock b/flake.lock index 769d11c..6c7183e 100644 --- a/flake.lock +++ b/flake.lock @@ -12,11 +12,11 @@ "rust-overlay": "rust-overlay" }, "locked": { - "lastModified": 1720087249, - "narHash": "sha256-xKIs2n8Ux7Y+BgaBfZekeZD9v70Gvas4oNQVBsctQYw=", + "lastModified": 1720623451, + "narHash": "sha256-s9B4Y9n7wKK/fDhF5td7/4z/jkKRG7Rr6aCyzA87uMc=", "owner": "datakami", "repo": "cognix", - "rev": "2f11a38c8d6bd9ba2c8ea4970cd93c0e334f7189", + "rev": "dd980ee21fe66f9de4a5475c22b674a7407dc084", "type": "github" }, "original": { diff --git a/lock.json b/lock.json index 4acd4a4..067b583 100644 --- a/lock.json +++ b/lock.json @@ -650,22 +650,22 @@ "version": "10.0.1" }, "tensorrt-cu12": { - "sha256": "9663446e2872113d619ad5010766cccc1f023d693cb43c3f8f2496563028badc", + "sha256": "a549e2fe472eb03b2737a708c0aef0cac9cb0be1ae46bc7dad72ec1dfc81bd19", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.2.0.post1.tar.gz", - "version": "10.2.0.post1" + "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.1.0.tar.gz", + "version": "10.1.0" }, "tensorrt-cu12-bindings": { - "sha256": "3248e7951d1f2fa8884759b19456ab7d08a3f75bd6b8e5d58e5cc18788c02171", + "sha256": 
"91e1bd0eb348524ff209ef6b235d329983ea704b5d16f9a7ba747c08cc3c2495", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.2.0.post1-cp310-none-manylinux_2_17_x86_64.whl", - "version": "10.2.0.post1" + "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.1.0-cp310-none-manylinux_2_17_x86_64.whl", + "version": "10.1.0" }, "tensorrt-cu12-libs": { - "sha256": "a42f7ecb1659fac27cf68996df0984e68018be61bd8bbd95f51619f9c4e9cf31", + "sha256": "1ad13c26b3f441267a746df6859e44eb0e8da78d4382458d1fd2eb7675abd49f", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.2.0.post1-py2.py3-none-manylinux_2_17_x86_64.whl", - "version": "10.2.0.post1" + "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.1.0-py2.py3-none-manylinux_2_17_x86_64.whl", + "version": "10.1.0" }, "tensorrt-llm": { "sha256": "c7975326fa10b56079e0febf7c52a65ccf5b37760cd1c79d5aa3e8c7d85ce69c", @@ -1248,5 +1248,5 @@ } } }, - "invalidationHash": "aedf040e5687ab8badc94e4500a11b3037a51c13346051fbcf5f441fd85fcfbb" + "invalidationHash": "e7e207b87a9d99d7041d2be7edfba110533a437cc9cf984be13642572fa5f156" } \ No newline at end of file From 098f7e7f7e11d3ea40585125a79debd40aab029e Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 11 Jul 2024 13:16:16 +0200 Subject: [PATCH 09/35] tensorrt-llm: decrease closure size by cleaning up kernels --- nix/tensorrt-llm.nix | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index cb315ca..5db8c87 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -41,6 +41,7 @@ stdenv.mkDerivation (o: { ninja python3 cudaPackages.cuda_nvcc + rsync ]; buildInputs = [ @@ -144,19 +145,12 @@ stdenv.mkDerivation (o: { installPhase = '' mkdir -p $out - ${rsync}/bin/rsync -a --exclude "tensorrt_llm/kernels" $src/cpp $out/ - chmod -R u+w $out/cpp - ${rsync}/bin/rsync -a $src/cpp/tensorrt_llm/kernels $out/cpp/tensorrt_llm/ - chmod -R u+w $out/cpp - mkdir -p $out/cpp/build/tensorrt_llm/plugins + rsync -a --chmod=u+w --include "tensorrt_llm/kernels/" --include "tensorrt_llm/kernels/kvCacheIndex.h" --exclude "tensorrt_llm/kernels/*" $src/cpp $out/ + # rsync -a --chmod=u+w $src/cpp/tensorrt_llm/kernels $out/cpp/tensorrt_llm/ pushd tensorrt_llm - cp ./libtensorrt_llm.so $out/cpp/build/tensorrt_llm/ - cp -r ./executor_worker $out/cpp/build/tensorrt_llm/ - chmod -R u+w $out/cpp/build/tensorrt_llm/executor_worker + mkdir -p $out/cpp/build/tensorrt_llm/ + find . '(' '(' -type f -executable ')' -or -type l ')' -print0 | rsync -av --chmod=u+w --files-from=- --from0 ./ $out/cpp/build/tensorrt_llm/ patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib $out/cpp/build/tensorrt_llm/libtensorrt_llm.so - cp ./plugins/libnvinfer_plugin_tensorrt_llm.so* $out/cpp/build/tensorrt_llm/plugins/ - mkdir -p $out/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/ - cp -r /build/source/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper $_ for f in $out/cpp/build/tensorrt_llm/plugins/*.so* $out/cpp/build/tensorrt_llm/executor_worker/executorWorker; do if [ ! 
-L "$f" ]; then new_path=$(patchelf --print-rpath "$f" | From cd3ce0e2570d371ea1f63d5677c230ef1c297078 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 17 Jul 2024 17:17:43 +0200 Subject: [PATCH 10/35] bump cognix to exclude `train: null` --- flake.lock | 7 ++++--- flake.nix | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/flake.lock b/flake.lock index 6c7183e..5136b55 100644 --- a/flake.lock +++ b/flake.lock @@ -12,15 +12,16 @@ "rust-overlay": "rust-overlay" }, "locked": { - "lastModified": 1720623451, - "narHash": "sha256-s9B4Y9n7wKK/fDhF5td7/4z/jkKRG7Rr6aCyzA87uMc=", + "lastModified": 1721228311, + "narHash": "sha256-EEe5Kcno5FMFSd2aYVB2ONHFpe/9k0CX1gIFjNQgV+A=", "owner": "datakami", "repo": "cognix", - "rev": "dd980ee21fe66f9de4a5475c22b674a7407dc084", + "rev": "8c28f745d7339c495265a85fb65da3ce5592f0ef", "type": "github" }, "original": { "owner": "datakami", + "ref": "24.07", "repo": "cognix", "type": "github" } diff --git a/flake.nix b/flake.nix index 6ed8aca..249250b 100644 --- a/flake.nix +++ b/flake.nix @@ -4,7 +4,7 @@ extra-substituters = "https://storage.googleapis.com/replicate-nix-cache-dev/"; }; inputs = { - cognix.url = "github:datakami/cognix/24.03"; + cognix.url = "github:datakami/cognix/24.07"; }; outputs = { self, cognix }@inputs: (cognix.lib.cognixFlake inputs {}) // { From e5ac90619d29e42f22d80ba58d37f9ec52d46fad Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Fri, 19 Jul 2024 13:47:19 +0200 Subject: [PATCH 11/35] tensorrt-llm: 0.10.0 -> 0.11.0 --- default.nix | 6 +-- lock.json | 116 +++++++++++++++++++---------------------- nix/tensorrt-llm.nix | 4 +- nix/trtllm-backend.nix | 4 +- 4 files changed, 61 insertions(+), 69 deletions(-) diff --git a/default.nix b/default.nix index 63dabf2..d1024d6 100644 --- a/default.nix +++ b/default.nix @@ -19,9 +19,9 @@ in python_packages = [ "--extra-index-url" "https://pypi.nvidia.com" - "tensorrt_llm==0.10.0" + "tensorrt_llm==0.11.0" "tensorrt-cu12==10.1.0" - "torch==2.2.2" + "torch==2.3.1" "nvidia-pytriton==0.5.6" # corresponds to 2.44.0 "omegaconf" "hf-transfer" @@ -37,7 +37,7 @@ in "nvidia-cublas-cu12==${cudaPackages.libcublas.version}" ]; overridesList = [ - "pydantic==1.10.16" + "pydantic==1.10.17" ]; }; cognix.includeNix = true; diff --git a/lock.json b/lock.json index 067b583..e310635 100644 --- a/lock.json +++ b/lock.json @@ -128,10 +128,10 @@ "version": "0.4.2" }, "exceptiongroup": { - "sha256": "5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad", + "sha256": "3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", "type": "url", - "url": "https://files.pythonhosted.org/packages/01/90/79fe92dd413a9cab314ef5c591b5aa9b9ba787ae4cadab75055b0ae00b33/exceptiongroup-1.2.1-py3-none-any.whl", - "version": "1.2.1" + "url": "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", + "version": "1.2.2" }, "fastapi": { "sha256": "f4165fb1fe3610c52cb1b8282c1480de9c34bc270f56a965aa93a884c350d605", @@ -176,10 +176,10 @@ "version": "3.0.3" }, "grpcio": { - "sha256": "e7cd5c1325f6808b8ae31657d281aadb2a51ac11ab081ae335f4f7fc44c1721d", + "sha256": "a8422dc13ad93ec8caa2612b5032a2b9cd6421c13ed87f54db4a3a2c93afaf77", "type": "url", - "url": "https://files.pythonhosted.org/packages/5e/3b/459a477de3d899ffd4164d116a0a1db67468465ef5eaa81652f9319c27ab/grpcio-1.64.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.64.1" + "url": 
"https://files.pythonhosted.org/packages/b4/b5/fe9dcf91919d0f09da8eec3c2091fab6dbb5e102027deeca928bc26b9fc2/grpcio-1.65.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.65.1" }, "h11": { "sha256": "e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", @@ -230,10 +230,10 @@ "version": "0.27.0" }, "huggingface-hub": { - "sha256": "3a0b957aa87150addf0cc7bd71b4d954b78e749850e1e7fb29ebbd2db64ca037", + "sha256": "7ad92edefb93d8145c061f6df8d99df2ff85f8379ba5fac8a95aca0642afa5d7", "type": "url", - "url": "https://files.pythonhosted.org/packages/69/d6/73f9d1b7c4da5f0544bc17680d0fa9932445423b90cd38e1ee77d001a4f5/huggingface_hub-0.23.4-py3-none-any.whl", - "version": "0.23.4" + "url": "https://files.pythonhosted.org/packages/57/c0/cf4435f3186655e3bafdca08cd6c794e3866f1f89ed99595504e7240b6a2/huggingface_hub-0.24.0-py3-none-any.whl", + "version": "0.24.0" }, "humanfriendly": { "sha256": "1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", @@ -398,16 +398,16 @@ "version": "12.1.0.106" }, "nvidia-modelopt": { - "sha256": "f56f04280fef23727a49decf13ff8269c9cc47b95fc304fcefed79fbe8e6ef5f", + "sha256": "9af69e4215e7da9c65431bd27b51bc1b95c5d98cfb97105f83daf2198a820b5d", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.11.2-cp310-cp310-manylinux_2_28_x86_64.whl", - "version": "0.11.2" + "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.13.1" }, "nvidia-nccl-cu12": { - "sha256": "802756f02c43c0613dc83f48a76f702462b0f1f618411768748bba9c805fce19", + "sha256": "057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-nccl-cu12/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", - "version": "2.19.3" + "url": "https://pypi.nvidia.com/nvidia-nccl-cu12/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", + "version": "2.20.5" }, "nvidia-nvjitlink-cu12": { "sha256": "f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212", @@ -458,10 +458,10 @@ "version": "2.2.2" }, "pillow": { - "sha256": "a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc", + "sha256": "b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf", "type": "url", - "url": "https://files.pythonhosted.org/packages/b5/5b/6651c288b08df3b8c1e2f8c1152201e0b25d240e22ddade0f1e242fc9fa0/pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", - "version": "10.4.0" + "url": "https://files.pythonhosted.org/packages/b5/a2/7a09695dc636bf8d0a1b63022f58701177b7dc6fad30f6d6bc343e5473a4/pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "10.3.0" }, "polygraphy": { "sha256": "62ae22825efdd3288222e5b1d2d791fe58e87844fcd848bcd1251fbce02ba956", @@ -482,22 +482,22 @@ "version": "6.0.0" }, "pulp": { - "sha256": "4a19814a5b0a4392d788ac2315263435293579b0583c3469943fe0c6a586f263", + "sha256": "ad6a9b566d8458f4d05f4bfe2cea59e32885dd1da6929a361be579222107987c", "type": "url", - "url": "https://files.pythonhosted.org/packages/09/d7/57e71e11108203039c895643368c0d1a99fe719a6a80184edf240c33d25f/PuLP-2.8.0-py3-none-any.whl", - "version": "2.8.0" + "url": "https://files.pythonhosted.org/packages/64/10/704c18b5960b3f9b10efcc859e11881ad90f1e44008e181d2b10cd305a63/PuLP-2.9.0-py3-none-any.whl", + "version": "2.9.0" }, "pyarrow": { - "sha256": "48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd", + "sha256": 
"f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", "type": "url", - "url": "https://files.pythonhosted.org/packages/b0/54/eb7fcfc0e1ec6a8404cadd11ac957b3ee4fd0774225cafe3ffe6287861cb/pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", - "version": "16.1.0" + "url": "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "17.0.0" }, "pydantic": { - "sha256": "4660dd697de1ae2d4305a85161312611f64d5360663a9ba026cd6ad9e3fe14c3", + "sha256": "371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e", "type": "url", - "url": "https://files.pythonhosted.org/packages/ae/d8/3ffbdeccf252d56c8e0b6f1f30798d3aa0ad5afaa541908207881855beeb/pydantic-1.10.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.10.16" + "url": "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.10.17" }, "pygments": { "sha256": "b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", @@ -506,10 +506,10 @@ "version": "2.18.0" }, "pynvml": { - "sha256": "5cce014ac01b098d08f06178f86c37be409b80b2e903a5a03ce15eed60f55e25", + "sha256": "a5fba3ab14febda50d19dbda012ef62ae0aed45b7ccc07af0bc5be79223e450c", "type": "url", - "url": "https://files.pythonhosted.org/packages/5b/9c/adb8070059caaa15d5a572b66bccd95900d8c1b9fa54d6ecea6ae97448d1/pynvml-11.5.0-py3-none-any.whl", - "version": "11.5.0" + "url": "https://files.pythonhosted.org/packages/54/5b/16e50abf152be7f18120f11dfff495014a9eaff7b764626e1656f04ad262/pynvml-11.5.3-py3-none-any.whl", + "version": "11.5.3" }, "pyproject-hooks": { "sha256": "7ceeefe9aec63a1064c18d939bdc3adf2d8aa1988a510afec15151578b232aa2", @@ -590,10 +590,10 @@ "version": "0.2.0" }, "setuptools": { - "sha256": "fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc", + "sha256": "f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207", "type": "url", - "url": "https://files.pythonhosted.org/packages/ef/15/88e46eb9387e905704b69849618e699dc2f54407d8953cc4ec4b8b46528d/setuptools-70.3.0-py3-none-any.whl", - "version": "70.3.0" + "url": "https://files.pythonhosted.org/packages/32/10/e72bb221cdd2f11e649cf38bd7ba8ea6d527c77f330366e10ae9bb798730/setuptools-71.0.3-py3-none-any.whl", + "version": "71.0.3" }, "sh": { "sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d", @@ -632,22 +632,16 @@ "version": "0.4.15" }, "structlog": { - "sha256": "983bd49f70725c5e1e3867096c0c09665918936b3db27341b41d294283d7a48a", + "sha256": "597f61e80a91cc0749a9fd2a098ed76715a1c8a01f73e336b746504d1aad7610", "type": "url", - "url": "https://files.pythonhosted.org/packages/8f/63/2eb7d30fe126dbd8a398386f14ab0421bb722515f9f50c35fd4048251285/structlog-24.2.0-py3-none-any.whl", - "version": "24.2.0" + "url": "https://files.pythonhosted.org/packages/bf/65/813fc133609ebcb1299be6a42e5aea99d6344afb35ccb43f67e7daaa3b92/structlog-24.4.0-py3-none-any.whl", + "version": "24.4.0" }, "sympy": { - "sha256": "6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92", - "type": "url", - "url": "https://files.pythonhosted.org/packages/62/74/7e6c65ee89ff43942bffffdbb238634f16967bf327aee3c76efcf6e49587/sympy-1.13.0-py3-none-any.whl", - "version": "1.13.0" - }, - "tensorrt": { - "sha256": "7e9c8666f5bee86771451f007e25f81d65a411a26e6ea0b41faa5ec83ab863af", + "sha256": 
"db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.0.1.tar.gz", - "version": "10.0.1" + "url": "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", + "version": "1.13.1" }, "tensorrt-cu12": { "sha256": "a549e2fe472eb03b2737a708c0aef0cac9cb0be1ae46bc7dad72ec1dfc81bd19", @@ -668,10 +662,10 @@ "version": "10.1.0" }, "tensorrt-llm": { - "sha256": "c7975326fa10b56079e0febf7c52a65ccf5b37760cd1c79d5aa3e8c7d85ce69c", + "sha256": "2bffd83b31eb36cc9429fe38ac23501537a7a42a6eabef107f1e76e28c6180bd", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.10.0-cp310-cp310-linux_x86_64.whl", - "version": "0.10.0" + "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.11.0-cp310-cp310-linux_x86_64.whl", + "version": "0.11.0" }, "tokenizers": { "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e", @@ -686,10 +680,10 @@ "version": "2.0.1" }, "torch": { - "sha256": "cade4fd6c8ce7d826dbcfabd65f1d53b0ee0a058db8c1809d65bfd6051b55530", + "sha256": "f0deb5d2f932a68ed54625ba140eddbf2af22be978ee19b9b63c986add6425b2", "type": "url", - "url": "https://download.pytorch.org/whl/cu121/torch-2.2.2%2Bcu121-cp310-cp310-linux_x86_64.whl", - "version": "2.2.2+cu121" + "url": "https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-linux_x86_64.whl", + "version": "2.3.1+cu121" }, "tqdm": { "sha256": "b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644", @@ -698,16 +692,16 @@ "version": "4.66.4" }, "transformers": { - "sha256": "71cb94301ec211a2e1d4b8c8d18dcfaa902dfa00a089dceca167a8aa265d6f2d", + "sha256": "6d59061392d0f1da312af29c962df9017ff3c0108c681a56d1bc981004d16d24", "type": "url", - "url": "https://files.pythonhosted.org/packages/05/23/ba02efa28518557e0cfe0ce5c1170000dd7501ed02ac865fc90cbe3daa93/transformers-4.40.2-py3-none-any.whl", - "version": "4.40.2" + "url": "https://files.pythonhosted.org/packages/6a/dc/23c26b7b0bce5aaccf2b767db3e9c4f5ae4331bd47688c1f2ef091b23696/transformers-4.42.4-py3-none-any.whl", + "version": "4.42.4" }, "triton": { - "sha256": "a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5", + "sha256": "3c84595cbe5e546b1b290d2a58b1494df5a2ef066dd890655e5b8a8a92205c33", "type": "url", - "url": "https://download.pytorch.org/whl/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "2.2.0" + "url": "https://download.pytorch.org/whl/triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "2.3.1" }, "tritonclient": { "sha256": "754ab373a45306be0c45afbcde06838179d04561694f6d15e138530153aee581", @@ -1109,9 +1103,6 @@ "sympy": [ "mpmath" ], - "tensorrt": [ - "tensorrt-cu12" - ], "tensorrt-cu12": [ "tensorrt-cu12-bindings", "tensorrt-cu12-libs" @@ -1137,6 +1128,7 @@ "onnx", "optimum", "pandas", + "pillow", "polygraphy", "psutil", "pulp", @@ -1144,7 +1136,7 @@ "pynvml", "sentencepiece", "strenum", - "tensorrt", + "tensorrt-cu12", "torch", "transformers", "wheel" @@ -1248,5 +1240,5 @@ } } }, - "invalidationHash": "e7e207b87a9d99d7041d2be7edfba110533a437cc9cf984be13642572fa5f156" + "invalidationHash": "a2c91999e1adbeb840cd39d43711d7c054e354dff33f419447333a2b7c828a10" } \ No newline at end of file diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index 5db8c87..75ea94f 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -17,14 +17,14 @@ }: stdenv.mkDerivation (o: { 
pname = "tensorrt_llm"; - version = "0.10.0"; + version = "0.11.0"; src = fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT-LLM"; rev = "v${o.version}"; fetchSubmodules = true; fetchLFS = true; # libtensorrt_llm_batch_manager_static.a - hash = "sha256-eOAixXzOQRaySbUtpeAF9qMFOzwe1rosC0GOgy8CakU="; + hash = "sha256-J2dqKjuEXVbE9HgoCzhUASZAnsn/hsC+qUTHL6uT4nU="; }; outputs = if withPython then diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index b52d81c..84eacb0 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -48,12 +48,12 @@ let in oldGccStdenv.mkDerivation rec { pname = "tensorrtllm_backend"; - version = "0.10.0"; + version = "0.11.0"; src = fetchFromGitHub { owner = "triton-inference-server"; repo = "tensorrtllm_backend"; rev = "v${version}"; - hash = "sha256-6df9MbHPqBVxpdkTcEzf99OCPtgFrK0jjDJfvE/guyA="; + hash = "sha256-PzcFpeq7ISqmHa9TBT0lVVYNdkJWB224kRGQKF4uas8="; }; nativeBuildInputs = [ cmake From 57e97eee30f4a5a166dc5046021b5027db2e3b39 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Fri, 19 Jul 2024 13:51:28 +0200 Subject: [PATCH 12/35] update triton_templates --- triton_templates/ensemble/config.pbtxt | 26 +++++++++ triton_templates/postprocessing/1/model.py | 34 ++++++++--- triton_templates/postprocessing/config.pbtxt | 2 +- triton_templates/preprocessing/1/model.py | 57 ++++++++++++++++--- triton_templates/preprocessing/config.pbtxt | 16 ++++++ triton_templates/tensorrt_llm/config.pbtxt | 20 +++++++ .../tensorrt_llm_bls/1/lib/decode.py | 11 ++-- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 7 +++ .../tensorrt_llm_bls/config.pbtxt | 6 ++ 9 files changed, 156 insertions(+), 23 deletions(-) diff --git a/triton_templates/ensemble/config.pbtxt b/triton_templates/ensemble/config.pbtxt index 0e2627b..853818e 100644 --- a/triton_templates/ensemble/config.pbtxt +++ b/triton_templates/ensemble/config.pbtxt @@ -33,6 +33,12 @@ input [ data_type: TYPE_STRING dims: [ -1 ] }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, { name: "max_tokens" data_type: TYPE_INT32 @@ -207,6 +213,10 @@ ensemble_scheduling { key: "QUERY" value: "text_input" } + input_map { + key: "DECODER_QUERY" + value: "decoder_text_input" + } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" @@ -243,6 +253,14 @@ ensemble_scheduling { key: "INPUT_ID" value: "_INPUT_ID" } + output_map { + key: "REQUEST_DECODER_INPUT_LEN" + value: "_REQUEST_DECODER_INPUT_LEN" + } + output_map { + key: "DECODER_INPUT_ID" + value: "_DECODER_INPUT_ID" + } output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" @@ -275,10 +293,18 @@ ensemble_scheduling { key: "input_ids" value: "_INPUT_ID" } + input_map { + key: "decoder_input_ids" + value: "_DECODER_INPUT_ID" + } input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" } + input_map { + key: "decoder_input_lengths" + value: "_REQUEST_DECODER_INPUT_LEN" + } input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py index ac42a0d..4514190 100644 --- a/triton_templates/postprocessing/1/model.py +++ b/triton_templates/postprocessing/1/model.py @@ -55,17 +55,35 @@ def initialize(self, args): model_config = json.loads(args['model_config']) # tokenizer_dir = model_config['parameters']['tokenizer_dir'][ # 'string_value'] - # self.skip_special_tokens = model_config['parameters'].get( - # 'skip_special_tokens', - # {'string_value': "true"})['string_value'].lower() in [ - # 'true', '1', 
't', 'y', 'yes' - # ] + + # skip_special_tokens = model_config['parameters'].get( + # 'skip_special_tokens') + # if skip_special_tokens is not None: + # skip_special_tokens_str = skip_special_tokens[ + # 'string_value'].lower() + # if skip_special_tokens_str in [ + # 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + # ]: + # self.skip_special_tokens = skip_special_tokens_str in [ + # 'true', '1', 't', 'y', 'yes' + # ] + # else: + # print( + # f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." + # ) + # self.skip_special_tokens = True + # else: + # print( + # f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." + # ) + # self.skip_special_tokens = True # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, # legacy=False, # padding_side='left', # trust_remote_code=True) - # self.tokenizer.pad_token = self.tokenizer.eos_token + # if not self.tokenizer.pad_token: + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( @@ -129,13 +147,13 @@ def execute(self, requests): # tokens_batch = tokens_batch.T # Postprocessing output data. - # outputs = self._postprocessing(tokens_batch, sequence_lengths) + outputs = self._postprocessing(tokens_batch, sequence_lengths) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. output_tensor = pb_utils.Tensor( 'OUTPUT', - tokens_batch) + np.array(outputs).astype(self.output_dtype)) outputs = [] outputs.append(output_tensor) diff --git a/triton_templates/postprocessing/config.pbtxt b/triton_templates/postprocessing/config.pbtxt index 67b8b8a..0ad7f23 100644 --- a/triton_templates/postprocessing/config.pbtxt +++ b/triton_templates/postprocessing/config.pbtxt @@ -101,7 +101,7 @@ parameters { parameters { key: "skip_special_tokens" value: { - string_value: "True" + string_value: "${skip_special_tokens}" } } diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py index 62ab243..ed09cd4 100644 --- a/triton_templates/preprocessing/1/model.py +++ b/triton_templates/preprocessing/1/model.py @@ -56,11 +56,27 @@ def initialize(self, args): model_config = json.loads(args['model_config']) tokenizer_dir = model_config['parameters']['tokenizer_dir'][ 'string_value'] - self.add_special_tokens = model_config['parameters'].get( - 'add_special_tokens', - {'string_value': "false"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] + + add_special_tokens = model_config['parameters'].get( + 'add_special_tokens') + if add_special_tokens is not None: + add_special_tokens_str = add_special_tokens['string_value'].lower() + if add_special_tokens_str in [ + 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + ]: + self.add_special_tokens = add_special_tokens_str in [ + 'true', '1', 't', 'y', 'yes' + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." + ) + self.add_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." 
+ ) + self.add_special_tokens = True self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, legacy=False, @@ -68,7 +84,9 @@ def initialize(self, args): trust_remote_code=True) if isinstance(self.tokenizer, T5Tokenizer): self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - self.tokenizer.pad_token = self.tokenizer.eos_token + + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer_end_id = self.tokenizer.encode( self.tokenizer.eos_token, add_special_tokens=False)[0] @@ -77,7 +95,8 @@ def initialize(self, args): # Parse model output configs and convert Triton types to numpy types output_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", + "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", "OUT_END_ID", "OUT_PAD_ID" ] input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] @@ -126,6 +145,11 @@ def execute(self, requests): # Get input tensors query = pb_utils.get_input_tensor_by_name(request, 'QUERY').as_numpy() + decoder_query = pb_utils.get_input_tensor_by_name( + request, 'DECODER_QUERY') + if decoder_query is not None: + decoder_query = decoder_query.as_numpy() + batch_dim = query.shape[0] if batch_dim != 1: @@ -178,6 +202,13 @@ def execute(self, requests): # Preprocessing input data. input_id, request_input_len = self._create_request(query) + if decoder_query is not None: + decoder_input_id, request_decoder_input_len = self._create_request( + decoder_query) + else: + decoder_input_id = pad_id * np.ones((1, 1), np.int32) + request_decoder_input_len = 1 * np.ones((1, 1), np.int32) + bad_words = self._to_word_list_format(bad_words_dict) stop_words = self._to_word_list_format(stop_words_dict) @@ -192,6 +223,13 @@ def execute(self, requests): request_input_len_tensor = pb_utils.Tensor( 'REQUEST_INPUT_LEN', request_input_len.astype(self.request_input_len_dtype)) + decoder_input_id_tensor = pb_utils.Tensor( + 'DECODER_INPUT_ID', + decoder_input_id.astype(self.decoder_input_id_dtype)) + request_decoder_input_len_tensor = pb_utils.Tensor( + 'REQUEST_DECODER_INPUT_LEN', + request_decoder_input_len.astype( + self.request_decoder_input_len_dtype)) request_output_len_tensor = pb_utils.Tensor( 'REQUEST_OUTPUT_LEN', request_output_len) bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) @@ -205,8 +243,9 @@ def execute(self, requests): np.array(pad_id, dtype=np.int32)) inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor, + input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor, + stop_words_ids_tensor, request_input_len_tensor, + request_decoder_input_len_tensor, request_output_len_tensor, embedding_bias_tensor, end_id_tensor, pad_id_tensor ]) responses.append(inference_response) diff --git a/triton_templates/preprocessing/config.pbtxt b/triton_templates/preprocessing/config.pbtxt index ca92187..165134c 100644 --- a/triton_templates/preprocessing/config.pbtxt +++ b/triton_templates/preprocessing/config.pbtxt @@ -33,6 +33,12 @@ input [ data_type: TYPE_STRING dims: [ -1 ] }, + { + name: "DECODER_QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 @@ -86,6 +92,16 @@ output [ data_type: TYPE_INT32 dims: [ 1 ] }, + { + name: "DECODER_INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_DECODER_INPUT_LEN" + 
data_type: TYPE_INT32 + dims: [ 1 ] + }, { name: "BAD_WORDS_IDS" data_type: TYPE_INT32 diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt index 1c34f77..d204504 100644 --- a/triton_templates/tensorrt_llm/config.pbtxt +++ b/triton_templates/tensorrt_llm/config.pbtxt @@ -62,6 +62,20 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "decoder_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + reshape: { shape: [ ] } + }, { name: "draft_logits" data_type: TYPE_FP32 @@ -368,6 +382,12 @@ parameters: { string_value: "${engine_dir}" } } +parameters: { + key: "encoder_model_path" + value: { + string_value: "${encoder_engine_dir}" + } +} parameters: { key: "max_tokens_in_paged_kv_cache" value: { diff --git a/triton_templates/tensorrt_llm_bls/1/lib/decode.py b/triton_templates/tensorrt_llm_bls/1/lib/decode.py index aa2a6d5..de9e28b 100644 --- a/triton_templates/tensorrt_llm_bls/1/lib/decode.py +++ b/triton_templates/tensorrt_llm_bls/1/lib/decode.py @@ -58,6 +58,7 @@ def _single_value(data: Optional[np.ndarray]): @dataclass class Request: text_input: np.ndarray = np.array([]) + decoder_text_input: np.ndarray = None max_tokens: np.ndarray = np.array([]) bad_words: Optional[np.ndarray] = None stop_words: Optional[np.ndarray] = None @@ -91,17 +92,13 @@ def validate(self): num_draft_tokens = _single_value(self.num_draft_tokens) stream = _single_value(self.stream) - gen_logits = _single_value(self.return_generation_logits) + _single_value(self.return_generation_logits) context_logits = _single_value(self.return_context_logits) if num_draft_tokens: _validate_that( not stream, "streaming is not supported with speculative decoding") - _validate_that( - not gen_logits, - "generation logits are not supported with speculative decoding" - ) _validate_that( not context_logits, "context logits are not supported with speculative decoding") @@ -116,7 +113,9 @@ class DraftRequest: @dataclass class PreprocResponse: input_ids: np.ndarray = np.array([]) + decoder_input_ids: np.ndarray = None input_lengths: np.ndarray = np.array([]) + decoder_input_lengths: np.ndarray = None bad_words_list: Optional[np.ndarray] = None stop_words_list: Optional[np.ndarray] = None embedding_bias: Optional[np.ndarray] = None @@ -133,6 +132,8 @@ def with_new_inputs(cls, if input_ids is not None else other.input_ids), input_lengths=(input_lengths if input_lengths is not None else other.input_lengths), + decoder_input_ids=other.decoder_input_ids, + decoder_input_lengths=other.decoder_input_lengths, bad_words_list=other.bad_words_list, stop_words_list=other.stop_words_list, end_id=other.end_id, diff --git a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py index f0df3b8..456ded5 100644 --- a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py +++ b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py @@ -50,7 +50,9 @@ def __init__(self, self._preproc_outputs = [ "INPUT_ID", + "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", "EMBEDDING_BIAS", @@ -73,6 +75,7 @@ def __init__(self, self.input_names = [ "text_input", + "decoder_text_input", "max_tokens", "bad_words", "stop_words", @@ -217,6 +220,7 @@ def preprocess(self, request: Request) -> PreprocResponse: def _get_preproc_tensors(self, request: Request): 
name_map = { "text_input": "QUERY", + "decoder_text_input": "DECODER_QUERY", "max_tokens": "REQUEST_OUTPUT_LEN", "bad_words": "BAD_WORDS_DICT", "stop_words": "STOP_WORDS_DICT", @@ -230,7 +234,9 @@ def _get_preproc_tensors(self, request: Request): def _get_preproc_response(self, triton_output): name_map = { "INPUT_ID": "input_ids", + "DECODER_INPUT_ID": "decoder_input_ids", "REQUEST_INPUT_LEN": "input_lengths", + "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths", "BAD_WORDS_IDS": "bad_words_list", "STOP_WORDS_IDS": "stop_words_list", "EMBEDDING_BIAS": "embedding_bias", @@ -303,6 +309,7 @@ def _get_llm_tensors(self, def _get_tensors_from_preproc(self, preproc: PreprocResponse): name_map = { "input_ids": "input_ids", + "decoder_input_ids": "decoder_input_ids", "input_lengths": "input_lengths", "bad_words_list": "bad_words_list", "stop_words_list": "stop_words_list", diff --git a/triton_templates/tensorrt_llm_bls/config.pbtxt b/triton_templates/tensorrt_llm_bls/config.pbtxt index e5aff22..ba0fa58 100644 --- a/triton_templates/tensorrt_llm_bls/config.pbtxt +++ b/triton_templates/tensorrt_llm_bls/config.pbtxt @@ -38,6 +38,12 @@ input [ data_type: TYPE_STRING dims: [ -1 ] }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, { name: "max_tokens" data_type: TYPE_INT32 From 7129024c75ca879b70aafccf1593693bcc62db1e Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Fri, 19 Jul 2024 13:52:37 +0200 Subject: [PATCH 13/35] update triton_model_repo --- triton_model_repo/ensemble/config.pbtxt | 26 +++++++++++++++++++ triton_model_repo/postprocessing/config.pbtxt | 2 +- triton_model_repo/preprocessing/config.pbtxt | 16 ++++++++++++ triton_model_repo/tensorrt_llm/config.pbtxt | 20 ++++++++++++++ .../tensorrt_llm_bls/config.pbtxt | 6 +++++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/triton_model_repo/ensemble/config.pbtxt b/triton_model_repo/ensemble/config.pbtxt index 6d54df6..9a047fe 100644 --- a/triton_model_repo/ensemble/config.pbtxt +++ b/triton_model_repo/ensemble/config.pbtxt @@ -33,6 +33,12 @@ input [ data_type: TYPE_STRING dims: [ -1 ] }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, { name: "max_tokens" data_type: TYPE_INT32 @@ -207,6 +213,10 @@ ensemble_scheduling { key: "QUERY" value: "text_input" } + input_map { + key: "DECODER_QUERY" + value: "decoder_text_input" + } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" @@ -243,6 +253,14 @@ ensemble_scheduling { key: "INPUT_ID" value: "_INPUT_ID" } + output_map { + key: "REQUEST_DECODER_INPUT_LEN" + value: "_REQUEST_DECODER_INPUT_LEN" + } + output_map { + key: "DECODER_INPUT_ID" + value: "_DECODER_INPUT_ID" + } output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" @@ -275,10 +293,18 @@ ensemble_scheduling { key: "input_ids" value: "_INPUT_ID" } + input_map { + key: "decoder_input_ids" + value: "_DECODER_INPUT_ID" + } input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" } + input_map { + key: "decoder_input_lengths" + value: "_REQUEST_DECODER_INPUT_LEN" + } input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" diff --git a/triton_model_repo/postprocessing/config.pbtxt b/triton_model_repo/postprocessing/config.pbtxt index df87aeb..0ed0053 100644 --- a/triton_model_repo/postprocessing/config.pbtxt +++ b/triton_model_repo/postprocessing/config.pbtxt @@ -101,7 +101,7 @@ parameters { parameters { key: "skip_special_tokens" value: { - string_value: "True" + string_value: "${skip_special_tokens}" } } 
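As a minimal illustration (not part of the diffs above or below), the new optional "decoder_text_input" field that patch 13 wires through the ensemble can be exercised from a Triton HTTP client roughly as sketched here. The server address, the prompt text, and the choice of batch size 1 are assumptions for illustration only; the required inputs and the output name ("output_ids", an INT32 token-id tensor at this point in the series) follow the ensemble config above.

    import numpy as np
    import tritonclient.http as httpclient  # assumes tritonclient[http] is installed

    client = httpclient.InferenceServerClient(url="localhost:8000")  # assumed address

    def str_input(name, values):
        # Triton TYPE_STRING inputs are sent as BYTES tensors, one string per batch row
        arr = np.array([[v.encode("utf-8")] for v in values], dtype=object)
        t = httpclient.InferInput(name, list(arr.shape), "BYTES")
        t.set_data_from_numpy(arr)
        return t

    max_tokens = np.array([[64]], dtype=np.int32)
    max_tokens_t = httpclient.InferInput("max_tokens", list(max_tokens.shape), "INT32")
    max_tokens_t.set_data_from_numpy(max_tokens)

    inputs = [
        str_input("text_input", ["translate English to German: hello"]),  # assumed prompt
        # new optional field added in this patch; only meaningful for
        # encoder-decoder engines, decoder-only models can simply omit it
        str_input("decoder_text_input", [""]),
        str_input("bad_words", [""]),
        str_input("stop_words", [""]),
        max_tokens_t,
    ]

    result = client.infer("ensemble", inputs)
    print(result.as_numpy("output_ids"))  # token ids; decode them with the model's tokenizer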
diff --git a/triton_model_repo/preprocessing/config.pbtxt b/triton_model_repo/preprocessing/config.pbtxt index e76fec5..ec9d2b2 100644 --- a/triton_model_repo/preprocessing/config.pbtxt +++ b/triton_model_repo/preprocessing/config.pbtxt @@ -33,6 +33,12 @@ input [ data_type: TYPE_STRING dims: [ -1 ] }, + { + name: "DECODER_QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 @@ -86,6 +92,16 @@ output [ data_type: TYPE_INT32 dims: [ 1 ] }, + { + name: "DECODER_INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_DECODER_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, { name: "BAD_WORDS_IDS" data_type: TYPE_INT32 diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt index 14aab33..75c93a0 100644 --- a/triton_model_repo/tensorrt_llm/config.pbtxt +++ b/triton_model_repo/tensorrt_llm/config.pbtxt @@ -62,6 +62,20 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "decoder_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + reshape: { shape: [ ] } + }, { name: "draft_logits" data_type: TYPE_FP32 @@ -368,6 +382,12 @@ parameters: { string_value: "/src/triton_model_repo/tensorrt_llm/1/" } } +parameters: { + key: "encoder_model_path" + value: { + string_value: "${encoder_engine_dir}" + } +} parameters: { key: "max_tokens_in_paged_kv_cache" value: { diff --git a/triton_model_repo/tensorrt_llm_bls/config.pbtxt b/triton_model_repo/tensorrt_llm_bls/config.pbtxt index e8c80e8..f9639c8 100644 --- a/triton_model_repo/tensorrt_llm_bls/config.pbtxt +++ b/triton_model_repo/tensorrt_llm_bls/config.pbtxt @@ -38,6 +38,12 @@ input [ data_type: TYPE_STRING dims: [ -1 ] }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, { name: "max_tokens" data_type: TYPE_INT32 From 8a2170f5fea53bafd66acd16443d8bd357a41b68 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Sat, 20 Jul 2024 19:55:40 +0200 Subject: [PATCH 14/35] Add a $TRTLLM_PYTHON environment with pydantic 2 --- cog-trt-llm/trt_llm_builder.py | 3 +++ default.nix | 10 +++++++++- flake.nix | 15 ++++++++++++++- lock.json | 26 ++++++++++++++++++++++---- 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/cog-trt-llm/trt_llm_builder.py b/cog-trt-llm/trt_llm_builder.py index 1b49934..88ad6be 100644 --- a/cog-trt-llm/trt_llm_builder.py +++ b/cog-trt-llm/trt_llm_builder.py @@ -111,6 +111,9 @@ def _assemble_subprocess_cmd(self, executable, args, script=None): elif executable == "trtllm-build": cmd = [executable] + if "TRTLLM_PYTHON" in os.environ: + cmd[0] = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", cmd[0]) + for k, v in args.items(): cmd += ["--" + str(k)] cmd += [str(v)] if v else [] diff --git a/default.nix b/default.nix index d1024d6..63e4bb0 100644 --- a/default.nix +++ b/default.nix @@ -36,9 +36,17 @@ in # "nvidia-cudnn-cu12==${cudaPackages.cudnn.version}" "nvidia-cublas-cu12==${cudaPackages.libcublas.version}" ]; + # HACK: cog requires pydantic <2, but we do need the extra deps pydantic2 brings in overridesList = [ - "pydantic==1.10.17" + "pydantic>=2.0" ]; + drvs.pydantic = { + version = lib.mkForce "1.10.17"; + mkDerivation.src = pkgs.fetchurl { + sha256 ="371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e"; + url = 
"https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"; + }; + }; }; cognix.includeNix = true; cognix.nix.extraOptions = '' diff --git a/flake.nix b/flake.nix index 249250b..7d17b43 100644 --- a/flake.nix +++ b/flake.nix @@ -30,7 +30,7 @@ rm $out/lib/python*/site-packages/tensorrt_libs/libnvinfer_builder_resource* ''; }); - makeBuilder = name: callCognix ( { config, lib, ... }: { + makeBuilder = name: callCognix ( { config, lib, pkgs, ... }: { inherit name; # only grab deps of tensorrt-llm, omegaconf, hf-transfer cognix.python_root_packages = [ "tensorrt-llm" "omegaconf" "hf-transfer" ]; @@ -40,6 +40,19 @@ cognix.rootPath = lib.mkForce "${./cog-trt-llm}"; # this just needs the examples/ dir cognix.environment.TRTLLM_DIR = config.deps.tensorrt-llm.examples; + # HACK: cog needs pydantic v1, but trt-llm needs pydantic v2 + cognix.environment.TRTLLM_PYTHON = (config.python-env.public.extendModules { + modules = [{ + _file = ./.; + pip.drvs.pydantic = let mkMoreForce = lib.mkOverride 49; in { + version = mkMoreForce "2.8.2"; + mkDerivation.src = mkMoreForce (pkgs.fetchurl { + sha256 = "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"; + url = "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl"; + }); + }; + }]; + }).config.public.pyEnv; }); in { cog-triton-builder = makeBuilder "cog-triton-builder"; diff --git a/lock.json b/lock.json index e310635..987a84d 100644 --- a/lock.json +++ b/lock.json @@ -19,6 +19,12 @@ "url": "https://files.pythonhosted.org/packages/76/ac/a7305707cb852b7e16ff80eaf5692309bde30e2b1100a1fcacdc8f731d97/aiosignal-1.3.1-py3-none-any.whl", "version": "1.3.1" }, + "annotated-types": { + "sha256": "1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", + "type": "url", + "url": "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", + "version": "0.7.0" + }, "antlr4-python3-runtime": { "sha256": "f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b", "type": "url", @@ -494,10 +500,16 @@ "version": "17.0.0" }, "pydantic": { - "sha256": "371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e", + "sha256": "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8", + "type": "url", + "url": "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl", + "version": "2.8.2" + }, + "pydantic-core": { + "sha256": "3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98", "type": "url", - "url": "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.10.17" + "url": "https://files.pythonhosted.org/packages/ae/49/8a6fe79d35e2f3bea566d8ea0e4e6f436d4f749d7838c8e8c4c5148ae706/pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "2.20.1" }, "pygments": { "sha256": "b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", @@ -828,6 +840,7 @@ "aiosignal": [ "frozenlist" ], + "annotated-types": [], "antlr4-python3-runtime": [], "anyio": [ "exceptiongroup", @@ -1061,6 +1074,11 @@ "numpy" ], "pydantic": [ + "annotated-types", + "pydantic-core", + 
"typing-extensions" + ], + "pydantic-core": [ "typing-extensions" ], "pygments": [], @@ -1240,5 +1258,5 @@ } } }, - "invalidationHash": "a2c91999e1adbeb840cd39d43711d7c054e354dff33f419447333a2b7c828a10" + "invalidationHash": "f5f309acde79ec4808388874f81a02dd122a2dda9eb1cc24deb2a0e5db9dee1f" } \ No newline at end of file From 494d476a48a5046df5f15aaa12dcea05ff434ffa Mon Sep 17 00:00:00 2001 From: technillogue Date: Sat, 20 Jul 2024 19:48:10 +0000 Subject: [PATCH 15/35] use pytriton 0.5.8 --- default.nix | 2 +- lock.json | 20 ++++++++++---------- nix/trtllm-backend.nix | 9 +++++---- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/default.nix b/default.nix index 63e4bb0..620889b 100644 --- a/default.nix +++ b/default.nix @@ -22,7 +22,7 @@ in "tensorrt_llm==0.11.0" "tensorrt-cu12==10.1.0" "torch==2.3.1" - "nvidia-pytriton==0.5.6" # corresponds to 2.44.0 + "nvidia-pytriton==0.5.8" # corresponds to 2.46.0 "omegaconf" "hf-transfer" "tokenizers>=0.19.0" diff --git a/lock.json b/lock.json index 987a84d..b5f6faa 100644 --- a/lock.json +++ b/lock.json @@ -428,10 +428,10 @@ "version": "12.1.105" }, "nvidia-pytriton": { - "sha256": "6403e65c2bbab0ab2fe2b737ad612e2b88f3edf20d41aadd1d544ffb309a701c", + "sha256": "cd3cdfb704db3a01f857adc97fea77d5413c9f9e89f9b7add91c9d16a0bec7f8", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.6-py3-none-manylinux_2_35_x86_64.whl", - "version": "0.5.6" + "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.8-py3-none-manylinux_2_35_x86_64.whl", + "version": "0.5.8" }, "omegaconf": { "sha256": "7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", @@ -602,10 +602,10 @@ "version": "0.2.0" }, "setuptools": { - "sha256": "f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207", + "sha256": "ed2feca703be3bdbd94e6bb17365d91c6935c6b2a8d0bb09b66a2c435ba0b1a5", "type": "url", - "url": "https://files.pythonhosted.org/packages/32/10/e72bb221cdd2f11e649cf38bd7ba8ea6d527c77f330366e10ae9bb798730/setuptools-71.0.3-py3-none-any.whl", - "version": "71.0.3" + "url": "https://files.pythonhosted.org/packages/d2/93/40d9bd9c7cf5d5e556894cc2a2c492e6e4ef4eda6ce9806bd1a8c47ae351/setuptools-71.0.4-py3-none-any.whl", + "version": "71.0.4" }, "sh": { "sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d", @@ -752,10 +752,10 @@ "version": "2.2.2" }, "uvicorn": { - "sha256": "cd17daa7f3b9d7a24de3617820e634d0933b69eed8e33a516071174427238c81", + "sha256": "94a3608da0e530cea8f69683aa4126364ac18e3826b6630d1a65f4638aade503", "type": "url", - "url": "https://files.pythonhosted.org/packages/b2/f9/e6f30ba6094733e4f9794fd098ca0543a19b07ac1fa3075d595bf0f1fb60/uvicorn-0.30.1-py3-none-any.whl", - "version": "0.30.1" + "url": "https://files.pythonhosted.org/packages/63/84/2a26b4eac1cf0c6b5b176dd4346cc4912af5e1b0efc150b865e28636ac34/uvicorn-0.30.3-py3-none-any.whl", + "version": "0.30.3" }, "uvloop": { "sha256": "5a05128d315e2912791de6088c34136bfcdd0c7cbc1cf85fd6fd1bb321b7c849", @@ -1258,5 +1258,5 @@ } } }, - "invalidationHash": "f5f309acde79ec4808388874f81a02dd122a2dda9eb1cc24deb2a0e5db9dee1f" + "invalidationHash": "bec665639eefe16815084d852ef1ffcf53d34983173c86dd230f129862d12619" } \ No newline at end of file diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index 84eacb0..0f90aaa 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -19,20 +19,21 @@ let deps.triton_repo_common = fetchFromGitHub { owner = "triton-inference-server"; repo = "common"; - rev = 
"00b3a71519e32e3bc954e9f0d067e155ef8f1a6c"; - hash = "sha256-KyFicnB0716nIteSNo43RoiDzuVbj17KM4tIbmN6F+s="; + rev = "0f2072bbc2d4e85f68b10cf60c0ed4e4ebfc766b"; + hash = "sha256-7DdJ1zkHrFEImI137Gt/pDKZhBvoQu0lg2ulqA/yLFA="; }; deps.triton_repo_backend = fetchFromGitHub { owner = "triton-inference-server"; repo = "backend"; + # update for tritons after may 28, 2024 rev = "a06e9a1157d6b5b9b34b6d05a07bb84d517f17c9"; hash = "sha256-Ju2zV/jHUuciTs6GbkqcPG8U0y2lkIWSdAsX78DrpV4="; }; deps.triton_repo_core = fetchFromGitHub { owner = "triton-inference-server"; repo = "core"; - rev = "434e50313b80fdc7ef295fcb3baeeacf65b295e4"; - hash = "sha256-kfDXQEYuMze4E53OHHJ1YjQHnNtAEt4lzNK27K6ttVE="; + rev = "bbcd7816997046821f9d1a22e418acb84ca5364b"; + hash = "sha256-LWLxMvtV0VQYMQQIfztm10xzQreNAoN9zAexf+5ktHo="; }; deps.googletest = fetchFromGitHub { owner = "google"; From 5c3eded5ba24a7fa81379a5ee6bc0f1511562d40 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Sun, 21 Jul 2024 09:54:36 +0200 Subject: [PATCH 16/35] add nvidia-modelopt setuptools dependency --- default.nix | 3 +++ 1 file changed, 3 insertions(+) diff --git a/default.nix b/default.nix index 620889b..434f403 100644 --- a/default.nix +++ b/default.nix @@ -54,6 +54,9 @@ in extra-substituters = https://storage.googleapis.com/replicate-nix-cache-dev/ ''; python-env.pip.drvs = { + nvidia-modelopt.mkDerivation.propagatedBuildInputs = [ + pythonDrvs.setuptools.public + ]; # tensorrt likes doing a pip invocation from it's setup.py # circumvent by manually depending on tensorrt_libs, tensorrt_bindings # and setting this env variable From dfd19b456d02c09394dd0e8cd9c27bfffdc228cb Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Sun, 21 Jul 2024 10:37:54 +0200 Subject: [PATCH 17/35] datasets: 2.14.4 -> 2.20.0 hopefully fixes `ValueError: Invalid pattern: '**' can only be an entire path component` --- default.nix | 1 + lock.json | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/default.nix b/default.nix index 434f403..fa26f96 100644 --- a/default.nix +++ b/default.nix @@ -35,6 +35,7 @@ in constraintsList = [ # "nvidia-cudnn-cu12==${cudaPackages.cudnn.version}" "nvidia-cublas-cu12==${cudaPackages.libcublas.version}" + "datasets>2.15.0" # picks older fsspec but newer datasets ]; # HACK: cog requires pydantic <2, but we do need the extra deps pydantic2 brings in overridesList = [ diff --git a/lock.json b/lock.json index b5f6faa..8c4916c 100644 --- a/lock.json +++ b/lock.json @@ -110,10 +110,10 @@ "version": "12.5.0" }, "datasets": { - "sha256": "29336bd316a7d827ccd4da2236596279b20ca2ac78f64c04c9483da7cbc2459b", + "sha256": "76ac02e3bdfff824492e20678f0b6b1b6d080515957fe834b00c2ba8d6b18e5e", "type": "url", - "url": "https://files.pythonhosted.org/packages/66/f8/38298237d18d4b6a8ee5dfe390e97bed5adb8e01ec6f9680c0ddf3066728/datasets-2.14.4-py3-none-any.whl", - "version": "2.14.4" + "url": "https://files.pythonhosted.org/packages/60/2d/963b266bb8f88492d5ab4232d74292af8beb5b6fdae97902df9e284d4c32/datasets-2.20.0-py3-none-any.whl", + "version": "2.20.0" }, "diffusers": { "sha256": "d5e9bb13c8097b4eed10df23d1294d2e5a418f53e3f89c7ef228b5b982970428", @@ -122,10 +122,10 @@ "version": "0.29.2" }, "dill": { - "sha256": "76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e", + "sha256": "c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", "type": "url", - "url": 
"https://files.pythonhosted.org/packages/f5/3a/74a29b11cf2cdfcd6ba89c0cecd70b37cd1ba7b77978ce611eb7a146a832/dill-0.3.7-py3-none-any.whl", - "version": "0.3.7" + "url": "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", + "version": "0.3.8" }, "evaluate": { "sha256": "5fdcaf8a086b075c2b8e2c5898f501224b020b0ac7d07be76536e47e661c0c65", @@ -158,10 +158,10 @@ "version": "1.4.1" }, "fsspec": { - "sha256": "3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e", + "sha256": "e0fdbc446d67e182f49a70b82cf7889028a63588fde6b222521f10937b2b670c", "type": "url", - "url": "https://files.pythonhosted.org/packages/5e/44/73bea497ac69bafde2ee4269292fa3b41f1198f4bb7bbaaabde30ad29d4a/fsspec-2024.6.1-py3-none-any.whl", - "version": "2024.6.1" + "url": "https://files.pythonhosted.org/packages/ba/a3/16e9fe32187e9c8bc7f9b7bcd9728529faa725231a0c96f2f98714ff2fc5/fsspec-2024.5.0-py3-none-any.whl", + "version": "2024.5.0" }, "gevent": { "sha256": "ca80b121bbec76d7794fcb45e65a7eca660a76cc1a104ed439cdbd7df5f0b060", @@ -320,10 +320,10 @@ "version": "6.0.5" }, "multiprocess": { - "sha256": "7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a", + "sha256": "c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", "type": "url", - "url": "https://files.pythonhosted.org/packages/35/a8/36d8d7b3e46b377800d8dec47891cdf05842d1a2366909ae4a0c89fbc5e6/multiprocess-0.70.15-py310-none-any.whl", - "version": "0.70.15" + "url": "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", + "version": "0.70.16" }, "mypy-extensions": { "sha256": "4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d", @@ -499,6 +499,12 @@ "url": "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", "version": "17.0.0" }, + "pyarrow-hotfix": { + "sha256": "dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178", + "type": "url", + "url": "https://files.pythonhosted.org/packages/e4/f4/9ec2222f5f5f8ea04f66f184caafd991a39c8782e31f5b0266f101cb68ca/pyarrow_hotfix-0.6-py3-none-any.whl", + "version": "0.6" + }, "pydantic": { "sha256": "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8", "type": "url", @@ -879,6 +885,7 @@ "datasets": [ "aiohttp", "dill", + "filelock", "fsspec", "huggingface-hub", "multiprocess", @@ -886,6 +893,7 @@ "packaging", "pandas", "pyarrow", + "pyarrow-hotfix", "pyyaml", "requests", "tqdm", @@ -1073,6 +1081,7 @@ "pyarrow": [ "numpy" ], + "pyarrow-hotfix": [], "pydantic": [ "annotated-types", "pydantic-core", @@ -1258,5 +1267,5 @@ } } }, - "invalidationHash": "bec665639eefe16815084d852ef1ffcf53d34983173c86dd230f129862d12619" + "invalidationHash": "2084796847b80c7c6eb4d142f3f6f15b2c0fce135751e7e06ae1b0abca591744" } \ No newline at end of file From d32b60c70ee99632eae5e052282c6da24d3c9b25 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 24 Jul 2024 11:46:31 +0200 Subject: [PATCH 18/35] update cognix --- default.nix | 29 ++++++----------------------- flake.lock | 6 +++--- lock.json | 53 +++++++++++++++++++++++++++-------------------------- 3 files changed, 36 insertions(+), 52 deletions(-) diff --git a/default.nix b/default.nix index fa26f96..78e8457 100644 --- a/default.nix +++ b/default.nix @@ -2,7 +2,7 @@ let deps = config.deps; python3 = config.python-env.deps.python; - cudaPackages 
= pkgs.cudaPackages_12_1; + inherit (config.cognix) cudaPackages; site = python3.sitePackages; pythonDrvs = config.python-env.pip.drvs; inherit (pkgs) lib; @@ -30,11 +30,13 @@ in # don't ask why it needs ssh system_packages = [ "pget" "openssh" "openmpi" ]; }; + # patch in cuda packages from nixpkgs + cognix.merge-native = { + cudnn = "force"; + cublas = true; + }; python-env.pip = { - uv.enable = true; constraintsList = [ - # "nvidia-cudnn-cu12==${cudaPackages.cudnn.version}" - "nvidia-cublas-cu12==${cudaPackages.libcublas.version}" "datasets>2.15.0" # picks older fsspec but newer datasets ]; # HACK: cog requires pydantic <2, but we do need the extra deps pydantic2 brings in @@ -105,25 +107,6 @@ in done popd ''; - # patch in cuda packages from nixpkgs - nvidia-cublas-cu12.mkDerivation.postInstall = '' - pushd $out/${python3.sitePackages}/nvidia/cublas/lib - for f in ./*.so.12; do - chmod +w "$f" - rm $f - ln -s ${cudaPackages.libcublas.lib}/lib/$f ./$f - done - popd - ''; - nvidia-cudnn-cu12.mkDerivation.postInstall = '' - pushd $out/${python3.sitePackages}/nvidia/cudnn/lib - for f in ./*.so.8; do - chmod +w "$f" - rm $f - ln -s ${cudaPackages.cudnn.lib}/lib/$f ./$f - done - popd - ''; }; deps.backend_dir = pkgs.runCommand "triton_backends" {} '' mkdir $out diff --git a/flake.lock b/flake.lock index 5136b55..21ce584 100644 --- a/flake.lock +++ b/flake.lock @@ -12,11 +12,11 @@ "rust-overlay": "rust-overlay" }, "locked": { - "lastModified": 1721228311, - "narHash": "sha256-EEe5Kcno5FMFSd2aYVB2ONHFpe/9k0CX1gIFjNQgV+A=", + "lastModified": 1721813236, + "narHash": "sha256-QeN5B6hTxLXUAa88xSLIr2D+MW4/ryszZWwbCj86/ek=", "owner": "datakami", "repo": "cognix", - "rev": "8c28f745d7339c495265a85fb65da3ce5592f0ef", + "rev": "3a7c66a1e93b9badcaf4eb3e0f497b4597289c1e", "type": "github" }, "original": { diff --git a/lock.json b/lock.json index 8c4916c..2b99e09 100644 --- a/lock.json +++ b/lock.json @@ -2,10 +2,10 @@ "fetchPipMetadata": { "sources": { "accelerate": { - "sha256": "71fcf4be00872194071de561634268b71417d7f5b16b178e2fa76b6f117c52b0", + "sha256": "0a7f33d60ba09afabd028d4f0856dd19c5a734b7a596d637d9dd6e3d0eadbaf3", "type": "url", - "url": "https://files.pythonhosted.org/packages/e4/74/564f621699b049b0358f7ad83d7437f8219a5d6efb69bbfcca328b60152f/accelerate-0.32.1-py3-none-any.whl", - "version": "0.32.1" + "url": "https://files.pythonhosted.org/packages/15/33/b6b4ad5efa8b9f4275d4ed17ff8a44c97276171341ba565fdffb0e3dc5e8/accelerate-0.33.0-py3-none-any.whl", + "version": "0.33.0" }, "aiohttp": { "sha256": "c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72", @@ -206,10 +206,10 @@ "version": "3.10.0" }, "hf-transfer": { - "sha256": "2f42b89735f1cde22f2a795d1f0915741023235666be7de45879e533c7d6010c", + "sha256": "f865c33ada5bd3650c2b46e59979f2d7755c3f517f8d0facc78576a0c7d26406", "type": "url", - "url": "https://files.pythonhosted.org/packages/ce/00/a3afdb1fee4a9c28228f9962ab2ae3f3fc74380fff195022d76818e9fdac/hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.1.6" + "url": "https://files.pythonhosted.org/packages/5e/89/863f333b49603cc8d3c8862a428cc8fbaa9388ac8f076e9fa5ef3e729c3c/hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.1.8" }, "hpack": { "sha256": "84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c", @@ -236,10 +236,10 @@ "version": "0.27.0" }, "huggingface-hub": { - "sha256": "7ad92edefb93d8145c061f6df8d99df2ff85f8379ba5fac8a95aca0642afa5d7", + "sha256": 
"d3a623d0f2cbb9399299aefc85e3423fa2689f18ab9b6e1aa0f95d1793889f30", "type": "url", - "url": "https://files.pythonhosted.org/packages/57/c0/cf4435f3186655e3bafdca08cd6c794e3866f1f89ed99595504e7240b6a2/huggingface_hub-0.24.0-py3-none-any.whl", - "version": "0.24.0" + "url": "https://files.pythonhosted.org/packages/96/e6/a1fd9cccd2c08244243aeef71b61cb9b2ba26575d8fd6f7c41edc95e9de0/huggingface_hub-0.24.1-py3-none-any.whl", + "version": "0.24.1" }, "humanfriendly": { "sha256": "1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", @@ -260,10 +260,10 @@ "version": "3.7" }, "importlib-metadata": { - "sha256": "15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f", + "sha256": "3cd29f739ed65973840b068e3132135ce954c254d48b5b640484467ef7ab3c8c", "type": "url", - "url": "https://files.pythonhosted.org/packages/dc/ef/38766b2edb096260d9b1b6ad35adaa0bce3b0567abb452b21eb074af88c4/importlib_metadata-8.0.0-py3-none-any.whl", - "version": "8.0.0" + "url": "https://files.pythonhosted.org/packages/c7/f3/6bd738acf4e03b2cd8360521cf5edd398866acc1b4bc95fa9fced218e52b/importlib_metadata-8.1.0-py3-none-any.whl", + "version": "8.1.0" }, "janus": { "sha256": "2596ea5482711c1ee3ef2df6c290aaf370a13c55a007826e8f7c32d696d1d00a", @@ -374,10 +374,10 @@ "version": "12.1.105" }, "nvidia-cudnn-cu12": { - "sha256": "5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9", + "sha256": "adf4f59ed7a1341103822ed8df6e144f4d47ea8b10d9bf0ea0047ba738fd7b02", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", - "version": "8.9.2.26" + "url": "https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-8.9.7.29-py3-none-manylinux1_x86_64.whl", + "version": "8.9.7.29" }, "nvidia-cufft-cu12": { "sha256": "794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56", @@ -446,10 +446,10 @@ "version": "1.16.1" }, "optimum": { - "sha256": "8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8", + "sha256": "508bc55db3c9434f4e8d5a30c39a46ac63c4cdb45bcc5a641b6c1c77cae88d23", "type": "url", - "url": "https://files.pythonhosted.org/packages/fa/e4/f832e42a1eb9d5ac4fa6379295e05aebeae507d171babc1786bfa0210299/optimum-1.21.2-py3-none-any.whl", - "version": "1.21.2" + "url": "https://files.pythonhosted.org/packages/5d/7a/1cc655edf289cdb533b0ea1d2f382d344248a53ad21eb8e34deb4551684b/optimum-1.17.1-py3-none-any.whl", + "version": "1.17.1" }, "packaging": { "sha256": "5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", @@ -608,10 +608,10 @@ "version": "0.2.0" }, "setuptools": { - "sha256": "ed2feca703be3bdbd94e6bb17365d91c6935c6b2a8d0bb09b66a2c435ba0b1a5", + "sha256": "33874fdc59b3188304b2e7c80d9029097ea31627180896fb549c578ceb8a0855", "type": "url", - "url": "https://files.pythonhosted.org/packages/d2/93/40d9bd9c7cf5d5e556894cc2a2c492e6e4ef4eda6ce9806bd1a8c47ae351/setuptools-71.0.4-py3-none-any.whl", - "version": "71.0.4" + "url": "https://files.pythonhosted.org/packages/51/a0/ee460cc54e68afcf33190d198299c9579a5eafeadef0016ae8563237ccb6/setuptools-71.1.0-py3-none-any.whl", + "version": "71.1.0" }, "sh": { "sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d", @@ -710,10 +710,10 @@ "version": "4.66.4" }, "transformers": { - "sha256": "6d59061392d0f1da312af29c962df9017ff3c0108c681a56d1bc981004d16d24", + "sha256": "eb44b731902e062acbaff196ae4896d7cb3494ddf38275aa00a5fcfb5b34f17d", "type": "url", - "url": 
"https://files.pythonhosted.org/packages/6a/dc/23c26b7b0bce5aaccf2b767db3e9c4f5ae4331bd47688c1f2ef091b23696/transformers-4.42.4-py3-none-any.whl", - "version": "4.42.4" + "url": "https://files.pythonhosted.org/packages/e3/89/66b0d61558c971dd2c8cbe125a471603fce0a1b8850c2f4d99a07584fca2/transformers-4.43.1-py3-none-any.whl", + "version": "4.43.1" }, "triton": { "sha256": "3c84595cbe5e546b1b290d2a58b1494df5a2ef066dd890655e5b8a8a92205c33", @@ -1011,7 +1011,8 @@ "nvidia-cuda-nvrtc-cu12": [], "nvidia-cuda-runtime-cu12": [], "nvidia-cudnn-cu12": [ - "nvidia-cublas-cu12" + "nvidia-cublas-cu12", + "nvidia-cuda-nvrtc-cu12" ], "nvidia-cufft-cu12": [], "nvidia-curand-cu12": [], @@ -1267,5 +1268,5 @@ } } }, - "invalidationHash": "2084796847b80c7c6eb4d142f3f6f15b2c0fce135751e7e06ae1b0abca591744" + "invalidationHash": "f4ffe25a9fa6f00fa52f4463e58c7bfad577099959f60841a3011bfd872c3f06" } \ No newline at end of file From a2fc518f2d57b7c009b5cdaa9ed05e09683a361f Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 24 Jul 2024 14:25:09 +0200 Subject: [PATCH 19/35] tensorrt-llm: 0.11.0 -> 0.12.0.dev2024072300 --- default.nix | 10 +++++----- lock.json | 43 +++++++++++++++++++++++++----------------- nix/tensorrt-llm.nix | 12 ++++++++---- nix/trtllm-backend.nix | 8 ++++---- 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/default.nix b/default.nix index 78e8457..a02fa9e 100644 --- a/default.nix +++ b/default.nix @@ -12,15 +12,15 @@ in imports = [ ./interface.nix ]; cog.build = { python_version = "3.10"; - cog_version = "0.10.0-alpha16"; + cog_version = "0.10.0-alpha18"; cuda = "12.1"; # todo: 12.2 gpu = true; # inspiration: echo tensorrt_llm==0.10.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line python_packages = [ "--extra-index-url" "https://pypi.nvidia.com" - "tensorrt_llm==0.11.0" - "tensorrt-cu12==10.1.0" + "tensorrt_llm==0.12.0.dev2024072300" + "tensorrt-cu12==10.2.0" "torch==2.3.1" "nvidia-pytriton==0.5.8" # corresponds to 2.46.0 "omegaconf" @@ -123,8 +123,8 @@ in deps.tensorrt-src = pkgs.fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT"; - rev = "v10.0.1"; - hash = "sha256-lSEw0GM0eW2BHNBq/wTQA8v3aNueE3FT+k9F5nH1OgA="; + rev = "v10.2.0"; + hash = "sha256-Euo9VD4VTpx8XJV97IMETTAx/YkPGXiNdA39Wjp3UMU="; }; # todo: replace with lockfile deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec { diff --git a/lock.json b/lock.json index 2b99e09..b464794 100644 --- a/lock.json +++ b/lock.json @@ -86,10 +86,10 @@ "version": "3.0.0" }, "cog": { - "sha256": "0f658f2da28e37da8040d073af4f4e7a91b567a8d169f077d5afddc33793a62f", + "sha256": "abf55ed3309735b2a4fc37f51ac86ab113dcefd8eb4296c0edd5980e02efe463", "type": "url", - "url": "https://files.pythonhosted.org/packages/77/2e/440a1d358a45242b6cbabdfbd59e2f51c4106cbcc6b235b5930077929896/cog-0.10.0a16-py3-none-any.whl", - "version": "0.10.0a16" + "url": "https://files.pythonhosted.org/packages/68/8d/ca4854035294ea02a0a9ffcc11827f315953b5aa1754d68fd23b3753013a/cog-0.10.0a18-py3-none-any.whl", + "version": "0.10.0a18" }, "colored": { "sha256": "a7069673bd90a35f46cb748d012c17284a0668d2f1c06bc7a51822a2d5ad2112", @@ -661,29 +661,35 @@ "url": "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", "version": "1.13.1" }, + "tensorrt": { + "sha256": "275c45af70b52b64f8267f1e1f52c2a74419ae5e2cc0eaad65d040bdbf543031", + "type": "url", + "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.2.0.tar.gz", + 
"version": "10.2.0" + }, "tensorrt-cu12": { - "sha256": "a549e2fe472eb03b2737a708c0aef0cac9cb0be1ae46bc7dad72ec1dfc81bd19", + "sha256": "a5e387a399bd1ce727a6e0b9aa2698de4c4dadf40a91c2aa61154e9196eddc56", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.1.0.tar.gz", - "version": "10.1.0" + "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.2.0.tar.gz", + "version": "10.2.0" }, "tensorrt-cu12-bindings": { - "sha256": "91e1bd0eb348524ff209ef6b235d329983ea704b5d16f9a7ba747c08cc3c2495", + "sha256": "5f952539d64bb032a3b309635fcfe0c5fa9cccf9262f487a84f5a0c14c6717f6", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.1.0-cp310-none-manylinux_2_17_x86_64.whl", - "version": "10.1.0" + "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.2.0-cp310-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0" }, "tensorrt-cu12-libs": { - "sha256": "1ad13c26b3f441267a746df6859e44eb0e8da78d4382458d1fd2eb7675abd49f", + "sha256": "57761499fd120c03b1858f4db3fa16fa690cd785c15275c72bc4f6195f6f9d3e", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.1.0-py2.py3-none-manylinux_2_17_x86_64.whl", - "version": "10.1.0" + "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.2.0-py2.py3-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0" }, "tensorrt-llm": { - "sha256": "2bffd83b31eb36cc9429fe38ac23501537a7a42a6eabef107f1e76e28c6180bd", + "sha256": "6e4cc6b77231ddeb0e22405da3b99b45d3b186db60a61231753e245699c9fb48", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.11.0-cp310-cp310-linux_x86_64.whl", - "version": "0.11.0" + "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.12.0.dev2024072300-cp310-cp310-linux_x86_64.whl", + "version": "0.12.0.dev2024072300" }, "tokenizers": { "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e", @@ -1131,6 +1137,9 @@ "sympy": [ "mpmath" ], + "tensorrt": [ + "tensorrt-cu12" + ], "tensorrt-cu12": [ "tensorrt-cu12-bindings", "tensorrt-cu12-libs" @@ -1164,7 +1173,7 @@ "pynvml", "sentencepiece", "strenum", - "tensorrt-cu12", + "tensorrt", "torch", "transformers", "wheel" @@ -1268,5 +1277,5 @@ } } }, - "invalidationHash": "f4ffe25a9fa6f00fa52f4463e58c7bfad577099959f60841a3011bfd872c3f06" + "invalidationHash": "fb599e2ab6a67ae812038801ccbf85ac8402d919c95ced2d07f252ea5aee7d8a" } \ No newline at end of file diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index 75ea94f..c7ae2ee 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -17,14 +17,14 @@ }: stdenv.mkDerivation (o: { pname = "tensorrt_llm"; - version = "0.11.0"; + version = "0.12.0.dev2024072300"; src = fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT-LLM"; - rev = "v${o.version}"; + rev = "bca9a33b022dc6a924bf7913137feed3d28b602d"; fetchSubmodules = true; fetchLFS = true; # libtensorrt_llm_batch_manager_static.a - hash = "sha256-J2dqKjuEXVbE9HgoCzhUASZAnsn/hsC+qUTHL6uT4nU="; + hash = "sha256-d4xl6SZ1BM51DUkfFcclJYF0l3GrNWJR7S2xyTH9rs4="; }; outputs = if withPython then @@ -146,6 +146,9 @@ stdenv.mkDerivation (o: { '' mkdir -p $out rsync -a --chmod=u+w --include "tensorrt_llm/kernels/" --include "tensorrt_llm/kernels/kvCacheIndex.h" --exclude "tensorrt_llm/kernels/*" $src/cpp $out/ + pushd $src/cpp/tensorrt_llm + find . 
'(' '(' -type f -executable ')' -or -type l ')' -print0 | rsync -av --chmod=u+w --files-from=- --from0 ./ $out/cpp/tensorrt_llm/ + popd # rsync -a --chmod=u+w $src/cpp/tensorrt_llm/kernels $out/cpp/tensorrt_llm/ pushd tensorrt_llm mkdir -p $out/cpp/build/tensorrt_llm/ @@ -161,7 +164,8 @@ stdenv.mkDerivation (o: { fi done new_path=$(patchelf --print-rpath $out/cpp/build/tensorrt_llm/libtensorrt_llm.so | - sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../tensorrt_llm#') + sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN#g' | + sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../tensorrt_llm#g') patchelf --set-rpath "$new_path" $out/cpp/build/tensorrt_llm/libtensorrt_llm.so popd '' diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index 0f90aaa..93d573b 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -49,12 +49,12 @@ let in oldGccStdenv.mkDerivation rec { pname = "tensorrtllm_backend"; - version = "0.11.0"; + version = "0.12.0.dev2024072300"; src = fetchFromGitHub { owner = "triton-inference-server"; repo = "tensorrtllm_backend"; - rev = "v${version}"; - hash = "sha256-PzcFpeq7ISqmHa9TBT0lVVYNdkJWB224kRGQKF4uas8="; + rev = "693c6377983b7efc2043287db10380d3c128bffd"; + hash = "sha256-oa/OOO3pp1W/J1yqmwytwO0y25dLYixisorRcB42kUU="; }; nativeBuildInputs = [ cmake @@ -97,6 +97,6 @@ oldGccStdenv.mkDerivation rec { patchelf $out/backends/tensorrtllm/libtriton_tensorrtllm_common.so \ --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' patchelf $out/backends/tensorrtllm/trtllmExecutorWorker \ - --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' + --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib:${tensorrt-llm}/cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention:${tensorrt-llm}/cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper' ''; } From 8cc21747c4b06a0b8d1368bfa518f2304d609d1f Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 24 Jul 2024 14:33:49 +0200 Subject: [PATCH 20/35] update triton_model_repo, triton_templates --- triton_model_repo/ensemble/config.pbtxt | 63 ++++-- triton_model_repo/postprocessing/config.pbtxt | 11 + triton_model_repo/preprocessing/config.pbtxt | 46 +++- triton_model_repo/tensorrt_llm/config.pbtxt | 8 + .../tensorrt_llm_bls/config.pbtxt | 17 +- triton_templates/ensemble/config.pbtxt | 63 ++++-- triton_templates/postprocessing/1/model.py | 21 +- triton_templates/postprocessing/config.pbtxt | 11 + triton_templates/preprocessing/1/model.py | 204 +++++++++++++++--- triton_templates/preprocessing/config.pbtxt | 46 +++- triton_templates/tensorrt_llm/config.pbtxt | 8 + .../tensorrt_llm_bls/1/lib/decode.py | 47 ++-- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 71 +++--- .../tensorrt_llm_bls/config.pbtxt | 17 +- 14 files changed, 506 insertions(+), 127 deletions(-) diff --git a/triton_model_repo/ensemble/config.pbtxt b/triton_model_repo/ensemble/config.pbtxt index 9a047fe..10293bb 100644 --- a/triton_model_repo/ensemble/config.pbtxt +++ b/triton_model_repo/ensemble/config.pbtxt @@ -31,18 +31,24 @@ input [ { name: "text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] }, { name: "decoder_text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { 
+ name: "image_input" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] optional: true }, { name: "max_tokens" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "bad_words" @@ -165,22 +171,22 @@ input [ optional: true }, { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true }, { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true } ] output [ { - name: "output_ids" - data_type: TYPE_INT32 + name: "text_output" + data_type: TYPE_STRING dims: [ -1 ] }, { @@ -202,6 +208,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] ensemble_scheduling { @@ -217,6 +228,10 @@ ensemble_scheduling { key: "DECODER_QUERY" value: "decoder_text_input" } + input_map { + key: "IMAGE" + value: "image_input" + } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" @@ -245,6 +260,10 @@ ensemble_scheduling { key: "PAD_ID" value: "pad_id" } + input_map { + key: "PROMPT_EMBEDDING_TABLE" + value: "prompt_embedding_table" + } output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" @@ -285,6 +304,10 @@ ensemble_scheduling { key: "OUT_PAD_ID" value: "_PREPROCESSOR_PAD_ID" } + output_map { + key: "OUT_PROMPT_EMBEDDING_TABLE" + value: "out_prompt_embedding_table" + } }, { model_name: "tensorrt_llm" @@ -379,7 +402,7 @@ ensemble_scheduling { } input_map { key: "prompt_embedding_table" - value: "prompt_embedding_table" + value: "out_prompt_embedding_table" } input_map { key: "prompt_vocab_size" @@ -416,6 +439,10 @@ ensemble_scheduling { output_map { key: "generation_logits" value: "_GENERATION_LOGITS" + }, + output_map { + key: "batch_index" + value: "_BATCH_INDEX" } }, { @@ -445,6 +472,10 @@ ensemble_scheduling { key: "SEQUENCE_LENGTH" value: "_SEQUENCE_LENGTH" } + input_map { + key: "BATCH_INDEX" + value: "_BATCH_INDEX" + } output_map { key: "OUTPUT" value: "output_ids" @@ -465,6 +496,10 @@ ensemble_scheduling { key: "OUT_GENERATION_LOGITS" value: "generation_logits" } + output_map { + key: "OUT_BATCH_INDEX" + value: "batch_index" + } } ] } diff --git a/triton_model_repo/postprocessing/config.pbtxt b/triton_model_repo/postprocessing/config.pbtxt index 0ed0053..a599265 100644 --- a/triton_model_repo/postprocessing/config.pbtxt +++ b/triton_model_repo/postprocessing/config.pbtxt @@ -61,6 +61,12 @@ input [ data_type: TYPE_FP32 dims: [ -1, -1, -1 ] optional: true + }, + { + name: "BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true } ] output [ @@ -88,6 +94,11 @@ output [ name: "OUT_GENERATION_LOGITS" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "OUT_BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] } ] diff --git a/triton_model_repo/preprocessing/config.pbtxt b/triton_model_repo/preprocessing/config.pbtxt index ec9d2b2..9b0348c 100644 --- a/triton_model_repo/preprocessing/config.pbtxt +++ b/triton_model_repo/preprocessing/config.pbtxt @@ -31,18 +31,24 @@ input [ { name: "QUERY" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] }, { name: "DECODER_QUERY" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { + name: "IMAGE" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] optional: true }, { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "BAD_WORDS_DICT" @@ -71,14 
+77,21 @@ input [ { name: "END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] optional: true }, { name: "PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { + name: "PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] optional: true + allow_ragged_batch: true } ] output [ @@ -125,12 +138,17 @@ output [ { name: "OUT_END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "OUT_PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "OUT_PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] } ] @@ -148,6 +166,20 @@ parameters { } } +parameters { + key: "visual_model_path" + value: { + string_value: "${visual_model_path}" + } +} + +parameters: { + key: "gpt_model_path" + value: { + string_value: "${engine_dir}" + } +} + instance_group [ { count: 64 diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt index 75c93a0..e2a43c9 100644 --- a/triton_model_repo/tensorrt_llm/config.pbtxt +++ b/triton_model_repo/tensorrt_llm/config.pbtxt @@ -54,6 +54,7 @@ input [ name: "request_output_len" data_type: TYPE_INT32 dims: [ 1 ] + reshape: { shape: [ ] } }, { name: "draft_input_ids" @@ -255,12 +256,14 @@ input [ name: "stop" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { name: "streaming" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { @@ -350,6 +353,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] instance_group [ diff --git a/triton_model_repo/tensorrt_llm_bls/config.pbtxt b/triton_model_repo/tensorrt_llm_bls/config.pbtxt index f9639c8..45c9460 100644 --- a/triton_model_repo/tensorrt_llm_bls/config.pbtxt +++ b/triton_model_repo/tensorrt_llm_bls/config.pbtxt @@ -36,18 +36,24 @@ input [ { name: "text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] }, { name: "decoder_text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { + name: "image_input" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] optional: true }, { name: "max_tokens" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "bad_words" @@ -223,6 +229,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] diff --git a/triton_templates/ensemble/config.pbtxt b/triton_templates/ensemble/config.pbtxt index 853818e..4f4a245 100644 --- a/triton_templates/ensemble/config.pbtxt +++ b/triton_templates/ensemble/config.pbtxt @@ -31,18 +31,24 @@ input [ { name: "text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] }, { name: "decoder_text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { + name: "image_input" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] optional: true }, { name: "max_tokens" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "bad_words" @@ -165,22 +171,22 @@ input [ optional: true }, { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true }, { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true } ] output [ { - name: "output_ids" - data_type: TYPE_INT32 + name: "text_output" + 
data_type: TYPE_STRING dims: [ -1 ] }, { @@ -202,6 +208,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] ensemble_scheduling { @@ -217,6 +228,10 @@ ensemble_scheduling { key: "DECODER_QUERY" value: "decoder_text_input" } + input_map { + key: "IMAGE" + value: "image_input" + } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" @@ -245,6 +260,10 @@ ensemble_scheduling { key: "PAD_ID" value: "pad_id" } + input_map { + key: "PROMPT_EMBEDDING_TABLE" + value: "prompt_embedding_table" + } output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" @@ -285,6 +304,10 @@ ensemble_scheduling { key: "OUT_PAD_ID" value: "_PREPROCESSOR_PAD_ID" } + output_map { + key: "OUT_PROMPT_EMBEDDING_TABLE" + value: "out_prompt_embedding_table" + } }, { model_name: "tensorrt_llm" @@ -379,7 +402,7 @@ ensemble_scheduling { } input_map { key: "prompt_embedding_table" - value: "prompt_embedding_table" + value: "out_prompt_embedding_table" } input_map { key: "prompt_vocab_size" @@ -416,6 +439,10 @@ ensemble_scheduling { output_map { key: "generation_logits" value: "_GENERATION_LOGITS" + }, + output_map { + key: "batch_index" + value: "_BATCH_INDEX" } }, { @@ -445,6 +472,10 @@ ensemble_scheduling { key: "SEQUENCE_LENGTH" value: "_SEQUENCE_LENGTH" } + input_map { + key: "BATCH_INDEX" + value: "_BATCH_INDEX" + } output_map { key: "OUTPUT" value: "output_ids" @@ -465,6 +496,10 @@ ensemble_scheduling { key: "OUT_GENERATION_LOGITS" value: "generation_logits" } + output_map { + key: "OUT_BATCH_INDEX" + value: "batch_index" + } } ] } diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py index 4514190..9c68429 100644 --- a/triton_templates/postprocessing/1/model.py +++ b/triton_templates/postprocessing/1/model.py @@ -142,6 +142,10 @@ def execute(self, requests): generation_logits = pb_utils.get_input_tensor_by_name( request, 'GENERATION_LOGITS') + # Get the batch index + batch_index = pb_utils.get_input_tensor_by_name( + request, 'BATCH_INDEX') + # Reshape Input # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) # tokens_batch = tokens_batch.T @@ -197,6 +201,15 @@ def execute(self, requests): np.array([[[[0.0]]]], dtype=np.float32)) outputs.append(out_generation_logits) + if batch_index: + out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX', + batch_index.as_numpy()) + outputs.append(out_batch_index) + else: + out_batch_index = pb_utils.Tensor( + 'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32)) + outputs.append(out_batch_index) + # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. 
# Below is an example of how you can set errors in inference @@ -224,8 +237,14 @@ def finalize(self): # for batch_idx, beam_tokens in enumerate(tokens_batch): # for beam_idx, tokens in enumerate(beam_tokens): # seq_len = sequence_lengths[batch_idx][beam_idx] + # # Exclude fake ids in multimodal models + # fake_id_len = 0 + # for i in range(seq_len): + # if tokens[i] < self.tokenizer.vocab_size: + # fake_id_len = i + # break # output = self.tokenizer.decode( - # tokens[:seq_len], + # tokens[fake_id_len:seq_len], # skip_special_tokens=self.skip_special_tokens) # outputs.append(output.encode('utf8')) # return outputs diff --git a/triton_templates/postprocessing/config.pbtxt b/triton_templates/postprocessing/config.pbtxt index 0ad7f23..2ebda5e 100644 --- a/triton_templates/postprocessing/config.pbtxt +++ b/triton_templates/postprocessing/config.pbtxt @@ -61,6 +61,12 @@ input [ data_type: TYPE_FP32 dims: [ -1, -1, -1 ] optional: true + }, + { + name: "BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true } ] output [ @@ -88,6 +94,11 @@ output [ name: "OUT_GENERATION_LOGITS" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "OUT_BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] } ] diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py index ed09cd4..7bfddf9 100644 --- a/triton_templates/preprocessing/1/model.py +++ b/triton_templates/preprocessing/1/model.py @@ -25,10 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json +import os from typing import List import numpy as np +import tensorrt as trt +import torch import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack from transformers import AutoTokenizer, T5Tokenizer @@ -59,6 +63,11 @@ def initialize(self, args): add_special_tokens = model_config['parameters'].get( 'add_special_tokens') + visual_model_path = model_config['parameters']['visual_model_path'][ + 'string_value'] + if visual_model_path == "${visual_model_path}" or visual_model_path == "": + visual_model_path = None + if add_special_tokens is not None: add_special_tokens_str = add_special_tokens['string_value'].lower() if add_special_tokens_str in [ @@ -93,11 +102,51 @@ def initialize(self, args): self.tokenizer_pad_id = self.tokenizer.encode( self.tokenizer.pad_token, add_special_tokens=False)[0] + self.visual_engine = None + self.visual_context = None + self.stream = None + self.vocab_size = None + self.dtype = None + if visual_model_path is not None: + llm_model_path = model_config['parameters']['gpt_model_path'][ + 'string_value'] + llm_model_path = os.path.join(llm_model_path, 'config.json') + + vision_encoder_path = os.path.join(visual_model_path, + 'visual_encoder.engine') + with open(vision_encoder_path, 'rb') as f: + engine_buffer = f.read() + + self.stream = torch.cuda.Stream() + torch.cuda.set_stream(self.stream) + + trt_logger = trt.Logger(trt.Logger.WARNING) + visual_runtime = trt.Runtime(trt_logger) + if engine_buffer is not None: + self.visual_engine = visual_runtime.deserialize_cuda_engine( + engine_buffer) + self.visual_context = self.visual_engine.create_execution_context() + self.visual_context.set_optimization_profile_async( + 0, self.stream.cuda_stream) + + assert self.visual_engine.get_tensor_dtype( + 'input' + ) == trt.float16 and self.visual_engine.get_tensor_dtype( + 'output' + ) == trt.float16 and self.visual_engine.num_io_tensors == 2, "Please use the model built in examples/multimodal." 
+ + self.stream.synchronize() + + with open(llm_model_path, 'r') as f: + llm_model_config = json.load(f) + self.vocab_size = int( + llm_model_config["pretrained_config"]["vocab_size"]) + # Parse model output configs and convert Triton types to numpy types output_names = [ "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", - "OUT_END_ID", "OUT_PAD_ID" + "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_EMBEDDING_TABLE" ] input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] for input_name in input_names: @@ -145,21 +194,33 @@ def execute(self, requests): # Get input tensors query = pb_utils.get_input_tensor_by_name(request, 'QUERY').as_numpy() + batch_size = query.shape[0] + decoder_query = pb_utils.get_input_tensor_by_name( request, 'DECODER_QUERY') if decoder_query is not None: decoder_query = decoder_query.as_numpy() - batch_dim = query.shape[0] - if batch_dim != 1: + image = pb_utils.get_input_tensor_by_name(request, 'IMAGE') + if image is not None: + image = from_dlpack(image.to_dlpack()).cuda().half() + if self.visual_engine is None: + err_str = "Images cannot be processed without a vision model." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue - err_str = "Inflight batching backend expects requests with batch size of 1." - logger.log_error(err_str) - responses.append( - pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(err_str))) - continue + if image.shape[0] != batch_size: + err_str = "Query and Image have different batch sizes." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue request_output_len = pb_utils.get_input_tensor_by_name( request, 'REQUEST_OUTPUT_LEN').as_numpy() @@ -184,13 +245,65 @@ def execute(self, requests): if embedding_bias_weights is not None: embedding_bias_weights = embedding_bias_weights.as_numpy() + prompt_embedding_table_tensor = pb_utils.get_input_tensor_by_name( + request, 'PROMPT_EMBEDDING_TABLE') + if prompt_embedding_table_tensor is not None: + prompt_embedding_table = prompt_embedding_table_tensor.as_numpy( + ) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', prompt_embedding_table) + + if image is not None and prompt_embedding_table_tensor is not None: + + err_str = "Image and prompt table cannot be provided simultaneously." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + + visual_output = None + if image is not None: + ok = self.visual_context.set_input_shape('input', image.shape) + if not ok: + err_str = "Image has wrong shape." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.visual_context.set_tensor_address('input', + image.data_ptr()) + + visual_output_shape = self.visual_context.get_tensor_shape( + 'output') + visual_output = torch.empty(tuple(visual_output_shape), + dtype=torch.float16, + device=image.device) + self.visual_context.set_tensor_address( + 'output', visual_output.data_ptr()) + + ok = self.visual_context.execute_async_v3( + self.stream.cuda_stream) + if not ok: + err_str = "Runtime execution failed for vision encoder model." 
+ logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.stream.synchronize() + # Take the end_id from the input tensors # If not specified, use tokenizer to get end_id end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID') if end_id is not None: end_id = end_id.as_numpy() else: - end_id = [[self.tokenizer_end_id]] + end_id = [[self.tokenizer_end_id]] * batch_size # Take the pad_id from the input tensors # If not specified, use tokenizer to get pad_id @@ -198,23 +311,31 @@ def execute(self, requests): if pad_id is not None: pad_id = pad_id.as_numpy() else: - pad_id = [[self.tokenizer_pad_id]] + pad_id = [[self.tokenizer_pad_id]] * batch_size # Preprocessing input data. - input_id, request_input_len = self._create_request(query) + input_id, request_input_len = self._create_request( + query, visual_output) if decoder_query is not None: decoder_input_id, request_decoder_input_len = self._create_request( decoder_query) else: - decoder_input_id = pad_id * np.ones((1, 1), np.int32) - request_decoder_input_len = 1 * np.ones((1, 1), np.int32) + decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32) + request_decoder_input_len = 1 * np.ones( + (batch_size, 1), np.int32) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) + bad_words = self._to_word_list_format(bad_words_dict, batch_size) + stop_words = self._to_word_list_format(stop_words_dict, batch_size) embedding_bias = self._get_embedding_bias( embedding_bias_words, embedding_bias_weights, - self.embedding_bias_weights_dtype) + self.embedding_bias_weights_dtype, batch_size) + + if image is not None: + prompt_table = np.array(visual_output.cpu()) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', + prompt_table.astype(self.out_prompt_embedding_table_dtype)) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. @@ -242,12 +363,27 @@ def execute(self, requests): pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID', np.array(pad_id, dtype=np.int32)) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor, - stop_words_ids_tensor, request_input_len_tensor, - request_decoder_input_len_tensor, request_output_len_tensor, - embedding_bias_tensor, end_id_tensor, pad_id_tensor - ]) + if prompt_embedding_table_tensor is not None: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor, + prompt_embedding_table_tensor + ]) + else: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor + ]) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. 
Length @@ -261,7 +397,7 @@ def finalize(self): """ print('Cleaning up...') - def _create_request(self, query): + def _create_request(self, query, visual_features): """ query : batch string (2D numpy array) """ @@ -279,6 +415,14 @@ def _create_request(self, query): add_special_tokens=self.add_special_tokens)).astype( int) for s in query ] + if visual_features is not None: + fake_prompt_id = np.arange( + self.vocab_size, self.vocab_size + visual_features.shape[1]) + start_ids = [ + np.concatenate((fake_prompt_id, ids), axis=0) + for ids in start_ids + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) max_len = 0 @@ -293,7 +437,8 @@ def _create_request(self, query): return start_ids, start_lengths - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): + def _to_word_list_format(self, word_lists: List[List[str | bytes]], + batch_size): ''' word_lists format: len(word_lists) == batch_size @@ -303,7 +448,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if word_lists is None: # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") + return np.empty([batch_size, 2, 0], dtype="int32") flat_ids = [] offsets = [] @@ -337,12 +482,13 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): (1, 0, 2)) def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights, - bias_dtype): + bias_dtype, batch_size): assert self.tokenizer != None, "need to set tokenizer" if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) + return np.empty([batch_size, 0], + dtype=self.embedding_bias_weights_dtype) batch_embedding_bias = [] for words, weights in zip(embedding_bias_words, diff --git a/triton_templates/preprocessing/config.pbtxt b/triton_templates/preprocessing/config.pbtxt index 165134c..75d49d5 100644 --- a/triton_templates/preprocessing/config.pbtxt +++ b/triton_templates/preprocessing/config.pbtxt @@ -31,18 +31,24 @@ input [ { name: "QUERY" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] }, { name: "DECODER_QUERY" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { + name: "IMAGE" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] optional: true }, { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "BAD_WORDS_DICT" @@ -71,14 +77,21 @@ input [ { name: "END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] optional: true }, { name: "PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { + name: "PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] optional: true + allow_ragged_batch: true } ] output [ @@ -125,12 +138,17 @@ output [ { name: "OUT_END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "OUT_PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "OUT_PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] } ] @@ -148,6 +166,20 @@ parameters { } } +parameters { + key: "visual_model_path" + value: { + string_value: "${visual_model_path}" + } +} + +parameters: { + key: "gpt_model_path" + value: { + string_value: "${engine_dir}" + } +} + instance_group [ { count: ${preprocessing_instance_count} diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt index d204504..81aedb2 100644 --- a/triton_templates/tensorrt_llm/config.pbtxt +++ b/triton_templates/tensorrt_llm/config.pbtxt @@ -54,6 +54,7 @@ input [ 
name: "request_output_len" data_type: TYPE_INT32 dims: [ 1 ] + reshape: { shape: [ ] } }, { name: "draft_input_ids" @@ -255,12 +256,14 @@ input [ name: "stop" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { name: "streaming" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { @@ -350,6 +353,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] instance_group [ diff --git a/triton_templates/tensorrt_llm_bls/1/lib/decode.py b/triton_templates/tensorrt_llm_bls/1/lib/decode.py index de9e28b..eb4e1b9 100644 --- a/triton_templates/tensorrt_llm_bls/1/lib/decode.py +++ b/triton_templates/tensorrt_llm_bls/1/lib/decode.py @@ -59,6 +59,7 @@ def _single_value(data: Optional[np.ndarray]): class Request: text_input: np.ndarray = np.array([]) decoder_text_input: np.ndarray = None + image_input: Optional[np.ndarray] = None max_tokens: np.ndarray = np.array([]) bad_words: Optional[np.ndarray] = None stop_words: Optional[np.ndarray] = None @@ -91,13 +92,12 @@ def validate(self): "max_tokens must be a single value > 0") num_draft_tokens = _single_value(self.num_draft_tokens) - stream = _single_value(self.stream) _single_value(self.return_generation_logits) context_logits = _single_value(self.return_context_logits) if num_draft_tokens: _validate_that( - not stream, + not self.stream.any(), "streaming is not supported with speculative decoding") _validate_that( not context_logits, @@ -121,24 +121,24 @@ class PreprocResponse: embedding_bias: Optional[np.ndarray] = None end_id: Optional[np.ndarray] = None pad_id: Optional[np.ndarray] = None + prompt_embedding_table: Optional[np.ndarray] = None @classmethod def with_new_inputs(cls, other, input_ids: Optional[np.ndarray] = None, input_lengths: Optional[np.ndarray] = None): - return cls( - input_ids=(input_ids - if input_ids is not None else other.input_ids), - input_lengths=(input_lengths if input_lengths is not None else - other.input_lengths), - decoder_input_ids=other.decoder_input_ids, - decoder_input_lengths=other.decoder_input_lengths, - bad_words_list=other.bad_words_list, - stop_words_list=other.stop_words_list, - end_id=other.end_id, - pad_id=other.pad_id, - ) + return cls(input_ids=(input_ids + if input_ids is not None else other.input_ids), + input_lengths=(input_lengths if input_lengths is not None + else other.input_lengths), + decoder_input_ids=other.decoder_input_ids, + decoder_input_lengths=other.decoder_input_lengths, + bad_words_list=other.bad_words_list, + stop_words_list=other.stop_words_list, + end_id=other.end_id, + pad_id=other.pad_id, + prompt_embedding_table=other.prompt_embedding_table) @dataclass @@ -149,6 +149,7 @@ class GenerationResponse: output_log_probs: Optional[np.ndarray] = None context_logits: Optional[np.ndarray] = None generation_logits: Optional[np.ndarray] = None + batch_index: Optional[np.ndarray] = None @dataclass @@ -158,6 +159,7 @@ class Response: output_log_probs: Optional[np.ndarray] = None context_logits: Optional[np.ndarray] = None generation_logits: Optional[np.ndarray] = None + batch_index: Optional[np.ndarray] = None def __eq__(self, o) -> bool: """Just for testing""" @@ -166,8 +168,9 @@ def __eq__(self, o) -> bool: return (np.array_equal(self.text_output, o.text_output) and np.array_equal(self.cum_log_probs, o.cum_log_probs) and np.array_equal(self.output_log_probs, o.output_log_probs) - and np.array_equal(self.context_logits, o.context_logits) and - 
np.array_equal(self.generation_logits, o.generation_logits)) + and np.array_equal(self.context_logits, o.context_logits) + and np.array_equal(self.generation_logits, o.generation_logits) + and np.array_equal(self.batch_index, o.batch_index)) class Decoder: @@ -181,13 +184,19 @@ def __init__(self, streaming=False, accumulate=False): def decode(self, request: Request, speculative_decoding=False) -> Generator[Response, None, None]: + + batch_size = request.text_input.shape[0] preproc_response = self.preprocess(request) if speculative_decoding: + if batch_size > 1: + raise Exception( + "speculative decoding is not supported with batch size > 1" + ) for gen_response in self._spec_generate(preproc_response, request): yield self.postprocess(gen_response) else: - if not self._streaming: + if not self._streaming and batch_size == 1: gen_response = self._generate_non_streaming( preproc_response, request) yield self.postprocess(gen_response) @@ -205,6 +214,10 @@ def _spec_generate( self, preproc: PreprocResponse, request: Request) -> Generator[GenerationResponse, None, None]: + if preproc.input_ids.shape[0] > 1: + raise Exception( + "Speculative decoding does not support batch size > 1.") + prompt_input_ids: np.ndarray = preproc.input_ids[0] input_ids: np.ndarray = prompt_input_ids output_len: int = request.max_tokens[0][0] diff --git a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py index 456ded5..9c8a4e9 100644 --- a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py +++ b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py @@ -58,15 +58,13 @@ def __init__(self, "EMBEDDING_BIAS", "OUT_PAD_ID", "OUT_END_ID", + "OUT_PROMPT_EMBEDDING_TABLE", ] self._llm_outputs = [ - "output_ids", - "sequence_length", - "cum_log_probs", - "output_log_probs", - "context_logits", - "generation_logits", + "output_ids", "sequence_length", "cum_log_probs", + "output_log_probs", "context_logits", "generation_logits", + "batch_index" ] self._postproc_outputs = [ @@ -76,6 +74,7 @@ def __init__(self, self.input_names = [ "text_input", "decoder_text_input", + "image_input", "max_tokens", "bad_words", "stop_words", @@ -145,7 +144,8 @@ def create_triton_response(self, response: Response): "cum_log_probs": "cum_log_probs", "output_log_probs": "output_log_probs", "context_logits": "context_logits", - "generation_logits": "generation_logits" + "generation_logits": "generation_logits", + "batch_index": "batch_index" } tensors = self.create_triton_tensors(response, name_map) return pb_utils.InferenceResponse(output_tensors=tensors) @@ -221,6 +221,7 @@ def _get_preproc_tensors(self, request: Request): name_map = { "text_input": "QUERY", "decoder_text_input": "DECODER_QUERY", + "image_input": "IMAGE", "max_tokens": "REQUEST_OUTPUT_LEN", "bad_words": "BAD_WORDS_DICT", "stop_words": "STOP_WORDS_DICT", @@ -242,6 +243,7 @@ def _get_preproc_response(self, triton_output): "EMBEDDING_BIAS": "embedding_bias", "OUT_PAD_ID": "pad_id", "OUT_END_ID": "end_id", + "OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table", } return self.convert_triton_response(triton_output, PreprocResponse, name_map) @@ -316,6 +318,7 @@ def _get_tensors_from_preproc(self, preproc: PreprocResponse): "embedding_bias": "embedding_bias", "pad_id": "pad_id", "end_id": "end_id", + "prompt_embedding_table": "prompt_embedding_table", } return self.create_triton_tensors(preproc, name_map) @@ -340,23 +343,28 @@ def _get_llm_tensors_from_request( "prompt_embedding_table": "prompt_embedding_table", 
"prompt_vocab_size": "prompt_vocab_size", } + batch_size = request.text_input.shape[0] tensors = self.create_triton_tensors(request, name_map) - - out_len = request.max_tokens[0][0] if request.max_tokens else None - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: + if request.max_tokens is not None: tensors.append( pb_utils.Tensor("request_output_len", - np.array([[out_len]], dtype=np.int32))) + np.array(request.max_tokens, dtype=np.int32))) + else: + out_len = None + if num_output_tokens is not None: + out_len = num_output_tokens + elif draft_request: + if draft_request.draft_input_ids is not None: + out_len = len(draft_request.draft_input_ids[0]) + 1 + else: + out_len = 1 + + if out_len is None: + raise Exception("Could not determine request_output_len") + else: + tensors.append( + pb_utils.Tensor("request_output_len", + np.array([[out_len]], dtype=np.int32))) if draft_request: if draft_request.draft_input_ids is not None: @@ -369,24 +377,21 @@ def _get_llm_tensors_from_request( pb_utils.Tensor("draft_logits", draft_request.draft_logits)) - return_context_logits = False - return_generation_logits = False + return_context_logits = [[False]] * batch_size + return_generation_logits = [[False]] * batch_size if draft_request is None: if is_draft_model_request: - return_generation_logits = request.use_draft_logits[ - 0] if request.use_draft_logits is not None else False + return_generation_logits = request.use_draft_logits if request.use_draft_logits is not None else return_generation_logits else: - return_context_logits = request.return_context_logits[ - 0] if request.return_context_logits is not None else False - return_generation_logits = request.return_generation_logits[ - 0] if request.return_generation_logits is not None else False + return_context_logits = request.return_context_logits if request.return_context_logits is not None else return_context_logits + return_generation_logits = request.return_generation_logits if request.return_generation_logits is not None else return_generation_logits tensors.append( pb_utils.Tensor("return_context_logits", - np.array([[return_context_logits]]))) + np.array(return_context_logits))) tensors.append( pb_utils.Tensor("return_generation_logits", - np.array([[return_generation_logits]]))) + np.array(return_generation_logits))) return tensors def _get_llm_response(self, triton_output): @@ -397,6 +402,7 @@ def _get_llm_response(self, triton_output): "output_log_probs": "output_log_probs", "context_logits": "context_logits", "generation_logits": "generation_logits", + "batch_index": "batch_index", } return self.convert_triton_response(triton_output, GenerationResponse, name_map) @@ -436,5 +442,6 @@ def _get_response(self, triton_output, gen_res: GenerationResponse): cum_log_probs=gen_res.cum_log_probs, output_log_probs=gen_res.output_log_probs, context_logits=gen_res.context_logits, - generation_logits=gen_res.generation_logits) + generation_logits=gen_res.generation_logits, + batch_index=gen_res.batch_index) return response diff --git a/triton_templates/tensorrt_llm_bls/config.pbtxt b/triton_templates/tensorrt_llm_bls/config.pbtxt index ba0fa58..da84b98 100644 --- a/triton_templates/tensorrt_llm_bls/config.pbtxt +++ b/triton_templates/tensorrt_llm_bls/config.pbtxt @@ -36,18 +36,24 @@ input [ { name: 
"text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] }, { name: "decoder_text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + optional: true + }, + { + name: "image_input" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] optional: true }, { name: "max_tokens" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "bad_words" @@ -223,6 +229,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] From 92000d528a0e36dc8176417bead5021e216e8b49 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 24 Jul 2024 18:01:25 +0200 Subject: [PATCH 21/35] don't add 2 copies of tensorrt-llm in builder --- flake.nix | 4 +++- lock.json | 18 +++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/flake.nix b/flake.nix index 7d17b43..e54d93b 100644 --- a/flake.nix +++ b/flake.nix @@ -33,7 +33,8 @@ makeBuilder = name: callCognix ( { config, lib, pkgs, ... }: { inherit name; # only grab deps of tensorrt-llm, omegaconf, hf-transfer - cognix.python_root_packages = [ "tensorrt-llm" "omegaconf" "hf-transfer" ]; + cognix.python_root_packages = [ "omegaconf" "hf-transfer" "transformers" "torch" ]; + # override cog.yaml: cog.concurrency.max = lib.mkForce 1; @@ -44,6 +45,7 @@ cognix.environment.TRTLLM_PYTHON = (config.python-env.public.extendModules { modules = [{ _file = ./.; + pip.rootDependencies = lib.mkOverride 49 { tensorrt-llm = true; hf-transfer = true; }; pip.drvs.pydantic = let mkMoreForce = lib.mkOverride 49; in { version = mkMoreForce "2.8.2"; mkDerivation.src = mkMoreForce (pkgs.fetchurl { diff --git a/lock.json b/lock.json index b464794..6564584 100644 --- a/lock.json +++ b/lock.json @@ -236,10 +236,10 @@ "version": "0.27.0" }, "huggingface-hub": { - "sha256": "d3a623d0f2cbb9399299aefc85e3423fa2689f18ab9b6e1aa0f95d1793889f30", + "sha256": "abdf3244d3a274c4b1fbc5c4a1ef700032b3f60ba93cc63e4f036fd082aa2805", "type": "url", - "url": "https://files.pythonhosted.org/packages/96/e6/a1fd9cccd2c08244243aeef71b61cb9b2ba26575d8fd6f7c41edc95e9de0/huggingface_hub-0.24.1-py3-none-any.whl", - "version": "0.24.1" + "url": "https://files.pythonhosted.org/packages/93/14/6a82b1c2eab5a828f7d3d675811660eb68424e8b039191f418a94e8d9726/huggingface_hub-0.24.2-py3-none-any.whl", + "version": "0.24.2" }, "humanfriendly": { "sha256": "1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", @@ -260,10 +260,10 @@ "version": "3.7" }, "importlib-metadata": { - "sha256": "3cd29f739ed65973840b068e3132135ce954c254d48b5b640484467ef7ab3c8c", + "sha256": "11901fa0c2f97919b288679932bb64febaeacf289d18ac84dd68cb2e74213369", "type": "url", - "url": "https://files.pythonhosted.org/packages/c7/f3/6bd738acf4e03b2cd8360521cf5edd398866acc1b4bc95fa9fced218e52b/importlib_metadata-8.1.0-py3-none-any.whl", - "version": "8.1.0" + "url": "https://files.pythonhosted.org/packages/82/47/bb25ec04985d0693da478797c3d8c1092b140f3a53ccb984fbbd38affa5b/importlib_metadata-8.2.0-py3-none-any.whl", + "version": "8.2.0" }, "janus": { "sha256": "2596ea5482711c1ee3ef2df6c290aaf370a13c55a007826e8f7c32d696d1d00a", @@ -716,10 +716,10 @@ "version": "4.66.4" }, "transformers": { - "sha256": "eb44b731902e062acbaff196ae4896d7cb3494ddf38275aa00a5fcfb5b34f17d", + "sha256": "283c8b47cf38640c5c0caea60be0dfa948669fa48e9739b03717cbf5e8b20f11", "type": "url", - "url": "https://files.pythonhosted.org/packages/e3/89/66b0d61558c971dd2c8cbe125a471603fce0a1b8850c2f4d99a07584fca2/transformers-4.43.1-py3-none-any.whl", - 
"version": "4.43.1" + "url": "https://files.pythonhosted.org/packages/13/63/cccd0297770d7096c19c99d4c542f3068a30e73cdfd971a920bfa686cb3a/transformers-4.43.2-py3-none-any.whl", + "version": "4.43.2" }, "triton": { "sha256": "3c84595cbe5e546b1b290d2a58b1494df5a2ef066dd890655e5b8a8a92205c33", From dc78a43606bd9f7b54443c4079cec7ce3f39e955 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 25 Jul 2024 12:33:30 +0200 Subject: [PATCH 22/35] tensorrt-lllm: 0.12.0.dev2024072300 -> 0.12.0.dev2024072301 --- default.nix | 2 +- lock.json | 26 +++++++++++++------------- nix/tensorrt-llm.nix | 4 ++-- nix/trtllm-backend.nix | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/default.nix b/default.nix index a02fa9e..bac2fb8 100644 --- a/default.nix +++ b/default.nix @@ -19,7 +19,7 @@ in python_packages = [ "--extra-index-url" "https://pypi.nvidia.com" - "tensorrt_llm==0.12.0.dev2024072300" + "tensorrt_llm==0.12.0.dev2024072301" "tensorrt-cu12==10.2.0" "torch==2.3.1" "nvidia-pytriton==0.5.8" # corresponds to 2.46.0 diff --git a/lock.json b/lock.json index 6564584..348e0d3 100644 --- a/lock.json +++ b/lock.json @@ -476,10 +476,10 @@ "version": "0.49.9" }, "protobuf": { - "sha256": "7c8daa26095f82482307bc717364e7c13f4f1c99659be82890dcfc215194554d", + "sha256": "3319e073562e2515c6ddc643eb92ce20809f5d8f10fead3332f71c63be6a7040", "type": "url", - "url": "https://files.pythonhosted.org/packages/15/db/7f731524fe0e56c6b2eb57d05b55d3badd80ef7d1f1ed59db191b2fdd8ab/protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl", - "version": "4.25.3" + "url": "https://files.pythonhosted.org/packages/ca/6c/cc7ab2fb3a4a7f07f211d8a7bbb76bba633eb09b148296dbd4281e217f95/protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl", + "version": "4.25.4" }, "psutil": { "sha256": "5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd", @@ -572,10 +572,10 @@ "version": "26.0.3" }, "regex": { - "sha256": "1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5", + "sha256": "bf7a89eef64b5455835f5ed30254ec19bf41f7541cd94f266ab7cbd463f00c41", "type": "url", - "url": "https://files.pythonhosted.org/packages/07/17/5d92509b4dccacf9767d8607112c19667e15db2428014440bae4356b8aff/regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "2024.5.15" + "url": "https://files.pythonhosted.org/packages/3e/66/04b63f31580026c8b819aed7f171149177d10cfab27477ea8800a2268d50/regex-2024.7.24-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "2024.7.24" }, "requests": { "sha256": "70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", @@ -686,10 +686,10 @@ "version": "10.2.0" }, "tensorrt-llm": { - "sha256": "6e4cc6b77231ddeb0e22405da3b99b45d3b186db60a61231753e245699c9fb48", + "sha256": "fb0615796b865702ec24039ce1df416ecc80082ec0dbc445875fce2de9e73585", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.12.0.dev2024072300-cp310-cp310-linux_x86_64.whl", - "version": "0.12.0.dev2024072300" + "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.12.0.dev2024072301-cp310-cp310-linux_x86_64.whl", + "version": "0.12.0.dev2024072301" }, "tokenizers": { "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e", @@ -728,10 +728,10 @@ "version": "2.3.1" }, "tritonclient": { - "sha256": "754ab373a45306be0c45afbcde06838179d04561694f6d15e138530153aee581", + "sha256": "7074885798e8a711fedaf94d6dd77f310973fe2ebf29fb2837db350a43e2d6a5", "type": "url", - "url": 
"https://pypi.nvidia.com/tritonclient/tritonclient-2.47.0-py3-none-manylinux1_x86_64.whl", - "version": "2.47.0" + "url": "https://pypi.nvidia.com/tritonclient/tritonclient-2.48.0-py3-none-manylinux1_x86_64.whl", + "version": "2.48.0" }, "typer": { "sha256": "070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914", @@ -1277,5 +1277,5 @@ } } }, - "invalidationHash": "fb599e2ab6a67ae812038801ccbf85ac8402d919c95ced2d07f252ea5aee7d8a" + "invalidationHash": "eff1a58e03fa1393029b4e1fbf7ad15a678c0a53cd3cf03db6b6db88514decc5" } \ No newline at end of file diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index c7ae2ee..c7f167d 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -21,10 +21,10 @@ stdenv.mkDerivation (o: { src = fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT-LLM"; - rev = "bca9a33b022dc6a924bf7913137feed3d28b602d"; + rev = "5fa9436e17c2f9aeace070f49aa645d2577f676b"; fetchSubmodules = true; fetchLFS = true; # libtensorrt_llm_batch_manager_static.a - hash = "sha256-d4xl6SZ1BM51DUkfFcclJYF0l3GrNWJR7S2xyTH9rs4="; + hash = "sha256-Ea15Sp3wzye4UGaTRtc+ByUdxNlNRu6uUefXXuJg78A="; }; outputs = if withPython then diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index 93d573b..6cbca59 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -49,11 +49,11 @@ let in oldGccStdenv.mkDerivation rec { pname = "tensorrtllm_backend"; - version = "0.12.0.dev2024072300"; + version = "0.12.0.dev2024072301"; src = fetchFromGitHub { owner = "triton-inference-server"; repo = "tensorrtllm_backend"; - rev = "693c6377983b7efc2043287db10380d3c128bffd"; + rev = "a6aa8eb6ce9371521df166c480e10262cd9c0cf4"; hash = "sha256-oa/OOO3pp1W/J1yqmwytwO0y25dLYixisorRcB42kUU="; }; nativeBuildInputs = [ From 5fb8fb75af3bd511797bffba7a79dd72e274b802 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 25 Jul 2024 14:19:31 +0200 Subject: [PATCH 23/35] github action: limit to 12 cores --- .github/workflows/nix.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 852a4f3..38a4047 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -11,6 +11,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: DeterminateSystems/nix-installer-action@v10 + with: + extra-conf: cores = 12 - name: Authenticate to Google Cloud Platform uses: google-github-actions/auth@v2 with: From 6888d8f565729ff243f88614399aee75cd039832 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 25 Jul 2024 18:05:44 +0200 Subject: [PATCH 24/35] remove rogue .gitkeep files --- triton_model_repo/tensorrt_llm/1/.gitkeep | 0 triton_templates/tensorrt_llm/1/.gitkeep | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 triton_model_repo/tensorrt_llm/1/.gitkeep delete mode 100644 triton_templates/tensorrt_llm/1/.gitkeep diff --git a/triton_model_repo/tensorrt_llm/1/.gitkeep b/triton_model_repo/tensorrt_llm/1/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/triton_templates/tensorrt_llm/1/.gitkeep b/triton_templates/tensorrt_llm/1/.gitkeep deleted file mode 100644 index e69de29..0000000 From 49ccca6b814096ef7808ae42124b2b09785a09e9 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Fri, 26 Jul 2024 15:09:28 +0200 Subject: [PATCH 25/35] tensorrt-cu12: 10.2.0 -> 10.2.0.post1 Should fix #51 --- default.nix | 2 +- lock.json | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/default.nix b/default.nix index bac2fb8..d8fe4c4 100644 --- a/default.nix +++ 
b/default.nix @@ -20,7 +20,7 @@ in "--extra-index-url" "https://pypi.nvidia.com" "tensorrt_llm==0.12.0.dev2024072301" - "tensorrt-cu12==10.2.0" + "tensorrt-cu12==10.2.0.post1" "torch==2.3.1" "nvidia-pytriton==0.5.8" # corresponds to 2.46.0 "omegaconf" diff --git a/lock.json b/lock.json index 348e0d3..3f33033 100644 --- a/lock.json +++ b/lock.json @@ -662,28 +662,28 @@ "version": "1.13.1" }, "tensorrt": { - "sha256": "275c45af70b52b64f8267f1e1f52c2a74419ae5e2cc0eaad65d040bdbf543031", + "sha256": "ac5336eea871b812047f6ec8bbeed9ae343a539ba02ff86513b04bd081c14738", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.2.0.tar.gz", - "version": "10.2.0" + "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.2.0.post1.tar.gz", + "version": "10.2.0.post1" }, "tensorrt-cu12": { - "sha256": "a5e387a399bd1ce727a6e0b9aa2698de4c4dadf40a91c2aa61154e9196eddc56", + "sha256": "9663446e2872113d619ad5010766cccc1f023d693cb43c3f8f2496563028badc", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.2.0.tar.gz", - "version": "10.2.0" + "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.2.0.post1.tar.gz", + "version": "10.2.0.post1" }, "tensorrt-cu12-bindings": { - "sha256": "5f952539d64bb032a3b309635fcfe0c5fa9cccf9262f487a84f5a0c14c6717f6", + "sha256": "3248e7951d1f2fa8884759b19456ab7d08a3f75bd6b8e5d58e5cc18788c02171", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.2.0-cp310-none-manylinux_2_17_x86_64.whl", - "version": "10.2.0" + "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.2.0.post1-cp310-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0.post1" }, "tensorrt-cu12-libs": { - "sha256": "57761499fd120c03b1858f4db3fa16fa690cd785c15275c72bc4f6195f6f9d3e", + "sha256": "a42f7ecb1659fac27cf68996df0984e68018be61bd8bbd95f51619f9c4e9cf31", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.2.0-py2.py3-none-manylinux_2_17_x86_64.whl", - "version": "10.2.0" + "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.2.0.post1-py2.py3-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0.post1" }, "tensorrt-llm": { "sha256": "fb0615796b865702ec24039ce1df416ecc80082ec0dbc445875fce2de9e73585", @@ -1277,5 +1277,5 @@ } } }, - "invalidationHash": "eff1a58e03fa1393029b4e1fbf7ad15a678c0a53cd3cf03db6b6db88514decc5" + "invalidationHash": "c1ec24857cc6649b3e0f8c8f0695235a6ddfb2507809cc4fd9d451d251e1fe85" } \ No newline at end of file From 73567de5cc14b2da8cd007b07263476b5af6a491 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Fri, 2 Aug 2024 12:45:24 +0200 Subject: [PATCH 26/35] Push builder to triton-builder-h100 --- .github/workflows/nix.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 38a4047..59a4951 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -39,6 +39,7 @@ jobs: run: | nix build --accept-flake-config ".#cog-triton-builder" -o cog-triton-builder ./cog-triton-builder push r8.im/replicate-internal/triton-builder + ./cog-triton-builder push r8.im/replicate-internal/triton-builder-h100 - name: Build cog-triton-runner-80 env: COG_TOKEN: ${{ secrets.COG_TOKEN }} From d2adf714fb2ce7920d6c4d203c548e279497f674 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Fri, 2 Aug 2024 13:00:36 +0200 Subject: [PATCH 27/35] fix: work around replicate unique model check adds TRTLLM_BUILDER_VARIANT=h100 to work around "It looks like a copy of this model 
version already exists on Replicate" --- .github/workflows/nix.yml | 3 ++- flake.nix | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 59a4951..72ffe41 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -39,7 +39,8 @@ jobs: run: | nix build --accept-flake-config ".#cog-triton-builder" -o cog-triton-builder ./cog-triton-builder push r8.im/replicate-internal/triton-builder - ./cog-triton-builder push r8.im/replicate-internal/triton-builder-h100 + nix build --accept-flake-config ".#cog-triton-builder-h100" -o cog-triton-builder-h100 + ./cog-triton-builder-h100 push r8.im/replicate-internal/triton-builder-h100 - name: Build cog-triton-runner-80 env: COG_TOKEN: ${{ secrets.COG_TOKEN }} diff --git a/flake.nix b/flake.nix index e54d93b..c17bb0c 100644 --- a/flake.nix +++ b/flake.nix @@ -58,6 +58,14 @@ }); in { cog-triton-builder = makeBuilder "cog-triton-builder"; + # we want to push the model to triton-builder-h100 as well + # as cog-triton-builder, but replicate doesn't let us. + # so let's add some data to fool it + cog-triton-builder-h100 = ((makeBuilder "cog-triton-builder-h100").extendModules { + modules = [{ + cognix.environment.TRTLLM_BUILDER_VARIANT = "h100"; + }]; + }).config.public; cog-triton-runner-80 = makeRunner "cog-triton-runner-80" ["80-real"] {}; cog-triton-runner-86 = makeRunner "cog-triton-runner-86" ["86-real"] {}; cog-triton-runner-90 = makeRunner "cog-triton-runner-90" ["90-real"] {}; From 2805da282faa543c94b15ead6a44f5fc21a7de91 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 31 Jul 2024 17:01:03 +0200 Subject: [PATCH 28/35] build tensorrt-llm from source --- default.nix | 20 ++++++- flake.nix | 15 +---- nix/tensorrt-llm.nix | 133 ++++++++++++++++++++++++++----------------- 3 files changed, 100 insertions(+), 68 deletions(-) diff --git a/default.nix b/default.nix index d8fe4c4..383f6d9 100644 --- a/default.nix +++ b/default.nix @@ -7,6 +7,19 @@ let pythonDrvs = config.python-env.pip.drvs; inherit (pkgs) lib; cfg = config.cog-triton; # defined in interface.nix + trtllm-env = config.python-env.public.extendModules { + modules = [{ + _file = ./.; + pip.rootDependencies = lib.mkOverride 49 { tensorrt-llm = true; hf-transfer = true; }; + pip.drvs.pydantic = let mkMoreForce = lib.mkOverride 49; in { + version = mkMoreForce "2.8.2"; + mkDerivation.src = mkMoreForce (pkgs.fetchurl { + sha256 = "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"; + url = "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl"; + }); + }; + }]; + }; in { imports = [ ./interface.nix ]; @@ -136,12 +149,15 @@ in }; }; deps.tensorrt-llm = pkgs.callPackage ./nix/tensorrt-llm.nix { - inherit python3 cudaPackages pythonDrvs; + inherit python3 cudaPackages; + pythonDrvs = config.deps.trtllm-env.config.pip.drvs; # TODO: turn into config option - withPython = false; + withPython = true; inherit (cfg) architectures; inherit (deps) pybind11-stubgen tensorrt-src; }; + deps.python-with-trtllm = python3.withPackages (_: [ (python3.pkgs.toPythonModule deps.tensorrt-llm.python) ]); + deps.trtllm-env = trtllm-env; deps.trtllm-backend = pkgs.callPackage ./nix/trtllm-backend.nix { inherit python3 cudaPackages pythonDrvs; inherit (deps) tensorrt-llm tensorrt-src; diff --git a/flake.nix b/flake.nix index c17bb0c..77344aa 100644 --- a/flake.nix +++ b/flake.nix @@ -35,6 +35,7 @@ # only grab deps of 
tensorrt-llm, omegaconf, hf-transfer cognix.python_root_packages = [ "omegaconf" "hf-transfer" "transformers" "torch" ]; + cog-triton.architectures = [ "86-real" ]; # override cog.yaml: cog.concurrency.max = lib.mkForce 1; @@ -42,19 +43,7 @@ # this just needs the examples/ dir cognix.environment.TRTLLM_DIR = config.deps.tensorrt-llm.examples; # HACK: cog needs pydantic v1, but trt-llm needs pydantic v2 - cognix.environment.TRTLLM_PYTHON = (config.python-env.public.extendModules { - modules = [{ - _file = ./.; - pip.rootDependencies = lib.mkOverride 49 { tensorrt-llm = true; hf-transfer = true; }; - pip.drvs.pydantic = let mkMoreForce = lib.mkOverride 49; in { - version = mkMoreForce "2.8.2"; - mkDerivation.src = mkMoreForce (pkgs.fetchurl { - sha256 = "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"; - url = "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl"; - }); - }; - }]; - }).config.public.pyEnv; + cognix.environment.TRTLLM_PYTHON = config.deps.trtllm-env.config.public.pyEnv; }); in { cog-triton-builder = makeBuilder "cog-triton-builder"; diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index c7f167d..af62526 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -14,6 +14,9 @@ pybind11-stubgen ? null, withPython ? true, rsync, + zstd, + autoPatchelfHook, + patchelfUnstable, }: stdenv.mkDerivation (o: { pname = "tensorrt_llm"; @@ -37,11 +40,14 @@ stdenv.mkDerivation (o: { [ "out" ]; setSourceRoot = "sourceRoot=$(echo */cpp)"; nativeBuildInputs = [ + patchelfUnstable + zstd cmake ninja python3 cudaPackages.cuda_nvcc rsync + autoPatchelfHook ]; buildInputs = [ @@ -51,56 +57,65 @@ stdenv.mkDerivation (o: { openmpi python3.pkgs.setuptools ] - ++ (lib.optionals (!withPython) [ + ++ (with cudaPackages; [ # torch hates the split cuda, so only do it without torch - cudaPackages.cuda_cudart - cudaPackages.cuda_nvcc.dev - cudaPackages.cuda_nvrtc.dev - cudaPackages.cuda_nvrtc.lib - cudaPackages.cuda_nvml_dev.lib - cudaPackages.cuda_nvml_dev.dev - cudaPackages.cuda_cccl - cudaPackages.libcublas.lib - cudaPackages.libcublas.dev - cudaPackages.libcurand.dev - cudaPackages.cuda_profiler_api + cuda_cudart + cuda_nvcc.dev + cuda_nvrtc.dev + cuda_nvrtc.lib + cuda_nvml_dev.lib + cuda_nvml_dev.dev + cuda_cccl + libcublas.lib + libcublas.dev + libcurand.dev + cuda_profiler_api ]) - ++ (lib.optionals withPython [ - cudaPackages.cudatoolkit + ++ (lib.optionals withPython (with cudaPackages; [ + # cudaPackages.cudatoolkit + cuda_nvtx.dev cuda_nvtx.lib + libcusparse.dev libcusparse.lib + libcusolver.dev libcusolver.lib python3.pkgs.pybind11 python3.pkgs.wheel python3.pkgs.pip pybind11-stubgen - ]); + ])); + env.pythonRelaxDeps = "nvidia-cudnn-cu12"; propagatedBuildInputs = lib.optionals withPython ( with pythonDrvs; builtins.map (x: x.public or x) [ - accelerate # ==0.25.0 + accelerate build colored - # concerning statement from trtllm's requirements.txt: - cuda-python # "Do not override the custom version of cuda-python installed in the NGC PyTorch image." - diffusers # ==0.15.0 + cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. 
+ diffusers lark mpi4py numpy - onnx # >=1.12.0 + onnx polygraphy psutil - pynvml # >=11.5.0 - sentencepiece # >=0.1.99 - tensorrt # ==9.2.0.post12.dev5 - tensorrt-cu12-bindings # missed transitive dep - tensorrt-cu12-libs - torch # <=2.2.0a - nvidia-ammo # ~=0.7.0; platform_machine=="x86_64" - transformers # ==4.36.1 + pynvml + pulp + pandas + h5py + strenum + sentencepiece + tensorrt + torch + nvidia-modelopt + transformers + pillow wheel optimum evaluate janus + mpmath ] ); + autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ]; + # tries to run cutlass's `python setup.py develop` PYTHONUSERBASE = "/tmp/python"; preConfigure = '' @@ -118,30 +133,46 @@ stdenv.mkDerivation (o: { "-DTRT_INCLUDE_DIR=${tensorrt-src}/include" "-DCMAKE_CUDA_ARCHITECTURES=${builtins.concatStringsSep ";" architectures}" # "-DFAST_BUILD=ON" + "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; # include cstdint in cpp/tensorrt_llm/common/mpiUtils.h after pragma once postPatch = '' sed -i 's/#include /#include \n#include /' /build/source/cpp/include/tensorrt_llm/common/mpiUtils.h sed -i 's/#pragma once/#pragma once\n#include /' /build/source/cpp/tensorrt_llm/kernels/lruKernel.h ''; - postBuild = lib.optionalString withPython '' + # configurePhase = "true"; + # buildPhase = '' + # tar xf ${/home/yorick/datakami/r8/cog-triton-r8/build-dir.tar.zst} + # cd source/cpp/build/ + # runHook postBuild + # ''; + # libtensorrt_llm.so _sometimes_ wants libcudnn, so --add-needed to prevent it from being shrunk out + postBuild = '' + patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib tensorrt_llm/libtensorrt_llm.so + '' + (lib.optionalString withPython '' pushd ../../ chmod -R +w . - mkdir ./libs - cp -r cpp/build/tensorrt_llm/libtensorrt_llm.so ./libs - cp -r cpp/build/tensorrt_llm/thop/libth_common.so ./libs - cp -r cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so* ./libs + mkdir -p ./libs + cp -ar cpp/build/tensorrt_llm/libtensorrt_llm.so ./libs + cp -ar cpp/build/tensorrt_llm/thop/libth_common.so ./libs + cp -ar cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so* ./libs + cp -ar cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so ./libs + cp -ar cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention.so ./libs + mkdir -p ./bin + cp -r cpp/build/tensorrt_llm/executor_worker/executorWorker ./bin cp -r cpp/build/tensorrt_llm/pybind/bindings.*.so . - python -m pybind11_stubgen -o . bindings - mv bindings libs bindings.*.so tensorrt_llm + + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${cudaPackages.cuda_cudart.stubs}/lib python -m pybind11_stubgen -o . bindings + rm -rf tensorrt_llm/{bin,bindings,libs} + mv bin bindings libs bindings.*.so tensorrt_llm + patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so.10 libnvinfer_plugin_tensorrt_llm.so --add-rpath '$ORIGIN/../libs' ./tensorrt_llm/bin/executorWorker python setup.py bdist_wheel popd - ''; + ''); + # noAuditTmpdir = true; # todo pythonOutputDistHook # Install isn't well-defined, -backend just expects the build directory to exist somewhere. # Since we just copy build outputs, cmake doesn't get a chance to relink with the correct rpath. - # sed the rpath in place manually - # Also, libtensorrt_llm.so _sometimes_ wants libcudnn, so --add-needed to prevent it from being shrunk out installPhase = '' mkdir -p $out @@ -153,33 +184,29 @@ stdenv.mkDerivation (o: { pushd tensorrt_llm mkdir -p $out/cpp/build/tensorrt_llm/ find . 
'(' '(' -type f -executable ')' -or -type l ')' -print0 | rsync -av --chmod=u+w --files-from=- --from0 ./ $out/cpp/build/tensorrt_llm/ - patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib $out/cpp/build/tensorrt_llm/libtensorrt_llm.so - for f in $out/cpp/build/tensorrt_llm/plugins/*.so* $out/cpp/build/tensorrt_llm/executor_worker/executorWorker; do - if [ ! -L "$f" ]; then - new_path=$(patchelf --print-rpath "$f" | - sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#g' | - sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../../tensorrt_llm#g' - ) - patchelf --set-rpath "$new_path" "$f" - fi - done - new_path=$(patchelf --print-rpath $out/cpp/build/tensorrt_llm/libtensorrt_llm.so | - sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN#g' | - sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../tensorrt_llm#g') - patchelf --set-rpath "$new_path" $out/cpp/build/tensorrt_llm/libtensorrt_llm.so popd '' + (lib.optionalString withPython '' mv ../../dist $dist pushd $dist - python -m pip install ./*.whl --no-index --no-warn-script-location --prefix="$python" --no-cache + python -m pip install ./*.whl --no-index --no-warn-script-location --prefix="$python" --no-cache --no-deps popd ''); + # manually call autoPatchelf so it doesn't cross-link the outputs + dontAutoPatchelf = true; + # move the propagatedBuildInputs to $python postFixup = lib.optionalString withPython '' mv $out/nix-support $python/ + autoPatchelf $out + autoPatchelf $python ''; + # imports check, wants nvml + # pushd $python/${python3.sitePackages} + # python -c "import tensorrt_llm.bindings" + # popd passthru.examples = runCommand "trt-examples" {} '' mkdir $out cp -r ${o.src}/examples $out/examples ''; + passthru.pythonModule = python3; }) From d10ce53e45ab60389f163de112d2bf9bdb10da28 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Thu, 1 Aug 2024 10:53:45 +0200 Subject: [PATCH 29/35] trtllm: fix patch location to work in `nix develop` --- nix/tensorrt-llm.nix | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index af62526..b0a3964 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -58,7 +58,6 @@ stdenv.mkDerivation (o: { python3.pkgs.setuptools ] ++ (with cudaPackages; [ - # torch hates the split cuda, so only do it without torch cuda_cudart cuda_nvcc.dev cuda_nvrtc.dev @@ -72,7 +71,6 @@ stdenv.mkDerivation (o: { cuda_profiler_api ]) ++ (lib.optionals withPython (with cudaPackages; [ - # cudaPackages.cudatoolkit cuda_nvtx.dev cuda_nvtx.lib libcusparse.dev libcusparse.lib libcusolver.dev libcusolver.lib @@ -137,8 +135,8 @@ stdenv.mkDerivation (o: { ]; # include cstdint in cpp/tensorrt_llm/common/mpiUtils.h after pragma once postPatch = '' - sed -i 's/#include /#include \n#include /' /build/source/cpp/include/tensorrt_llm/common/mpiUtils.h - sed -i 's/#pragma once/#pragma once\n#include /' /build/source/cpp/tensorrt_llm/kernels/lruKernel.h + sed -i 's/#include /#include \n#include /' include/tensorrt_llm/common/mpiUtils.h + sed -i 's/#pragma once/#pragma once\n#include /' tensorrt_llm/kernels/lruKernel.h ''; # configurePhase = "true"; # buildPhase = '' From 95e44d820fbfb26bb2b07b12fb08eeb2fe7819f1 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 7 Aug 2024 14:04:04 +0200 Subject: [PATCH 30/35] Add support for building torch & trtllm(python) from source --- default.nix | 60 ++++++++++++++++++++++++++++++++++++-------- flake.nix | 2 +- interface.nix | 2 ++ nix/tensorrt-llm.nix | 5 ++++ nix/torch.nix | 28 
+++++++++++++++++++++ 5 files changed, 85 insertions(+), 12 deletions(-) create mode 100644 nix/torch.nix diff --git a/default.nix b/default.nix index 383f6d9..d973d26 100644 --- a/default.nix +++ b/default.nix @@ -20,6 +20,14 @@ let }; }]; }; + trtllm-pythonDrvs = trtllm-env.config.pip.drvs; + toCudaCapability = cmakeArch: { + "70-real" = "7.0"; + "80-real" = "8.0"; + "86-real" = "8.6"; + "89-real" = "8.9"; + "90-real" = "9.0"; + }.${cmakeArch}; in { imports = [ ./interface.nix ]; @@ -70,6 +78,14 @@ in extra-substituters = https://storage.googleapis.com/replicate-nix-cache-dev/ ''; python-env.pip.drvs = { + + torch.public = lib.mkIf cfg.torchSourceBuild + (lib.mkForce config.deps.minimal-torch); + tensorrt-llm.public = lib.mkIf cfg.trtllmSourceBuild + (lib.mkForce config.deps.tensorrt-llm.override { + withPython = true; + }); + nvidia-modelopt.mkDerivation.propagatedBuildInputs = [ pythonDrvs.setuptools.public ]; @@ -120,6 +136,12 @@ in done popd ''; + mpi4py.mkDerivation.nativeBuildInputs = [ pkgs.removeReferencesTo ]; + mpi4py.mkDerivation.postInstall = '' + pushd $out/${site}/mpi4py + remove-references-to -t ${pkgs.openmpi.dev} mpi.cfg MPI.*.so + popd + ''; }; deps.backend_dir = pkgs.runCommand "triton_backends" {} '' mkdir $out @@ -139,27 +161,43 @@ in rev = "v10.2.0"; hash = "sha256-Euo9VD4VTpx8XJV97IMETTAx/YkPGXiNdA39Wjp3UMU="; }; - # todo: replace with lockfile - deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec { - pname = "pybind11-stubgen"; - version = "2.5"; - src = pkgs.fetchPypi { - inherit pname version; - hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ="; + # make a python3 environment with all the pkgs from lock.json *and* nixpkgs.python + # mainly used to build torch, which additionally requires astunparse + deps.python3-with-nixpkgs = python3.override { + packageOverrides = pyself: pysuper: (lib.mapAttrs (_: v: v.public.out) trtllm-pythonDrvs) // { + # todo: replace with lockfile? 
+ pybind11-stubgen = pyself.buildPythonPackage rec { + pname = "pybind11-stubgen"; + version = "2.5"; + src = pyself.fetchPypi { + inherit pname version; + hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ="; + }; + }; + # prevent infinite loop, don't override torch itself + inherit (pysuper) torch; }; }; deps.tensorrt-llm = pkgs.callPackage ./nix/tensorrt-llm.nix { inherit python3 cudaPackages; pythonDrvs = config.deps.trtllm-env.config.pip.drvs; - # TODO: turn into config option - withPython = true; + withPython = false; inherit (cfg) architectures; - inherit (deps) pybind11-stubgen tensorrt-src; + inherit (deps.python3-with-nixpkgs.pkgs) pybind11-stubgen; + inherit (deps) tensorrt-src; }; - deps.python-with-trtllm = python3.withPackages (_: [ (python3.pkgs.toPythonModule deps.tensorrt-llm.python) ]); deps.trtllm-env = trtllm-env; deps.trtllm-backend = pkgs.callPackage ./nix/trtllm-backend.nix { inherit python3 cudaPackages pythonDrvs; inherit (deps) tensorrt-llm tensorrt-src; }; + deps.minimal-torch = pkgs.callPackage ./nix/torch.nix { + python3 = deps.python3-with-nixpkgs; + # todo: match/modify config.cognix.cudaPackages + cudaPackages = (pkgs.extend (self: super: { + config = super.config // { + cudaCapabilities = map toCudaCapability cfg.architectures; + }; + })).cudaPackages_12_1; + }; } diff --git a/flake.nix b/flake.nix index 77344aa..c62603a 100644 --- a/flake.nix +++ b/flake.nix @@ -35,7 +35,7 @@ # only grab deps of tensorrt-llm, omegaconf, hf-transfer cognix.python_root_packages = [ "omegaconf" "hf-transfer" "transformers" "torch" ]; - cog-triton.architectures = [ "86-real" ]; + cog-triton.architectures = [ "80-real" "86-real" "90-real" ]; # override cog.yaml: cog.concurrency.max = lib.mkForce 1; diff --git a/interface.nix b/interface.nix index da2f8a5..01cc000 100644 --- a/interface.nix +++ b/interface.nix @@ -9,5 +9,7 @@ # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ # 80: A100, 86: A5000, A40, A800, 89: L40, 90: H100 }; + torchSourceBuild = mkEnableOption "Build Torch from source to be smaller"; + trtllmSourceBuild = mkEnableOption "Build trtllm python from source to be smaller"; }; } diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index b0a3964..e9ef986 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -133,6 +133,11 @@ stdenv.mkDerivation (o: { # "-DFAST_BUILD=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; + # workaround: cuda_nvcc exposes a gcc12 that uses a gcc13 libc + # however, cmake finds the gcc12 libc somehow, which is wrong + postConfigure = '' + sed -i 's#${cudaPackages.cuda_nvcc.stdenv.cc.cc.lib}#${stdenv.cc.cc.lib}#g' build.ninja + ''; # include cstdint in cpp/tensorrt_llm/common/mpiUtils.h after pragma once postPatch = '' sed -i 's/#include /#include \n#include /' include/tensorrt_llm/common/mpiUtils.h diff --git a/nix/torch.nix b/nix/torch.nix new file mode 100644 index 0000000..8594c99 --- /dev/null +++ b/nix/torch.nix @@ -0,0 +1,28 @@ +{ python3, magma-cuda-static, cudaPackages }: +(python3.pkgs.torchWithCuda.override { + torchWithCuda = null; # ?!, not used + cudaSupport = true; + inherit cudaPackages; + magma-cuda-static = magma-cuda-static.override { inherit cudaPackages; }; + future = null; + tensorboard = null; + hypothesis = null; + cffi = null; + openai-triton = null; +}).overridePythonAttrs + (o: { + nativeBuildInputs = o.nativeBuildInputs ++ [ python3.pkgs.setuptools ]; + dependencies = o.dependencies ++ [ python3.pkgs.requests ]; + USE_CUDNN = 0; + USE_KINETO = 0; + USE_QNNPACK = 0; + 
USE_PYTORCH_QNNPACK = 0; + USE_XNNPACK = 0; + INTERN_DISABLE_ONNX = 1; + ONNX_ML = 0; + USE_ITT = 0; + USE_FLASH_ATTENTION = 0; + USE_MEM_EFF_ATTENTION = 0; + USE_FBGEMM = 0; + USE_MKLDNN = 0; + }) From 75c6f70abbaff636067ea2de20331019d233c275 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 7 Aug 2024 15:07:54 +0200 Subject: [PATCH 31/35] tensorrt-llm: 0.12.0.dev2024072301 -> 0.12.0.dev2024073000 --- default.nix | 3 +- lock.json | 130 ++++++++++-------- nix/tensorrt-llm.nix | 6 +- nix/trtllm-backend.nix | 6 +- triton_templates/preprocessing/1/model.py | 2 +- triton_templates/tensorrt_llm/config.pbtxt | 1 + .../tensorrt_llm_bls/1/lib/decode.py | 2 +- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 61 ++++---- 8 files changed, 116 insertions(+), 95 deletions(-) diff --git a/default.nix b/default.nix index d973d26..aae162c 100644 --- a/default.nix +++ b/default.nix @@ -40,7 +40,7 @@ in python_packages = [ "--extra-index-url" "https://pypi.nvidia.com" - "tensorrt_llm==0.12.0.dev2024072301" + "tensorrt_llm==0.12.0.dev2024073000" "tensorrt-cu12==10.2.0.post1" "torch==2.3.1" "nvidia-pytriton==0.5.8" # corresponds to 2.46.0 @@ -59,6 +59,7 @@ in python-env.pip = { constraintsList = [ "datasets>2.15.0" # picks older fsspec but newer datasets + "mpi4py<4" # recent release with breaking changes ]; # HACK: cog requires pydantic <2, but we do need the extra deps pydantic2 brings in overridesList = [ diff --git a/lock.json b/lock.json index 3f33033..2c346a8 100644 --- a/lock.json +++ b/lock.json @@ -7,11 +7,17 @@ "url": "https://files.pythonhosted.org/packages/15/33/b6b4ad5efa8b9f4275d4ed17ff8a44c97276171341ba565fdffb0e3dc5e8/accelerate-0.33.0-py3-none-any.whl", "version": "0.33.0" }, + "aiohappyeyeballs": { + "sha256": "4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03", + "type": "url", + "url": "https://files.pythonhosted.org/packages/8b/b4/0983e94060405eb51f23be493e3f5c28003f7ebc5efcd0803c1cb23ea407/aiohappyeyeballs-2.3.5-py3-none-any.whl", + "version": "2.3.5" + }, "aiohttp": { - "sha256": "c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72", + "sha256": "b9db600a86414a9a653e3c1c7f6a2f6a1894ab8f83d11505247bd1b90ad57157", "type": "url", - "url": "https://files.pythonhosted.org/packages/a0/09/e7637f4f0760cad4d67347bbd8311c6ad0259a3fc01f04555af9e84bd378/aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "3.9.5" + "url": "https://files.pythonhosted.org/packages/79/ac/0319ee00dcc4ab36856d85a2185721f29806163212fb9e1745c836830aea/aiohttp-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "3.10.1" }, "aiosignal": { "sha256": "f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17", @@ -104,10 +110,10 @@ "version": "15.0.1" }, "cuda-python": { - "sha256": "f087acc19ac4b467d71cfb7a39306038993176a7a1459426da50afa0fe68c697", + "sha256": "e177f584094d9c9fd9c7d153168486a3966765c79cb2a80e86feb15e3b5adc14", "type": "url", - "url": "https://files.pythonhosted.org/packages/70/d1/2e4ae2207f200b75ecfecf025517597ea00899759ef1cb5fb27e99641234/cuda_python-12.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "12.5.0" + "url": "https://files.pythonhosted.org/packages/86/93/f00a5f48eb67216d8a8818b93c0e8bbe5949f297add3367522081ec5223c/cuda_python-12.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "12.6.0" }, "datasets": { "sha256": "76ac02e3bdfff824492e20678f0b6b1b6d080515957fe834b00c2ba8d6b18e5e", @@ -116,10 +122,10 @@ "version": "2.20.0" }, "diffusers": 
{ - "sha256": "d5e9bb13c8097b4eed10df23d1294d2e5a418f53e3f89c7ef228b5b982970428", + "sha256": "114194eb61498aff06243ade750fca6fbc179ca9df68923bb175b70030bed495", "type": "url", - "url": "https://files.pythonhosted.org/packages/ee/22/2e6e90c87e718e63b1a860cb627bcf27ac4998edb5f190561b5c6cde6c62/diffusers-0.29.2-py3-none-any.whl", - "version": "0.29.2" + "url": "https://files.pythonhosted.org/packages/74/2b/69bb842f7567cd92a540f8a9a63a20e09304ad8ff84530f26762e7e19626/diffusers-0.30.0-py3-none-any.whl", + "version": "0.30.0" }, "dill": { "sha256": "c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", @@ -182,10 +188,10 @@ "version": "3.0.3" }, "grpcio": { - "sha256": "a8422dc13ad93ec8caa2612b5032a2b9cd6421c13ed87f54db4a3a2c93afaf77", + "sha256": "5764237d751d3031a36fafd57eb7d36fd2c10c658d2b4057c516ccf114849a3e", "type": "url", - "url": "https://files.pythonhosted.org/packages/b4/b5/fe9dcf91919d0f09da8eec3c2091fab6dbb5e102027deeca928bc26b9fc2/grpcio-1.65.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.65.1" + "url": "https://files.pythonhosted.org/packages/a5/57/f03b02c4fad8b72539ab04b8b524782e071c89a2d9c182d60b5d9ded41d7/grpcio-1.65.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.65.4" }, "h11": { "sha256": "e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", @@ -236,10 +242,10 @@ "version": "0.27.0" }, "huggingface-hub": { - "sha256": "abdf3244d3a274c4b1fbc5c4a1ef700032b3f60ba93cc63e4f036fd082aa2805", + "sha256": "d93fb63b1f1a919a22ce91a14518974e81fc4610bf344dfe7572343ce8d3aced", "type": "url", - "url": "https://files.pythonhosted.org/packages/93/14/6a82b1c2eab5a828f7d3d675811660eb68424e8b039191f418a94e8d9726/huggingface_hub-0.24.2-py3-none-any.whl", - "version": "0.24.2" + "url": "https://files.pythonhosted.org/packages/0b/05/31b21998f68c31e7ffcc27ff08531fb9af5506d765ce8d661fb0036e6918/huggingface_hub-0.24.5-py3-none-any.whl", + "version": "0.24.5" }, "humanfriendly": { "sha256": "1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", @@ -404,10 +410,10 @@ "version": "12.1.0.106" }, "nvidia-modelopt": { - "sha256": "9af69e4215e7da9c65431bd27b51bc1b95c5d98cfb97105f83daf2198a820b5d", + "sha256": "0a81ab04b2013ebc2f35409a48b3eb774517294b7fc274d7bd33c39f4d8bf508", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.13.1" + "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.15.1" }, "nvidia-nccl-cu12": { "sha256": "057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56", @@ -416,10 +422,10 @@ "version": "2.20.5" }, "nvidia-nvjitlink-cu12": { - "sha256": "f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212", + "sha256": "562ab97ea2c23164823b2a89cb328d01d45cb99634b8c65fe7cd60d14562bd79", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-nvjitlink-cu12/nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", - "version": "12.5.82" + "url": "https://pypi.nvidia.com/nvidia-nvjitlink-cu12/nvidia_nvjitlink_cu12-12.6.20-py3-none-manylinux2014_x86_64.whl", + "version": "12.6.20" }, "nvidia-nvtx-cu12": { "sha256": "dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5", @@ -440,10 +446,10 @@ "version": "2.3.0" }, "onnx": { - "sha256": "6251910e554f811fdd070164b0bc76d76b067b95576cb9dad4d52ae64fe014b5", + "sha256": 
"ec6a425e59291fff430da4a884aa07a1d0cbb5dcd22cc78f6cf4ba5adb9f3367", "type": "url", - "url": "https://files.pythonhosted.org/packages/c6/7e/5031717c0636e6074764a2f61a459a3ecd46c20d8b83a1f1cd2513a76160/onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.16.1" + "url": "https://files.pythonhosted.org/packages/f5/3d/d28484e5d87d4500db0d3b44836d9cd31d88f1efbe168356dbb1dd4f2571/onnx-1.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.16.2" }, "optimum": { "sha256": "508bc55db3c9434f4e8d5a30c39a46ac63c4cdb45bcc5a641b6c1c77cae88d23", @@ -548,10 +554,10 @@ "version": "1.0.1" }, "python-rapidjson": { - "sha256": "507595740300e95dded254536558cd56733cc3207e3c2457f19231ad00e78d85", + "sha256": "6cb3ad353ec083a6dcf0552f1fce3c490f92e2fccf9a81eac42835297a8431a1", "type": "url", - "url": "https://files.pythonhosted.org/packages/75/f7/7d79a906618ac106c6fad6704bc6375056308526df834fa867b7d94d6039/python_rapidjson-1.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.18" + "url": "https://files.pythonhosted.org/packages/f7/e4/b2d1dff12eae71c35e59d1379727697fd7a543d1ac027071f3cd486b8a1f/python_rapidjson-1.20-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.20" }, "pytz": { "sha256": "328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319", @@ -560,16 +566,16 @@ "version": "2024.1" }, "pyyaml": { - "sha256": "ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", + "sha256": "ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", "type": "url", - "url": "https://files.pythonhosted.org/packages/29/61/bf33c6c85c55bc45a29eee3195848ff2d518d84735eb0e2d8cb42e0d285e/PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "6.0.1" + "url": "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "6.0.2" }, "pyzmq": { - "sha256": "ba6e5e6588e49139a0979d03a7deb9c734bde647b9a8808f26acf9c547cab1bf", + "sha256": "77ce6a332c7e362cb59b63f5edf730e83590d0ab4e59c2aa5bd79419a42e3449", "type": "url", - "url": "https://files.pythonhosted.org/packages/40/4f/088d0fe18b188a0754483b7d632a97ef608dce80c2648219d071c9f1715c/pyzmq-26.0.3-cp310-cp310-manylinux_2_28_x86_64.whl", - "version": "26.0.3" + "url": "https://files.pythonhosted.org/packages/4a/f2/633999c1dcc7e7c0536ac990390a6a3e49295724dbf450c42ea730daadd9/pyzmq-26.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "26.1.0" }, "regex": { "sha256": "bf7a89eef64b5455835f5ed30254ec19bf41f7541cd94f266ab7cbd463f00c41", @@ -590,10 +596,10 @@ "version": "13.7.1" }, "safetensors": { - "sha256": "d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376", + "sha256": "44d464bdc384874601a177375028012a5f177f1505279f9456fea84bbc575c7f", "type": "url", - "url": "https://files.pythonhosted.org/packages/8f/05/969e1a976b84283285181b00028cf73d78434b77a6627fc2a94194cca265/safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.4.3" + "url": "https://files.pythonhosted.org/packages/18/f3/27bf4d7112b194eea2d8401706953080692d37ace1b74b36fcc7234961cd/safetensors-0.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.4.4" }, "scipy": { "sha256": "42470ea0195336df319741e230626b6225a740fd9dce9642ca13e98f667047c0", @@ -608,10 +614,10 @@ "version": "0.2.0" }, "setuptools": { - "sha256": 
"33874fdc59b3188304b2e7c80d9029097ea31627180896fb549c578ceb8a0855", + "sha256": "5a03e1860cf56bb6ef48ce186b0e557fdba433237481a9a625176c2831be15d1", "type": "url", - "url": "https://files.pythonhosted.org/packages/51/a0/ee460cc54e68afcf33190d198299c9579a5eafeadef0016ae8563237ccb6/setuptools-71.1.0-py3-none-any.whl", - "version": "71.1.0" + "url": "https://files.pythonhosted.org/packages/e1/58/e0ef3b9974a04ce9cde2a7a33881ddcb2d68450803745804545cdd8d258f/setuptools-72.1.0-py3-none-any.whl", + "version": "72.1.0" }, "sh": { "sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d", @@ -686,10 +692,10 @@ "version": "10.2.0.post1" }, "tensorrt-llm": { - "sha256": "fb0615796b865702ec24039ce1df416ecc80082ec0dbc445875fce2de9e73585", + "sha256": "2a13e1b42a8e5f30189d9e55d8e7e8abe90db1d395130ce328ab50748037053e", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.12.0.dev2024072301-cp310-cp310-linux_x86_64.whl", - "version": "0.12.0.dev2024072301" + "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.12.0.dev2024073000-cp310-cp310-linux_x86_64.whl", + "version": "0.12.0.dev2024073000" }, "tokenizers": { "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e", @@ -710,16 +716,16 @@ "version": "2.3.1+cu121" }, "tqdm": { - "sha256": "b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644", + "sha256": "90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd", "type": "url", - "url": "https://files.pythonhosted.org/packages/18/eb/fdb7eb9e48b7b02554e1664afd3bd3f117f6b6d6c5881438a0b055554f9b/tqdm-4.66.4-py3-none-any.whl", - "version": "4.66.4" + "url": "https://files.pythonhosted.org/packages/48/5d/acf5905c36149bbaec41ccf7f2b68814647347b72075ac0b1fe3022fdc73/tqdm-4.66.5-py3-none-any.whl", + "version": "4.66.5" }, "transformers": { - "sha256": "283c8b47cf38640c5c0caea60be0dfa948669fa48e9739b03717cbf5e8b20f11", + "sha256": "ea0ff72def71e9f4812d9414d4803b22681b1617aa6f511bd51cfff2b44a6fca", "type": "url", - "url": "https://files.pythonhosted.org/packages/13/63/cccd0297770d7096c19c99d4c542f3068a30e73cdfd971a920bfa686cb3a/transformers-4.43.2-py3-none-any.whl", - "version": "4.43.2" + "url": "https://files.pythonhosted.org/packages/62/c0/810e741a6244c0f004be40ccb96486d072f042eabbd4d7e8aa02b81ca1eb/transformers-4.44.0-py3-none-any.whl", + "version": "4.44.0" }, "triton": { "sha256": "3c84595cbe5e546b1b290d2a58b1494df5a2ef066dd890655e5b8a8a92205c33", @@ -764,10 +770,10 @@ "version": "2.2.2" }, "uvicorn": { - "sha256": "94a3608da0e530cea8f69683aa4126364ac18e3826b6630d1a65f4638aade503", + "sha256": "b2d86de274726e9878188fa07576c9ceeff90a839e2b6e25c917fe05f5a6c835", "type": "url", - "url": "https://files.pythonhosted.org/packages/63/84/2a26b4eac1cf0c6b5b176dd4346cc4912af5e1b0efc150b865e28636ac34/uvicorn-0.30.3-py3-none-any.whl", - "version": "0.30.3" + "url": "https://files.pythonhosted.org/packages/67/d8/1bcb5e6508d14c6c9912cd964b286f04392298ffb3e4218f4a1292d64e76/uvicorn-0.30.5-py3-none-any.whl", + "version": "0.30.5" }, "uvloop": { "sha256": "5a05128d315e2912791de6088c34136bfcdd0c7cbc1cf85fd6fd1bb321b7c849", @@ -776,10 +782,10 @@ "version": "0.19.0" }, "watchfiles": { - "sha256": "c2444dc7cb9d8cc5ab88ebe792a8d75709d96eeef47f4c8fccb6df7c7bc5be71", + "sha256": "8f48c917ffd36ff9a5212614c2d0d585fa8b064ca7e66206fb5c095015bc8207", "type": "url", - "url": 
"https://files.pythonhosted.org/packages/3d/ae/e7eddbdca559f14a9a38cf04782a5d715cf350aad498d0862fb02b4ebe10/watchfiles-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.22.0" + "url": "https://files.pythonhosted.org/packages/22/ec/c756c012b174ccf5f2ee32202603e66b33b93a54cf16c69a7440c764d7f9/watchfiles-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.23.0" }, "websockets": { "sha256": "6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480", @@ -788,10 +794,10 @@ "version": "12.0" }, "wheel": { - "sha256": "55c570405f142630c6b9f72fe09d9b67cf1477fcf543ae5b8dcb1f5b7377da81", + "sha256": "2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f", "type": "url", - "url": "https://files.pythonhosted.org/packages/7d/cd/d7460c9a869b16c3dd4e1e403cce337df165368c71d6af229a74699622ce/wheel-0.43.0-py3-none-any.whl", - "version": "0.43.0" + "url": "https://files.pythonhosted.org/packages/1b/d1/9babe2ccaecff775992753d8686970b1e2755d21c8a63be73aba7a4e7d77/wheel-0.44.0-py3-none-any.whl", + "version": "0.44.0" }, "wrapt": { "sha256": "ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf", @@ -824,10 +830,10 @@ "version": "5.0" }, "zope-interface": { - "sha256": "d22fce0b0f5715cdac082e35a9e735a1752dc8585f005d045abb1a7c20e197f9", + "sha256": "10ebac566dd0cec66f942dc759d46a994a2b3ba7179420f0e2130f88f8a5f400", "type": "url", - "url": "https://files.pythonhosted.org/packages/64/0a/849dc6346aae1929101174b413517b1105e278bd649c856584944b834208/zope.interface-6.4.post2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "6.4.post2" + "url": "https://files.pythonhosted.org/packages/ef/c2/8c38d60a99ff20c4837866362283e5e7e7b63fd2ab62eee35d0055dab7c3/zope.interface-7.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "7.0.1" } }, "targets": { @@ -841,7 +847,9 @@ "safetensors", "torch" ], + "aiohappyeyeballs": [], "aiohttp": [ + "aiohappyeyeballs", "aiosignal", "async-timeout", "attrs", @@ -1277,5 +1285,5 @@ } } }, - "invalidationHash": "c1ec24857cc6649b3e0f8c8f0695235a6ddfb2507809cc4fd9d451d251e1fe85" + "invalidationHash": "e7e598f1bfb380172adc5cd02e60cdb3bf715610c02bf2b9dfe84e8a53754f0f" } \ No newline at end of file diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index e9ef986..1c8bffe 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -20,14 +20,14 @@ }: stdenv.mkDerivation (o: { pname = "tensorrt_llm"; - version = "0.12.0.dev2024072300"; + version = "0.12.0.dev2024073000"; src = fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT-LLM"; - rev = "5fa9436e17c2f9aeace070f49aa645d2577f676b"; + rev = "a681853d3803ee5893307e812530b5e7004bb6e1"; fetchSubmodules = true; fetchLFS = true; # libtensorrt_llm_batch_manager_static.a - hash = "sha256-Ea15Sp3wzye4UGaTRtc+ByUdxNlNRu6uUefXXuJg78A="; + hash = "sha256-Uvx8+Lhuo8lT4TqKjYSL0Mt/QI8jS5T9kxdsNGKJZzU="; }; outputs = if withPython then diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index 6cbca59..b4d0519 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -49,12 +49,12 @@ let in oldGccStdenv.mkDerivation rec { pname = "tensorrtllm_backend"; - version = "0.12.0.dev2024072301"; + version = "0.12.0.dev2024073000"; src = fetchFromGitHub { owner = "triton-inference-server"; repo = "tensorrtllm_backend"; - rev = "a6aa8eb6ce9371521df166c480e10262cd9c0cf4"; - hash = 
"sha256-oa/OOO3pp1W/J1yqmwytwO0y25dLYixisorRcB42kUU="; + rev = "b25d578a48422db3b2d5bd89b16c235dd85c4300"; + hash = "sha256-UxuMdhkMv89Ozxi4jXioOfR1gf/cYr/bCxt/RG6CdZw="; }; nativeBuildInputs = [ cmake diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py index 7bfddf9..3671c07 100644 --- a/triton_templates/preprocessing/1/model.py +++ b/triton_templates/preprocessing/1/model.py @@ -113,7 +113,7 @@ def initialize(self, args): llm_model_path = os.path.join(llm_model_path, 'config.json') vision_encoder_path = os.path.join(visual_model_path, - 'visual_encoder.engine') + 'model.engine') with open(vision_encoder_path, 'rb') as f: engine_buffer = f.read() diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt index 81aedb2..1974161 100644 --- a/triton_templates/tensorrt_llm/config.pbtxt +++ b/triton_templates/tensorrt_llm/config.pbtxt @@ -35,6 +35,7 @@ model_transaction_policy { dynamic_batching { preferred_batch_size: [ ${triton_max_batch_size} ] max_queue_delay_microseconds: ${max_queue_delay_microseconds} + default_queue_policy: { max_queue_size: ${max_queue_size} } } input [ diff --git a/triton_templates/tensorrt_llm_bls/1/lib/decode.py b/triton_templates/tensorrt_llm_bls/1/lib/decode.py index eb4e1b9..4736a19 100644 --- a/triton_templates/tensorrt_llm_bls/1/lib/decode.py +++ b/triton_templates/tensorrt_llm_bls/1/lib/decode.py @@ -60,7 +60,7 @@ class Request: text_input: np.ndarray = np.array([]) decoder_text_input: np.ndarray = None image_input: Optional[np.ndarray] = None - max_tokens: np.ndarray = np.array([]) + max_tokens: Optional[np.ndarray] = None bad_words: Optional[np.ndarray] = None stop_words: Optional[np.ndarray] = None end_id: Optional[np.ndarray] = None diff --git a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py index 9c8a4e9..a6d4d48 100644 --- a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py +++ b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py @@ -345,26 +345,23 @@ def _get_llm_tensors_from_request( } batch_size = request.text_input.shape[0] tensors = self.create_triton_tensors(request, name_map) + out_len = None if request.max_tokens is not None: + out_len = request.max_tokens[0][0] + if num_output_tokens is not None: + out_len = num_output_tokens + elif draft_request: + if draft_request.draft_input_ids is not None: + out_len = len(draft_request.draft_input_ids[0]) + 1 + else: + out_len = 1 + + if out_len is None: + raise Exception("Could not determine request_output_len") + else: tensors.append( pb_utils.Tensor("request_output_len", - np.array(request.max_tokens, dtype=np.int32))) - else: - out_len = None - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: - tensors.append( - pb_utils.Tensor("request_output_len", - np.array([[out_len]], dtype=np.int32))) + np.array([[out_len]], dtype=np.int32))) if draft_request: if draft_request.draft_input_ids is not None: @@ -377,21 +374,35 @@ def _get_llm_tensors_from_request( pb_utils.Tensor("draft_logits", draft_request.draft_logits)) - return_context_logits = [[False]] * batch_size - return_generation_logits = [[False]] * batch_size + return_context_logits_data = [False] + return_generation_logits_data = 
[False] if draft_request is None: if is_draft_model_request: - return_generation_logits = request.use_draft_logits if request.use_draft_logits is not None else return_generation_logits + return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [ + False + ] else: - return_context_logits = request.return_context_logits if request.return_context_logits is not None else return_context_logits - return_generation_logits = request.return_generation_logits if request.return_generation_logits is not None else return_generation_logits + return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [ + False + ] + return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [ + False + ] + return_context_logits = np.array([return_context_logits_data] * + batch_size, + dtype=bool) + return_generation_logits = np.array([return_generation_logits_data] * + batch_size, + dtype=bool) + + assert len(return_context_logits.shape) == 2 + assert len(return_generation_logits.shape) == 2 tensors.append( - pb_utils.Tensor("return_context_logits", - np.array(return_context_logits))) + pb_utils.Tensor("return_context_logits", return_context_logits)) tensors.append( pb_utils.Tensor("return_generation_logits", - np.array(return_generation_logits))) + return_generation_logits)) return tensors def _get_llm_response(self, triton_output): From b25f0960d932c0c95536ea417368409b99466129 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 7 Aug 2024 15:16:48 +0200 Subject: [PATCH 32/35] update triton_model_repo --- triton_model_repo/tensorrt_llm/config.pbtxt | 1 + 1 file changed, 1 insertion(+) diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt index e2a43c9..b8296c4 100644 --- a/triton_model_repo/tensorrt_llm/config.pbtxt +++ b/triton_model_repo/tensorrt_llm/config.pbtxt @@ -35,6 +35,7 @@ model_transaction_policy { dynamic_batching { preferred_batch_size: [ 64 ] max_queue_delay_microseconds: 100 + default_queue_policy: { max_queue_size: ${max_queue_size} } } input [ From 56557e882f406e39881f7b4f35a75489da14c5a9 Mon Sep 17 00:00:00 2001 From: Yorick van Pelt Date: Wed, 7 Aug 2024 17:49:33 +0200 Subject: [PATCH 33/35] Update triton_model_repo python and configs, remove bls --- triton_config_generator.py | 4 +- triton_model_repo/ensemble/config.pbtxt | 4 +- triton_model_repo/postprocessing/1/model.py | 58 ++- triton_model_repo/preprocessing/1/model.py | 251 ++++++++-- .../tensorrt_llm_bls/1/lib/decode.py | 332 ------------- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 433 ----------------- triton_model_repo/tensorrt_llm_bls/1/model.py | 131 ----- .../tensorrt_llm_bls/config.pbtxt | 264 ---------- triton_templates/ensemble/config.pbtxt | 4 +- triton_templates/postprocessing/1/model.py | 55 +-- triton_templates/preprocessing/1/model.py | 6 +- triton_templates/tensorrt_llm/config.pbtxt | 2 +- .../tensorrt_llm_bls/1/lib/decode.py | 346 ------------- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 458 ------------------ triton_templates/tensorrt_llm_bls/1/model.py | 131 ----- .../tensorrt_llm_bls/config.pbtxt | 264 ---------- 16 files changed, 304 insertions(+), 2439 deletions(-) delete mode 100644 triton_model_repo/tensorrt_llm_bls/1/lib/decode.py delete mode 100644 triton_model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py delete mode 100644 triton_model_repo/tensorrt_llm_bls/1/model.py delete mode 100644 
triton_model_repo/tensorrt_llm_bls/config.pbtxt delete mode 100644 triton_templates/tensorrt_llm_bls/1/lib/decode.py delete mode 100644 triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py delete mode 100644 triton_templates/tensorrt_llm_bls/1/model.py delete mode 100644 triton_templates/tensorrt_llm_bls/config.pbtxt diff --git a/triton_config_generator.py b/triton_config_generator.py index 0a3dc8f..5d904a0 100644 --- a/triton_config_generator.py +++ b/triton_config_generator.py @@ -53,7 +53,7 @@ def get_config_paths(model_config, model): def generate_configs(config): - models = ['preprocessing', 'tensorrt_llm', 'postprocessing', 'ensemble', 'tensorrt_llm_bls'] + models = ['preprocessing', 'tensorrt_llm', 'postprocessing', 'ensemble'] for model in models: if model not in config: @@ -77,4 +77,4 @@ def main(yaml_file): parser.add_argument('yaml_file', help='Path to the YAML configuration file.') args = parser.parse_args() - main(args.yaml_file) \ No newline at end of file + main(args.yaml_file) diff --git a/triton_model_repo/ensemble/config.pbtxt b/triton_model_repo/ensemble/config.pbtxt index 10293bb..b279740 100644 --- a/triton_model_repo/ensemble/config.pbtxt +++ b/triton_model_repo/ensemble/config.pbtxt @@ -185,8 +185,8 @@ input [ ] output [ { - name: "text_output" - data_type: TYPE_STRING + name: "output_ids" + data_type: TYPE_INT32 dims: [ -1 ] }, { diff --git a/triton_model_repo/postprocessing/1/model.py b/triton_model_repo/postprocessing/1/model.py index ac42a0d..e9b0e55 100644 --- a/triton_model_repo/postprocessing/1/model.py +++ b/triton_model_repo/postprocessing/1/model.py @@ -53,19 +53,37 @@ def initialize(self, args): """ # Parse model configs model_config = json.loads(args['model_config']) - # tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - # 'string_value'] - # self.skip_special_tokens = model_config['parameters'].get( - # 'skip_special_tokens', - # {'string_value': "true"})['string_value'].lower() in [ - # 'true', '1', 't', 'y', 'yes' - # ] + tokenizer_dir = model_config['parameters']['tokenizer_dir'][ + 'string_value'] + + skip_special_tokens = model_config['parameters'].get( + 'skip_special_tokens') + if skip_special_tokens is not None: + skip_special_tokens_str = skip_special_tokens[ + 'string_value'].lower() + if skip_special_tokens_str in [ + 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + ]: + self.skip_special_tokens = skip_special_tokens_str in [ + 'true', '1', 't', 'y', 'yes' + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." + ) + self.skip_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." 
+ ) + self.skip_special_tokens = True # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, # legacy=False, # padding_side='left', # trust_remote_code=True) - # self.tokenizer.pad_token = self.tokenizer.eos_token + # if not self.tokenizer.pad_token: + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( @@ -124,6 +142,10 @@ def execute(self, requests): generation_logits = pb_utils.get_input_tensor_by_name( request, 'GENERATION_LOGITS') + # Get the batch index + batch_index = pb_utils.get_input_tensor_by_name( + request, 'BATCH_INDEX') + # Reshape Input # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) # tokens_batch = tokens_batch.T @@ -135,7 +157,8 @@ def execute(self, requests): # objects to create pb_utils.InferenceResponse. output_tensor = pb_utils.Tensor( 'OUTPUT', - tokens_batch) + tokens_batch + ) outputs = [] outputs.append(output_tensor) @@ -179,6 +202,15 @@ def execute(self, requests): np.array([[[[0.0]]]], dtype=np.float32)) outputs.append(out_generation_logits) + if batch_index: + out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX', + batch_index.as_numpy()) + outputs.append(out_batch_index) + else: + out_batch_index = pb_utils.Tensor( + 'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32)) + outputs.append(out_batch_index) + # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. # Below is an example of how you can set errors in inference @@ -206,8 +238,14 @@ def finalize(self): # for batch_idx, beam_tokens in enumerate(tokens_batch): # for beam_idx, tokens in enumerate(beam_tokens): # seq_len = sequence_lengths[batch_idx][beam_idx] + # # Exclude fake ids in multimodal models + # fake_id_len = 0 + # for i in range(seq_len): + # if tokens[i] < self.tokenizer.vocab_size: + # fake_id_len = i + # break # output = self.tokenizer.decode( - # tokens[:seq_len], + # tokens[fake_id_len:seq_len], # skip_special_tokens=self.skip_special_tokens) # outputs.append(output.encode('utf8')) # return outputs diff --git a/triton_model_repo/preprocessing/1/model.py b/triton_model_repo/preprocessing/1/model.py index 62ab243..7e8f677 100644 --- a/triton_model_repo/preprocessing/1/model.py +++ b/triton_model_repo/preprocessing/1/model.py @@ -25,10 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import json +import os from typing import List import numpy as np +# import tensorrt as trt +# import torch import triton_python_backend_utils as pb_utils +# from torch.utils.dlpack import from_dlpack from transformers import AutoTokenizer, T5Tokenizer @@ -56,11 +60,32 @@ def initialize(self, args): model_config = json.loads(args['model_config']) tokenizer_dir = model_config['parameters']['tokenizer_dir'][ 'string_value'] - self.add_special_tokens = model_config['parameters'].get( - 'add_special_tokens', - {'string_value': "false"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] + + add_special_tokens = model_config['parameters'].get( + 'add_special_tokens') + visual_model_path = model_config['parameters']['visual_model_path'][ + 'string_value'] + if visual_model_path == "${visual_model_path}" or visual_model_path == "": + visual_model_path = None + + if add_special_tokens is not None: + add_special_tokens_str = add_special_tokens['string_value'].lower() + if add_special_tokens_str in [ + 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + ]: + self.add_special_tokens = add_special_tokens_str in [ + 'true', '1', 't', 'y', 'yes' + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." + ) + self.add_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." + ) + self.add_special_tokens = True self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, legacy=False, @@ -68,17 +93,60 @@ def initialize(self, args): trust_remote_code=True) if isinstance(self.tokenizer, T5Tokenizer): self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - self.tokenizer.pad_token = self.tokenizer.eos_token + + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer_end_id = self.tokenizer.encode( self.tokenizer.eos_token, add_special_tokens=False)[0] self.tokenizer_pad_id = self.tokenizer.encode( self.tokenizer.pad_token, add_special_tokens=False)[0] + self.visual_engine = None + self.visual_context = None + self.stream = None + self.vocab_size = None + self.dtype = None + if visual_model_path is not None: + llm_model_path = model_config['parameters']['gpt_model_path'][ + 'string_value'] + llm_model_path = os.path.join(llm_model_path, 'config.json') + + vision_encoder_path = os.path.join(visual_model_path, + 'model.engine') + with open(vision_encoder_path, 'rb') as f: + engine_buffer = f.read() + + self.stream = torch.cuda.Stream() + torch.cuda.set_stream(self.stream) + + trt_logger = trt.Logger(trt.Logger.WARNING) + visual_runtime = trt.Runtime(trt_logger) + if engine_buffer is not None: + self.visual_engine = visual_runtime.deserialize_cuda_engine( + engine_buffer) + self.visual_context = self.visual_engine.create_execution_context() + self.visual_context.set_optimization_profile_async( + 0, self.stream.cuda_stream) + + assert self.visual_engine.get_tensor_dtype( + 'input' + ) == trt.float16 and self.visual_engine.get_tensor_dtype( + 'output' + ) == trt.float16 and self.visual_engine.num_io_tensors == 2, "Please use the model built in examples/multimodal." 
+ + self.stream.synchronize() + + with open(llm_model_path, 'r') as f: + llm_model_config = json.load(f) + self.vocab_size = int( + llm_model_config["pretrained_config"]["vocab_size"]) + # Parse model output configs and convert Triton types to numpy types output_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", - "OUT_END_ID", "OUT_PAD_ID" + "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", + "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_EMBEDDING_TABLE" ] input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] for input_name in input_names: @@ -126,16 +194,33 @@ def execute(self, requests): # Get input tensors query = pb_utils.get_input_tensor_by_name(request, 'QUERY').as_numpy() - batch_dim = query.shape[0] - if batch_dim != 1: + batch_size = query.shape[0] + + decoder_query = pb_utils.get_input_tensor_by_name( + request, 'DECODER_QUERY') + if decoder_query is not None: + decoder_query = decoder_query.as_numpy() + + image = pb_utils.get_input_tensor_by_name(request, 'IMAGE') + if image is not None: + image = from_dlpack(image.to_dlpack()).cuda().half() + if self.visual_engine is None: + err_str = "Images cannot be processed without a vision model." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue - err_str = "Inflight batching backend expects requests with batch size of 1." - logger.log_error(err_str) - responses.append( - pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(err_str))) - continue + if image.shape[0] != batch_size: + err_str = "Query and Image have different batch sizes." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue request_output_len = pb_utils.get_input_tensor_by_name( request, 'REQUEST_OUTPUT_LEN').as_numpy() @@ -160,13 +245,65 @@ def execute(self, requests): if embedding_bias_weights is not None: embedding_bias_weights = embedding_bias_weights.as_numpy() + prompt_embedding_table_tensor = pb_utils.get_input_tensor_by_name( + request, 'PROMPT_EMBEDDING_TABLE') + if prompt_embedding_table_tensor is not None: + prompt_embedding_table = prompt_embedding_table_tensor.as_numpy( + ) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', prompt_embedding_table) + + if image is not None and prompt_embedding_table_tensor is not None: + + err_str = "Image and prompt table cannot be provided simultaneously." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + + visual_output = None + if image is not None: + ok = self.visual_context.set_input_shape('input', image.shape) + if not ok: + err_str = "Image has wrong shape." 
+ logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.visual_context.set_tensor_address('input', + image.data_ptr()) + + visual_output_shape = self.visual_context.get_tensor_shape( + 'output') + visual_output = torch.empty(tuple(visual_output_shape), + dtype=torch.float16, + device=image.device) + self.visual_context.set_tensor_address( + 'output', visual_output.data_ptr()) + + ok = self.visual_context.execute_async_v3( + self.stream.cuda_stream) + if not ok: + err_str = "Runtime execution failed for vision encoder model." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.stream.synchronize() + # Take the end_id from the input tensors # If not specified, use tokenizer to get end_id end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID') if end_id is not None: end_id = end_id.as_numpy() else: - end_id = [[self.tokenizer_end_id]] + end_id = [[self.tokenizer_end_id]] * batch_size # Take the pad_id from the input tensors # If not specified, use tokenizer to get pad_id @@ -174,16 +311,31 @@ def execute(self, requests): if pad_id is not None: pad_id = pad_id.as_numpy() else: - pad_id = [[self.tokenizer_pad_id]] + pad_id = [[self.tokenizer_pad_id]] * batch_size # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) + input_id, request_input_len = self._create_request( + query, visual_output) + if decoder_query is not None: + decoder_input_id, request_decoder_input_len = self._create_request( + decoder_query) + else: + decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32) + request_decoder_input_len = 1 * np.ones( + (batch_size, 1), np.int32) + + bad_words = self._to_word_list_format(bad_words_dict, batch_size) + stop_words = self._to_word_list_format(stop_words_dict, batch_size) embedding_bias = self._get_embedding_bias( embedding_bias_words, embedding_bias_weights, - self.embedding_bias_weights_dtype) + self.embedding_bias_weights_dtype, batch_size) + + if image is not None: + prompt_table = np.array(visual_output.cpu()) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', + prompt_table.astype(self.out_prompt_embedding_table_dtype)) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. 
@@ -192,6 +344,13 @@ def execute(self, requests): request_input_len_tensor = pb_utils.Tensor( 'REQUEST_INPUT_LEN', request_input_len.astype(self.request_input_len_dtype)) + decoder_input_id_tensor = pb_utils.Tensor( + 'DECODER_INPUT_ID', + decoder_input_id.astype(self.decoder_input_id_dtype)) + request_decoder_input_len_tensor = pb_utils.Tensor( + 'REQUEST_DECODER_INPUT_LEN', + request_decoder_input_len.astype( + self.request_decoder_input_len_dtype)) request_output_len_tensor = pb_utils.Tensor( 'REQUEST_OUTPUT_LEN', request_output_len) bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) @@ -204,11 +363,27 @@ def execute(self, requests): pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID', np.array(pad_id, dtype=np.int32)) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor, - embedding_bias_tensor, end_id_tensor, pad_id_tensor - ]) + if prompt_embedding_table_tensor is not None: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor, + prompt_embedding_table_tensor + ]) + else: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor + ]) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. 
Length @@ -222,7 +397,7 @@ def finalize(self): """ print('Cleaning up...') - def _create_request(self, query): + def _create_request(self, query, visual_features): """ query : batch string (2D numpy array) """ @@ -240,6 +415,14 @@ def _create_request(self, query): add_special_tokens=self.add_special_tokens)).astype( int) for s in query ] + if visual_features is not None: + fake_prompt_id = np.arange( + self.vocab_size, self.vocab_size + visual_features.shape[1]) + start_ids = [ + np.concatenate((fake_prompt_id, ids), axis=0) + for ids in start_ids + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) max_len = 0 @@ -254,7 +437,8 @@ def _create_request(self, query): return start_ids, start_lengths - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): + def _to_word_list_format(self, word_lists: List[List[str | bytes]], + batch_size): ''' word_lists format: len(word_lists) == batch_size @@ -264,7 +448,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if word_lists is None: # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") + return np.empty([batch_size, 2, 0], dtype="int32") flat_ids = [] offsets = [] @@ -298,12 +482,13 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): (1, 0, 2)) def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights, - bias_dtype): + bias_dtype, batch_size): assert self.tokenizer != None, "need to set tokenizer" if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) + return np.empty([batch_size, 0], + dtype=self.embedding_bias_weights_dtype) batch_embedding_bias = [] for words, weights in zip(embedding_bias_words, diff --git a/triton_model_repo/tensorrt_llm_bls/1/lib/decode.py b/triton_model_repo/tensorrt_llm_bls/1/lib/decode.py deleted file mode 100644 index aa2a6d5..0000000 --- a/triton_model_repo/tensorrt_llm_bls/1/lib/decode.py +++ /dev/null @@ -1,332 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Generator -from dataclasses import dataclass -from typing import Optional - -import numpy as np - - -class RequestValidationError(Exception): - pass - - -def _validate_that(condition: bool, msg: str): - if not condition: - raise RequestValidationError(msg) - - -def _validate_non_empty(data, msg: str): - _validate_that(data is not None and data.size > 0, msg) - - -def _validate_single_gt_0(data, msg: str): - _validate_non_empty(data, msg) - _validate_that(data.flatten()[0] > 0, msg) - - -def _single_value(data: Optional[np.ndarray]): - if data is None: - return None - return data.flatten()[0] - - -@dataclass -class Request: - text_input: np.ndarray = np.array([]) - max_tokens: np.ndarray = np.array([]) - bad_words: Optional[np.ndarray] = None - stop_words: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - top_k: Optional[np.ndarray] = None - top_p: Optional[np.ndarray] = None - temperature: Optional[np.ndarray] = None - length_penalty: Optional[np.ndarray] = None - repetition_penalty: Optional[np.ndarray] = None - min_length: Optional[np.ndarray] = None - return_log_probs: Optional[np.ndarray] = None - prompt_embedding_table: Optional[np.ndarray] = None - prompt_vocab_size: Optional[np.ndarray] = None - embedding_bias_words: Optional[np.ndarray] = None - embedding_bias_weights: Optional[np.ndarray] = None - num_draft_tokens: Optional[np.ndarray] = None - use_draft_logits: Optional[np.ndarray] = None - stream: Optional[np.ndarray] = None - beam_width: Optional[np.ndarray] = None - return_context_logits: Optional[np.ndarray] = None - return_generation_logits: Optional[np.ndarray] = None - random_seed: Optional[np.ndarray] = None - presence_penalty: Optional[np.ndarray] = None - frequency_penalty: Optional[np.ndarray] = None - - def validate(self): - _validate_non_empty(self.text_input, "text_input is required") - _validate_single_gt_0(self.max_tokens, - "max_tokens must be a single value > 0") - - num_draft_tokens = _single_value(self.num_draft_tokens) - stream = _single_value(self.stream) - gen_logits = _single_value(self.return_generation_logits) - context_logits = _single_value(self.return_context_logits) - - if num_draft_tokens: - _validate_that( - not stream, - "streaming is not supported with speculative decoding") - _validate_that( - not gen_logits, - "generation logits are not supported with speculative decoding" - ) - _validate_that( - not context_logits, - "context logits are not supported with speculative decoding") - - -@dataclass -class DraftRequest: - draft_input_ids: Optional[np.ndarray] = None - draft_logits: Optional[np.ndarray] = None - - -@dataclass -class PreprocResponse: - input_ids: np.ndarray = np.array([]) - input_lengths: np.ndarray = np.array([]) - bad_words_list: Optional[np.ndarray] = None - stop_words_list: Optional[np.ndarray] = None - embedding_bias: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None 
- - @classmethod - def with_new_inputs(cls, - other, - input_ids: Optional[np.ndarray] = None, - input_lengths: Optional[np.ndarray] = None): - return cls( - input_ids=(input_ids - if input_ids is not None else other.input_ids), - input_lengths=(input_lengths if input_lengths is not None else - other.input_lengths), - bad_words_list=other.bad_words_list, - stop_words_list=other.stop_words_list, - end_id=other.end_id, - pad_id=other.pad_id, - ) - - -@dataclass -class GenerationResponse: - output_ids: np.ndarray = np.array([]) - sequence_length: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - -@dataclass -class Response: - text_output: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - def __eq__(self, o) -> bool: - """Just for testing""" - if not isinstance(o, Response): - return False - return (np.array_equal(self.text_output, o.text_output) - and np.array_equal(self.cum_log_probs, o.cum_log_probs) - and np.array_equal(self.output_log_probs, o.output_log_probs) - and np.array_equal(self.context_logits, o.context_logits) and - np.array_equal(self.generation_logits, o.generation_logits)) - - -class Decoder: - - def __init__(self, streaming=False, accumulate=False): - self._streaming = streaming - self._accumulate = accumulate - - self._accumulated_tokens = None - - def decode(self, - request: Request, - speculative_decoding=False) -> Generator[Response, None, None]: - preproc_response = self.preprocess(request) - - if speculative_decoding: - for gen_response in self._spec_generate(preproc_response, request): - yield self.postprocess(gen_response) - else: - if not self._streaming: - gen_response = self._generate_non_streaming( - preproc_response, request) - yield self.postprocess(gen_response) - else: - for gen_response in self._generate(preproc_response, request): - yield self.postprocess(gen_response) - - def encountered_stop_words(self, input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids): - return True - return False - - def _spec_generate( - self, preproc: PreprocResponse, - request: Request) -> Generator[GenerationResponse, None, None]: - - prompt_input_ids: np.ndarray = preproc.input_ids[0] - input_ids: np.ndarray = prompt_input_ids - output_len: int = request.max_tokens[0][0] - last_input_ids: np.ndarray = None - draft_output_ids: np.ndarray = None - draft_logits: np.ndarray = None - - target_response: GenerationResponse = None - - cur_preproc = preproc - - counter = 0 - while True: - counter += 1 - num_draft_tokens = min( - request.num_draft_tokens[0][0], - len(prompt_input_ids) + output_len - len(input_ids) - 1) - - draft_request = None - if num_draft_tokens > 0: - draft_response: GenerationResponse = self._draft_generate_non_streaming( - cur_preproc, request, num_draft_tokens) - seq_len: int = draft_response.sequence_length[0][0] - # [1, beamWidth, outputLength] -> [outputLen] - draft_output_ids = draft_response.output_ids[0][0] - # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] - if request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - if draft_response.generation_logits is not None: - draft_logits = 
draft_response.generation_logits[0][0] - - input_draft_tokens = draft_output_ids[len(input_ids):seq_len] - draft_request = DraftRequest( - draft_input_ids=np.expand_dims(input_draft_tokens, 0)) - if request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - draft_request.draft_logits = np.expand_dims( - draft_logits[-len(input_draft_tokens):], 0) - else: - draft_request = DraftRequest() - target_response = self._generate_non_streaming( - cur_preproc, request, draft_request) - last_input_ids = input_ids - input_ids = target_response.output_ids[0][0] - cur_preproc = PreprocResponse.with_new_inputs( - cur_preproc, np.expand_dims(input_ids, 0), - np.array([[len(input_ids)]], dtype=np.int32)) - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = (len(input_ids) >= - len(prompt_input_ids) + output_len) - if length_stop: - break - # If draft and target have same outputs, should stop. Normally target should return 1 more token. - # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, input_ids) - if target_draft_equal: - break - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - if last_current_equal: - break - # Need to check if stop words was encountered - hit_stop_words = self.encountered_stop_words( - input_ids, preproc.stop_words_list[0]) - if hit_stop_words: - break - - yield target_response - - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - raise NotImplementedError() - - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - raise NotImplementedError() - - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - raise NotImplementedError() - - def postprocess(self, gen_response: GenerationResponse) -> Response: - if self._accumulate and self._streaming: - new_tokens: np.ndarray = gen_response.output_ids - if new_tokens.ndim != 3: - raise Exception("Expected output_ids tensor to have 3 dims.") - if new_tokens.shape[0] != 1: - raise Exception("Expected batch size of 1") - if new_tokens.shape[1] != 1: - raise Exception( - "Accumulation of tokens is only implemented for beam width = 1" - ) - - self._accumulated_tokens = new_tokens if ( - self._accumulated_tokens is None) else np.concatenate( - (self._accumulated_tokens, new_tokens), axis=2) - sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]], - dtype=np.int32) - return self._postprocess(self._accumulated_tokens, - sequence_lengths, gen_response) - else: - return self._postprocess(gen_response.output_ids, None, - gen_response) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - raise NotImplementedError() - - def preprocess(self, request: Request) -> PreprocResponse: - raise NotImplementedError() - - def reset_decoder(self): - self._accumulated_tokens = None diff --git a/triton_model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py b/triton_model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py deleted file mode 100644 index f0df3b8..0000000 --- 
a/triton_model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py +++ /dev/null @@ -1,433 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Callable -from typing import Dict, Optional - -import numpy as np -import triton_python_backend_utils as pb_utils -from lib.decode import * -from typing_extensions import override - - -class TritonDecoder(Decoder): - - def __init__(self, - streaming=False, - accumulate=False, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name="tensorrt_llm", - draft_llm_model_name: Optional[str] = None): - super().__init__(streaming=streaming, accumulate=accumulate) - self.preproc_model_name = preproc_model_name - self.postproc_model_name = postproc_model_name - self.llm_model_name = llm_model_name - self.draft_llm_model_name = draft_llm_model_name - - self._preproc_outputs = [ - "INPUT_ID", - "REQUEST_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "EMBEDDING_BIAS", - "OUT_PAD_ID", - "OUT_END_ID", - ] - - self._llm_outputs = [ - "output_ids", - "sequence_length", - "cum_log_probs", - "output_log_probs", - "context_logits", - "generation_logits", - ] - - self._postproc_outputs = [ - "OUTPUT", - ] - - self.input_names = [ - "text_input", - "max_tokens", - "bad_words", - "stop_words", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_embedding_table", - "prompt_vocab_size", - "embedding_bias_words", - "embedding_bias_weights", - "num_draft_tokens", - "use_draft_logits", - ] - - self.__undo_reshape_whitelist = { - "max_tokens", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - 
"return_generation_logits", - "beam_width", - "stream", - "prompt_vocab_size", - "num_draft_tokens", - "use_draft_logits", - } - - def _exec_triton_request(self, request): - responses = request.exec(decoupled=True) - for r in responses: - if r.has_error(): - raise pb_utils.TritonModelException(r.error().message()) - yield r - - def _exec_triton_request_single(self, request): - responses = request.exec(decoupled=False) - if responses.has_error(): - raise pb_utils.TritonModelException(responses.error().message()) - return responses - - def create_triton_response(self, response: Response): - name_map = { - "text_output": "text_output", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits" - } - tensors = self.create_triton_tensors(response, name_map) - return pb_utils.InferenceResponse(output_tensors=tensors) - - def convert_triton_request(self, triton_request) -> Request: - request = Request() - for triton_name in self.input_names: - tensor = pb_utils.get_input_tensor_by_name(triton_request, - triton_name) - target_name = triton_name - if tensor is None: - continue - if not hasattr(request, target_name): - raise AttributeError( - f"Request has no attribute '{target_name}'") - setattr(request, target_name, tensor.as_numpy()) - return request - - def convert_triton_response(self, - triton_response, - response_factory: Callable, - name_map=None): - response = response_factory() - for tensor in triton_response.output_tensors(): - if tensor is None: - continue - triton_name = tensor.name() - value = tensor.as_numpy() - target_name = triton_name - if name_map and triton_name in name_map: - target_name = name_map[triton_name] - if name_map and not triton_name in name_map: - continue - if target_name is None: - # explicitly ignore this triton input - continue - if not hasattr(response, target_name): - raise AttributeError( - f"response object has not attribute '{target_name}'") - setattr(response, target_name, value) - return response - - def __undo_reshape(self, x, name): - if name in self.__undo_reshape_whitelist and len(x.shape) == 1: - # handle reshapes - return np.expand_dims(x, 0) - else: - return x - - def create_triton_tensors(self, obj, name_map: dict): - tensors = [] - for name, triton_name in name_map.items(): - if triton_name is None: - continue - value = getattr(obj, name) - if value is None: - continue - t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) - tensors.append(t) - return tensors - - @override - def preprocess(self, request: Request) -> PreprocResponse: - input_tensors = self._get_preproc_tensors(request) - triton_req = pb_utils.InferenceRequest( - model_name=self.preproc_model_name, - inputs=input_tensors, - requested_output_names=self._preproc_outputs) - triton_output = self._exec_triton_request_single(triton_req) - return self._get_preproc_response(triton_output) - - def _get_preproc_tensors(self, request: Request): - name_map = { - "text_input": "QUERY", - "max_tokens": "REQUEST_OUTPUT_LEN", - "bad_words": "BAD_WORDS_DICT", - "stop_words": "STOP_WORDS_DICT", - "embedding_bias_words": "EMBEDDING_BIAS_WORDS", - "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", - "pad_id": "PAD_ID", - "end_id": "END_ID", - } - return self.create_triton_tensors(request, name_map) - - def _get_preproc_response(self, triton_output): - name_map = { - "INPUT_ID": "input_ids", - "REQUEST_INPUT_LEN": "input_lengths", - "BAD_WORDS_IDS": "bad_words_list", - "STOP_WORDS_IDS": 
"stop_words_list", - "EMBEDDING_BIAS": "embedding_bias", - "OUT_PAD_ID": "pad_id", - "OUT_END_ID": "end_id", - } - return self.convert_triton_response(triton_output, PreprocResponse, - name_map) - - @override - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, - num_draft_tokens, None, True) - triton_req = pb_utils.InferenceRequest( - model_name=self.draft_llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - triton_response = self._exec_triton_request_single(triton_req) - llm_response = self._get_llm_response(triton_response) - return llm_response - - @override - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - for r in self._exec_triton_request(triton_req): - yield self._get_llm_response(r) - - @override - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - r = self._exec_triton_request_single(triton_req) - return self._get_llm_response(r) - - def _get_llm_tensors(self, - preproc: PreprocResponse, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - tensors = [] - tensors.extend(self._get_tensors_from_preproc(preproc)) - tensors.extend( - self._get_llm_tensors_from_request(request, num_output_tokens, - draft_request, - is_draft_model_request)) - return tensors - - def _get_tensors_from_preproc(self, preproc: PreprocResponse): - name_map = { - "input_ids": "input_ids", - "input_lengths": "input_lengths", - "bad_words_list": "bad_words_list", - "stop_words_list": "stop_words_list", - "embedding_bias": "embedding_bias", - "pad_id": "pad_id", - "end_id": "end_id", - } - return self.create_triton_tensors(preproc, name_map) - - def _get_llm_tensors_from_request( - self, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - name_map: Dict[str, Optional[str]] = { - "beam_width": "beam_width", - "top_k": "runtime_top_k", - "top_p": "runtime_top_p", - "length_penalty": "len_penalty", - "repetition_penalty": "repetition_penalty", - "min_length": "min_length", - "presence_penalty": "presence_penalty", - "frequency_penalty": "frequency_penalty", - "random_seed": "random_seed", - "return_log_probs": "return_log_probs", - "stream": "streaming", - "prompt_embedding_table": "prompt_embedding_table", - "prompt_vocab_size": "prompt_vocab_size", - } - tensors = self.create_triton_tensors(request, name_map) - - out_len = request.max_tokens[0][0] if request.max_tokens else None - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - 
out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: - tensors.append( - pb_utils.Tensor("request_output_len", - np.array([[out_len]], dtype=np.int32))) - - if draft_request: - if draft_request.draft_input_ids is not None: - tensors.append( - pb_utils.Tensor("draft_input_ids", - draft_request.draft_input_ids)) - if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - tensors.append( - pb_utils.Tensor("draft_logits", - draft_request.draft_logits)) - - return_context_logits = False - return_generation_logits = False - if draft_request is None: - if is_draft_model_request: - return_generation_logits = request.use_draft_logits[ - 0] if request.use_draft_logits is not None else False - else: - return_context_logits = request.return_context_logits[ - 0] if request.return_context_logits is not None else False - return_generation_logits = request.return_generation_logits[ - 0] if request.return_generation_logits is not None else False - - tensors.append( - pb_utils.Tensor("return_context_logits", - np.array([[return_context_logits]]))) - tensors.append( - pb_utils.Tensor("return_generation_logits", - np.array([[return_generation_logits]]))) - return tensors - - def _get_llm_response(self, triton_output): - name_map = { - "output_ids": "output_ids", - "sequence_length": "sequence_length", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits", - } - return self.convert_triton_response(triton_output, GenerationResponse, - name_map) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - input_tensors = self._get_postproc_tensors(tokens, sequence_lengths, - gen_response) - triton_req = pb_utils.InferenceRequest( - model_name=self.postproc_model_name, - inputs=input_tensors, - requested_output_names=self._postproc_outputs) - r = self._exec_triton_request_single(triton_req) - response = self._get_response(r, gen_response) - return response - - def _get_postproc_tensors(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse): - tensors = [ - pb_utils.Tensor("TOKENS_BATCH", tokens), - pb_utils.Tensor( - "SEQUENCE_LENGTH", sequence_lengths - if sequence_lengths else gen_response.sequence_length) - ] - return tensors - - def _get_response(self, triton_output, gen_res: GenerationResponse): - tensors = triton_output.output_tensors() - t_map = {} - for named_t in tensors: - name = named_t.name() - t = named_t.as_numpy() - t_map[name] = t - response = Response(text_output=t_map["OUTPUT"], - cum_log_probs=gen_res.cum_log_probs, - output_log_probs=gen_res.output_log_probs, - context_logits=gen_res.context_logits, - generation_logits=gen_res.generation_logits) - return response diff --git a/triton_model_repo/tensorrt_llm_bls/1/model.py b/triton_model_repo/tensorrt_llm_bls/1/model.py deleted file mode 100644 index 609e323..0000000 --- a/triton_model_repo/tensorrt_llm_bls/1/model.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import traceback - -import triton_python_backend_utils as pb_utils -from lib.triton_decoder import TritonDecoder - - -class TritonPythonModel: - - def initialize(self, args): - - # Parse model configs - model_config = json.loads(args['model_config']) - - params = model_config['parameters'] - - accumulate_tokens_str = '' - if 'accumulate_tokens' in params: - accumulate_tokens_str = params['accumulate_tokens']['string_value'] - - self.accumulate_tokens = accumulate_tokens_str.lower() in [ - 'true', 'yes', '1', 't' - ] - - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - - self.logger = pb_utils.Logger - - self.llm_model_name = "tensorrt_llm" - if "tensorrt_llm_model_name" in params: - self.llm_model_name = params["tensorrt_llm_model_name"][ - "string_value"] - self.draft_llm_model_name = None - if "tensorrt_llm_draft_model_name" in params: - self.draft_llm_model_name = params[ - "tensorrt_llm_draft_model_name"]["string_value"] - - self.decoder = TritonDecoder( - streaming=self.decoupled, - accumulate=self.accumulate_tokens, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name=self.llm_model_name, - draft_llm_model_name=self.draft_llm_model_name) - - def execute(self, requests): - - responses = [] - - for request in requests: - if self.decoupled: - response_sender = request.get_response_sender() - try: - - req = self.decoder.convert_triton_request(request) - req.validate() - speculative_decode = (req.num_draft_tokens is not None - and req.num_draft_tokens[0][0] > 0) - if speculative_decode and (self.draft_llm_model_name is None - or self.draft_llm_model_name == ""): - raise Exception( - "cannot perform speculative decoding without draft model" - ) - res_gen = self.decoder.decode( - req, speculative_decoding=speculative_decode) - - for res in res_gen: - triton_response = self.decoder.create_triton_response(res) - if self.decoupled: - response_sender.send(triton_response) - else: - responses.append(triton_response) - - if self.decoupled: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - - except Exception: - self.logger.log_error(traceback.format_exc()) - # If encountering an error, send a response with err msg - error_response = pb_utils.InferenceResponse( - output_tensors=[], - 
error=pb_utils.TritonError(traceback.format_exc())) - - if self.decoupled: - response_sender.send(error_response) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - responses.append(error_response) - - self.decoder.reset_decoder() - if self.decoupled: - return None - else: - assert len(responses) == len(requests) - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') diff --git a/triton_model_repo/tensorrt_llm_bls/config.pbtxt b/triton_model_repo/tensorrt_llm_bls/config.pbtxt deleted file mode 100644 index 45c9460..0000000 --- a/triton_model_repo/tensorrt_llm_bls/config.pbtxt +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "tensorrt_llm_bls" -backend: "python" -max_batch_size: 64 - -model_transaction_policy { - decoupled: True -} - -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ 1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - }, - { - name: "image_input" - data_type: TYPE_FP16 - dims: [ 3, 224, 224 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "num_draft_tokens", - data_type: TYPE_INT32, - dims: [ 1 ] - optional: true - }, - { - name: "use_draft_logits", - data_type: TYPE_BOOL, - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - }, - { - name: "batch_index" - data_type: TYPE_INT32 - dims: [ 1 ] - } -] - -parameters: { - key: "accumulate_tokens" - value: { - string_value: "true" - } -} -parameters: { - key: "tensorrt_llm_model_name" - value: { - string_value: "${tensorrt_llm_model_name}" - } -} -parameters: { - key: "tensorrt_llm_draft_model_name" - value: { - string_value: "${tensorrt_llm_draft_model_name}" - } -} - -instance_group [ - { - count: 64 - kind : KIND_CPU - } -] diff --git 
a/triton_templates/ensemble/config.pbtxt b/triton_templates/ensemble/config.pbtxt
index 4f4a245..74dd3ab 100644
--- a/triton_templates/ensemble/config.pbtxt
+++ b/triton_templates/ensemble/config.pbtxt
@@ -185,8 +185,8 @@ input [
 ]
 output [
   {
-    name: "text_output"
-    data_type: TYPE_STRING
+    name: "output_ids"
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py
index 9c68429..e9b0e55 100644
--- a/triton_templates/postprocessing/1/model.py
+++ b/triton_templates/postprocessing/1/model.py
@@ -53,37 +53,37 @@ def initialize(self, args):
         """
         # Parse model configs
         model_config = json.loads(args['model_config'])
-        # tokenizer_dir = model_config['parameters']['tokenizer_dir'][
-        #     'string_value']
-
-        # skip_special_tokens = model_config['parameters'].get(
-        #     'skip_special_tokens')
-        # if skip_special_tokens is not None:
-        #     skip_special_tokens_str = skip_special_tokens[
-        #         'string_value'].lower()
-        #     if skip_special_tokens_str in [
-        #             'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
-        #     ]:
-        #         self.skip_special_tokens = skip_special_tokens_str in [
-        #             'true', '1', 't', 'y', 'yes'
-        #         ]
-        #     else:
-        #         print(
-        #             f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
-        #         )
-        #         self.skip_special_tokens = True
-        # else:
-        #     print(
-        #         f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
-        #     )
-        #     self.skip_special_tokens = True
+        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
+            'string_value']
+
+        skip_special_tokens = model_config['parameters'].get(
+            'skip_special_tokens')
+        if skip_special_tokens is not None:
+            skip_special_tokens_str = skip_special_tokens[
+                'string_value'].lower()
+            if skip_special_tokens_str in [
+                    'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
+            ]:
+                self.skip_special_tokens = skip_special_tokens_str in [
+                    'true', '1', 't', 'y', 'yes'
+                ]
+            else:
+                print(
+                    f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
+                )
+                self.skip_special_tokens = True
+        else:
+            print(
+                f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
+            )
+            self.skip_special_tokens = True
 
         # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
         #                                                legacy=False,
         #                                                padding_side='left',
         #                                                trust_remote_code=True)
         # if not self.tokenizer.pad_token:
-        #     self.tokenizer.pad_token = self.tokenizer.eos_token
+        #     self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
@@ -151,13 +151,14 @@ def execute(self, requests):
             # tokens_batch = tokens_batch.T
 
             # Postprocessing output data.
-            outputs = self._postprocessing(tokens_batch, sequence_lengths)
+            # outputs = self._postprocessing(tokens_batch, sequence_lengths)
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             output_tensor = pb_utils.Tensor(
                 'OUTPUT',
-                np.array(outputs).astype(self.output_dtype))
+                tokens_batch
+                )
 
             outputs = []
             outputs.append(output_tensor)
diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py
index 3671c07..7e8f677 100644
--- a/triton_templates/preprocessing/1/model.py
+++ b/triton_templates/preprocessing/1/model.py
@@ -29,10 +29,10 @@ from typing import List
 
 import numpy as np
-import tensorrt as trt
-import torch
+# import tensorrt as trt
+# import torch
 import triton_python_backend_utils as pb_utils
-from torch.utils.dlpack import from_dlpack
+# from torch.utils.dlpack import from_dlpack
 from transformers import AutoTokenizer, T5Tokenizer
 
diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt
index 1974161..1de3659 100644
--- a/triton_templates/tensorrt_llm/config.pbtxt
+++ b/triton_templates/tensorrt_llm/config.pbtxt
@@ -25,7 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 name: "tensorrt_llm"
-backend: "tensorrtllm"
+backend: "${triton_backend}"
 max_batch_size: ${triton_max_batch_size}
 
 model_transaction_policy {
diff --git a/triton_templates/tensorrt_llm_bls/1/lib/decode.py b/triton_templates/tensorrt_llm_bls/1/lib/decode.py
deleted file mode 100644
index 4736a19..0000000
--- a/triton_templates/tensorrt_llm_bls/1/lib/decode.py
+++ /dev/null
@@ -1,346 +0,0 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- -from collections.abc import Generator -from dataclasses import dataclass -from typing import Optional - -import numpy as np - - -class RequestValidationError(Exception): - pass - - -def _validate_that(condition: bool, msg: str): - if not condition: - raise RequestValidationError(msg) - - -def _validate_non_empty(data, msg: str): - _validate_that(data is not None and data.size > 0, msg) - - -def _validate_single_gt_0(data, msg: str): - _validate_non_empty(data, msg) - _validate_that(data.flatten()[0] > 0, msg) - - -def _single_value(data: Optional[np.ndarray]): - if data is None: - return None - return data.flatten()[0] - - -@dataclass -class Request: - text_input: np.ndarray = np.array([]) - decoder_text_input: np.ndarray = None - image_input: Optional[np.ndarray] = None - max_tokens: Optional[np.ndarray] = None - bad_words: Optional[np.ndarray] = None - stop_words: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - top_k: Optional[np.ndarray] = None - top_p: Optional[np.ndarray] = None - temperature: Optional[np.ndarray] = None - length_penalty: Optional[np.ndarray] = None - repetition_penalty: Optional[np.ndarray] = None - min_length: Optional[np.ndarray] = None - return_log_probs: Optional[np.ndarray] = None - prompt_embedding_table: Optional[np.ndarray] = None - prompt_vocab_size: Optional[np.ndarray] = None - embedding_bias_words: Optional[np.ndarray] = None - embedding_bias_weights: Optional[np.ndarray] = None - num_draft_tokens: Optional[np.ndarray] = None - use_draft_logits: Optional[np.ndarray] = None - stream: Optional[np.ndarray] = None - beam_width: Optional[np.ndarray] = None - return_context_logits: Optional[np.ndarray] = None - return_generation_logits: Optional[np.ndarray] = None - random_seed: Optional[np.ndarray] = None - presence_penalty: Optional[np.ndarray] = None - frequency_penalty: Optional[np.ndarray] = None - - def validate(self): - _validate_non_empty(self.text_input, "text_input is required") - _validate_single_gt_0(self.max_tokens, - "max_tokens must be a single value > 0") - - num_draft_tokens = _single_value(self.num_draft_tokens) - _single_value(self.return_generation_logits) - context_logits = _single_value(self.return_context_logits) - - if num_draft_tokens: - _validate_that( - not self.stream.any(), - "streaming is not supported with speculative decoding") - _validate_that( - not context_logits, - "context logits are not supported with speculative decoding") - - -@dataclass -class DraftRequest: - draft_input_ids: Optional[np.ndarray] = None - draft_logits: Optional[np.ndarray] = None - - -@dataclass -class PreprocResponse: - input_ids: np.ndarray = np.array([]) - decoder_input_ids: np.ndarray = None - input_lengths: np.ndarray = np.array([]) - decoder_input_lengths: np.ndarray = None - bad_words_list: Optional[np.ndarray] = None - stop_words_list: Optional[np.ndarray] = None - embedding_bias: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - prompt_embedding_table: Optional[np.ndarray] = None - - @classmethod - def with_new_inputs(cls, - other, - input_ids: Optional[np.ndarray] = None, - input_lengths: Optional[np.ndarray] = None): - return cls(input_ids=(input_ids - if input_ids is not None else other.input_ids), - input_lengths=(input_lengths if input_lengths is not None - else other.input_lengths), - decoder_input_ids=other.decoder_input_ids, - decoder_input_lengths=other.decoder_input_lengths, - bad_words_list=other.bad_words_list, - 
stop_words_list=other.stop_words_list, - end_id=other.end_id, - pad_id=other.pad_id, - prompt_embedding_table=other.prompt_embedding_table) - - -@dataclass -class GenerationResponse: - output_ids: np.ndarray = np.array([]) - sequence_length: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - batch_index: Optional[np.ndarray] = None - - -@dataclass -class Response: - text_output: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - batch_index: Optional[np.ndarray] = None - - def __eq__(self, o) -> bool: - """Just for testing""" - if not isinstance(o, Response): - return False - return (np.array_equal(self.text_output, o.text_output) - and np.array_equal(self.cum_log_probs, o.cum_log_probs) - and np.array_equal(self.output_log_probs, o.output_log_probs) - and np.array_equal(self.context_logits, o.context_logits) - and np.array_equal(self.generation_logits, o.generation_logits) - and np.array_equal(self.batch_index, o.batch_index)) - - -class Decoder: - - def __init__(self, streaming=False, accumulate=False): - self._streaming = streaming - self._accumulate = accumulate - - self._accumulated_tokens = None - - def decode(self, - request: Request, - speculative_decoding=False) -> Generator[Response, None, None]: - - batch_size = request.text_input.shape[0] - preproc_response = self.preprocess(request) - - if speculative_decoding: - if batch_size > 1: - raise Exception( - "speculative decoding is not supported with batch size > 1" - ) - for gen_response in self._spec_generate(preproc_response, request): - yield self.postprocess(gen_response) - else: - if not self._streaming and batch_size == 1: - gen_response = self._generate_non_streaming( - preproc_response, request) - yield self.postprocess(gen_response) - else: - for gen_response in self._generate(preproc_response, request): - yield self.postprocess(gen_response) - - def encountered_stop_words(self, input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids): - return True - return False - - def _spec_generate( - self, preproc: PreprocResponse, - request: Request) -> Generator[GenerationResponse, None, None]: - - if preproc.input_ids.shape[0] > 1: - raise Exception( - "Speculative decoding does not support batch size > 1.") - - prompt_input_ids: np.ndarray = preproc.input_ids[0] - input_ids: np.ndarray = prompt_input_ids - output_len: int = request.max_tokens[0][0] - last_input_ids: np.ndarray = None - draft_output_ids: np.ndarray = None - draft_logits: np.ndarray = None - - target_response: GenerationResponse = None - - cur_preproc = preproc - - counter = 0 - while True: - counter += 1 - num_draft_tokens = min( - request.num_draft_tokens[0][0], - len(prompt_input_ids) + output_len - len(input_ids) - 1) - - draft_request = None - if num_draft_tokens > 0: - draft_response: GenerationResponse = self._draft_generate_non_streaming( - cur_preproc, request, num_draft_tokens) - seq_len: int = draft_response.sequence_length[0][0] - # [1, beamWidth, outputLength] -> [outputLen] - draft_output_ids = draft_response.output_ids[0][0] - # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] - if request.use_draft_logits is not None 
and request.use_draft_logits[ - 0]: - if draft_response.generation_logits is not None: - draft_logits = draft_response.generation_logits[0][0] - - input_draft_tokens = draft_output_ids[len(input_ids):seq_len] - draft_request = DraftRequest( - draft_input_ids=np.expand_dims(input_draft_tokens, 0)) - if request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - draft_request.draft_logits = np.expand_dims( - draft_logits[-len(input_draft_tokens):], 0) - else: - draft_request = DraftRequest() - target_response = self._generate_non_streaming( - cur_preproc, request, draft_request) - last_input_ids = input_ids - input_ids = target_response.output_ids[0][0] - cur_preproc = PreprocResponse.with_new_inputs( - cur_preproc, np.expand_dims(input_ids, 0), - np.array([[len(input_ids)]], dtype=np.int32)) - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = (len(input_ids) >= - len(prompt_input_ids) + output_len) - if length_stop: - break - # If draft and target have same outputs, should stop. Normally target should return 1 more token. - # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, input_ids) - if target_draft_equal: - break - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - if last_current_equal: - break - # Need to check if stop words was encountered - hit_stop_words = self.encountered_stop_words( - input_ids, preproc.stop_words_list[0]) - if hit_stop_words: - break - - yield target_response - - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - raise NotImplementedError() - - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - raise NotImplementedError() - - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - raise NotImplementedError() - - def postprocess(self, gen_response: GenerationResponse) -> Response: - if self._accumulate and self._streaming: - new_tokens: np.ndarray = gen_response.output_ids - if new_tokens.ndim != 3: - raise Exception("Expected output_ids tensor to have 3 dims.") - if new_tokens.shape[0] != 1: - raise Exception("Expected batch size of 1") - if new_tokens.shape[1] != 1: - raise Exception( - "Accumulation of tokens is only implemented for beam width = 1" - ) - - self._accumulated_tokens = new_tokens if ( - self._accumulated_tokens is None) else np.concatenate( - (self._accumulated_tokens, new_tokens), axis=2) - sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]], - dtype=np.int32) - return self._postprocess(self._accumulated_tokens, - sequence_lengths, gen_response) - else: - return self._postprocess(gen_response.output_ids, None, - gen_response) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - raise NotImplementedError() - - def preprocess(self, request: Request) -> PreprocResponse: - raise NotImplementedError() - - def reset_decoder(self): - self._accumulated_tokens = None diff --git a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py 
b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py deleted file mode 100644 index a6d4d48..0000000 --- a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py +++ /dev/null @@ -1,458 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Callable -from typing import Dict, Optional - -import numpy as np -import triton_python_backend_utils as pb_utils -from lib.decode import * -from typing_extensions import override - - -class TritonDecoder(Decoder): - - def __init__(self, - streaming=False, - accumulate=False, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name="tensorrt_llm", - draft_llm_model_name: Optional[str] = None): - super().__init__(streaming=streaming, accumulate=accumulate) - self.preproc_model_name = preproc_model_name - self.postproc_model_name = postproc_model_name - self.llm_model_name = llm_model_name - self.draft_llm_model_name = draft_llm_model_name - - self._preproc_outputs = [ - "INPUT_ID", - "DECODER_INPUT_ID", - "REQUEST_INPUT_LEN", - "REQUEST_DECODER_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "EMBEDDING_BIAS", - "OUT_PAD_ID", - "OUT_END_ID", - "OUT_PROMPT_EMBEDDING_TABLE", - ] - - self._llm_outputs = [ - "output_ids", "sequence_length", "cum_log_probs", - "output_log_probs", "context_logits", "generation_logits", - "batch_index" - ] - - self._postproc_outputs = [ - "OUTPUT", - ] - - self.input_names = [ - "text_input", - "decoder_text_input", - "image_input", - "max_tokens", - "bad_words", - "stop_words", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_embedding_table", - "prompt_vocab_size", - "embedding_bias_words", - "embedding_bias_weights", - "num_draft_tokens", - "use_draft_logits", - ] - - self.__undo_reshape_whitelist = { - "max_tokens", - 
"end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_vocab_size", - "num_draft_tokens", - "use_draft_logits", - } - - def _exec_triton_request(self, request): - responses = request.exec(decoupled=True) - for r in responses: - if r.has_error(): - raise pb_utils.TritonModelException(r.error().message()) - yield r - - def _exec_triton_request_single(self, request): - responses = request.exec(decoupled=False) - if responses.has_error(): - raise pb_utils.TritonModelException(responses.error().message()) - return responses - - def create_triton_response(self, response: Response): - name_map = { - "text_output": "text_output", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits", - "batch_index": "batch_index" - } - tensors = self.create_triton_tensors(response, name_map) - return pb_utils.InferenceResponse(output_tensors=tensors) - - def convert_triton_request(self, triton_request) -> Request: - request = Request() - for triton_name in self.input_names: - tensor = pb_utils.get_input_tensor_by_name(triton_request, - triton_name) - target_name = triton_name - if tensor is None: - continue - if not hasattr(request, target_name): - raise AttributeError( - f"Request has no attribute '{target_name}'") - setattr(request, target_name, tensor.as_numpy()) - return request - - def convert_triton_response(self, - triton_response, - response_factory: Callable, - name_map=None): - response = response_factory() - for tensor in triton_response.output_tensors(): - if tensor is None: - continue - triton_name = tensor.name() - value = tensor.as_numpy() - target_name = triton_name - if name_map and triton_name in name_map: - target_name = name_map[triton_name] - if name_map and not triton_name in name_map: - continue - if target_name is None: - # explicitly ignore this triton input - continue - if not hasattr(response, target_name): - raise AttributeError( - f"response object has not attribute '{target_name}'") - setattr(response, target_name, value) - return response - - def __undo_reshape(self, x, name): - if name in self.__undo_reshape_whitelist and len(x.shape) == 1: - # handle reshapes - return np.expand_dims(x, 0) - else: - return x - - def create_triton_tensors(self, obj, name_map: dict): - tensors = [] - for name, triton_name in name_map.items(): - if triton_name is None: - continue - value = getattr(obj, name) - if value is None: - continue - t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) - tensors.append(t) - return tensors - - @override - def preprocess(self, request: Request) -> PreprocResponse: - input_tensors = self._get_preproc_tensors(request) - triton_req = pb_utils.InferenceRequest( - model_name=self.preproc_model_name, - inputs=input_tensors, - requested_output_names=self._preproc_outputs) - triton_output = self._exec_triton_request_single(triton_req) - return self._get_preproc_response(triton_output) - - def _get_preproc_tensors(self, request: Request): - name_map = { - "text_input": "QUERY", - "decoder_text_input": "DECODER_QUERY", - "image_input": "IMAGE", - "max_tokens": "REQUEST_OUTPUT_LEN", - "bad_words": "BAD_WORDS_DICT", - "stop_words": "STOP_WORDS_DICT", - "embedding_bias_words": "EMBEDDING_BIAS_WORDS", - 
"embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", - "pad_id": "PAD_ID", - "end_id": "END_ID", - } - return self.create_triton_tensors(request, name_map) - - def _get_preproc_response(self, triton_output): - name_map = { - "INPUT_ID": "input_ids", - "DECODER_INPUT_ID": "decoder_input_ids", - "REQUEST_INPUT_LEN": "input_lengths", - "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths", - "BAD_WORDS_IDS": "bad_words_list", - "STOP_WORDS_IDS": "stop_words_list", - "EMBEDDING_BIAS": "embedding_bias", - "OUT_PAD_ID": "pad_id", - "OUT_END_ID": "end_id", - "OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table", - } - return self.convert_triton_response(triton_output, PreprocResponse, - name_map) - - @override - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, - num_draft_tokens, None, True) - triton_req = pb_utils.InferenceRequest( - model_name=self.draft_llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - triton_response = self._exec_triton_request_single(triton_req) - llm_response = self._get_llm_response(triton_response) - return llm_response - - @override - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - for r in self._exec_triton_request(triton_req): - yield self._get_llm_response(r) - - @override - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - r = self._exec_triton_request_single(triton_req) - return self._get_llm_response(r) - - def _get_llm_tensors(self, - preproc: PreprocResponse, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - tensors = [] - tensors.extend(self._get_tensors_from_preproc(preproc)) - tensors.extend( - self._get_llm_tensors_from_request(request, num_output_tokens, - draft_request, - is_draft_model_request)) - return tensors - - def _get_tensors_from_preproc(self, preproc: PreprocResponse): - name_map = { - "input_ids": "input_ids", - "decoder_input_ids": "decoder_input_ids", - "input_lengths": "input_lengths", - "bad_words_list": "bad_words_list", - "stop_words_list": "stop_words_list", - "embedding_bias": "embedding_bias", - "pad_id": "pad_id", - "end_id": "end_id", - "prompt_embedding_table": "prompt_embedding_table", - } - return self.create_triton_tensors(preproc, name_map) - - def _get_llm_tensors_from_request( - self, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - name_map: Dict[str, Optional[str]] = { - "beam_width": "beam_width", - "top_k": "runtime_top_k", - "top_p": "runtime_top_p", - "length_penalty": "len_penalty", - "repetition_penalty": "repetition_penalty", - "min_length": "min_length", - 
"presence_penalty": "presence_penalty", - "frequency_penalty": "frequency_penalty", - "random_seed": "random_seed", - "return_log_probs": "return_log_probs", - "stream": "streaming", - "prompt_embedding_table": "prompt_embedding_table", - "prompt_vocab_size": "prompt_vocab_size", - } - batch_size = request.text_input.shape[0] - tensors = self.create_triton_tensors(request, name_map) - out_len = None - if request.max_tokens is not None: - out_len = request.max_tokens[0][0] - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: - tensors.append( - pb_utils.Tensor("request_output_len", - np.array([[out_len]], dtype=np.int32))) - - if draft_request: - if draft_request.draft_input_ids is not None: - tensors.append( - pb_utils.Tensor("draft_input_ids", - draft_request.draft_input_ids)) - if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - tensors.append( - pb_utils.Tensor("draft_logits", - draft_request.draft_logits)) - - return_context_logits_data = [False] - return_generation_logits_data = [False] - if draft_request is None: - if is_draft_model_request: - return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [ - False - ] - else: - return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [ - False - ] - return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [ - False - ] - return_context_logits = np.array([return_context_logits_data] * - batch_size, - dtype=bool) - return_generation_logits = np.array([return_generation_logits_data] * - batch_size, - dtype=bool) - - assert len(return_context_logits.shape) == 2 - assert len(return_generation_logits.shape) == 2 - - tensors.append( - pb_utils.Tensor("return_context_logits", return_context_logits)) - tensors.append( - pb_utils.Tensor("return_generation_logits", - return_generation_logits)) - return tensors - - def _get_llm_response(self, triton_output): - name_map = { - "output_ids": "output_ids", - "sequence_length": "sequence_length", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits", - "batch_index": "batch_index", - } - return self.convert_triton_response(triton_output, GenerationResponse, - name_map) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - input_tensors = self._get_postproc_tensors(tokens, sequence_lengths, - gen_response) - triton_req = pb_utils.InferenceRequest( - model_name=self.postproc_model_name, - inputs=input_tensors, - requested_output_names=self._postproc_outputs) - r = self._exec_triton_request_single(triton_req) - response = self._get_response(r, gen_response) - return response - - def _get_postproc_tensors(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse): - tensors = [ - pb_utils.Tensor("TOKENS_BATCH", tokens), - pb_utils.Tensor( - "SEQUENCE_LENGTH", sequence_lengths - if sequence_lengths else gen_response.sequence_length) - ] - return tensors - - def _get_response(self, triton_output, gen_res: 
GenerationResponse): - tensors = triton_output.output_tensors() - t_map = {} - for named_t in tensors: - name = named_t.name() - t = named_t.as_numpy() - t_map[name] = t - response = Response(text_output=t_map["OUTPUT"], - cum_log_probs=gen_res.cum_log_probs, - output_log_probs=gen_res.output_log_probs, - context_logits=gen_res.context_logits, - generation_logits=gen_res.generation_logits, - batch_index=gen_res.batch_index) - return response diff --git a/triton_templates/tensorrt_llm_bls/1/model.py b/triton_templates/tensorrt_llm_bls/1/model.py deleted file mode 100644 index 609e323..0000000 --- a/triton_templates/tensorrt_llm_bls/1/model.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import json -import traceback - -import triton_python_backend_utils as pb_utils -from lib.triton_decoder import TritonDecoder - - -class TritonPythonModel: - - def initialize(self, args): - - # Parse model configs - model_config = json.loads(args['model_config']) - - params = model_config['parameters'] - - accumulate_tokens_str = '' - if 'accumulate_tokens' in params: - accumulate_tokens_str = params['accumulate_tokens']['string_value'] - - self.accumulate_tokens = accumulate_tokens_str.lower() in [ - 'true', 'yes', '1', 't' - ] - - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - - self.logger = pb_utils.Logger - - self.llm_model_name = "tensorrt_llm" - if "tensorrt_llm_model_name" in params: - self.llm_model_name = params["tensorrt_llm_model_name"][ - "string_value"] - self.draft_llm_model_name = None - if "tensorrt_llm_draft_model_name" in params: - self.draft_llm_model_name = params[ - "tensorrt_llm_draft_model_name"]["string_value"] - - self.decoder = TritonDecoder( - streaming=self.decoupled, - accumulate=self.accumulate_tokens, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name=self.llm_model_name, - draft_llm_model_name=self.draft_llm_model_name) - - def execute(self, requests): - - responses = [] - - for request in requests: - if self.decoupled: - response_sender = request.get_response_sender() - try: - - req = self.decoder.convert_triton_request(request) - req.validate() - speculative_decode = (req.num_draft_tokens is not None - and req.num_draft_tokens[0][0] > 0) - if speculative_decode and (self.draft_llm_model_name is None - or self.draft_llm_model_name == ""): - raise Exception( - "cannot perform speculative decoding without draft model" - ) - res_gen = self.decoder.decode( - req, speculative_decoding=speculative_decode) - - for res in res_gen: - triton_response = self.decoder.create_triton_response(res) - if self.decoupled: - response_sender.send(triton_response) - else: - responses.append(triton_response) - - if self.decoupled: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - - except Exception: - self.logger.log_error(traceback.format_exc()) - # If encountering an error, send a response with err msg - error_response = pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(traceback.format_exc())) - - if self.decoupled: - response_sender.send(error_response) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - responses.append(error_response) - - self.decoder.reset_decoder() - if self.decoupled: - return None - else: - assert len(responses) == len(requests) - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') diff --git a/triton_templates/tensorrt_llm_bls/config.pbtxt b/triton_templates/tensorrt_llm_bls/config.pbtxt deleted file mode 100644 index da84b98..0000000 --- a/triton_templates/tensorrt_llm_bls/config.pbtxt +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "tensorrt_llm_bls" -backend: "python" -max_batch_size: ${triton_max_batch_size} - -model_transaction_policy { - decoupled: ${decoupled_mode} -} - -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ 1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - }, - { - name: "image_input" - data_type: TYPE_FP16 - dims: [ 3, 224, 224 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - 
data_type: TYPE_BOOL
-    dims: [ 1 ]
-    optional: true
-  },
-  {
-    name: "prompt_embedding_table"
-    data_type: TYPE_FP16
-    dims: [ -1, -1 ]
-    optional: true
-  },
-  {
-    name: "prompt_vocab_size"
-    data_type: TYPE_INT32
-    dims: [ 1 ]
-    optional: true
-  },
-  {
-    name: "embedding_bias_words"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-    optional: true
-  },
-  {
-    name: "embedding_bias_weights"
-    data_type: TYPE_FP32
-    dims: [ -1 ]
-    optional: true
-  },
-  {
-    name: "num_draft_tokens",
-    data_type: TYPE_INT32,
-    dims: [ 1 ]
-    optional: true
-  },
-  {
-    name: "use_draft_logits",
-    data_type: TYPE_BOOL,
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  }
-]
-output [
-  {
-    name: "text_output"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-  },
-  {
-    name: "cum_log_probs"
-    data_type: TYPE_FP32
-    dims: [ -1 ]
-  },
-  {
-    name: "output_log_probs"
-    data_type: TYPE_FP32
-    dims: [ -1, -1 ]
-  },
-  {
-    name: "context_logits"
-    data_type: TYPE_FP32
-    dims: [ -1, -1 ]
-  },
-  {
-    name: "generation_logits"
-    data_type: TYPE_FP32
-    dims: [ -1, -1, -1 ]
-  },
-  {
-    name: "batch_index"
-    data_type: TYPE_INT32
-    dims: [ 1 ]
-  }
-]
-
-parameters: {
-  key: "accumulate_tokens"
-  value: {
-    string_value: "${accumulate_tokens}"
-  }
-}
-parameters: {
-  key: "tensorrt_llm_model_name"
-  value: {
-    string_value: "${tensorrt_llm_model_name}"
-  }
-}
-parameters: {
-  key: "tensorrt_llm_draft_model_name"
-  value: {
-    string_value: "${tensorrt_llm_draft_model_name}"
-  }
-}
-
-instance_group [
-  {
-    count: ${bls_instance_count}
-    kind : KIND_CPU
-  }
-]

From 4152b5ac38b6f74606b42d87cf03df590cef4313 Mon Sep 17 00:00:00 2001
From: Yorick van Pelt
Date: Wed, 7 Aug 2024 17:56:05 +0200
Subject: [PATCH 34/35] update triton_model_repo and default config

---
 configs/example_official_model_config.yaml  | 3 ++-
 triton_model_repo/tensorrt_llm/config.pbtxt | 2 +-
 triton_templates/tensorrt_llm/config.pbtxt  | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/configs/example_official_model_config.yaml b/configs/example_official_model_config.yaml
index 89cb36a..21e13f5 100644
--- a/configs/example_official_model_config.yaml
+++ b/configs/example_official_model_config.yaml
@@ -28,12 +28,13 @@ instantiate:
         max_queue_delay_microseconds: 100
         max_attention_window_size: 4096
         kv_cache_free_gpu_mem_fraction: 0.95
+        max_queue_size: 0
 
     postprocessing:
       args:
        tokenizer_dir: /src/triton_model_repo/tensorrt_llm/1/
-        tokenizer_type: llama
+        tokenizer_type: auto
         triton_max_batch_size: 64
         postprocessing_instance_count: 64
 
diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt
index b8296c4..504291a 100644
--- a/triton_model_repo/tensorrt_llm/config.pbtxt
+++ b/triton_model_repo/tensorrt_llm/config.pbtxt
@@ -35,7 +35,7 @@ model_transaction_policy {
 dynamic_batching {
     preferred_batch_size: [ 64 ]
     max_queue_delay_microseconds: 100
-    default_queue_policy: { max_queue_size: ${max_queue_size} }
+    default_queue_policy: { max_queue_size: 0 }
 }
 
 input [
diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt
index 1de3659..1974161 100644
--- a/triton_templates/tensorrt_llm/config.pbtxt
+++ b/triton_templates/tensorrt_llm/config.pbtxt
@@ -25,7 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 name: "tensorrt_llm"
-backend: "${triton_backend}"
+backend: "tensorrtllm"
 max_batch_size: ${triton_max_batch_size}
 
 model_transaction_policy {

From 67438c88852563e1631c4d84102bb2103a189d56 Mon Sep 17 00:00:00 2001
From: Yorick van Pelt
Date: Thu, 8 Aug 2024 14:12:35 +0200
Subject: [PATCH 35/35] tensorrt-llm: always autoPatchelf $out

---
 nix/tensorrt-llm.nix | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix
index 1c8bffe..9764c64 100644
--- a/nix/tensorrt-llm.nix
+++ b/nix/tensorrt-llm.nix
@@ -198,10 +198,11 @@ stdenv.mkDerivation (o: {
   # manually call autoPatchelf so it doesn't cross-link the outputs
   dontAutoPatchelf = true;
   # move the propagatedBuildInputs to $python
-  postFixup = lib.optionalString withPython ''
+  postFixup = (lib.optionalString withPython ''
     mv $out/nix-support $python/
-    autoPatchelf $out
     autoPatchelf $python
+  '') + ''
+    autoPatchelf $out
   '';
   # imports check, wants nvml
   # pushd $python/${python3.sitePackages}