
Commit 587d063

kaiyux and Shixiaowei02 authored
Update TensorRT-LLM (#506)
* Update TensorRT-LLM --------- Co-authored-by: Shixiaowei02 <[email protected]>
1 parent a21e2f8 commit 587d063

File tree

465 files changed (+38430, -18870 lines)


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -18,6 +18,9 @@ venv/
 .hypothesis/
 .idea/
 cpp/cmake-build-*
+cpp/.ccache/
+tensorrt_llm/libs
+tensorrt_llm/bindings.pyi
 
 # Testing
 .coverage.*

.gitmodules

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 [submodule "3rdparty/cutlass"]
   path = 3rdparty/cutlass
   url = https://github.com/NVIDIA/cutlass.git
-  branch = v2.10.0
 [submodule "3rdparty/json"]
   path = 3rdparty/json
   url = https://github.com/nlohmann/json.git

.pre-commit-config.yaml

Lines changed: 9 additions & 4 deletions
@@ -15,7 +15,7 @@ repos:
     rev: v4.1.0
     hooks:
       - id: check-added-large-files
-        exclude: 'cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin'
+        exclude: 'cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/'
       - id: check-merge-conflict
       - id: check-symlinks
       - id: detect-private-key
@@ -33,10 +33,15 @@ repos:
       - id: clang-format
         types_or: [c++, c, cuda]
         exclude: |
-          (?x)^(
-            cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/.*
-          )$
+          (?x)^(.*cubin.cpp$ | .*fmha_cubin.h)$
   - repo: https://github.com/cheshirekow/cmake-format-precommit
     rev: v0.6.10
     hooks:
       - id: cmake-format
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.4
+    hooks:
+      - id: codespell
+        args:
+          - --skip=".git,3rdparty"
+          - --ignore-words-list=rouge,inout,atleast,strat

3rdparty/cutlass

Submodule cutlass updated 2041 files

3rdparty/json

Submodule json updated 165 files

README.md

Lines changed: 58 additions & 19 deletions
@@ -43,17 +43,22 @@ H200 FP8 achieves 11,819 tok/s on Llama2-13B on a single GPU, and is up to 1.9x
 - [Installation](#installation)
 - [Quick Start](#quick-start)
 - [Support Matrix](#support-matrix)
+  - [Devices](#devices)
+  - [Precision](#precision)
+  - [Key Features](#key-features)
+  - [Models](#models)
 - [Performance](#performance)
 - [Advanced Topics](#advanced-topics)
   - [Quantization](#quantization)
   - [In-flight Batching](#in-flight-batching)
   - [Attention](#attention)
   - [Graph Rewriting](#graph-rewriting)
-- [Benchmarking](#benchmarking)
+- [Benchmark](#benchmark)
 - [Troubleshooting](#troubleshooting)
-- [Release Notes](#release-notes)
-  - [Changelog](#changelog)
-  - [Known issues](#known-issues)
+- [Release notes](#release-notes)
+  - [Change Log](#change-log)
+  - [Known Issues](#known-issues)
+  - [Report Issues](#report-issues)
 
 ## TensorRT-LLM Overview
 
@@ -99,7 +104,7 @@ concepts used in TensorRT-LLM, we recommend you to read the following
 
 ## Installation
 
-*For Windows installation, see [`Windows/`](windows/).*
+*For Windows installation, see [`Windows`](windows/README.md).*
 
 TensorRT-LLM must be built from source, instructions can be found
 [here](./docs/source/installation.md). An image of a Docker container with
@@ -154,14 +159,14 @@ See the BLOOM [example](examples/bloom) for more details and options regarding t
 
 ***3. Run***
 
-The `summarize.py` script can be used to perform the summarization of articles
+The `../summarize.py` script can be used to perform the summarization of articles
 from the CNN Daily dataset:
 
 ```python
-python summarize.py --test_trt_llm \
-    --hf_model_location ./bloom/560M/ \
-    --data_type fp16 \
-    --engine_dir ./bloom/560M/trt_engines/fp16/1-gpu/
+python ../summarize.py --test_trt_llm \
+    --hf_model_dir ./bloom/560M/ \
+    --data_type fp16 \
+    --engine_dir ./bloom/560M/trt_engines/fp16/1-gpu/
 ```
 
 More details about the script and how to run the BLOOM model can be found in
@@ -193,13 +198,13 @@ Lovelace architectures. Certain limitations may, however, apply.
 Various numerical precisions are supported in TensorRT-LLM. The support for
 some of those numerical features require specific architectures:
 
-|                               | FP32  | FP16  | BF16  | FP8  | INT8 | INT4 |
-| :---------------------------- | :---- | :---- | :---- | :--- | :--- | :--- |
-| Volta (SM70)                  | Y     | Y     | N     | N    | Y    | Y    |
-| Turing (SM75)                 | Y     | Y     | N     | N    | Y    | Y    |
-| Ampere (SM80, SM86)           | Y     | Y     | Y     | N    | Y    | Y    |
-| Ada-Lovelace (SM89)           | Y     | Y     | Y     | Y    | Y    | Y    |
-| Hopper (SM90)                 | Y     | Y     | Y     | Y    | Y    | Y    |
+|                     | FP32 | FP16 | BF16 | FP8  | INT8 | INT4 |
+| :------------------ | :--- | :--- | :--- | :--- | :--- | :--- |
+| Volta (SM70)        | Y    | Y    | N    | N    | Y    | Y    |
+| Turing (SM75)       | Y    | Y    | N    | N    | Y    | Y    |
+| Ampere (SM80, SM86) | Y    | Y    | Y    | N    | Y    | Y    |
+| Ada-Lovelace (SM89) | Y    | Y    | Y    | Y    | Y    | Y    |
+| Hopper (SM90)       | Y    | Y    | Y    | Y    | Y    | Y    |
 
 In this release of TensorRT-LLM, the support for FP8 and quantized data types
 (INT8 or INT4) is not implemented for all the models. See the
@@ -237,19 +242,26 @@ The list of supported models is:
 * [Bert](examples/bert)
 * [Blip2](examples/blip2)
 * [BLOOM](examples/bloom)
-* [ChatGLM-6B](examples/chatglm6b)
-* [ChatGLM2-6B](examples/chatglm2-6b/)
+* [ChatGLM](examples/chatglm)
 * [Falcon](examples/falcon)
+* [Flan-T5](examples/enc_dec)
 * [GPT](examples/gpt)
 * [GPT-J](examples/gptj)
 * [GPT-Nemo](examples/gpt)
 * [GPT-NeoX](examples/gptneox)
+* [InternLM](examples/internlm)
 * [LLaMA](examples/llama)
 * [LLaMA-v2](examples/llama)
+* [Mistral](examples/llama)
 * [MPT](examples/mpt)
 * [OPT](examples/opt)
+* [Qwen](examples/qwen)
+* [Replit Code](examples/mpt)
 * [SantaCoder](examples/gpt)
 * [StarCoder](examples/gpt)
+* [T5](examples/enc_dec)
+
+Note: [Encoder-Decoder](examples/enc_dec/) provides general encoder-decoder support that contains many encoder-decoder models such as T5, Flan-T5, etc. We unroll the exact model names in the list above to let users find specific models easier.
 
 ## Performance
 
@@ -311,6 +323,33 @@ may happen. One possible solution is to reduce the amount of memory needed by
 reducing the maximum batch size, input and output lengths. Another option is to
 enable plugins, for example: `--use_gpt_attention_plugin`.
 
+* MPI + Slurm
+
+TensorRT-LLM is a [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface)-aware package that uses [`mpi4py`](https://mpi4py.readthedocs.io/en/stable/). If you are running scripts in a [Slurm](https://slurm.schedmd.com/) environment, you might encounter interferences:
+```
+--------------------------------------------------------------------------
+PMI2_Init failed to initialize. Return code: 14
+--------------------------------------------------------------------------
+--------------------------------------------------------------------------
+The application appears to have been direct launched using "srun",
+but OMPI was not built with SLURM's PMI support and therefore cannot
+execute. There are several options for building PMI support under
+SLURM, depending upon the SLURM version you are using:
+
+  version 16.05 or later: you can use SLURM's PMIx support. This
+  requires that you configure and build SLURM --with-pmix.
+
+  Versions earlier than 16.05: you must use either SLURM's PMI-1 or
+  PMI-2 support. SLURM builds PMI-1 by default, or you can manually
+  install PMI-2. You must then build Open MPI using --with-pmi pointing
+  to the SLURM PMI library location.
+
+Please configure as appropriate and try again.
+--------------------------------------------------------------------------
+```
+As a rule of thumb, if you are running TensorRT-LLM interactively on a Slurm node, prefix your commands with `mpirun -n 1` to run TensorRT-LLM in a dedicated MPI environment, not the one provided by your Slurm allocation.
+For example: `mpirun -n 1 python3 examples/gpt/build.py ...`
+
 ## Release notes
 
 * TensorRT-LLM requires TensorRT 9.1.0.4 and 23.08 containers.

benchmarks/cpp/README.md

Lines changed: 9 additions & 11 deletions
@@ -7,18 +7,14 @@ multiple GPUs or multiple nodes with multiple GPUs.
 
 ### 1. Build TensorRT-LLM and benchmarking source code
 
-Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.
+Please follow the [`installation document`](../../docs/source/installation.md) to build TensorRT-LLM.
+
+Note that the benchmarking source code for C++ runtime is not built by default, you can use the argument `--benchmarks` in [`build_wheel.py`](../../scripts/build_wheel.py) to build that.
 
 Windows users: Follow the
-[`Windows installation document`](../../../windows/README.md)
+[`Windows installation document`](../../windows/README.md)
 instead, and be sure to set DLL paths as specified in
-[Extra Steps for C++ Runtime Usage](../../../windows/README.md#extra-steps-for-c-runtime-usage).
-
-After that, you can build benchmarking source code for C++ runtime
-```
-cd cpp/build
-make -j benchmarks
-```
+[Extra Steps for C++ Runtime Usage](../../windows/README.md#extra-steps-for-c-runtime-usage).
 
 ### 2. Launch C++ benchmarking (Fixed BatchSize/InputLen/OutputLen)
 
@@ -44,7 +40,7 @@ Take GPT-350M as an example for single GPU
     --batch_size "1" \
    --input_output_len "60,20"
 
-# Expected ouput:
+# Expected output:
 # [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 40.81
 ```
 Take GPT-175B as an example for multiple GPUs
@@ -55,10 +51,12 @@ mpirun -n 8 ./benchmarks/gptSessionBenchmark \
     --batch_size "1" \
    --input_output_len "60,20"
 
-# Expected ouput:
+# Expected output:
 # [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 792.14
 ```
 
+If you want to obtain context and generation logits, you could build an enigne with `--gather_all_token_logits` and run gptSessionBenchmark with `--print_all_logits`. This will print a large number of logit values and has a certain impact on performance.
+
 *Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.*
 
 ### 3. Launch Batch Manager benchmarking (Inflight/V1 batching)

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 36 additions & 38 deletions
@@ -273,13 +273,9 @@ class GptServer
 {
 public:
     GptServer(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t maxBeamWidth,
-        batch_scheduler::SchedulerPolicy schedulerPolicy, std::optional<int32_t> maxNumSequences,
-        std::optional<int32_t> maxTokensInPagedKvCache, std::optional<float> kvCacheFreeGpuMemFraction,
-        std::optional<bool> enableTrtOverlap, std::shared_ptr<Recorder> recorder,
-        std::optional<uint64_t> terminateReqId)
+        batch_scheduler::SchedulerPolicy schedulerPolicy, TrtGptModelOptionalParams const& optionalParams,
+        std::shared_ptr<Recorder> recorder, std::optional<uint64_t> terminateReqId)
     {
-        const TrtGptModelOptionalParams& optionalParams = TrtGptModelOptionalParams(
-            maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap);
         mBatchManager = std::make_shared<GptManager>(
             trtEnginePath, modelType, maxBeamWidth, schedulerPolicy,
             [this](int max_num_requests) { return getInferenceRequests(max_num_requests); },
@@ -460,10 +456,8 @@ std::pair<std::vector<std::vector<int32_t>>, std::vector<int32_t>> parseDataset(
 }
 
 void benchmarkGptManager(std::string const& modelName, std::filesystem::path const& engineDir, std::string const& type,
-    std::string const& datasetPath, std::shared_ptr<nvinfer1::ILogger> const& logger,
-    std::optional<int32_t> maxNumSequences, std::optional<int32_t> maxTokensInPagedKvCache,
-    std::optional<float> kvCacheFreeGpuMemFraction, std::optional<bool> enableTrtOverlap,
-    batch_scheduler::SchedulerPolicy schedulerPolicy)
+    std::string const& datasetPath, int beamWidth, std::shared_ptr<nvinfer1::ILogger> const& logger,
+    TrtGptModelOptionalParams const& optionalParams, batch_scheduler::SchedulerPolicy schedulerPolicy)
 {
     auto const worldConfig = WorldConfig::mpi(*logger);
 
@@ -482,6 +476,11 @@ void benchmarkGptManager(std::string const& modelName, std::filesystem::path con
         TLLM_LOG_ERROR(errStr);
     }
 
+    ITensor::SharedPtr beamWidthBuffer = BufferManager::cpu(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
+    auto beamWidthBufferPtr = bufferCast<SizeType>(*beamWidthBuffer);
+    *beamWidthBufferPtr = beamWidth;
+    auto beamWidthTensor = NamedTensor(beamWidthBuffer, "beam_width");
+
     // Load dataset
     auto dataset = parseDataset(datasetPath);
     std::vector<std::vector<NamedTensor>> tensors_list;
@@ -494,15 +493,16 @@ void benchmarkGptManager(std::string const& modelName, std::filesystem::path con
         auto input_ids_tensor = NamedTensor(nvinfer1::DataType::kINT32, input_ids_shape, "input_ids", input_ids.data());
         auto request_output_len_tensor
             = NamedTensor(nvinfer1::DataType::kINT32, {1, 1}, "request_output_len", &request_output_len);
-        std::vector<NamedTensor> tensors = {input_ids_tensor, request_output_len_tensor};
-        tensors_list.push_back(tensors);
+        std::vector<NamedTensor> tensors
+            = {std::move(input_ids_tensor), std::move(request_output_len_tensor), beamWidthTensor};
+        tensors_list.emplace_back(std::move(tensors));
     }
 
-    const int maxBeamWidth = 1;
+    const int maxBeamWidth = beamWidth;
     auto recorder = std::make_shared<Recorder>();
     uint64_t terminateReqId = num_samples + 1;
-    auto gptServer = std::make_shared<GptServer>(engineDir, modelType, maxBeamWidth, schedulerPolicy, maxNumSequences,
-        maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap, recorder, terminateReqId);
+    auto gptServer = std::make_shared<GptServer>(
+        engineDir, modelType, maxBeamWidth, schedulerPolicy, optionalParams, recorder, terminateReqId);
 
     if (worldConfig.getRank() == 0)
     {
@@ -537,16 +537,18 @@ int main(int argc, char* argv[])
         "type", "Batching type: IFB or V1(non-IFB) batching.", cxxopts::value<std::string>()->default_value("IFB"));
     options.add_options()("dataset", "Dataset that is used for benchmarking BatchManager.",
        cxxopts::value<std::string>()->default_value(""));
+    options.add_options()(
+        "beam_width", "Specify beam width you want to benchmark.", cxxopts::value<int>()->default_value("1"));
 
-    options.add_options()("max_num_sequences", "Max number of Sequences.", cxxopts::value<int>()->default_value("-1"));
+    options.add_options()("max_num_sequences", "Max number of Sequences.", cxxopts::value<int>());
+    options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
     options.add_options()(
-        "max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>()->default_value("-1"));
-    options.add_options()("kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.",
-        cxxopts::value<float>()->default_value("-1"));
+        "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value<float>());
+    options.add_options()(
+        "enable_trt_overlap", "Overlap TRT context preparation and execution", cxxopts::value<bool>());
+
     options.add_options()("scheduler_policy", "Choose scheduler policy between max_utilization/guaranteed_no_evict.",
         cxxopts::value<std::string>()->default_value("guaranteed_no_evict"));
-    options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
-        cxxopts::value<bool>()->default_value("false"));
 
     options.add_options()("log_level", "Choose log level between verbose/info/warning/error/internal_error.",
         cxxopts::value<std::string>()->default_value("error"));
@@ -573,32 +575,29 @@ int main(int argc, char* argv[])
     // Argument: Dataset
     auto const datasetPath = result["dataset"].as<std::string>();
 
+    // Argument: beam width
+    auto const beamWidth = result["beam_width"].as<int>();
+
+    TrtGptModelOptionalParams optionalParams;
     // Argument: Max Num Sequences
-    std::optional<int32_t> maxNumSequences = std::nullopt;
-    if (result["max_num_sequences"].as<int>() != -1)
+    if (result.count("max_num_sequences"))
     {
-        maxNumSequences = result["max_num_sequences"].as<int>();
+        optionalParams.maxNumSequences = result["max_num_sequences"].as<int>();
     }
-
     // Argument: Max tokens in paged K-V Cache
-    std::optional<int32_t> maxTokensInPagedKvCache = std::nullopt;
-    if (result["max_tokens_in_paged_kvcache"].as<int>() != -1)
+    if (result.count("max_tokens_in_paged_kvcache"))
    {
-        maxTokensInPagedKvCache = result["max_tokens_in_paged_kvcache"].as<int>();
+        optionalParams.kvCacheConfig.maxTokens = result["max_tokens_in_paged_kvcache"].as<int>();
     }
-
     // Argument: K-V Cache Free Gpu Mem Fraction
-    std::optional<float> kvCacheFreeGpuMemFraction = std::nullopt;
-    if (result["kv_cache_free_gpu_mem_fraction"].as<float>() != -1)
+    if (result.count("kv_cache_free_gpu_mem_fraction"))
     {
-        kvCacheFreeGpuMemFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
+        optionalParams.kvCacheConfig.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
     }
-
     // Argument: Enable TRT overlap
-    std::optional<bool> enableTrtOverlap = std::nullopt;
-    if (result["enable_trt_overlap"].as<bool>() != -1)
+    if (result.count("enable_trt_overlap"))
     {
-        enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
+        optionalParams.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
     }
 
     // Argument: Scheduler policy
@@ -652,8 +651,7 @@ int main(int argc, char* argv[])
     try
     {
        benchmarkGptManager(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), type,
-            datasetPath, logger, maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap,
-            schedulerPolicy);
+            datasetPath, beamWidth, logger, optionalParams, schedulerPolicy);
     }
     catch (const std::exception& e)
     {
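A note on the command-line refactor above: the benchmark now declares the tuning flags without sentinel defaults and checks `result.count(...)` before reading them, collecting set values into a single `TrtGptModelOptionalParams`. The snippet below is a minimal, self-contained sketch of that cxxopts pattern only; `OptionalParamsSketch` and the program name `benchmark_sketch` are hypothetical stand-ins and not TensorRT-LLM types, while the option names are the ones visible in the diff.

```cpp
// Sketch of optional-flag handling with cxxopts, assuming only the cxxopts
// library. OptionalParamsSketch is a hypothetical stand-in for
// TrtGptModelOptionalParams.
#include <cxxopts.hpp>
#include <iostream>
#include <optional>

struct OptionalParamsSketch
{
    std::optional<int> maxNumSequences;          // stays unset unless the flag is passed
    std::optional<float> freeGpuMemoryFraction;
    std::optional<bool> enableTrtOverlap;
};

int main(int argc, char* argv[])
{
    cxxopts::Options options("benchmark_sketch", "Optional-flag handling sketch");
    options.add_options()
        ("beam_width", "Beam width to benchmark.", cxxopts::value<int>()->default_value("1"))
        ("max_num_sequences", "Max number of sequences.", cxxopts::value<int>())
        ("kv_cache_free_gpu_mem_fraction", "Free GPU memory fraction for the K-V cache.", cxxopts::value<float>())
        ("enable_trt_overlap", "Overlap TRT context preparation and execution.", cxxopts::value<bool>());

    auto result = options.parse(argc, argv);

    OptionalParamsSketch params;
    // result.count(name) is non-zero only when the flag was actually given,
    // so no sentinel default (such as -1) is needed to detect "not set".
    if (result.count("max_num_sequences"))
    {
        params.maxNumSequences = result["max_num_sequences"].as<int>();
    }
    if (result.count("kv_cache_free_gpu_mem_fraction"))
    {
        params.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
    }
    if (result.count("enable_trt_overlap"))
    {
        params.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
    }

    // beam_width has a default value, so it can always be read directly.
    auto const beamWidth = result["beam_width"].as<int>();
    std::cout << "beam_width=" << beamWidth
              << " max_num_sequences set=" << params.maxNumSequences.has_value() << std::endl;
    return 0;
}
```

The benefit of this pattern is that an unset flag remains `std::nullopt` in the parameter struct, letting downstream code fall back to its own defaults instead of interpreting magic values like -1.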
