@@ -76,6 +76,7 @@ namespace
 // Abstract class for routing config
 struct RoutingConfig
 {
+    virtual void start() {}
     virtual void setRouting(int* selected_experts, int64_t num_experts, int64_t k, int64_t num_tokens) = 0;
     virtual std::string getName() = 0;
     virtual bool isDeterministic() const = 0;
@@ -143,6 +144,11 @@ struct RandomDistributionRoutingConfig : public RoutingConfig
             "Cannot create random routing distribution. Number of experts does not match the number of weights");
     }
 
+    void start()
+    {
+        twister.seed(0xD5);
+    }
+
     std::string getName() override
     {
         return name;
@@ -208,6 +214,11 @@ struct UniformRoutingConfig : public RoutingConfig
 {
     std::mt19937_64 twister{0xD5};
 
+    void start()
+    {
+        twister.seed(0xD5);
+    }
+
    std::string getName() override
    {
        return "uniform";
@@ -522,14 +533,32 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
     ActivationType mActType = ActivationType::Relu;
 
-    QuantParams mQuantParams{};
+    constexpr static int64_t NUM_BUFFERS = 32;
+
+    std::array<QuantParams, NUM_BUFFERS> mQuantParams{};
     bool mUseLora = false;
     bool mUsePrequantScale = false;
     int mGroupSize = -1;
-    LoraParams mLoraParams{};
+    std::array<LoraParams, NUM_BUFFERS> mLoraParams{};
 
     std::optional<tensorrt_llm::cutlass_extensions::CutlassGemmConfig> mSelectedConfig = std::nullopt;
 
+    int64_t mBufferIndex = 0;
+    size_t mWorkspaceSize = 0;
+    size_t mExpertWeight1Size = 0;
+    size_t mExpertWeight2Size = 0;
+    size_t mExpertBias1Size = 0;
+    size_t mExpertBias2Size = 0;
+    size_t mInputTensorSize = 0;
+    size_t mFinalOutputSize = 0;
+    size_t mSourceToExpandedMapSize = 0;
+    size_t mScaleProbsSize = 0;
+    size_t mSelectedExpertsSize = 0;
+    size_t mExpertFP4WeightSf1Size = 0;
+    size_t mExpertFP4WeightSf2Size = 0;
+    size_t mExpertIntScale1Size = 0;
+    size_t mExpertIntScale2Size = 0;
+
     template <class T>
     T* allocBuffer(size_t size)
     {
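The pattern for the rest of the diff: every per-iteration element count is captured in one of the `m...Size` members above, the allocation is multiplied by `NUM_BUFFERS`, and slice `i` is addressed as `base + size * i`. Rotating through 32 independent slices presumably keeps successive iterations from re-reading warm cache lines, and it lets each pre-captured CUDA graph own a distinct set of pointers. A self-contained sketch of the arithmetic, where `bufferSlice` and the concrete sizes are illustrative only, not part of the benchmark:

```cpp
#include <cstddef>
#include <vector>

constexpr std::size_t NUM_BUFFERS = 32;

// Slice i of an allocation that holds NUM_BUFFERS back-to-back copies.
// Mirrors expressions like mInputTensor + mInputTensorSize * mBufferIndex.
template <class T>
T* bufferSlice(T* base, std::size_t slice_elems, std::size_t index)
{
    return base + slice_elems * index;
}

int main()
{
    std::size_t const slice_elems = 1024;                   // stand-in for e.g. mInputTensorSize
    std::vector<float> storage(slice_elems * NUM_BUFFERS);  // one allocation, 32 slices
    for (std::size_t iter = 0; iter < 100; iter++)
    {
        // Round-robin: a slice is revisited only every NUM_BUFFERS iterations.
        float* cur = bufferSlice(storage.data(), slice_elems, iter % NUM_BUFFERS);
        cur[0] = static_cast<float>(iter);
    }
    return 0;
}
```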
@@ -558,70 +587,97 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         mGatedMultiplier = mIsGated ? 2 : 1;
         auto const gated_inter = mInterSize * mGatedMultiplier;
 
-        size_t workspace_size
-            = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, {},
-                mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale);
+        mWorkspaceSize = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType,
+            {}, mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale);
 
-        mWorkspace = allocBuffer<char>(workspace_size);
+        mWorkspace = allocBuffer<char>(mWorkspaceSize * NUM_BUFFERS);
         size_t const expert_matrix_size = mNumExperts * mHiddenSize * mInterSize;
 
-        mExpertWeight1 = allocBuffer<WeightStorage>(expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE);
-        mExpertWeight2 = allocBuffer<WeightStorage>(expert_matrix_size / WEIGHT_ELEM_PER_BYTE);
+        mExpertWeight1Size = expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE;
+        mExpertWeight2Size = expert_matrix_size / WEIGHT_ELEM_PER_BYTE;
+        mExpertWeight1 = allocBuffer<WeightStorage>(mExpertWeight1Size * NUM_BUFFERS);
+        mExpertWeight2 = allocBuffer<WeightStorage>(mExpertWeight2Size * NUM_BUFFERS);
 
         mExpertBias1 = nullptr;
         mExpertBias2 = nullptr;
         if (mUseBias)
         {
-            mExpertBias1 = allocBuffer<DataType>(mNumExperts * gated_inter);
-            mExpertBias2 = allocBuffer<DataType>(mNumExperts * mHiddenSize);
+            mExpertBias1Size = mNumExperts * gated_inter;
+            mExpertBias2Size = mNumExperts * mHiddenSize;
+            mExpertBias1 = allocBuffer<DataType>(mExpertBias1Size * NUM_BUFFERS);
+            mExpertBias2 = allocBuffer<DataType>(mExpertBias2Size * NUM_BUFFERS);
         }
 
         if constexpr (INT_QUANT)
         {
-            mExpertIntScale1 = allocBuffer<DataType>(mNumExperts * gated_inter);
-            mExpertIntScale2 = allocBuffer<DataType>(mNumExperts * mHiddenSize);
+            mExpertIntScale1Size = mNumExperts * gated_inter;
+            mExpertIntScale2Size = mNumExperts * mHiddenSize;
+            mExpertIntScale1 = allocBuffer<DataType>(mExpertIntScale1Size * NUM_BUFFERS);
+            mExpertIntScale2 = allocBuffer<DataType>(mExpertIntScale2Size * NUM_BUFFERS);
 
-            mQuantParams = QuantParams::Int(mExpertIntScale1, mExpertIntScale2);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = QuantParams::Int(
+                    mExpertIntScale1 + mExpertIntScale1Size * i, mExpertIntScale2 + mExpertIntScale2Size * i);
+            }
         }
         else if constexpr (FP8)
         {
             mExpertFP8Scale1 = allocBuffer<float>(mNumExperts);
             mExpertFP8Scale2 = allocBuffer<float>(1);
             mExpertFP8Scale3 = allocBuffer<float>(mNumExperts);
 
-            mQuantParams = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+            }
         }
         else if constexpr (ANY_FP4)
        {
             mExpertFP4ActScale1 = allocBuffer<float>(1);
-            mExpertFP4WeightSf1 = allocBuffer<ElementSF>(num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE);
+            mExpertFP4WeightSf1Size = num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE;
+            mExpertFP4WeightSf1 = allocBuffer<ElementSF>(mExpertFP4WeightSf1Size * NUM_BUFFERS);
             mExpertFP4GlobalScale1 = allocBuffer<float>(num_experts);
 
             mExpertFP4ActScale2 = allocBuffer<float>(1);
-            mExpertFP4WeightSf2 = allocBuffer<ElementSF>(num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE);
+            mExpertFP4WeightSf2Size = num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE;
+            mExpertFP4WeightSf2 = allocBuffer<ElementSF>(mExpertFP4WeightSf2Size * NUM_BUFFERS);
             mExpertFP4GlobalScale2 = allocBuffer<float>(num_experts);
 
             auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4;
-            mQuantParams = func(mExpertFP4ActScale1, mExpertFP4WeightSf1, mExpertFP4GlobalScale1, mExpertFP4ActScale2,
-                mExpertFP4WeightSf2, mExpertFP4GlobalScale2, false, false);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = func(mExpertFP4ActScale1, mExpertFP4WeightSf1 + mExpertFP4WeightSf1Size * i,
+                    mExpertFP4GlobalScale1, mExpertFP4ActScale2, mExpertFP4WeightSf2 + mExpertFP4WeightSf2Size * i,
+                    mExpertFP4GlobalScale2, false, false);
+            }
         }
 
-        mSelectedExperts = allocBuffer<int>(mTotalTokens * mK);
-        mScaleProbs = allocBuffer<float>(mTotalTokens * mK);
-        mInputTensor = allocBuffer<DataType>(mTotalTokens * mHiddenSize);
-        mFinalOutput = allocBuffer<OutputType>(mTotalTokens * mHiddenSize);
+        mSelectedExpertsSize = mTotalTokens * mK;
+        mSelectedExperts = allocBuffer<int>(mSelectedExpertsSize * NUM_BUFFERS);
+        mScaleProbsSize = mTotalTokens * mK;
+        mScaleProbs = allocBuffer<float>(mScaleProbsSize * NUM_BUFFERS);
+        mInputTensorSize = mTotalTokens * mHiddenSize;
+        mInputTensor = allocBuffer<DataType>(mInputTensorSize * NUM_BUFFERS);
+        mFinalOutputSize = mTotalTokens * mHiddenSize;
+        mFinalOutput = allocBuffer<OutputType>(mFinalOutputSize * NUM_BUFFERS);
 
-        mSourceToExpandedMap = allocBuffer<int>(mTotalTokens * mK);
+        mSourceToExpandedMapSize = mTotalTokens * mK;
+        mSourceToExpandedMap = allocBuffer<int>(mSourceToExpandedMapSize * NUM_BUFFERS);
 
         mRoutingConfigIndex = routing_config;
         auto tactic = routingConfigCache.at(routing_config);
-        tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens);
+        tactic->start();
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * i, mNumExperts, mK, mTotalTokens);
+        }
 
         check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
     }
 
-    cudaGraph_t mGraph{};
-    cudaGraphExec_t mGraphInstance{};
+    std::array<cudaGraph_t, NUM_BUFFERS> mGraph{};
+    std::array<cudaGraphExec_t, NUM_BUFFERS> mGraphInstance{};
 
     void createGraph(MOEParallelismConfig parallelism_config)
     {
@@ -630,11 +686,15 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
         NVTX3_SCOPED_RANGE(BuildGraph);
 
-        check_cuda_error(cudaGraphCreate(&mGraph, 0));
-        check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal));
-        runMoEPermute(parallelism_config);
-        check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph));
-        check_cuda_error(cudaGraphInstantiate(&mGraphInstance, mGraph, nullptr, nullptr, 0));
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            mBufferIndex = i;
+            check_cuda_error(cudaGraphCreate(&mGraph[i], 0));
+            check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal));
+            runMoEPermute(parallelism_config);
+            check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph[i]));
+            check_cuda_error(cudaGraphInstantiate(&mGraphInstance[i], mGraph[i], nullptr, nullptr, 0));
+        }
     }
 
     void destroyGraph()
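Stream capture records the pointer arguments of every launch into the graph, so a single captured graph would always replay against one fixed set of buffers; that is why the loop above sets `mBufferIndex = i` before each capture, binding graph `i` to slice `i` for good. A compilable stand-alone sketch of the same capture-per-slice idea (the `touch` kernel and sizes are invented; the instantiate call mirrors the diff's legacy five-argument form):

```cpp
#include <array>
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

constexpr int NUM_BUFFERS = 32;
constexpr std::size_t SLICE = 256; // elements per slice, invented for the sketch

__global__ void touch(float* p)
{
    p[threadIdx.x] += 1.0f;
}

#define CHECK(call)                                                                                                    \
    do                                                                                                                 \
    {                                                                                                                  \
        cudaError_t err = (call);                                                                                      \
        if (err != cudaSuccess)                                                                                        \
            std::printf("CUDA error: %s\n", cudaGetErrorString(err));                                                  \
    } while (0)

int main()
{
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    float* base;
    CHECK(cudaMalloc(&base, SLICE * NUM_BUFFERS * sizeof(float)));

    std::array<cudaGraph_t, NUM_BUFFERS> graphs{};
    std::array<cudaGraphExec_t, NUM_BUFFERS> execs{};
    for (int i = 0; i < NUM_BUFFERS; i++)
    {
        CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));
        touch<<<1, SLICE, 0, stream>>>(base + SLICE * i); // pointer is frozen into graph i
        CHECK(cudaStreamEndCapture(stream, &graphs[i]));
        CHECK(cudaGraphInstantiate(&execs[i], graphs[i], nullptr, nullptr, 0));
    }

    for (int iter = 0; iter < 8; iter++) // replay, rotating through the slices
        CHECK(cudaGraphLaunch(execs[iter % NUM_BUFFERS], stream));
    CHECK(cudaStreamSynchronize(stream));

    for (int i = 0; i < NUM_BUFFERS; i++)
    {
        CHECK(cudaGraphExecDestroy(execs[i]));
        CHECK(cudaGraphDestroy(graphs[i]));
    }
    CHECK(cudaFree(base));
    CHECK(cudaStreamDestroy(stream));
    return 0;
}
```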
@@ -644,24 +704,28 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
         NVTX3_SCOPED_RANGE(DestroyGraph);
 
-        check_cuda_error(cudaGraphExecDestroy(mGraphInstance));
-        check_cuda_error(cudaGraphDestroy(mGraph));
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            check_cuda_error(cudaGraphExecDestroy(mGraphInstance[i]));
+            check_cuda_error(cudaGraphDestroy(mGraph[i]));
+        }
     }
 
     float benchmarkLoop(MOEParallelismConfig parallelism_config)
     {
+        mBufferIndex = (mBufferIndex + 1) % NUM_BUFFERS;
         auto tactic = routingConfigCache.at(mRoutingConfigIndex);
         if (!tactic->isDeterministic())
         {
-            tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens);
+            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mNumExperts, mK, mTotalTokens);
         }
 
         {
             NVTX3_SCOPED_RANGE(BenchmarkLoopIteration);
             check_cuda_error(cudaEventRecord(mStartEvent, streamPtr->get()));
             if (useCudaGraph)
             {
-                cudaGraphLaunch(mGraphInstance, streamPtr->get());
+                cudaGraphLaunch(mGraphInstance[mBufferIndex], streamPtr->get());
             }
             else
             {
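The rotation itself is a single modular increment at the top of `benchmarkLoop()`: each timed call advances to the next slice, and any slice is reused only after `NUM_BUFFERS - 1` intervening iterations. A toy trace of the wrap-around, assuming the member's initial value of 0 (so the first timed iteration lands on slice 1):

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    constexpr int64_t NUM_BUFFERS = 32;
    int64_t buffer_index = 0; // mirrors mBufferIndex's initializer
    for (int iter = 0; iter < 40; iter++)
    {
        buffer_index = (buffer_index + 1) % NUM_BUFFERS; // same update as benchmarkLoop()
        std::printf("iteration %d -> slice %lld\n", iter, static_cast<long long>(buffer_index));
    }
    return 0;
}
```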
@@ -802,17 +866,29 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         auto stream = streamPtr->get();
         MoeMinLatencyParams min_latency_params;
 #ifdef USING_OSS_CUTLASS_MOE_GEMM
-        mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
-            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
-            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
-            parallelism_config, /*enable_alltoall=*/false, mUseLora, mLoraParams,
-            /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
+        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+            mFinalOutput + mFinalOutputSize * mBufferIndex,
+            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config,
+            /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex],
+            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #else
-        mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
-            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
-            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
-            parallelism_config, mUseLora, mLoraParams, /*use_deepseek_fp8_block_scale=*/false,
-            /*min_latency_mode=*/false, min_latency_params, stream);
+        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+            mFinalOutput + mFinalOutputSize * mBufferIndex,
+            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config, mUseLora,
+            mLoraParams[mBufferIndex],
+            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #endif
     }