
Commit e9ebd36

Merge branch 'NVIDIA:main' into support_qwen3_dense_eagle3

Signed-off-by: xq25478 <[email protected]>

2 parents: e9d096e + 2a147c4
182 files changed: +2549, -1172 lines

README.md

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ TensorRT-LLM
 [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md)

 ## Latest News
+* [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B)
 * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai)
 * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick
 [➡️ link](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/)

cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h

Lines changed: 1 addition & 5 deletions
@@ -232,13 +232,9 @@ class RuntimeBuffers

     GenerationLogitsCache generationLogitsCache;

-    //! Helper for KV cache rewind
+    //! Mapping from batch idx to slot id
     TensorPtr seqSlots;
     TensorPtr seqSlotsDevice;
-    TensorPtr sortedSeqSlots;
-    //! For KV cache rewind
-    TensorPtr seqSlotRemappingHost;   // [numSequences]
-    TensorPtr seqSlotRemappingDevice; // [numSequences]

     //! Explicitly device-copy src offsets to reduce warp stalls in copy batch kernel invocation
     //! [mMaxNumRequests], on gpu
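The new comment is the substance of this change: `seqSlots` is now documented as a plain indirection table from a request's position in the current batch to its persistent sequence slot, and the sorted/remapping variants kept for KV cache rewind are gone. A minimal sketch of that indirection (hypothetical values and names; `seqSlots` is the only identifier taken from the header):

```cpp
// Sketch: batch-index -> slot-id indirection, as the new comment describes.
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> seqSlots = {5, 0, 3};  // entry b: slot id of the b-th batched request
    std::vector<float> slotState(8, 0.0f);  // per-slot state outlives batch composition

    for (std::size_t b = 0; b < seqSlots.size(); ++b)
    {
        slotState[seqSlots[b]] += 1.0f;     // gather/update through the mapping
        std::printf("batch %zu -> slot %d\n", b, seqSlots[b]);
    }
    return 0;
}
```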

cpp/kernels/fmha_v2/setup.py

Lines changed: 11 additions & 7 deletions
@@ -3049,11 +3049,13 @@ def get_kernel_traits_code(specs_names):
     return code


-# For now, only hopper head_size 128 kernel uses cubins, and other kernels use cu files.
-# You should set the condition `use_cubin_header` to false if you have modified the source code of the FMHA kernels on Hopper (sm90) with head_size 128.
+# For now:
+# 1. Hopper (sm90) head_size 128 kernels use cubins to avoid performance regressions.
+# 2. Ada (sm89) kernels with e4m3/e4m3_fp32 dtype use cubins to avoid accuracy regressions (to be fixed).
+# Set `use_cubin_header` to false if you have modified the source code of any kernel that uses cubins.
 # This ensures that the kernels will be recompiled using the updated source code rather than relying on precompiled cubins.
-def use_cubin_header(kspec):
-    return kspec.sm == 90 and kspec.head_size == 128
+def use_cubin_header(sm, head_size, dtype):
+    return (sm == 90 and head_size == 128) or (sm == 89 and 'e4m3' in dtype)


@@ -3062,7 +3064,8 @@ def get_cubin_header(kernel_traits, specs_names):
     cubins_dict = {}
     cubin_lens_dict = {}
     for kspec, fname, lname, kname in specs_names:
-        if generate_cu_trtllm and not use_cubin_header(kspec):
+        if generate_cu_trtllm and not use_cubin_header(
+                kspec.sm, kspec.head_size, kspec.dtype):
             continue
         name = fname.replace('.', '_')
         data = 'extern unsigned char cubin_{name}_cubin[];'.format(name=name)
@@ -3215,7 +3218,7 @@ def get_cubin_header(kernel_traits, specs_names):
     if generate_cu_trtllm:

         def get_lname_from_kname(kname: str) -> str:
-            if use_cubin_header(kspec):
+            if use_cubin_header(int(sm), int(head_size), prec.lower()):
                 return 'nullptr'
             lname = kname.replace('_kernel', '')
             mask_types = [
@@ -3234,7 +3237,8 @@ def get_lname_from_kname(kname: str) -> str:
 {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
 {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
 {is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\
-'''.format(**locals()) if use_cubin_header(kspec) else '''\
+'''.format(**locals()) if use_cubin_header(int(sm), int(head_size),
+                                           prec.lower()) else '''\
 {{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \
 {sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \
 0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h

Lines changed: 121 additions & 45 deletions
@@ -76,6 +76,7 @@ namespace
 // Abstract class for routing config
 struct RoutingConfig
 {
+    virtual void start(){};
     virtual void setRouting(int* selected_experts, int64_t num_experts, int64_t k, int64_t num_tokens) = 0;
     virtual std::string getName() = 0;
     virtual bool isDeterministic() const = 0;
@@ -143,6 +144,11 @@ struct RandomDistributionRoutingConfig : public RoutingConfig
             "Cannot create random routing distribution. Number of experts does not match the number of weights");
     }

+    void start()
+    {
+        twister.seed(0xD5);
+    }
+
     std::string getName() override
     {
         return name;
@@ -208,6 +214,11 @@ struct UniformRoutingConfig : public RoutingConfig
 {
     std::mt19937_64 twister{0xD5};

+    void start()
+    {
+        twister.seed(0xD5);
+    }
+
     std::string getName() override
     {
         return "uniform";
@@ -522,14 +533,32 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture

     ActivationType mActType = ActivationType::Relu;

-    QuantParams mQuantParams{};
+    constexpr static int64_t NUM_BUFFERS = 32;
+
+    std::array<QuantParams, NUM_BUFFERS> mQuantParams{};
     bool mUseLora = false;
     bool mUsePrequantScale = false;
     int mGroupSize = -1;
-    LoraParams mLoraParams{};
+    std::array<LoraParams, NUM_BUFFERS> mLoraParams{};

     std::optional<tensorrt_llm::cutlass_extensions::CutlassGemmConfig> mSelectedConfig = std::nullopt;

+    int64_t mBufferIndex = 0;
+    size_t mWorkspaceSize = 0;
+    size_t mExpertWeight1Size = 0;
+    size_t mExpertWeight2Size = 0;
+    size_t mExpertBias1Size = 0;
+    size_t mExpertBias2Size = 0;
+    size_t mInputTensorSize = 0;
+    size_t mFinalOutputSize = 0;
+    size_t mSourceToExpandedMapSize = 0;
+    size_t mScaleProbsSize = 0;
+    size_t mSelectedExpertsSize = 0;
+    size_t mExpertFP4WeightSf1Size = 0;
+    size_t mExpertFP4WeightSf2Size = 0;
+    size_t mExpertIntScale1Size = 0;
+    size_t mExpertIntScale2Size = 0;
+
     template <class T>
     T* allocBuffer(size_t size)
     {
@@ -558,70 +587,97 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         mGatedMultiplier = mIsGated ? 2 : 1;
         auto const gated_inter = mInterSize * mGatedMultiplier;

-        size_t workspace_size
-            = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, {},
-                mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale);
+        mWorkspaceSize = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType,
+            {}, mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale);

-        mWorkspace = allocBuffer<char>(workspace_size);
+        mWorkspace = allocBuffer<char>(mWorkspaceSize * NUM_BUFFERS);
         size_t const expert_matrix_size = mNumExperts * mHiddenSize * mInterSize;

-        mExpertWeight1 = allocBuffer<WeightStorage>(expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE);
-        mExpertWeight2 = allocBuffer<WeightStorage>(expert_matrix_size / WEIGHT_ELEM_PER_BYTE);
+        mExpertWeight1Size = expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE;
+        mExpertWeight2Size = expert_matrix_size / WEIGHT_ELEM_PER_BYTE;
+        mExpertWeight1 = allocBuffer<WeightStorage>(mExpertWeight1Size * NUM_BUFFERS);
+        mExpertWeight2 = allocBuffer<WeightStorage>(mExpertWeight2Size * NUM_BUFFERS);

         mExpertBias1 = nullptr;
         mExpertBias2 = nullptr;
         if (mUseBias)
         {
-            mExpertBias1 = allocBuffer<DataType>(mNumExperts * gated_inter);
-            mExpertBias2 = allocBuffer<DataType>(mNumExperts * mHiddenSize);
+            mExpertBias1Size = mNumExperts * gated_inter;
+            mExpertBias2Size = mNumExperts * mHiddenSize;
+            mExpertBias1 = allocBuffer<DataType>(mExpertBias1Size * NUM_BUFFERS);
+            mExpertBias2 = allocBuffer<DataType>(mExpertBias2Size * NUM_BUFFERS);
         }

         if constexpr (INT_QUANT)
         {
-            mExpertIntScale1 = allocBuffer<DataType>(mNumExperts * gated_inter);
-            mExpertIntScale2 = allocBuffer<DataType>(mNumExperts * mHiddenSize);
+            mExpertIntScale1Size = mNumExperts * gated_inter;
+            mExpertIntScale2Size = mNumExperts * mHiddenSize;
+            mExpertIntScale1 = allocBuffer<DataType>(mExpertIntScale1Size * NUM_BUFFERS);
+            mExpertIntScale2 = allocBuffer<DataType>(mExpertIntScale2Size * NUM_BUFFERS);

-            mQuantParams = QuantParams::Int(mExpertIntScale1, mExpertIntScale2);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = QuantParams::Int(
+                    mExpertIntScale1 + mExpertIntScale1Size * i, mExpertIntScale2 + mExpertIntScale2Size * i);
+            }
         }
         else if constexpr (FP8)
         {
             mExpertFP8Scale1 = allocBuffer<float>(mNumExperts);
             mExpertFP8Scale2 = allocBuffer<float>(1);
             mExpertFP8Scale3 = allocBuffer<float>(mNumExperts);

-            mQuantParams = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+            }
         }
         else if constexpr (ANY_FP4)
         {
             mExpertFP4ActScale1 = allocBuffer<float>(1);
-            mExpertFP4WeightSf1 = allocBuffer<ElementSF>(num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE);
+            mExpertFP4WeightSf1Size = num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE;
+            mExpertFP4WeightSf1 = allocBuffer<ElementSF>(mExpertFP4WeightSf1Size * NUM_BUFFERS);
             mExpertFP4GlobalScale1 = allocBuffer<float>(num_experts);

             mExpertFP4ActScale2 = allocBuffer<float>(1);
-            mExpertFP4WeightSf2 = allocBuffer<ElementSF>(num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE);
+            mExpertFP4WeightSf2Size = num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE;
+            mExpertFP4WeightSf2 = allocBuffer<ElementSF>(mExpertFP4WeightSf2Size * NUM_BUFFERS);
             mExpertFP4GlobalScale2 = allocBuffer<float>(num_experts);

             auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4;
-            mQuantParams = func(mExpertFP4ActScale1, mExpertFP4WeightSf1, mExpertFP4GlobalScale1, mExpertFP4ActScale2,
-                mExpertFP4WeightSf2, mExpertFP4GlobalScale2, false, false);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = func(mExpertFP4ActScale1, mExpertFP4WeightSf1 + mExpertFP4WeightSf1Size * i,
+                    mExpertFP4GlobalScale1, mExpertFP4ActScale2, mExpertFP4WeightSf2 + mExpertFP4WeightSf2Size * i,
+                    mExpertFP4GlobalScale2, false, false);
+            }
         }

-        mSelectedExperts = allocBuffer<int>(mTotalTokens * mK);
-        mScaleProbs = allocBuffer<float>(mTotalTokens * mK);
-        mInputTensor = allocBuffer<DataType>(mTotalTokens * mHiddenSize);
-        mFinalOutput = allocBuffer<OutputType>(mTotalTokens * mHiddenSize);
+        mSelectedExpertsSize = mTotalTokens * mK;
+        mSelectedExperts = allocBuffer<int>(mSelectedExpertsSize * NUM_BUFFERS);
+        mScaleProbsSize = mTotalTokens * mK;
+        mScaleProbs = allocBuffer<float>(mScaleProbsSize * NUM_BUFFERS);
+        mInputTensorSize = mTotalTokens * mHiddenSize;
+        mInputTensor = allocBuffer<DataType>(mInputTensorSize * NUM_BUFFERS);
+        mFinalOutputSize = mTotalTokens * mHiddenSize;
+        mFinalOutput = allocBuffer<OutputType>(mFinalOutputSize * NUM_BUFFERS);

-        mSourceToExpandedMap = allocBuffer<int>(mTotalTokens * mK);
+        mSourceToExpandedMapSize = mTotalTokens * mK;
+        mSourceToExpandedMap = allocBuffer<int>(mSourceToExpandedMapSize * NUM_BUFFERS);

         mRoutingConfigIndex = routing_config;
         auto tactic = routingConfigCache.at(routing_config);
-        tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens);
+        tactic->start();
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * i, mNumExperts, mK, mTotalTokens);
+        }

         check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
     }

-    cudaGraph_t mGraph{};
-    cudaGraphExec_t mGraphInstance{};
+    std::array<cudaGraph_t, NUM_BUFFERS> mGraph{};
+    std::array<cudaGraphExec_t, NUM_BUFFERS> mGraphInstance{};

     void createGraph(MOEParallelismConfig parallelism_config)
     {
@@ -630,11 +686,15 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture

         NVTX3_SCOPED_RANGE(BuildGraph);

-        check_cuda_error(cudaGraphCreate(&mGraph, 0));
-        check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal));
-        runMoEPermute(parallelism_config);
-        check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph));
-        check_cuda_error(cudaGraphInstantiate(&mGraphInstance, mGraph, nullptr, nullptr, 0));
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            mBufferIndex = i;
+            check_cuda_error(cudaGraphCreate(&mGraph[i], 0));
+            check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal));
+            runMoEPermute(parallelism_config);
+            check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph[i]));
+            check_cuda_error(cudaGraphInstantiate(&mGraphInstance[i], mGraph[i], nullptr, nullptr, 0));
+        }
     }

     void destroyGraph()
@@ -644,24 +704,28 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture

         NVTX3_SCOPED_RANGE(DestroyGraph);

-        check_cuda_error(cudaGraphExecDestroy(mGraphInstance));
-        check_cuda_error(cudaGraphDestroy(mGraph));
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            check_cuda_error(cudaGraphExecDestroy(mGraphInstance[i]));
+            check_cuda_error(cudaGraphDestroy(mGraph[i]));
+        }
     }

     float benchmarkLoop(MOEParallelismConfig parallelism_config)
     {
+        mBufferIndex = (mBufferIndex + 1) % NUM_BUFFERS;
         auto tactic = routingConfigCache.at(mRoutingConfigIndex);
         if (!tactic->isDeterministic())
         {
-            tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens);
+            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mNumExperts, mK, mTotalTokens);
         }

         {
             NVTX3_SCOPED_RANGE(BenchmarkLoopIteration);
             check_cuda_error(cudaEventRecord(mStartEvent, streamPtr->get()));
             if (useCudaGraph)
             {
-                cudaGraphLaunch(mGraphInstance, streamPtr->get());
+                cudaGraphLaunch(mGraphInstance[mBufferIndex], streamPtr->get());
             }
             else
             {
@@ -802,17 +866,29 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         auto stream = streamPtr->get();
         MoeMinLatencyParams min_latency_params;
 #ifdef USING_OSS_CUTLASS_MOE_GEMM
-        mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
-            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
-            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
-            parallelism_config, /*enable_alltoall=*/false, mUseLora, mLoraParams,
-            /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
+        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+            mFinalOutput + mFinalOutputSize * mBufferIndex,
+            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config,
+            /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex],
+            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #else
-        mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
-            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
-            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
-            parallelism_config, mUseLora, mLoraParams, /*use_deepseek_fp8_block_scale=*/false,
-            /*min_latency_mode=*/false, min_latency_params, stream);
+        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+            mFinalOutput + mFinalOutputSize * mBufferIndex,
+            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config, mUseLora,
+            mLoraParams[mBufferIndex],
+            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #endif
     }
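The theme of this diff is one pattern applied everywhere: each input/output buffer is allocated as NUM_BUFFERS contiguous slices, a `mBufferIndex` counter advances round-robin each iteration, and one CUDA graph is captured per slice, so back-to-back benchmark iterations never touch the same memory (which would otherwise let L2 residency flatter the timings). A minimal sketch of the indexing scheme with hypothetical names; the fixture applies the same `base + sliceSize * index` offset to every buffer it owns:

```cpp
// Sketch: round-robin multi-buffering so successive iterations use distinct memory.
#include <cstddef>
#include <vector>

constexpr int NUM_BUFFERS = 32;

struct MultiBuffer
{
    std::vector<float> storage; // one allocation holding NUM_BUFFERS logical slices
    std::size_t sliceSize;
    int index = 0;

    explicit MultiBuffer(std::size_t sliceElems)
        : storage(sliceElems * NUM_BUFFERS)
        , sliceSize(sliceElems)
    {
    }

    float* next() // advance first, as benchmarkLoop does
    {
        index = (index + 1) % NUM_BUFFERS;
        return storage.data() + sliceSize * index;
    }
};

int main()
{
    MultiBuffer input(/*sliceElems=*/1024);
    for (int iter = 0; iter < 100; ++iter)
    {
        float* buf = input.next(); // a different slice on each of 32 consecutive iterations
        buf[0] = static_cast<float>(iter);
    }
    return 0;
}
```

The per-buffer `cudaGraph_t` array follows from the same idea: a captured graph bakes in the pointers it recorded, so each slice needs its own capture and instantiation.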
cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu

Lines changed: 1 addition & 1 deletion
@@ -623,7 +623,7 @@ void help()
     "      \"dtypes\": [string, ...], (optional)\n"
     "      \"routing_name\": string, (optional)\n"
     "      \"selected_experts\": [int, ...], or string, (optional, length is a multiple of k)\n"
-    "      \"expert_distribtuion\": [float, ...], or string, (optional, length is num_experts)\n"
+    "      \"expert_distribution\": [float, ...], or string, (optional, length is num_experts)\n"
     "   },\n"
     "   ...\n"
     "]\n"

cpp/tensorrt_llm/batch_manager/assignReqSeqSlots.cpp

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ void tensorrt_llm::batch_manager::AssignReqSeqSlots::operator()(SequenceSlotMan
             llmReq->setFirstScheduledTime();
         }
         auto const reqSeqSlot = seqSlotManager.getSequenceSlot(isReqNew, llmReq->mRequestId);
-        TLLM_CHECK_WITH_INFO(reqSeqSlot, "Unable to get batch slot for reqId");
+        TLLM_CHECK_WITH_INFO(reqSeqSlot, "Unable to get batch slot for request ID %lu", llmReq->mRequestId);
         llmReq->mSeqSlot = reqSeqSlot;
     }
 }
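The fix threads the failing ID into the message through `TLLM_CHECK_WITH_INFO`'s printf-style arguments. A self-contained sketch of such a formatted check (a simplified stand-in, not TensorRT-LLM's actual macro, which also captures source location):

```cpp
// Sketch: a printf-style check helper in the spirit of TLLM_CHECK_WITH_INFO.
#include <cstdio>
#include <stdexcept>
#include <string>

template <typename... Args>
void checkWithInfo(bool condition, char const* fmt, Args... args)
{
    if (!condition)
    {
        char msg[256];
        std::snprintf(msg, sizeof(msg), fmt, args...);
        throw std::runtime_error(std::string(msg));
    }
}

int main()
{
    unsigned long requestId = 42; // hypothetical failing request
    try
    {
        checkWithInfo(false, "Unable to get batch slot for request ID %lu", requestId);
    }
    catch (std::exception const& e)
    {
        std::printf("%s\n", e.what()); // prints the ID instead of a generic message
    }
    return 0;
}
```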
